feat: Add backend for refinement
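Adds backend_utils.py, an Open3D-based multiway-registration helper (pairwise colored / point-to-plane ICP plus pose-graph optimization), and reworks app.py so reconstruct() becomes a generator: it first yields a coarse scene, then a refined scene rebuilt from the optimized per-frame poses. Scene assembly and export are factored out into create_scene()/save_scene(), and the interface gains a second Model3D output for the refined model.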
- app.py +62 -15
- backend_utils.py +144 -0
- requirements.txt +2 -1
app.py
CHANGED
@@ -15,7 +15,9 @@ from scipy.spatial.transform import Rotation
 from transformers import AutoModelForImageSegmentation
 from torchvision import transforms
 from PIL import Image
-import spaces
+import open3d as o3d
+from backend_utils import improved_multiway_registration
+

 # Default values
 DEFAULT_CKPT_PATH = './checkpoints/spann3r.pth'
@@ -143,7 +145,6 @@ def generate_mask(image: np.ndarray):
     mask_np = np.array(mask) / 255.0
     return mask_np

-@spaces.GPU
 @torch.no_grad()
 def reconstruct(video_path, conf_thresh, kf_every, as_pointcloud=False, remove_background=False):
     # Extract frames from video
@@ -176,7 +177,7 @@ def reconstruct(video_path, conf_thresh, kf_every, as_pointcloud=False, remove_background=False):
         if remove_background:
             mask = generate_mask(image)
         else:
-            mask = np.ones_like(conf)
+            mask = np.ones_like(conf)

         images_all.append((image[None, ...] + 1.0)/2.0)
         pts_all.append(pts[None, ...])
@@ -192,6 +193,54 @@ def reconstruct(video_path, conf_thresh, kf_every, as_pointcloud=False, remove_background=False):
     conf_sig_all = (conf_all-1) / conf_all
     combined_mask = (conf_sig_all > conf_thresh) & (mask_all > 0.5)

+    # Create coarse result
+    coarse_scene = create_scene(pts_all, images_all, combined_mask, as_pointcloud)
+    coarse_output_path = save_scene(coarse_scene, as_pointcloud)
+
+    yield coarse_output_path, None, f"Reconstruction completed. FPS: {fps:.2f}"
+
+    # Create point clouds for multiway registration
+    pcds = []
+    for j in range(len(pts_all)):
+        pcd = o3d.geometry.PointCloud()
+        mask = combined_mask[j]
+        pcd.points = o3d.utility.Vector3dVector(pts_all[j][mask])
+        pcd.colors = o3d.utility.Vector3dVector(images_all[j][mask])
+        pcds.append(pcd)
+
+    # Perform global optimization
+    print("Performing global registration...")
+    transformed_pcds, pose_graph = improved_multiway_registration(pcds, voxel_size=0.01)
+
+    # Apply transformations from pose_graph to original pts_all
+    transformed_pts_all = np.zeros_like(pts_all)
+    for j in range(len(pts_all)):
+        # Get the transformation matrix from the pose graph
+        transformation = pose_graph.nodes[j].pose
+
+        # Reshape pts_all[j] to (H*W, 3)
+        H, W, _ = pts_all[j].shape
+        pts_reshaped = pts_all[j].reshape(-1, 3)
+
+        # Apply transformation to all points
+        homogeneous_pts = np.hstack((pts_reshaped, np.ones((pts_reshaped.shape[0], 1))))
+        transformed_pts = (transformation @ homogeneous_pts.T).T[:, :3]
+
+        # Reshape back to (H, W, 3) and store
+        transformed_pts_all[j] = transformed_pts.reshape(H, W, 3)
+
+    print(f"Original shape: {pts_all.shape}, Transformed shape: {transformed_pts_all.shape}")
+
+    # Create refined result
+    refined_scene = create_scene(transformed_pts_all, images_all, combined_mask, as_pointcloud)
+    refined_output_path = save_scene(refined_scene, as_pointcloud)
+
+    # Clean up temporary directory
+    os.system(f"rm -rf {demo_path}")
+
+    yield coarse_output_path, refined_output_path, f"Refinement completed. FPS: {fps:.2f}"
+
+def create_scene(pts_all, images_all, combined_mask, as_pointcloud):
     scene = trimesh.Scene()

     if as_pointcloud:
@@ -206,37 +255,35 @@ def reconstruct(video_path, conf_thresh, kf_every, as_pointcloud=False, remove_background=False):
         meshes.append(pts3d_to_trimesh(images_all[i], pts_all[i], combined_mask[i]))
     mesh = trimesh.Trimesh(**cat_meshes(meshes))
     scene.add_geometry(mesh)
-
+
     rot = np.eye(4)
     rot[:3, :3] = Rotation.from_euler('y', np.deg2rad(180)).as_matrix()
     scene.apply_transform(np.linalg.inv(OPENGL @ rot))
-
-
+    return scene
+def save_scene(scene, as_pointcloud):
     if as_pointcloud:
-        output_path = tempfile.mktemp(suffix='.ply')
+        output_path = tempfile.mktemp(suffix='.ply')
     else:
         output_path = tempfile.mktemp(suffix='.obj')
     scene.export(output_path)
-
-    # Clean up temporary directory
-    os.system(f"rm -rf {demo_path}")
-
-    return output_path, f"Reconstruction completed. FPS: {fps:.2f}"
+    return output_path

+# Update the Gradio interface
 iface = gr.Interface(
     fn=reconstruct,
     inputs=[
         gr.Video(label="Input Video"),
-        gr.Slider(0, 1, value=1e-
+        gr.Slider(0, 1, value=1e-6, label="Confidence Threshold"),
         gr.Slider(1, 30, step=1, value=5, label="Keyframe Interval"),
         gr.Checkbox(label="As Pointcloud", value=False),
         gr.Checkbox(label="Remove Background", value=False)
     ],
     outputs=[
-        gr.Model3D(label="3D Model", display_mode="solid"),
+        gr.Model3D(label="Coarse 3D Model", display_mode="solid"),
+        gr.Model3D(label="Refined 3D Model", display_mode="solid"),
         gr.Textbox(label="Status")
     ],
-    title="3D Reconstruction with Spatial Memory and Background Removal",
+    title="3D Reconstruction with Spatial Memory, Background Removal, and Global Optimization",
 )

 if __name__ == "__main__":
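Note that reconstruct() now yields twice instead of returning, which is what lets the Space display the coarse model while refinement is still running. Below is a minimal sketch of that pattern, assuming a reasonably recent Gradio; the function and labels are illustrative, not from the commit, and on older releases the queue must be enabled for generator functions.

import gradio as gr

def two_stage(x):
    # First yield: coarse result, placeholder for the refined slot
    yield f"coarse({x})", None, "Reconstruction completed."
    # Second yield: both results plus the final status
    yield f"coarse({x})", f"refined({x})", "Refinement completed."

demo = gr.Interface(
    fn=two_stage,
    inputs=gr.Textbox(label="Input"),
    outputs=[gr.Textbox(label="Coarse"), gr.Textbox(label="Refined"), gr.Textbox(label="Status")],
)

if __name__ == "__main__":
    demo.queue().launch()  # queueing is what allows streaming partial outputs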
backend_utils.py
ADDED
@@ -0,0 +1,144 @@
+import numpy as np
+import open3d as o3d
+
+def improved_multiway_registration(pcds, voxel_size=0.05, max_correspondence_distance_coarse=None, max_correspondence_distance_fine=None, overlap=3, quadratic_overlap=True, use_colored_icp=True):
+    if max_correspondence_distance_coarse is None:
+        max_correspondence_distance_coarse = voxel_size * 15
+    if max_correspondence_distance_fine is None:
+        max_correspondence_distance_fine = voxel_size * 1.5
+
+    def preprocess_point_cloud(pcd, voxel_size):
+        pcd_down = pcd.voxel_down_sample(voxel_size)
+        pcd_down.estimate_normals(
+            o3d.geometry.KDTreeSearchParamHybrid(radius=voxel_size * 2, max_nn=30))
+        # Apply statistical outlier removal
+        cl, ind = pcd_down.remove_statistical_outlier(nb_neighbors=20, std_ratio=2.0)
+        pcd_down = pcd_down.select_by_index(ind)
+        return pcd_down
+
+    def pairwise_registration(source, target, use_colored_icp, voxel_size, max_correspondence_distance_coarse, max_correspondence_distance_fine):
+        current_transformation = np.identity(4)  # Start with identity matrix
+
+        if use_colored_icp:
+            print("Apply colored point cloud registration")
+            voxel_radius = [5*voxel_size, 3*voxel_size, voxel_size]
+            max_iter = [60, 35, 20]
+
+            for scale in range(3):
+                iter = max_iter[scale]
+                radius = voxel_radius[scale]
+
+                source_down = source.voxel_down_sample(radius)
+                target_down = target.voxel_down_sample(radius)
+
+                source_down.estimate_normals(
+                    o3d.geometry.KDTreeSearchParamHybrid(radius=radius * 2, max_nn=30))
+                target_down.estimate_normals(
+                    o3d.geometry.KDTreeSearchParamHybrid(radius=radius * 2, max_nn=30))
+
+                try:
+                    result_icp = o3d.pipelines.registration.registration_colored_icp(
+                        source_down, target_down, radius, current_transformation,
+                        o3d.pipelines.registration.TransformationEstimationForColoredICP(),
+                        o3d.pipelines.registration.ICPConvergenceCriteria(relative_fitness=1e-6,
+                                                                          relative_rmse=1e-6,
+                                                                          max_iteration=iter))
+                    current_transformation = result_icp.transformation
+                except RuntimeError as e:
+                    print(f"Colored ICP failed at scale {scale}: {str(e)}")
+                    print("Keeping the previous transformation")
+                    # We keep the previous transformation, no need to reassign
+
+            transformation_icp = current_transformation
+        else:
+            print("Apply point-to-plane ICP")
+            try:
+                icp_coarse = o3d.pipelines.registration.registration_icp(
+                    source, target, max_correspondence_distance_coarse, current_transformation,
+                    o3d.pipelines.registration.TransformationEstimationPointToPlane())
+                current_transformation = icp_coarse.transformation
+
+                icp_fine = o3d.pipelines.registration.registration_icp(
+                    source, target, max_correspondence_distance_fine,
+                    current_transformation,
+                    o3d.pipelines.registration.TransformationEstimationPointToPlane())
+                transformation_icp = icp_fine.transformation
+            except RuntimeError as e:
+                print(f"Point-to-plane ICP failed: {str(e)}")
+                print("Keeping the best available transformation")
+                transformation_icp = current_transformation
+
+        try:
+            information_icp = o3d.pipelines.registration.get_information_matrix_from_point_clouds(
+                source, target, max_correspondence_distance_fine,
+                transformation_icp)
+        except RuntimeError as e:
+            print(f"Failed to compute information matrix: {str(e)}")
+            print("Using identity information matrix")
+            information_icp = np.identity(6)
+
+        return transformation_icp, information_icp
+
+    def full_registration(pcds_down):
+        pose_graph = o3d.pipelines.registration.PoseGraph()
+        odometry = np.identity(4)
+        pose_graph.nodes.append(o3d.pipelines.registration.PoseGraphNode(odometry))
+        n_pcds = len(pcds_down)
+
+        pairs = []
+        for i in range(n_pcds - 1):
+            for j in range(i + 1, min(i + overlap + 1, n_pcds)):
+                pairs.append((i, j))
+                if quadratic_overlap:
+                    q = 2**(j-i)
+                    if q > overlap and i + q < n_pcds:
+                        pairs.append((i, i + q))
+
+        for source_id, target_id in pairs:
+            transformation_icp, information_icp = pairwise_registration(
+                pcds_down[source_id], pcds_down[target_id], use_colored_icp,
+                voxel_size, max_correspondence_distance_coarse, max_correspondence_distance_fine)
+            print(f"Build PoseGraph: {source_id} -> {target_id}")
+
+            if target_id == source_id + 1:
+                odometry = np.dot(transformation_icp, odometry)
+                pose_graph.nodes.append(
+                    o3d.pipelines.registration.PoseGraphNode(
+                        np.linalg.inv(odometry)))
+
+            pose_graph.edges.append(
+                o3d.pipelines.registration.PoseGraphEdge(source_id,
+                                                         target_id,
+                                                         transformation_icp,
+                                                         information_icp,
+                                                         uncertain=False))
+        return pose_graph
+
+    # Preprocess point clouds
+    print("Preprocessing point clouds...")
+    pcds_down = [preprocess_point_cloud(pcd, voxel_size) for pcd in pcds]
+
+    print("Full registration ...")
+    pose_graph = full_registration(pcds_down)
+
+    print("Optimizing PoseGraph ...")
+    option = o3d.pipelines.registration.GlobalOptimizationOption(
+        max_correspondence_distance=max_correspondence_distance_fine,
+        edge_prune_threshold=0.25,
+        reference_node=0)
+
+    with o3d.utility.VerbosityContextManager(o3d.utility.VerbosityLevel.Debug) as cm:
+        o3d.pipelines.registration.global_optimization(
+            pose_graph,
+            o3d.pipelines.registration.GlobalOptimizationLevenbergMarquardt(),
+            o3d.pipelines.registration.GlobalOptimizationConvergenceCriteria(),
+            option)
+
+    print("Transform points and combine")
+    pcd_combined = o3d.geometry.PointCloud()
+    for point_id in range(len(pcds)):
+        print(pose_graph.nodes[point_id].pose)
+        pcds[point_id].transform(pose_graph.nodes[point_id].pose)
+        pcd_combined += pcds[point_id]
+
+    return pcd_combined, pose_graph
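For reference, a minimal usage sketch of the new helper (the fragment file names are hypothetical; the voxel_size mirrors the call in app.py). Registration edges connect each fragment to its next `overlap` neighbors, plus quadratically spaced pairs (i, i + 2^k) for loop closure. Because odometry nodes store inverted cumulative transforms, pose_graph.nodes[i].pose maps fragment i into the frame of fragment 0, and the helper also returns the fragments merged after those poses are applied.

import open3d as o3d
from backend_utils import improved_multiway_registration

# Hypothetical inputs: any list of o3d.geometry.PointCloud works;
# colored ICP (the default) expects per-point colors.
pcds = [o3d.io.read_point_cloud(f"fragment_{i}.ply") for i in range(3)]

combined, pose_graph = improved_multiway_registration(pcds, voxel_size=0.01)

for i, node in enumerate(pose_graph.nodes):
    print(f"pose of fragment {i}:\n{node.pose}")  # 4x4 world-from-fragment matrix

o3d.io.write_point_cloud("combined.ply", combined)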
requirements.txt
CHANGED
@@ -16,4 +16,5 @@ gdown
 imageio[ffmpeg]
 transformers
 kornia
-timm
+timm
+open3d