# Copyright (C) 2024-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
#
# --------------------------------------------------------
# Dataloader for preprocessed Co3d_v2
# dataset at https://github.com/facebookresearch/co3d - Creative Commons Attribution-NonCommercial 4.0 International
# See datasets_preprocess/preprocess_co3d.py
# --------------------------------------------------------
import os.path as osp
import json
from collections import deque
import cv2
import numpy as np

SLAM3R_DIR = osp.dirname(osp.dirname(osp.dirname(osp.abspath(__file__))))
import sys # noqa: E402
sys.path.insert(0, SLAM3R_DIR) # noqa: E402
from slam3r.datasets.base.base_stereo_view_dataset import BaseStereoViewDataset
from slam3r.utils.image import imread_cv2

TRAINING_CATEGORIES = [
    "apple", "backpack", "banana", "baseballbat", "baseballglove", "bench", "bicycle",
    "bottle", "bowl", "broccoli", "cake", "car", "carrot", "cellphone", "chair", "cup",
    "donut", "hairdryer", "handbag", "hydrant", "keyboard", "laptop", "microwave",
    "motorcycle", "mouse", "orange", "parkingmeter", "pizza", "plant", "stopsign",
    "teddybear", "toaster", "toilet", "toybus", "toyplane", "toytrain", "toytruck",
    "tv", "umbrella", "vase", "wineglass",
]
TEST_CATEGORIES = ["ball", "book", "couch", "frisbee", "hotdog", "kite", "remote", "sandwich", "skateboard", "suitcase"]


class Co3d_Seq(BaseStereoViewDataset):
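    """Sequential multi-view sampler over preprocessed Co3D_v2 scenes.

    Each Co3D scene is a turntable-style capture of ~100 frames covering
    roughly 360 degrees. A window spanning `degree` degrees is slid over
    every scene, and `num_views` frames are drawn from each window position,
    `sel_num` times per position.
    """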
    def __init__(self, 
                 mask_bg=True, 
                 ROOT="data/co3d_processed", 
                 num_views=2,
                 degree=90,  # degree range to select views
                 sel_num=1,  # number of views to select inside a degree range
                 *args, 
                 **kwargs):
        self.ROOT = ROOT
        super().__init__(*args, **kwargs)
        assert mask_bg in (True, False, 'rand')
        self.mask_bg = mask_bg
        self.degree = degree
        # ~100 frames cover 360 degrees, so a window of `degree` degrees spans about degree/360*100 frames
        self.winsize = int(degree / 360 * 100)
        self.sel_num = sel_num
        # total samples drawn per scene: `sel_num` draws for each possible window position
        self.sel_num_perseq = (101 - self.winsize) * self.sel_num
        self.num_views = num_views

        # load all scenes
        if self.split == 'train':
            self.categories = TRAINING_CATEGORIES
        elif self.split == 'test':
            self.categories = TEST_CATEGORIES
        else:
            raise ValueError(f"Unknown split {self.split}")
        self.scenes = {}
        for cate in self.categories:  # only load the categories of the requested split
            with open(osp.join(self.ROOT, cate, f'selected_seqs_{self.split}.json'), 'r') as f:
                self.scenes[cate] = json.load(f)
        # flatten to {(category, sequence_name): [frame indices]}
        self.scenes = {(k, k2): v2 for k, v in self.scenes.items()
                       for k2, v2 in v.items()}
        self.scene_list = list(self.scenes.keys())  # each scene has ~100 images covering 360 degrees (so 25 frames ~= 90 degrees)
        self.scene_lens = [len(v) for k, v in self.scenes.items()]
        self.invalidate = {scene: {} for scene in self.scene_list}
        
        print(self)

    def __len__(self):
        return len(self.scene_list) * self.sel_num_perseq

    def get_img_idxes(self, idx, rng):
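        """Pick `num_views` frame indices inside one sliding window.

        Each window is sampled `sel_num` times: the first draw spaces the
        views uniformly across the window; the remaining draws keep the two
        window endpoints and randomize the interior views.

        Example (hypothetical numbers): with winsize=25 and num_views=3, a
        window starting at frame 10 spans frames 10..35; the uniform draw
        returns [10, 22, 35], a random draw returns [10, r, 35] with 10 < r < 35.
        """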
        sid = max(0, idx // self.sel_num - 1)  # window start, ranges from 0 to 99 - winsize
        eid = sid + self.winsize
        if idx % self.sel_num == 0:
            # first draw for this window: uniformly spaced samples between sid and eid
            return np.linspace(sid, eid, self.num_views, endpoint=True, dtype=int)

        # otherwise keep the window endpoints and randomly pick the remaining n-2 views in between
        if self.num_views == 2:
            return [sid, eid]
        sel_ids = rng.choice(range(sid + 1, eid), self.num_views - 2, replace=False)
        sel_ids.sort()
        return [sid] + list(sel_ids) + [eid]
    

    def _get_views(self, idx, resolution, rng):
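        """Assemble `num_views` views (image, depth, pose, intrinsics) for sample `idx`,
        substituting neighboring frames when a depth map turns out to be empty."""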
        # choose a scene
        obj, instance = self.scene_list[idx // self.sel_num_perseq]
        image_pool = self.scenes[obj, instance]
        last = len(image_pool) - 1
        if last <= self.winsize:
            # sequence too short for the requested window: fall back to a random sample
            return self._get_views(rng.integers(0, len(self) - 1), resolution, rng)

        imgs_idxs = self.get_img_idxes(idx % self.sel_num_perseq, rng)

        # wrap indices that fall past the end of shorter sequences
        # (use a fresh name so the dataset index `idx` is not shadowed)
        for i, img_idx in enumerate(imgs_idxs):
            if img_idx > last:
                imgs_idxs[i] = img_idx % last

        if resolution not in self.invalidate[obj, instance]:  # flag invalid images
            self.invalidate[obj, instance][resolution] = [False for _ in range(len(image_pool))]

        # decide now if we mask the bg
        mask_bg = (self.mask_bg is True) or (self.mask_bg == 'rand' and rng.choice(2))

        views = []
        imgs_idxs = deque(imgs_idxs)
        
        while len(imgs_idxs) > 0:  # some images (few) have zero depth
            im_idx = imgs_idxs.popleft()
        
            if self.invalidate[obj, instance][resolution][im_idx]:
                # search for a valid image, scanning in a random direction
                random_direction = 2 * rng.choice(2) - 1
                for offset in range(1, len(image_pool)):
                    tentative_im_idx = (im_idx + (random_direction * offset)) % len(image_pool)
                    if not self.invalidate[obj, instance][resolution][tentative_im_idx]:
                        im_idx = tentative_im_idx
                        break
                else:
                    # no valid image found in the whole sequence: try the next sample
                    return self._get_views((idx + 1) % len(self), resolution, rng)

            view_idx = image_pool[im_idx]

            impath = osp.join(self.ROOT, obj, instance, 'images', f'frame{view_idx:06n}.jpg')

            # load camera params
            input_metadata = np.load(impath.replace('jpg', 'npz'))
            camera_pose = input_metadata['camera_pose'].astype(np.float32)
            intrinsics = input_metadata['camera_intrinsics'].astype(np.float32)

            # load image and depth (depth is stored as uint16, scaled by the frame's maximum depth)
            rgb_image = imread_cv2(impath)
            depthmap = imread_cv2(impath.replace('images', 'depths') + '.geometric.png', cv2.IMREAD_UNCHANGED)
            depthmap = (depthmap.astype(np.float32) / 65535) * np.nan_to_num(input_metadata['maximum_depth'])
            if mask_bg:
                # load object mask
                maskpath = osp.join(self.ROOT, obj, instance, 'masks', f'frame{view_idx:06n}.png')
                maskmap = imread_cv2(maskpath, cv2.IMREAD_UNCHANGED).astype(np.float32)
                maskmap = (maskmap / 255.0) > 0.1

                # update the depthmap with mask
                depthmap *= maskmap
                
            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=impath)

            # TODO: check if this is reasonable
            valid_depth = depthmap[depthmap > 0.0]
            if valid_depth.size > 0:
                median_depth = np.median(valid_depth)
                depthmap[depthmap > median_depth * 3] = 0.  # filter out floating outlier points far from the object
            
            num_valid = (depthmap > 0.0).sum()
            if num_valid == 0:
                # problem, invalidate image and retry
                self.invalidate[obj, instance][resolution][im_idx] = True
                imgs_idxs.append(im_idx)
                continue

            views.append(dict(
                img=rgb_image,
                depthmap=depthmap,
                camera_pose=camera_pose,
                camera_intrinsics=intrinsics,
                dataset='Co3d_v2',
                label=f"{obj}_{instance}_frame{view_idx:06n}.jpg",
                instance=osp.split(impath)[1],
            ))
        return views


if __name__ == "__main__":
    from slam3r.datasets.base.base_stereo_view_dataset import view_name
    import os
    import trimesh

    num_views = 11
    dataset = Co3d_Seq(split='train', 
                       mask_bg=False, resolution=224, aug_crop=16,
                       num_views=num_views, degree=90, sel_num=3)
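    # NOTE: assumes the preprocessed data lives under data/co3d_processed
    # (see datasets_preprocess/preprocess_co3d.py for how it is generated)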

    save_dir = "visualization/co3d_seq_views"
    os.makedirs(save_dir, exist_ok=True)

    # import tqdm
    # for idx in tqdm.tqdm(np.random.permutation(len(dataset))):
    #     views = dataset[(idx,0)]
    #     print([view['instance'] for view in views])

    for idx in np.random.permutation(len(dataset))[:10]:
    # for idx in range(len(dataset))[5:10000:2000]:
        os.makedirs(osp.join(save_dir, str(idx)), exist_ok=True)
        views = dataset[(idx,0)]
        assert len(views) == num_views
        all_pts = []
        all_color = []
        for i, view in enumerate(views):
            img = np.array(view['img']).transpose(1, 2, 0)
            save_path = osp.join(save_dir, str(idx), f"{i}_{view['label']}")
            img = img[..., ::-1]  # RGB -> BGR for cv2.imwrite
            img = (img + 1) / 2   # un-normalize from [-1, 1] to [0, 1]
            cv2.imwrite(save_path, img * 255)
            print(f"save to {save_path}")
            pts3d = np.array(view['pts3d']).reshape(-1, 3)
            img = img[..., ::-1]  # flip back to RGB for the point-cloud colors
            pct = trimesh.PointCloud(pts3d, colors=img.reshape(-1, 3))
            pct.export(save_path.replace('.jpg', '.ply'))
            all_pts.append(pts3d)
            all_color.append(img.reshape(-1, 3))
        all_pts = np.concatenate(all_pts, axis=0)
        all_color = np.concatenate(all_color, axis=0)
        pct = trimesh.PointCloud(all_pts, all_color)
        pct.export(osp.join(save_dir, str(idx), "all.ply"))