scene-level embeddings updated on gdrive

gauravpradeep · gauravpradeep · commit e0cb480af6fd · 2025-10-18T13:48:23.000+05:30
diff --git a/DATA.md b/DATA.md
@@ -20,8 +20,11 @@ We detail data download and release instructions for preprocessing with scripts
 ### Generated Embedding Data
 We release the scene-level embeddings created with CrossOver on the currently used datasets in [GDrive](https://drive.google.com/drive/folders/12vn5CCvnI9zagFyYrGzLLlMPTgF7rndW?usp=sharing), which can be used for cross-modal retrieval with custom data as detailed in the demo section.
 
-- `embed_scannet.pt`: Scene Embeddings For All Modalities (Point Cloud, RGB, Floorplan, Referral) in ScanNet
-- `embed_scan3r.pt` : Scene Embeddings For All Modalities (Point Cloud, RGB, Referral) in 3RScan
+- `embed_scannet.npz`: Scene Embeddings For All Modalities (Point Cloud, RGB, Floorplan, Referral) in ScanNet
+- `embed_scan3r.npz` : Scene Embeddings For All Modalities (Point Cloud, RGB, Referral) in 3RScan
+- `embed_multiscan.npz` : Scene Embeddings For All Modalities (Point Cloud, RGB, Referral) in MultiScan
+- `embed_arkitscenes.npz` : Scene Embeddings For All Modalities (Point Cloud, RGB, Referral) in ARKitScenes
+
 
 > You agree to the terms of ScanNet, 3RScan, ShapeNet, Scan2CAD, MultiScan, ARKitScenes and SceneVerse datasets by downloading our hosted data.
 
diff --git a/README.md b/README.md
@@ -70,10 +70,8 @@ assume complete data availability across all modalities. We present **CrossOver*
 # :newspaper: News
 - ![](https://img.shields.io/badge/New!-8A2BE2) **Version 1.0** - **CrossOver is now stronger than ever**. We recommend updating to this version; changes include:
   - More powerful pre-trained checkpoints; now available on Huggingface 👉 [here](https://huggingface.co/gradient-spaces/CrossOver/tree/main).
-  - Support for 2 additional datasets - ARKitScenes & MultiScan
-  
+  - Support for 2 additional datasets - ARKitScenes & MultiScan  
 
-- [2025-05] Pretrained checkpoints have been moved to HuggingFace 👉 [here](https://huggingface.co/gradient-spaces/CrossOver/tree/main).
 - [2025-03] CrossOver is accepted to **CVPR 2025** as **Highlight**. 🔥
 - [2025-02] **Version 0.1** - We release CrossOver on arXiv with codebase + pre-trained checkpoints. Check out our [paper](https://arxiv.org/abs/2502.15011) and [website](https://sayands.github.io/crossover/).
 
@@ -141,15 +139,15 @@ $ python demo/demo_instance_retrieval.py
 
 Various configurable parameters:
 
-- `--query_path`: Path to query object(point cloud, image, or text) 
+- `--query_path`: Path to query object (point cloud, image, or text) 
 - `--query_modality`: Query modality - Options: `point`, `rgb`, `referral` 
 - `--scan_id`: Scene ID to search in 
 - `--target_modality`: Target modality to match against - Options: `point`, `rgb`, `referral`, `cad` 
 - `--dataset`: Dataset name - Options: `scannet`, `scan3r`, `arkitscenes`, `multiscan` 
-- `--data_dir`: Path to dataset directory - default: `/drive/datasets/Scannet`
+- `--data_dir`: Path to dataset directory 
 - `--process_dir`: Path to preprocessed features directory (for gt-projection-seg.npz)
-- `--ckpt`: Path to model checkpoint 
-- `--top_k`: Number of top results to return - default: `5`
+- `--ckpt`: Path to pre-trained instance crossover model checkpoint (details [here](#checkpoints)) 
+- `--top_k`: Number of top results to return
 
 
 ## Scene Retrieval Demo
@@ -166,7 +164,7 @@ Various configurable parameters:
 - `--database_path`: Path to the precomputed embeddings of the database scenes downloaded before (e.g. `./release_data/embed_scannet.npz`).
 - `--query_modality`: Modality of the query scene, Options: `point`, `rgb`, `floorplan`, `referral`
 - `--database_modality`: Modality used for retrieval. Same options as above.
-- `--ckpt`: Path to the pre-trained scene crossover model checkpoint (details [here](#checkpoints)), example_path: `./checkpoints/scene_crossover_scannet+scan3r.pth/`).
+- `--ckpt`: Path to the pre-trained scene crossover model checkpoint (details [here](#checkpoints)), example_path: `./checkpoints/scene_crossover_scannet+scan3r.pth/`.
 
 For embedding and pre-trained model download, refer to [generated embedding data](DATA.md#generated-embedding-data) and [checkpoints](#checkpoints) sections.
 
diff --git a/single_inference/datasets/arkit.py b/single_inference/datasets/arkit.py
@@ -15,7 +15,7 @@
 import albumentations as A
 
 class ARKitScenesInferDataset(Dataset):
-    def __init__(self, data_dir, process_dir, voxel_size=0.02, frame_skip=5, image_size=[224, 224]) -> None:
+    def __init__(self, data_dir, process_dir, voxel_size=0.02, frame_skip=1, image_size=[224, 224]) -> None:
         self.voxel_size = voxel_size
         self.frame_skip = frame_skip
         self.image_size = image_size
@@ -45,7 +45,8 @@ def __init__(self, data_dir, process_dir, voxel_size=0.02, frame_skip=5, image_s
         self.normalize_color = A.Normalize(mean=color_mean, std=color_std)
     
     def extract_images(self, scan_id, color_path):
-        pose_data = arkit.load_poses(self.scans_dir, scan_id, skip=self.frame_skip)      
+        scan_dir = osp.join(self.scans_dir, scan_id)
+        pose_data = arkit.load_poses(scan_dir, scan_id, skip=self.frame_skip)      
         frame_idxs = list(pose_data.keys())
         
         pose_data_arr = []
diff --git a/single_inference/scene_inference.py b/single_inference/scene_inference.py
@@ -50,8 +50,8 @@ def run_inference(args, scan_id=None):
     # print(f'Total number of parameters: {total_params}')
     # assert False
     
-    data = { 'scene': []}
     if scan_id is not None:
+        # Single scan inference
         data_dict = dataset[scan_id]
         with torch.no_grad():
             output = model(data_dict)
@@ -60,43 +60,79 @@ def run_inference(args, scan_id=None):
         for modality in output['embeddings']:
             output_np[modality] = output['embeddings'][modality].cpu().numpy()
         
-        data['scene'].append({'scan_id': scan_id, 'scene_embeds': output_np, 'masks': output['masks']})
+        data = {'scene': [{'scan_id': scan_id, 'scene_embeds': output_np, 'masks': output['masks']}]}
         save_data = {
             'scene': data['scene']
         }
         np.savez(f'embed_{args.dataset.lower()}_{scan_id}.npz', **save_data)
         log.info(f'Saved embeddings for {scan_id}.')
 
     else:
-        for idx, scan_id in tqdm(enumerate(dataset.scan_ids)):
-            data_dict = dataset[idx]
-            with torch.no_grad():
-                output = model(data_dict)
+        output_file = f'/drive/dumps/multimodal-spaces/v1.0_release/embed_{args.dataset.lower()}.npz'
+        
+        existing_data = {'scene': []}
+        processed_scan_ids = set()
+        
+        if osp.exists(output_file):
+            try:
+                existing_npz = np.load(output_file, allow_pickle=True)
+                existing_data = {'scene': existing_npz['scene'].tolist()}
+                processed_scan_ids = {item['scan_id'] for item in existing_data['scene']}
+                log.info(f'Loaded existing embeddings for {len(processed_scan_ids)} scans. Resuming from where we left off.')
+            except Exception as e:
+                log.warning(f'Could not load existing file {output_file}: {e}. Starting fresh.')
+                existing_data = {'scene': []}
+                processed_scan_ids = set()
+        
+        remaining_scans = [(idx, scan_id) for idx, scan_id in enumerate(dataset.scan_ids) 
+                          if scan_id not in processed_scan_ids]
+        
+        if not remaining_scans:
+            log.info('All scans already processed.')
+            return
+            
+        log.info(f'Processing {len(remaining_scans)} remaining scans out of {len(dataset.scan_ids)} total scans.')
+        
+        for idx, scan_id in tqdm(remaining_scans, desc="Processing scans"):
+            try:
+                data_dict = dataset[idx]
+                with torch.no_grad():
+                    output = model(data_dict)
+                    
+                    output_np = {}
+                    for modality in output['embeddings']:
+                        output_np[modality] = output['embeddings'][modality].cpu().numpy()
+                    
+                    existing_data['scene'].append({
+                        'scan_id': scan_id, 
+                        'scene_embeds': output_np, 
+                        'masks': output['masks']
+                    })
                 
-                output_np = {}
-                for modality in output['embeddings']:
-                    output_np[modality] = output['embeddings'][modality].cpu().numpy()
+                save_data = {
+                    'scene': existing_data['scene']
+                }
+                np.savez_compressed(output_file, **save_data)
+                log.info(f'Processed and saved scan {scan_id} ({len(existing_data["scene"])}/{len(dataset.scan_ids)} total).')
                 
-                data['scene'].append({'scan_id': scan_id, 'scene_embeds': output_np, 'masks': output['masks']})
-            
-        save_data = {
-            'scene': data['scene']
-        }
-        np.savez(f'/drive/dumps/multimodal-spaces/v1.0_release/embed_{args.dataset.lower()}.npz', **save_data)
-        log.info(f'Saved embeddings for {len(data["scene"])} scenes.')
+            except Exception as e:
+                log.error(f'Error processing scan {scan_id}: {e}. Skipping and continuing.')
+                continue
+        
+        log.info(f'Completed processing. Final embeddings saved for {len(existing_data["scene"])} scenes.')
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Scene Inference')
-    parser.add_argument('--dataset', default='Scannet', type=str, required=False)
-    parser.add_argument('--data_dir', default='/drive/datasets/Scannet', type=str, required=False)
-    parser.add_argument('--process_dir', default='/drive/dumps/multimodal-spaces/preprocess_feats/Scannet', type=str, required=False)
-    parser.add_argument('--ckpt', default='/drive/dumps/multimodal-spaces/runs/new_runs/rgb/scene_crossover_scannet+scan3r+multiscan+arkitscenes_scratch.pth', type=str, required=False)
+    parser.add_argument('--dataset', default='Scan3R', type=str, required=False)
+    parser.add_argument('--data_dir', default='/scratch/users/gauravp/datasets/Scan3R', type=str, required=False)
+    parser.add_argument('--process_dir', default='/scratch/users/gauravp/dumps/preprocess_feats/Scan3R', type=str, required=False)
+    parser.add_argument('--ckpt', default='/scratch/users/gauravp/ckpts/scene_crossover_scannet+scan3r+multiscan+arkitscenes_scratch.pth', type=str, required=False)
     parser.add_argument('--scan_id', default='', type=str, required=False)
     parser.add_argument('--input_dim_3d', default=512, type=int, required=False)
     parser.add_argument('--input_dim_2d', default=1536, type=int, required=False)
     parser.add_argument('--input_dim_1d', default=768, type=int, required=False)
     parser.add_argument('--out_dim', default=768, type=int, required=False)
-    
+
     # Reproducibility
     random.seed(42)
     np.random.seed(42)