feat: support INVR dataset

JasonLSC · JasonLSC · commit 52fb40aa65fe · 2025-01-03T13:41:56.000+08:00
diff --git a/examples/datasets/INVR_N3D.py b/examples/datasets/INVR_N3D.py
@@ -9,6 +9,7 @@
 import imageio.v2 as imageio
 from PIL import Image
 import numpy as np
+
 import torch
 from pycolmap import SceneManager
 try:
@@ -38,23 +39,27 @@ def __init__(
         multiview: bool = False,
         duration: int = 5, # only for testing
         resolution_scales: list = [1.0],
-        resolution: int = 2,
+        downscale_factor: int = 2,
         data_device: str = "cpu",
+        test_view_id: List[int] = [0]
     ):
         self.model_path = model_path 
         self.source_path = source_path 
         self.images_phrase = images_phrase
         self.eval = eval
         self.duration = duration
         self.resolution_scales = resolution_scales
+        self.test_view_id = test_view_id
         
         self.train_cameras = {}
         self.test_cameras = {}
         raydict = {}
         
         # Get scene info
         if loader == "colmap": # colmapvalid only for testing
-            scene_info = sceneLoadTypeCallbacks["Colmap"](self.source_path, self.images_phrase, self.eval, multiview, duration=self.duration) # SceneInfo() - NamedTuple
+            scene_info = sceneLoadTypeCallbacks["Colmap"](self.source_path, self.images_phrase, self.eval, multiview, duration=self.duration, test_view_id=self.test_view_id, downscale_factor=downscale_factor) # SceneInfo() - NamedTuple
+        elif loader == "invr":
+            scene_info = sceneLoadTypeCallbacks["INVR"](self.source_path, self.images_phrase, self.eval, multiview, duration=self.duration) # SceneInfo() - NamedTuple
         else:
             assert False, "Could not recognize scene type!"
 
@@ -65,7 +70,7 @@ def __init__(
         # need modification
         class ModelParams(): 
             def __init__(self):
-                self.resolution = resolution
+                self.downscale_factor = downscale_factor
                 self.data_device = data_device
         args = ModelParams()
         self.args = args
@@ -121,6 +126,7 @@ def __init__(
         self.num_views = num_views
         self.parser = parser
         self.resolution_scale = self.parser.resolution_scales[0]
+        self.split = split
         
         if split == "train":
             self.scene_info = self.parser.scene_info[1]
@@ -153,7 +159,7 @@ def __init__(
         
         self.start_frame = min(scene_by_t.keys())
         
-    def __len__(self):
+    def __len__(self): # number of timestamps (or fake_length when use_fake_length is set)
         return  self.fake_length if self.use_fake_length else len(self.scene_by_t)
         # return len(self.scene_info)
 
@@ -164,16 +170,19 @@ def fetch_image(self, path):
     def __getitem__(self, index: int) -> Dict[str, Any]:
         tid = index % len(self.scene_by_t)
         t_infos = self.scene_by_t[tid + self.start_frame]
-        try:
-            frame_infos = random.sample(t_infos, k=self.num_views)
-        except: #replace
-            frame_infos = random.choices(t_infos, k=self.num_views)
-        # frame_infos = np.random.choice(t_infos, self.num_views, replace=False) 
+        if self.split == "train":
+            try:
+                frame_infos = random.sample(t_infos, k=self.num_views)
+            except: #replace
+                frame_infos = random.choices(t_infos, k=self.num_views)
+        else:
+            frame_infos = t_infos[:self.num_views] # take the first num_views frames of this timestamp, in stored order
+
         K = self.parser.K
-        scale = self.parser.args.resolution
+        downscale_factor = self.parser.args.downscale_factor
         Ks, images, image_paths, rays, timesteps, camtoworlds = [], [], [], [], [], []
         for globalid, cami, finfo in frame_infos:
-            resolution = (int(finfo.width / scale), int(finfo.height / scale))
+            resolution = (int(finfo.width / downscale_factor), int(finfo.height / downscale_factor))
             
             images.append(PILtoTorch_new(self.fetch_image(finfo.image_path), resolution).permute(1,2,0))
             image_paths.append(finfo.image_path)
diff --git a/examples/helper/STG/camera_utils.py b/examples/helper/STG/camera_utils.py
@@ -25,10 +25,10 @@
 def loadCam(args, id, cam_info, resolution_scale):
     orig_w, orig_h = cam_info.image.size
 
-    if args.resolution in [1, 2, 4, 8]:
-        resolution = round(orig_w/(resolution_scale * args.resolution)), round(orig_h/(resolution_scale * args.resolution))
+    if args.downscale_factor in [1, 2, 4, 8]:
+        resolution = round(orig_w/(resolution_scale * args.downscale_factor)), round(orig_h/(resolution_scale * args.downscale_factor))
     else:  # should be a type that converts to float
-        if args.resolution == -1:
+        if args.downscale_factor == -1:
             if orig_w > 1600:
                 global WARNED
                 if not WARNED:
@@ -39,7 +39,7 @@ def loadCam(args, id, cam_info, resolution_scale):
             else:
                 global_down = 1
         else:
-            global_down = orig_w / args.resolution
+            global_down = orig_w / args.downscale_factor
 
         scale = float(global_down) * float(resolution_scale)
         resolution = (int(orig_w / scale), int(orig_h / scale))
@@ -70,10 +70,10 @@ def loadCam(args, id, cam_info, resolution_scale):
 # @timer
 def loadCamv2(args, id, cam_info, resolution_scale):
     orig_w, orig_h =  cam_info.width, cam_info.height
-    if args.resolution in [1, 2, 4, 8]:
-        resolution = round(orig_w/(resolution_scale * args.resolution)), round(orig_h/(resolution_scale * args.resolution))
+    if args.downscale_factor in [1, 2, 4, 8]:
+        resolution = round(orig_w/(resolution_scale * args.downscale_factor)), round(orig_h/(resolution_scale * args.downscale_factor))
     else:  # should be a type that converts to float
-        if args.resolution == -1:
+        if args.downscale_factor == -1:
             if orig_w > 1600:
                 global WARNED
                 if not WARNED:
@@ -84,7 +84,7 @@ def loadCamv2(args, id, cam_info, resolution_scale):
             else:
                 global_down = 1
         else:
-            global_down = orig_w / args.resolution
+            global_down = orig_w / args.downscale_factor
 
         scale = float(global_down) * float(resolution_scale)
         resolution = (int(orig_w / scale), int(orig_h / scale))
@@ -107,6 +107,10 @@ def loadCamv2(args, id, cam_info, resolution_scale):
     else :
         rays_o = None
         rays_d = None
+
+    if gt_image is None:
+        gt_image = (resolution[0], resolution[1])
+    
     return Camera(colmap_id=cam_info.uid, R=cam_info.R, T=cam_info.T, 
                   FoVx=cam_info.FovX, FoVy=cam_info.FovY, 
                   image=gt_image, gt_alpha_mask=loaded_mask,
@@ -122,10 +126,10 @@ def loadCamv2(args, id, cam_info, resolution_scale):
 def loadCamv2timing(args, id, cam_info, resolution_scale):
     orig_w, orig_h = cam_info.image.size
 
-    if args.resolution in [1, 2, 4, 8]:
-        resolution = round(orig_w/(resolution_scale * args.resolution)), round(orig_h/(resolution_scale * args.resolution))
+    if args.downscale_factor in [1, 2, 4, 8]:
+        resolution = round(orig_w/(resolution_scale * args.downscale_factor)), round(orig_h/(resolution_scale * args.downscale_factor))
     else:  # should be a type that converts to float
-        if args.resolution == -1:
+        if args.downscale_factor == -1:
             if orig_w > 1600:
                 global WARNED
                 if not WARNED:
@@ -136,7 +140,7 @@ def loadCamv2timing(args, id, cam_info, resolution_scale):
             else:
                 global_down = 1
         else:
-            global_down = orig_w / args.resolution
+            global_down = orig_w / args.downscale_factor
 
         scale = float(global_down) * float(resolution_scale)
         resolution = (int(orig_w / scale), int(orig_h / scale))
@@ -166,11 +170,11 @@ def loadCamv2timing(args, id, cam_info, resolution_scale):
 
 def loadCamv2ss(args, id, cam_info, resolution_scale):
     orig_w, orig_h = cam_info.image.size
-    assert args.resolution == 1
-    if args.resolution in [1, 2, 4, 8]:
-        resolution = round(orig_w/(resolution_scale * args.resolution)), round(orig_h/(resolution_scale * args.resolution))
+    assert args.downscale_factor == 1
+    if args.downscale_factor in [1, 2, 4, 8]:
+        resolution = round(orig_w/(resolution_scale * args.downscale_factor)), round(orig_h/(resolution_scale * args.downscale_factor))
     else:  # should be a type that converts to float
-        if args.resolution == -1:
+        if args.downscale_factor == -1:
             if orig_w > 1600:
                 global WARNED
                 if not WARNED:
@@ -181,7 +185,7 @@ def loadCamv2ss(args, id, cam_info, resolution_scale):
             else:
                 global_down = 1
         else:
-            global_down = orig_w / args.resolution
+            global_down = orig_w / args.downscale_factor
 
         scale = float(global_down) * float(resolution_scale)
         resolution = (int(orig_w / scale), int(orig_h / scale))
@@ -214,10 +218,10 @@ def loadCamv2ss(args, id, cam_info, resolution_scale):
 def loadCamnogt(args, id, cam_info, resolution_scale):
     orig_w, orig_h = cam_info.width, cam_info.height
 
-    if args.resolution in [1, 2, 4, 8]:
-        resolution = round(orig_w/(resolution_scale * args.resolution)), round(orig_h/(resolution_scale * args.resolution))
+    if args.downscale_factor in [1, 2, 4, 8]:
+        resolution = round(orig_w/(resolution_scale * args.downscale_factor)), round(orig_h/(resolution_scale * args.downscale_factor))
     else:  # should be a type that converts to float
-        if args.resolution == -1:
+        if args.downscale_factor == -1:
             if orig_w > 1600:
                 global WARNED
                 if not WARNED:
@@ -228,7 +232,7 @@ def loadCamnogt(args, id, cam_info, resolution_scale):
             else:
                 global_down = 1
         else:
-            global_down = orig_w / args.resolution
+            global_down = orig_w / args.downscale_factor
 
         scale = float(global_down) * float(resolution_scale)
         resolution = (int(orig_w / scale), int(orig_h / scale))
diff --git a/examples/helper/STG/dataset_readers.py b/examples/helper/STG/dataset_readers.py