Skip to content

Commit 33a7b19

Browse files
committed: "final code"
1 parent 7ab0610 commit 33a7b19

File tree

7 files changed

+171
-100
lines changed

7 files changed

+171
-100
lines changed

experiments/train_roma.py

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
import romatch.utils.writer as writ
1111
from romatch.benchmarks import MixedDenseBenchmark, MixedVisualizeBenchmark
12-
from romatch.datasets import get_mixed_dataset, get_extredata_dataset
12+
from romatch.datasets import get_mixed_dataset, get_extredata_dataset, get_megadepth_dataset
1313
from romatch.losses.robust_loss import RobustLosses
1414
from romatch.utils.collate import collate_fn_with
1515

@@ -200,7 +200,7 @@ def train(args):
200200
experiment_name += "_pretrained_weights"
201201

202202
writ.init_writer(experiment_name, rank)
203-
pl.seed_everything(args.seed) # for reproducibility
203+
pl.seed_everything(args.seed)
204204

205205
checkpoint_dir = "workspace/checkpoints/"
206206
h, w = resolutions[resolution]
@@ -234,12 +234,11 @@ def train(args):
234234
if not romatch.TEST_MODE:
235235
# Data
236236
if args.use_pretained_roma:
237-
# When finetuning, use extredata dataset only
238-
dataset, dataset_ws = get_extredata_dataset(
239-
h, w, train=True)
237+
dataset, dataset_ws = get_mixed_dataset(
238+
h, w, train=True, mega_percent=0.8)
240239
else:
241240
dataset, dataset_ws = get_mixed_dataset(
242-
h, w, train=True, mega_percent=0.1)
241+
h, w, train=True, mega_percent=0.9)
243242

244243
# Loss and optimizer
245244
depth_loss = RobustLosses(
@@ -254,8 +253,8 @@ def train(args):
254253
if args.use_pretained_roma:
255254
# Use smaller learning rate for pretrained weights
256255
parameters = [
257-
{"params": model.encoder.parameters(), "lr": romatch.STEP_SIZE * 5e-6 / 80},
258-
{"params": model.decoder.parameters(), "lr": romatch.STEP_SIZE * 1e-4 / 80},
256+
{"params": model.encoder.parameters(), "lr": romatch.STEP_SIZE * 5e-6 / 800},
257+
{"params": model.decoder.parameters(), "lr": romatch.STEP_SIZE * 1e-4 / 800},
259258
]
260259
else:
261260
parameters = [
@@ -271,11 +270,8 @@ def train(args):
271270
h=h, w=w, num_samples=1000, dataset="extredata")
272271
megadepth_benchmark = MixedDenseBenchmark(
273272
h=h, w=w, num_samples=1000, dataset="megadepth")
274-
275-
# When finetuning, use extredata dataset only
276-
vis_dataset = "extredata" if args.use_pretained_roma else "mixed"
277273
mixed_visualize_benchmark = MixedVisualizeBenchmark(
278-
h=h, w=w, count=8, dataset=vis_dataset)
274+
h=h, w=w, count=8, dataset="mixed")
279275

280276
checkpointer = CheckPoint(checkpoint_dir, experiment_name)
281277
model, optimizer, lr_scheduler, global_step = checkpointer.load(
@@ -305,7 +301,7 @@ def train(args):
305301
dataset,
306302
batch_size=batch_size,
307303
sampler=sampler,
308-
num_workers=8,
304+
num_workers=32,
309305
collate_fn=collate_fn_with(dataset),
310306
)
311307
)

romatch/benchmarks/mixed_dense_benchmark.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ def __init__(self, h=384, w=512, num_samples=2000, dataset="megadepth") -> None:
1919
self.dataset, self.ws = get_megadepth_dataset(h, w, train=False)
2020
elif dataset == "mixed":
2121
self.dataset, self.ws = get_mixed_dataset(
22-
h, w, train=False, mega_percent=0.1)
22+
h, w, train=False, mega_percent=0.6)
2323

2424
def geometric_dist(self, depth1, depth2, T_1to2, K1, K2, dense_matches):
2525
b, h1, w1, d = dense_matches.shape

romatch/benchmarks/mixed_visualize_benchmark.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ def __init__(self, h=384, w=512, count=8, dataset="mixed") -> None:
1919
self.dataset, self.ws = get_megadepth_dataset(h, w, train=False)
2020
elif dataset == "mixed":
2121
self.dataset, self.ws = get_mixed_dataset(
22-
h, w, train=False, mega_percent=0.1)
22+
h, w, train=False, mega_percent=0.6)
2323

2424
self.sampler = torch.utils.data.WeightedRandomSampler(
2525
self.ws, replacement=False, num_samples=100

romatch/datasets/extredata.py

Lines changed: 93 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
from PIL import Image
33
from torch.utils.data import ConcatDataset
44
from romatch.utils import get_tuple_transform_ops, get_depth_tuple_transform_ops
5+
from romatch.utils.transforms import RandomColorAug
6+
import torchvision.transforms.functional as tvf
57
import numpy as np
68
import torch
79

@@ -21,7 +23,12 @@ def __init__(
2123
wt=560,
2224
min_overlap=0.0,
2325
max_overlap=1.0,
26+
shake_t=0,
2427
normalize=True,
28+
use_horizontal_flip_aug=False,
29+
use_single_horizontal_flip_aug=False,
30+
random_eraser=None,
31+
use_randaug=False,
2532
max_num_pairs=20000, # * total 2499030
2633
) -> None:
2734
self.data_root = data_root
@@ -49,18 +56,38 @@ def __init__(
4956
self.pairs = self.pairs[pairinds]
5057
self.overlaps = self.overlaps[pairinds]
5158

52-
self.wt, self.ht = wt, ht
5359
self.im_transform_ops = get_tuple_transform_ops(
5460
resize=(ht, wt),
5561
normalize=normalize,
5662
)
5763
self.depth_transform_ops = get_depth_tuple_transform_ops(
5864
resize=(ht, wt)
5965
)
66+
self.wt, self.ht = wt, ht
67+
self.shake_t = shake_t
68+
69+
if use_horizontal_flip_aug and use_single_horizontal_flip_aug:
70+
raise ValueError("Can't both flip both images and only flip one")
71+
self.use_horizontal_flip_aug = use_horizontal_flip_aug
72+
self.use_single_horizontal_flip_aug = use_single_horizontal_flip_aug
73+
74+
self.use_randaug = use_randaug
75+
self.random_eraser = random_eraser
6076

6177
def load_im(self, path):
6278
return Image.open(path)
6379

80+
def horizontal_flip(self, im_A, im_B, depth_A, depth_B, K_A, K_B):
81+
im_A = im_A.flip(-1)
82+
im_B = im_B.flip(-1)
83+
depth_A, depth_B = depth_A.flip(-1), depth_B.flip(-1)
84+
flip_mat = torch.tensor(
85+
[[-1, 0, self.wt], [0, 1, 0], [0, 0, 1.]]).to(K_A.device)
86+
K_A = flip_mat@K_A
87+
K_B = flip_mat@K_B
88+
89+
return im_A, im_B, depth_A, depth_B, K_A, K_B
90+
6491
def load_depth(self, depth_ref):
6592
depth = cv2.imread(depth_ref, cv2.IMREAD_UNCHANGED)
6693
return torch.tensor(depth[:, :, 0])
@@ -73,6 +100,24 @@ def scale_intrinsic(self, K, wi, hi):
73100
sK = torch.tensor([[sx, 0, 0], [0, sy, 0], [0, 0, 1]])
74101
return sK @ K
75102

103+
def rand_shake(self, *things):
104+
t = np.random.choice(range(-self.shake_t, self.shake_t + 1), size=2)
105+
return [
106+
tvf.affine(thing, angle=0.0, translate=list(
107+
t), scale=1.0, shear=[0.0, 0.0])
108+
for thing in things
109+
], t
110+
111+
def rand_augment(self, im_A, im_B):
112+
im_A = np.array(im_A)
113+
im_B = np.array(im_B)
114+
random_color_aug = RandomColorAug()
115+
im_A = random_color_aug(im_A)
116+
im_B = random_color_aug(im_B)
117+
im_A = Image.fromarray(im_A)
118+
im_B = Image.fromarray(im_B)
119+
return im_A, im_B
120+
76121
def __getitem__(self, pair_idx):
77122
# read intrinsics of original size
78123
idx1, idx2 = self.pairs[pair_idx]
@@ -104,40 +149,46 @@ def __getitem__(self, pair_idx):
104149
K1 = self.scale_intrinsic(K1, im_A.width, im_A.height)
105150
K2 = self.scale_intrinsic(K2, im_B.width, im_B.height)
106151

107-
# * im_A: (640, 512) ImageFile
108-
# * depth_A: [512, 640]
109-
# plt.figure()
110-
# plt.subplot(2, 2, 1)
111-
# plt.imshow(im_A)
112-
# plt.subplot(2, 2, 2)
113-
# plt.imshow(depth_A)
114-
115152
# Process images
116-
im_A, im_B = self.im_transform_ops((im_A, im_B))
117-
depth_A, depth_B = self.depth_transform_ops(
118-
(depth_A[None, None], depth_B[None, None])
119-
)
120-
121-
# * im_A: [3, 560, 560]
122-
# * depth_A: [1, 1, 560, 560]
123-
# plt.subplot(2, 2, 3)
124-
# plt.imshow(im_A.permute(1, 2, 0) * 0.5 + 0.5)
125-
# plt.subplot(2, 2, 4)
126-
# plt.imshow(depth_A[0, 0])
127-
# plt.tight_layout()
128-
# plt.show()
153+
try:
154+
if self.use_randaug:
155+
im_A, im_B = self.rand_augment(im_A, im_B)
129156

130-
im_A, im_B = im_A[None], im_B[None]
157+
im_A, im_B = self.im_transform_ops((im_A, im_B))
158+
depth_A, depth_B = self.depth_transform_ops(
159+
(depth_A[None, None], depth_B[None, None])
160+
)
131161

132-
# * im_A: [1, 3, 560, 560]
133-
# * depth_A: [1, 1, 560, 560]
162+
[im_A, im_B, depth_A, depth_B], t = self.rand_shake(
163+
im_A, im_B, depth_A, depth_B)
164+
K1[:2, 2] += t
165+
K2[:2, 2] += t
166+
167+
im_A, im_B = im_A[None], im_B[None]
168+
if self.random_eraser is not None:
169+
im_A, depth_A = self.random_eraser(im_A, depth_A)
170+
im_B, depth_B = self.random_eraser(im_B, depth_B)
171+
172+
if self.use_horizontal_flip_aug:
173+
if np.random.rand() > 0.5:
174+
im_A, im_B, depth_A, depth_B, K1, K2 = self.horizontal_flip(
175+
im_A, im_B, depth_A, depth_B, K1, K2)
176+
177+
if self.use_single_horizontal_flip_aug:
178+
if np.random.rand() > 0.5:
179+
im_B, depth_B, K2 = self.single_horizontal_flip(
180+
im_B, depth_B, K2)
181+
except Exception as e:
182+
print(
183+
f"Error in transform ({self.image_paths[idx1]}, {self.image_paths[idx1]}):", e)
184+
return None
134185

135186
data_dict = {
136-
"im_A": im_A[0], # * [3, 560, 560]
187+
"im_A": im_A[0],
137188
"im_A_identifier": self.image_paths[idx1].split("/")[-1].split(".jpg")[0],
138189
"im_B": im_B[0],
139190
"im_B_identifier": self.image_paths[idx2].split("/")[-1].split(".jpg")[0],
140-
"im_A_depth": depth_A[0, 0], # * [560, 560]
191+
"im_A_depth": depth_A[0, 0],
141192
"im_B_depth": depth_B[0, 0],
142193
"K1": K1,
143194
"K2": K2,
@@ -154,19 +205,19 @@ def __init__(self, data_root: str = "./data/extredata") -> None:
154205
self.data_root = data_root
155206
self.scene_info_root = os.path.join(data_root, "scene_info")
156207
self.all_scenes = set(os.listdir(self.scene_info_root))
157-
self.test_scenes = {"Madrid4_117@-83@276@68@0@90.npy",
158-
"Madrid4_90@-33@76@58@0@90.npy",
159-
"Madrid1_93@467@65@51@0@90.npy",
160-
"Berlin6_141@17@21@70@0@90.npy",
161-
"Tokyo5_92@167@326@64@0@90.npy",
162-
"Madrid1_93@-233@-385@59@0@90.npy",
163-
"German5_61@-263@139@63@0@90.npy",
164-
"Milano3_123@-434@318@58@0@90.npy",
165-
"NewYork4_138@-133@169@66@0@90.npy",
166-
"Bern0_143@216@-387@51@0@90.npy",
167-
"Berlin0_111@-133@-280@52@0@90.npy",
168-
"Madrid0_122@167@215@51@0@90.npy",
169-
"Milano2_134@116@218@51@0@90.npy"}
208+
self.test_scenes = {"Madrid4_117@-83@276@68@0@90.npz",
209+
"Madrid4_90@-33@76@58@0@90.npz",
210+
"Madrid1_93@467@65@51@0@90.npz",
211+
"Berlin6_141@17@21@70@0@90.npz",
212+
"Tokyo5_92@167@326@64@0@90.npz",
213+
"Madrid1_93@-233@-385@59@0@90.npz",
214+
"German5_61@-263@139@63@0@90.npz",
215+
"Milano3_123@-434@318@58@0@90.npz",
216+
"NewYork4_138@-133@169@66@0@90.npz",
217+
"Bern0_143@216@-387@51@0@90.npz",
218+
"Berlin0_111@-133@-280@52@0@90.npz",
219+
"Madrid0_122@167@215@51@0@90.npz",
220+
"Milano2_134@116@218@51@0@90.npz"}
170221
self.ignore_scenes = set()
171222

172223
def build_scenes(self, split: str = "train", **kwargs):
@@ -179,10 +230,10 @@ def build_scenes(self, split: str = "train", **kwargs):
179230

180231
scenes = []
181232
for scene_name in scene_names:
182-
if ".npy" not in scene_name:
233+
if ".npz" not in scene_name:
183234
continue
184235
scene_info_path = os.path.join(self.scene_info_root, scene_name)
185-
scene_info = np.load(scene_info_path, allow_pickle=True).item()
236+
scene_info = np.load(scene_info_path, allow_pickle=True)
186237
scene = ExtredataScene(
187238
data_root=self.data_root,
188239
scene_info=scene_info,
@@ -199,10 +250,3 @@ def weight_scenes(self, concat_dataset, alpha: float = 0.5) -> torch.Tensor:
199250
ns.append(len(d))
200251
ws = torch.cat([torch.ones(n) / n**alpha for n in ns])
201252
return ws
202-
203-
204-
if __name__ == "__main__":
205-
dataset = ExtredataBuilder()
206-
train1 = dataset.build_scenes()
207-
train = ConcatDataset(train1)
208-
print(len(train)) # * 2499030

romatch/datasets/megadepth.py

Lines changed: 32 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44
import numpy as np
55
import torch
66
import torchvision.transforms.functional as tvf
7-
from romatch.utils import get_depth_tuple_transform_ops, get_tuple_transform_ops
8-
import romatch
97
from romatch.utils import *
8+
from romatch.utils.transforms import RandomColorAug
9+
import romatch
1010
import math
1111

1212

@@ -149,34 +149,38 @@ def __getitem__(self, pair_idx):
149149
K1 = self.scale_intrinsic(K1, im_A.width, im_A.height)
150150
K2 = self.scale_intrinsic(K2, im_B.width, im_B.height)
151151

152-
if self.use_randaug:
153-
im_A, im_B = self.rand_augment(im_A, im_B)
154-
155152
# Process images
156-
im_A, im_B = self.im_transform_ops((im_A, im_B))
157-
depth_A, depth_B = self.depth_transform_ops(
158-
(depth_A[None, None], depth_B[None, None])
159-
)
153+
try:
154+
if self.use_randaug:
155+
im_A, im_B = self.rand_augment(im_A, im_B)
160156

161-
[im_A, im_B, depth_A, depth_B], t = self.rand_shake(
162-
im_A, im_B, depth_A, depth_B)
163-
K1[:2, 2] += t
164-
K2[:2, 2] += t
165-
166-
im_A, im_B = im_A[None], im_B[None]
167-
if self.random_eraser is not None:
168-
im_A, depth_A = self.random_eraser(im_A, depth_A)
169-
im_B, depth_B = self.random_eraser(im_B, depth_B)
170-
171-
if self.use_horizontal_flip_aug:
172-
if np.random.rand() > 0.5:
173-
im_A, im_B, depth_A, depth_B, K1, K2 = self.horizontal_flip(
174-
im_A, im_B, depth_A, depth_B, K1, K2)
175-
176-
if self.use_single_horizontal_flip_aug:
177-
if np.random.rand() > 0.5:
178-
im_B, depth_B, K2 = self.single_horizontal_flip(
179-
im_B, depth_B, K2)
157+
im_A, im_B = self.im_transform_ops((im_A, im_B))
158+
depth_A, depth_B = self.depth_transform_ops(
159+
(depth_A[None, None], depth_B[None, None])
160+
)
161+
162+
[im_A, im_B, depth_A, depth_B], t = self.rand_shake(
163+
im_A, im_B, depth_A, depth_B)
164+
K1[:2, 2] += t
165+
K2[:2, 2] += t
166+
167+
im_A, im_B = im_A[None], im_B[None]
168+
if self.random_eraser is not None:
169+
im_A, depth_A = self.random_eraser(im_A, depth_A)
170+
im_B, depth_B = self.random_eraser(im_B, depth_B)
171+
172+
if self.use_horizontal_flip_aug:
173+
if np.random.rand() > 0.5:
174+
im_A, im_B, depth_A, depth_B, K1, K2 = self.horizontal_flip(
175+
im_A, im_B, depth_A, depth_B, K1, K2)
176+
177+
if self.use_single_horizontal_flip_aug:
178+
if np.random.rand() > 0.5:
179+
im_B, depth_B, K2 = self.single_horizontal_flip(
180+
im_B, depth_B, K2)
181+
except Exception as e:
182+
print(f"Error in transform ({self.image_paths[idx1]}, {self.image_paths[idx1]}):", e)
183+
return None
180184

181185
if romatch.DEBUG_MODE:
182186
tensor_to_pil(im_A[0], unnormalize=True)\

0 commit comments

Comments (0)