diff --git a/.gitignore b/.gitignore index 9d7352ea..8273e1f9 100644 --- a/.gitignore +++ b/.gitignore @@ -147,3 +147,13 @@ skyseg.onnx # pixi environments .pixi *.egg-info + +slurm_outs/ +logs/ +.vscode/ +annotations/ +output_dir/ +constants.py +wandb/ +/data/ +outputs/ \ No newline at end of file diff --git a/co3d.py b/co3d.py new file mode 100644 index 00000000..88c77a81 --- /dev/null +++ b/co3d.py @@ -0,0 +1,101 @@ +from training.data.dataset_util import * +from pathlib import Path +import torch +import numpy as np +import os.path as osp +import json +import gzip + +def save_ply(points, filename): + import open3d as o3d + if torch.is_tensor(points): + points_visual = points.reshape(-1, 3).cpu().numpy() + else: + points_visual = points.reshape(-1, 3) + pcd = o3d.geometry.PointCloud() + pcd.points = o3d.utility.Vector3dVector(points_visual.astype(np.float64)) + # pcd.colors = o3d.utility.Vector3dVector(points_visual_rgb.astype(np.float64)) + o3d.io.write_point_cloud(filename, pcd, write_ascii=True) + + +def co3d_annotation_to_opencv_pose(frame_data): + p = frame_data['viewpoint']['principal_point'] + f = frame_data['viewpoint']['focal_length'] + h, w = frame_data['image']['size'] + K = np.eye(3) + s = (min(h, w) - 1) / 2 + K[0, 0] = f[0] * (w - 1) / 2 + K[1, 1] = f[1] * (h - 1) / 2 + K[0, 2] = -p[0] * s + (w - 1) / 2 + K[1, 2] = -p[1] * s + (h - 1) / 2 + + R = np.asarray(frame_data['viewpoint']['R']).T # note the transpose here + T = np.asarray(frame_data['viewpoint']['T']) + pose = np.concatenate([R,T[:,None]],1) + # pose = np.diag([-1,-1,1]).astype(np.float32) @ pose # flip the direction of x,y axis + + return pose, K + +def _load_16big_png_depth(depth_png): + with Image.open(depth_png) as depth_pil: + # the image is stored with 16-bit depth but PIL reads it as I (32 bit). 
+ # we cast it to uint16, then reinterpret as float16, then cast to float32 + depth = ( + np.frombuffer(np.array(depth_pil, dtype=np.uint16), dtype=np.float16) + .astype(np.float32) + .reshape((depth_pil.size[1], depth_pil.size[0])) + ) + return depth + +root = Path("/mimer/NOBACKUP/groups/3d-dl/co3d_full/apple") + +frame_file = osp.join(root, "frame_annotations.jgz") + +with gzip.open(frame_file, "r") as fin: + frame_data = json.loads(fin.read()) + +frame_data_processed = {} +for f_data in frame_data: + sequence_name = f_data["sequence_name"] + frame_data_processed.setdefault(sequence_name, {})[f_data["frame_number"]] = f_data + +seq_name = "12_90_489" +seq_data = frame_data_processed[seq_name] + +seq_dir = root / seq_name +images_dir = seq_dir / "images" +frames = sorted([p.name for p in images_dir.iterdir() if p.suffix == ".jpg"]) + + +total_world_points = [] +for i, frame in enumerate(frames[:10]): + frame_data = seq_data[i] + + extrinsic, intrinsic = co3d_annotation_to_opencv_pose(frame_data) + + filepath= frame_data['image']['path'] + image_path = osp.join("/mimer/NOBACKUP/groups/3d-dl/co3d_full", filepath) + depth_path = image_path.replace("/images", "/depths") + ".geometric.png" + + # extrinsic = np.vstack([extrinsic, [0, 0, 0, 1]]) + # extrinsic = np.linalg.inv(extrinsic) + + extri_opencv = np.array(extrinsic[:3].tolist()) + intri_opencv = np.array(intrinsic.tolist()) + + depth_map = _load_16big_png_depth(depth_path) + + depth_map = cv2.resize(depth_map, (1024//4, 1896//4), interpolation=cv2.INTER_NEAREST) + + world_coords_points, cam_coords_points, point_mask = ( + depth_to_world_coords_points(depth_map, extri_opencv, intri_opencv) + ) + total_world_points.append(world_coords_points) + +total_world_points = np.concatenate(total_world_points, axis=0) +print('Total points: ', total_world_points.shape) + +save_ply( + total_world_points.reshape(-1, 3), + f"yum.ply" +) diff --git a/evaluation/configs/eval.yaml b/evaluation/configs/eval.yaml new file mode 
100644 index 00000000..4a2f7f6f --- /dev/null +++ b/evaluation/configs/eval.yaml @@ -0,0 +1,55 @@ +# @package _global_ + + +name: mv_recon +work_dir: ${hydra:runtime.cwd} +output_dir: ${work_dir}/outputs/${name} + +hydra: + run: + dir: ${output_dir}/hydra/${now:%Y-%m-%d_%H-%M-%S} + +debug: false + +pi3: + pretrained_model_name_or_path: yyfz233/Pi3 + # pretrained_model_name_or_path: checkpoints/Pi3 + +eval_datasets: + # - DTU + - ETH3D + +no_crop: True +load_img_size: 518 + +device: cuda + +verbose: False + +seed: 42 + +save_suffix: null + +data: + DTU: + cfg: + _target_: evaluation.datasets.dtu.DTU + split: test + DTU_DIR: /mimer/NOBACKUP/groups/3d-dl/dtu_test_mvsnet_release + load_img_size: ${load_img_size} + cache_file: data/dataset_cache/dtu_mv_recon_cache.npy + sampling: + strategy: stride + kf_every: 5 + seq_id_map: evaluation/datasets/seq-id-maps/DTU_mv-recon_seq-id-map-kf5.json + + ETH3D: + cfg: + _target_: evaluation.datasets.eth3d.ETH3D + ETH3D_DIR: /mimer/NOBACKUP/groups/3d-dl/eth3d + load_img_size: ${load_img_size} + cache_file: data/dataset_cache/eth3d_mv_recon_cache.npy + sampling: + strategy: stride + kf_every: 5 + seq_id_map: evaluation/datasets/seq-id-maps/ETH3D_mv-recon_seq-id-map-kf5.json \ No newline at end of file diff --git a/evaluation/datasets/dtu.py b/evaluation/datasets/dtu.py new file mode 100644 index 00000000..47075a9b --- /dev/null +++ b/evaluation/datasets/dtu.py @@ -0,0 +1,231 @@ +import os.path as osp +import os +import numpy as np +import torch +import cv2 +import torchvision.transforms as tvf + +from typing import Optional, Union, List +from PIL import Image, ImageFile +from torch.utils.data import Dataset +from tqdm import tqdm +from evaluation.utils.geometry import unproject_depth_map_to_point_map +from evaluation.utils.cropping import resize_image, resize_image_depth_and_intrinsic + +Image.MAX_IMAGE_PIXELS = None +ImageFile.LOAD_TRUNCATED_IMAGES = True +to_tensor = tvf.ToTensor() + +def load_cam_mvsnet(words, interval_scale=1): 
+ """read camera txt file""" + cam = np.zeros((2, 4, 4)) + # words = file.read().split() + words = words.split() + # read extrinsic + for i in range(0, 4): + for j in range(0, 4): + extrinsic_index = 4 * i + j + 1 + cam[0][i][j] = words[extrinsic_index] + + # read intrinsic + for i in range(0, 3): + for j in range(0, 3): + intrinsic_index = 3 * i + j + 18 + cam[1][i][j] = words[intrinsic_index] + + if len(words) == 29: + cam[1][3][0] = words[27] + cam[1][3][1] = float(words[28]) * interval_scale + cam[1][3][2] = 192 + cam[1][3][3] = cam[1][3][0] + cam[1][3][1] * cam[1][3][2] + elif len(words) == 30: + cam[1][3][0] = words[27] + cam[1][3][1] = float(words[28]) * interval_scale + cam[1][3][2] = words[29] + cam[1][3][3] = cam[1][3][0] + cam[1][3][1] * cam[1][3][2] + elif len(words) == 31: + cam[1][3][0] = words[27] + cam[1][3][1] = float(words[28]) * interval_scale + cam[1][3][2] = words[29] + cam[1][3][3] = words[30] + else: + cam[1][3][0] = 0 + cam[1][3][1] = 0 + cam[1][3][2] = 0 + cam[1][3][3] = 0 + + extrinsic = cam[0].astype(np.float32) + intrinsic = cam[1].astype(np.float32) + + return intrinsic, extrinsic + +class DTU(Dataset): + def __init__( + self, + DTU_DIR: str, + split: str = "test", + load_img_size: int = 518, + cache_file: str = "data/dataset_cache/dtu_mv_recon_cache.npy", + ): + + self.DTU_DIR = DTU_DIR + if DTU_DIR == None: + raise NotImplementedError + print(f"DTU_DIR is {DTU_DIR}") + + self.split = split + assert split == 'test', "Only test set preprocessed." 
+ if self.split == 'train': + seq_numbers = [ + 2, 6, 7, 8, 14, 16, 18, 19, 20, 22, 30, 31, 36, 39, 41, 42, 44, + 45, 46, 47, 50, 51, 52, 53, 55, 57, 58, 60, 61, 63, 64, 65, 68, 69, 70, 71, 72, + 74, 76, 83, 84, 85, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, + 101, 102, 103, 104, 105, 107, 108, 109, 111, 112, 113, 115, 116, 119, 120, + 121, 122, 123, 124, 125, 126, 127, 128 + ] + elif self.split == 'valid': + seq_numbers = [3, 5, 17, 21, 28, 35, 37, 38, 40, 43, 56, 59, 66, 67, 82, 86, 106, 117] + elif self.split == 'test': + seq_numbers = [1, 4, 9, 10, 11, 12, 13, 15, 23, 24, 29, 32, 33, 34, 48, 49, 62, 75, 77, 110, 114, 118] + else: + raise ValueError(f"Invalid split: {self.split}. Must be 'train', 'valid' or 'test'.") + + if osp.exists(cache_file): + print(f"[DTU] Loading from cache file: {cache_file}") + self.metadata = np.load(cache_file, allow_pickle=True).item() + self.sequence_list = sorted(list(self.metadata.keys())) + else: + print(f"[DTU] Cache file not found, loading from {DTU_DIR}") + + self.sequence_list = [f"scan{num}" for num in seq_numbers] + + self.metadata = {} + for seq in tqdm(self.sequence_list): + rgb_root = osp.join(DTU_DIR, seq, 'images') + + all_imgs = sorted([d for d in os.listdir(rgb_root) if d.endswith('.jpg')]) + + all_img_numbers = [int(imgname.split('.')[0]) for imgname in all_imgs] + if all_img_numbers[0] != 0 or all_img_numbers[-1] + 1 != len(all_img_numbers): + raise ValueError(f"Image number not regular, with first image {all_imgs[0]} and last image {all_imgs[-1]} but number of images {len(all_imgs)}") + + self.metadata[seq] = len(all_imgs) + + np.save(cache_file, self.metadata) + + self.load_img_size = load_img_size + print(f"[DTU] Data size: {len(self)}") + + def __len__(self): + return len(self.sequence_list) + + def get_seq_framenum(self, index: Optional[int] = None, sequence_name: Optional[str] = None): + if sequence_name is None: + if index is None: + raise ValueError("Please specify either index or 
sequence_name") + sequence_name = self.sequence_list[index] + return self.metadata[sequence_name] + + def __getitem__(self, idx_N): + """Fetch item by index and a dynamic variable n_per_seq.""" + + # Different from most pytorch datasets, + # here we not only get index, but also a dynamic variable n_per_seq + # supported by DynamicBatchSampler + + index, n_per_seq = idx_N + sequence_name = self.sequence_list[index] + metadata = self.metadata[sequence_name] + ids = np.random.choice(len(metadata), n_per_seq, replace=False) + return self.get_data(index=index, ids=ids) + + def get_data( + self, + index: Optional[int] = None, + sequence_name: Optional[str] = None, + ids: Union[List[int], np.ndarray, None] = None, + ): + if sequence_name is None: + if index is None: + raise ValueError("Please specify either index or sequence_name") + sequence_name: str = self.sequence_list[index] + seq_len: int = self.metadata[sequence_name] + + if ids is None: + ids = np.arange(seq_len).tolist() + elif isinstance(ids, np.ndarray): + assert ids.ndim == 1, f"ids should be a 1D array, but got {ids.ndim}D" + ids = ids.tolist() + + image_path = osp.join(self.DTU_DIR, sequence_name, "images") + depth_path = osp.join(self.DTU_DIR, sequence_name, "depths") + mask_path = osp.join(self.DTU_DIR, sequence_name, "binary_masks") + cam_path = osp.join(self.DTU_DIR, sequence_name, "cams") + + image_paths: list = [""] * len(ids) + images: list = [0] * len(ids) + depths: list = [0] * len(ids) + extrinsics: np.ndarray = np.zeros((len(ids), 3, 4)) + intrinsics: np.ndarray = np.zeros((len(ids), 3, 3)) + + for id_index, id in enumerate(ids): + impath = osp.join(image_path, f"{id:08d}.jpg") + depthpath = osp.join(depth_path, f"{id:08d}.npy") + campath = osp.join(cam_path, f"{id:08d}_cam.txt") + maskpath = osp.join(mask_path, f"{id:08d}.png") + + rgb_image: Image.Image = Image.open(impath) + depthmap: np.ndarray = np.load(depthpath) + rgb_image: Image.Image = resize_image(rgb_image, (depthmap.shape[1], 
depthmap.shape[0])) + + depthmap = np.nan_to_num(depthmap.astype(np.float32), 0.0) + + mask = cv2.imread(maskpath, cv2.IMREAD_UNCHANGED) / 255.0 + mask = mask.astype(np.float32) + + mask[mask > 0.5] = 1.0 + mask[mask < 0.5] = 0.0 + + mask = cv2.resize( + mask, + (depthmap.shape[1], depthmap.shape[0]), + interpolation=cv2.INTER_NEAREST, + ) + kernel = np.ones((10, 10), np.uint8) # Define the erosion kernel + mask = cv2.erode(mask, kernel, iterations=1) + depthmap = depthmap * mask + + cur_intrinsics, extrinsic = load_cam_mvsnet(open(campath, "r").read()) + intrinsic = cur_intrinsics[:3, :3] + + rgb_image, depthmap, intrinsic = resize_image_depth_and_intrinsic( + image=rgb_image, + depth_map=depthmap, + intrinsic=intrinsic, + output_width=self.load_img_size, # finally width = 518, height = 388 + ) + + image_paths[id_index] = impath + images[id_index] = to_tensor(rgb_image) + depths[id_index] = depthmap + intrinsics[id_index] = intrinsic + extrinsics[id_index] = extrinsic[:3, :] + + depths = np.array(depths) # (S, H, W) + pointclouds = unproject_depth_map_to_point_map( + depth_map=depths[..., None], + intrinsics_cam=intrinsics, + extrinsics_cam=extrinsics + ) + + batch = {"seq_id": sequence_name, "seq_len": seq_len, "ind": torch.tensor(ids)} + batch['image_paths'] = image_paths # list of str + batch['images'] = torch.stack(images, dim=0) + batch['pointclouds'] = pointclouds # in numpy + batch['valid_mask'] = depths > 1e-4 + # batch["extrs"] = extrinsics + # batch["intrs"] = intrinsics + # batch["w"] = metadata["w"] + # batch["h"] = metadata["h"] + + return batch \ No newline at end of file diff --git a/evaluation/datasets/eth3d.py b/evaluation/datasets/eth3d.py new file mode 100644 index 00000000..4a82831f --- /dev/null +++ b/evaluation/datasets/eth3d.py @@ -0,0 +1,147 @@ +import os.path as osp +import os +import numpy as np +import torch +import torchvision.transforms as tvf + +from typing import Optional, Union, List +from PIL import Image, ImageFile +from 
torch.utils.data import Dataset +from evaluation.utils.geometry import unproject_depth_map_to_point_map +from evaluation.utils.cropping import resize_image_depth_and_intrinsic + +Image.MAX_IMAGE_PIXELS = None +ImageFile.LOAD_TRUNCATED_IMAGES = True +to_tensor = tvf.ToTensor() + +class ETH3D(Dataset): + def __init__( + self, + ETH3D_DIR: str, + load_img_size: int = 518, + cache_file: str = "data/dataset_cache/eth3d_mv_recon_cache.npy", + ): + + self.ETH3D_DIR = ETH3D_DIR + if ETH3D_DIR == None: + raise NotImplementedError + print(f"ETH3D_DIR is {ETH3D_DIR}") + + if osp.exists(cache_file): + print(f"[ETH3D] Loading from cache file: {cache_file}") + self.metadata = np.load(cache_file, allow_pickle=True).item() + self.sequence_list = sorted(self.metadata.keys()) + else: + print(f"[ETH3D] Cache file not found, loading from {ETH3D_DIR}") + + self.sequence_list = [seq for seq in os.listdir(ETH3D_DIR) if os.path.isdir(osp.join(ETH3D_DIR, seq))] + self.sequence_list = sorted(self.sequence_list) + + self.metadata = {} + for seq in self.sequence_list: + seq_image_root = osp.join(ETH3D_DIR, seq, 'images', 'custom_undistorted') + image_list = [imgname for imgname in os.listdir(seq_image_root) if imgname.endswith('.JPG')] + image_list = sorted(image_list) + + self.metadata[seq] = image_list + + np.save(cache_file, self.metadata) + + self.load_img_size = load_img_size + print(f"[ETH3D] Data size: {len(self)}") + + def __len__(self): + return len(self.sequence_list) + + def get_seq_framenum(self, index: Optional[int] = None, sequence_name: Optional[str] = None): + if sequence_name is None: + if index is None: + raise ValueError("Please specify either index or sequence_name") + sequence_name = self.sequence_list[index] + return len(self.metadata[sequence_name]) + + def __getitem__(self, idx_N): + """Fetch item by index and a dynamic variable n_per_seq.""" + + # Different from most pytorch datasets, + # here we not only get index, but also a dynamic variable n_per_seq + # supported 
by DynamicBatchSampler + + index, n_per_seq = idx_N + sequence_name = self.sequence_list[index] + metadata = self.metadata[sequence_name] + ids = np.random.choice(len(metadata), n_per_seq, replace=False) + return self.get_data(index=index, ids=ids) + + def get_data( + self, + index: Optional[int] = None, + sequence_name: Optional[str] = None, + ids: Union[List[int], np.ndarray, None] = None, + ): + if sequence_name is None: + if index is None: + raise ValueError("Please specify either index or sequence_name") + sequence_name: str = self.sequence_list[index] + image_list: list = self.metadata[sequence_name] + seq_len: int = len(image_list) + + if ids is None: + ids = np.arange(seq_len).tolist() + elif isinstance(ids, np.ndarray): + assert ids.ndim == 1, f"ids should be a 1D array, but got {ids.ndim}D" + ids = ids.tolist() + + image_paths: list = [""] * len(ids) + images: list = [0] * len(ids) + depths: list = [0] * len(ids) + extrinsics: np.ndarray = np.zeros((len(ids), 3, 4)) + intrinsics: np.ndarray = np.zeros((len(ids), 3, 3)) + + for id_index, id in enumerate(ids): + img_name = image_list[id] + impath = os.path.join(self.ETH3D_DIR, sequence_name, 'images', 'custom_undistorted', img_name) + depthpath = os.path.join(self.ETH3D_DIR, sequence_name, 'ground_truth_depth', 'custom_undistorted', img_name) + cam_path = os.path.join(self.ETH3D_DIR, sequence_name, 'custom_undistorted_cam', img_name.replace('JPG', 'npz')) + + cam = np.load(cam_path) + intrinsic = cam['intrinsics'] + extrinsic = cam['extrinsics'] + + # load image and depth + rgb_image: Image.Image = Image.open(impath) + width, height = rgb_image.size + depthmap: np.ndarray = np.fromfile(depthpath, dtype=np.float32).reshape(height, width) + depthmap[~np.isfinite(depthmap)] = -1 + + rgb_image, depthmap, intrinsic = resize_image_depth_and_intrinsic( + image=rgb_image, + depth_map=depthmap, + intrinsic=intrinsic, + output_width=self.load_img_size, # finally width = 518, height = 388 + ) + + image_paths[id_index] 
= impath + images[id_index] = to_tensor(rgb_image) + depths[id_index] = depthmap + intrinsics[id_index] = intrinsic + extrinsics[id_index] = extrinsic[:3, :] + + depths = np.array(depths) # (S, H, W) + pointclouds = unproject_depth_map_to_point_map( + depth_map=depths[..., None], + intrinsics_cam=intrinsics, + extrinsics_cam=extrinsics + ) + + batch = {"seq_id": sequence_name, "seq_len": seq_len, "ind": torch.tensor(ids)} + batch['image_paths'] = image_paths # list of str + batch['images'] = torch.stack(images, dim=0) + batch['pointclouds'] = pointclouds # in numpy + batch['valid_mask'] = depths > 1e-4 + # batch["extrs"] = extrinsics + # batch["intrs"] = intrinsics + # batch["w"] = metadata["w"] + # batch["h"] = metadata["h"] + + return batch \ No newline at end of file diff --git a/evaluation/datasets/seq-id-maps/DTU_mv-recon_seq-id-map-kf5.json b/evaluation/datasets/seq-id-maps/DTU_mv-recon_seq-id-map-kf5.json new file mode 100644 index 00000000..12f53a28 --- /dev/null +++ b/evaluation/datasets/seq-id-maps/DTU_mv-recon_seq-id-map-kf5.json @@ -0,0 +1,266 @@ +{ + "scan1": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan10": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan11": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan110": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan114": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan118": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan12": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan13": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan15": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan23": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan24": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan29": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + 
"scan32": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan33": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan34": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan4": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan48": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan49": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan62": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan75": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan77": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan9": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ] +} \ No newline at end of file diff --git a/evaluation/datasets/seq-id-maps/ETH3D_mv-recon_seq-id-map-kf5.json b/evaluation/datasets/seq-id-maps/ETH3D_mv-recon_seq-id-map-kf5.json new file mode 100644 index 00000000..02b19b58 --- /dev/null +++ b/evaluation/datasets/seq-id-maps/ETH3D_mv-recon_seq-id-map-kf5.json @@ -0,0 +1,125 @@ +{ + "courtyard": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35 + ], + "delivery_area": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40 + ], + "electro": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40 + ], + "facade": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45, + 50, + 55, + 60, + 65, + 70, + 75 + ], + "kicker": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30 + ], + "meadow": [ + 0, + 5, + 10 + ], + "office": [ + 0, + 5, + 10, + 15, + 20, + 25 + ], + "pipes": [ + 0, + 5, + 10 + ], + "playground": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35 + ], + "relief": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30 + ], + "relief_2": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30 + ], + "terrace": [ + 0, + 5, + 10, + 15, + 20 + ], + "terrains": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40 + ] +} \ No newline at end of file diff --git a/evaluation/pointcloud.py b/evaluation/pointcloud.py new 
file mode 100644 index 00000000..5be18ac3 --- /dev/null +++ b/evaluation/pointcloud.py @@ -0,0 +1,203 @@ +# https://github.com/facebookresearch/vggt/issues/208 +# https://github.com/doubleZ0108/GeoMVSNet +# https://github.com/yyfz/Pi3/blob/evaluation/mv_recon/eval.py + +import os +import json +import torch +import numpy as np +import open3d as o3d +import os.path as osp +import hydra + +from omegaconf import DictConfig +from evaluation.utils.interfaces import infer_mv_pointclouds +from evaluation.utils.mv_recon import umeyama, accuracy, completion +from evaluation.utils.messages import set_default_arg, write_csv +from evaluation.utils.vis_utils import save_image_grid_auto +from evaluation.utils import load_model + + +@hydra.main(version_base="1.2", config_path="./configs", config_name="eval") +def main(hydra_cfg: DictConfig): + + all_eval_datasets: DictConfig = hydra_cfg.eval_datasets # see configs/evaluation/mv_recon.yaml + all_data_info: DictConfig = hydra_cfg.data # see configs/data + pretrained_model_name_or_path: str = hydra_cfg.pi3.pretrained_model_name_or_path # see configs/evaluation/relpose-angular.yaml + + # 0. 
create model + # model = VGGT.from_pretrained(pretrained_model_name_or_path).to(hydra_cfg.device).eval() + model = load_model( + device=hydra_cfg.device, + # model_path="/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/pretrained_models/model_tracker_fixed_e20.pt", + # big_model=True, + # model_path="/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/training/logs/mum_exp004/ckpts/checkpoint.pt", + # model_path="/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/training/logs/dinov3_exp004/ckpts/checkpoint.pt", + model_path="/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/training/logs/crocov2_exp001/ckpts/checkpoint.pt", + big_model=False, + # encoder="dinov3" + # encoder="mum" + encoder="crocov2" + ) + print(f"Loaded VGGT from {pretrained_model_name_or_path}") + + for idx_dataset, dataset_name in enumerate(all_eval_datasets, start=1): + # 1.1 look up dataset config from configs/data, decide the dataset name, and load the dataset + if dataset_name not in all_data_info: + raise ValueError(f"Unknown dataset in global data information: {dataset_name}") + dataset_info = all_data_info[dataset_name] + dataset = hydra.utils.instantiate(dataset_info.cfg) + + # 1.2 ready for output directory & metrics + output_root = osp.join(hydra_cfg.output_dir, dataset_name) + os.makedirs(output_root, exist_ok=True) + all_data_dict = { + "Acc-mean": 0.0, "Acc-med": 0.0, + "Comp-mean": 0.0, "Comp-med": 0.0, + "NC-mean": 0.0, "NC-med": 0.0, + "NC1-mean": 0.0, "NC1-med": 0.0, + "NC2-mean": 0.0, "NC2-med": 0.0, + } + + # 1.3 load pre-sampled seq-id-map + print(f"[{idx_dataset}/{len(all_eval_datasets)}] Evaluating Multi-View Pointcloud Reconstruction of Pi3 on dataset {dataset_name}...") + sample_config: DictConfig = dataset_info.sampling + print(f"Sampling strategy: {sample_config.strategy}") + with open(dataset_info.seq_id_map, "r") as f: + seq_id_map: dict = json.load(f) + + if osp.exists(osp.join(output_root, "_all_samples.csv")): + os.remove(osp.join(output_root, 
"_all_samples.csv")) # remove old csv file + for seq_idx, (seq_name, ids) in enumerate(seq_id_map.items(), start=1): + # 2. load data, choose specific ids of a sequence + data = dataset.get_data(sequence_name=seq_name, ids=ids) + filelist: list = data['image_paths'] # [str] * N + images: torch.Tensor = data['images'] # (N, 3, H, W) + gt_pts: np.ndarray = data['pointclouds'] # (N, H, W, 3) + valid_mask: np.ndarray = data['valid_mask'] # (N, H, W) + + # 3. real inference, predicted pointcloud aligned to ground truth (data_h, data_w) + data_h, data_w = images.shape[-2:] + pred_pts: np.ndarray = infer_mv_pointclouds(filelist, model, hydra_cfg, (data_h, data_w)) # (N, H, W, 3) + assert pred_pts.shape == gt_pts.shape, f"Predicted points shape {pred_pts.shape} does not match ground truth shape {gt_pts.shape}." + + # 4. save input images + seq_name = seq_name.replace("/", "-") + save_image_grid_auto(images, osp.join(output_root, f"{seq_name}.png")) + colors = images.permute(0, 2, 3, 1)[valid_mask].cpu().numpy().reshape(-1, 3) + + # 5. coarse align + c, R, t = umeyama(pred_pts[valid_mask].T, gt_pts[valid_mask].T) + pred_pts = c * np.einsum('nhwj, ij -> nhwi', pred_pts, R) + t.T + + # 6. filter invalid points + pred_pts = pred_pts[valid_mask].reshape(-1, 3) + gt_pts = gt_pts[valid_mask].reshape(-1, 3) + + pred_pts = np.ascontiguousarray(pred_pts, dtype=np.float64) + gt_pts = np.ascontiguousarray(gt_pts, dtype=np.float64) + colors = np.ascontiguousarray(colors, dtype=np.float64) + + # 7. save predicted & ground truth point clouds + pcd = o3d.geometry.PointCloud() + pcd.points = o3d.utility.Vector3dVector(pred_pts) + pcd.colors = o3d.utility.Vector3dVector(colors) + o3d.io.write_point_cloud(osp.join(output_root, f"{seq_name}-pred.ply"), pcd) + + pcd_gt = o3d.geometry.PointCloud() + pcd_gt.points = o3d.utility.Vector3dVector(gt_pts) + pcd_gt.colors = o3d.utility.Vector3dVector(colors) + o3d.io.write_point_cloud(osp.join(output_root, f"{seq_name}-gt.ply"), pcd_gt) + + # 8. 
ICP align refinement + if "DTU" in dataset_name: + threshold = 100 + else: + threshold = 0.1 + + trans_init = np.eye(4) + reg_p2p = o3d.pipelines.registration.registration_icp( + pcd, + pcd_gt, + threshold, + trans_init, + o3d.pipelines.registration.TransformationEstimationPointToPoint(), + ) + + transformation = reg_p2p.transformation + pcd = pcd.transform(transformation) + + # 9. estimate normals + pcd.estimate_normals() + pcd_gt.estimate_normals() + pred_normal = np.asarray(pcd.normals) + gt_normal = np.asarray(pcd_gt.normals) + + # o3d.io.write_point_cloud( + # os.path.join( + # save_path, f"{seq.replace('/', '_')}-mask-icp.ply" + # ), + # pcd, + # ) + + # 10. compute metrics + acc, acc_med, nc1, nc1_med = accuracy( + pcd_gt.points, pcd.points, gt_normal, pred_normal + ) + comp, comp_med, nc2, nc2_med = completion( + pcd_gt.points, pcd.points, gt_normal, pred_normal + ) + print( + f"[{dataset_name} {seq_idx}/{len(dataset.sequence_list)}] Seq: {seq_name}, Acc: {acc}, Comp: {comp}, NC1: {nc1}, NC2: {nc2} - Acc_med: {acc_med}, Compc_med: {comp_med}, NC1c_med: {nc1_med}, NC2c_med: {nc2_med}" + ) + + # 11. 
save metrics to csv + write_csv(osp.join(output_root, f"_all_samples.csv"), { + "seq": seq_name, + "Acc-mean": acc, + "Acc-med": acc_med, + "Comp-mean": comp, + "Comp-med": comp_med, + "NC1-mean": nc1, + "NC1-med": nc1_med, + "NC2-mean": nc2, + "NC2-med": nc2_med, + }) + all_data_dict["Acc-mean"] += acc + all_data_dict["Acc-med"] += acc_med + all_data_dict["Comp-mean"] += comp + all_data_dict["Comp-med"] += comp_med + all_data_dict["NC-mean"] += (nc1 + nc2) / 2 + all_data_dict["NC-med"] += (nc1_med + nc2_med) / 2 + all_data_dict["NC1-mean"] += nc1 + all_data_dict["NC1-med"] += nc1_med + all_data_dict["NC2-mean"] += nc2 + all_data_dict["NC2-med"] += nc2_med + + # release cuda memory + torch.cuda.empty_cache() + + num_samples = len(dataset) + metric_dict = { + metric: value / num_samples + for metric, value in all_data_dict.items() + if metric != "model" + } + + statistics_file = osp.join(hydra_cfg.output_dir, f"{dataset_name}-metric") # + ".csv" + if getattr(hydra_cfg, "save_suffix", None) is not None: + statistics_file += f"-{hydra_cfg.save_suffix}" + statistics_file += ".csv" + print(metric_dict) + write_csv(statistics_file, metric_dict) + + del model + torch.cuda.empty_cache() + print(f"Finished evaluating Pi3 on all datasets.") + + +if __name__ == "__main__": + # set_default_arg("evaluation", "mv_recon") + os.environ["HYDRA_FULL_ERROR"] = '1' + with torch.no_grad(): + main() \ No newline at end of file diff --git a/evaluation/preprocess/download_re10k.py b/evaluation/preprocess/download_re10k.py new file mode 100644 index 00000000..fcda03d0 --- /dev/null +++ b/evaluation/preprocess/download_re10k.py @@ -0,0 +1,234 @@ +""" +References: +[cashiwamochi/RealEstate10K_Downloader](https://github.com/cashiwamochi/RealEstate10K_Downloader/blob/master/generate_dataset.py) +The scripts provided here are for reference only. Please ensure you have obtained the necessary licenses from the original dataset providers before proceeding. 
+ +datasets/sequences/re10k_test_1800.txt: test sequences chosen by [PoseDiffusion](https://github.com/facebookresearch/PoseDiffusion/blob/main/pose_diffusion/datasets/re10k_test_1800.txt) +However, some of the youtube videos are not available now, so we evaluate [Pi3](https://github.com/yyfz/Pi3) on datasets/sequences/re10k_test_1719.txt + +You may run into 403 error when downloading youtube videos, please refer to original pytube/pytubefix repo for help or use other downloader like yt-dlp. +However, this script works for us when doing evaluation for [Pi3](https://github.com/yyfz/Pi3). + +For resolutions, most sequences are (640, 360), with a few exceptions: +3b0b55657925fb34: (640, 272) +3e034bde9426ae9f: (640, 338) +2c2cfc0ac780a3aa: (640, 338) +""" + +import os +import os.path as osp +import glob + +from pytubefix import YouTube + +class Data: + def __init__(self, url, seqname, list_timestamps): + self.url = url + self.list_seqnames = [] + self.list_list_timestamps = [] + + self.list_seqnames.append(seqname) + self.list_list_timestamps.append(list_timestamps) + + def add(self, seqname, list_timestamps): + self.list_seqnames.append(seqname) + self.list_list_timestamps.append(list_timestamps) + + def __len__(self): + return len(self.list_seqnames) + + +def process(data, seq_id, videoname, output_root): + seqname = data.list_seqnames[seq_id] + image_dir = os.path.join(output_root, seqname, "images") + if not os.path.exists(image_dir): + os.makedirs(image_dir) + else: + print("[INFO] Something Wrong, stop process") + return True + + list_str_timestamps = [] + for timestamp in data.list_list_timestamps[seq_id]: + timestamp = int(timestamp/1000) + str_hour = str(int(timestamp/3600000)).zfill(2) + str_min = str(int(int(timestamp%3600000)/60000)).zfill(2) + str_sec = str(int(int(int(timestamp%3600000)%60000)/1000)).zfill(2) + str_mill = str(int(int(int(timestamp%3600000)%60000)%1000)).zfill(3) + _str_timestamp = str_hour+":"+str_min+":"+str_sec+"."+str_mill + 
list_str_timestamps.append(_str_timestamp) + + # extract frames from a video + for idx, str_timestamp in enumerate(list_str_timestamps): + command = 'ffmpeg -ss '+str_timestamp+' -i '+videoname+' -vframes 1 -f image2 '+image_dir+'/'+str(data.list_list_timestamps[seq_id][idx])+'.png' + # print("current command is {}".format(command)) + os.system(command) + + png_files = sorted( + glob.glob(os.path.join(image_dir, "*.png")), + key=lambda x: int(os.path.splitext(os.path.basename(x))[0]) + ) + + for idx, old_path in enumerate(png_files): + new_name = f"{idx:04d}.png" + new_path = os.path.join(image_dir, new_name) + os.rename(old_path, new_path) + print(f"Renamed: {os.path.basename(old_path)} -> {new_name}") + + return False + +def wrap_process(list_args): + return process(*list_args) + +class DataDownloader: + def __init__ ( + self, + meta_root: str, + output_root: str, + sequence_list: list, # end with .txt + mode: str = "test", + ): + print("[Re10k Downloader] Loading data list ... ") + self.meta_root = meta_root + all_seqnames = glob.glob(osp.join(meta_root, mode, '*.txt')) + all_seqnames = sorted([osp.basename(x) for x in all_seqnames]) + all_seqnames = set(all_seqnames) + + the_other_mode = "train" if mode == "test" else "test" + assert mode == "test", "Currently only support test mode, please set mode to 'test'" + all_seq_exists = True + seq_not_exists = [] + all_other_seqnames = {} + for seqname in sequence_list: + if seqname not in all_seqnames: + if all_seq_exists: + all_other_seqnames = sorted(glob.glob(osp.join(meta_root, the_other_mode, '*.txt'))) + all_other_seqnames = set(all_other_seqnames) + + if seqname not in all_other_seqnames: + print(f"[Error] {seqname} not in bote train and test meta") + else: + print(f"[Warning] {seqname} not in {mode} meta, but in {the_other_mode} meta") + seq_not_exists.append(seqname) + all_seq_exists = False + if not all_seq_exists: + print("---------------------------------------------") + print(seq_not_exists) + raise 
ValueError(f"{mode} meta not exists, please check the path") + print(f"[Re10k Downloader] {len(sequence_list)} sequences are to download in {mode} mode") + + self.output_root = output_root + os.makedirs(self.output_root, exist_ok=True) + self.mode = mode + # self.sequence_list = sequence_list + + self.isDone = False + + self.list_data = [] + for txt_file in sequence_list: + seq_name = txt_file.split('.')[0] + if osp.exists(osp.join(output_root, seq_name)): + print(f"[Re10k Downloader] {seq_name} already exists, skip") + continue + + # extract info from txt + txt_path = osp.join(self.meta_root, self.mode, txt_file) + with open(txt_path, "r") as seq_file: + lines = seq_file.readlines() + youtube_url = "" + list_timestamps= [] + for idx, line in enumerate(lines): + if idx == 0: + youtube_url = line.strip() + else: + timestamp = int(line.split(' ')[0]) + list_timestamps.append(timestamp) + + isRegistered = False + for i in range(len(self.list_data)): + if youtube_url == self.list_data[i].url: + isRegistered = True + self.list_data[i].add(seq_name, list_timestamps) + else: + pass + + if not isRegistered: + self.list_data.append(Data(youtube_url, seq_name, list_timestamps)) + + print("[Re10k Downloader] {} movies are used in {} mode".format(len(self.list_data), self.mode)) + + + def Run(self, tmp_dir): + print("[Re10k Downloader] Start downloading {} movies".format(len(self.list_data))) + + os.makedirs(tmp_dir, exist_ok=True) + for global_count, data in enumerate(self.list_data): + print("[Re10k Downloader] Downloading {} ".format(data.url)) + try : + # sometimes this fails because of known issues of pytube and unknown factors + yt = YouTube(data.url) + stream = yt.streams.first() + stream.download(tmp_dir, data.url.split("=")[-1]) + except : + failure_log = open(osp.join(self.output_root, 'failed_videos.txt'), 'a') + for seqname in data.list_seqnames: + failure_log.writelines(seqname + '\n') + failure_log.close() + continue + + videoname = osp.join(tmp_dir, 
data.url.split("=")[-1]) + if len(data) == 1: # len(data) is len(data.list_seqnames) + process(data, 0, videoname, self.output_root) + else: + for seq_id in range(len(data)): + process(data, seq_id, videoname, self.output_root) + print("Process {} done".format(seq_id)) + + # remove videos + command = "rm " + videoname + os.system(command) + + if self.isDone: + return False + + return True + + def Show(self): + print("########################################") + global_count = 0 + for data in self.list_data: + print(" URL : {}".format(data.url)) + for idx in range(len(data)): + print(" SEQ_{} : {}".format(idx, data.list_seqnames[idx])) + print(" LEN_{} : {}".format(idx, len(data.list_list_timestamps[idx]))) + global_count = global_count + 1 + print("----------------------------------------") + + print("TOTAL : {} sequnces".format(global_count)) + +if __name__ == "__main__": + # setup_debug(True, 10033) + MODE = "test" + RE10K_METAROOT = "data/re10k/metadata" + OUTPUT_ROOT = "data/re10k" + SEQUENCE_LIST_FILE = "evaluation/preprocess/re10k_test_1800.txt" + TMP_DIR = osp.join(OUTPUT_ROOT, "tmp") + + with open(SEQUENCE_LIST_FILE, "r") as f: + sequence_list = f.read().splitlines() + for idx, seq in enumerate(sequence_list): + sequence_list[idx] = seq + '.txt' if seq[-4:] != '.txt' else seq + + downloader = DataDownloader( + meta_root = RE10K_METAROOT, + output_root = OUTPUT_ROOT, + sequence_list = sequence_list, + mode = MODE, + ) + + downloader.Show() + isOK = downloader.Run(tmp_dir=TMP_DIR) + + if isOK: + print("Done!") + else: + print("Failed") \ No newline at end of file diff --git a/evaluation/preprocess/prepare_re10k.py b/evaluation/preprocess/prepare_re10k.py new file mode 100644 index 00000000..509217a1 --- /dev/null +++ b/evaluation/preprocess/prepare_re10k.py @@ -0,0 +1,90 @@ +import os.path as osp +import json +from typing import List, Tuple +from PIL import Image +from tqdm import tqdm +import gzip +import numpy as np + +def load_seq_cameras(example_path: 
import gzip
import json
import os.path as osp
from typing import List, Tuple


def load_seq_cameras(example_path: str) -> Tuple[List[List[float]], List[List[List[float]]]]:
    """Parse one RealEstate10K per-sequence metadata txt file.

    Each line after the first (the video URL) is:
        timestamp fx fy cx cy k1 k2 e00 e01 ... e23
    where fx/fy/cx/cy are normalized intrinsics and the final 12 values are a
    row-major 3x4 world-to-camera extrinsic matrix
    (see https://google.github.io/realestate10k/download.html).

    Returns:
        (intrinsic_list, extrinsic_list) with one entry per frame:
        intrinsics as [fx, fy, cx, cy], extrinsics as a 3x4 nested list.
    """
    with open(example_path, "r") as f:
        lines = f.read().splitlines()

    intrinsic_list = []
    extrinsic_list = []
    for line in lines[1:]:
        _timestamp, *camera = line.split(" ")
        camera = [float(param) for param in camera]
        intrinsic = camera[:4]  # fx, fy, cx, cy (normalized)
        flat_extrinsic = camera[6:]  # indices 4-5 are unused distortion slots
        extrinsic = [
            flat_extrinsic[i:i + 4]
            for i in range(0, len(flat_extrinsic), 4)
        ]
        intrinsic_list.append(intrinsic)
        extrinsic_list.append(extrinsic)

    return intrinsic_list, extrinsic_list


def main():
    """Convert RealEstate10K metadata into one gzipped JSON annotation file.

    For every sequence: read the first frame to get the image size,
    denormalize the intrinsics into an OpenCV-style 3x3 K, and store the
    3x4 W2C extrinsics per frame. Fix: wrapped in main() so the module can
    be imported without side effects.
    """
    # Imported lazily: only needed when run as a script.
    from PIL import Image
    from tqdm import tqdm

    MODE = "test"
    RE10K_METAROOT = "data/re10k/metadata"
    OUTPUT_ROOT = "data/re10k"
    # SEQUENCE_LIST_FILE = "evaluation/preprocess/re10k_test_1719.txt"
    SEQUENCE_LIST_FILE = "evaluation/preprocess/re10k_test.txt"

    with open(SEQUENCE_LIST_FILE, "r") as f:
        sequence_list = f.read().splitlines()

    out = {}
    for seq in tqdm(sequence_list):
        # Image size is needed to denormalize the intrinsics.
        first_image_path = osp.join(OUTPUT_ROOT, seq, "images", "0000.png")
        width, height = Image.open(first_image_path).size

        seq_meta_file = osp.join(RE10K_METAROOT, MODE, f"{seq}.txt")
        intrinsic_list, extrinsic_list = load_seq_cameras(seq_meta_file)
        seq_info_standard = []
        for idx, (intrinsics, extrinsics) in enumerate(zip(intrinsic_list, extrinsic_list)):
            # Denormalized OpenCV-style 3x3 K.
            fx, fy, cx, cy = intrinsics
            intrinsics = [
                [width * fx, 0, width * cx],
                [0, height * fy, height * cy],
                [0, 0, 1],
            ]
            seq_info_standard.append({
                "filepath": osp.join(seq, "images", f"{idx:04d}.png"),
                "extri": extrinsics,  # OpenCV-style W2C 3x4
                "intri": intrinsics,
            })

        out[seq] = [seq_info_standard]

    root = "/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt"
    with gzip.open(root + "/annotations/re10k/test.jgz", "wt", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    main()
import argparse
import gzip
import json
import os
import os.path as osp

# fmt: off
# Subset of the full 51-category CO3Dv2 list used for evaluation.
CATEGORIES = [
    "apple", "bench", "bowl", "cellphone", "frisbee", "hotdog", "keyboard", "parkingmeter", "teddybear", "toybus",
    "backpack", "book", "car", "donut", "handbag", "hydrant", "motorcycle", "pizza", "stopsign", "toaster", "tv"
]
# fmt: on


def get_parser():
    """Build the CLI parser for CO3Dv2 annotation preprocessing."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--category", type=str, default="apple")
    parser.add_argument("--output_dir", type=str, default="annotations/co3d_v2_annotations")
    parser.add_argument("--co3d_v2_dir", type=str, default="data/co3d_v2")
    parser.add_argument(
        "--min_quality",
        type=float,
        default=0.5,
        help="Minimum viewpoint quality score.",
    )
    return parser


def process_poses(co3d_dir, category, output_dir, min_quality):
    """Extract per-frame camera annotations for one CO3Dv2 category.

    Reads the category's frame/sequence annotations and the few-view subset
    lists, keeps only sequences whose viewpoint quality score exceeds
    `min_quality`, and writes {category}_{train,test}.jgz into `output_dir`.
    """
    category_dir = osp.join(co3d_dir, category)
    print("Processing category:", category)
    frame_file = osp.join(category_dir, "frame_annotations.jgz")
    sequence_file = osp.join(category_dir, "sequence_annotations.jgz")
    subset_lists_file = osp.join(category_dir, "set_lists/set_lists_fewview_dev.json")

    with open(subset_lists_file) as f:
        subset_lists_data = json.load(f)

    with gzip.open(sequence_file, "r") as fin:
        sequence_data = json.loads(fin.read())

    with gzip.open(frame_file, "r") as fin:
        frame_data = json.loads(fin.read())

    # Index frame annotations by (sequence name, frame number) for O(1) lookup.
    frame_data_processed = {}
    for f_data in frame_data:
        sequence_name = f_data["sequence_name"]
        frame_data_processed.setdefault(sequence_name, {})[f_data["frame_number"]] = f_data

    good_quality_sequences = set()
    for seq_data in sequence_data:
        if seq_data["viewpoint_quality_score"] > min_quality:
            good_quality_sequences.add(seq_data["sequence_name"])

    # fix: output_dir may not exist yet; gzip.open would fail below otherwise.
    os.makedirs(output_dir, exist_ok=True)

    for subset in ["train", "test"]:
        category_data = {}  # {sequence_name: [{filepath, R, T, ...}]}
        for seq_name, frame_number, filepath in subset_lists_data[subset]:
            if seq_name not in good_quality_sequences:
                continue

            if seq_name not in category_data:
                category_data[seq_name] = []

            # Renamed from `frame_data` to avoid shadowing the full list above.
            frame_anno = frame_data_processed[seq_name][frame_number]
            category_data[seq_name].append(
                {
                    "filepath": filepath,
                    "R": frame_anno["viewpoint"]["R"],
                    "T": frame_anno["viewpoint"]["T"],
                    "focal_length": frame_anno["viewpoint"]["focal_length"],
                    "principal_point": frame_anno["viewpoint"]["principal_point"],
                }
            )

        output_file = osp.join(output_dir, f"{category}_{subset}.jgz")
        with gzip.open(output_file, "w") as f:
            f.write(json.dumps(category_data).encode("utf-8"))


if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()
    if args.category == "all":
        categories = CATEGORIES
    else:
        categories = [args.category]
    for category in categories:
        process_poses(
            co3d_dir=args.co3d_v2_dir,
            category=category,
            output_dir=args.output_dir,
            min_quality=args.min_quality,
        )
from pathlib import Path
import json
import gzip
import numpy as np
import torch


def read_scannet_pose(path):
    """Read ScanNet's Camera2World pose and return the World2Camera inverse.

    Returns:
        pose_w2c (np.ndarray): (4, 4), or None when the file contains
        non-finite values (some ScanNet pose files are degenerate).
    """
    cam2world = np.loadtxt(path, delimiter=' ')

    if not np.isfinite(cam2world).all():
        return None

    return np.linalg.inv(cam2world)


def read_scannet_intrinsic(path):
    """Read ScanNet's 4x4 intrinsic file; return the top-left 3x3 K as a float tensor."""
    intrinsic = np.loadtxt(path, delimiter=' ')
    return torch.tensor(intrinsic[:-1, :-1], dtype=torch.float)


def main():
    """Collect per-frame W2C poses and intrinsics for ScanNet-1500 into one jgz.

    Fix: wrapped in main() with an import guard so the module can be imported
    without touching the dataset paths.
    """
    from tqdm import tqdm  # lazy: only needed when run as a script

    # Root folder where everything starts.
    root = Path("/mimer/NOBACKUP/groups/3d-dl/scannet/scannet_test_1500")

    out = {}
    valid_frames = 0
    invalid_frames = 0
    for scene_dir in tqdm(root.iterdir()):
        # Skip the auxiliary files/folders living next to the scene dirs.
        if scene_dir.name in ["scannet_indices", "scannet_indices.tar", "intrinsics.npz", "test.npz"]:
            print(f"Skipping {scene_dir.name}")
            continue

        intrinsics = read_scannet_intrinsic(scene_dir / "intrinsic/intrinsic_color.txt")
        frames = sorted([p.name for p in (scene_dir / "color").iterdir() if p.suffix == ".jpg"])
        num_frames = len(frames)

        # The images form one continuous sequence per scene.
        sequence_data = []
        for frame in frames:
            pose_path = scene_dir / "pose" / (frame.replace(".jpg", ".txt"))
            pose_w2c = read_scannet_pose(pose_path)
            if pose_w2c is None:
                print(f"Warning: Pose contains NaN, skipping frame {pose_path}")
                invalid_frames += 1
                continue
            valid_frames += 1

            frame_data = {
                "filepath": f"{scene_dir.name}/color/{frame}",
                "extri": pose_w2c[:3].tolist(),  # W2C, 3x4
                "intri": intrinsics.tolist(),    # 3x3 K
            }
            # Sanity checks on matrix shapes before serialization.
            assert len(pose_w2c) == 4 and len(pose_w2c[0]) == 4
            assert len(intrinsics) == 3 and len(intrinsics[0]) == 3

            sequence_data.append(frame_data)

        out[scene_dir.name] = [sequence_data]
        print(f" Created a sequences for {scene_dir.name} with {len(sequence_data)} frames (out of {num_frames} original frames).")

    print('Valid frames: ', valid_frames)
    print('Invalid frames: ', invalid_frames)
    root = "/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt"

    with gzip.open(root + "/annotations/scannet/scannet_test_1500.jgz", "wt", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=4)

    print(f"Processed {len(out)} scenes with a total of {sum(len(v) for v in out.values())} images.")


if __name__ == "__main__":
    main()
+22598e2596e6bae7 +4938177a0e6e2fbe +1fe394077e7c3de0 +0bd7e6e9f0185aa3 +431795f999dc215f +33288d55dde83e72 +389f65e97bd902b2 +0131c9aed0fb3940 +0c4c5d5f751aabf5 +0de8a88480533be6 +0f1f245fa1c181ae +14bfd05497764243 +45dce690caec2917 +454197dc5b50b45f +47573da5cb0e5e44 +05c423623c9f6f56 +1fd5f9af785e6e5c +05b77cb7c0f79f0f +039cc34e9cdbcf8f +0f47577ab3441480 +47c88dcfb1134255 +3fd084afa49b6499 +15aba05919bae167 +3862411e9bf455cd +30140756550dc38e +16da9b4fdfe4883b +3dc0058dce3828d9 +16a2c55f96e6aa18 +168e0f4071d2fe58 +2fca5797ae48529b +325ff82707386438 +298276fb3c0330e5 +2cd1705407546b72 +33f7565ccb685cb7 +49c758aa3c35ed86 +4c502551adddea8b +2e35fc35559543f2 +31838e9542a906be +23808c0cfcc72e72 +0baa633d2094d2c1 +22552c9a2a2a2ce7 +0ed8b86b87a30d38 +0e728af85650dcb3 +17f552ef56d85c55 +41936ce6152fee64 +05a6149f1fcee38d +4c0ef61c55467706 +46200541f9943d16 +3b7443b24830d388 +04b580ea1f4df0a5 +33baf3e18e5d7256 +01be77405b16df11 +477df7ad0c2e7fdb +2c3a96fb820e1ab7 +3e1af0b953407ef7 +28a5318660ab60ba +3317c40fd3e0a7b7 +293e02c7c1fa31a8 +3dba1838ed366ab5 +0e8995dcbdd22f48 +3814a3a8046c8af3 +32991f419c96ea0e +0068e97c1c1f61aa +242fc4972c7bb385 +0f5c5385dbcd96df +2a1fed061b29b25b +335794d48a9b168a +446f557155994097 +39074bca3524418b +46e2ddf094d0c3a1 +0f18fb6736efb1c2 +3c44d53659dbe4fe +389851bf0ac38227 +0de78cb98105f8c2 +2dc3af70d25d3043 +411c4dd047c49cde +1be80ff36848e758 +2ad09b7837010330 +375cff10cab07955 +0387ef3895b1393c +308de3d523189c72 +297b57a9296052ce +483c2b4c67e32c19 +0c0f298ace7c875b +0a6680fe6e8e09d7 +372f324a1f4d6898 +200ad247448c5577 +03ef5f13e0a30864 +01842c6b21e1d679 +210fc445b9e254f5 +3b1f9cedfc40b06c +03cf40616d79cb6a +03e756bff92d49dd +3c64a373bc1c53bd +04ca03945611febb +0145c694b53b120d +2d524e9324228d6e +3349f3089ea84d6b +1695a74d194b65c0 +3115672ced7c5694 +0c3d3b45ff4a4326 +3bceca99e87d64c5 +4bdfa30358809038 +431e6542fde13130 +0aa8646901d156e4 +1798c9640d8875e6 +3a488ff3afa463e2 +17dba5fa8138ed92 +46fb9c990b6f8114 +0485a8528fa72698 +04433dcf217ad9a0 
+1dad44855584a4ea +052430ff6e2c07c4 +01cf55ae3e378faf +339c95e2e709d044 +0cd63c88350eef60 +46fdfa2a16c7c811 +0e2c96cd97e73a38 +3953f37661087a95 +3d2486ac8822da47 +4c4fa41951e37e78 +14cdf4aa7a2de14b +16cd4f1cb2a467c1 +020991bdfbdbe504 +2b8f367d01df3601 +0d8fd962cbfc81b7 +36e94b0ad3a62c7d +32a2c04fd8321bbb +20100b779d28b6d5 +414e2bf42ee45cc4 +43fee307c6339b5e +1ca4db19711258a0 +45064c8142f3a360 +0aa49c0b75e51ba4 +2d16da80d7e3b64b +454fc1e32db7cc41 +1f7770ac5cbb41eb +3115ce06e0160828 +18ce480be0ececbd +49c9324758b5e867 +2cf1a544b179b1a7 +03f1781c4cc126e6 +3f33ed2971149ea0 +4161944d7d592071 +196ffec2c68cafd5 +45cb862034851efe +1cddaac7be8ecfa5 +3cb1489b614e5f39 +46ea97f6f3757209 +1defdda324307269 +1c11709814a1a2f2 +0f64c5e4fead6cf2 +0c8438d86bb28f7d +2b4a934049f932d0 +1cc1ff58dc89d230 +20d0e788abca4aa9 +22085848f943c2c6 +3bae42d603be2266 +1e847ebd7cd1174e +4c2383e60aaf26cc +12cb4aa3f5b59ad6 +02f801e372d67cfd +165696025b477097 +06954737f53b8688 +464d97e527dd5f8a +19ec130ecea98d5e +3f4f553239e96d90 +306e2b7785657539 +29c8267c1d10b23e +17519e763b34fe14 +1cd3638cceebed08 +3c19657356e9e229 +41f3291d82fc4d93 +0e41af1514f92887 +173a82eeea56aed3 +41649e3e8f9a4be0 +0a4cf8d9b81b4c6e +284efc2041b1d1d8 +29832cbdb4144601 +15bc7fa1ed5567cc +075654f497170f90 +2b625e92f2cf9de4 +40f6d540b9b16531 +2b4c1f50687b2bcf +1d8017cad8dc1d56 +27daccf898b206de +2482c4388b32f225 +22c8b35c589276c4 +21a6081709444ebf +3b6d8db52c54b174 +4b457a008376cd73 +3d0a0fecfbdada35 +2b81fbc1af01f0ad +45166f266dd609a3 +3a86a812a1eaa20e +2347b1e3e70842ac +2b0b2259a7216762 +17a75f0b036c9cf8 +27f772c12c97b594 +332294213fa15c56 +2b973e6f676eb243 +1ce68f950e7cdf8c +2a8ef9e44f580d13 +44be029ec85609c5 +37d4e43b2b029a80 +309bed43e4406d72 +3c9c37132583a3d2 +314c584ff3842715 +17fc81293f337cc3 +0c2b3463c27c5ac3 +0ce3839aa5b66e3f +08868143749f321b +0282160b901229a7 +32d28b7513f873be +31afa3dcf3b737fa +474afb2d4641a228 +3232f7457b27dbbc +036f135766f38f78 +069a4c442912c405 +05a0ad1e2aa632e7 +2e64a2d17f9a76f7 +127884736471b631 
+1214f2a11a9fc1ed +19310eb8261d4bb2 +45108618c40e26a7 +182054e13eaf58fc +05bacb6d6a4741b0 +3d2f4958db5aefbf +07ba5489b56b7d62 +3cc4c306db84c6fe +01cefee9563f691a +46df36a031f50a04 +1be5a5c98a51b1cc +361d722ef5009e09 +3d5114f5d7496cdd +1648b3e7471e3766 +0c9d930d226d6bd6 +1fe9f9bc178a1778 +41c600ed9f88871b +33a5a85f06fcc77b +3ce90c0ea2537c48 +4422b38e60e3bc2f +00969c45a093d43c +489254d4b26a04c6 +3f45b8234504020c +28760a14d0a5ff3a +3a79b9aefafb0b8d +11e62395c85c250d +430d7b5b77861810 +3d8d753f0851bf3b +2e06abf6286040e2 +1be3758972b35151 +154072be49bb3c1d +128bf83073de3ba1 +19b0cd79a126e8bd +055e4612c1ea70f9 +0e2f2538b26a179c +1837b8eebc9c2457 +0eadbf8806794990 +368fa2dd830843c7 +1c29b1d8fd1dab3f +0972074fece891f2 +44b78f9fcb5cd8d8 +4b41d03353967b40 +0c916bcc9351521e +15b902774d67a394 +12b7562944c06836 +252a24d25a1ea81d +0cdfa29561cb24e0 +32d2163aa65c0e8a +08c9e7365f0707a4 +28190e57702bcfbc +247bc2e47eb7f6fa +15bc9eb752c6dcbd +049a98f70ce9f471 +3ea8d9787998f70a +1058fe0400a873e0 +21d9134faec148f2 +11491a312c6b8f58 +4cbb82a6bab25a0f +3a0328ed13dd8c8b +0ed4c9cac4a615fd +07842cc567e9beec +10ad4fc499c48b38 +36056faee50a621f +0752baf20fbc2285 +004e9db3337e8206 +1150003196de2529 +2313ea0fb17cbed6 +0de79d4f3d7a9171 +1259726fc1f8e966 +30bd5b88e47f6d0e +42b88f7ee71a7ba9 +2f5af4b429b2c992 +107b78daf075d371 +4aa1973c40d2eb93 +08718fb99eaafea7 +4a6a057fc644624e +1d7af31482baf61c +002ae53df0e0afe2 +3b273cb40c55db95 +2c02607ae436a9fb +379884ce61c4daa7 +4ce58504b055463e +0b211a1457076450 +390ddb7ee9b716ca +1bc87c160d1dc982 +2f4bcd593fe37158 +0c8c4363e0dca250 +489f9441d513634e +45a4515834848010 +460455f96fa1a1d6 +30127e00a789ed7c +30029fac7c5621de +007876f71baf453f +4cc4c8a8cfa8e944 +375f9c448cf31ccb +114d9c301b847239 +02ee66b3efbf3b0a +2868a1b43e9eceff +3aaed2e6422d7d57 +41d8b4350913ca64 +4b7071b34e8cc67d +1beb8e6662d36ee6 +0bf152ef84195293 +07d0229847bd7408 +04fe4ec70781a0e8 +4b3644bfc6083588 +118f563fe2ed4998 +2d6d5e82bda0611c +43759ced44693671 +485f996ecf360da7 +3b59c7d97b900724 
+2b4e1061f6415a4a +2e86767798c005df +2af206730de6f439 +06eba57d1c333a3c +4bf9ef4705f35e8f +395caf9235fde098 +4526c5ac1bfebcfb +4b5a6dd314bebe88 +1d704b9365e9c86b +36d85599a9cbb6b3 +180542b70f713d5b +10c8e54590f715f7 +4bc1c3a888a8bfba +2c805f56d92a2e22 +1db274e904e3fb07 +06f4bfa5f9d5fe0e +0c1012a308ee2788 +3af70052616f7fa1 +382a5736d9134153 +12497730f691d00c +04ef6a410f034514 +2eb515bff528d3c3 +1e01b910ceba4573 +2c6e76b362eed8e4 +45ac5168bda9d3e2 +1f084607245e4462 +24548ce6c15bc2cf +403951b5d632b5ab +2defb4625a3ccb54 +1ec7e7dce1175aee +145708c0216a06a7 +01eca393f86d37c5 +497d2450ed65a678 +32ce9b303717a29d +18c6473be3bd827a +1227d00562c106e2 +0ccb28128213f19d +407eefe8017f6070 +0bcef9ed1c18f74d +059058768c222bd6 +29f52c76f269ae48 +177d39d72e983b69 +202a627de66ad397 +401a94bd9d84f501 +1eab4db6941be725 +236f9dc6456cf32c +2a8cd9f87b3c9a2b +0d01d4d6c5d5297e +1da33647873725fd +39b58270c2e99310 +2cecdd7df86ff8d3 +4b6e9a02975ef9a4 +3289ad7a811d2348 +3a4d7cbcf0c84668 +28c9d20b865f5d56 +4acd145b0c133dca +4bc7ca44cc62b8b1 +17ca3b8ad5815b35 +072a60bc7e0b0dfd +061f829d3dd2e46e +1203cc23b881ab8d +31d903660e1647c5 +172c99489b18e0e6 +46e653208e529783 +058c67085c217b96 +33a3fc21efdc8547 +4a736d7c30ae9280 +2b3e3c5d30c17bd9 +388cb2f0ff1a6cad +3a52947c66de5920 +3c33566bacd602f2 +45823117f0acb627 +223b5e20753d4fa3 +3c83e9817c9e022d +1e2a2be2df033527 +3e8363be673dafa2 +37d09bda74c92a93 +0fd536fc3c8fdf19 +146581180e89666d +232aee1c62a1cd8b +2177ca3a775a9ee9 +015631b21f792a12 +338ff9f6c02a6a40 +2bcb26e95f5d152b +22b16c2f5af0f3ef +05ac37966de4e7fb +0ecf489d873b7f52 +1b747b8eba6f7b45 +21005252fe2383ba +3f7b6f511421e395 +2a058bafbecaccf9 +45122648522d4180 +2c48ab563b92a1d5 +4766f2062abaaf74 +4308efab35deb3ec +3cc97c3d778975f2 +1c8d34a791deaaf1 +36afe96c11a8211c +0e7b68884ac4d959 +436a235ed74c3d89 +0720bead0cc7cbbf +2e619c31122ee40f +311db764bdc7f537 +3eb718c3170fcd8e +1f4279d98e283206 +48bb743178166598 +2ebedb0f027df101 +0c6b149da098b121 +46067fb6d992860e +0a5eeb4466dd19bb +00ca5123d8ff6f83 
+241aa9bcdbdc7ac6 +283059a56e7f3e75 +38f7ba7fd9a83069 +333c649f75c3c7bc +2f8e1946600c65d4 +1c144e2404c5da89 +2341162bce213f2e +171b3a4c2f95f981 +29791bf60e718c6b +36fb4d41b00581c4 +0d22ced53b1db7d3 +2cc330488326fd4f +1f1a76ed6db1dae5 +19694d2dc528d75d +1efbd8f8949b15e8 +1cca8650a292e7b0 +41aa58e688a04336 +43db8c6515021c01 +2aca85b3bdf90a09 +3172feb32990cf09 +18e659699338f835 +22274e48b847c860 +077d42bb51ee2793 +39c662fa32a3b5c6 +33e5bb3820c171a5 +0db2394602b8b81c +02b59cd60efb924e +4c791225522d45ba +17535adda2aa3b90 +473e6ec61583d90f +10351dc7a37a44c1 +440b5d1587251680 +30006eb23f62aa57 +2d4e81e66ce80039 +2b8778726c1f2fe4 +1df20a29cdec61a4 +03e141d7afac53e4 +3970859f54703c88 +0b79ada01eb45be9 +1c9772d765e0679e +1c73def8a62301a9 +04f5153fc5255516 +4c943ac66f6c277e +2e2ad99d45033d6a +004dd4b46a06e5be +365cc620c2fcbb05 +3b9e04113b202116 +17a489f3cec39fea +2ad50852a84faf51 +1bce163cad1e1d20 +21b9d476c5a49c63 +1047cc04fa16e0d7 +471abe46b812be64 +4260ca20e2430c67 +11337164b772b7c9 +2ca8e34592e0c415 +3edff71624eac3ee +0bb99505a71035cc +2fdfa70413053b84 +2c6fb46edb748fe5 +2a89b2a52cee9f5f +008cd8c450342e49 +14e540cf0ff7ff91 +13c4059da4e56a8c +45a0fe252a89e008 +193c3bd339eb0a75 +20384754a6e5d1b0 +371c9182ffd46ce3 +0cda9adbbedd7948 +2d29ff162920db5e +3c84329b60bfa7cb +3af43fe8d514f7c5 +4aa594c0ad661f28 +2d60837ef2e52abd +0368abd976e8d82e +3f265c5edb13f00d +18be7c1b9895691f +2d62a5d66d6e4931 +446626a2bd617d24 +179ff8424ec7ad13 +040a26b288e7bda4 +45e5fcd5c8978342 +316035dd285c5e27 +3b650a9e2ebdfde2 +20e7a3651ec30386 +2c5e21f9f91e2e09 +0e01be9445403642 +0f7267e7e369b7d6 +4b0fdb10ae15684b +3984d005557cbd6f +24fea6c2c7caa434 +3e4057a188e15ac3 +1cdfd3abfcc3a64e +01b08e2f20321127 +07e8ffa32746c7ce +043c48135c5e8cc2 +29a09527214b3dc5 +09265a9e57075e7c +12d4ca1236a2cf26 +03fe94a439456692 +44d2532c5b5296a1 +024152256b6bcac7 +23a6c9168abdb38e +379470c7d22c498a +098263de57257005 +2f18f5579583e648 +2bec33eeeab0bb9d +28f59b68509ce59d +0af60a9ffd747a1c +15ca8e1fce488c19 +3fcf6c1b81b14af5 
+2f98ee24d3fc43a2 +2ef881551a7fda22 +44ab295bc3092c28 +406bdec5b68b1a71 +204fc9ff2c7ff92c +2a2d971fd44ae258 +4b85062505816744 +3c31d7b9f2792ed9 +4ce642bc93f1bb5a +4221bc1d4aea1a02 +08138c1a3ba1ce8d +037e8191b3985142 +0d46043105cf3185 +20422596003ac855 +42742db2633d2eb5 +3b434b5302dea908 +0588138dfec165a1 +0d68a05801d48984 +40c517d28a412a5a +2f7f2369486cc959 +2afb8a0a98e15155 +01fa6190cd47d125 +3290731e5f908b92 +4b86587ecd3325f4 +33842f4b169e4145 +0f6206df8a8e440a +038137c9569c60eb +1d125b16063c96c4 +03906f66d3bca71a +01f7915dce639515 +04db26572a791881 +171fdcc554d303d8 +0f68374b76390082 +1c5514d49d61bafc +195ca8350ff27f6f +44138776bdbfe28c +45f5a75e63afd4a2 +1b881261742799f7 +2d3f982ada31489c +0a7c052273895bb3 +0667d5bedfdbc555 +0c356641df7c72b8 +42f761f7e655bdce +3c5163ede747b187 +31a642fe4cd1e232 +223c9627b978d127 +0954f5a326941fb1 +1512ea7a9754ac34 +040895d45bf4e580 +05ec2d3e4c027220 +0aa284f8166e19e4 +29f0d7c051d80035 +1d04977fb85a8b3c +36fc018c7b62b997 +0277b87a9c943ed5 +02c0c9192fb9a6a1 +111356766833a7df +09754b77eeea6dae +0a6c499522efa0d5 +0d06be83296cf911 +48a3049cabb54c0d +01e18dbbf22ff263 +44d12349e0609ba3 +10f9d6f46e438d36 +188e6f96fa74ebe7 +0951fdaf9d399411 +08f82d3899d6b726 +179a1357a581ad51 +01cf2d900cb03afb +068140e09ae5ae8f +20f8c6738e22e764 +33dbdc4396938ae1 +44f9aac9faaac569 +1ebb496a04a1bd76 +0b04644621e97d30 +03b3f603a1001de0 +1bf49251fdd23cc0 +3cc40f129447cb31 +3714123f055e06a1 +38d44ebf460ac132 +378cb83947bf2d23 +4054d32655ba5eda +444d7d8445cd444e +3f0d9e856d93b8b3 +2fe5274c4baf665f +25b3854e6efb747d +3824335ebd7a4097 +2a41503583d146ed +452fff658953aab1 +0fb6678e63316201 +397037082e6eb839 +0a0027a48d9ff2ae +1fcfd8a36e171639 +4b4c0c27204604a3 +17a39d87a22ac1ec +47191aa41a979900 +07d1d5769e8d797c +1bf4fe9301893904 +41be40880094d8d1 +0d76da0fcac26af8 +125c92c36a04a68a +39f444469cf39006 +025192166c704a39 +246579087204ba8f +4596160a24b1af1f +2e96b2142fd337dd +1e1e13de4ebea05a +3ad4793daf6adc19 +01570ac1c73e9ca2 +1424acd0007d40b5 +2060cce4dad6f988 
+330b925cef643b3f +1d4d81f8629b119e +286393e1e797cdad +3928ea8b8c134846 +0a72a3fd46a88ef6 +32eba3e4cfb61f93 +00620c2b77518524 +3b1c57027302837f +365fc12b4f33ada3 +0d4de33c6888a754 +487d83675a8d1574 +237ef12cd69e2aa0 +20b38e0a985506ba +289ce0f2b82dcd0e +088e115752ce9e56 +2d3e1349898addb6 +0d5112a7eb22d61a +21e97eb0cfbff775 +22a0db80d91128e4 +32dc25ef78b564a1 +1f0e06e4388dd600 +322261824c4a3003 +0ef68550315f57c7 +06a2911e9add96c6 +0807e84457d5ef58 +4ace59951acbae4e +46b2a13f6ab0be05 +0ede2c8fbe52c1d3 +37697c41773d597a +01497290d8b93a9b +0d5a4dcdf8ec9d36 +0e00a382b62667c0 +2a1769dddc1dbf8d +23df266716914368 +094fd37f09dc318c +2326e9820982ad81 +3ed3ffd0ae9c3224 +20764a96cc70fe46 +166818269e4e2568 +37d0f351f07ee925 +3087828bc27bc4c7 +06a8196a66e125af +49e48d66787ecb8f +0f5bb0704084e290 +3eb795302924e912 +2d0bd035f7df86b0 +2aa1e311e4bc039b +0bcde26e5a802638 +0c5ed899789e60ad +0ac6adb37a92f549 +0122933cf8ab3317 +19391676f0fc7982 +3a9c883b11e86530 +23e428c0dc43f046 +4317016b336431be +37dbb9846f2fdc01 +004334c94bbc8bd5 +0992802044bf665d +1f5df6019b0bb73c +0871e2540b0a6804 +0c609c435b1f7114 +322d03d487fc0f01 +1e9b1dc1c096d68d +1bf668db0194cf83 +08d5cde674e47324 +2a3baeaa72b86812 +0aacb1732fee7a3c +04cb1526cf3c43cc +3a6dc09185951ae3 +3fdaa028b8baad4a +24b4e8ff5a9a6439 +0a3b5fb184936a83 +0b3674ffb90b641a +3c35b868a8ec3433 +44b13221c50914d9 +02cb3a4fd80ee0cc +41fafa6144b58c39 +45e81a557d2dd78b +4ae4456267802484 +3e07add8413f8157 +4c96e034f8af77d6 +38b7a1d23745fbf3 +1c58aa75858147f1 +2e9d6a76c40b707f +362be988d0b68a4c +3ab70559ec30a57c +1433f61e9591ea9b +139a615209ee09ac +4811a66f87b0dd6a +2b1e0225f0952a09 +3db49ddb3f470436 +018f7907401f2fef +0e241f40ce0cd802 +04145b4b73b2d313 +1f73124222492f1c +24fe21f0a899701f +4a1d79baac733df7 +06db5bb2465ae58e +466150f780ad7b80 +1e1742072c0b2d6b +4955f54f807ef5aa +42565e9d863220ae +35d5a242ba40f31f +15e0783c6b9683be +45e6fa48ddd00e87 +396065ee739ea046 +280443260e3dced9 +10c4c9600bfaa4d8 +0181b66a65650830 +1fd615fea825fe87 +15c9a45c9c3d73f2 
+33ac471b97ddf5c5 +3f2bf7371b72e40f +0a45b99f42fb0ecb +2ddca94aefd55b8f +00a54225c5cb1913 +0d6a534d75f20921 +072b4bb46d80484e +4a046d13e389b505 +07559b44fa10672c +07479835711d6f8d +4242fb49c775710c +15b93cbe9fc5220d +10f4acad3ed87288 +453c980210335f26 +291db63458af0613 +49b8f80c849dc341 +05ce34e3cd48c449 +15d324ad8ff2dd83 +3cc1fcf538c81442 +0ab14ffa7e541b0b +4043989a4ae95a01 +4c9030a5917a1328 +21b548b570c0b415 +3c44f7a30e0ad967 +1b88fb5063cd8916 +4c69bf407b142b93 +3059f523501dfd97 +4118895a33890c5a +30b1d229ad4c6353 +18cde229723f22df +000db54a47bd43fe +0e2653d00e3fc05a +3e6d44a66c0d7a0a +4319ddbd5e8f20e1 +459a954b63f98d8a +4a177be7db12edcd +497f507a5901bf4b +25aa5f50072ce7f4 +2276982dbc5a23e2 +47a66fa042406908 +2a7387e017c241a4 +0ff896ed26db5da3 +464e3851f923f8d0 +1e969786d2a8c7a2 +45f2d7abb5fafaa8 +3dcef43736468b29 +2a9d8ba86290db0e +21f3cc00e0cfe8bc +0ef054fbdabce0cd +3784777516b00247 +103777494841b376 +2ffe00ad70fe9c00 +289bcb973ed702a1 +186f18684ed4b516 +0d7f00ff38b135f7 +11a680776863b321 +21b26eda16f7cb88 +285aef90afaaf565 +0ff193c92d415b18 +49f6f14c580b71b9 +1b65111d34f57bb0 +015d8a2a2834d38c +3d394fbabe0e733d +4263257ec6099434 +39b3e7b2a8bf30a6 +0326e5c562bdf1b5 +2dd67f5e68c8d72b +27d163d7046d36b5 +3094afb27266ee6a +3c90ad3bb72adcf8 +374fa34fe701a30d +225ae1d37a7fa519 +1c742e548e8698c9 +16a7b5a41f31feb8 +2245fcd7f76c2ecd +196069a792ebbf31 +0ab163a1b88f1128 +15d8b7e256ffd066 +05c57211be152630 +3b9420585a1e66fc +1f56ccfabbfce568 +0498c9066256055c +2d5e1c16ba1f89c2 +37ff2186f55b3fd8 +0e5b9dcdb891b82b +0fe2286088ece98e +0e714d042fa59506 +3e5f747d06bc84a3 +0f540553fc30f16d +01866b81c3b90f2c +3105d330651c2726 +0ed25f15cbccd939 +234271629f7099df +2e109379f53bb221 +3efda95897eb23d1 +1c7b9f93752085f4 +240e89ef33ff15b6 +2b41e71d509a8b8f +31bf989cb15492d4 +36e69606a7599644 +07b667b34838336d +0b4d5beb7d3bd867 +3fc2c221557a205c +4908fab97c9bcec7 +3b3880eb01373479 +33a29a351c1d9800 +1c919c7e4ec601de +138bb7b0b25e4669 +2e715d4c5bf6c45e +2bff9ec89ca982c9 +167c2e0c6e9ffa5d 
+1fcc400e42725a95 +24668d960406587f +0cb83cef3177a006 +0d0a99d7f22aab71 +11e9cb1ccb9abe9f +4a7f0556fb58a5cc +24d95746999b7f7e +4560f57598efe5ab +16930ebf3f0f6b84 +14f0b962fabfaae2 +4bae8b3980bf32d0 +12dce44829d88985 +3c85dbda51f7e9b3 +49cc37f7f96be5a9 +2e4cac06a4f92261 +31607cc68ada0108 +0f7e8bf1137abcac +20171db88f887218 +3cfb6cb5052ce744 +0bd819cb30a432c6 +4089ef1b1bdb1d36 +283334520a3f8a43 +472e2674ece00632 +17d35e133dc3ce90 +09c1b7a0876c08df +3ce8f87fcfc988a8 +28ed97894371982f +2b43428fa1cf1a7e +0302fcf06bfba582 +30b1dffa5f783ecb +2e261d7661282e40 +29927d9ef0b472d7 +228ed4b87c8a6ea7 +2bd7cee1fa9c8996 +33e23b97daf5d9f0 +46de062e5ff787c5 +027c8c3fc3e7d056 +39f1b33acc70ad7b +0b07051f912592d7 +4675ea4e00c2544d +0ecde93bfa1f08d9 +214df1c2863d2959 +2c52d9d606a3ece2 +4bd922d1e75cc936 +000c3ab189999a83 +389df03f3c2d7291 +0c9b371cc6225682 +2c5249093fc26fde +1930a64d9a119b13 +2007d4829b187feb +19267b6a68d2701b +0539bcecbc483dfe +4a7e531e1a35d424 +22da7610855d6b9d +40954e72e02dc771 +2ed398e8368e0c6f +3e00b129b656fbce +1526707312c94a92 +0553c19e8933374a +33a3f65849195eba +1593596b99e2dde9 +118ffef2ad3950f5 +3d7a1ebc77f683b4 +374b2f4abac6dbb9 +32db375ab51d77a4 +1d2cab92bdcc1453 +33a93f85d5713a71 +4a1b9fb940541809 +150b45a39c57623d +1fa073504b4facaf +20e9bf845c4bd9e7 +3176f1532a468cbd +021575237abe0684 +3260a42ccfba8973 +0bb7da710cbf4bb9 +23beb4b246e236fb +2473b5003a95628c +1564900dd040c718 +41c2ed4944dec77f +132bbf5a9e9626ce +46502a6038bd288f +20fba1d53c349851 +2412e9f45282fd15 +430c6d1f8676fabc +16a75920e79c3710 +2cc5f95fbe24ffe5 +37ff932a6a608c24 +00e8df74b6805da7 +233d7cca6c4c628b +0286d2ed56e8f107 +1735d8d1b4015669 +225c5f2cdcd2753c +069a9416dc6a373b +203c8a4d66c74338 +397bbed49e1ee8dc +1ca02bf1c0b65675 +166bb958d7f4798a +02c485bd207116d4 +369f3639d9605255 +392c21ee30b21459 +0d08611c8b251e15 +1eca36ec55b88fe4 +2ff40df261e17697 +2e715b2e0162f768 +15d4a976e4e7d3dd +1d46e25b06eef337 +23d7f14af4b7ba08 +42cab73e14195475 +24bcb936908f3a31 +4ade6d5fe4b32738 +10a3511b61f40243 
+1d9254a5cb93d4ff +3fc266558ec5c07b +2064e46352532375 +3f6e7ee98174056b +07225d96742d2a6c +17c234c2eec050a0 +2b1a013698fea3a1 +0190fe72a727c853 +11eb02d24a3241a9 +0efeb5654da456c6 +0c824455996db331 +14f477e7d5af5b91 +232a0b3133326242 +1bc87f52eba89cc4 +1402fce28722610b +172bd46c6ddda95d +2c55c5a96b50ab36 +31262e902165f348 +0e3951bf1db22064 +0404d32e97ec1cdb +126067199873816a +095fb57435b7d890 +195074aeac8bbd76 +1910e79a60d57aa7 +0a78dcb828c506f1 +2b4f6fdcabf53d59 +2b1da1fbe7f18f7e +05c48ff6535fbf55 +06dcbfe7cd79bb66 +0bc9fd5c8e50d0ee +18a3593609eb3269 +35b994780d720894 +15c28f4ada02cf91 +15d4131d721f1b5e +0c52996355b23d76 +0c72eaf6bbb7c681 +22d6e3fefb1ee7fe +0f620bfafa25fcf5 +388b75bd17c5332e +3b8167415736169c +08072d6cc8e8711d +178cef169356d4a5 +1072aae07584e091 +18a86c01aaaaaa8b +2422f760ea77551d +21840c44aed0ae43 +10ea3faaa29f4a88 +18502a6651367e71 +31c79c843555c2c6 +43c329f7c0b40258 +00beb03ef95dc637 +2b0cbc443e6c3c6d +2e554e99d045b484 +15ffcb1f98d41218 +39c41be5e76c79c1 +452e9c4e4729ddff +1fcf851e236dec35 +32615afea87b52dd +1f079ee70c21002d +2e04dfa4a1671292 +088b93f15ca8745d +180bf845cc8cada3 +20d86cff490c0c42 +293c7c1ccaa6861a +43db35d743e6be54 +308681a294d1417b +4393f3c42606c573 +16ef89980e2ceef7 +368fae6f3bc0b0f7 +2cc8ef9e5319d5d7 +3a642c6d0e43510b +4636beec02aa8dce +4091c41c6909da3b +203b7543bb3387ce +095441304a817fe9 +05ddd2fee689399c +457a1ed78b1ddb01 +04422a07336e32da +01aaf4ebb084dc16 +27f3ccfb3199499e +1c840c855f0c8421 +3a9fa6535917a07a +04e2be0415136fa9 +09e4d5e8eef7b9c1 +00a5a2af678f37d5 +1c36b2b8144d29ec +171403db6cb88926 +0b970b3417969c89 +0c788e368d993870 +3faac9603907b329 +078b8bdc29565cac +06058474f164c53d +0516a5d959b58cb3 +4634124b21b763f6 +10fbe4690dec6258 +4879ebbcfd888f5b +16b48792e910cf49 +0ece034988793847 +12a5cf6bbe330edb +3bb70a92a0d384e1 +3cf461bc6d626ed0 +007b4ae7c05f2ea2 +36c5e00f55c4f217 +4b341307a872487c +2a695d52faf1949e +1b87f55dec310243 +22b86e38854ad186 +30fceb3b40ec062f +15f8a54e4822f355 +02b618a34bc12ff9 +104c9a27980f9bb7 
+16b667d681f8cf25 +46889bb1803c5cd7 +3eac186b3e7badb2 +02e6fb86b0172f0b +2ac712ac8d2fd488 +45a00d135c5388fc +2c2cfc0ac780a3aa +0c25287b812367cd +3f9b08ed34ec795a +35f89d3ac607bd5a +4949361d0831c838 +03c61595d13e121c +3d8d29bf0d9f24a4 +145da324f69d1c6b +3cfb4c69b14a1970 +4b461c1ec52a3076 +0dd9e020b6d9d687 +16b3eea75ad753ef +450cf402f042bfd2 +21a23c81331b0027 +1e80fd6e7507e3be +3e746126204810a4 +3613c77d8c234008 +310cbf1c65c52fa7 +2cd27189549897bf +36977643258aa392 +37c99410741fbca5 +33f4eeb64d0c9c1a +4cb669bd62a4ffb8 +0c11dbe781b1c11c +234c14a79d4da1ff +493bb055f33cb256 +3dfaa97cd48a0332 +375a7fbf80d09c92 +42cc82972397863b +03aa0437e5d62d58 +20d1b02740ef1124 +2f878176347bcf9e +06c7c747b4542273 +4b5619958277861f +04e4c841b349bf5c +0f12b97e0e4c7e21 +1225476a1221ce08 +0f61837b9749da34 +0712476a67734dee +3185302b2275b009 +0445459f7afb0f48 +2e7ffcba51990c93 +1515d37824dd6b22 +4ba7caa04cea37b7 +47d11d4bee6608e0 +37a6b3200493fac0 +1dcd8aee9a39a61a +3ac32347c3ff7d38 +11d5f4e7b0b17565 +41210bec1c0c87e8 +4a566b7e6eeaf9b4 +005dd9a58df1ba3c +4c2ed13774ae4613 +418ad7b9e78208cb +424f597efdad3067 +401e10a4352fba1c +1de1b73fe4d6aa77 +0f2197967bb7fa43 +2ac7eecd3cd0252f +33517e9838fe5f20 +144c2c2c52734f15 +047c29e9138af233 +0f59c103684c0437 +0542630de1d734de +02b406d1e5e31d5c +494f87170e713843 +47b5d62899ea4869 +024908906fadb408 +4cc48509585e4157 +304dd8f38dbeef0f +433742b23712bf06 +2f3ec1f2335489d2 +443b1691d94c1b3a +318cc6a39c0acc71 +1247b2ac5986205b +3b84ac07fd85bb3f +24d97ac8e96e7a5e +39f03d5fb1807102 +2da38ca64192354f +464d63c227f26d09 +28b23ec38ac5c0b1 +20de87f0b3f2d136 +3b58206d99feb4b7 +41abd737e0228c1a +18d9631f5eb45b87 +4cda491521679291 +3776e900791c1553 +3a10eb9788bcdfa1 +48d4444b94c2a2c0 +40284c1baec06ac9 +1ce8503fd200fed2 +0a8f10a9a68236f7 +00cf0a94235771bb +0ebaed7e3d044bc2 +36bc6918e9fc837c +1db5a4df1ab8b8e7 +03482c3bd66de195 +156b422215789c18 +3e034bde9426ae9f +3fe783b9c7c8f492 +3f68a1e365e94eb4 +17428a1f23edf411 +47cae76cd53c752a +00d83c48cb78ec83 +040a7af97273204f 
+3d410da4d7fd9f64 +0555b07fe6239b4a +4c144eb40a09a0dd +0c89e266974e8b90 +12e3d03a933c2eb4 +48f9cd996f80c34d +28b06f7087798198 +2a39e3b6061dd887 +24895a02057db66e +42c2c85060ab5233 +4a582ee23dd05a8d +2c9b5e69fe7f0338 +2c16104a0ed6c8aa +1a04733e4ee45c90 +11eb4e9eec5048f2 +0869b66f912b845d +33c1bb87a88e59f1 +4ac044dcaa428723 +2e6876c6c1e40652 +13fcc228c40a0e67 +23dcee801bca67bf +075f0d808a621ae2 +298c394f21c62ae6 +0181d3b41c2cf87c +3bad929f21fc4336 +107d3d674fdebd31 +48e49bdd1aa706e1 +1ec6fa3de6fa1bf0 +0f4f779411b45b6f +220d718317f7a025 +31b8eb8bbebed9b8 +3d4645318868a4f3 +37a6ae1e1c6eff66 +4c169a41e66b6599 +1513c8f030f4cbe2 +460e2066b64b2a40 +1cbab4f69b2d48ce +14e3fec07ba502d7 +2ab72e30a616dd21 +2d7d7fb53d960909 +0757b4bc82bf26b1 +195fbef2c08715e9 +0894f0072a8c5fd0 +10ac0ae67d317d11 +48638883e537ccda +373ab0a1009e0316 +12a70416c92a9483 +14417ac810f2024f +3f1c5b36d217d345 +1fb651cd12893f99 +455964aa4ead1e2f +4c8fd5318ae8d467 +0043978734eec081 +404043fe2f398440 +0f97153fcaafc80d +36cb04da872d3bce +1778e784d47e035e +1840ae9e2494443e +498688760312447b +2ceec371086f5d82 +1f2153b5fb50d41a +071e8c0978097efd +0d0f4080d36dfc68 +31b5667e16de1d94 +424397db4b1cf634 +377769942e6a748d +13adf913ea857ddf +0a9f2831a3e73de8 +46a4d49d61a86d37 +24f06a46ea08c03c +420bdc53a6928b32 +41e428b3c7a16695 +219825f542e6ee4b +3d6ed8b43655929b +0afdc571e4667a44 +4be8bf31940bd475 +2552fd444d04ef21 +44c16554a21aa6af +1d36cd02a549e244 +0223924f43297881 +31ee8cabc96b9a62 +397b050e345d73fd +03a78406de1d0993 +218ba0fa826d3eea +2945a940639798ce +1b74274269c75c8d +084fe29cd9d008db +38615248b52e2834 +0b429a4733089487 +01a628e2c509b823 +36f4df3e0a1ade5f +22666111b2180af9 +4c76898a3d535741 +0c4a239e265ae1c3 +0b530eea368f626e +0ba1cadcb191dc0a +13ac6a6a3a4f5e5d +4227369e7d0e735a +15729869d1862b7f +3ced9d0b56769bb7 +2ad4ea800caafe09 +02b2358ff02d3ce8 +35e8c6c2168dd087 +3858fc4475d10c78 +12f45658983d380d +46eccb4820f5a4eb +082087c82daa295d +08c87b4b6b23895f +3fb3327a177a0175 +08291107fc9e9849 +0e060f89ae0a469a 
+1961bb85524de229 +2ccd2d98696c87e0 +2beffa088960f673 +39424be692a88364 +3d60041ab79f46fc +18089956e2be2289 +37de8da2580d0c1d +47a1f1f01e2b7be6 +3de41ace235a3a13 +4881a65d7476d6dd +482ce5c63038e5b4 +42b086af2a1e5d98 +3dcdffe3b9c6235b +2deacec5c281fbac +21e794f71e31becb +2e470f6e2c83566a +4175cb4c71c984ec +49d5b942442449b4 +1eb03f0e3088edf0 +4c2d32a7f2b62657 +1f925fcf391591df +0e4f56edbc3d8cd7 +42c75d578535b0fe +329aba411f341398 +41a55418bee59b11 +0278b3d8abd9654d +0425df3e42ba0de3 +0b1e61c69c98026b +47a07f51fd3fef77 +4500d9faefff3a41 +066c35b1abc706be +4a2d6753676df096 +2a3bd0a2ac422822 +0b1b293ffb0e2f51 +1edc6b95e84127b6 +23cfdadab7cc51a1 +391bc7d21641283e +10c551ef9644ea03 +2be655d4137e6e29 +49ec7608e51f7ee2 +3e577a3be646152b +17d841670d2da942 diff --git a/evaluation/preprocess/re10k_test_1719.txt b/evaluation/preprocess/re10k_test_1719.txt new file mode 100644 index 00000000..90ec9de5 --- /dev/null +++ b/evaluation/preprocess/re10k_test_1719.txt @@ -0,0 +1,1719 @@ +000c3ab189999a83 +000db54a47bd43fe +0017ce4c6a39d122 +002ae53df0e0afe2 +004334c94bbc8bd5 +0043978734eec081 +004dd4b46a06e5be +004e9db3337e8206 +005dd9a58df1ba3c +00620c2b77518524 +0068e97c1c1f61aa +007876f71baf453f +007b4ae7c05f2ea2 +008cd8c450342e49 +0090cc64d7b7bb24 +00969c45a093d43c +00a54225c5cb1913 +00a5a2af678f37d5 +00beb03ef95dc637 +00ca5123d8ff6f83 +00cf0a94235771bb +00cfc0ecd345deb4 +00d83c48cb78ec83 +00e12e215c028984 +00e8df74b6805da7 +0122933cf8ab3317 +0131c9aed0fb3940 +0134d6a876481ed8 +0145c694b53b120d +01497290d8b93a9b +015631b21f792a12 +01570ac1c73e9ca2 +015d8a2a2834d38c +0181b66a65650830 +0181d3b41c2cf87c +01842c6b21e1d679 +01866b81c3b90f2c +018f7907401f2fef +0190fe72a727c853 +0196dedebec3dad2 +01a2277ee817b310 +01a5cc3805e94c21 +01a628e2c509b823 +01aaf4ebb084dc16 +01b08e2f20321127 +01be77405b16df11 +01cefee9563f691a +01cf2d900cb03afb +01cf55ae3e378faf +01e18dbbf22ff263 +01eca393f86d37c5 +01f7915dce639515 +01fa6190cd47d125 +01fe225e2f261d1a +020991bdfbdbe504 +020a41f988981396 
+021575237abe0684 +0223924f43297881 +022a21a897f2a904 +024152256b6bcac7 +024908906fadb408 +025192166c704a39 +02679535c5f06a19 +0277b87a9c943ed5 +0278b3d8abd9654d +027c8c3fc3e7d056 +0282160b901229a7 +0286d2ed56e8f107 +02b2358ff02d3ce8 +02b406d1e5e31d5c +02b59cd60efb924e +02b618a34bc12ff9 +02c0c9192fb9a6a1 +02c485bd207116d4 +02cb3a4fd80ee0cc +02e6fb86b0172f0b +02ee66b3efbf3b0a +02f801e372d67cfd +0302fcf06bfba582 +0326e5c562bdf1b5 +034677cf3d80162d +03482c3bd66de195 +0362399a61c18ad5 +0368abd976e8d82e +036f135766f38f78 +037e8191b3985142 +038137c9569c60eb +0387ef3895b1393c +03906f66d3bca71a +039b153af4fbfba7 +039cc34e9cdbcf8f +03a78406de1d0993 +03aa0437e5d62d58 +03b3f603a1001de0 +03b440db4696d8e7 +03bcb03930ff1ace +03c61595d13e121c +03cf40616d79cb6a +03de2844d3c8314e +03e141d7afac53e4 +03e756bff92d49dd +03ef5f13e0a30864 +03f1781c4cc126e6 +03f551fc4abedc08 +03fe94a439456692 +0404d32e97ec1cdb +040895d45bf4e580 +040a26b288e7bda4 +040a7af97273204f +040de715f9303ba5 +04145b4b73b2d313 +0425df3e42ba0de3 +043c48135c5e8cc2 +04422a07336e32da +04433dcf217ad9a0 +0445459f7afb0f48 +0463d74358aca878 +047c29e9138af233 +0485a8528fa72698 +0498c9066256055c +049a98f70ce9f471 +04b580ea1f4df0a5 +04c441c7ce273dcc +04ca03945611febb +04cb1526cf3c43cc +04db26572a791881 +04e2be0415136fa9 +04e4c841b349bf5c +04ec725465dc5329 +04ef6a410f034514 +04f5153fc5255516 +04fe4ec70781a0e8 +0516a5d959b58cb3 +052430ff6e2c07c4 +0539bcecbc483dfe +053e78d3134437a5 +0542630de1d734de +0553c19e8933374a +0555b07fe6239b4a +055e4612c1ea70f9 +0588138dfec165a1 +058c67085c217b96 +059058768c222bd6 +0598fec76ecc7bd6 +05a0ad1e2aa632e7 +05a6149f1fcee38d +05ac37966de4e7fb +05b1462991e38e4d +05b77cb7c0f79f0f +05bacb6d6a4741b0 +05c423623c9f6f56 +05c48ff6535fbf55 +05c57211be152630 +05ce34e3cd48c449 +05ddd2fee689399c +05ec2d3e4c027220 +05ef56b2656c9318 +06058474f164c53d +061e49ba3a5386c7 +061f829d3dd2e46e +063b857e6470addb +0667d5bedfdbc555 +066c35b1abc706be +068140e09ae5ae8f +06954737f53b8688 +069597e1fe899530 +069a4c442912c405 
+069a9416dc6a373b +06a2911e9add96c6 +06a8196a66e125af +06c71ce295284689 +06c7c747b4542273 +06ca8f480c91e9eb +06d8995be6aa4db6 +06db5bb2465ae58e +06dcbfe7cd79bb66 +06e499374ddafbff +06eba57d1c333a3c +06f4bfa5f9d5fe0e +0712476a67734dee +0718f733a326d65f +071e8c0978097efd +0720bead0cc7cbbf +07225d96742d2a6c +072a60bc7e0b0dfd +072b4bb46d80484e +074653ff3928b9fe +07479835711d6f8d +075278a4d0af74f7 +0752baf20fbc2285 +07559b44fa10672c +075654f497170f90 +0757b4bc82bf26b1 +075f0d808a621ae2 +077d42bb51ee2793 +07842cc567e9beec +078b8bdc29565cac +07b667b34838336d +07ba5489b56b7d62 +07d0229847bd7408 +07d1d5769e8d797c +07d449efdb66c20d +07e8ffa32746c7ce +08072d6cc8e8711d +0807e84457d5ef58 +08138c1a3ba1ce8d +082087c82daa295d +08291107fc9e9849 +084fe29cd9d008db +0869b66f912b845d +08718fb99eaafea7 +0871e2540b0a6804 +0871e5f582cd933c +08868143749f321b +088b93f15ca8745d +088e115752ce9e56 +0894f0072a8c5fd0 +0896b4819e39caf2 +08b8b63abbec8780 +08c87b4b6b23895f +08c9e7365f0707a4 +08d5cde674e47324 +08f82d3899d6b726 +090c672e7e394397 +0915a60e1ae6a826 +09265a9e57075e7c +094fd37f09dc318c +0951fdaf9d399411 +095441304a817fe9 +0954f5a326941fb1 +095fb57435b7d890 +0972074fece891f2 +09754b77eeea6dae +098263de57257005 +0992802044bf665d +09b505bb829c1d12 +09c1b7a0876c08df +09d860b12f6604cb +09e4d5e8eef7b9c1 +0a0027a48d9ff2ae +0a10d55239d83d99 +0a3b5fb184936a83 +0a45b99f42fb0ecb +0a4cf8d9b81b4c6e +0a5eeb4466dd19bb +0a6680fe6e8e09d7 +0a6c499522efa0d5 +0a72a3fd46a88ef6 +0a78dcb828c506f1 +0a7c052273895bb3 +0a8f10a9a68236f7 +0a9f2831a3e73de8 +0aa284f8166e19e4 +0aa49c0b75e51ba4 +0aa8646901d156e4 +0aacb1732fee7a3c +0ab14ffa7e541b0b +0ab163a1b88f1128 +0ac6adb37a92f549 +0af60a9ffd747a1c +0afdc571e4667a44 +0b04644621e97d30 +0b07051f912592d7 +0b1b293ffb0e2f51 +0b1e61c69c98026b +0b211a1457076450 +0b3674ffb90b641a +0b429a4733089487 +0b4d5beb7d3bd867 +0b530eea368f626e +0b55abc1ca2fe909 +0b79ada01eb45be9 +0b970b3417969c89 +0ba1cadcb191dc0a +0baa633d2094d2c1 +0bb7da710cbf4bb9 +0bb99505a71035cc +0bc9fd5c8e50d0ee 
+0bcde26e5a802638 +0bcef9ed1c18f74d +0bd7e6e9f0185aa3 +0bd819cb30a432c6 +0be9a0dcbfe032f1 +0beae06611ead92b +0bf152ef84195293 +0c061512de79b744 +0c0f298ace7c875b +0c1012a308ee2788 +0c11dbe781b1c11c +0c209edeb7637dff +0c24590c68af865f +0c25287b812367cd +0c2b3463c27c5ac3 +0c356641df7c72b8 +0c3d3b45ff4a4326 +0c4a239e265ae1c3 +0c4c5d5f751aabf5 +0c52996355b23d76 +0c5ed899789e60ad +0c609c435b1f7114 +0c6b149da098b121 +0c72eaf6bbb7c681 +0c788e368d993870 +0c824455996db331 +0c8438d86bb28f7d +0c884aee4b01366f +0c89e266974e8b90 +0c8b534612a0a776 +0c8c4363e0dca250 +0c916bcc9351521e +0c9b371cc6225682 +0c9c387ae23d090a +0c9d930d226d6bd6 +0c9ea3bf67254e95 +0cb83cef3177a006 +0cbbc98eec80360a +0cca84503a86574c +0ccb28128213f19d +0cd63c88350eef60 +0cda9adbbedd7948 +0cdfa29561cb24e0 +0ce3839aa5b66e3f +0cf444aef3ba16bd +0d01d4d6c5d5297e +0d06be83296cf911 +0d08611c8b251e15 +0d0a99d7f22aab71 +0d0f4080d36dfc68 +0d1aa0f47c9d2f6d +0d20062086f6d05c +0d22ced53b1db7d3 +0d46043105cf3185 +0d4b941f4678267b +0d4de33c6888a754 +0d5112a7eb22d61a +0d5a4dcdf8ec9d36 +0d68a05801d48984 +0d6a534d75f20921 +0d76da0fcac26af8 +0d7f00ff38b135f7 +0d82dba8f137e3da +0d8fd962cbfc81b7 +0da6a36b24eaf5db +0dad7f2ef3496f13 +0db2394602b8b81c +0dd9e020b6d9d687 +0de6bc7da518fcae +0de78cb98105f8c2 +0de79d4f3d7a9171 +0de8a88480533be6 +0deb1b80eb8481c6 +0e00a382b62667c0 +0e01be9445403642 +0e060f89ae0a469a +0e16d64d961fe855 +0e1a7abc82b1afb2 +0e241f40ce0cd802 +0e2653d00e3fc05a +0e2c96cd97e73a38 +0e2f2538b26a179c +0e3951bf1db22064 +0e41af1514f92887 +0e4f56edbc3d8cd7 +0e512d350465a63c +0e5b9dcdb891b82b +0e5e7fbe8914352c +0e6f8d0eb4103baf +0e714d042fa59506 +0e728af85650dcb3 +0e7b68884ac4d959 +0e8995dcbdd22f48 +0e8a52a174610350 +0eadbf8806794990 +0ebaed7e3d044bc2 +0ebb04534d7f2ba7 +0ecdc87c3391ce98 +0ecde93bfa1f08d9 +0ece034988793847 +0ecf489d873b7f52 +0ed25f15cbccd939 +0ed4c9cac4a615fd +0ed8b86b87a30d38 +0ede2c8fbe52c1d3 +0ef054fbdabce0cd +0ef15055b44649e3 +0ef68550315f57c7 +0efeb5654da456c6 +0f12b97e0e4c7e21 +0f18fb6736efb1c2 
+0f1f245fa1c181ae +0f2197967bb7fa43 +0f25241e37e16f56 +0f47577ab3441480 +0f4f779411b45b6f +0f540553fc30f16d +0f59c103684c0437 +0f5bb0704084e290 +0f5c5385dbcd96df +0f61837b9749da34 +0f6206df8a8e440a +0f620bfafa25fcf5 +0f64c5e4fead6cf2 +0f68374b76390082 +0f7061acbeed50dd +0f7267e7e369b7d6 +0f7e8bf1137abcac +0f97153fcaafc80d +0fb6678e63316201 +0fbe6d76015f75d4 +0fd536fc3c8fdf19 +0fe2286088ece98e +0ff193c92d415b18 +0ff896ed26db5da3 +0ffd1083a70c6968 +10002e18d04c3d93 +1025622b7f308760 +10351dc7a37a44c1 +103777494841b376 +1047cc04fa16e0d7 +104c9a27980f9bb7 +1058fe0400a873e0 +1072aae07584e091 +107b78daf075d371 +107d3d674fdebd31 +10a3511b61f40243 +10ac0ae67d317d11 +10ad4fc499c48b38 +10c4c9600bfaa4d8 +10c551ef9644ea03 +10c8e54590f715f7 +10ea3faaa29f4a88 +10f4acad3ed87288 +10f9d6f46e438d36 +10fbe4690dec6258 +111356766833a7df +11337164b772b7c9 +11491a312c6b8f58 +114d9c301b847239 +1150003196de2529 +115fa3a1923b7c9f +117335f5d67368ca +118f563fe2ed4998 +118ffef2ad3950f5 +11a680776863b321 +11bf4b38f88bfe9b +11d5f4e7b0b17565 +11e62395c85c250d +11e9cb1ccb9abe9f +11eb02d24a3241a9 +11eb4e9eec5048f2 +11fcbcdb1dfefb38 +1203cc23b881ab8d +1214f2a11a9fc1ed +1225476a1221ce08 +1227d00562c106e2 +12293b264f68673d +122cb7d5ea4a99df +1247b2ac5986205b +12497730f691d00c +1250e369ad1e2fbc +1259726fc1f8e966 +125c92c36a04a68a +126067199873816a +12627e75d51b372e +12691b0622a823ba +127884736471b631 +128bf83073de3ba1 +129b5e9b80cc4b4b +12a5cf6bbe330edb +12a70416c92a9483 +12b7562944c06836 +12cb4aa3f5b59ad6 +12d4ca1236a2cf26 +12dc074fab6ada73 +12dce44829d88985 +12e3b1ad12a752f6 +12e3d03a933c2eb4 +12e6ba92e82c7ca4 +12e985eaa4b79298 +12f45658983d380d +131773f46d989860 +132bbf5a9e9626ce +133ee6e537353604 +134d7e5a74497a82 +13686755488a9d51 +138bb7b0b25e4669 +139055b26734436f +139a615209ee09ac +13ac6a6a3a4f5e5d +13adf913ea857ddf +13c4059da4e56a8c +13c510a7403f8231 +13fcc228c40a0e67 +1402fce28722610b +1424acd0007d40b5 +1433f61e9591ea9b +14417ac810f2024f +144b95c0c3fbe3b0 +144c2c2c52734f15 +145708c0216a06a7 
+145da324f69d1c6b +146581180e89666d +14a5b002ce46d4d3 +14bfd05497764243 +14cdf4aa7a2de14b +14cf1f92ca13d605 +14e3fec07ba502d7 +14e540cf0ff7ff91 +14f0b962fabfaae2 +14f477e7d5af5b91 +150b45a39c57623d +1512ea7a9754ac34 +1513c8f030f4cbe2 +1515d37824dd6b22 +1526707312c94a92 +154072be49bb3c1d +154813fc1d6820dc +1564900dd040c718 +156b422215789c18 +156f4c7dca878ff2 +15729869d1862b7f +15762acaba295de1 +1593596b99e2dde9 +15a138312ad94718 +15aba05919bae167 +15ac594106229c62 +15b902774d67a394 +15b93cbe9fc5220d +15bc7fa1ed5567cc +15bc9eb752c6dcbd +15c28f4ada02cf91 +15c9a45c9c3d73f2 +15ca8e1fce488c19 +15d324ad8ff2dd83 +15d4131d721f1b5e +15d4a976e4e7d3dd +15d8b7e256ffd066 +15e0783c6b9683be +15f8a54e4822f355 +15ffcb1f98d41218 +1648b3e7471e3766 +165696025b477097 +166818269e4e2568 +166bb958d7f4798a +167c2e0c6e9ffa5d +168c85ce00de0c6b +168e0f4071d2fe58 +1692fab166811028 +16930ebf3f0f6b84 +1695a74d194b65c0 +16a23081fd821e92 +16a2c55f96e6aa18 +16a75920e79c3710 +16a7b5a41f31feb8 +16b3eea75ad753ef +16b48792e910cf49 +16b667d681f8cf25 +16b8ab24bd231f9a +16c4484a4093e2f6 +16ca2db2f920c1b4 +16cd4f1cb2a467c1 +16da9b4fdfe4883b +16ed45d1ce9017df +16ef89980e2ceef7 +171403db6cb88926 +171b3a4c2f95f981 +171fdcc554d303d8 +172bd46c6ddda95d +172c99489b18e0e6 +1735d8d1b4015669 +173a82eeea56aed3 +17428a1f23edf411 +17519e763b34fe14 +17535adda2aa3b90 +1778e784d47e035e +177d39d72e983b69 +177ff3969577b8de +178cef169356d4a5 +1798c9640d8875e6 +179a1357a581ad51 +179ff8424ec7ad13 +17a39d87a22ac1ec +17a3e731c4fe0aaf +17a489f3cec39fea +17a75f0b036c9cf8 +17c234c2eec050a0 +17ca3b8ad5815b35 +17d35e133dc3ce90 +17d841670d2da942 +17d9303ee77c3a3d +17dba5fa8138ed92 +17f552ef56d85c55 +17fc81293f337cc3 +180542b70f713d5b +18089956e2be2289 +180bf845cc8cada3 +182054e13eaf58fc +18297e1f8e25d3ee +1837b8eebc9c2457 +1839244b04a05e5a +1840ae9e2494443e +18502a6651367e71 +1859766466c069b1 +186f18684ed4b516 +188e6f96fa74ebe7 +18a3593609eb3269 +18a86c01aaaaaa8b +18be7c1b9895691f +18c6473be3bd827a +18cde229723f22df +18ce480be0ececbd 
+18d9631f5eb45b87 +18e659699338f835 +1910e79a60d57aa7 +19267b6a68d2701b +1930a64d9a119b13 +19310eb8261d4bb2 +19391676f0fc7982 +193c3bd339eb0a75 +195074aeac8bbd76 +195ca8350ff27f6f +195fbef2c08715e9 +196069a792ebbf31 +1961bb85524de229 +19694d2dc528d75d +196ffec2c68cafd5 +19a28e5c25feb31f +19b0cd79a126e8bd +19ec130ecea98d5e +1a04733e4ee45c90 +1b65111d34f57bb0 +1b74274269c75c8d +1b747b8eba6f7b45 +1b87f55dec310243 +1b881261742799f7 +1b88fb5063cd8916 +1bc87c160d1dc982 +1bc87f52eba89cc4 +1bce163cad1e1d20 +1bdf9dd7628ddb0b +1be3758972b35151 +1be5a5c98a51b1cc +1be80ff36848e758 +1beb8e6662d36ee6 +1bf49251fdd23cc0 +1bf4fe9301893904 +1bf668db0194cf83 +1c11709814a1a2f2 +1c144e2404c5da89 +1c1b2e56952040cc +1c29b1d8fd1dab3f +1c36b2b8144d29ec +1c375830155fe6dc +1c5514d49d61bafc +1c58aa75858147f1 +1c73def8a62301a9 +1c742e548e8698c9 +1c7b3ccd5482f834 +1c7b9f93752085f4 +1c840c855f0c8421 +1c8d34a791deaaf1 +1c919c7e4ec601de +1c9772d765e0679e +1ca02bf1c0b65675 +1ca4db19711258a0 +1cbab4f69b2d48ce +1cc1ff58dc89d230 +1cca8650a292e7b0 +1cd3638cceebed08 +1cddaac7be8ecfa5 +1cdfd3abfcc3a64e +1ce68f950e7cdf8c +1ce8503fd200fed2 +1d04977fb85a8b3c +1d125b16063c96c4 +1d1cafa3e27da040 +1d2cab92bdcc1453 +1d36cd02a549e244 +1d46e25b06eef337 +1d4d81f8629b119e +1d704b9365e9c86b +1d748783383ad977 +1d7af31482baf61c +1d8017cad8dc1d56 +1d9254a5cb93d4ff +1da33647873725fd +1dad44855584a4ea +1db274e904e3fb07 +1db5a4df1ab8b8e7 +1dcd8aee9a39a61a +1dd869f66c3c9497 +1de1b73fe4d6aa77 +1deec169175eb15b +1defdda324307269 +1df20a29cdec61a4 +1e01b910ceba4573 +1e0f97ec8f5aa374 +1e1742072c0b2d6b +1e1e13de4ebea05a +1e2a2be2df033527 +1e5548d951e91a40 +1e80fd6e7507e3be +1e847ebd7cd1174e +1e969786d2a8c7a2 +1e9b1dc1c096d68d +1eab4db6941be725 +1eb03f0e3088edf0 +1ebb496a04a1bd76 +1ec6fa3de6fa1bf0 +1ec7e7dce1175aee +1eca36ec55b88fe4 +1ed5e32330ec25e8 +1edc6b95e84127b6 +1ef9f5dfa615fafe +1efbd8f8949b15e8 +1f079ee70c21002d +1f084607245e4462 +1f0e06e4388dd600 +1f0f5e82e9d0f9ee +1f1a76ed6db1dae5 +1f2153b5fb50d41a +1f4279d98e283206 
+1f56ccfabbfce568 +1f5df6019b0bb73c +1f62165dfec00c3e +1f73124222492f1c +1f7770ac5cbb41eb +1f925fcf391591df +1fa073504b4facaf +1fb651cd12893f99 +1fcc400e42725a95 +1fcf851e236dec35 +1fcfd8a36e171639 +1fd5f9af785e6e5c +1fd615fea825fe87 +1fe394077e7c3de0 +1fe9f9bc178a1778 +1fef543188ace6e5 +2007d4829b187feb +200ad247448c5577 +20100b779d28b6d5 +20171db88f887218 +202a627de66ad397 +20384754a6e5d1b0 +203b7543bb3387ce +203c8a4d66c74338 +20422596003ac855 +204fc9ff2c7ff92c +2060cce4dad6f988 +2064e46352532375 +206d9828d717139e +2075a2388413899d +20764a96cc70fe46 +20b38e0a985506ba +20b541350492e3ad +20bab82d0268c877 +20d0e788abca4aa9 +20d1b02740ef1124 +20d86cff490c0c42 +20de87f0b3f2d136 +20e7a3651ec30386 +20e9bf845c4bd9e7 +20f8c6738e22e764 +20fba1d53c349851 +21005252fe2383ba +210fc445b9e254f5 +213286ef58a8c73d +214c597029aebf9f +214df1c2863d2959 +2177ca3a775a9ee9 +21840c44aed0ae43 +218ba0fa826d3eea +219825f542e6ee4b +21a23c81331b0027 +21a6081709444ebf +21b26eda16f7cb88 +21b548b570c0b415 +21b9d476c5a49c63 +21d9134faec148f2 +21e794f71e31becb +21e97eb0cfbff775 +21f3cc00e0cfe8bc +22085848f943c2c6 +220d718317f7a025 +221205ddf59c5156 +2217c43ddaa29027 +22274e48b847c860 +2236eab1c5c86fc4 +223b5e20753d4fa3 +223c9627b978d127 +2245fcd7f76c2ecd +224e9686747afbb5 +22552c9a2a2a2ce7 +22598e2596e6bae7 +225ae1d37a7fa519 +225c5f2cdcd2753c +226646771975e6db +22666111b2180af9 +2276982dbc5a23e2 +227c21a8dd87a153 +227d63ed9a678fb1 +227e06087cbffb2b +228ed4b87c8a6ea7 +22a0db80d91128e4 +22b16c2f5af0f3ef +22b86e38854ad186 +22c8b35c589276c4 +22d6e3fefb1ee7fe +22da7610855d6b9d +22e6c736f2f7227b +23099812f662b3ec +2313ea0fb17cbed6 +2326e9820982ad81 +232a0b3133326242 +232aee1c62a1cd8b +233d7cca6c4c628b +2341162bce213f2e +234271629f7099df +2347b1e3e70842ac +234c14a79d4da1ff +236f9dc6456cf32c +237ef12cd69e2aa0 +23808c0cfcc72e72 +23a6c9168abdb38e +23beb4b246e236fb +23cfdadab7cc51a1 +23d7f14af4b7ba08 +23dcee801bca67bf +23df266716914368 +23e428c0dc43f046 +240e89ef33ff15b6 +2412e9f45282fd15 +241aa9bcdbdc7ac6 
+2422f760ea77551d +242fc4972c7bb385 +2445756494ef6e3d +24548ce6c15bc2cf +24598987691df957 +246579087204ba8f +24668d960406587f +2473b5003a95628c +247bc2e47eb7f6fa +2482c4388b32f225 +24895a02057db66e +249fd0890d439aa9 +24ad46fd2b26b208 +24b4e8ff5a9a6439 +24bcb936908f3a31 +24d1c4f7497f8e77 +24d95746999b7f7e +24d97ac8e96e7a5e +24f06a46ea08c03c +24fe21f0a899701f +24fea6c2c7caa434 +2516b6023683fc3e +252a24d25a1ea81d +25468a86d9cd851e +2552fd444d04ef21 +259f6f1f002d2d94 +25aa5f50072ce7f4 +25ad11c04b852de5 +25b3854e6efb747d +27d163d7046d36b5 +27daccf898b206de +27f3ccfb3199499e +27f772c12c97b594 +280443260e3dced9 +2807e5ac66c140cc +281452e730c39fd0 +28190e57702bcfbc +282ad05cb0113543 +283059a56e7f3e75 +283334520a3f8a43 +2837dd5c75e026f5 +284efc2041b1d1d8 +285aef90afaaf565 +286393e1e797cdad +2864dc6c129cf3cc +2868a1b43e9eceff +28742766eb882cd2 +28760a14d0a5ff3a +288dd40199dee268 +289bcb973ed702a1 +289ce0f2b82dcd0e +28a5318660ab60ba +28b06f7087798198 +28b23ec38ac5c0b1 +28c9d20b865f5d56 +28e8300e004ab30b +28ed97894371982f +28f59b68509ce59d +28f5ebf3c3e2fe54 +291bc22350620114 +291db63458af0613 +293c7c1ccaa6861a +293e02c7c1fa31a8 +2945a940639798ce +29460ac4580ea232 +296241182c2df900 +296c87d370b03f17 +296ec0d98f4d4151 +29791bf60e718c6b +297b57a9296052ce +298276fb3c0330e5 +29832cbdb4144601 +298c394f21c62ae6 +29927d9ef0b472d7 +29a09527214b3dc5 +29c8267c1d10b23e +29e0bfbad00f0d5e +29f0d7c051d80035 +29f52c76f269ae48 +2a058bafbecaccf9 +2a1769dddc1dbf8d +2a1fed061b29b25b +2a2d971fd44ae258 +2a30c5309018e00f +2a39e3b6061dd887 +2a3baeaa72b86812 +2a3bd0a2ac422822 +2a41503583d146ed +2a4d835a6e023621 +2a695d52faf1949e +2a7387e017c241a4 +2a89b2a52cee9f5f +2a8cd9f87b3c9a2b +2a8ef9e44f580d13 +2a9d8ba86290db0e +2aa1e311e4bc039b +2ab72e30a616dd21 +2ac712ac8d2fd488 +2ac7eecd3cd0252f +2aca85b3bdf90a09 +2ad09b7837010330 +2ad4ea800caafe09 +2ad50852a84faf51 +2af206730de6f439 +2afb8a0a98e15155 +2b0b2259a7216762 +2b0cbc443e6c3c6d +2b1303680b081ccb +2b1a013698fea3a1 +2b1da1fbe7f18f7e +2b1e0225f0952a09 
+2b38cc883c900d33 +2b3e3c5d30c17bd9 +2b41e71d509a8b8f +2b43428fa1cf1a7e +2b4a934049f932d0 +2b4c1f50687b2bcf +2b4e1061f6415a4a +2b4f6fdcabf53d59 +2b5b5b4f4fc526ba +2b625e92f2cf9de4 +2b81fbc1af01f0ad +2b8778726c1f2fe4 +2b8f367d01df3601 +2b973e6f676eb243 +2bcb26e95f5d152b +2bd7cee1fa9c8996 +2be655d4137e6e29 +2bec33eeeab0bb9d +2beffa088960f673 +2bff9ec89ca982c9 +2c02607ae436a9fb +2c04a38ae16197b2 +2c16104a0ed6c8aa +2c2cfc0ac780a3aa +2c3a96fb820e1ab7 +2c48ab563b92a1d5 +2c5249093fc26fde +2c52d9d606a3ece2 +2c55c5a96b50ab36 +2c596a5abfd67267 +2c5e21f9f91e2e09 +2c6e76b362eed8e4 +2c6fb46edb748fe5 +2c805f56d92a2e22 +2c80f9eb0d3b2bb4 +2c9018ef57c6b061 +2c9b5e69fe7f0338 +2ca8e34592e0c415 +2cb9869cb05a9a01 +2cc330488326fd4f +2cc5f95fbe24ffe5 +2cc8ef9e5319d5d7 +2ccd2d98696c87e0 +2cd1705407546b72 +2cd27189549897bf +2cecdd7df86ff8d3 +2ceec371086f5d82 +2cf1a544b179b1a7 +2d0bd035f7df86b0 +2d0e6766c725becc +2d16da80d7e3b64b +2d29ff162920db5e +2d39f39fb8254c27 +2d3e1349898addb6 +2d3f982ada31489c +2d4e81e66ce80039 +2d524e9324228d6e +2d5e1c16ba1f89c2 +2d60837ef2e52abd +2d62a5d66d6e4931 +2d6d5e82bda0611c +2d6f9fa00dcee664 +2d7d7fb53d960909 +2d815b3e5e9bb237 +2d8f1ccdb70c156a +2d99ba7951695f79 +2da082dfb7a66b4d +2da2eeb966bc0ef8 +2da38ca64192354f +2dc3af70d25d3043 +2dd67f5e68c8d72b +2ddca94aefd55b8f +2deacec5c281fbac +2defb4625a3ccb54 +2e04dfa4a1671292 +2e06abf6286040e2 +2e109379f53bb221 +2e261d7661282e40 +2e2ad99d45033d6a +2e30ae101611cca8 +2e35fc35559543f2 +2e4013ea92d04301 +2e470f6e2c83566a +2e4c69143b09033c +2e4cac06a4f92261 +2e554e99d045b484 +2e619c31122ee40f +2e64a2d17f9a76f7 +2e6876c6c1e40652 +2e715b2e0162f768 +2e715d4c5bf6c45e +2e7b5f8836ab642c +2e7ffcba51990c93 +2e86767798c005df +2e96b2142fd337dd +2e9d6a76c40b707f +2ea3133861ebde3b +2eb515bff528d3c3 +2ebedb0f027df101 +2ed398e8368e0c6f +2ef881551a7fda22 +2f18f5579583e648 +2f25826f0d0ef09a +2f3ec1f2335489d2 +2f4bcd593fe37158 +2f5af4b429b2c992 +2f7f2369486cc959 +2f878176347bcf9e +2f8e1946600c65d4 +2f98ee24d3fc43a2 +2fbb94b8cf388ba7 
+2fca5797ae48529b +2fdfa70413053b84 +2fe5274c4baf665f +2ff40df261e17697 +2ffe00ad70fe9c00 +30006eb23f62aa57 +30029fac7c5621de +30127e00a789ed7c +30140756550dc38e +3015a3eab4b6d042 +3018aa8ad3eb5dca +3021337b3fbdb2f3 +304dd8f38dbeef0f +3052da93ceeda447 +3059f523501dfd97 +3069cc190d78e55e +306d435307fef477 +306e2b7785657539 +308681a294d1417b +3087828bc27bc4c7 +308de3d523189c72 +3094afb27266ee6a +309bed43e4406d72 +30abedc6c413510e +30b1d229ad4c6353 +30b1dffa5f783ecb +30bd5b88e47f6d0e +30efdcef9f38568b +30fc5fc78c5a716e +30fceb3b40ec062f +3105d330651c2726 +310cbf1c65c52fa7 +3115672ced7c5694 +3115ce06e0160828 +311d8515bd115aba +311db764bdc7f537 +31262e902165f348 +312c8e1f1f9f3594 +314c56aad151508d +314c584ff3842715 +316035dd285c5e27 +31607cc68ada0108 +3164f4c30188d403 +316ed1b489ff8f40 +3172feb32990cf09 +3176f1532a468cbd +31838e9542a906be +3185302b2275b009 +318cc6a39c0acc71 +31a087ee5b1976da +31a642fe4cd1e232 +31afa3dcf3b737fa +31b5667e16de1d94 +31b8eb8bbebed9b8 +31bf989cb15492d4 +31c79c843555c2c6 +31d903660e1647c5 +31ee8cabc96b9a62 +321911ae4a16f038 +322261824c4a3003 +32294ad73efca3db +322d03d487fc0f01 +3232f7457b27dbbc +325ff82707386438 +3260a42ccfba8973 +32615afea87b52dd +3289ad7a811d2348 +3290731e5f908b92 +32991f419c96ea0e +329aba411f341398 +329abf340d23b0b9 +32a2c04fd8321bbb +32cccd10c84b4529 +32ce9b303717a29d +32d2163aa65c0e8a +32d28b7513f873be +32db375ab51d77a4 +32dc25ef78b564a1 +32dfef9109202812 +32eba3e4cfb61f93 +330b925cef643b3f +3317c40fd3e0a7b7 +332294213fa15c56 +33288d55dde83e72 +333c649f75c3c7bc +3349f3089ea84d6b +33517e9838fe5f20 +335794d48a9b168a +33842f4b169e4145 +338ff9f6c02a6a40 +339c95e2e709d044 +33a29a351c1d9800 +33a3f65849195eba +33a3fc21efdc8547 +33a5a85f06fcc77b +33a93f85d5713a71 +33ac471b97ddf5c5 +33afdeba3cd5af05 +33baf3e18e5d7256 +33be1ba5aec86c96 +33c1bb87a88e59f1 +33dbdc4396938ae1 +33e23b97daf5d9f0 +33e5bb3820c171a5 +33f1be3a9ccf4e4b +33f242465f51563c +33f4eeb64d0c9c1a +33f7565ccb685cb7 +35b15e97674edec3 +35b994780d720894 +35d5a242ba40f31f 
+35e8c6c2168dd087 +35eeb3ce1b3dd01d +35f89d3ac607bd5a +36056faee50a621f +3613c77d8c234008 +361d722ef5009e09 +3628ec0337eae7be +362be988d0b68a4c +365cc620c2fcbb05 +365fc12b4f33ada3 +3662b5f2916b470c +368fa2dd830843c7 +368fae6f3bc0b0f7 +36977643258aa392 +369f3639d9605255 +36afe96c11a8211c +36bc6918e9fc837c +36bd5d3f3ee292cc +36c5e00f55c4f217 +36cb04da872d3bce +36d85599a9cbb6b3 +36e69606a7599644 +36e94b0ad3a62c7d +36f4df3e0a1ade5f +36fb4d41b00581c4 +36fc018c7b62b997 +3714123f055e06a1 +371c9182ffd46ce3 +372bb866143b6e35 +372f324a1f4d6898 +373ab0a1009e0316 +374b2f4abac6dbb9 +374fa34fe701a30d +375a7fbf80d09c92 +375cff10cab07955 +375f9c448cf31ccb +37697c41773d597a +3776e900791c1553 +377769942e6a748d +3783162ee796a21c +3784777516b00247 +378cb83947bf2d23 +379470c7d22c498a +379884ce61c4daa7 +37a6ae1e1c6eff66 +37a6b3200493fac0 +37a960afb176c485 +37b0b70a1a0c25d3 +37bbec9e46c8c1a9 +37c99410741fbca5 +37ce88a12d77382b +37d09bda74c92a93 +37d0f351f07ee925 +37d4e43b2b029a80 +37dbb9846f2fdc01 +37de8da2580d0c1d +37ff2186f55b3fd8 +37ff932a6a608c24 +3814a3a8046c8af3 +3824335ebd7a4097 +382a5736d9134153 +3858fc4475d10c78 +385f9444d20eb160 +38615248b52e2834 +3862411e9bf455cd +388b75bd17c5332e +388cb2f0ff1a6cad +388ed39170b69946 +389851bf0ac38227 +389df03f3c2d7291 +389f65e97bd902b2 +38a9a0f5e76103d2 +38b7a1d23745fbf3 +38d44ebf460ac132 +38f7ba7fd9a83069 +39074bca3524418b +390ddb7ee9b716ca +391bc7d21641283e +3928ea8b8c134846 +392c21ee30b21459 +394037c064421c3e +39424be692a88364 +3953f37661087a95 +395caf9235fde098 +396065ee739ea046 +397037082e6eb839 +3970859f54703c88 +397b050e345d73fd +397bbed49e1ee8dc +397da8e32c2edd65 +3984d005557cbd6f +398c4688209874c9 +39987911e4cf003c +399cfd9cfacc0499 +39b3e7b2a8bf30a6 +39b58270c2e99310 +39c41be5e76c79c1 +39c662fa32a3b5c6 +39e5f256790c3343 +39f03d5fb1807102 +39f1b33acc70ad7b +39f444469cf39006 +3a0328ed13dd8c8b +3a10eb9788bcdfa1 +3a126bd9702ee8f7 +3a3bc11b9ebb7d44 +3a488ff3afa463e2 +3a48dfbd2f0977f9 +3a4d7cbcf0c84668 +3a52947c66de5920 +3a642c6d0e43510b 
+3a6dc09185951ae3 +3a79b9aefafb0b8d +3a86a812a1eaa20e +3a9c883b11e86530 +3a9fa6535917a07a +3aae131a319acd17 +3aaed2e6422d7d57 +3ab70559ec30a57c +3ac32347c3ff7d38 +3ad4793daf6adc19 +3af43fe8d514f7c5 +3af4c4e5a8ced21e +3af70052616f7fa1 +3b0b55657925fb34 +3b122e1becb5fcb7 +3b1c57027302837f +3b1f9cedfc40b06c +3b273cb40c55db95 +3b3880eb01373479 +3b434b5302dea908 +3b58206d99feb4b7 +3b59c7d97b900724 +3b650a9e2ebdfde2 +3b67613d97aac1df +3b676f25b54dcc1c +3b6d8db52c54b174 +3b7443b24830d388 +3b8167415736169c +3b84ac07fd85bb3f +3b9420585a1e66fc +3b9e04113b202116 +3bad929f21fc4336 +3bae42d603be2266 +3bb1007fcf0e03ff +3bb70a92a0d384e1 +3bceca99e87d64c5 +3be029de36008afc +3c054be9bdb304ee +3c19657356e9e229 +3c31d7b9f2792ed9 +3c33566bacd602f2 +3c35b868a8ec3433 +3c44d53659dbe4fe +3c44f7a30e0ad967 +3c5163ede747b187 +3c64a373bc1c53bd +3c83e9817c9e022d +3c84329b60bfa7cb +3c85dbda51f7e9b3 +3c90ad3bb72adcf8 +3c9c37132583a3d2 +3cb1489b614e5f39 +3cc1fcf538c81442 +3cc40f129447cb31 +3cc4c306db84c6fe +3cc97c3d778975f2 +3ce8f87fcfc988a8 +3ce90c0ea2537c48 +3ced9d0b56769bb7 +3cf461bc6d626ed0 +3cfb4c69b14a1970 +3cfb6cb5052ce744 +3d0a0fecfbdada35 +3d2486ac8822da47 +3d2f4958db5aefbf +3d394fbabe0e733d +3d410da4d7fd9f64 +3d4645318868a4f3 +3d5114f5d7496cdd +3d5125567924e37b +3d584707e2f3ccf3 +3d60041ab79f46fc +3d6e04af63ebfea4 +3d6ed8b43655929b +3d7a1ebc77f683b4 +3d8d29bf0d9f24a4 +3d8d753f0851bf3b +3d986ec2fd6d210d +3db49ddb3f470436 +3dba1838ed366ab5 +3dba9cb74bfb79b2 +3dc0058dce3828d9 +3dcdffe3b9c6235b +3dcef43736468b29 +3dd211f3865fc234 +3de41ace235a3a13 +3dfaa97cd48a0332 +3e00b129b656fbce +3e034bde9426ae9f +3e07add8413f8157 +3e1236935a5f70ae +3e1af0b953407ef7 +3e3d858083d20eab +3e4057a188e15ac3 +3e577a3be646152b +3e5f747d06bc84a3 +3e68931874661724 +3e6d44a66c0d7a0a +3e746126204810a4 +3e8363be673dafa2 +3e8dd5a6930ecb92 +3e94e6706fcdccfa +3ea8d9787998f70a +3eac186b3e7badb2 +3eac742acbd69adf +3eb185c04280412d +3eb718c3170fcd8e +3eb795302924e912 +3ed3ffd0ae9c3224 +3edff71624eac3ee +3ee30754edfbdb3f 
+3eef492bf5120757 +3efda95897eb23d1 +3f0d9e856d93b8b3 +3f1c5b36d217d345 +3f265c5edb13f00d +3f2bf7371b72e40f +3f33ed2971149ea0 +3f45b8234504020c +3f4f553239e96d90 +3f5454f2f53e2103 +3f68a1e365e94eb4 +3f6e7ee98174056b +3f79dc32d575bcdc +3f7b6f511421e395 +3f89e23583c36441 +3f8d1edf59e70df3 +3f9b08ed34ec795a +3faac9603907b329 +3fb3327a177a0175 +3fc266558ec5c07b +3fc2c221557a205c +3fcf6c1b81b14af5 +3fd084afa49b6499 +3fdaa028b8baad4a +3fe382b2ae6c9361 +3fe783b9c7c8f492 +401a94bd9d84f501 +401e10a4352fba1c +40284c1baec06ac9 +403951b5d632b5ab +404043fe2f398440 +4043989a4ae95a01 +4054d32655ba5eda +406bdec5b68b1a71 +407eefe8017f6070 +4089ef1b1bdb1d36 +40904cd4b9e0579d +4091c41c6909da3b +40954e72e02dc771 +40c517d28a412a5a +40ca76de44a6e1a9 +40f6d540b9b16531 +40f92f1e65a5e1dd +41016527728cae5b +4118895a33890c5a +411c4dd047c49cde +41210bec1c0c87e8 +413062cf685711e7 +4130aefaca885090 +414e2bf42ee45cc4 +4161944d7d592071 +41649e3e8f9a4be0 +4175cb4c71c984ec +418ad7b9e78208cb +41936ce6152fee64 +41a3a167ea5d9e88 +41a55418bee59b11 +41aa58e688a04336 +41abd737e0228c1a +41be40880094d8d1 +41c2ed4944dec77f +41c600ed9f88871b +41d8b4350913ca64 +41e428b3c7a16695 +41f3291d82fc4d93 +41f438dd19aae981 +41fafa6144b58c39 +4200282fe9b4015a +420bdc53a6928b32 +4213b6b3b673f9b5 +4221bc1d4aea1a02 +4227369e7d0e735a +422d976591ab629e +4242fb49c775710c +424397db4b1cf634 +4246a11f0971a231 +424f597efdad3067 +42565e9d863220ae +4260ca20e2430c67 +4263257ec6099434 +42742db2633d2eb5 +427c035484d45682 +42b086af2a1e5d98 +42b218cc2f794026 +42b88f7ee71a7ba9 +42c2c85060ab5233 +42c75d578535b0fe +42cab73e14195475 +42cc82972397863b +42d8b53a15001cd5 +42f700b22cb0be39 +42f761f7e655bdce +4308efab35deb3ec +430c6d1f8676fabc +430d7b5b77861810 +430d8fece8e0f7e5 +4317016b336431be +431795f999dc215f +4319ddbd5e8f20e1 +431e6542fde13130 +432a9cfcf53ef717 +432fb354aa710e62 +43361dbc0c5a2808 +433742b23712bf06 +436a235ed74c3d89 +43759ced44693671 +4393f3a15ed6fb9a +4393f3c42606c573 +43c329f7c0b40258 +43db35d743e6be54 +43db8c6515021c01 
+43fee307c6339b5e +44095e87bee5475e +440b5d1587251680 +44138776bdbfe28c +4422b38e60e3bc2f +442ad5ba8e834889 +443b1691d94c1b3a +443e5a7e679e3e94 +444d7d8445cd444e +446626a2bd617d24 +446f557155994097 +449c34eaea295942 +44a85c75cf4a6da8 +44ab295bc3092c28 +44accffce93c7e87 +44adc8d00568380f +44b13221c50914d9 +44b78f9fcb5cd8d8 +44be029ec85609c5 +44c16554a21aa6af +44d12349e0609ba3 +44d2532c5b5296a1 +44f9aac9faaac569 +4500d9faefff3a41 +45064c8142f3a360 +450cf402f042bfd2 +45108618c40e26a7 +45122648522d4180 +45166f266dd609a3 +4526c5ac1bfebcfb +452e9c4e4729ddff +452fff658953aab1 +453c980210335f26 +454197dc5b50b45f +454fc1e32db7cc41 +45536907ffef7585 +45592a7f307bccd0 +455964aa4ead1e2f +4560f57598efe5ab +457a1ed78b1ddb01 +45823117f0acb627 +4596160a24b1af1f +459a954b63f98d8a +45a00d135c5388fc +45a0fe252a89e008 +45a4515834848010 +45ac5168bda9d3e2 +45cb862034851efe +45dce690caec2917 +45e5fcd5c8978342 +45e6fa48ddd00e87 +45e81a557d2dd78b +45f2d7abb5fafaa8 +45f5a75e63afd4a2 +460455f96fa1a1d6 +46067fb6d992860e +460e2066b64b2a40 +4615277ffb68ca9d +46200541f9943d16 +4634124b21b763f6 +4636beec02aa8dce +464d63c227f26d09 +464d97e527dd5f8a +464e3851f923f8d0 +46502a6038bd288f +466150f780ad7b80 +4675ea4e00c2544d +46889bb1803c5cd7 +46a4d49d61a86d37 +46b2a13f6ab0be05 +46c9e2d86e7d4c41 +46de062e5ff787c5 +46df36a031f50a04 +46e0654ccb5d88cf +46e2ddf094d0c3a1 +46e653208e529783 +46ea97f6f3757209 +46eccb4820f5a4eb +46f840365cee9c44 +46fb9c990b6f8114 +46fdfa2a16c7c811 +47191aa41a979900 +471abe46b812be64 +472e2674ece00632 +473e6ec61583d90f +474afb2d4641a228 +474d403238a41315 +47573da5cb0e5e44 +4766f2062abaaf74 +4773f5327489d57a +477df7ad0c2e7fdb +47a07f51fd3fef77 +47a1f1f01e2b7be6 +47a66fa042406908 +47a76ca10546fe8f +47b5d62899ea4869 +47c88dcfb1134255 +47cae76cd53c752a +47d11d4bee6608e0 +47d9493675e58f3b +4803cf5deca2b38a +4811a66f87b0dd6a +4828fb60e4a871ee +482c3e92080f18c5 +482ce5c63038e5b4 +4833b3d2a8184313 +483c2b4c67e32c19 +485f996ecf360da7 +48614bc62c3acbf8 +48638883e537ccda +486970d685c0b746 
+4879ebbcfd888f5b +487d83675a8d1574 +4881a65d7476d6dd +489254d4b26a04c6 +489f9441d513634e +48a3049cabb54c0d +48aff7218b00d843 +48b1808c546c7e87 +48bb743178166598 +48d4444b94c2a2c0 +48e49bdd1aa706e1 +48f9cd996f80c34d +4905bc8817511dd2 +4908fab97c9bcec7 +49235d402cbb8895 +4938177a0e6e2fbe +493bb055f33cb256 +4949361d0831c838 +494f87170e713843 +4953a5140e2f439b +4955f54f807ef5aa +495f50f0997e986e +497364635884d8aa +497d2450ed65a678 +497f507a5901bf4b +498688760312447b +4989f6cc2b43d2c6 +49b8f80c849dc341 +49be5de41d619cb1 +49c758aa3c35ed86 +49c9324758b5e867 +49cc37f7f96be5a9 +49d4a7288f6b5dac +49d5b942442449b4 +49e48d66787ecb8f +49ec7608e51f7ee2 +49f6f14c580b71b9 +4a046d13e389b505 +4a177be7db12edcd +4a1920283e3087de +4a1b9fb940541809 +4a1d79baac733df7 +4a2d6753676df096 +4a566b7e6eeaf9b4 +4a582ee23dd05a8d +4a6a057fc644624e +4a736d7c30ae9280 +4a763e1b87e495a7 +4a7e531e1a35d424 +4a7f0556fb58a5cc +4a8f9d6889992fd9 +4aa1973c40d2eb93 +4aa594c0ad661f28 +4ac044dcaa428723 +4acd145b0c133dca +4ace59951acbae4e +4ade6d5fe4b32738 +4ae4456267802484 +4b0fdb10ae15684b +4b341307a872487c +4b3644bfc6083588 +4b41d03353967b40 +4b457a008376cd73 +4b461c1ec52a3076 +4b4c0c27204604a3 +4b5619958277861f +4b5a6dd314bebe88 +4b6e9a02975ef9a4 +4b7071b34e8cc67d +4b7e7de9132f4149 +4b85062505816744 +4b86587ecd3325f4 +4ba7caa04cea37b7 +4bae8b3980bf32d0 +4bc1c3a888a8bfba +4bc203e17758f3a0 +4bc47dc7f8781812 +4bc7ca44cc62b8b1 +4bd922d1e75cc936 +4bdb70500b99c91f +4bdfa30358809038 +4be8bf31940bd475 +4befac16ffdf8489 +4bf9ef4705f35e8f +4c0ef61c55467706 +4c144eb40a09a0dd +4c169a41e66b6599 +4c2383e60aaf26cc +4c2d32a7f2b62657 +4c2ed13774ae4613 +4c4fa41951e37e78 +4c502551adddea8b +4c5fd496905b91ce +4c69bf407b142b93 +4c76898a3d535741 +4c791225522d45ba +4c8fd5318ae8d467 +4c9030a5917a1328 +4c943ac66f6c277e +4c96e034f8af77d6 +4ca952ede2af6578 +4cb669bd62a4ffb8 +4cbb82a6bab25a0f +4cc48509585e4157 +4cc4c8a8cfa8e944 +4cda491521679291 +4ce58504b055463e +4ce642bc93f1bb5a +4cf74ffa5bfd5904 \ No newline at end of file diff 
--git a/evaluation/preprocess/re10k_test_1800.txt b/evaluation/preprocess/re10k_test_1800.txt new file mode 100644 index 00000000..c8446e9e --- /dev/null +++ b/evaluation/preprocess/re10k_test_1800.txt @@ -0,0 +1,1832 @@ +1839244b04a05e5a +2da38ca64192354f +4308efab35deb3ec +06db5bb2465ae58e +3f265c5edb13f00d +37d4e43b2b029a80 +30127e00a789ed7c +1db274e904e3fb07 +1d46e25b06eef337 +0f12b97e0e4c7e21 +007876f71baf453f +0f59c103684c0437 +133ee6e537353604 +2dd67f5e68c8d72b +4175cb4c71c984ec +0ede2c8fbe52c1d3 +13211c9e31fa4b14 +21a23c81331b0027 +2dc3af70d25d3043 +095fb57435b7d890 +07ba5489b56b7d62 +2b4c1f50687b2bcf +0954f5a326941fb1 +02b59cd60efb924e +1840ae9e2494443e +3b3880eb01373479 +16ef89980e2ceef7 +31b8eb8bbebed9b8 +08d5cde674e47324 +2cc8ef9e5319d5d7 +46de062e5ff787c5 +0f5bb0704084e290 +3176f1532a468cbd +03906f66d3bca71a +18502a6651367e71 +31262e902165f348 +19694d2dc528d75d +3824335ebd7a4097 +0807e84457d5ef58 +138bb7b0b25e4669 +4091c41c6909da3b +068140e09ae5ae8f +4675ea4e00c2544d +10fbe4690dec6258 +1da33647873725fd +44c16554a21aa6af +2e554e99d045b484 +4b86587ecd3325f4 +4757ffa4f1da98cb +365fc12b4f33ada3 +36977643258aa392 +1593596b99e2dde9 +2c16104a0ed6c8aa +1526707312c94a92 +23beb4b246e236fb +4b41d03353967b40 +318cc6a39c0acc71 +2ab72e30a616dd21 +10c4c9600bfaa4d8 +2347b1e3e70842ac +1513c8f030f4cbe2 +17a489f3cec39fea +4cb669bd62a4ffb8 +03b3f603a1001de0 +04fe4ec70781a0e8 +4a0368a338dbde67 +1b881261742799f7 +0e5b9dcdb891b82b +0a45b99f42fb0ecb +444d7d8445cd444e +0ab14ffa7e541b0b +3c31d7b9f2792ed9 +3776e900791c1553 +0e48b9ee438238f1 +03e756bff92d49dd +2cf32e2408107ea7 +20e7a3651ec30386 +41aa58e688a04336 +1b88fb5063cd8916 +4ce58504b055463e +0e3951bf1db22064 +16cd4f1cb2a467c1 +455964aa4ead1e2f +3d6ed8b43655929b +1fb651cd12893f99 +4a736d7c30ae9280 +33a93f85d5713a71 +09265a9e57075e7c +45e81a557d2dd78b +15ffcb1f98d41218 +3c84329b60bfa7cb +05ce34e3cd48c449 +1c840c855f0c8421 +3a10eb9788bcdfa1 +403951b5d632b5ab +0a78dcb828c506f1 +36e94b0ad3a62c7d +25ad11c04b852de5 
+4636beec02aa8dce +2e64a2d17f9a76f7 +2af206730de6f439 +0cd63c88350eef60 +4c502551adddea8b +0d5a4dcdf8ec9d36 +3115672ced7c5694 +1bc87f52eba89cc4 +1564900dd040c718 +3b1f9cedfc40b06c +30bd5b88e47f6d0e +4b3644bfc6083588 +1efbd8f8949b15e8 +2a3c7ba09ed503d5 +31a642fe4cd1e232 +040a26b288e7bda4 +30b1d229ad4c6353 +4260ca20e2430c67 +145708c0216a06a7 +07559b44fa10672c +3a0328ed13dd8c8b +1f4279d98e283206 +3edff71624eac3ee +428fc13fba69054c +0c6b149da098b121 +0b211a1457076450 +4043989a4ae95a01 +2d16da80d7e3b64b +454fc1e32db7cc41 +0286d2ed56e8f107 +3f6c97f1ac96dada +33a5a85f06fcc77b +0387ef3895b1393c +08291107fc9e9849 +1fd5f9af785e6e5c +37d09bda74c92a93 +330b925cef643b3f +0c4c5d5f751aabf5 +00ae21ab50209282 +2c6e76b362eed8e4 +45108618c40e26a7 +082087c82daa295d +407eefe8017f6070 +487e2f9d93c162c9 +460455f96fa1a1d6 +2ac7eecd3cd0252f +47c88dcfb1134255 +146ef4db9655fd67 +333c649f75c3c7bc +17a39d87a22ac1ec +0712476a67734dee +2e715d4c5bf6c45e +240e89ef33ff15b6 +3d8d29bf0d9f24a4 +37d0f351f07ee925 +1225476a1221ce08 +2beffa088960f673 +16b667d681f8cf25 +05dec89f80cabf23 +232a0b3133326242 +4c69bf407b142b93 +1fcf851e236dec35 +2a1fed061b29b25b +40c517d28a412a5a +29f52c76f269ae48 +06954737f53b8688 +24fea6c2c7caa434 +0c1012a308ee2788 +3dfaa97cd48a0332 +1837b8eebc9c2457 +0d01d4d6c5d5297e +37a6b3200493fac0 +2dd86d1f9e2d3474 +3e1af0b953407ef7 +452fff658953aab1 +0a5e107e1961d01d +41c600ed9f88871b +36f4df3e0a1ade5f +1be5a5c98a51b1cc +08c87b4b6b23895f +289ce0f2b82dcd0e +196069a792ebbf31 +1e969786d2a8c7a2 +047c29e9138af233 +464d63c227f26d09 +08138c1a3ba1ce8d +1f084607245e4462 +29927d9ef0b472d7 +0a6680fe6e8e09d7 +2bd7cee1fa9c8996 +154072be49bb3c1d +043c48135c5e8cc2 +452e9c4e4729ddff +0c8438d86bb28f7d +2e4cac06a4f92261 +1fe394077e7c3de0 +46200541f9943d16 +3faac9603907b329 +0e728af85650dcb3 +3c44f7a30e0ad967 +0be9a0dcbfe032f1 +0eadbf8806794990 +07225d96742d2a6c +47573da5cb0e5e44 +4c4fa41951e37e78 +42cab73e14195475 +2e261d7661282e40 +02b2358ff02d3ce8 +2e86767798c005df +4aa594c0ad661f28 +3bad929f21fc4336 
+10ea3faaa29f4a88 +01497290d8b93a9b +395f5ae56e94e344 +4a582ee23dd05a8d +49d5b942442449b4 +3e02762a89de4c7d +4c791225522d45ba +252a24d25a1ea81d +07479835711d6f8d +0043978734eec081 +374fa34fe701a30d +4a1d79baac733df7 +0e2653d00e3fc05a +0c356641df7c72b8 +0b564b685315d8ff +04e4c841b349bf5c +3dcdffe3b9c6235b +2ed398e8368e0c6f +0a4cf8d9b81b4c6e +0ff193c92d415b18 +4526c5ac1bfebcfb +018f7907401f2fef +46502a6038bd288f +210fc445b9e254f5 +11ee8f1e1bdf1c2c +4a7e531e1a35d424 +16a75920e79c3710 +391bc7d21641283e +362be988d0b68a4c +223c9627b978d127 +1f925fcf391591df +47cae76cd53c752a +3e5f747d06bc84a3 +0bc9fd5c8e50d0ee +16b3eea75ad753ef +01e18dbbf22ff263 +00a54225c5cb1913 +365cc620c2fcbb05 +44be029ec85609c5 +2cc330488326fd4f +06058474f164c53d +2bff9ec89ca982c9 +0277b87a9c943ed5 +1ca02bf1c0b65675 +44d2532c5b5296a1 +18a86c01aaaaaa8b +29a09527214b3dc5 +1c36b2b8144d29ec +1c742e548e8698c9 +430d7b5b77861810 +2e04dfa4a1671292 +3d2fdcb64b0352ff +4aa1973c40d2eb93 +0b1b293ffb0e2f51 +02b618a34bc12ff9 +086e9118bec887be +1c29b1d8fd1dab3f +2ca8e34592e0c415 +1623f47d7e74e848 +4c447e587919525c +2a3baeaa72b86812 +2b41e71d509a8b8f +462f64859631a099 +2c9b5e69fe7f0338 +4c144eb40a09a0dd +0ece034988793847 +2bec33eeeab0bb9d +0f4f779411b45b6f +0404d32e97ec1cdb +4a7f0556fb58a5cc +39f444469cf39006 +1fcfd8a36e171639 +1f079ee70c21002d +24895a02057db66e +2228cda919976437 +14f0b962fabfaae2 +2b625e92f2cf9de4 +3094afb27266ee6a +3087828bc27bc4c7 +3e4057a188e15ac3 +17519e763b34fe14 +220a9caffae81adc +0efeb5654da456c6 +3953f37661087a95 +46067fb6d992860e +0de78cb98105f8c2 +0e00a382b62667c0 +39c662fa32a3b5c6 +4319ddbd5e8f20e1 +0555b07fe6239b4a +20fba1d53c349851 +236f9dc6456cf32c +283334520a3f8a43 +477a0a9f77c00480 +0869b66f912b845d +293e02c7c1fa31a8 +1778e784d47e035e +2a39e3b6061dd887 +2ebedb0f027df101 +3ced9d0b56769bb7 +33e23b97daf5d9f0 +2cecdd7df86ff8d3 +3fb3327a177a0175 +08f82d3899d6b726 +35d5a242ba40f31f +498688760312447b +373ab0a1009e0316 +2c5e21f9f91e2e09 +2e06abf6286040e2 +0bd819cb30a432c6 +12d4ca1236a2cf26 
+474afb2d4641a228 +4bae8b3980bf32d0 +0ecf489d873b7f52 +2b4e1061f6415a4a +00beb03ef95dc637 +0ef68550315f57c7 +3e8363be673dafa2 +36707b9a2d7c344a +32615afea87b52dd +46df36a031f50a04 +379884ce61c4daa7 +1e1e13de4ebea05a +2341162bce213f2e +0c824455996db331 +2e7ffcba51990c93 +1ebb496a04a1bd76 +0425df3e42ba0de3 +179ff8424ec7ad13 +464e3851f923f8d0 +0f64c5e4fead6cf2 +13c4059da4e56a8c +283059a56e7f3e75 +0a3b5fb184936a83 +31607cc68ada0108 +332294213fa15c56 +0445459f7afb0f48 +126067199873816a +11e62395c85c250d +472e2674ece00632 +2be655d4137e6e29 +00cf0a94235771bb +28190e57702bcfbc +15b93cbe9fc5220d +2a41503583d146ed +460e2066b64b2a40 +3fd084afa49b6499 +2b1a013698fea3a1 +025192166c704a39 +10e931268a81f228 +08868143749f321b +0c8c4363e0dca250 +01fa6190cd47d125 +05c57211be152630 +0e2f2538b26a179c +1d2cab92bdcc1453 +41fafa6144b58c39 +2ff40df261e17697 +457a1ed78b1ddb01 +0b81411e1b5ec798 +15f8a54e4822f355 +1203cc23b881ab8d +1648b3e7471e3766 +3fc2c221557a205c +31ee8cabc96b9a62 +14417ac810f2024f +0ba1cadcb191dc0a +0972074fece891f2 +07842cc567e9beec +173a82eeea56aed3 +1f73124222492f1c +1eab4db6941be725 +166818269e4e2568 +0d5112a7eb22d61a +3a52947c66de5920 +0baa633d2094d2c1 +000db54a47bd43fe +24668d960406587f +04f5153fc5255516 +05a0ad1e2aa632e7 +0b4d5beb7d3bd867 +00a5a2af678f37d5 +0708389923510354 +14bfd05497764243 +453c980210335f26 +193c3bd339eb0a75 +1d704b9365e9c86b +0f540553fc30f16d +0ff896ed26db5da3 +0e060f89ae0a469a +3c9c37132583a3d2 +02ee66b3efbf3b0a +3fcf6c1b81b14af5 +008cd8c450342e49 +3af43fe8d514f7c5 +24f06a46ea08c03c +132bbf5a9e9626ce +30b1dffa5f783ecb +02f801e372d67cfd +2a89b2a52cee9f5f +3f33ed2971149ea0 +45ac5168bda9d3e2 +27f3ccfb3199499e +12497730f691d00c +18a3593609eb3269 +25aa5f50072ce7f4 +0ebaed7e3d044bc2 +3c85dbda51f7e9b3 +219825f542e6ee4b +4560f57598efe5ab +227c21a8dd87a153 +33a3fc21efdc8547 +1fcc400e42725a95 +29c8267c1d10b23e +004dd4b46a06e5be +32db375ab51d77a4 +32a2c04fd8321bbb +1db5a4df1ab8b8e7 +3d4645318868a4f3 +44f9aac9faaac569 +371c9182ffd46ce3 +13fcc228c40a0e67 
+3f11a54c6d0703a0 +1c144e2404c5da89 +2482c4388b32f225 +1c9772d765e0679e +218ba0fa826d3eea +3cf461bc6d626ed0 +2e109379f53bb221 +43c329f7c0b40258 +1beb8e6662d36ee6 +388cb2f0ff1a6cad +4a2d6753676df096 +22d6e3fefb1ee7fe +46fb9c990b6f8114 +2cd1705407546b72 +05c48ff6535fbf55 +095441304a817fe9 +10ac0ae67d317d11 +329aba411f341398 +072a60bc7e0b0dfd +31a087ee5b1976da +446f557155994097 +2f3ec1f2335489d2 +1cd3638cceebed08 +3c5163ede747b187 +06f4bfa5f9d5fe0e +3b8167415736169c +30029fac7c5621de +2a3bd0a2ac422822 +0f61837b9749da34 +406bdec5b68b1a71 +3a86a812a1eaa20e +41be40880094d8d1 +2b1e0225f0952a09 +06e499374ddafbff +0c11dbe781b1c11c +232aee1c62a1cd8b +414e2bf42ee45cc4 +3260a42ccfba8973 +24ac24abf3057732 +4938177a0e6e2fbe +45e6fa48ddd00e87 +38615248b52e2834 +17535adda2aa3b90 +0e714d042fa59506 +0ecde93bfa1f08d9 +41649e3e8f9a4be0 +49e48d66787ecb8f +3b9e04113b202116 +1edc6b95e84127b6 +378cb83947bf2d23 +3b84ac07fd85bb3f +171b3a4c2f95f981 +316035dd285c5e27 +38f7ba7fd9a83069 +46eccb4820f5a4eb +3efda95897eb23d1 +233d7cca6c4c628b +114d9c301b847239 +05330a153a103386 +0c52996355b23d76 +225c5f2cdcd2753c +1047cc04fa16e0d7 +1f1a76ed6db1dae5 +497364635884d8aa +2fe5274c4baf665f +0cda9adbbedd7948 +1be80ff36848e758 +3de41ace235a3a13 +03a78406de1d0993 +2b43428fa1cf1a7e +3d5114f5d7496cdd +15d4a976e4e7d3dd +3232f7457b27dbbc +375a7fbf80d09c92 +05b77cb7c0f79f0f +375f9c448cf31ccb +43759ced44693671 +41f3291d82fc4d93 +00620c2b77518524 +2d524e9324228d6e +17428a1f23edf411 +2f4bcd593fe37158 +107b78daf075d371 +39b3e7b2a8bf30a6 +16a2c55f96e6aa18 +2cc5f95fbe24ffe5 +382a5736d9134153 +0f18fb6736efb1c2 +4596160a24b1af1f +14e540cf0ff7ff91 +01eca393f86d37c5 +3f2bf7371b72e40f +0c4a239e265ae1c3 +3f1c5b36d217d345 +2b3e3c5d30c17bd9 +107d3d674fdebd31 +027c8c3fc3e7d056 +11a680776863b321 +0a6c499522efa0d5 +4b85062505816744 +2f98ee24d3fc43a2 +390ddb7ee9b716ca +45dce690caec2917 +29f0d7c051d80035 +459a954b63f98d8a +377769942e6a748d +18ce480be0ececbd +08b8b63abbec8780 +4c0ef61c55467706 +4161944d7d592071 +04b580ea1f4df0a5 
+424f597efdad3067 +2c55c5a96b50ab36 +45166f266dd609a3 +3ce8f87fcfc988a8 +32d28b7513f873be +01aaf4ebb084dc16 +3a88be7c404596ad +1ec6fa3de6fa1bf0 +36d85599a9cbb6b3 +004334c94bbc8bd5 +0d08611c8b251e15 +2b4a934049f932d0 +01866b81c3b90f2c +24fe21f0a899701f +39b58270c2e99310 +21005252fe2383ba +03482c3bd66de195 +0dd9e020b6d9d687 +021575237abe0684 +17a75f0b036c9cf8 +3d394fbabe0e733d +0cbbc98eec80360a +0871e5f582cd933c +2e96b2142fd337dd +431e6542fde13130 +3cfb6cb5052ce744 +4393f3c42606c573 +1227d00562c106e2 +35b994780d720894 +2b0b2259a7216762 +3ab70559ec30a57c +36afe96c11a8211c +0223924f43297881 +145da324f69d1c6b +04db26572a791881 +02e6fb86b0172f0b +058c67085c217b96 +48638883e537ccda +3a9fa6535917a07a +3af70052616f7fa1 +0bcde26e5a802638 +2c2cfc0ac780a3aa +19ec130ecea98d5e +1c7b9f93752085f4 +1fe9f9bc178a1778 +1cca8650a292e7b0 +20d1b02740ef1124 +3b1c57027302837f +4c943ac66f6c277e +0553c19e8933374a +2e35fc35559543f2 +484cfd0b6334c43a +4b341307a872487c +1f0e06e4388dd600 +4bdfa30358809038 +42565e9d863220ae +203c8a4d66c74338 +23df266716914368 +10f9d6f46e438d36 +2fffc623e6a34e23 +23808c0cfcc72e72 +0d0a99d7f22aab71 +10eebcbb9021f437 +01842c6b21e1d679 +11d5f4e7b0b17565 +247bc2e47eb7f6fa +46e653208e529783 +0b194567eff966f5 +1de1b73fe4d6aa77 +45a00d135c5388fc +3ed3ffd0ae9c3224 +2060cce4dad6f988 +040a7af97273204f +4bc1c3a888a8bfba +1bf668db0194cf83 +1bf4fe9301893904 +2d3e1349898addb6 +22085848f943c2c6 +2c5249093fc26fde +3858fc4475d10c78 +388b75bd17c5332e +203b7543bb3387ce +21b9d476c5a49c63 +4227369e7d0e735a +39074bca3524418b +07e8ffa32746c7ce +20b38e0a985506ba +12b7562944c06836 +4c169a41e66b6599 +3c44d53659dbe4fe +471abe46b812be64 +195ca8350ff27f6f +0db2394602b8b81c +44138776bdbfe28c +4ba7caa04cea37b7 +372f324a1f4d6898 +361d722ef5009e09 +41c2ed4944dec77f +12a70416c92a9483 +1b87f55dec310243 +15d8b7e256ffd066 +241aa9bcdbdc7ac6 +036f135766f38f78 +17dba5fa8138ed92 +21f88a42bf424000 +0a8f10a9a68236f7 +2d60837ef2e52abd +33c37aeeda88c3bf +0a0027a48d9ff2ae +21e97eb0cfbff775 +32eba3e4cfb61f93 
+118f563fe2ed4998 +0095ddd83beb3b8d +49b8f80c849dc341 +088e115752ce9e56 +43db35d743e6be54 +3d2486ac8822da47 +000c3ab189999a83 +037e8191b3985142 +41d8b4350913ca64 +0368abd976e8d82e +08718fb99eaafea7 +04422a07336e32da +049a98f70ce9f471 +2f7f2369486cc959 +128bf83073de3ba1 +0326e5c562bdf1b5 +104c9a27980f9bb7 +24548ce6c15bc2cf +039b153af4fbfba7 +20384754a6e5d1b0 +430c6d1f8676fabc +1e80fd6e7507e3be +49d4a7288f6b5dac +0d7f00ff38b135f7 +024908906fadb408 +25468a86d9cd851e +45a4515834848010 +0afdc571e4667a44 +11eb4e9eec5048f2 +308681a294d1417b +27daccf898b206de +3fe382b2ae6c9361 +368fa2dd830843c7 +23e428c0dc43f046 +483c2b4c67e32c19 +47191aa41a979900 +15ca8e1fce488c19 +0718f733a326d65f +38d44ebf460ac132 +024152256b6bcac7 +4634124b21b763f6 +0b970b3417969c89 +436a235ed74c3d89 +3f0d9e856d93b8b3 +004e9db3337e8206 +397bbed49e1ee8dc +4c9030a5917a1328 +40f6d540b9b16531 +3628ec0337eae7be +24d97ac8e96e7a5e +12f45658983d380d +08e076c11a67b54b +4881a65d7476d6dd +2ad50852a84faf51 +30abedc6c413510e +432a9cfcf53ef717 +3d16e4b4719bc256 +31838e9542a906be +18c6473be3bd827a +1ef9f5dfa615fafe +396065ee739ea046 +2b38cc883c900d33 +11337164b772b7c9 +0362399a61c18ad5 +3a488ff3afa463e2 +11357e0934f26aed +2deacec5c281fbac +29832cbdb4144601 +05ddd2fee689399c +4b461c1ec52a3076 +1424acd0007d40b5 +16b8ab24bd231f9a +2313ea0fb17cbed6 +31c79c843555c2c6 +45f5a75e63afd4a2 +30efdcef9f38568b +32dc25ef78b564a1 +02c0c9192fb9a6a1 +3b676f25b54dcc1c +1d8017cad8dc1d56 +420bdc53a6928b32 +2c80f9eb0d3b2bb4 +214bae1626cba843 +304dd8f38dbeef0f +0af60a9ffd747a1c +3f89e23583c36441 +06a8196a66e125af +0aa8646901d156e4 +0a72a3fd46a88ef6 +12e6ba92e82c7ca4 +397037082e6eb839 +23341c3a0b420e54 +08c9e7365f0707a4 +13ac6a6a3a4f5e5d +3eb718c3170fcd8e +2d0bd035f7df86b0 +186f18684ed4b516 +0bcef9ed1c18f74d +30140756550dc38e +0b3674ffb90b641a +0e512d350465a63c +3970859f54703c88 +3e3d858083d20eab +3d0a0fecfbdada35 +2ea3133861ebde3b +3f4f553239e96d90 +1150003196de2529 +493bb055f33cb256 +1072aae07584e091 +066ac822934d52d6 +3317c40fd3e0a7b7 
+36fb4d41b00581c4 +2eb515bff528d3c3 +2d7d7fb53d960909 +4a1b9fb940541809 +0de8a88480533be6 +38b7a1d23745fbf3 +0871e2540b0a6804 +06d8995be6aa4db6 +3b434b5302dea908 +0f97153fcaafc80d +28f59b68509ce59d +03c61595d13e121c +3185302b2275b009 +3e94e6706fcdccfa +2d6f9fa00dcee664 +214c597029aebf9f +2b23144adffe2e49 +4bd922d1e75cc936 +38a9a0f5e76103d2 +0fbe6d76015f75d4 +0190fe72a727c853 +33e5bb3820c171a5 +2cb589c80f31524f +36bd5d3f3ee292cc +2fbb94b8cf388ba7 +03cf40616d79cb6a +227d63ed9a678fb1 +0d4de33c6888a754 +439762b81d4b908c +1eb03f0e3088edf0 +164e33fbbcb5d223 +0d06be83296cf911 +168e0f4071d2fe58 +06c7c747b4542273 +0588138dfec165a1 +242fc4972c7bb385 +2552fd444d04ef21 +226646771975e6db +09754b77eeea6dae +03bcb03930ff1ace +213286ef58a8c73d +171403db6cb88926 +4393f3a15ed6fb9a +29791bf60e718c6b +31d903660e1647c5 +4bc203e17758f3a0 +1bf49251fdd23cc0 +39c41be5e76c79c1 +44ab295bc3092c28 +3a6dc09185951ae3 +1859766466c069b1 +11dc36548bbd85da +2075a2388413899d +4989f6cc2b43d2c6 +02b406d1e5e31d5c +4c2383e60aaf26cc +1f0f5e82e9d0f9ee +4118895a33890c5a +33288d55dde83e72 +05ef56b2656c9318 +055e4612c1ea70f9 +07b667b34838336d +308de3d523189c72 +3e8dd5a6930ecb92 +20d86cff490c0c42 +37c99410741fbca5 +2e7b5f8836ab642c +3fc266558ec5c07b +002ae53df0e0afe2 +2236eab1c5c86fc4 +36c5e00f55c4f217 +1402fce28722610b +14cdf4aa7a2de14b +0beae06611ead92b +2e2ad99d45033d6a +0da6a36b24eaf5db +139055b26734436f +43fee307c6339b5e +424980541ccdb10d +28b23ec38ac5c0b1 +2c04a38ae16197b2 +44d12349e0609ba3 +3115ce06e0160828 +098263de57257005 +202a627de66ad397 +28b06f7087798198 +07d1d5769e8d797c +220d718317f7a025 +0d1aa0f47c9d2f6d +3105d330651c2726 +379470c7d22c498a +47a66fa042406908 +12691b0622a823ba +4879ebbcfd888f5b +1e847ebd7cd1174e +1f56ccfabbfce568 +0e41af1514f92887 +28742766eb882cd2 +1d9254a5cb93d4ff +17fc81293f337cc3 +246579087204ba8f +0c2b3463c27c5ac3 +15ac594106229c62 +0d0f4080d36dfc68 +06eba57d1c333a3c +069597e1fe899530 +401e10a4352fba1c +020a41f988981396 +16c4484a4093e2f6 +0ab163a1b88f1128 +23a6c9168abdb38e 
+4766f2062abaaf74 +15aba05919bae167 +063b857e6470addb +23099812f662b3ec +00cfc0ecd345deb4 +03e141d7afac53e4 +0c9c387ae23d090a +1d1cafa3e27da040 +1ec7e7dce1175aee +36cb04da872d3bce +4bdb70500b99c91f +2b8f367d01df3601 +20e9bf845c4bd9e7 +12dce44829d88985 +3e577a3be646152b +37ff2186f55b3fd8 +2f5af4b429b2c992 +2ffe00ad70fe9c00 +0e1a7abc82b1afb2 +35eeb3ce1b3dd01d +3289ad7a811d2348 +22b86e38854ad186 +41a55418bee59b11 +0c0f298ace7c875b +0ed4c9cac4a615fd +200ad247448c5577 +195074aeac8bbd76 +443e5a7e679e3e94 +2d6d5e82bda0611c +485f996ecf360da7 +0d20062086f6d05c +443b1691d94c1b3a +2ce75a0f430e2387 +2c596a5abfd67267 +3f9b08ed34ec795a +4213b6b3b673f9b5 +0992802044bf665d +450cf402f042bfd2 +00969c45a093d43c +1d4d81f8629b119e +413062cf685711e7 +206d9828d717139e +19a28e5c25feb31f +00d83c48cb78ec83 +156b422215789c18 +1dd869f66c3c9497 +2e470f6e2c83566a +37dbb9846f2fdc01 +3f7b6f511421e395 +4be8bf31940bd475 +129b5e9b80cc4b4b +37a6ae1e1c6eff66 +3d986ec2fd6d210d +0b79ada01eb45be9 +4200282fe9b4015a +385f9444d20eb160 +4a653d46e89d4202 +20171db88f887218 +49be5de41d619cb1 +072b4bb46d80484e +2868a1b43e9eceff +322d03d487fc0f01 +2afb8a0a98e15155 +32294ad73efca3db +4cc4c8a8cfa8e944 +4773f5327489d57a +14f477e7d5af5b91 +233dda1ab6796b0a +2c02607ae436a9fb +12e3d03a933c2eb4 +1ce68f950e7cdf8c +45536907ffef7585 +2a695d52faf1949e +3cc40f129447cb31 +296c87d370b03f17 +45f2d7abb5fafaa8 +296241182c2df900 +1025622b7f308760 +0b07051f912592d7 +2111cd087a82344b +4a6a057fc644624e +3dd211f3865fc234 +389f65e97bd902b2 +3069cc190d78e55e +33ac471b97ddf5c5 +4803cf5deca2b38a +2837dd5c75e026f5 +1e2a2be2df033527 +44b13221c50914d9 +1e0f97ec8f5aa374 +12a5cf6bbe330edb +179a1357a581ad51 +3a79b9aefafb0b8d +06dcbfe7cd79bb66 +01be77405b16df11 +2d0e6766c725becc +071e8c0978097efd +29e0bfbad00f0d5e +1c5514d49d61bafc +33a3f65849195eba +0d76da0fcac26af8 +1ed5e32330ec25e8 +28f5ebf3c3e2fe54 +0b1e61c69c98026b +00ca5123d8ff6f83 +1d748783383ad977 +0bb99505a71035cc +103777494841b376 +291bc22350620114 +440b5d1587251680 +146581180e89666d 
+22e6c736f2f7227b +4b6e9a02975ef9a4 +24bcb936908f3a31 +3cb1489b614e5f39 +3eac186b3e7badb2 +36bc6918e9fc837c +0ac6adb37a92f549 +3b6d8db52c54b174 +2c48ab563b92a1d5 +03f551fc4abedc08 +3349f3089ea84d6b +4a1920283e3087de +298c394f21c62ae6 +30fceb3b40ec062f +395caf9235fde098 +3bb70a92a0d384e1 +09b92d14a157d130 +0463d74358aca878 +47b5d62899ea4869 +32991f419c96ea0e +46ea97f6f3757209 +41a3a167ea5d9e88 +4befac16ffdf8489 +09d860b12f6604cb +37697c41773d597a +01a2277ee817b310 +4b7071b34e8cc67d +2a7387e017c241a4 +3ac32347c3ff7d38 +433742b23712bf06 +12293b264f68673d +24b4e8ff5a9a6439 +1695a74d194b65c0 +1bce163cad1e1d20 +154365ab5a4e067b +2a1769dddc1dbf8d +0f68374b76390082 +3e746126204810a4 +35b15e97674edec3 +20de87f0b3f2d136 +297b57a9296052ce +3021337b3fbdb2f3 +1eca36ec55b88fe4 +1b74274269c75c8d +11eb02d24a3241a9 +0915a60e1ae6a826 +10c8e54590f715f7 +0deb1b80eb8481c6 +0c609c435b1f7114 +0fb6678e63316201 +0c25287b812367cd +0896b4819e39caf2 +19267b6a68d2701b +10a3511b61f40243 +3f6e7ee98174056b +4c2ed13774ae4613 +411c4dd047c49cde +0e8a52a174610350 +3e6d44a66c0d7a0a +1c7b3ccd5482f834 +0f7061acbeed50dd +32dfef9109202812 +17d9303ee77c3a3d +466150f780ad7b80 +2b0cbc443e6c3c6d +4130aefaca885090 +0498c9066256055c +3d7a1ebc77f683b4 +389df03f3c2d7291 +43db8c6515021c01 +432fb354aa710e62 +1e9b1dc1c096d68d +180542b70f713d5b +2d4efa3897a4ba2d +0894f0072a8c5fd0 +21e794f71e31becb +1247b2ac5986205b +282ad05cb0113543 +2807e5ac66c140cc +0c9cc2f6d62336f1 +3c5bb5694853c36a +3a9c883b11e86530 +3e034bde9426ae9f +30fc5fc78c5a716e +11bf4b38f88bfe9b +04145b4b73b2d313 +416f82fdbad68e21 +12627e75d51b372e +0720ee62bf014834 +15d324ad8ff2dd83 +1deec169175eb15b +4953a5140e2f439b +1214f2a11a9fc1ed +09e4d5e8eef7b9c1 +171fdcc554d303d8 +3172ad0d099430da +3f79dc32d575bcdc +1defdda324307269 +418ad7b9e78208cb +1fa073504b4facaf +31afa3dcf3b737fa +0c884aee4b01366f +2e15ad61e7078fdb +3bceca99e87d64c5 +422d976591ab629e +196ffec2c68cafd5 +111356766833a7df +2cb9869cb05a9a01 +0122933cf8ab3317 +3bb1007fcf0e03ff +0516a5d959b58cb3 
+01fe225e2f261d1a +42b086af2a1e5d98 +0c788e368d993870 +33a29a351c1d9800 +061f829d3dd2e46e +2473b5003a95628c +0485a8528fa72698 +03f1781c4cc126e6 +3b7443b24830d388 +2defb4625a3ccb54 +4b7e7de9132f4149 +249fd0890d439aa9 +2c6fb46edb748fe5 +19310eb8261d4bb2 +4acd145b0c133dca +01cf55ae3e378faf +3dcef43736468b29 +0c5ed899789e60ad +3172feb32990cf09 +4500d9faefff3a41 +48f9cd996f80c34d +1e1742072c0b2d6b +16ca2db2f920c1b4 +39987911e4cf003c +04ec725465dc5329 +10002e18d04c3d93 +2c805f56d92a2e22 +22b16c2f5af0f3ef +4cda491521679291 +394037c064421c3e +2e619c31122ee40f +33f242465f51563c +18d9631f5eb45b87 +224e9686747afbb5 +3916390b35258215 +4cc48509585e4157 +00e12e215c028984 +0951fdaf9d399411 +3290731e5f908b92 +3b273cb40c55db95 +1cdfd3abfcc3a64e +094fd37f09dc318c +44b2ab5292c06a7e +33f4eeb64d0c9c1a +322c4bf5043ffd95 +10c551ef9644ea03 +392c21ee30b21459 +298276fb3c0330e5 +19b0cd79a126e8bd +12f6faddcc88bcc5 +431795f999dc215f +39e5f256790c3343 +15e0783c6b9683be +3ea8d9787998f70a +2c3a96fb820e1ab7 +131773f46d989860 +39f31c4461ede05f +3c90ad3bb72adcf8 +37ce88a12d77382b +0134d6a876481ed8 +3c19657356e9e229 +2516b6023683fc3e +2fdfa70413053b84 +16da9b4fdfe4883b +4bc7ca44cc62b8b1 +15b902774d67a394 +20100b779d28b6d5 +314c584ff3842715 +1cddaac7be8ecfa5 +1ca4db19711258a0 +01a628e2c509b823 +4422b38e60e3bc2f +40f92f1e65a5e1dd +02241c9f162966e3 +312c8e1f1f9f3594 +0e6f8d0eb4103baf +0e5e7fbe8914352c +221205ddf59c5156 +3f5454f2f53e2103 +446626a2bd617d24 +16a23081fd821e92 +374b2f4abac6dbb9 +188e6f96fa74ebe7 +4ac044dcaa428723 +404043fe2f398440 +2ef881551a7fda22 +372bb866143b6e35 +015d8a2a2834d38c +3c35b868a8ec3433 +3d5125567924e37b +23d7f14af4b7ba08 +172c99489b18e0e6 +3c64a373bc1c53bd +1cc1ff58dc89d230 +4317016b336431be +3928ea8b8c134846 +234271629f7099df +154813fc1d6820dc +449ee1308d0710b5 +0e4f56edbc3d8cd7 +286393e1e797cdad +2b4f6fdcabf53d59 +3af4c4e5a8ced21e +36e69606a7599644 +0196dedebec3dad2 +214df1c2863d2959 +05ac37966de4e7fb +3c91fa87850fd1a8 +42b218cc2f794026 +3eac742acbd69adf +2bcb26e95f5d152b 
+0e2c96cd97e73a38 +2aca85b3bdf90a09 +059058768c222bd6 +15a138312ad94718 +12dc074fab6ada73 +46f840365cee9c44 +0302fcf06bfba582 +2aa1e311e4bc039b +3b0b55657925fb34 +2864dc6c129cf3cc +20f1c3ade1608b4d +15d4131d721f1b5e +3d2f4958db5aefbf +0b55abc1ca2fe909 +22274e48b847c860 +0c3d3b45ff4a4326 +1fef543188ace6e5 +42f761f7e655bdce +39f1b33acc70ad7b +05b1462991e38e4d +2a4d835a6e023621 +419b773d04a986b7 +3015a3eab4b6d042 +01b08e2f20321127 +4b0fdb10ae15684b +33afdeba3cd5af05 +369f3639d9605255 +17c234c2eec050a0 +42d8b53a15001cd5 +4ae4456267802484 +0bd7e6e9f0185aa3 +125c92c36a04a68a +167c2e0c6e9ffa5d +4949361d0831c838 +01a5cc3805e94c21 +04c441c7ce273dcc +47d9493675e58f3b +497f507a5901bf4b +2276982dbc5a23e2 +2a8ef9e44f580d13 +0ed7ecf45f945ead +321911ae4a16f038 +24ad46fd2b26b208 +4c7df9d3840b2d63 +2ddca94aefd55b8f +0181b66a65650830 +22a0db80d91128e4 +2a2d971fd44ae258 +084fe29cd9d008db +339c95e2e709d044 +0c8b534612a0a776 +1f2153b5fb50d41a +4c8fd5318ae8d467 +33baf3e18e5d7256 +24d1c4f7497f8e77 +36fc018c7b62b997 +234c14a79d4da1ff +039cc34e9cdbcf8f +177d39d72e983b69 +3fdaa028b8baad4a +2e9d6a76c40b707f +3d8d753f0851bf3b +27f772c12c97b594 +3ebaecd85db14943 +1a04733e4ee45c90 +0e01be9445403642 +4ade6d5fe4b32738 +3cc4c306db84c6fe +1b747b8eba6f7b45 +052430ff6e2c07c4 +053e78d3134437a5 +20b541350492e3ad +1fd615fea825fe87 +0d82dba8f137e3da +49c9324758b5e867 +05bacb6d6a4741b0 +007b4ae7c05f2ea2 +3cc97c3d778975f2 +0d4b941f4678267b +06ca8f480c91e9eb +42f700b22cb0be39 +4571dd58b16ba385 +14a5b002ce46d4d3 +09c1b7a0876c08df +2b1da1fbe7f18f7e +3784777516b00247 +2ceec371086f5d82 +090c672e7e394397 +285aef90afaaf565 +37a960afb176c485 +2e715b2e0162f768 +0720bead0cc7cbbf +1d36cd02a549e244 +4b5a6dd314bebe88 +0a5eeb4466dd19bb +1512ea7a9754ac34 +22666111b2180af9 +09b505bb829c1d12 +442ad5ba8e834889 +325ff82707386438 +0a10d55239d83d99 +4bf9ef4705f35e8f +038137c9569c60eb +41210bec1c0c87e8 +21d9134faec148f2 +18089956e2be2289 +0cb83cef3177a006 +204fc9ff2c7ff92c +489f9441d513634e +2d39f39fb8254c27 +45e5fcd5c8978342 
+32cccd10c84b4529 +117335f5d67368ca +46e2ddf094d0c3a1 +2a8cd9f87b3c9a2b +0a9f2831a3e73de8 +3d584707e2f3ccf3 +32d2163aa65c0e8a +075278a4d0af74f7 +0145c694b53b120d +17ca3b8ad5815b35 +0f25241e37e16f56 +48bb743178166598 +0181d3b41c2cf87c +005dd9a58df1ba3c +42c2c85060ab5233 +49cc37f7f96be5a9 +1e5548d951e91a40 +3a4d7cbcf0c84668 +2990bbc008d9fa82 +3ad4793daf6adc19 +4828fb60e4a871ee +47a76ca10546fe8f +477df7ad0c2e7fdb +0d6a534d75f20921 +427c035484d45682 +49d21e0a4c3eace5 +3be029de36008afc +17f552ef56d85c55 +49235d402cbb8895 +3f8d1edf59e70df3 +0cf444aef3ba16bd +02c485bd207116d4 +02cb3a4fd80ee0cc +139a615209ee09ac +0f1f245fa1c181ae +1be3758972b35151 +03aa0437e5d62d58 +2d4e81e66ce80039 +4221bc1d4aea1a02 +13686755488a9d51 +04ef6a410f034514 +46a4d49d61a86d37 +03ef5f13e0a30864 +28e8300e004ab30b +3059f523501dfd97 +0d46043105cf3185 +482c3e92080f18c5 +41e428b3c7a16695 +18e659699338f835 +0aacb1732fee7a3c +237ef12cd69e2aa0 +31bf989cb15492d4 +2f8e1946600c65d4 +494f87170e713843 +4c2d32a7f2b62657 +4c96e034f8af77d6 +306e2b7785657539 +2ad4ea800caafe09 +43361dbc0c5a2808 +487d83675a8d1574 +2bd43375196ce1a7 +1b65111d34f57bb0 +0278b3d8abd9654d +3714123f055e06a1 +4cf74ffa5bfd5904 +0598fec76ecc7bd6 +2c52d9d606a3ece2 +180bf845cc8cada3 +2245fcd7f76c2ecd +41abd737e0228c1a +2d62a5d66d6e4931 +05a6149f1fcee38d +3a126bd9702ee8f7 +291db63458af0613 +478e22bb4c242aa9 +1c11709814a1a2f2 +2b1303680b081ccb +32ce9b303717a29d +04cb1526cf3c43cc +3eb185c04280412d +0c209edeb7637dff +1c58aa75858147f1 +4833b3d2a8184313 +0f7e8bf1137abcac +40904cd4b9e0579d +3aae131a319acd17 +4ca952ede2af6578 +489254d4b26a04c6 +02679535c5f06a19 +3a48dfbd2f0977f9 +04ed1812719e05f0 +2fe4e7ab61b23c85 +1798c9640d8875e6 +329abf340d23b0b9 +48d4444b94c2a2c0 +4cbb82a6bab25a0f +48614bc62c3acbf8 +0f7267e7e369b7d6 +33dbdc4396938ae1 +398c4688209874c9 +1433f61e9591ea9b +040de715f9303ba5 +3dba1838ed366ab5 +0de79d4f3d7a9171 +33f1be3a9ccf4e4b +25b3854e6efb747d +182054e13eaf58fc +310cbf1c65c52fa7 +037bdac76bdcd7c6 +18be7c1b9895691f +03fe94a439456692 
+2b121397791014af +44095e87bee5475e +39424be692a88364 +1df20a29cdec61a4 +49c758aa3c35ed86 +04ca03945611febb +134d7e5a74497a82 +2b81fbc1af01f0ad +2e4c69143b09033c +0ffd1083a70c6968 +1930a64d9a119b13 +14e3fec07ba502d7 +07d0229847bd7408 +37b0b70a1a0c25d3 +18cde229723f22df +24d95746999b7f7e +47d11d4bee6608e0 +0f47577ab3441480 +401a94bd9d84f501 +259f6f1f002d2d94 +3ce90c0ea2537c48 +1ffd3b706d708774 +1f62165dfec00c3e +066c35b1abc706be +06c71ce295284689 +0ecdc87c3391ce98 +034677cf3d80162d +0e7b68884ac4d959 +05ec2d3e4c027220 +23dcee801bca67bf +0131c9aed0fb3940 +22552c9a2a2a2ce7 +4263257ec6099434 +1c8d34a791deaaf1 +21b548b570c0b415 +11491a312c6b8f58 +482ce5c63038e5b4 +17d841670d2da942 +0ef054fbdabce0cd +45823117f0acb627 +0c89e266974e8b90 +115fa3a1923b7c9f +21840c44aed0ae43 +430d8fece8e0f7e5 +335794d48a9b168a +0757b4bc82bf26b1 +172bd46c6ddda95d +3d6e04af63ebfea4 +3cfb4c69b14a1970 +41016527728cae5b +195fbef2c08715e9 +078b8bdc29565cac +40284c1baec06ac9 +3613c77d8c234008 +3662b5f2916b470c +48aff7218b00d843 +368fae6f3bc0b0f7 +29460ac4580ea232 +0068e97c1c1f61aa +33f7565ccb685cb7 +00e8df74b6805da7 +316ed1b489ff8f40 +22c8b35c589276c4 +06ffaa9ffc2eea95 +3d410da4d7fd9f64 +3e07add8413f8157 +2a9d8ba86290db0e +375cff10cab07955 +2da082dfb7a66b4d +17a3e731c4fe0aaf +4811a66f87b0dd6a +1c375830155fe6dc +15729869d1862b7f +130cbf8f12764687 +015631b21f792a12 +12e3b1ad12a752f6 +040895d45bf4e580 +0e16d64d961fe855 +2d5e1c16ba1f89c2 +10351dc7a37a44c1 +2cf1a544b179b1a7 +3bae42d603be2266 +156f4c7dca878ff2 +2d3f982ada31489c +3a642c6d0e43510b +03de2844d3c8314e +08072d6cc8e8711d +39f03d5fb1807102 +21cfc9b1266a6bd0 +311d8515bd115aba +0c9d930d226d6bd6 +0dad7f2ef3496f13 +12bed927641025a0 +15bc9eb752c6dcbd +2177ca3a775a9ee9 +289bcb973ed702a1 +0f2197967bb7fa43 +33842f4b169e4145 +397b050e345d73fd +0542630de1d734de +0752baf20fbc2285 +3ee30754edfbdb3f +0c061512de79b744 +3eef492bf5120757 +45a0fe252a89e008 +36056faee50a621f +46b2a13f6ab0be05 +311db764bdc7f537 +074653ff3928b9fe +2d815b3e5e9bb237 +21b26eda16f7cb88 
+12cb4aa3f5b59ad6 +21f3cc00e0cfe8bc +088b93f15ca8745d +15762acaba295de1 +3c33566bacd602f2 +27d163d7046d36b5 +284efc2041b1d1d8 +2ad09b7837010330 +40954e72e02dc771 +03b440db4696d8e7 +0ce3839aa5b66e3f +47a07f51fd3fef77 +35e8c6c2168dd087 +11fcbcdb1dfefb38 +178cef169356d4a5 +122cb7d5ea4a99df +42742db2633d2eb5 +077d42bb51ee2793 +3dc0058dce3828d9 +44adc8d00568380f +0ccb28128213f19d +3862411e9bf455cd +3eb795302924e912 +2445756494ef6e3d +4c5fd496905b91ce +46e0654ccb5d88cf +16b48792e910cf49 +2e4013ea92d04301 +2ac712ac8d2fd488 +4ce642bc93f1bb5a +3052da93ceeda447 +2b973e6f676eb243 +46889bb1803c5cd7 +4a8f9d6889992fd9 +424397db4b1cf634 +0fe2286088ece98e +04433dcf217ad9a0 +37bbec9e46c8c1a9 +12e985eaa4b79298 +367446f773123e02 +397da8e32c2edd65 +069a4c442912c405 +1bc87c160d1dc982 +2c45f54d9432d25c +388ed39170b69946 +1058fe0400a873e0 +3e1236935a5f70ae +2e4c0705bde13d32 +449c34eaea295942 +306d435307fef477 +0d68a05801d48984 +0c916bcc9351521e +35f89d3ac607bd5a +1dad44855584a4ea +18297e1f8e25d3ee +0aa49c0b75e51ba4 +10f4acad3ed87288 +0ebb04534d7f2ba7 +0b04644621e97d30 +4246a11f0971a231 +127884736471b631 +367345ad0b29f8a3 +46fdfa2a16c7c811 +01cefee9563f691a +1250e369ad1e2fbc +1f5df6019b0bb73c +22598e2596e6bae7 +07d449efdb66c20d +2326e9820982ad81 +2e30ae101611cca8 +40ca76de44a6e1a9 +281452e730c39fd0 +33c1bb87a88e59f1 +45064c8142f3a360 +2b8778726c1f2fe4 +44accffce93c7e87 +1735d8d1b4015669 +2d99ba7951695f79 +3cc1fcf538c81442 +3018aa8ad3eb5dca +33be1ba5aec86c96 +2fca5797ae48529b +4054d32655ba5eda +1ce8503fd200fed2 +4a763e1b87e495a7 +1f7770ac5cbb41eb +3b59c7d97b900724 +2b5b5b4f4fc526ba +3e68931874661724 +3fe783b9c7c8f492 +0722819bdf5b2737 +20f8c6738e22e764 +48e49bdd1aa706e1 +3b650a9e2ebdfde2 +0667d5bedfdbc555 +33517e9838fe5f20 +227e06087cbffb2b +0f5c5385dbcd96df +0bf152ef84195293 +296ec0d98f4d4151 +293c7c1ccaa6861a +31b5667e16de1d94 +1d04977fb85a8b3c +01570ac1c73e9ca2 +28760a14d0a5ff3a +15c9a45c9c3d73f2 +288dd40199dee268 +0e101fbd21daf79c +2f878176347bcf9e +0282160b901229a7 +1c919c7e4ec601de 
+3a3bc11b9ebb7d44 +19391676f0fc7982 +322261824c4a3003 +2064e46352532375 +44a85c75cf4a6da8 +47a1f1f01e2b7be6 +49f6f14c580b71b9 +05c423623c9f6f56 +4214dda1f9a61b1b +1e01b910ceba4573 +3783162ee796a21c +0c9b371cc6225682 +1910e79a60d57aa7 +2422f760ea77551d +2e6876c6c1e40652 +3f68a1e365e94eb4 +1bdf9dd7628ddb0b +13c510a7403f8231 +42b88f7ee71a7ba9 +399cfd9cfacc0499 +4615277ffb68ca9d +2da2eeb966bc0ef8 +1c1b2e56952040cc +20bab82d0268c877 +0cca84503a86574c +45cb862034851efe +16930ebf3f0f6b84 +2a058bafbecaccf9 +28ed97894371982f +4955f54f807ef5aa +0fd536fc3c8fdf19 +20764a96cc70fe46 +0ed25f15cbccd939 +37de8da2580d0c1d +0932a5bf82eb2f5c +0e8995dcbdd22f48 +0d8fd962cbfc81b7 +3e00b129b656fbce +13adf913ea857ddf +22da7610855d6b9d +2aab03e1aec0222c +0d22ced53b1db7d3 +3b122e1becb5fcb7 +3f45b8234504020c +15bc7fa1ed5567cc +11e9cb1ccb9abe9f +150b45a39c57623d +309bed43e4406d72 +14cf1f92ca13d605 +3984d005557cbd6f +0c24590c68af865f +144b95c0c3fbe3b0 +2945a940639798ce +1515d37824dd6b22 +495f50f0997e986e +48b1808c546c7e87 +075654f497170f90 +2ccd2d98696c87e0 +144c2c2c52734f15 +15c28f4ada02cf91 +3164f4c30188d403 +389851bf0ac38227 +4c76898a3d535741 +0f620bfafa25fcf5 +28c9d20b865f5d56 +069a9416dc6a373b +28a5318660ab60ba +10ad4fc499c48b38 +3b67613d97aac1df +118ffef2ad3950f5 +3c83e9817c9e022d +3c054be9bdb304ee +1dcd8aee9a39a61a +2f18f5579583e648 +0539bcecbc483dfe +3814a3a8046c8af3 +1fa7929f55b1fdd5 +0f6206df8a8e440a +0017ce4c6a39d122 +16a7b5a41f31feb8 +0de6bc7da518fcae +1d125b16063c96c4 +2c7e35b25a2e4b8b +329177dabfe2951d +45122648522d4180 +2d8f1ccdb70c156a +1c73def8a62301a9 +2cd27189549897bf +49ec7608e51f7ee2 +486970d685c0b746 +0583c7a746238f79 +06a2911e9add96c6 +3b9420585a1e66fc +1692fab166811028 +3aaed2e6422d7d57 +20422596003ac855 +0ed8b86b87a30d38 +075f0d808a621ae2 +48a3049cabb54c0d +0c9ea3bf67254e95 +022a21a897f2a904 +3d60041ab79f46fc +21a6081709444ebf +24598987691df957 +41936ce6152fee64 +4a177be7db12edcd +225ae1d37a7fa519 +4b4c0c27204604a3 +4b5619958277861f +16ed45d1ce9017df +061e49ba3a5386c7 
+19c6e67783781a43 +3b58206d99feb4b7 +31812ed5877b73ab +0b530eea368f626e +37ff932a6a608c24 +2a30c5309018e00f +42c75d578535b0fe +177ff3969577b8de +0761e0a3a4e25c53 +473e6ec61583d90f +42cc82972397863b +4908fab97c9bcec7 +04e2be0415136fa9 +0cdfa29561cb24e0 +23cfdadab7cc51a1 +474d403238a41315 +20d0e788abca4aa9 +0bb7da710cbf4bb9 +4ace59951acbae4e +2d29ff162920db5e +2f25826f0d0ef09a +0b429a4733089487 +45592a7f307bccd0 +280443260e3dced9 +4905bc8817511dd2 +338ff9f6c02a6a40 +44b78f9fcb5cd8d8 +4bc47dc7f8781812 +2217c43ddaa29027 +0aa284f8166e19e4 +1d7af31482baf61c +01cf2d900cb03afb +46c9e2d86e7d4c41 +4a566b7e6eeaf9b4 +166bb958d7f4798a +01f7915dce639515 +1259726fc1f8e966 +314c56aad151508d +17d35e133dc3ce90 +223b5e20753d4fa3 +1cbab4f69b2d48ce +464d97e527dd5f8a +093b1fe6bc1fd024 +2007d4829b187feb +165696025b477097 +2412e9f45282fd15 +0ef15055b44649e3 +4089ef1b1bdb1d36 +0898d467d34ff7b2 +4a046d13e389b505 +497d2450ed65a678 +0090cc64d7b7bb24 +1961bb85524de229 +2c9018ef57c6b061 +3dba9cb74bfb79b2 +168c85ce00de0c6b +0c72eaf6bbb7c681 +384fc7a71b9faca7 +0a7c052273895bb3 +4b457a008376cd73 +021d7121906d6cab +228ed4b87c8a6ea7 +0e241f40ce0cd802 +3db49ddb3f470436 +30006eb23f62aa57 +454197dc5b50b45f +020991bdfbdbe504 +4242fb49c775710c +41f438dd19aae981 \ No newline at end of file diff --git a/evaluation/preprocess_co3d.py b/evaluation/preprocess_co3d.py new file mode 100644 index 00000000..e55fd380 --- /dev/null +++ b/evaluation/preprocess_co3d.py @@ -0,0 +1,131 @@ +# Modified from https://github.com/amyxlase/relpose-plus-plus/blob/main/preprocess/preprocess_co3d.py + + +""" +Usage: + python -m preprocess.preprocess_co3d --category all \ + --co3d_v2_dir /path/to/co3d_v2 +""" +import argparse +import gzip +import json +import os +import os.path as osp + +import matplotlib.pyplot as plt +import numpy as np +from tqdm.auto import tqdm + +# fmt: off +# CATEGORIES = [ +# "apple", "backpack", "ball", "banana", "baseballbat", "baseballglove", +# "bench", "bicycle", "book", "bottle", "bowl", "broccoli", 
"cake", "car", "carrot", +# "cellphone", "chair", "couch", "cup", "donut", "frisbee", "hairdryer", "handbag", +# "hotdog", "hydrant", "keyboard", "kite", "laptop", "microwave", "motorcycle", +# "mouse", "orange", "parkingmeter", "pizza", "plant", "remote", "sandwich", +# "skateboard", "stopsign", "suitcase", "teddybear", "toaster", "toilet", "toybus", +# "toyplane", "toytrain", "toytruck", "tv", "umbrella", "vase", "wineglass", +# ] +CATEGORIES = [ + "apple", "bench", "bowl", "cellphone", "frisbee", "hotdog", "keyboard", "parkingmeter", "teddybear", "toybus", + "backpack", "book", "car", "donut", "handbag", "hydrant", "motorcycle", "pizza", "stopsign", "toaster", "tv" +] +# fmt: on + + +def get_parser(): + parser = argparse.ArgumentParser() + parser.add_argument("--category", type=str, default="apple") + parser.add_argument("--output_dir", type=str, default="annotations/co3d_v2_annotations") + parser.add_argument("--co3d_v2_dir", type=str, default="data/co3d_v2") + parser.add_argument( + "--min_quality", + type=float, + default=0.5, + help="Minimum viewpoint quality score.", + ) + return parser + + + + +def process_poses(co3d_dir, category, output_dir, min_quality): + category_dir = osp.join(co3d_dir, category) + print("Processing category:", category) + frame_file = osp.join(category_dir, "frame_annotations.jgz") + sequence_file = osp.join(category_dir, "sequence_annotations.jgz") + subset_lists_file = osp.join(category_dir, "set_lists/set_lists_fewview_dev.json") + + # bbox_file = osp.join(output_dir, f"{category}_bbox.jgz") + + with open(subset_lists_file) as f: + subset_lists_data = json.load(f) + + with gzip.open(sequence_file, "r") as fin: + sequence_data = json.loads(fin.read()) + + with gzip.open(frame_file, "r") as fin: + frame_data = json.loads(fin.read()) + + # with gzip.open(bbox_file, "r") as fin: + # bbox_data = json.loads(fin.read()) + + frame_data_processed = {} + for f_data in frame_data: + sequence_name = f_data["sequence_name"] + if sequence_name 
not in frame_data_processed: + frame_data_processed[sequence_name] = {} + frame_data_processed[sequence_name][f_data["frame_number"]] = f_data + + good_quality_sequences = set() + for seq_data in sequence_data: + if seq_data["viewpoint_quality_score"] > min_quality: + good_quality_sequences.add(seq_data["sequence_name"]) + + for subset in ["train", "test"]: + category_data = {} # {sequence_name: [{filepath, R, T}]} + for seq_name, frame_number, filepath in subset_lists_data[subset]: + if seq_name not in good_quality_sequences: + continue + + if seq_name not in category_data: + category_data[seq_name] = [] + + # mask_path = filepath.replace("images", "masks").replace(".jpg", ".png") + # bbox = bbox_data[mask_path] + # if bbox == []: + # Mask did not include any object. + # continue + + frame_data = frame_data_processed[seq_name][frame_number] + category_data[seq_name].append( + { + "filepath": filepath, + "R": frame_data["viewpoint"]["R"], + "T": frame_data["viewpoint"]["T"], + "focal_length": frame_data["viewpoint"]["focal_length"], + "principal_point": frame_data["viewpoint"]["principal_point"], + # "bbox": bbox, + } + ) + + output_file = osp.join(output_dir, f"{category}_{subset}.jgz") + with gzip.open(output_file, "w") as f: + f.write(json.dumps(category_data).encode("utf-8")) + + + +if __name__ == "__main__": + parser = get_parser() + args = parser.parse_args() + if args.category == "all": + categories = CATEGORIES + else: + categories = [args.category] + for category in categories: + process_poses( + co3d_dir=args.co3d_v2_dir, + category=category, + output_dir=args.output_dir, + min_quality=args.min_quality, + ) \ No newline at end of file diff --git a/evaluation/re10k.py b/evaluation/re10k.py new file mode 100644 index 00000000..570d51db --- /dev/null +++ b/evaluation/re10k.py @@ -0,0 +1,3 @@ +# https://github.com/facebookresearch/vggt/issues/45#top +# https://github.com/facebookresearch/PoseDiffusion/blob/main/pose_diffusion/datasets/re10k_test_1800.txt +# 
https://github.com/yyfz/Pi3/tree/evaluation \ No newline at end of file diff --git a/evaluation/test_co3d.py b/evaluation/test_co3d.py new file mode 100644 index 00000000..61dbe511 --- /dev/null +++ b/evaluation/test_co3d.py @@ -0,0 +1,476 @@ +import os +import torch +import numpy as np +import gzip +import json +import random +import logging +import warnings +from vggt.models.vggt import VGGT +from vggt.models.vggt_small import VGGT as VGGTsmall +from vggt.utils.rotation import mat_to_quat +from vggt.utils.load_fn import load_and_preprocess_images +from vggt.utils.pose_enc import pose_encoding_to_extri_intri +from vggt.utils.geometry import closed_form_inverse_se3 +import argparse +# python test_co3d.py --model_path ../pretrained_models/model_tracker_fixed_e20.pt --co3d_dir /mimer/NOBACKUP/groups/3d-dl/co3dv2 co3d_anno_dir ../annotations/co3d_v2_annotations +# python test_co3d.py --model_path ../training/logs/dinov3_exp004/ckpts/checkpoint.pt --co3d_dir /mimer/NOBACKUP/groups/3d-dl/co3dv2 co3d_anno_dir ../annotations/co3d_v2_annotations --encoder dinov3 + + +# Suppress DINO v2 logs +logging.getLogger("dinov2").setLevel(logging.WARNING) +warnings.filterwarnings("ignore", message="xFormers is available") +warnings.filterwarnings("ignore", message="dinov2") + +# Set computation precision +torch.set_float32_matmul_precision('highest') +torch.backends.cudnn.allow_tf32 = False + + +def convert_pt3d_RT_to_opencv(Rot, Trans): + """ + Convert Point3D extrinsic matrices to OpenCV convention. 
+ + Args: + Rot: 3D rotation matrix in Point3D format + Trans: 3D translation vector in Point3D format + + Returns: + extri_opencv: 3x4 extrinsic matrix in OpenCV format + """ + rot_pt3d = np.array(Rot) + trans_pt3d = np.array(Trans) + + trans_pt3d[:2] *= -1 + rot_pt3d[:, :2] *= -1 + rot_pt3d = rot_pt3d.transpose(1, 0) + extri_opencv = np.hstack((rot_pt3d, trans_pt3d[:, None])) + return extri_opencv + + +def build_pair_index(N, B=1): + """ + Build indices for all possible pairs of frames. + + Args: + N: Number of frames + B: Batch size + + Returns: + i1, i2: Indices for all possible pairs + """ + i1_, i2_ = torch.combinations(torch.arange(N), 2, with_replacement=False).unbind(-1) + i1, i2 = [(i[None] + torch.arange(B)[:, None] * N).reshape(-1) for i in [i1_, i2_]] + return i1, i2 + + +def rotation_angle(rot_gt, rot_pred, batch_size=None, eps=1e-15): + """ + Calculate rotation angle error between ground truth and predicted rotations. + + Args: + rot_gt: Ground truth rotation matrices + rot_pred: Predicted rotation matrices + batch_size: Batch size for reshaping the result + eps: Small value to avoid numerical issues + + Returns: + Rotation angle error in degrees + """ + q_pred = mat_to_quat(rot_pred) + q_gt = mat_to_quat(rot_gt) + + loss_q = (1 - (q_pred * q_gt).sum(dim=1) ** 2).clamp(min=eps) + err_q = torch.arccos(1 - 2 * loss_q) + + rel_rangle_deg = err_q * 180 / np.pi + + if batch_size is not None: + rel_rangle_deg = rel_rangle_deg.reshape(batch_size, -1) + + return rel_rangle_deg + + +def translation_angle(tvec_gt, tvec_pred, batch_size=None, ambiguity=True): + """ + Calculate translation angle error between ground truth and predicted translations. 
+ + Args: + tvec_gt: Ground truth translation vectors + tvec_pred: Predicted translation vectors + batch_size: Batch size for reshaping the result + ambiguity: Whether to handle direction ambiguity + + Returns: + Translation angle error in degrees + """ + rel_tangle_deg = compare_translation_by_angle(tvec_gt, tvec_pred) + rel_tangle_deg = rel_tangle_deg * 180.0 / np.pi + + if ambiguity: + rel_tangle_deg = torch.min(rel_tangle_deg, (180 - rel_tangle_deg).abs()) + + if batch_size is not None: + rel_tangle_deg = rel_tangle_deg.reshape(batch_size, -1) + + return rel_tangle_deg + + +def compare_translation_by_angle(t_gt, t, eps=1e-15, default_err=1e6): + """ + Normalize the translation vectors and compute the angle between them. + + Args: + t_gt: Ground truth translation vectors + t: Predicted translation vectors + eps: Small value to avoid division by zero + default_err: Default error value for invalid cases + + Returns: + Angular error between translation vectors in radians + """ + t_norm = torch.norm(t, dim=1, keepdim=True) + t = t / (t_norm + eps) + + t_gt_norm = torch.norm(t_gt, dim=1, keepdim=True) + t_gt = t_gt / (t_gt_norm + eps) + + loss_t = torch.clamp_min(1.0 - torch.sum(t * t_gt, dim=1) ** 2, eps) + err_t = torch.acos(torch.sqrt(1 - loss_t)) + + err_t[torch.isnan(err_t) | torch.isinf(err_t)] = default_err + return err_t + + +def calculate_auc_np(r_error, t_error, max_threshold=30): + """ + Calculate the Area Under the Curve (AUC) for the given error arrays using NumPy. 
+ + Args: + r_error: numpy array representing R error values (Degree) + t_error: numpy array representing T error values (Degree) + max_threshold: Maximum threshold value for binning the histogram + + Returns: + AUC value and the normalized histogram + """ + error_matrix = np.concatenate((r_error[:, None], t_error[:, None]), axis=1) + max_errors = np.max(error_matrix, axis=1) + bins = np.arange(max_threshold + 1) + histogram, _ = np.histogram(max_errors, bins=bins) + num_pairs = float(len(max_errors)) + normalized_histogram = histogram.astype(float) / num_pairs + return np.mean(np.cumsum(normalized_histogram)), normalized_histogram + + +def se3_to_relative_pose_error(pred_se3, gt_se3, num_frames): + """ + Compute rotation and translation errors between predicted and ground truth poses. + This function assumes the input poses are world-to-camera (w2c) transformations. + + Args: + pred_se3: Predicted SE(3) transformations (w2c), shape (N, 4, 4) + gt_se3: Ground truth SE(3) transformations (w2c), shape (N, 4, 4) + num_frames: Number of frames (N) + + Returns: + Rotation and translation angle errors in degrees + """ + pair_idx_i1, pair_idx_i2 = build_pair_index(num_frames) + + relative_pose_gt = gt_se3[pair_idx_i1].bmm( + closed_form_inverse_se3(gt_se3[pair_idx_i2]) + ) + relative_pose_pred = pred_se3[pair_idx_i1].bmm( + closed_form_inverse_se3(pred_se3[pair_idx_i2]) + ) + + rel_rangle_deg = rotation_angle( + relative_pose_gt[:, :3, :3], relative_pose_pred[:, :3, :3] + ) + rel_tangle_deg = translation_angle( + relative_pose_gt[:, :3, 3], relative_pose_pred[:, :3, 3] + ) + + return rel_rangle_deg, rel_tangle_deg + + +def setup_args(): + """Set up command-line arguments for the CO3D evaluation script.""" + parser = argparse.ArgumentParser(description='Test VGGT on CO3D dataset') + parser.add_argument('--debug', action='store_true', help='Enable debug mode (only test on specific category)') + parser.add_argument('--fast_eval', action='store_true', default=False, 
help='Only evaluate 10 sequences per category') + + parser.add_argument('--big_model', action='store_true', default=False, help='If to load the original VGGT') + parser.add_argument('--encoder', type=str, default="dinov3", help='Encoder to use in VGGTsmall') + + parser.add_argument('--min_num_images', type=int, default=50, help='Minimum number of images for a sequence') + parser.add_argument('--num_frames', type=int, default=10, help='Number of frames to use for testing') + parser.add_argument('--co3d_dir', type=str, required=True, help='Path to CO3D dataset') + parser.add_argument('--co3d_anno_dir', type=str, required=True, help='Path to CO3D annotations') + parser.add_argument('--seed', type=int, default=0, help='Random seed for reproducibility') + parser.add_argument('--model_path', type=str, required=True, help='Path to the VGGT model checkpoint') + return parser.parse_args() + + +def load_model(device, model_path, big_model=False, encoder="dinov3"): + """ + Load the VGGT model. + + Args: + device: Device to load the model on + model_path: Path to the model checkpoint + + Returns: + Loaded VGGT model + """ + print("Initializing and loading VGGT model...") + if not big_model: + model = VGGTsmall( + img_size=336, + embed_dim=768, + depth=6, + num_heads=12, + patch_size=16, + patch_embed=encoder, + enable_camera=True, + enable_depth=True, + enable_point=True, + enable_track=False, + ) + else: + model = VGGT() + print(f"USING {model_path}") + model.load_state_dict(torch.load(model_path)['model'], strict=True) + model.eval() + model = model.to(device) + return model + + + +def set_random_seeds(seed): + """ + Set random seeds for reproducibility. 
+ + Args: + seed: Random seed value + """ + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def process_sequence(model, seq_name, seq_data, category, co3d_dir, min_num_images, num_frames, device, dtype): + """ + Process a single sequence and compute pose errors. + + Args: + model: VGGT model + seq_name: Sequence name + seq_data: Sequence data + category: Category name + co3d_dir: CO3D dataset directory + min_num_images: Minimum number of images required + num_frames: Number of frames to sample + device: Device to run on + dtype: Data type for model inference + + Returns: + rError: Rotation errors + tError: Translation errors + """ + if len(seq_data) < min_num_images: + return None, None + + metadata = [] + for data in seq_data: + # Make sure translations are not ridiculous + if data["T"][0] + data["T"][1] + data["T"][2] > 1e5: + return None, None + extri_opencv = convert_pt3d_RT_to_opencv(data["R"], data["T"]) + metadata.append({ + "filepath": data["filepath"], + "extri": extri_opencv, + }) + + # Random sample num_frames images + ids = np.random.choice(len(metadata), num_frames, replace=False) + print("Image ids", ids) + + image_names = [os.path.join(co3d_dir, metadata[i]["filepath"]) for i in ids] + gt_extri = [np.array(metadata[i]["extri"]) for i in ids] + gt_extri = np.stack(gt_extri, axis=0) + + images = load_and_preprocess_images(image_names).to(device) + + + with torch.no_grad(): + with torch.cuda.amp.autocast(dtype=dtype): + predictions = model(images) + with torch.cuda.amp.autocast(dtype=torch.float64): + extrinsic, intrinsic = pose_encoding_to_extri_intri(predictions["pose_enc"], images.shape[-2:]) + pred_extrinsic = extrinsic[0] + + with torch.cuda.amp.autocast(dtype=torch.float64): + gt_extrinsic = torch.from_numpy(gt_extri).to(device) + add_row = torch.tensor([0, 0, 0, 1], device=device).expand(pred_extrinsic.size(0), 1, 4) + + pred_se3 = torch.cat((pred_extrinsic, 
add_row), dim=1) + gt_se3 = torch.cat((gt_extrinsic, add_row), dim=1) + + rel_rangle_deg, rel_tangle_deg = se3_to_relative_pose_error(pred_se3, gt_se3, num_frames) + + + Racc_5 = (rel_rangle_deg < 5).float().mean().item() + Tacc_5 = (rel_tangle_deg < 5).float().mean().item() + + print(f"{category} sequence {seq_name} R_ACC@5: {Racc_5:.4f}") + print(f"{category} sequence {seq_name} T_ACC@5: {Tacc_5:.4f}") + + return rel_rangle_deg.cpu().numpy(), rel_tangle_deg.cpu().numpy() + + +def main(): + """Main function to evaluate VGGT on CO3D dataset.""" + # Parse command-line arguments + args = setup_args() + + # Setup device and data type + device = "cuda" if torch.cuda.is_available() else "cpu" + dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16 + + # Load model + model = load_model(device, model_path=args.model_path, big_model=args.big_model, encoder=args.encoder) + + # Set random seeds + set_random_seeds(args.seed) + + # Categories to evaluate + SEEN_CATEGORIES = [ + "apple", + "bench", + "bowl", "cellphone", "frisbee", "hotdog", + # "keyboard", + "parkingmeter", + "teddybear", + "toybus", + "backpack", + "book", + "car", + "donut", + "handbag", + "hydrant", + "motorcycle", + "pizza", + "stopsign", + "toaster", + "tv" + ] + + if args.debug: + SEEN_CATEGORIES = ["parkingmeter"] + + per_category_results = {} + + for category in SEEN_CATEGORIES: + print(f"Loading annotation for {category} test set") + annotation_file = os.path.join(args.co3d_anno_dir, f"{category}_test.jgz") + + try: + with gzip.open(annotation_file, "r") as fin: + annotation = json.loads(fin.read()) + except FileNotFoundError: + print(f"Annotation file not found for {category}, skipping") + continue + + rError = [] + tError = [] + + seq_names = sorted(list(annotation.keys())) + if args.fast_eval and len(seq_names)>=10: + seq_names = random.sample(seq_names, 10) + seq_names = sorted(seq_names) + + + print("Testing Sequences: ") + 
print(seq_names) + + for seq_name in seq_names: + seq_data = annotation[seq_name] + print("-" * 50) + print(f"Processing {seq_name} for {category} test set") + if args.debug and not os.path.exists(os.path.join(args.co3d_dir, category, seq_name)): + print(f"Skipping {seq_name} (not found)") + continue + + try: + seq_rError, seq_tError = process_sequence( + model, seq_name, seq_data, category, args.co3d_dir, + args.min_num_images, args.num_frames, device, dtype, + ) + except Exception as e: + print(f"Error processing {seq_name}: {e}") + continue + + print("-" * 50) + + if seq_rError is not None and seq_tError is not None: + rError.extend(seq_rError) + tError.extend(seq_tError) + + if not rError: + print(f"No valid sequences found for {category}, skipping") + continue + + rError = np.array(rError) + tError = np.array(tError) + + Auc_30, _ = calculate_auc_np(rError, tError, max_threshold=30) + Auc_15, _ = calculate_auc_np(rError, tError, max_threshold=15) + Auc_5, _ = calculate_auc_np(rError, tError, max_threshold=5) + Auc_3, _ = calculate_auc_np(rError, tError, max_threshold=3) + + per_category_results[category] = { + "rError": rError, + "tError": tError, + "Auc_30": Auc_30, + "Auc_15": Auc_15, + "Auc_5": Auc_5, + "Auc_3": Auc_3 + } + + print("="*80) + # Print results with colors + GREEN = "\033[92m" + RED = "\033[91m" + BLUE = "\033[94m" + BOLD = "\033[1m" + RESET = "\033[0m" + + print(f"{BOLD}{BLUE}AUC of {category} test set:{RESET} {GREEN}{Auc_30:.4f} (AUC@30), {Auc_15:.4f} (AUC@15), {Auc_5:.4f} (AUC@5), {Auc_3:.4f} (AUC@3){RESET}") + mean_AUC_30_by_now = np.mean([per_category_results[category]["Auc_30"] for category in per_category_results]) + mean_AUC_15_by_now = np.mean([per_category_results[category]["Auc_15"] for category in per_category_results]) + mean_AUC_5_by_now = np.mean([per_category_results[category]["Auc_5"] for category in per_category_results]) + mean_AUC_3_by_now = np.mean([per_category_results[category]["Auc_3"] for category in 
per_category_results]) + print(f"{BOLD}{BLUE}Mean AUC of categories by now:{RESET} {RED}{mean_AUC_30_by_now:.4f} (AUC@30), {mean_AUC_15_by_now:.4f} (AUC@15), {mean_AUC_5_by_now:.4f} (AUC@5), {mean_AUC_3_by_now:.4f} (AUC@3){RESET}") + print("="*80) + + # Print summary results + print("\nSummary of AUC results:") + print("-"*50) + for category in sorted(per_category_results.keys()): + print(f"{category:<15}: {per_category_results[category]['Auc_30']:.4f} (AUC@30), {per_category_results[category]['Auc_15']:.4f} (AUC@15), {per_category_results[category]['Auc_5']:.4f} (AUC@5), {per_category_results[category]['Auc_3']:.4f} (AUC@3)") + + if per_category_results: + mean_AUC_30 = np.mean([per_category_results[category]["Auc_30"] for category in per_category_results]) + mean_AUC_15 = np.mean([per_category_results[category]["Auc_15"] for category in per_category_results]) + mean_AUC_5 = np.mean([per_category_results[category]["Auc_5"] for category in per_category_results]) + mean_AUC_3 = np.mean([per_category_results[category]["Auc_3"] for category in per_category_results]) + print("-"*50) + print(f"Mean AUC: {mean_AUC_30:.4f} (AUC@30), {mean_AUC_15:.4f} (AUC@15), {mean_AUC_5:.4f} (AUC@5), {mean_AUC_3:.4f} (AUC@3)") + print(args.model_path) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/evaluation/test_relpose.py b/evaluation/test_relpose.py new file mode 100644 index 00000000..f22f8f86 --- /dev/null +++ b/evaluation/test_relpose.py @@ -0,0 +1,458 @@ +from math import e +import os +import torch +import numpy as np +import gzip +import json +import random +import logging +import warnings +from vggt.models.vggt_small import VGGT as VGGTsmall +from vggt.models.vggt import VGGT +from vggt.utils.rotation import mat_to_quat +from vggt.utils.load_fn import load_and_preprocess_images +from vggt.utils.pose_enc import pose_encoding_to_extri_intri +from vggt.utils.geometry import closed_form_inverse_se3 +import argparse +import gzip +import json +import 
os +import logging +from PIL import Image + +# python evaluation/test_megadepth.py --data_dir /mimer/NOBACKUP/groups/snic2022-6-266/data/megadepth --anno_dir /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations/megadepth/test.jgz +# CUDA_VISIBLE_DEVICES=1 python evaluation/test_megadepth.py --data_dir /mimer/NOBACKUP/groups/snic2022-6-266/data/megadepth --anno_dir /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations/megadepth/test.jgz --model_path /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/training/logs/dinov3_exp001/ckpts/checkpoint_15.pt + + +# python evaluation/test_megadepth.py --data_dir /mimer/NOBACKUP/groups/snic2022-6-266/data/megadepth --anno_dir /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations/megadepth/test.jgz --model_path /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/training/logs/dinov3_exp001/ckpts/checkpoint_15.pt --fast_eval + + +# For running MegaDepth-1500: +# * python test_relpose.py --data_dir /mimer/NOBACKUP/groups/snic2022-6-266/data/megadepth --anno_dir /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations/megadepth/test.jgz --model_path ../pretrained_models/model_tracker_fixed_e20.pt --fast_eval + +# For running ScanNet-1500: +# python test_relpose.py --data_dir /mimer/NOBACKUP/groups/3d-dl/scannet/scannet_test_1500 --anno_dir /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations/scannet/scannet_test_1500.jgz --model_path ../pretrained_models/model_tracker_fixed_e20.pt --fast_eval + +# Example on how to evaluate MuM on MegaDepth: +# python test_relpose.py --data_dir /mimer/NOBACKUP/groups/snic2022-6-266/data/megadepth --anno_dir /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations/megadepth/test.jgz --model_path ../training/logs/mum_exp001/ckpts/checkpoint.pt --fast_eval --encoder mum +# python test_relpose.py --data_dir /mimer/NOBACKUP/groups/3d-dl/scannet/scannet_test_1500 --anno_dir 
/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations/scannet/scannet_test_1500.jgz --model_path ../training/logs/dinov3_exp001/ckpts/checkpoint.pt --fast_eval --encoder dinov3 +# python test_co3d.py --model_path ../training/logs/mum_exp001/ckpts/checkpoint.pt --fast_eval --encoder mum --co3d_dir /mimer/NOBACKUP/groups/3d-dl/co3dv2 --co3d_anno_dir ../annotations/co3d_v2_annotations +# CUDA_VISIBLE_DEVICES=2 python test_relpose.py --data_dir /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/data/re10k/ --anno_dir /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations/re10k/test.jgz --model_path ../training/logs/mum_exp004/ckpts/checkpoint.pt --fast_eval --encoder mum + +# Suppress DINO v2 logs +logging.getLogger("dinov2").setLevel(logging.WARNING) +warnings.filterwarnings("ignore", message="xFormers is available") +warnings.filterwarnings("ignore", message="dinov2") + +# Set computation precision +torch.set_float32_matmul_precision('highest') +torch.backends.cudnn.allow_tf32 = False + + +def convert_pt3d_RT_to_opencv(Rot, Trans): + """ + Convert Point3D extrinsic matrices to OpenCV convention. + + Args: + Rot: 3D rotation matrix in Point3D format + Trans: 3D translation vector in Point3D format + + Returns: + extri_opencv: 3x4 extrinsic matrix in OpenCV format + """ + rot_pt3d = np.array(Rot) + trans_pt3d = np.array(Trans) + + trans_pt3d[:2] *= -1 + rot_pt3d[:, :2] *= -1 + rot_pt3d = rot_pt3d.transpose(1, 0) + extri_opencv = np.hstack((rot_pt3d, trans_pt3d[:, None])) + return extri_opencv + + +def build_pair_index(N, B=1): + """ + Build indices for all possible pairs of frames. 
+ + Args: + N: Number of frames + B: Batch size + + Returns: + i1, i2: Indices for all possible pairs + """ + i1_, i2_ = torch.combinations(torch.arange(N), 2, with_replacement=False).unbind(-1) + i1, i2 = [(i[None] + torch.arange(B)[:, None] * N).reshape(-1) for i in [i1_, i2_]] + return i1, i2 + + +def rotation_angle(rot_gt, rot_pred, batch_size=None, eps=1e-15): + """ + Calculate rotation angle error between ground truth and predicted rotations. + + Args: + rot_gt: Ground truth rotation matrices + rot_pred: Predicted rotation matrices + batch_size: Batch size for reshaping the result + eps: Small value to avoid numerical issues + + Returns: + Rotation angle error in degrees + """ + q_pred = mat_to_quat(rot_pred) + q_gt = mat_to_quat(rot_gt) + + loss_q = (1 - (q_pred * q_gt).sum(dim=1) ** 2).clamp(min=eps) + err_q = torch.arccos(1 - 2 * loss_q) + + rel_rangle_deg = err_q * 180 / np.pi + + if batch_size is not None: + rel_rangle_deg = rel_rangle_deg.reshape(batch_size, -1) + + return rel_rangle_deg + + +def translation_angle(tvec_gt, tvec_pred, batch_size=None, ambiguity=True): + """ + Calculate translation angle error between ground truth and predicted translations. + + Args: + tvec_gt: Ground truth translation vectors + tvec_pred: Predicted translation vectors + batch_size: Batch size for reshaping the result + ambiguity: Whether to handle direction ambiguity + + Returns: + Translation angle error in degrees + """ + rel_tangle_deg = compare_translation_by_angle(tvec_gt, tvec_pred) + rel_tangle_deg = rel_tangle_deg * 180.0 / np.pi + + if ambiguity: + rel_tangle_deg = torch.min(rel_tangle_deg, (180 - rel_tangle_deg).abs()) + + if batch_size is not None: + rel_tangle_deg = rel_tangle_deg.reshape(batch_size, -1) + + return rel_tangle_deg + + +def compare_translation_by_angle(t_gt, t, eps=1e-15, default_err=1e6): + """ + Normalize the translation vectors and compute the angle between them. 
+ + Args: + t_gt: Ground truth translation vectors + t: Predicted translation vectors + eps: Small value to avoid division by zero + default_err: Default error value for invalid cases + + Returns: + Angular error between translation vectors in radians + """ + t_norm = torch.norm(t, dim=1, keepdim=True) + t = t / (t_norm + eps) + + t_gt_norm = torch.norm(t_gt, dim=1, keepdim=True) + t_gt = t_gt / (t_gt_norm + eps) + + loss_t = torch.clamp_min(1.0 - torch.sum(t * t_gt, dim=1) ** 2, eps) + err_t = torch.acos(torch.sqrt(1 - loss_t)) + + err_t[torch.isnan(err_t) | torch.isinf(err_t)] = default_err + return err_t + + +def calculate_auc_np(r_error, t_error, max_threshold=30): + """ + Calculate the Area Under the Curve (AUC) for the given error arrays using NumPy. + + Args: + r_error: numpy array representing R error values (Degree) + t_error: numpy array representing T error values (Degree) + max_threshold: Maximum threshold value for binning the histogram + + Returns: + AUC value and the normalized histogram + """ + error_matrix = np.concatenate((r_error[:, None], t_error[:, None]), axis=1) + max_errors = np.max(error_matrix, axis=1) + bins = np.arange(max_threshold + 1) + histogram, _ = np.histogram(max_errors, bins=bins) + num_pairs = float(len(max_errors)) + normalized_histogram = histogram.astype(float) / num_pairs + return np.mean(np.cumsum(normalized_histogram)), normalized_histogram + + +def se3_to_relative_pose_error(pred_se3, gt_se3, num_frames): + """ + Compute rotation and translation errors between predicted and ground truth poses. + This function assumes the input poses are world-to-camera (w2c) transformations. 
+ + Args: + pred_se3: Predicted SE(3) transformations (w2c), shape (N, 4, 4) + gt_se3: Ground truth SE(3) transformations (w2c), shape (N, 4, 4) + num_frames: Number of frames (N) + + Returns: + Rotation and translation angle errors in degrees + """ + pair_idx_i1, pair_idx_i2 = build_pair_index(num_frames) + + relative_pose_gt = gt_se3[pair_idx_i1].bmm( + closed_form_inverse_se3(gt_se3[pair_idx_i2]) + ) + relative_pose_pred = pred_se3[pair_idx_i1].bmm( + closed_form_inverse_se3(pred_se3[pair_idx_i2]) + ) + + rel_rangle_deg = rotation_angle( + relative_pose_gt[:, :3, :3], relative_pose_pred[:, :3, :3] + ) + rel_tangle_deg = translation_angle( + relative_pose_gt[:, :3, 3], relative_pose_pred[:, :3, 3] + ) + + return rel_rangle_deg, rel_tangle_deg + + +def setup_args(): + """Set up command-line arguments for the CO3D evaluation script.""" + parser = argparse.ArgumentParser(description='Test VGGT on CO3D dataset') + parser.add_argument('--debug', action='store_true', help='Enable debug mode (only test on specific category)') + parser.add_argument('--fast_eval', action='store_true', default=False, help='Only evaluate 10 sequences per category') + + parser.add_argument('--big_model', action='store_true', default=False, help='If to load the original VGGT') + parser.add_argument('--encoder', type=str, default="dinov3", help='Encoder to use in VGGTsmall') + + parser.add_argument('--min_num_images', type=int, default=10, help='Minimum number of images for a sequence') + parser.add_argument('--num_frames', type=int, default=10, help='Number of frames to use for testing') + parser.add_argument('--data_dir', type=str, required=True, help='Path to CO3D dataset') + parser.add_argument('--anno_dir', type=str, required=True, help='Path to CO3D annotations') + parser.add_argument('--seed', type=int, default=0, help='Random seed for reproducibility') + parser.add_argument('--model_path', type=str, required=True, help='Path to the VGGT model checkpoint') + return parser.parse_args() + 
def load_model(device, model_path, big_model=False, encoder="dinov3"):
    """
    Load the VGGT model and move it to `device` in eval mode.

    Args:
        device: Device to load the model on (e.g. "cuda" or "cpu").
        model_path: Path to the model checkpoint; expected to be a dict with
            the weights stored under the "model" key.
        big_model: If True, load the original full-size VGGT instead of VGGTsmall.
        encoder: Patch-embedding backbone for VGGTsmall (ignored for big_model).

    Returns:
        The loaded model in eval mode on `device`.
    """
    print("Initializing and loading VGGT model...")
    if not big_model:
        model = VGGTsmall(
            img_size=336,
            embed_dim=768,
            depth=6,
            num_heads=12,
            patch_size=16,
            patch_embed=encoder,
            enable_camera=True,
            enable_depth=True,
            enable_point=True,
            enable_track=False,
        )
    else:
        model = VGGT()
    print(f"USING {model_path}")
    # Bug fix: without map_location, a checkpoint saved on CUDA cannot be
    # loaded on a CPU-only machine. Load onto CPU first, then move to device.
    state_dict = torch.load(model_path, map_location="cpu")['model']
    model.load_state_dict(state_dict, strict=True)
    model.eval()
    model = model.to(device)
    return model

def set_random_seeds(seed):
    """
    Seed every RNG used by the evaluation (python `random`, NumPy and torch,
    including all CUDA devices) for reproducibility.

    Args:
        seed: Random seed value
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    Args:
        model: VGGT model
        seq_name: Sequence name (NOTE(review): the caller in main() passes the
            integer sequence index here, not the name — confirm intent)
        seq_data: Sequence data
        category: Category name
        data_dir: CO3D dataset directory
        min_num_images: Minimum number of images required
        num_frames: Number of frames to sample
        device: Device to run on
        dtype: Data type for model inference (float16 or bfloat16)

    Returns:
        rError: Rotation errors
        tError: Translation errors
    """
    # Skip sequences that are too short to sample from reliably.
    if len(seq_data) < min_num_images:
        return None, None

    # Keep only the fields needed for evaluation: image path + GT extrinsic.
    metadata = []
    for data in seq_data:
        metadata.append({
            "filepath": data["filepath"],
            "extri": data["extri"],
        })

    # Random sample num_frames images
    ids = np.random.choice(len(metadata), num_frames, replace=False)

    image_names = [os.path.join(data_dir, metadata[i]["filepath"]) for i in ids]
    gt_extri = [np.array(metadata[i]["extri"]) for i in ids]
    gt_extri = np.stack(gt_extri, axis=0)

    images = load_and_preprocess_images(image_names).to(device)

    with torch.no_grad():
        # Forward pass in reduced precision.
        with torch.cuda.amp.autocast(dtype=dtype):
            predictions = model(images)
        # NOTE(review): torch.cuda.amp.autocast officially supports only
        # float16/bfloat16; a float64 autocast context is likely a no-op or an
        # error on recent torch versions — confirm this runs as intended.
        with torch.cuda.amp.autocast(dtype=torch.float64):
            extrinsic, intrinsic = pose_encoding_to_extri_intri(predictions["pose_enc"], images.shape[-2:])
            pred_extrinsic = extrinsic[0]

    with torch.cuda.amp.autocast(dtype=torch.float64):
        gt_extrinsic = torch.from_numpy(gt_extri).to(device)
        # Homogeneous bottom row [0, 0, 0, 1], broadcast over frames, appended
        # to both 3x4 extrinsics so relative poses compose as full SE(3).
        add_row = torch.tensor([0, 0, 0, 1], device=device).expand(pred_extrinsic.size(0), 1, 4)

        pred_se3 = torch.cat((pred_extrinsic, add_row), dim=1)
        gt_se3 = torch.cat((gt_extrinsic, add_row), dim=1)

        rel_rangle_deg, rel_tangle_deg = se3_to_relative_pose_error(pred_se3, gt_se3, num_frames)

    # Per-sequence accuracy at the 5-degree threshold (printed for monitoring).
    Racc_5 = (rel_rangle_deg < 5).float().mean().item()
    Tacc_5 = \
(rel_tangle_deg < 5).float().mean().item() + + print(f"{category} sequence {seq_name} R_ACC@5: {Racc_5:.4f}") + print(f"{category} sequence {seq_name} T_ACC@5: {Tacc_5:.4f}") + + return rel_rangle_deg.cpu().numpy(), rel_tangle_deg.cpu().numpy() + + +def main(): + """Main function to evaluate VGGT on CO3D dataset.""" + # Parse command-line arguments + args = setup_args() + + # Setup device and data type + device = "cuda" if torch.cuda.is_available() else "cpu" + dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16 + + # Load model + model = load_model(device, model_path=args.model_path, big_model=args.big_model, encoder=args.encoder) + + # Set random seeds + set_random_seeds(args.seed) + + per_category_results = {} + + with gzip.open(args.anno_dir, "r") as fin: + annotation = json.loads(fin.read()) + + for scene_name, scene_data in annotation.items(): + category = scene_name + print(f"Loading annotation for {scene_name} test set") + + rError = [] + tError = [] + + if args.fast_eval and len(scene_data)>=10: + # scene_data = random.sample(scene_data, 10) + scene_data = scene_data[:10] + + for i, seq_data in enumerate(scene_data): + print("-" * 50) + seq_rError, seq_tError = process_sequence( + model, i, seq_data, category, args.data_dir, + args.min_num_images, args.num_frames, device, dtype, + ) + + print("-" * 50) + + if seq_rError is not None and seq_tError is not None: + rError.extend(seq_rError) + tError.extend(seq_tError) + + if not rError: + print(f"No valid sequences found for {category}, skipping") + continue + + rError = np.array(rError) + tError = np.array(tError) + + Auc_30, _ = calculate_auc_np(rError, tError, max_threshold=30) + Auc_15, _ = calculate_auc_np(rError, tError, max_threshold=15) + Auc_5, _ = calculate_auc_np(rError, tError, max_threshold=5) + Auc_3, _ = calculate_auc_np(rError, tError, max_threshold=3) + + per_category_results[category] = { + "rError": rError, + "tError": tError, 
+ "Auc_30": Auc_30, + "Auc_15": Auc_15, + "Auc_5": Auc_5, + "Auc_3": Auc_3 + } + + print("="*80) + # Print results with colors + GREEN = "\033[92m" + RED = "\033[91m" + BLUE = "\033[94m" + BOLD = "\033[1m" + RESET = "\033[0m" + + print(f"{BOLD}{BLUE}AUC of {category} test set:{RESET} {GREEN}{Auc_30:.4f} (AUC@30), {Auc_15:.4f} (AUC@15), {Auc_5:.4f} (AUC@5), {Auc_3:.4f} (AUC@3){RESET}") + mean_AUC_30_by_now = np.mean([per_category_results[category]["Auc_30"] for category in per_category_results]) + mean_AUC_15_by_now = np.mean([per_category_results[category]["Auc_15"] for category in per_category_results]) + mean_AUC_5_by_now = np.mean([per_category_results[category]["Auc_5"] for category in per_category_results]) + mean_AUC_3_by_now = np.mean([per_category_results[category]["Auc_3"] for category in per_category_results]) + print(f"{BOLD}{BLUE}Mean AUC of categories by now:{RESET} {RED}{mean_AUC_30_by_now:.4f} (AUC@30), {mean_AUC_15_by_now:.4f} (AUC@15), {mean_AUC_5_by_now:.4f} (AUC@5), {mean_AUC_3_by_now:.4f} (AUC@3){RESET}") + print("="*80) + + # Print summary results + print("\nSummary of AUC results:") + print("-"*50) + for category in sorted(per_category_results.keys()): + print(f"{category:<15}: {per_category_results[category]['Auc_30']:.4f} (AUC@30), {per_category_results[category]['Auc_15']:.4f} (AUC@15), {per_category_results[category]['Auc_5']:.4f} (AUC@5), {per_category_results[category]['Auc_3']:.4f} (AUC@3)") + + if per_category_results: + mean_AUC_30 = np.mean([per_category_results[category]["Auc_30"] for category in per_category_results]) + mean_AUC_15 = np.mean([per_category_results[category]["Auc_15"] for category in per_category_results]) + mean_AUC_5 = np.mean([per_category_results[category]["Auc_5"] for category in per_category_results]) + mean_AUC_3 = np.mean([per_category_results[category]["Auc_3"] for category in per_category_results]) + print("-"*50) + print(f"Mean AUC: {mean_AUC_30:.4f} (AUC@30), {mean_AUC_15:.4f} (AUC@15), {mean_AUC_5:.4f} 
# --- GEOMETRY ---
def estimate_pose(kpts0, kpts1, K0, K1, norm_thresh, conf=0.99999):
    """
    Estimate the relative pose (R, t) between two views from matched keypoints.

    Args:
        kpts0, kpts1: (N, 2) matched keypoint pixel coordinates.
        K0, K1: (3, 3) intrinsic matrices of the two cameras.
        norm_thresh: RANSAC inlier threshold in normalized image coordinates.
        conf: RANSAC success confidence.

    Returns:
        (R, t, inlier_mask) for the candidate with the most inliers, or None
        when fewer than 5 matches are given or no essential matrix is found.
    """
    if len(kpts0) < 5:
        return None
    K0inv = np.linalg.inv(K0[:2, :2])
    K1inv = np.linalg.inv(K1[:2, :2])

    # Move keypoints to normalized camera coordinates: K^-1 @ (x - principal_point).
    kpts0 = (K0inv @ (kpts0 - K0[None, :2, 2]).T).T
    kpts1 = (K1inv @ (kpts1 - K1[None, :2, 2]).T).T
    E, mask = cv2.findEssentialMat(
        kpts0, kpts1, np.eye(3), threshold=norm_thresh, prob=conf
    )

    ret = None
    if E is not None:
        best_num_inliers = 0

        # cv2 may return several vertically stacked 3x3 candidates. Bug fix:
        # use integer division — `len(E) / 3` is a float section count, which
        # only works through an implicit cast inside np.split.
        for _E in np.split(E, len(E) // 3):
            n, R, t, _ = cv2.recoverPose(_E, kpts0, kpts1, np.eye(3), 1e9, mask=mask)
            if n > best_num_inliers:
                best_num_inliers = n
                ret = (R, t, mask.ravel() > 0)
    return ret

def pose_auc(errors, thresholds):
    """
    Area under the recall curve of pose errors, one value per threshold.

    Args:
        errors: array of per-pair pose errors (degrees).
        thresholds: list of error thresholds (degrees).

    Returns:
        List of AUC values normalized to [0, 1], one per threshold.
    """
    # Compat fix: np.trapz was removed in NumPy 2.0 (renamed np.trapezoid);
    # resolve whichever is available.
    trapezoid = getattr(np, "trapezoid", None) or np.trapz
    sort_idx = np.argsort(errors)
    errors = np.array(errors.copy())[sort_idx]
    recall = (np.arange(len(errors)) + 1) / len(errors)
    errors = np.r_[0.0, errors]
    recall = np.r_[0.0, recall]
    aucs = []
    for t in thresholds:
        last_index = np.searchsorted(errors, t)
        r = np.r_[recall[:last_index], recall[last_index - 1]]
        e = np.r_[errors[:last_index], t]
        aucs.append(trapezoid(r, x=e).item() / t)
    return aucs

def angle_error_vec(v1, v2):
    """Angle between two vectors in degrees (nan if either vector is zero)."""
    n = np.linalg.norm(v1) * np.linalg.norm(v2)
    return np.rad2deg(np.arccos(np.clip(np.dot(v1, v2) / n, -1.0, 1.0)))

def angle_error_mat(R1, R2):
    """Geodesic angle between two rotation matrices, in degrees."""
    cos = (np.trace(np.dot(R1.T, R2)) - 1) / 2
    cos = np.clip(cos, -1.0, 1.0)  # numerical errors can make it out of bounds
    return np.rad2deg(np.abs(np.arccos(cos)))

def compute_pose_error(T_0to1, R, t):
    """
    Rotation / translation angular errors of an estimated pose vs ground truth.

    Args:
        T_0to1: (4, 4) or (3, 4) ground-truth relative transform.
        R: (3, 3) estimated rotation.
        t: (3,) or (3, 1) estimated translation (direction only is compared).

    Returns:
        (error_t, error_R) in degrees. The translation error is direction-only
        and sign-ambiguous, since E fixes t only up to sign and scale.
    """
    R_gt = T_0to1[:3, :3]
    t_gt = T_0to1[:3, 3]
    error_t = angle_error_vec(t.squeeze(), t_gt)
    error_t = np.minimum(error_t, 180 - error_t)  # ambiguity of E estimation
    error_R = angle_error_mat(R, R_gt)
    return error_t, error_R
+ w1 * (kpts1[:, 0] + 1) / 2 - offset, + h1 * (kpts1[:, 1] + 1) / 2 - offset, + ), + axis=-1, + ) + ) + kpts2 = sparse_matches[:, 2:] + kpts2 = ( + np.stack( + ( + w2 * (kpts2[:, 0] + 1) / 2 - offset, + h2 * (kpts2[:, 1] + 1) / 2 - offset, + ), + axis=-1, + ) + ) + for _ in range(5): + shuffling = np.random.permutation(np.arange(len(kpts1))) + kpts1 = kpts1[shuffling] + kpts2 = kpts2[shuffling] + try: + norm_threshold = 0.5 / ( + np.mean(np.abs(K1[:2, :2])) + np.mean(np.abs(K2[:2, :2]))) + R_est, t_est, mask = estimate_pose( + kpts1, + kpts2, + K1, + K2, + norm_threshold, + conf=0.99999, + ) + T1_to_2_est = np.concatenate((R_est, t_est), axis=-1) # + e_t, e_R = compute_pose_error(T1_to_2_est, R, t) + e_pose = max(e_t, e_R) + except Exception as e: + print(repr(e)) + e_t, e_R = 90, 90 + e_pose = max(e_t, e_R) + tot_e_t.append(e_t) + tot_e_R.append(e_R) + tot_e_pose.append(e_pose) + tot_e_t.append(e_t) + tot_e_R.append(e_R) + tot_e_pose.append(e_pose) + tot_e_pose = np.array(tot_e_pose) + thresholds = [5, 10, 20] + auc = pose_auc(tot_e_pose, thresholds) + acc_5 = (tot_e_pose < 5).mean() + acc_10 = (tot_e_pose < 10).mean() + acc_15 = (tot_e_pose < 15).mean() + acc_20 = (tot_e_pose < 20).mean() + map_5 = acc_5 + map_10 = np.mean([acc_5, acc_10]) + map_20 = np.mean([acc_5, acc_10, acc_15, acc_20]) + return { + "auc_5": auc[0], + "auc_10": auc[1], + "auc_20": auc[2], + "map_5": map_5, + "map_10": map_10, + "map_20": map_20, + } + +def load_model(device, model_path): + """ + Load the VGGT model. 
+ + Args: + device: Device to load the model on + model_path: Path to the model checkpoint + + Returns: + Loaded VGGT model + """ + print("Initializing and loading VGGT model...") + model = VGGT() + # _URL = "https://huggingface.co/facebook/VGGT-1B/resolve/main/model.pt" + # model.load_state_dict(torch.hub.load_state_dict_from_url(_URL)) + print(f"USING {model_path}") + model.load_state_dict(torch.load(model_path)) + model.eval() + model = model.to(device) + return model + +if __name__ == "__main__": + model = load_model("cuda", "../pretrained_models/model_tracker_fixed_e20.pt") + pass \ No newline at end of file diff --git a/evaluation/utils/__init__.py b/evaluation/utils/__init__.py new file mode 100644 index 00000000..66734d93 --- /dev/null +++ b/evaluation/utils/__init__.py @@ -0,0 +1,38 @@ +from vggt.models.vggt_small import VGGT as VGGTsmall +from vggt.models.vggt import VGGT +import torch + +def load_model(device, model_path, big_model=False, encoder="dinov3"): + """ + Load the VGGT model. 
# Pillow >= 9.1 moved the resampling filters under Image.Resampling; fall
# back to the legacy module-level constants for older Pillow versions.
try:
    lanczos = Image.Resampling.LANCZOS
    bicubic = Image.Resampling.BICUBIC
except AttributeError:
    lanczos = Image.LANCZOS
    bicubic = Image.BICUBIC

def resize_image(image: Image.Image, output_resolution: Tuple[int, int]) -> Image.Image:
    """Resize `image` to `output_resolution` (W, H), using Lanczos when
    shrinking and bicubic when enlarging (Lanczos is sharper for downscaling)."""
    scale_w = output_resolution[0] / image.size[0]
    scale_h = output_resolution[1] / image.size[1]
    shrinking = max(scale_w, scale_h) < 1
    return image.resize(output_resolution, resample=lanczos if shrinking else bicubic)
round(input_resolution[1] * (output_width / input_resolution[0]) / 14) * 14]) + + image = resize_image(image, tuple(output_resolution)) + + depth_map = cv2.resize( + depth_map, + output_resolution, + interpolation = cv2.INTER_NEAREST, + ) + + intrinsic = np.copy(intrinsic) + + if pixel_center: + intrinsic[0, 2] = intrinsic[0, 2] + 0.5 + intrinsic[1, 2] = intrinsic[1, 2] + 0.5 + + resize_scale = np.max(output_resolution / input_resolution) + intrinsic[:2, :] = intrinsic[:2, :] * resize_scale + + if pixel_center: + intrinsic[0, 2] = intrinsic[0, 2] - 0.5 + intrinsic[1, 2] = intrinsic[1, 2] - 0.5 + + assert image.size == depth_map.shape[::-1], f"Image size {image.size} does not match depth map shape {depth_map.shape[::-1]}" + return image, depth_map, intrinsic \ No newline at end of file diff --git a/evaluation/utils/geometry.py b/evaluation/utils/geometry.py new file mode 100644 index 00000000..acdb178a --- /dev/null +++ b/evaluation/utils/geometry.py @@ -0,0 +1,296 @@ +# From https://github.com/facebookresearch/vggt/blob/main/vggt/utils/geometry.py, https://github.com/facebookresearch/vggt/blob/main/vggt/utils/rotation.py + +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import numpy as np +import torch.nn.functional as F + + +def quat_to_mat(quaternions: torch.Tensor) -> torch.Tensor: + """ + Quaternion Order: XYZW or say ijkr, scalar-last + + Convert rotations given as quaternions to rotation matrices. + Args: + quaternions: quaternions with real part last, + as tensor of shape (..., 4). + + Returns: + Rotation matrices as tensor of shape (..., 3, 3). + """ + i, j, k, r = torch.unbind(quaternions, -1) + # pyre-fixme[58]: `/` is not supported for operand types `float` and `Tensor`. 
    # Scale factor 2/|q|^2: folds the normalization into the formula so the
    # input quaternion does not need to be unit length.
    two_s = 2.0 / (quaternions * quaternions).sum(-1)

    # Entries of the standard quaternion-to-rotation-matrix formula, row-major.
    o = torch.stack(
        (
            1 - two_s * (j * j + k * k),
            two_s * (i * j - k * r),
            two_s * (i * k + j * r),
            two_s * (i * j + k * r),
            1 - two_s * (i * i + k * k),
            two_s * (j * k - i * r),
            two_s * (i * k - j * r),
            two_s * (j * k + i * r),
            1 - two_s * (i * i + j * j),
        ),
        -1,
    )
    return o.reshape(quaternions.shape[:-1] + (3, 3))


def mat_to_quat(matrix: torch.Tensor) -> torch.Tensor:
    """
    Convert rotations given as rotation matrices to quaternions.

    Args:
        matrix: Rotation matrices as tensor of shape (..., 3, 3).

    Returns:
        quaternions with real part last, as tensor of shape (..., 4).
        Quaternion Order: XYZW or say ijkr, scalar-last
    """
    if matrix.size(-1) != 3 or matrix.size(-2) != 3:
        raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")

    batch_dim = matrix.shape[:-2]
    # Unbind the 9 matrix entries into separate (...,)-shaped tensors.
    m00, m01, m02, m10, m11, m12, m20, m21, m22 = torch.unbind(matrix.reshape(batch_dim + (9,)), dim=-1)

    # |r|, |i|, |j|, |k| (up to sign), from the four trace combinations.
    q_abs = _sqrt_positive_part(
        torch.stack(
            [
                1.0 + m00 + m11 + m22,
                1.0 + m00 - m11 - m22,
                1.0 - m00 + m11 - m22,
                1.0 - m00 - m11 + m22,
            ],
            dim=-1,
        )
    )

    # we produce the desired quaternion multiplied by each of r, i, j, k
    quat_by_rijk = torch.stack(
        [
            torch.stack([q_abs[..., 0] ** 2, m21 - m12, m02 - m20, m10 - m01], dim=-1),
            torch.stack([m21 - m12, q_abs[..., 1] ** 2, m10 + m01, m02 + m20], dim=-1),
            torch.stack([m02 - m20, m10 + m01, q_abs[..., 2] ** 2, m12 + m21], dim=-1),
            torch.stack([m10 - m01, m20 + m02, m21 + m12, q_abs[..., 3] ** 2], dim=-1),
        ],
        dim=-2,
    )

    # We floor here at 0.1 but the exact level is not important; if q_abs is small,
    # the candidate won't be picked.
    flr = torch.tensor(0.1).to(dtype=q_abs.dtype, device=q_abs.device)
    quat_candidates = quat_by_rijk / (2.0 * q_abs[..., None].max(flr))

    # if not for numerical problems, quat_candidates[i] should be same (up to a sign),
    # forall i; we pick the best-conditioned one (with the largest denominator)
    out = quat_candidates[F.one_hot(q_abs.argmax(dim=-1), num_classes=4) > 0.5, :].reshape(batch_dim + (4,))

    # Convert from rijk to ijkr
    out = out[..., [1, 2, 3, 0]]

    # Canonicalize the sign so the scalar part is non-negative.
    out = standardize_quaternion(out)

    return out


def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor:
    """
    Returns torch.sqrt(torch.max(0, x))
    but with a zero subgradient where x is 0.
    """
    ret = torch.zeros_like(x)
    positive_mask = x > 0
    # Masked assignment keeps the subgradient at exactly 0 where x == 0 when
    # autograd is recording; torch.where is sufficient in inference mode.
    if torch.is_grad_enabled():
        ret[positive_mask] = torch.sqrt(x[positive_mask])
    else:
        ret = torch.where(positive_mask, torch.sqrt(x), ret)
    return ret


def standardize_quaternion(quaternions: torch.Tensor) -> torch.Tensor:
    """
    Convert a unit quaternion to a standard form: one in which the real
    part is non negative.

    Args:
        quaternions: Quaternions with real part last,
            as tensor of shape (..., 4).

    Returns:
        Standardized quaternions as tensor of shape (..., 4).
    """
    return torch.where(quaternions[..., 3:4] < 0, -quaternions, quaternions)



def unproject_depth_map_to_point_map(
    depth_map: np.ndarray, extrinsics_cam: np.ndarray, intrinsics_cam: np.ndarray
) -> np.ndarray:
    """
    Unproject a batch of depth maps to 3D world coordinates.
def depth_to_world_coords_points(
    depth_map: np.ndarray,
    extrinsic: np.ndarray,
    intrinsic: np.ndarray,
    eps=1e-8,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Convert a depth map to world coordinates.

    Args:
        depth_map (np.ndarray): Depth map of shape (H, W).
        extrinsic (np.ndarray): Camera extrinsic matrix of shape (3, 4),
            OpenCV convention (cam from world).
        intrinsic (np.ndarray): Camera intrinsic matrix of shape (3, 3).
        eps: Depths <= eps are marked invalid in the returned mask.

    Returns:
        tuple: (world_points (H, W, 3), cam_points (H, W, 3), valid_mask (H, W)),
        or (None, None, None) when depth_map is None.
    """
    if depth_map is None:
        return None, None, None

    # Valid depth mask
    point_mask = depth_map > eps

    # Pixel grid -> camera-frame 3D points.
    cam_coords_points = depth_to_cam_coords_points(depth_map, intrinsic)

    # closed_form_inverse_se3 is batched: add/strip a leading axis to invert
    # the single world-to-camera extrinsic into a camera-to-world transform.
    cam_to_world_extrinsic = closed_form_inverse_se3(extrinsic[None])[0]

    R_cam_to_world = cam_to_world_extrinsic[:3, :3]
    t_cam_to_world = cam_to_world_extrinsic[:3, 3]

    # Rotate, then translate, every camera-frame point into the world frame.
    world_coords_points = np.dot(cam_coords_points, R_cam_to_world.T) + t_cam_to_world  # HxWx3

    return world_coords_points, cam_coords_points, point_mask


def depth_to_cam_coords_points(depth_map: np.ndarray, intrinsic: np.ndarray) -> np.ndarray:
    """
    Back-project a depth map into camera-frame 3D points (pinhole model).

    Args:
        depth_map (np.ndarray): Depth map of shape (H, W).
        intrinsic (np.ndarray): Camera intrinsic matrix of shape (3, 3),
            required to have zero skew.

    Returns:
        np.ndarray: Camera coordinates of shape (H, W, 3), float32.
        (Fix: the original annotation/docstring wrongly claimed a tuple of two
        arrays was returned; a single array is returned.)
    """
    H, W = depth_map.shape
    assert intrinsic.shape == (3, 3), "Intrinsic matrix must be 3x3"
    assert intrinsic[0, 1] == 0 and intrinsic[1, 0] == 0, "Intrinsic matrix must have zero skew"

    # Intrinsic parameters
    fu, fv = intrinsic[0, 0], intrinsic[1, 1]
    cu, cv = intrinsic[0, 2], intrinsic[1, 2]

    # Pixel grid ('xy' indexing: u varies along columns, v along rows).
    u, v = np.meshgrid(np.arange(W), np.arange(H))

    # Inverse pinhole projection.
    x_cam = (u - cu) * depth_map / fu
    y_cam = (v - cv) * depth_map / fv
    z_cam = depth_map

    # Stack to form camera coordinates
    cam_coords = np.stack((x_cam, y_cam, z_cam), axis=-1).astype(np.float32)

    return cam_coords
+ + Shapes: + se3: (N, 4, 4) + R: (N, 3, 3) + T: (N, 3, 1) + """ + # Check if se3 is a numpy array or a torch tensor + is_numpy = isinstance(se3, np.ndarray) + + # Validate shapes + if se3.shape[-2:] != (4, 4) and se3.shape[-2:] != (3, 4): + raise ValueError(f"se3 must be of shape (N,4,4), got {se3.shape}.") + + # Extract R and T if not provided + if R is None: + R = se3[:, :3, :3] # (N,3,3) + if T is None: + T = se3[:, :3, 3:] # (N,3,1) + + # Transpose R + if is_numpy: + # Compute the transpose of the rotation for NumPy + R_transposed = np.transpose(R, (0, 2, 1)) + # -R^T t for NumPy + top_right = -np.matmul(R_transposed, T) + inverted_matrix = np.tile(np.eye(4), (len(R), 1, 1)) + else: + R_transposed = R.transpose(1, 2) # (N,3,3) + top_right = -torch.bmm(R_transposed, T) # (N,3,1) + inverted_matrix = torch.eye(4, 4)[None].repeat(len(R), 1, 1) + inverted_matrix = inverted_matrix.to(R.dtype).to(R.device) + + inverted_matrix[:, :3, :3] = R_transposed + inverted_matrix[:, :3, 3:] = top_right + + return inverted_matrix \ No newline at end of file diff --git a/evaluation/utils/interfaces.py b/evaluation/utils/interfaces.py new file mode 100644 index 00000000..0c4d6b16 --- /dev/null +++ b/evaluation/utils/interfaces.py @@ -0,0 +1,145 @@ +import math +import torch +import torch.nn.functional as F +import torchvision.transforms as tvf +import time + +from typing import List, Optional, Tuple +from omegaconf import DictConfig +from PIL import Image + + +def load_images(filelist: List[str], PIXEL_LIMIT: int = 255000, new_width: Optional[int] = None, verbose: bool = False): + """ + Loads images from a directory or video, resizes them to a uniform size, + then converts and stacks them into a single [N, 3, H, W] PyTorch tensor. + """ + sources = [] + + # --- 1. 
Load image paths or video frames --- + for img_path in filelist: + try: + sources.append(Image.open(img_path).convert('RGB')) + except Exception as e: + print(f"Could not load image {img_path}: {e}") + + if not sources: + print("No images found or loaded.") + return torch.empty(0) + + if verbose: + print(f"Found {len(sources)} images/frames. Processing...") + + # --- 2. Determine a uniform target size for all images based on the first image --- + # This is necessary to ensure all tensors have the same dimensions for stacking. + first_img = sources[0] + W_orig, H_orig = first_img.size + if new_width is None: + scale = math.sqrt(PIXEL_LIMIT / (W_orig * H_orig)) if W_orig * H_orig > 0 else 1 + W_target, H_target = W_orig * scale, H_orig * scale + k, m = round(W_target / 14), round(H_target / 14) + while (k * 14) * (m * 14) > PIXEL_LIMIT: + if k / m > W_target / H_target: k -= 1 + else: m -= 1 + TARGET_W, TARGET_H = max(1, k) * 14, max(1, m) * 14 + else: + TARGET_W, TARGET_H = new_width, round(H_orig * (new_width / W_orig) / 14) * 14 + if verbose: + print(f"All images will be resized to a uniform size: ({TARGET_W}, {TARGET_H})") + + # --- 3. Resize images and convert them to tensors in the [0, 1] range --- + tensor_list = [] + # Define a transform to convert a PIL Image to a CxHxW tensor and normalize to [0,1] + to_tensor_transform = tvf.ToTensor() + + for img_pil in sources: + try: + # Resize to the uniform target size + resized_img = img_pil.resize((TARGET_W, TARGET_H), Image.Resampling.LANCZOS) + # Convert to tensor + img_tensor = to_tensor_transform(resized_img) + tensor_list.append(img_tensor) + except Exception as e: + print(f"Error processing an image: {e}") + + if not tensor_list: + print("No images were successfully processed.") + return torch.empty(0) + + # --- 4. 
Stack the list of tensors into a single [N, C, H, W] batch tensor --- + return torch.stack(tensor_list, dim=0) + + +def load_and_resize14(filelist: List[str], new_width: int, device: str, verbose: bool): + imgs = load_images(filelist, new_width=new_width, verbose=verbose).to(device) + + ori_h, ori_w = imgs.shape[-2:] + patch_h, patch_w = ori_h // 14, ori_w // 14 + # (N, 3, h, w) -> (1, N, 3, h_14, w_14) + imgs = F.interpolate(imgs, (patch_h * 14, patch_w * 14), mode="bilinear", align_corners=False, antialias=True).unsqueeze(0) + return imgs + +def infer_monodepth(file: str, model, hydra_cfg: DictConfig): + + imgs = load_and_resize14([file], new_width=hydra_cfg.load_img_size, device=hydra_cfg.device, verbose=hydra_cfg.verbose) + + dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16 + with torch.no_grad(): + with torch.amp.autocast(hydra_cfg.device, dtype=dtype): + pred = model(imgs) + + points = pred['local_points'][0] # (1, h_14, w_14, 3) + depth_map = points[0, ..., -1].detach() # (h_14, w_14) + return depth_map # torch.Tensor + + +def infer_videodepth(filelist: str, model, hydra_cfg: DictConfig): + + imgs = load_and_resize14(filelist, new_width=hydra_cfg.load_img_size, device=hydra_cfg.device, verbose=hydra_cfg.verbose) + + dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16 + + start = time.time() + with torch.no_grad(): + with torch.amp.autocast(hydra_cfg.device, dtype=dtype): + pred = model(imgs) + end = time.time() + + depth_map = pred['local_points'][0, ..., -1] # (N, h_14, w_14) + depth_conf = pred['conf'][0, ..., 0] # (N, h_14, w_14) + return end - start, depth_map, depth_conf + + + +def infer_cameras_c2w(filelist: str, model, hydra_cfg: DictConfig): + + imgs = load_and_resize14(filelist, new_width=hydra_cfg.load_img_size, device=hydra_cfg.device, verbose=hydra_cfg.verbose) + + dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16 + + with 
torch.no_grad(): + with torch.amp.autocast(hydra_cfg.device, dtype=dtype): + pred = model(imgs) + + poses_c2w_all = pred['camera_poses'].cpu() + + return poses_c2w_all[0], None + +def infer_mv_pointclouds(filelist: str, model, hydra_cfg: DictConfig, data_size: Tuple[int, int]): + + imgs = load_and_resize14(filelist, new_width=hydra_cfg.load_img_size, device=hydra_cfg.device, verbose=hydra_cfg.verbose) + + dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16 + + with torch.no_grad(): + with torch.amp.autocast(hydra_cfg.device, dtype=dtype): + pred = model(imgs) + + # global_points = pred['points'][0] # (N, h, w, 3) + global_points = pred['world_points'][0] # (N, h, w, 3) + global_points = F.interpolate( + global_points.permute(0, 3, 1, 2), data_size, + mode="bilinear", align_corners=False, antialias=True + ).permute(0, 2, 3, 1) # align to gt + + return global_points.cpu().numpy() \ No newline at end of file diff --git a/evaluation/utils/messages.py b/evaluation/utils/messages.py new file mode 100644 index 00000000..ea9997be --- /dev/null +++ b/evaluation/utils/messages.py @@ -0,0 +1,87 @@ +import os +import os.path as osp +import sys +import pandas as pd + +from typing import List + +def set_default_arg(key: str, default_value: str): + """ + check if `key` in arguments, else append default value `key=value` to argument list + """ + has_key = any(arg.startswith(f"{key}=") for arg in sys.argv) + if not has_key: + sys.argv.append(f"{key}={default_value}") + +def make_csvsdir_and_remove_history_csvs(input_root: str, seqs_csv_file: str): + """ + Make the input directory for CSV files and remove any existing history CSV files. 
+ """ + if osp.isfile(seqs_csv_file): + os.remove(seqs_csv_file) + os.makedirs(input_root, exist_ok=True) + for file in os.listdir(input_root): + if file.endswith(".csv"): + os.remove(osp.join(input_root, file)) + +def gather_csv_and_write(input_root: str, output_file: str): + """ + Gather all CSV files in the input directory, concatenate them, and write to the output file. + If the input directory contains multiple rows in a CSV file, only the last row will be saved. + If the output file already exists, it will be overwritten. + """ + seq_dfs = [] + for seq_csv_file in sorted(os.listdir(input_root)): + if seq_csv_file.endswith(".csv"): + df = pd.read_csv(osp.join(input_root, seq_csv_file)) + if len(df) > 1: + print(f"Warning: {osp.join(input_root, seq_csv_file)} has more than one row, only the last row will be saved.") + df = df.tail(1) + seq_dfs.append(df) + + if len(seq_dfs) == 0: + raise ValueError(f"No CSV files found in {input_root}. Returning an empty DataFrame.") + + df = pd.concat(seq_dfs, ignore_index=True) + if osp.isfile(output_file): + print(f"Warning: {output_file} already exists, data will be overwritten.") + df.to_csv(output_file, index=False) + return df + +def write_csv(file_path: str, data_dict: dict): + # transform data of one row to DataFrame + new_row = pd.DataFrame([data_dict]) + + # directly save when the file does not exist; else we just append + if not osp.isfile(file_path): + new_row.to_csv(file_path, index=False) + else: + existing_data = pd.read_csv(file_path) + updated_data = pd.concat([existing_data, new_row], ignore_index=True) + updated_data.to_csv(file_path, index=False) + +def format_matrix_str(matrix): + def format_float(num, total_width=20, decimal_places=12): + # strip all right 0s, then strip '.' + s = f"{num:{total_width}.{decimal_places}f}".rstrip("0").rstrip(".") + # add space to the left + return f"{s:>{total_width}}" + formatted = [ + [ + # f"{num:20.12f}".rstrip("0").rstrip(".") if "." 
in f"{num}" else f"{num:20}" + # f"{num:20.12f}" if "." in f"{num}" else f"{num:20}" + format_float(num, total_width=15, decimal_places=8) + for num in row + ] + for row in matrix + ] + rows = [ + f" [{', '.join(num for num in row)}]" + for row in formatted + ] + return " [\n" + ",\n".join(rows) + "\n ]" + +def save_list_of_matrices(matrices_tosave: List[List[List[float]]], save_path: str) -> None: + json_str = "[\n" + ",\n".join(format_matrix_str(mat) for mat in matrices_tosave) + "\n]" + with open(save_path, "w") as f: + f.write(json_str) \ No newline at end of file diff --git a/evaluation/utils/mv_recon.py b/evaluation/utils/mv_recon.py new file mode 100644 index 00000000..ab5d763e --- /dev/null +++ b/evaluation/utils/mv_recon.py @@ -0,0 +1,98 @@ +# Reference: https://github.com/CUT3R/CUT3R/blob/main/eval/mv_recon/utils.py + +import numpy as np +from scipy.spatial import cKDTree as KDTree + + +def umeyama(X, Y): + """ + Estimates the Sim(3) transformation between `X` and `Y` point sets. + + Estimates c, R and t such as c * R @ X + t ~ Y. + + Parameters + ---------- + X : numpy.array + (m, n) shaped numpy array. m is the dimension of the points, + n is the number of points in the point set. + Y : numpy.array + (m, n) shaped numpy array. Indexes should be consistent with `X`. + That is, Y[:, i] must be the point corresponding to X[:, i]. + + Returns + ------- + c : float + Scale factor. + R : numpy.array + (3, 3) shaped rotation matrix. + t : numpy.array + (3, 1) shaped translation vector. 
+ """ + mu_x = X.mean(axis=1).reshape(-1, 1) + mu_y = Y.mean(axis=1).reshape(-1, 1) + var_x = np.square(X - mu_x).sum(axis=0).mean() + cov_xy = ((Y - mu_y) @ (X - mu_x).T) / X.shape[1] + U, D, VH = np.linalg.svd(cov_xy) + S = np.eye(X.shape[0]) + if np.linalg.det(U) * np.linalg.det(VH) < 0: + S[-1, -1] = -1 + c = np.trace(np.diag(D) @ S) / var_x + R = U @ S @ VH + t = mu_y - c * R @ mu_x + return c, R, t + + +def completion_ratio(gt_points, rec_points, dist_th=0.05): + gen_points_kd_tree = KDTree(rec_points) + distances, _ = gen_points_kd_tree.query(gt_points) + comp_ratio = np.mean((distances < dist_th).astype(np.float32)) + return comp_ratio + + +def accuracy(gt_points, rec_points, gt_normals=None, rec_normals=None): + gt_points_kd_tree = KDTree(gt_points) + distances, idx = gt_points_kd_tree.query(rec_points, workers=-1) + acc = np.mean(distances) + + acc_median = np.median(distances) + + if gt_normals is not None and rec_normals is not None: + normal_dot = np.sum(gt_normals[idx] * rec_normals, axis=-1) + normal_dot = np.abs(normal_dot) + + return acc, acc_median, np.mean(normal_dot), np.median(normal_dot) + + return acc, acc_median + + +def completion(gt_points, rec_points, gt_normals=None, rec_normals=None): + gt_points_kd_tree = KDTree(rec_points) + distances, idx = gt_points_kd_tree.query(gt_points, workers=-1) + comp = np.mean(distances) + comp_median = np.median(distances) + + if gt_normals is not None and rec_normals is not None: + normal_dot = np.sum(gt_normals * rec_normals[idx], axis=-1) + normal_dot = np.abs(normal_dot) + + return comp, comp_median, np.mean(normal_dot), np.median(normal_dot) + + return comp, comp_median + + +def compute_iou(pred_vox, target_vox): + # Get voxel indices + v_pred_indices = [voxel.grid_index for voxel in pred_vox.get_voxels()] + v_target_indices = [voxel.grid_index for voxel in target_vox.get_voxels()] + + # Convert to sets for set operations + v_pred_filled = set(tuple(np.round(x, 4)) for x in v_pred_indices) + 
v_target_filled = set(tuple(np.round(x, 4)) for x in v_target_indices) + + # Compute intersection and union + intersection = v_pred_filled & v_target_filled + union = v_pred_filled | v_target_filled + + # Compute IoU + iou = len(intersection) / len(union) + return iou \ No newline at end of file diff --git a/evaluation/utils/vis_utils.py b/evaluation/utils/vis_utils.py new file mode 100644 index 00000000..14cfc295 --- /dev/null +++ b/evaluation/utils/vis_utils.py @@ -0,0 +1,41 @@ +import math +import numpy as np +import torch + +from typing import Union +from PIL import Image + + +def save_image_grid(images: np.ndarray, grid_shape: tuple, save_path: str): + """ + images: numpy array of shape (N, H, W, 3) + grid_shape: (rows, cols) + """ + H, W = images.shape[1], images.shape[2] + grid = np.zeros((grid_shape[0]*H, grid_shape[1]*W, 3), dtype=np.uint8) + + for i in range(min(len(images), grid_shape[0]*grid_shape[1])): + row = i // grid_shape[1] + col = i % grid_shape[1] + grid[row*H:(row+1)*H, col*W:(col+1)*W] = images[i] + + Image.fromarray(grid).save(save_path) + + +def save_image_grid_auto(images: Union[np.ndarray, torch.Tensor], save_path: str): + """ + images: np.ndarray of shape (N, H, W, 3) in [0, 255] or torch.Tensor of shape (N, 3, H, W) in range [0, 1] + """ + if isinstance(images, torch.Tensor): + assert images.ndim == 4 and (images.shape[1] == 3 or images.shape[-1] == 3), f"images must be a 4D torch tensor with shape (N, 3, H, W) or (N, H, W, 3)" + if images.shape[1] == 3: + images = images.permute(0, 2, 3, 1) + images = (images.detach().cpu().numpy() * 255).astype(np.uint8) + elif isinstance(images, np.ndarray): + assert images.ndim == 4 and images.shape[3] == 3, f"images must be a 4D numpy array with shape (N, H, W, 3)" + else: + raise ValueError(f"images must be a numpy array or a torch tensor, but got {type(images)}") + + rows = math.floor(math.sqrt(len(images))) + cols = math.ceil(len(images) / rows) + save_image_grid(images, (rows, cols), save_path) 
\ No newline at end of file diff --git a/pretrained_models b/pretrained_models new file mode 120000 index 00000000..fb5bc8d0 --- /dev/null +++ b/pretrained_models @@ -0,0 +1 @@ +../mv-ssl/pretrained_models \ No newline at end of file diff --git a/run_eval.sh b/run_eval.sh new file mode 100644 index 00000000..d632e1a3 --- /dev/null +++ b/run_eval.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash +#SBATCH -A NAISS2024-5-609 +#SBATCH -o /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/evaluation/slurm_outs/%x_%j.out +#SBATCH -t 0-02:00:00 +#SBATCH --gpus-per-node=A100:1 +#SBATCH --nodes 1 + +python evaluation/dtu.py \ No newline at end of file diff --git a/test.sh b/test.sh new file mode 100644 index 00000000..a0a4ed36 --- /dev/null +++ b/test.sh @@ -0,0 +1,6 @@ +find ./data/re10k -type d -name images | while read imgdir; do + count=$(find "$imgdir" -maxdepth 1 -type f | wc -l) + if [ "$count" -gt 10 ]; then + basename "$(dirname "$imgdir")" + fi +done > folders_with_many_images.txt \ No newline at end of file diff --git a/todo.txt b/todo.txt new file mode 100644 index 00000000..8a391903 --- /dev/null +++ b/todo.txt @@ -0,0 +1,18 @@ +Klara (genererar snygga ply plots): +* MegaDepth +* VKITTI +* BlendedMVS +* PointOdyssey +* ScanNet +* Hypersim + + +Datsets +- [x] MegaDepth +- [x] ScanNet +- [x] VKITTI +- [ ] CO3D (downloading) +- [ ] BlendedMVS (downloading) +- [ ] WildRGB +- [ ] PointOdyssey +- [ ] MVS-Synth \ No newline at end of file diff --git a/training/config/crocov2.yaml b/training/config/crocov2.yaml new file mode 100644 index 00000000..f1b76a08 --- /dev/null +++ b/training/config/crocov2.yaml @@ -0,0 +1,6 @@ +defaults: + - default + - _self_ +exp_name: crocov2_exp002 +model: + patch_embed: crocov2 \ No newline at end of file diff --git a/training/config/default.yaml b/training/config/default.yaml index 8bec7a73..5fdbd1b0 100644 --- a/training/config/default.yaml +++ b/training/config/default.yaml @@ -2,17 +2,17 @@ defaults: - default_dataset.yaml exp_name: exp001 
-img_size: 518 +log_wandb: true +img_size: 512 num_workers: 8 seed_value: 42 -accum_steps: 2 # We did not use gradient accumulation in our training, while if you suffer from OOM, you can try to use it. -patch_size: 14 -val_epoch_freq: 5 +accum_steps: 1 # We did not use gradient accumulation in our training, while if you suffer from OOM, you can try to use it. +patch_size: 16 +val_epoch_freq: 10 max_img_per_gpu: 48 -limit_train_batches: 800 -limit_val_batches: 400 - +limit_train_batches: 1000 +limit_val_batches: 100 data: # The code for data still looks too complicated. I should refactor this again (do I have time?...) @@ -28,10 +28,51 @@ data: dataset: _target_: data.composed_dataset.ComposedDataset dataset_configs: - - _target_: data.datasets.co3d.Co3dDataset + - _target_: data.datasets.megadepth.MegadepthDataset + split: train + MEGADEPTH_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/data/megadepth + MEGADEPTH_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + len_train: 100000 + - _target_: data.datasets.scannet.ScanNetDataset + split: train + SCANNET_DIR: /mimer/NOBACKUP/groups/3d-dl/scannet/scans/scans_train + SCANNET_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + len_train: 100000 + - _target_: data.datasets.vkitti.VKittiDataset + split: train + VKitti_DIR: /mimer/NOBACKUP/groups/3d-dl/vkitti + len_train: 100000 + expand_ratio: 8 + - _target_: data.datasets.mvssynth.MVSSynthDataset + split: train + MVSSYNTH_DIR: /mimer/NOBACKUP/groups/3d-dl/MVS-Synth/GTAV_540 + MVSSYNTH_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + len_train: 100000 + - _target_: data.datasets.blendedmvs.BlendedMVSDataset split: train - CO3D_DIR: /YOUR/PATH/TO/CO3D - CO3D_ANNOTATION_DIR: /YOUR/PATH/TO/CO3D_ANNOTATION + BLENDEDMVS_DIR: /mimer/NOBACKUP/groups/3d-dl/blendedmvs_full + BLENDEDMVS_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + len_train: 100000 + - 
_target_: data.datasets.pointodyssey.PointOdysseyDataset + split: train + POINTODYSSEY_DIR: /mimer/NOBACKUP/groups/3d-dl/pointodyssey + POINTODYSSEY_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + len_train: 100000 + - _target_: data.datasets.hypersim.HypersimDataset + split: train + HYPERSIM_DIR: /mimer/NOBACKUP/groups/3d-dl/ml-hypersim/contrib/99991/downloads + HYPERSIM_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + len_train: 100000 + - _target_: data.datasets.wildrgbd.WildrgbdDataset + split: train + WILDRGBD_DIR: /mimer/NOBACKUP/groups/3d-dl/wildrgbd + WILDRGBD_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + len_train: 100000 + - _target_: data.datasets.co3dv2.Co3dDataset + split: train + CO3D_DIR: /mimer/NOBACKUP/groups/3d-dl/co3dv2 + CO3D_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations/co3d + len_train: 100000 val: _target_: data.dynamic_dataloader.DynamicTorchDataset num_workers: ${num_workers} @@ -43,16 +84,15 @@ data: dataset: _target_: data.composed_dataset.ComposedDataset dataset_configs: - - _target_: data.datasets.co3d.Co3dDataset + - _target_: data.datasets.megadepth.MegadepthDataset split: test - CO3D_DIR: /YOUR/PATH/TO/CO3D - CO3D_ANNOTATION_DIR: /YOUR/PATH/TO/CO3D_ANNOTATION - + MEGADEPTH_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/data/megadepth + MEGADEPTH_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations logging: log_dir: logs log_visuals: False - log_freq: 1 + log_freq: 5 log_level_primary: DEBUG log_level_secondary: WARNING all_ranks: False @@ -85,8 +125,8 @@ logging: checkpoint: save_dir: logs/${exp_name}/ckpts - save_freq: 5 - resume_checkpoint_path: /YOUR/PATH/TO/CKPT + save_freq: 20 + resume_checkpoint_path: # /YOUR/PATH/TO/CKPT strict: False @@ -99,27 +139,24 @@ loss: weight: 1.0 gradient_loss_fn: "grad" valid_range: 0.98 - point: null + # point: null # If you want to enable 
point, use the following config - # point: - # weight: 1.0 - # gradient_loss_fn: "normal" - # valid_range: 0.98 + point: + weight: 1.0 + gradient_loss_fn: "normal" + valid_range: 0.98 track: null - - - optim: param_group_modifiers: False optimizer: _target_: torch.optim.AdamW - lr: 5e-5 + lr: 2e-4 # 5e-5 weight_decay: 0.05 frozen_module_names: - - "*aggregator*" # example, freeze the aggregator + # - "*aggregator*" # example, freeze the aggregator amp: enabled: True @@ -136,6 +173,9 @@ optim: - module_name: ["camera"] max_norm: 1.0 # feel free to reduce this if you see instabilities norm_type: 2 + - module_name: ["point"] + max_norm: 1.0 # feel free to reduce this if you see instabilities + norm_type: 2 options: lr: - scheduler: @@ -143,9 +183,9 @@ optim: schedulers: - _target_: fvcore.common.param_scheduler.LinearParamScheduler start_value: 1e-8 - end_value: 5e-5 + end_value: ${optim.optimizer.lr} - _target_: fvcore.common.param_scheduler.CosineParamScheduler - start_value: 5e-5 + start_value: ${optim.optimizer.lr} end_value: 1e-8 lengths: [0.05, 0.95] interval_scaling: ['rescaled', 'rescaled'] @@ -154,18 +194,31 @@ optim: _target_: fvcore.common.param_scheduler.ConstantParamScheduler value: 0.05 +max_epochs: 100 +# Base: +# embed_dim=768 +# depth=12 +# num_heads=12 - -max_epochs: 20 +# Large: +# embed_dim=1024 +# depth=24 +# num_heads=16 model: - _target_: vggt.models.vggt.VGGT + _target_: vggt.models.vggt_small.VGGT + img_size: ${img_size} + embed_dim: 1024 + depth: 6 + num_heads: 16 + enable_camera: True enable_depth: True - enable_point: False + enable_point: True enable_track: False - + patch_size: ${patch_size} + patch_embed: dinov3 # crocov2 # mum # dinov3 distributed: # check https://docs.pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html for options diff --git a/training/config/default_old.yaml b/training/config/default_old.yaml new file mode 100644 index 00000000..606295ff --- /dev/null +++ b/training/config/default_old.yaml 
@@ -0,0 +1,237 @@ +defaults: + - default_dataset.yaml + +exp_name: exp001 +log_wandb: true +img_size: 336 # 400 # 480 # 512 +num_workers: 8 +seed_value: 42 +accum_steps: 1 # We did not use gradient accumulation in our training, while if you suffer from OOM, you can try to use it. +patch_size: 16 +val_epoch_freq: 10 +max_img_per_gpu: 48 + +limit_train_batches: 1000 +limit_val_batches: 100 + +data: + # The code for data still looks too complicated. I should refactor this again (do I have time?...) + train: + _target_: data.dynamic_dataloader.DynamicTorchDataset + num_workers: ${num_workers} + max_img_per_gpu: ${max_img_per_gpu} + common_config: + img_size: ${img_size} + patch_size: ${patch_size} + debug: False + repeat_batch: False + dataset: + _target_: data.composed_dataset.ComposedDataset + dataset_configs: + - _target_: data.datasets.megadepth.MegadepthDataset + split: train + MEGADEPTH_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/data/megadepth + MEGADEPTH_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + len_train: 100000 + - _target_: data.datasets.scannet.ScanNetDataset + split: train + SCANNET_DIR: /mimer/NOBACKUP/groups/3d-dl/scannet/scans/scans_train + SCANNET_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + len_train: 100000 + - _target_: data.datasets.vkitti.VKittiDataset + split: train + VKitti_DIR: /mimer/NOBACKUP/groups/3d-dl/vkitti + len_train: 100000 + expand_ratio: 8 + - _target_: data.datasets.mvssynth.MVSSynthDataset + split: train + MVSSYNTH_DIR: /mimer/NOBACKUP/groups/3d-dl/MVS-Synth/GTAV_540 + MVSSYNTH_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + len_train: 100000 + - _target_: data.datasets.blendedmvs.BlendedMVSDataset + split: train + BLENDEDMVS_DIR: /mimer/NOBACKUP/groups/3d-dl/blendedmvs_full + BLENDEDMVS_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + len_train: 100000 + - _target_: 
data.datasets.pointodyssey.PointOdysseyDataset + split: train + POINTODYSSEY_DIR: /mimer/NOBACKUP/groups/3d-dl/pointodyssey + POINTODYSSEY_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + len_train: 100000 + - _target_: data.datasets.hypersim.HypersimDataset + split: train + HYPERSIM_DIR: /mimer/NOBACKUP/groups/3d-dl/ml-hypersim/contrib/99991/downloads + HYPERSIM_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + len_train: 100000 + - _target_: data.datasets.wildrgbd.WildrgbdDataset + split: train + WILDRGBD_DIR: /mimer/NOBACKUP/groups/3d-dl/wildrgbd + WILDRGBD_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + len_train: 100000 + + - _target_: data.datasets.co3dv2.Co3dDataset + split: train + CO3D_DIR: /mimer/NOBACKUP/groups/3d-dl/co3dv2 + CO3D_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations/co3d + len_train: 100000 + val: + _target_: data.dynamic_dataloader.DynamicTorchDataset + num_workers: ${num_workers} + max_img_per_gpu: ${max_img_per_gpu} + common_config: + img_size: ${img_size} + patch_size: ${patch_size} + debug: False + dataset: + _target_: data.composed_dataset.ComposedDataset + dataset_configs: + - _target_: data.datasets.megadepth.MegadepthDataset + split: test + MEGADEPTH_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/data/megadepth + MEGADEPTH_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + +logging: + log_dir: logs + log_visuals: False + log_freq: 5 + log_level_primary: DEBUG + log_level_secondary: WARNING + all_ranks: False + tensorboard_writer: + _target_: train_utils.tb_writer.TensorBoardLogger + path: ${logging.log_dir}/tensorboard + scalar_keys_to_log: + train: + keys_to_log: + - loss_objective + - loss_camera + - loss_T + - loss_R + - loss_FL + - loss_conf_depth + - loss_reg_depth + - loss_grad_depth + val: + keys_to_log: + - loss_objective + - loss_camera + - loss_T + - loss_R + - loss_FL 
+ - loss_conf_depth + - loss_reg_depth + - loss_grad_depth + + + +checkpoint: + save_dir: logs/${exp_name}/ckpts + save_freq: 20 + resume_checkpoint_path: # /YOUR/PATH/TO/CKPT + strict: False + + +loss: + _target_: loss.MultitaskLoss + camera: + weight: 5.0 + loss_type: "l1" # The paper uses smooth l1 loss, but we found l1 loss is more stable than smooth l1 and l2 loss. + depth: + weight: 1.0 + gradient_loss_fn: "grad" + valid_range: 0.98 + # point: null + # If you want to enable point, use the following config + point: + weight: 1.0 + gradient_loss_fn: "normal" + valid_range: 0.98 + track: null + +optim: + param_group_modifiers: False + + optimizer: + _target_: torch.optim.AdamW + lr: 1e-4 # 5e-5 + weight_decay: 0.05 + + frozen_module_names: + # - "*aggregator*" # example, freeze the aggregator + + amp: + enabled: True + amp_dtype: bfloat16 + gradient_clip: + _target_: train_utils.gradient_clip.GradientClipper + configs: + - module_name: ["aggregator"] + max_norm: 1.0 # feel free to reduce this if you see instabilities + norm_type: 2 + - module_name: ["depth"] + max_norm: 1.0 # feel free to reduce this if you see instabilities + norm_type: 2 + - module_name: ["camera"] + max_norm: 1.0 # feel free to reduce this if you see instabilities + norm_type: 2 + - module_name: ["point"] + max_norm: 1.0 # feel free to reduce this if you see instabilities + norm_type: 2 + options: + lr: + - scheduler: + _target_: fvcore.common.param_scheduler.CompositeParamScheduler + schedulers: + - _target_: fvcore.common.param_scheduler.LinearParamScheduler + start_value: 1e-8 + end_value: ${optim.optimizer.lr} + - _target_: fvcore.common.param_scheduler.CosineParamScheduler + start_value: ${optim.optimizer.lr} + end_value: 1e-8 + lengths: [0.05, 0.95] + interval_scaling: ['rescaled', 'rescaled'] + weight_decay: + - scheduler: + _target_: fvcore.common.param_scheduler.ConstantParamScheduler + value: 0.05 + +max_epochs: 100 + +# Base: +# embed_dim=768 +# depth=12 +# num_heads=12 + +# Large: 
+# embed_dim=1024 +# depth=24 +# num_heads=16 + +model: + _target_: vggt.models.vggt_small.VGGT + img_size: ${img_size} + embed_dim: 768 + depth: 6 + num_heads: 12 + + enable_camera: True + enable_depth: True + enable_point: True + enable_track: False + patch_size: ${patch_size} + patch_embed: dinov3 # crocov2 # mum # dinov3 + +distributed: + # check https://docs.pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html for options + backend: nccl + comms_dtype: None + find_unused_parameters: False + timeout_mins: 30 + gradient_as_bucket_view: True # Less memory used + bucket_cap_mb: 25 + broadcast_buffers: True + +cuda: + cudnn_deterministic: False + cudnn_benchmark: False + allow_tf32: True diff --git a/training/config/dinov3.yaml b/training/config/dinov3.yaml new file mode 100644 index 00000000..e57075dc --- /dev/null +++ b/training/config/dinov3.yaml @@ -0,0 +1,6 @@ +defaults: + - default + - _self_ +exp_name: dinov3_exp005 +model: + patch_embed: dinov3 \ No newline at end of file diff --git a/training/config/mum.yaml b/training/config/mum.yaml new file mode 100644 index 00000000..990b7037 --- /dev/null +++ b/training/config/mum.yaml @@ -0,0 +1,6 @@ +defaults: + - default + - _self_ +exp_name: mum_exp005 +model: + patch_embed: mum \ No newline at end of file diff --git a/training/data/dataset_util.py b/training/data/dataset_util.py index 542af78f..e1935da9 100644 --- a/training/data/dataset_util.py +++ b/training/data/dataset_util.py @@ -708,4 +708,4 @@ def load_16big_png_depth(depth_png: str) -> np.ndarray: .astype(np.float32) .reshape((depth_pil.size[1], depth_pil.size[0])) ) - return depth + return depth \ No newline at end of file diff --git a/training/data/datasets/blendedmvs.py b/training/data/datasets/blendedmvs.py new file mode 100644 index 00000000..89d045e4 --- /dev/null +++ b/training/data/datasets/blendedmvs.py @@ -0,0 +1,264 @@ +import re +import gzip +import json +import os.path as osp +import os +import logging + +import 
cv2 +import random +import numpy as np +import h5py + +from data.dataset_util import * +from data.base_dataset import BaseDataset + +import numpy as np +import torch +import cv2 + +def read_pfm(filename): + file = open(filename, 'rb') + color = None + width = None + height = None + scale = None + endian = None + + header = file.readline().decode('utf-8').rstrip() + if header == 'PF': + color = True + elif header == 'Pf': + color = False + else: + raise Exception('Not a PFM file.') + + dim_match = re.match(r'^(\d+)\s(\d+)\s$', file.readline().decode('utf-8')) + if dim_match: + width, height = map(int, dim_match.groups()) + else: + raise Exception('Malformed PFM header.') + + scale = float(file.readline().rstrip()) + if scale < 0: # little-endian + endian = '<' + scale = -scale + else: + endian = '>' # big-endian + + data = np.fromfile(file, endian + 'f') + shape = (height, width, 3) if color else (height, width) + + data = np.reshape(data, shape) + data = np.flipud(data) + file.close() + return data, scale + +class BlendedMVSDataset(BaseDataset): + def __init__( + self, + common_conf, + split: str = "train", + BLENDEDMVS_DIR: str = None, + BLENDEDMVS_ANNOTATION_DIR: str = None, + min_num_images: int = 24, + len_train: int = 100000, + len_test: int = 10000, + ): + """ + Initialize the BlendedMVSDataset. + + Args: + common_conf: Configuration object with common settings. + split (str): Dataset split, either 'train' or 'test'. + BLENDEDMVS_DIR (str): Directory path to BlendedMVS data. + BLENDEDMVS_ANNOTATION_DIR (str): Directory path to BlendedMVS annotations. + min_num_images (int): Minimum number of images per sequence. + len_train (int): Length of the training dataset. + len_test (int): Length of the test dataset. + Raises: + ValueError: If BLENDEDMVS_DIR or BLENDEDMVS_ANNOTATION_DIR is not specified. 
+ """ + super().__init__(common_conf=common_conf) + + self.debug = common_conf.debug + self.training = common_conf.training + self.get_nearby = common_conf.get_nearby + self.load_depth = common_conf.load_depth + self.inside_random = common_conf.inside_random + self.allow_duplicate_img = common_conf.allow_duplicate_img + + if BLENDEDMVS_DIR is None or BLENDEDMVS_ANNOTATION_DIR is None: + raise ValueError("Both BLENDEDMVS_DIR and BLENDEDMVS_ANNOTATION_DIR must be specified.") + + if split == "train": + split_name = "train.jgz" + self.len_train = len_train + elif split == "test": + split_name = "test.jgz" + self.len_train = len_test + else: + raise ValueError(f"Invalid split: {split}") + + self.invalid_sequence = [] # set any invalid sequence names here + + + self.category_map = {} + self.data_store = {} + self.seqlen = None + self.min_num_images = min_num_images + + logging.info(f"BLENDEDMVS_DIR is {BLENDEDMVS_DIR}") + + self.BLENDEDMVS_DIR = BLENDEDMVS_DIR + self.BLENDEDMVS_ANNOTATION_DIR = BLENDEDMVS_ANNOTATION_DIR + + annotation_file = osp.join( + self.BLENDEDMVS_ANNOTATION_DIR, "blendedmvs", split_name + ) + + try: + with gzip.open(annotation_file, "r") as fin: + annotation = json.loads(fin.read()) + except FileNotFoundError: + logging.error(f"Annotation file not found: {annotation_file}") + total_frame_num = 0 + + for seq_name, seq_data in annotation.items(): + if seq_name in self.invalid_sequence: + continue + + if len(seq_data) < min_num_images: + continue + total_frame_num += len(seq_data) + self.data_store[seq_name] = seq_data + self.sequence_list = list(self.data_store.keys()) + self.sequence_list_len = len(self.sequence_list) + self.total_frame_num = total_frame_num + + status = "Training" if self.training else "Testing" + logging.info(f"{status}: BlendedMVS Data size: {self.sequence_list_len}") + logging.info(f"{status}: BlendedMVS Data dataset length: {len(self)}") + + def get_data( + self, + seq_index: int = None, + img_per_seq: int = None, + seq_name: 
str = None, + ids: list = None, + aspect_ratio: float = 1.0, + ) -> dict: + """ + Retrieve data for a specific sequence. + + Args: + seq_index (int): Index of the sequence to retrieve. + img_per_seq (int): Number of images per sequence. + seq_name (str): Name of the sequence. + ids (list): Specific IDs to retrieve. + aspect_ratio (float): Aspect ratio for image processing. + + Returns: + dict: A batch of data including images, depths, and other metadata. + """ + if self.inside_random: + seq_index = random.randint(0, self.sequence_list_len - 1) + + if seq_name is None: + seq_name = self.sequence_list[seq_index] + + metadata = self.data_store[seq_name] + + if ids is None: + ids = np.random.choice( + len(metadata), img_per_seq, replace=self.allow_duplicate_img + ) + + annos = [metadata[i] for i in ids] + + target_image_shape = self.get_target_shape(aspect_ratio) + + images = [] + depths = [] + cam_points = [] + world_points = [] + point_masks = [] + extrinsics = [] + intrinsics = [] + image_paths = [] + original_sizes = [] + + for anno in annos: + filepath = anno["filepath"] + + image_path = osp.join(self.BLENDEDMVS_DIR, filepath) + image = read_image_cv2(image_path) + + if self.load_depth: + depth_path = osp.join(self.BLENDEDMVS_DIR, anno["depthpath"]) + + depth_map, _ = read_pfm(depth_path) + depth_map = threshold_depth_map(depth_map, max_percentile=98, min_percentile=-1) + # depth_path = image_path.replace("/images", "/depths") + ".geometric.png" + + # mvs_mask_path = image_path.replace( + # "/images", "/depth_masks" + # ).replace(".jpg", ".png") + # mvs_mask = cv2.imread(mvs_mask_path, cv2.IMREAD_GRAYSCALE) > 128 + # depth_map[~mvs_mask] = 0 + + # depth_map = threshold_depth_map( + # depth_map, min_percentile=-1, max_percentile=98 + # ) + else: + depth_map = None + + original_size = np.array(image.shape[:2]) + extri_opencv = np.array(anno["extri"]) + intri_opencv = np.array(anno["intri"]) + + ( + image, + depth_map, + extri_opencv, + intri_opencv, + 
world_coords_points, + cam_coords_points, + point_mask, + _, + ) = self.process_one_image( + image, + depth_map, + extri_opencv, + intri_opencv, + original_size, + target_image_shape, + filepath=filepath, + ) + + images.append(image) + depths.append(depth_map) + extrinsics.append(extri_opencv) + intrinsics.append(intri_opencv) + cam_points.append(cam_coords_points) + world_points.append(world_coords_points) + point_masks.append(point_mask) + image_paths.append(image_path) + original_sizes.append(original_size) + + set_name = "BlendedMVS" + + batch = { + "seq_name": set_name + "_" + seq_name, + "ids": ids, + "frame_num": len(extrinsics), + "images": images, + "depths": depths, + "extrinsics": extrinsics, + "intrinsics": intrinsics, + "cam_points": cam_points, + "world_points": world_points, + "point_masks": point_masks, + "original_sizes": original_sizes, + } + return batch diff --git a/training/data/datasets/co3d.py b/training/data/datasets/co3d.py index 5636626d..69662867 100644 --- a/training/data/datasets/co3d.py +++ b/training/data/datasets/co3d.py @@ -1,8 +1,3 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. import gzip import json @@ -13,58 +8,28 @@ import cv2 import random import numpy as np - +import h5py from data.dataset_util import * from data.base_dataset import BaseDataset +import numpy as np +import torch +import cv2 + + +def _load_16big_png_depth(depth_png): + with Image.open(depth_png) as depth_pil: + # the image is stored with 16-bit depth but PIL reads it as I (32 bit). 
+ # we cast it to uint16, then reinterpret as float16, then cast to float32 + depth = ( + np.frombuffer(np.array(depth_pil, dtype=np.uint16), dtype=np.float16) + .astype(np.float32) + .reshape((depth_pil.size[1], depth_pil.size[0])) + ) + return depth -SEEN_CATEGORIES = [ - "apple", - "backpack", - "banana", - "baseballbat", - "baseballglove", - "bench", - "bicycle", - "bottle", - "bowl", - "broccoli", - "cake", - "car", - "carrot", - "cellphone", - "chair", - "cup", - "donut", - "hairdryer", - "handbag", - "hydrant", - "keyboard", - "laptop", - "microwave", - "motorcycle", - "mouse", - "orange", - "parkingmeter", - "pizza", - "plant", - "stopsign", - "teddybear", - "toaster", - "toilet", - "toybus", - "toyplane", - "toytrain", - "toytruck", - "tv", - "umbrella", - "vase", - "wineglass", -] - - -class Co3dDataset(BaseDataset): +class CO3DDataset(BaseDataset): def __init__( self, common_conf, @@ -76,7 +41,7 @@ def __init__( len_test: int = 10000, ): """ - Initialize the Co3dDataset. + Initialize the CO3DDataset. Args: common_conf: Configuration object with common settings. 
@@ -101,16 +66,11 @@ def __init__( if CO3D_DIR is None or CO3D_ANNOTATION_DIR is None: raise ValueError("Both CO3D_DIR and CO3D_ANNOTATION_DIR must be specified.") - category = sorted(SEEN_CATEGORIES) - - if self.debug: - category = ["apple"] - if split == "train": - split_name_list = ["train"] + split_name = "train.jgz" self.len_train = len_train elif split == "test": - split_name_list = ["test"] + split_name = "test.jgz" self.len_train = len_test else: raise ValueError(f"Invalid split: {split}") @@ -128,36 +88,32 @@ def __init__( self.CO3D_DIR = CO3D_DIR self.CO3D_ANNOTATION_DIR = CO3D_ANNOTATION_DIR - total_frame_num = 0 + annotation_file = osp.join( + self.CO3D_ANNOTATION_DIR, "co3d", split_name + ) - for c in category: - for split_name in split_name_list: - annotation_file = osp.join( - self.CO3D_ANNOTATION_DIR, f"{c}_{split_name}.jgz" - ) + try: + with gzip.open(annotation_file, "r") as fin: + annotation = json.loads(fin.read()) + except FileNotFoundError: + logging.error(f"Annotation file not found: {annotation_file}") + total_frame_num = 0 - try: - with gzip.open(annotation_file, "r") as fin: - annotation = json.loads(fin.read()) - except FileNotFoundError: - logging.error(f"Annotation file not found: {annotation_file}") - continue - - for seq_name, seq_data in annotation.items(): - if len(seq_data) < min_num_images: - continue - if seq_name in self.invalid_sequence: - continue - total_frame_num += len(seq_data) - self.data_store[seq_name] = seq_data + for seq_name, seq_data in annotation.items(): + if seq_name in self.invalid_sequence: + continue + if len(seq_data) < min_num_images: + continue + total_frame_num += len(seq_data) + self.data_store[seq_name] = seq_data self.sequence_list = list(self.data_store.keys()) self.sequence_list_len = len(self.sequence_list) self.total_frame_num = total_frame_num status = "Training" if self.training else "Testing" - logging.info(f"{status}: Co3D Data size: {self.sequence_list_len}") - logging.info(f"{status}: Co3D 
Data dataset length: {len(self)}") + logging.info(f"{status}: CO3D Data size: {self.sequence_list_len}") + logging.info(f"{status}: CO3D Data dataset length: {len(self)}") def get_data( self, @@ -216,12 +172,12 @@ def get_data( if self.load_depth: depth_path = image_path.replace("/images", "/depths") + ".geometric.png" depth_map = read_depth(depth_path, 1.0) - - mvs_mask_path = image_path.replace( - "/images", "/depth_masks" - ).replace(".jpg", ".png") - mvs_mask = cv2.imread(mvs_mask_path, cv2.IMREAD_GRAYSCALE) > 128 - depth_map[~mvs_mask] = 0 + + # mvs_mask_path = image_path.replace( + # "/images", "/depth_masks" + # ).replace(".jpg", ".png") + # mvs_mask = cv2.imread(mvs_mask_path, cv2.IMREAD_GRAYSCALE) > 128 + # depth_map[~mvs_mask] = 0 depth_map = threshold_depth_map( depth_map, min_percentile=-1, max_percentile=98 @@ -262,7 +218,7 @@ def get_data( image_paths.append(image_path) original_sizes.append(original_size) - set_name = "co3d" + set_name = "CO3D" batch = { "seq_name": set_name + "_" + seq_name, diff --git a/training/data/datasets/co3dv2.py b/training/data/datasets/co3dv2.py new file mode 100644 index 00000000..6b6104ac --- /dev/null +++ b/training/data/datasets/co3dv2.py @@ -0,0 +1,277 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
# ---- b/training/data/datasets/co3dv2.py (new file) ----

import gzip
import json
import os.path as osp
import os
import logging

import cv2
import random
import numpy as np


from data.dataset_util import *
from data.base_dataset import BaseDataset

# Candidate categories considered during curation (kept for reference):
# TV, donut, frisbee, toybus, bowl, book, car, toaster, hydrant, keyboard,
# parkingmeter, hotdog, handbag, motorcycle, pizza, teddybear, remote,
# backpack, cellphone, bench, stopsign


SEEN_CATEGORIES = [
    "apple",
    "bowl",
    "book",
    "car",
    "donut",
    "hydrant",
    "keyboard",
    "parkingmeter",
    "toaster",
    "toybus",
    "tv",
    "frisbee",
]


class Co3dDataset(BaseDataset):
    def __init__(
        self,
        common_conf,
        split: str = "train",
        CO3D_DIR: str = None,
        CO3D_ANNOTATION_DIR: str = None,
        min_num_images: int = 24,
        len_train: int = 100000,
        len_test: int = 10000,
    ):
        """
        Initialize the Co3dDataset.

        Args:
            common_conf: Configuration object with common settings.
            split (str): Dataset split, either 'train' or 'test'.
            CO3D_DIR (str): Directory path to CO3D data.
            CO3D_ANNOTATION_DIR (str): Directory path to CO3D annotations.
            min_num_images (int): Minimum number of images per sequence.
            len_train (int): Length of the training dataset.
            len_test (int): Length of the test dataset.
        Raises:
            ValueError: If CO3D_DIR or CO3D_ANNOTATION_DIR is not specified.
        """
        super().__init__(common_conf=common_conf)

        self.debug = common_conf.debug
        self.training = common_conf.training
        self.get_nearby = common_conf.get_nearby
        self.load_depth = common_conf.load_depth
        self.inside_random = common_conf.inside_random
        self.allow_duplicate_img = common_conf.allow_duplicate_img

        if CO3D_DIR is None or CO3D_ANNOTATION_DIR is None:
            raise ValueError("Both CO3D_DIR and CO3D_ANNOTATION_DIR must be specified.")

        # category = sorted(SEEN_CATEGORIES)
        # Only keep directories: CO3D_DIR may also contain plain files, which
        # would otherwise generate spurious "annotation file not found" errors
        # in the loop below.
        category = sorted(
            c for c in os.listdir(CO3D_DIR) if osp.isdir(osp.join(CO3D_DIR, c))
        )

        if self.debug:
            category = ["apple"]

        if split == "train":
            split_name_list = ["train"]
            self.len_train = len_train
        elif split == "test":
            split_name_list = ["test"]
            self.len_train = len_test
        else:
            raise ValueError(f"Invalid split: {split}")

        self.invalid_sequence = []  # set any invalid sequence names here

        self.category_map = {}
        self.data_store = {}
        self.seqlen = None
        self.min_num_images = min_num_images

        logging.info(f"CO3D_DIR is {CO3D_DIR}")

        self.CO3D_DIR = CO3D_DIR
        self.CO3D_ANNOTATION_DIR = CO3D_ANNOTATION_DIR

        total_frame_num = 0

        for c in category:
            for split_name in split_name_list:
                annotation_file = osp.join(
                    self.CO3D_ANNOTATION_DIR, f"{c}_{split_name}.jgz"
                )

                try:
                    with gzip.open(annotation_file, "r") as fin:
                        annotation = json.loads(fin.read())
                except FileNotFoundError:
                    logging.error(f"Annotation file not found: {annotation_file}")
                    continue

                # Keep only sequences that are long enough and not blacklisted.
                for seq_name, seq_data in annotation.items():
                    if len(seq_data) < min_num_images:
                        continue
                    if seq_name in self.invalid_sequence:
                        continue
                    total_frame_num += len(seq_data)
                    self.data_store[seq_name] = seq_data

        self.sequence_list = list(self.data_store.keys())
        self.sequence_list_len = len(self.sequence_list)
        self.total_frame_num = total_frame_num

        status = "Training" if self.training else "Testing"
        logging.info(f"{status}: Co3D Data size: {self.sequence_list_len}")
        logging.info(f"{status}: Co3D Data dataset length: {len(self)}")

    def get_data(
        self,
        seq_index: int = None,
        img_per_seq: int = None,
        seq_name: str = None,
        ids: list = None,
        aspect_ratio: float = 1.0,
    ) -> dict:
        """
        Retrieve data for a specific sequence.

        Args:
            seq_index (int): Index of the sequence to retrieve.
            img_per_seq (int): Number of images per sequence.
            seq_name (str): Name of the sequence.
            ids (list): Specific IDs to retrieve.
            aspect_ratio (float): Aspect ratio for image processing.

        Returns:
            dict: A batch of data including images, depths, and other metadata.
        """
        if self.inside_random:
            seq_index = random.randint(0, self.sequence_list_len - 1)

        if seq_name is None:
            seq_name = self.sequence_list[seq_index]

        metadata = self.data_store[seq_name]

        if ids is None:
            ids = np.random.choice(
                len(metadata), img_per_seq, replace=self.allow_duplicate_img
            )

        annos = [metadata[i] for i in ids]

        target_image_shape = self.get_target_shape(aspect_ratio)

        images = []
        depths = []
        cam_points = []
        world_points = []
        point_masks = []
        extrinsics = []
        intrinsics = []
        image_paths = []
        original_sizes = []

        for anno in annos:
            filepath = anno["filepath"]

            image_path = osp.join(self.CO3D_DIR, filepath)
            image = read_image_cv2(image_path)

            if self.load_depth:
                depth_path = image_path.replace("/images", "/depths") + ".geometric.png"
                depth_map = read_depth(depth_path, 1.0)

                mvs_mask_path = image_path.replace(
                    "/images", "/depth_masks"
                ).replace(".jpg", ".png")
                mvs_mask = cv2.imread(mvs_mask_path, cv2.IMREAD_GRAYSCALE)
                # cv2.imread returns None when the mask file is missing;
                # only zero out invalid depths when a mask was actually loaded.
                if mvs_mask is not None:
                    mvs_mask = mvs_mask > 128
                    depth_map[~mvs_mask] = 0

                depth_map = threshold_depth_map(
                    depth_map, min_percentile=-1, max_percentile=98
                )
            else:
                depth_map = None

            original_size = np.array(image.shape[:2])
            extri_opencv = np.array(anno["extri"])
            intri_opencv = np.array(anno["intri"])

            (
                image,
                depth_map,
                extri_opencv,
                intri_opencv,
                world_coords_points,
                cam_coords_points,
                point_mask,
                _,
            ) = self.process_one_image(
                image,
                depth_map,
                extri_opencv,
                intri_opencv,
                original_size,
                target_image_shape,
                filepath=filepath,
            )

            images.append(image)
            depths.append(depth_map)
            extrinsics.append(extri_opencv)
            intrinsics.append(intri_opencv)
            cam_points.append(cam_coords_points)
            world_points.append(world_coords_points)
            point_masks.append(point_mask)
            image_paths.append(image_path)
            original_sizes.append(original_size)

        set_name = "co3d"

        batch = {
            "seq_name": set_name + "_" + seq_name,
            "ids": ids,
            "frame_num": len(extrinsics),
            "images": images,
            "depths": depths,
            "extrinsics": extrinsics,
            "intrinsics": intrinsics,
            "cam_points": cam_points,
            "world_points": world_points,
            "point_masks": point_masks,
            "original_sizes": original_sizes,
        }
        return batch


# ---- b/training/data/datasets/hypersim.py (new file, module head) ----

import gzip
import json
import os.path as osp
import logging

import cv2
import random
import numpy as np

from data.dataset_util import *
from data.base_dataset import BaseDataset

import torch
import h5py


def to_homogeneous(x: torch.Tensor) -> torch.Tensor:
    """Append a constant 1 along the last dim (Cartesian -> homogeneous)."""
    return torch.cat((x, torch.ones_like(x[..., :1])), dim=-1)


def get_pixel_grid(
    B: int,
    H: int,
    W: int,
) -> torch.Tensor:
    """Return a (B, H, W, 2) grid of (x, y) pixel-center coordinates (+0.5)."""
    x1_n = torch.meshgrid(
        *[torch.arange(n) + 0.5 for n in (B, H, W)],
        indexing="ij",
    )
    # stack as (x, y): index 2 is the width axis, index 1 the height axis
    x1_n = torch.stack((x1_n[2], x1_n[1]), dim=-1).reshape(B, H, W, 2)
    return x1_n


def load_distance(distance_path) -> np.ndarray:
    """Read a Hypersim HDF5 distance map (stored under the 'dataset' key)."""
    with h5py.File(distance_path, "r") as x:
        return x["dataset"][:]  # type: ignore


def homog_pixel_grid(H: int, W: int) -> np.ndarray:
    """Return homogeneous pixel-center coordinates as a (3, H*W) numpy array."""
    return (
        to_homogeneous(
            get_pixel_grid(
                1,
                H,
                W,
            )
        )
        .numpy()
        .reshape(-1, 3)
        .T
    )
def depth_from_distance(
    distance: torch.Tensor, K: torch.Tensor
) -> torch.Tensor:
    """Convert a per-pixel Euclidean distance map to a planar z-depth map.

    Hypersim stores distance from the camera center; z-depth is obtained by
    scaling each distance by the z-component of the corresponding unit ray.

    Args:
        distance: (H, W) tensor of distances along camera rays.
        K: (3, 3) camera intrinsics tensor.

    Returns:
        (H, W, 1) tensor of z-depths.
    """
    H, W = distance.shape[0], distance.shape[1]
    # homog_pixel_grid returns a numpy (3, H*W) array; convert it to a tensor
    # before the matmul (the original mixed numpy and torch operands here,
    # which fails at runtime).
    grid = torch.from_numpy(homog_pixel_grid(H, W)).to(K.dtype)
    rays = torch.linalg.inv(K) @ grid  # 3xHW
    ray_z = rays[-1] / torch.linalg.norm(rays, dim=0)
    z = distance.reshape(-1) * ray_z
    return z.reshape(H, W, 1)


class HypersimDataset(BaseDataset):
    def __init__(
        self,
        common_conf,
        split: str = "train",
        HYPERSIM_DIR: str = None,
        HYPERSIM_ANNOTATION_DIR: str = None,
        min_num_images: int = 24,
        len_train: int = 100000,
        len_test: int = 10000,
    ):
        """
        Initialize the HypersimDataset.

        Args:
            common_conf: Configuration object with common settings.
            split (str): Dataset split, either 'train' or 'test'.
            HYPERSIM_DIR (str): Directory path to Hypersim data.
            HYPERSIM_ANNOTATION_DIR (str): Directory path to Hypersim annotations.
            min_num_images (int): Minimum number of images per sequence.
            len_train (int): Length of the training dataset.
            len_test (int): Length of the test dataset.
        Raises:
            ValueError: If HYPERSIM_DIR or HYPERSIM_ANNOTATION_DIR is not specified.
        """
        super().__init__(common_conf=common_conf)

        self.debug = common_conf.debug
        self.training = common_conf.training
        self.get_nearby = common_conf.get_nearby
        self.load_depth = common_conf.load_depth
        self.inside_random = common_conf.inside_random
        self.allow_duplicate_img = common_conf.allow_duplicate_img

        if HYPERSIM_DIR is None or HYPERSIM_ANNOTATION_DIR is None:
            raise ValueError("Both HYPERSIM_DIR and HYPERSIM_ANNOTATION_DIR must be specified.")

        if split == "train":
            split_name = "train.jgz"
            self.len_train = len_train
        elif split == "test":
            split_name = "test.jgz"
            self.len_train = len_test
        else:
            raise ValueError(f"Invalid split: {split}")

        self.invalid_sequence = []  # set any invalid sequence names here

        self.category_map = {}
        self.data_store = {}
        self.seqlen = None
        self.min_num_images = min_num_images

        logging.info(f"HYPERSIM_DIR is {HYPERSIM_DIR}")

        self.HYPERSIM_DIR = HYPERSIM_DIR
        self.HYPERSIM_ANNOTATION_DIR = HYPERSIM_ANNOTATION_DIR

        annotation_file = osp.join(
            self.HYPERSIM_ANNOTATION_DIR, "hypersim", split_name
        )

        try:
            with gzip.open(annotation_file, "r") as fin:
                annotation = json.loads(fin.read())
        except FileNotFoundError:
            logging.error(f"Annotation file not found: {annotation_file}")
            # Fall through with an empty dataset instead of raising NameError
            # when `annotation` is used below (the original left it undefined).
            annotation = {}

        total_frame_num = 0

        for seq_name, seq_data in annotation.items():
            if seq_name in self.invalid_sequence:
                continue

            if len(seq_data) < min_num_images:
                continue
            total_frame_num += len(seq_data)
            self.data_store[seq_name] = seq_data
        self.sequence_list = list(self.data_store.keys())
        self.sequence_list_len = len(self.sequence_list)
        self.total_frame_num = total_frame_num

        status = "Training" if self.training else "Testing"
        logging.info(f"{status}: Hypersim Data size: {self.sequence_list_len}")
        logging.info(f"{status}: Hypersim Data dataset length: {len(self)}")

    def get_data(
        self,
        seq_index: int = None,
        img_per_seq: int = None,
        seq_name: str = None,
        ids: list = None,
        aspect_ratio: float = 1.0,
        max_retries: int = 10,
    ) -> dict:
        """
        Retrieve data for a specific sequence, retrying with a different
        random sequence when the sampled one has an off-center principal
        point (see the cx/cy check below).

        Args:
            seq_index (int): Index of the sequence to retrieve.
            img_per_seq (int): Number of images per sequence.
            seq_name (str): Name of the sequence.
            ids (list): Specific IDs to retrieve.
            aspect_ratio (float): Aspect ratio for image processing.
            max_retries (int): Maximum number of retry attempts.

        Returns:
            dict: A batch of data including images, depths, and other metadata.

        Raises:
            RuntimeError: If no valid sequence is found within max_retries.
        """
        original_seq_index = seq_index
        original_seq_name = seq_name

        for attempt in range(max_retries):
            # Force a new random sequence on retry
            if attempt > 0 or self.inside_random:
                seq_index = random.randint(0, self.sequence_list_len - 1)
                seq_name = None  # Reset seq_name to force using the new index

            if seq_name is None:
                seq_name = self.sequence_list[seq_index]

            metadata = self.data_store[seq_name]

            if ids is None or attempt > 0:  # Also resample IDs on retry
                ids = np.random.choice(
                    len(metadata), img_per_seq, replace=self.allow_duplicate_img
                )

            annos = [metadata[i] for i in ids]
            target_image_shape = self.get_target_shape(aspect_ratio)

            images = []
            depths = []
            cam_points = []
            world_points = []
            point_masks = []
            extrinsics = []
            intrinsics = []
            image_paths = []
            original_sizes = []

            valid_sequence = True

            for anno in annos:
                filepath = anno["filepath"]
                image_path = osp.join(self.HYPERSIM_DIR, filepath)
                image = read_image_cv2(image_path)

                if self.load_depth:
                    # Hypersim distances are stored in asset units; rescale to
                    # meters, then convert ray distance to planar z-depth.
                    meters_per_asset = anno["meters_per_asset"]
                    depth_path = osp.join(self.HYPERSIM_DIR, anno["depthpath"])
                    distance = (
                        torch.tensor(load_distance(depth_path)).float() / meters_per_asset
                    )
                    intrinsic = torch.tensor(anno["intri"]).reshape(3, 3).float()

                    depth = depth_from_distance(distance, intrinsic).float()
                    depth[depth.isnan()] = 0

                    depth_map = depth.squeeze(-1).numpy()
                    depth_map = threshold_depth_map(
                        depth_map, min_percentile=-1, max_percentile=98
                    )
                else:
                    depth_map = None

                original_size = np.array(image.shape[:2])
                extri_opencv = np.array(anno["extri"])
                intri_opencv = np.array(anno["intri"])
                cx = intri_opencv[0, 2]
                cy = intri_opencv[1, 2]

                # Reject frames whose principal point falls outside the
                # expected 1024x768 image bounds.
                if cy > 768 or cx > 1024:
                    valid_sequence = False
                    break  # Break and try a different sequence

                # Setting zero skew
                intri_opencv[0, 1] = 0.0

                (
                    image,
                    depth_map,
                    extri_opencv,
                    intri_opencv,
                    world_coords_points,
                    cam_coords_points,
                    point_mask,
                    _,
                ) = self.process_one_image(
                    image,
                    depth_map,
                    extri_opencv,
                    intri_opencv,
                    original_size,
                    target_image_shape,
                    filepath=filepath,
                )

                images.append(image)
                depths.append(depth_map)
                extrinsics.append(extri_opencv)
                intrinsics.append(intri_opencv)
                cam_points.append(cam_coords_points)
                world_points.append(world_coords_points)
                point_masks.append(point_mask)
                image_paths.append(image_path)
                original_sizes.append(original_size)

            if valid_sequence:
                set_name = "Hypersim"
                batch = {
                    "seq_name": set_name + "_" + seq_name,
                    "ids": ids,
                    "frame_num": len(extrinsics),
                    "images": images,
                    "depths": depths,
                    "extrinsics": extrinsics,
                    "intrinsics": intrinsics,
                    "cam_points": cam_points,
                    "world_points": world_points,
                    "point_masks": point_masks,
                    "original_sizes": original_sizes,
                }
                return batch

            # Reset for next attempt
            seq_index = original_seq_index
            seq_name = original_seq_name

        raise RuntimeError(f"Failed to find valid sequence after {max_retries} attempts")


# ---- b/training/data/datasets/mapillary.py (new file) ----
# https://www.mapillary.com/dataset/metropolis
# https://github.com/mapillary/metropolis_sdk/blob/main/FORMAT.md
# https://github.com/mapillary/metropolis_sdk/blob/main/SENSORS.md
# ---- b/training/data/datasets/megadepth.py (new file) ----

import gzip
import json
import os.path as osp
import os
import logging

import cv2
import random
import numpy as np
import h5py

from data.dataset_util import *
from data.base_dataset import BaseDataset


class MegadepthDataset(BaseDataset):
    def __init__(
        self,
        common_conf,
        split: str = "train",
        MEGADEPTH_DIR: str = None,
        MEGADEPTH_ANNOTATION_DIR: str = None,
        min_num_images: int = 24,
        len_train: int = 100000,
        len_test: int = 10000,
    ):
        """
        Initialize the MegadepthDataset.

        Args:
            common_conf: Configuration object with common settings.
            split (str): Dataset split, either 'train' or 'test'.
            MEGADEPTH_DIR (str): Directory path to Megadepth data.
            MEGADEPTH_ANNOTATION_DIR (str): Directory path to Megadepth annotations.
            min_num_images (int): Minimum number of images per sequence.
            len_train (int): Length of the training dataset.
            len_test (int): Length of the test dataset.
        Raises:
            ValueError: If MEGADEPTH_DIR or MEGADEPTH_ANNOTATION_DIR is not specified.
        """
        super().__init__(common_conf=common_conf)

        self.debug = common_conf.debug
        self.training = common_conf.training
        self.get_nearby = common_conf.get_nearby
        self.load_depth = common_conf.load_depth
        self.inside_random = common_conf.inside_random
        self.allow_duplicate_img = common_conf.allow_duplicate_img

        if MEGADEPTH_DIR is None or MEGADEPTH_ANNOTATION_DIR is None:
            raise ValueError("Both MEGADEPTH_DIR and MEGADEPTH_ANNOTATION_DIR must be specified.")

        if split == "train":
            split_name = "train.jgz"
            self.len_train = len_train
        elif split == "test":
            split_name = "test.jgz"
            self.len_train = len_test
        else:
            raise ValueError(f"Invalid split: {split}")

        self.invalid_sequence = []  # set any invalid sequence names here

        self.category_map = {}
        self.data_store = {}
        self.seqlen = None
        self.min_num_images = min_num_images

        logging.info(f"MEGADEPTH_DIR is {MEGADEPTH_DIR}")

        self.MEGADEPTH_DIR = MEGADEPTH_DIR
        self.MEGADEPTH_ANNOTATION_DIR = MEGADEPTH_ANNOTATION_DIR

        annotation_file = osp.join(
            self.MEGADEPTH_ANNOTATION_DIR, "megadepth", split_name
        )

        try:
            with gzip.open(annotation_file, "r") as fin:
                annotation = json.loads(fin.read())
        except FileNotFoundError:
            logging.error(f"Annotation file not found: {annotation_file}")
            # Fall through with an empty dataset instead of raising NameError
            # when `annotation` is iterated below (the original left it undefined).
            annotation = {}

        total_frame_num = 0

        # Megadepth groups frames per scene; each scene contains several
        # sub-sequences, stored under "<scene>_<i>" keys.
        for scene_name, scene_data in annotation.items():
            if scene_name in self.invalid_sequence:
                continue

            for i, seq_data in enumerate(scene_data):
                if len(seq_data) < min_num_images:
                    continue
                total_frame_num += len(seq_data)
                self.data_store[f"{scene_name}_{i}"] = seq_data

        self.sequence_list = list(self.data_store.keys())
        self.sequence_list_len = len(self.sequence_list)
        self.total_frame_num = total_frame_num

        status = "Training" if self.training else "Testing"
        logging.info(f"{status}: Megadepth Data size: {self.sequence_list_len}")
        logging.info(f"{status}: Megadepth Data dataset length: {len(self)}")

    def get_data(
        self,
        seq_index: int = None,
        img_per_seq: int = None,
        seq_name: str = None,
        ids: list = None,
        aspect_ratio: float = 1.0,
    ) -> dict:
        """
        Retrieve data for a specific sequence.

        Args:
            seq_index (int): Index of the sequence to retrieve.
            img_per_seq (int): Number of images per sequence.
            seq_name (str): Name of the sequence.
            ids (list): Specific IDs to retrieve.
            aspect_ratio (float): Aspect ratio for image processing.

        Returns:
            dict: A batch of data including images, depths, and other metadata.
        """
        if self.inside_random:
            seq_index = random.randint(0, self.sequence_list_len - 1)

        if seq_name is None:
            seq_name = self.sequence_list[seq_index]

        metadata = self.data_store[seq_name]

        if ids is None:
            ids = np.random.choice(
                len(metadata), img_per_seq, replace=self.allow_duplicate_img
            )

        annos = [metadata[i] for i in ids]

        target_image_shape = self.get_target_shape(aspect_ratio)

        images = []
        depths = []
        cam_points = []
        world_points = []
        point_masks = []
        extrinsics = []
        intrinsics = []
        image_paths = []
        original_sizes = []

        for anno in annos:
            filepath = anno["filepath"]

            image_path = osp.join(self.MEGADEPTH_DIR, filepath)
            image = read_image_cv2(image_path)

            if self.load_depth:
                # Megadepth depths are stored as HDF5 under the "depth" key.
                depth_path = osp.join(self.MEGADEPTH_DIR, anno["depth_path"])
                depth_map = np.array(h5py.File(depth_path, "r")["depth"])
                depth_map = threshold_depth_map(depth_map, max_percentile=98, min_percentile=-1)

            else:
                depth_map = None

            original_size = np.array(image.shape[:2])
            extri_opencv = np.array(anno["extri"])
            intri_opencv = np.array(anno["intri"])

            (
                image,
                depth_map,
                extri_opencv,
                intri_opencv,
                world_coords_points,
                cam_coords_points,
                point_mask,
                _,
            ) = self.process_one_image(
                image,
                depth_map,
                extri_opencv,
                intri_opencv,
                original_size,
                target_image_shape,
                filepath=filepath,
            )

            images.append(image)
            depths.append(depth_map)
            extrinsics.append(extri_opencv)
intrinsics.append(intri_opencv) + cam_points.append(cam_coords_points) + world_points.append(world_coords_points) + point_masks.append(point_mask) + image_paths.append(image_path) + original_sizes.append(original_size) + + set_name = "Megadepth" + + batch = { + "seq_name": set_name + "_" + seq_name, + "ids": ids, + "frame_num": len(extrinsics), + "images": images, + "depths": depths, + "extrinsics": extrinsics, + "intrinsics": intrinsics, + "cam_points": cam_points, + "world_points": world_points, + "point_masks": point_masks, + "original_sizes": original_sizes, + } + return batch diff --git a/training/data/datasets/mvssynth.py b/training/data/datasets/mvssynth.py new file mode 100644 index 00000000..4a1ab4d1 --- /dev/null +++ b/training/data/datasets/mvssynth.py @@ -0,0 +1,227 @@ + +import gzip +import json +import os.path as osp +import os +import logging + +import cv2 +import random +import numpy as np +import h5py +import imageio + +from data.dataset_util import * +from data.base_dataset import BaseDataset + +import numpy as np +import torch +import cv2 + +# https://github.com/phuang17/DeepMVS/issues/13 + +def read_img_depth_pose(depth_path): + raw_depth = np.asarray(imageio.imread(depth_path)[:]) + raw_depth = np.clip(raw_depth, 0.1, 1000.0) + # print('Raw depth shape:', raw_depth.shape) + return raw_depth[:, :, 0] + +class MVSSynthDataset(BaseDataset): + def __init__( + self, + common_conf, + split: str = "train", + MVSSYNTH_DIR: str = None, + MVSSYNTH_ANNOTATION_DIR: str = None, + min_num_images: int = 24, + len_train: int = 100000, + len_test: int = 10000, + ): + """ + Initialize the MVSSynthDataset. + + Args: + common_conf: Configuration object with common settings. + split (str): Dataset split, either 'train' or 'test'. + MVSSYNTH_DIR (str): Directory path to MVSSynth data. + MVSSYNTH_ANNOTATION_DIR (str): Directory path to MVSSynth annotations. + min_num_images (int): Minimum number of images per sequence. 
+ len_train (int): Length of the training dataset. + len_test (int): Length of the test dataset. + Raises: + ValueError: If MVSSYNTH_DIR or MVSSYNTH_ANNOTATION_DIR is not specified. + """ + super().__init__(common_conf=common_conf) + + self.debug = common_conf.debug + self.training = common_conf.training + self.get_nearby = common_conf.get_nearby + self.load_depth = common_conf.load_depth + self.inside_random = common_conf.inside_random + self.allow_duplicate_img = common_conf.allow_duplicate_img + + if MVSSYNTH_DIR is None or MVSSYNTH_ANNOTATION_DIR is None: + raise ValueError("Both MVSSYNTH_DIR and MVSSYNTH_ANNOTATION_DIR must be specified.") + + if split == "train": + split_name = "train.jgz" + self.len_train = len_train + elif split == "test": + split_name = "test.jgz" + self.len_train = len_test + else: + raise ValueError(f"Invalid split: {split}") + + self.invalid_sequence = [] # set any invalid sequence names here + + + self.category_map = {} + self.data_store = {} + self.seqlen = None + self.min_num_images = min_num_images + + logging.info(f"MVSSYNTH_DIR is {MVSSYNTH_DIR}") + + self.MVSSYNTH_DIR = MVSSYNTH_DIR + self.MVSSYNTH_ANNOTATION_DIR = MVSSYNTH_ANNOTATION_DIR + + annotation_file = osp.join( + self.MVSSYNTH_ANNOTATION_DIR, "mvssynth", split_name + ) + + try: + with gzip.open(annotation_file, "r") as fin: + annotation = json.loads(fin.read()) + except FileNotFoundError: + logging.error(f"Annotation file not found: {annotation_file}") + total_frame_num = 0 + + for seq_name, seq_data in annotation.items(): + if seq_name in self.invalid_sequence: + continue + + if len(seq_data) < min_num_images: + continue + total_frame_num += len(seq_data) + self.data_store[seq_name] = seq_data + self.sequence_list = list(self.data_store.keys()) + self.sequence_list_len = len(self.sequence_list) + self.total_frame_num = total_frame_num + + status = "Training" if self.training else "Testing" + logging.info(f"{status}: MVSSynth Data size: {self.sequence_list_len}") + 
logging.info(f"{status}: MVSSynth Data dataset length: {len(self)}") + + def get_data( + self, + seq_index: int = None, + img_per_seq: int = None, + seq_name: str = None, + ids: list = None, + aspect_ratio: float = 1.0, + ) -> dict: + """ + Retrieve data for a specific sequence. + + Args: + seq_index (int): Index of the sequence to retrieve. + img_per_seq (int): Number of images per sequence. + seq_name (str): Name of the sequence. + ids (list): Specific IDs to retrieve. + aspect_ratio (float): Aspect ratio for image processing. + + Returns: + dict: A batch of data including images, depths, and other metadata. + """ + if self.inside_random: + seq_index = random.randint(0, self.sequence_list_len - 1) + + if seq_name is None: + seq_name = self.sequence_list[seq_index] + + metadata = self.data_store[seq_name] + + if ids is None: + ids = np.random.choice( + len(metadata), img_per_seq, replace=self.allow_duplicate_img + ) + + annos = [metadata[i] for i in ids] + + target_image_shape = self.get_target_shape(aspect_ratio) + + images = [] + depths = [] + cam_points = [] + world_points = [] + point_masks = [] + extrinsics = [] + intrinsics = [] + image_paths = [] + original_sizes = [] + + for anno in annos: + filepath = anno["filepath"] + + image_path = osp.join(self.MVSSYNTH_DIR, filepath) + image = read_image_cv2(image_path) + + if self.load_depth: + depth_path = osp.join(self.MVSSYNTH_DIR, anno["depthpath"]) + d = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH) + d[d > 1e9] = 0.0 + d[~np.isfinite(d)] = 0.0 + depth_map = d + depth_map = threshold_depth_map(depth_map, max_percentile=98, min_percentile=-1) + else: + depth_map = None + + original_size = np.array(image.shape[:2]) + extri_opencv = np.array(anno["extri"]) + intri_opencv = np.array(anno["intri"]) + + ( + image, + depth_map, + extri_opencv, + intri_opencv, + world_coords_points, + cam_coords_points, + point_mask, + _, + ) = self.process_one_image( + image, + depth_map, + extri_opencv, + intri_opencv, + original_size, 
+ target_image_shape, + filepath=filepath, + ) + + images.append(image) + depths.append(depth_map) + extrinsics.append(extri_opencv) + intrinsics.append(intri_opencv) + cam_points.append(cam_coords_points) + world_points.append(world_coords_points) + point_masks.append(point_mask) + image_paths.append(image_path) + original_sizes.append(original_size) + + set_name = "MVSSynth" + + batch = { + "seq_name": set_name + "_" + seq_name, + "ids": ids, + "frame_num": len(extrinsics), + "images": images, + "depths": depths, + "extrinsics": extrinsics, + "intrinsics": intrinsics, + "cam_points": cam_points, + "world_points": world_points, + "point_masks": point_masks, + "original_sizes": original_sizes, + } + return batch diff --git a/training/data/datasets/pointodyssey.py b/training/data/datasets/pointodyssey.py new file mode 100644 index 00000000..a8a2634f --- /dev/null +++ b/training/data/datasets/pointodyssey.py @@ -0,0 +1,225 @@ +# Depth should be divided by 1000 + + +import gzip +import json +import os.path as osp +import logging + +import cv2 +import random +import numpy as np + +from data.dataset_util import * +from data.base_dataset import BaseDataset + +import numpy as np +import cv2 + +class PointOdysseyDataset(BaseDataset): + def __init__( + self, + common_conf, + split: str = "train", + POINTODYSSEY_DIR: str = None, + POINTODYSSEY_ANNOTATION_DIR: str = None, + min_num_images: int = 24, + len_train: int = 100000, + len_test: int = 10000, + ): + """ + Initialize the PointOdysseyDataset. + + Args: + common_conf: Configuration object with common settings. + split (str): Dataset split, either 'train' or 'test'. + POINTODYSSEY_DIR (str): Directory path to PointOdyssey data. + POINTODYSSEY_ANNOTATION_DIR (str): Directory path to PointOdyssey annotations. + min_num_images (int): Minimum number of images per sequence. + len_train (int): Length of the training dataset. + len_test (int): Length of the test dataset. 
+ Raises: + ValueError: If POINTODYSSEY_DIR or POINTODYSSEY_ANNOTATION_DIR is not specified. + """ + super().__init__(common_conf=common_conf) + + self.debug = common_conf.debug + self.training = common_conf.training + self.get_nearby = common_conf.get_nearby + self.load_depth = common_conf.load_depth + self.inside_random = common_conf.inside_random + self.allow_duplicate_img = common_conf.allow_duplicate_img + + if POINTODYSSEY_DIR is None or POINTODYSSEY_ANNOTATION_DIR is None: + raise ValueError("Both POINTODYSSEY_DIR and POINTODYSSEY_ANNOTATION_DIR must be specified.") + + if split == "train": + split_name = "train.jgz" + self.len_train = len_train + elif split == "test": + split_name = "test.jgz" + self.len_train = len_test + else: + raise ValueError(f"Invalid split: {split}") + + self.invalid_sequence = [] # set any invalid sequence names here + + + self.category_map = {} + self.data_store = {} + self.seqlen = None + self.min_num_images = min_num_images + + logging.info(f"POINTODYSSEY_DIR is {POINTODYSSEY_DIR}") + + self.POINTODYSSEY_DIR = POINTODYSSEY_DIR + self.POINTODYSSEY_ANNOTATION_DIR = POINTODYSSEY_ANNOTATION_DIR + + annotation_file = osp.join( + self.POINTODYSSEY_ANNOTATION_DIR, "pointodyssey", split_name + ) + + try: + with gzip.open(annotation_file, "r") as fin: + annotation = json.loads(fin.read()) + except FileNotFoundError: + logging.error(f"Annotation file not found: {annotation_file}") + total_frame_num = 0 + + for seq_name, seq_data in annotation.items(): + if seq_name in self.invalid_sequence: + continue + + if len(seq_data) < min_num_images: + continue + total_frame_num += len(seq_data) + self.data_store[seq_name] = seq_data + self.sequence_list = list(self.data_store.keys()) + self.sequence_list_len = len(self.sequence_list) + self.total_frame_num = total_frame_num + + status = "Training" if self.training else "Testing" + logging.info(f"{status}: PointOdyssey Data size: {self.sequence_list_len}") + logging.info(f"{status}: PointOdyssey Data 
dataset length: {len(self)}") + + def get_data( + self, + seq_index: int = None, + img_per_seq: int = None, + seq_name: str = None, + ids: list = None, + aspect_ratio: float = 1.0, + ) -> dict: + """ + Retrieve data for a specific sequence. + + Args: + seq_index (int): Index of the sequence to retrieve. + img_per_seq (int): Number of images per sequence. + seq_name (str): Name of the sequence. + ids (list): Specific IDs to retrieve. + aspect_ratio (float): Aspect ratio for image processing. + + Returns: + dict: A batch of data including images, depths, and other metadata. + """ + if self.inside_random: + seq_index = random.randint(0, self.sequence_list_len - 1) + + if seq_name is None: + seq_name = self.sequence_list[seq_index] + + metadata = self.data_store[seq_name] + + if ids is None: + ids = np.random.choice( + len(metadata), img_per_seq, replace=self.allow_duplicate_img + ) + + annos = [metadata[i] for i in ids] + + target_image_shape = self.get_target_shape(aspect_ratio) + + images = [] + depths = [] + cam_points = [] + world_points = [] + point_masks = [] + extrinsics = [] + intrinsics = [] + image_paths = [] + original_sizes = [] + + for anno in annos: + filepath = anno["filepath"] + + image_path = osp.join(self.POINTODYSSEY_DIR, filepath) + image = read_image_cv2(image_path) + + if self.load_depth: + depth_path = osp.join(self.POINTODYSSEY_DIR, anno["depthpath"]) + # depth_map = read_depth(depth_path, 1.0) * 1000 + depth_16bit = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH) + depth_map = depth_16bit.astype(np.float32) / 65535.0 * 1000.0 + + # mvs_mask_path = image_path.replace( + # "/rgbs", "/masks" + # ).replace("rgb", "mask").replace(".jpg", ".png") + # mvs_mask = cv2.imread(mvs_mask_path, cv2.IMREAD_GRAYSCALE) > 128 + # depth_map[~mvs_mask] = 0 + + depth_map = threshold_depth_map( + depth_map, min_percentile=-1, max_percentile=98 + ) + else: + depth_map = None + + original_size = np.array(image.shape[:2]) + extri_opencv = np.array(anno["extri"]) + 
intri_opencv = np.array(anno["intri"]) + + ( + image, + depth_map, + extri_opencv, + intri_opencv, + world_coords_points, + cam_coords_points, + point_mask, + _, + ) = self.process_one_image( + image, + depth_map, + extri_opencv, + intri_opencv, + original_size, + target_image_shape, + filepath=filepath, + ) + + images.append(image) + depths.append(depth_map) + extrinsics.append(extri_opencv) + intrinsics.append(intri_opencv) + cam_points.append(cam_coords_points) + world_points.append(world_coords_points) + point_masks.append(point_mask) + image_paths.append(image_path) + original_sizes.append(original_size) + + set_name = "PointOdyssey" + + batch = { + "seq_name": set_name + "_" + seq_name, + "ids": ids, + "frame_num": len(extrinsics), + "images": images, + "depths": depths, + "extrinsics": extrinsics, + "intrinsics": intrinsics, + "cam_points": cam_points, + "world_points": world_points, + "point_masks": point_masks, + "original_sizes": original_sizes, + } + return batch diff --git a/training/data/datasets/scannet.py b/training/data/datasets/scannet.py new file mode 100644 index 00000000..66f19e7b --- /dev/null +++ b/training/data/datasets/scannet.py @@ -0,0 +1,231 @@ + +import gzip +import json +import os.path as osp +import os +import logging + +import cv2 +import random +import numpy as np +import h5py + +from data.dataset_util import * +from data.base_dataset import BaseDataset + +import numpy as np +import torch +import cv2 + +class ScanNetDataset(BaseDataset): + def __init__( + self, + common_conf, + split: str = "train", + SCANNET_DIR: str = None, + SCANNET_ANNOTATION_DIR: str = None, + min_num_images: int = 24, + len_train: int = 100000, + len_test: int = 10000, + ): + """ + Initialize the ScanNetDataset. + + Args: + common_conf: Configuration object with common settings. + split (str): Dataset split, either 'train' or 'test'. + SCANNET_DIR (str): Directory path to ScanNet data. + SCANNET_ANNOTATION_DIR (str): Directory path to ScanNet annotations. 
+ min_num_images (int): Minimum number of images per sequence. + len_train (int): Length of the training dataset. + len_test (int): Length of the test dataset. + Raises: + ValueError: If SCANNET_DIR or SCANNET_ANNOTATION_DIR is not specified. + """ + super().__init__(common_conf=common_conf) + + self.debug = common_conf.debug + self.training = common_conf.training + self.get_nearby = common_conf.get_nearby + self.load_depth = common_conf.load_depth + self.inside_random = common_conf.inside_random + self.allow_duplicate_img = common_conf.allow_duplicate_img + + if SCANNET_DIR is None or SCANNET_ANNOTATION_DIR is None: + raise ValueError("Both SCANNET_DIR and SCANNET_ANNOTATION_DIR must be specified.") + + if split == "train": + split_name = "train.jgz" + self.len_train = len_train + elif split == "test": + split_name = "test.jgz" + self.len_train = len_test + else: + raise ValueError(f"Invalid split: {split}") + + self.invalid_sequence = [] # set any invalid sequence names here + + + self.category_map = {} + self.data_store = {} + self.seqlen = None + self.min_num_images = min_num_images + + logging.info(f"SCANNET_DIR is {SCANNET_DIR}") + + self.SCANNET_DIR = SCANNET_DIR + self.SCANNET_ANNOTATION_DIR = SCANNET_ANNOTATION_DIR + + annotation_file = osp.join( + self.SCANNET_ANNOTATION_DIR, "scannet", split_name + ) + + try: + with gzip.open(annotation_file, "r") as fin: + annotation = json.loads(fin.read()) + except FileNotFoundError: + logging.error(f"Annotation file not found: {annotation_file}") + total_frame_num = 0 + + for seq_name, seq_data in annotation.items(): + if seq_name in self.invalid_sequence: + continue + + if len(seq_data) < min_num_images: + continue + total_frame_num += len(seq_data) + self.data_store[seq_name] = seq_data + self.sequence_list = list(self.data_store.keys()) + self.sequence_list_len = len(self.sequence_list) + self.total_frame_num = total_frame_num + + status = "Training" if self.training else "Testing" + logging.info(f"{status}: 
ScanNet Data size: {self.sequence_list_len}") + logging.info(f"{status}: ScanNet Data dataset length: {len(self)}") + + def get_data( + self, + seq_index: int = None, + img_per_seq: int = None, + seq_name: str = None, + ids: list = None, + aspect_ratio: float = 1.0, + ) -> dict: + """ + Retrieve data for a specific sequence. + + Args: + seq_index (int): Index of the sequence to retrieve. + img_per_seq (int): Number of images per sequence. + seq_name (str): Name of the sequence. + ids (list): Specific IDs to retrieve. + aspect_ratio (float): Aspect ratio for image processing. + + Returns: + dict: A batch of data including images, depths, and other metadata. + """ + if self.inside_random: + seq_index = random.randint(0, self.sequence_list_len - 1) + + if seq_name is None: + seq_name = self.sequence_list[seq_index] + + metadata = self.data_store[seq_name] + + if ids is None: + ids = np.random.choice( + len(metadata), img_per_seq, replace=self.allow_duplicate_img + ) + + annos = [metadata[i] for i in ids] + + target_image_shape = self.get_target_shape(aspect_ratio) + + images = [] + depths = [] + cam_points = [] + world_points = [] + point_masks = [] + extrinsics = [] + intrinsics = [] + image_paths = [] + original_sizes = [] + + for anno in annos: + filepath = anno["filepath"] + + image_path = osp.join(self.SCANNET_DIR, filepath) + image = read_image_cv2(image_path) + + if self.load_depth: + depth_path = osp.join(self.SCANNET_DIR, anno["depthpath"]) + depth = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED) + depth_map = depth / 1000 + + # depth_map = self.load_depth_image(depth_path) + # depth_map = read_depth(depth_path, 1.0) / 1000 + depth_map = threshold_depth_map(depth_map, max_percentile=98, min_percentile=-1) + # depth_path = image_path.replace("/images", "/depths") + ".geometric.png" + + # mvs_mask_path = image_path.replace( + # "/images", "/depth_masks" + # ).replace(".jpg", ".png") + # mvs_mask = cv2.imread(mvs_mask_path, cv2.IMREAD_GRAYSCALE) > 128 + # 
depth_map[~mvs_mask] = 0 + + # depth_map = threshold_depth_map( + # depth_map, min_percentile=-1, max_percentile=98 + # ) + else: + depth_map = None + + original_size = np.array(image.shape[:2]) + extri_opencv = np.array(anno["extri"]) + # print('Extri: ', extri_opencv) + intri_opencv = np.array(anno["intri"]) + # print('Intri: ', intri_opencv) + ( + image, + depth_map, + extri_opencv, + intri_opencv, + world_coords_points, + cam_coords_points, + point_mask, + _, + ) = self.process_one_image( + image, + depth_map, + extri_opencv, + intri_opencv, + original_size, + target_image_shape, + filepath=filepath, + ) + + images.append(image) + depths.append(depth_map) + extrinsics.append(extri_opencv) + intrinsics.append(intri_opencv) + cam_points.append(cam_coords_points) + world_points.append(world_coords_points) + point_masks.append(point_mask) + image_paths.append(image_path) + original_sizes.append(original_size) + + set_name = "ScanNet" + + batch = { + "seq_name": set_name + "_" + seq_name, + "ids": ids, + "frame_num": len(extrinsics), + "images": images, + "depths": depths, + "extrinsics": extrinsics, + "intrinsics": intrinsics, + "cam_points": cam_points, + "world_points": world_points, + "point_masks": point_masks, + "original_sizes": original_sizes, + } + return batch diff --git a/training/data/datasets/wildrgbd.py b/training/data/datasets/wildrgbd.py new file mode 100644 index 00000000..14c1f653 --- /dev/null +++ b/training/data/datasets/wildrgbd.py @@ -0,0 +1,222 @@ + +import gzip +import json +import os.path as osp +import logging + +import cv2 +import random +import numpy as np + +from data.dataset_util import * +from data.base_dataset import BaseDataset + +import numpy as np +import cv2 + +class WildrgbdDataset(BaseDataset): + def __init__( + self, + common_conf, + split: str = "train", + WILDRGBD_DIR: str = None, + WILDRGBD_ANNOTATION_DIR: str = None, + min_num_images: int = 24, + len_train: int = 100000, + len_test: int = 10000, + ): + """ + Initialize the 
WildrgbdDataset. + + Args: + common_conf: Configuration object with common settings. + split (str): Dataset split, either 'train' or 'test'. + WILDRGBD_DIR (str): Directory path to Wildrgbd data. + WILDRGBD_ANNOTATION_DIR (str): Directory path to Wildrgbd annotations. + min_num_images (int): Minimum number of images per sequence. + len_train (int): Length of the training dataset. + len_test (int): Length of the test dataset. + Raises: + ValueError: If WILDRGBD_DIR or WILDRGBD_ANNOTATION_DIR is not specified. + """ + super().__init__(common_conf=common_conf) + + self.debug = common_conf.debug + self.training = common_conf.training + self.get_nearby = common_conf.get_nearby + self.load_depth = common_conf.load_depth + self.inside_random = common_conf.inside_random + self.allow_duplicate_img = common_conf.allow_duplicate_img + + if WILDRGBD_DIR is None or WILDRGBD_ANNOTATION_DIR is None: + raise ValueError("Both WILDRGBD_DIR and WILDRGBD_ANNOTATION_DIR must be specified.") + + if split == "train": + split_name = "train.jgz" + self.len_train = len_train + elif split == "test": + split_name = "test.jgz" + self.len_train = len_test + else: + raise ValueError(f"Invalid split: {split}") + + self.invalid_sequence = [] # set any invalid sequence names here + + + self.category_map = {} + self.data_store = {} + self.seqlen = None + self.min_num_images = min_num_images + + logging.info(f"WILDRGBD_DIR is {WILDRGBD_DIR}") + + self.WILDRGBD_DIR = WILDRGBD_DIR + self.WILDRGBD_ANNOTATION_DIR = WILDRGBD_ANNOTATION_DIR + + annotation_file = osp.join( + self.WILDRGBD_ANNOTATION_DIR, "wildrgbd", split_name + ) + + try: + with gzip.open(annotation_file, "r") as fin: + annotation = json.loads(fin.read()) + except FileNotFoundError: + logging.error(f"Annotation file not found: {annotation_file}") + total_frame_num = 0 + + for seq_name, seq_data in annotation.items(): + if seq_name in self.invalid_sequence: + continue + + if len(seq_data) < min_num_images: + continue + total_frame_num += 
len(seq_data) + self.data_store[seq_name] = seq_data + self.sequence_list = list(self.data_store.keys()) + self.sequence_list_len = len(self.sequence_list) + self.total_frame_num = total_frame_num + + status = "Training" if self.training else "Testing" + logging.info(f"{status}: Wildrgbd Data size: {self.sequence_list_len}") + logging.info(f"{status}: Wildrgbd Data dataset length: {len(self)}") + + def get_data( + self, + seq_index: int = None, + img_per_seq: int = None, + seq_name: str = None, + ids: list = None, + aspect_ratio: float = 1.0, + ) -> dict: + """ + Retrieve data for a specific sequence. + + Args: + seq_index (int): Index of the sequence to retrieve. + img_per_seq (int): Number of images per sequence. + seq_name (str): Name of the sequence. + ids (list): Specific IDs to retrieve. + aspect_ratio (float): Aspect ratio for image processing. + + Returns: + dict: A batch of data including images, depths, and other metadata. + """ + if self.inside_random: + seq_index = random.randint(0, self.sequence_list_len - 1) + + if seq_name is None: + seq_name = self.sequence_list[seq_index] + + metadata = self.data_store[seq_name] + + if ids is None: + ids = np.random.choice( + len(metadata), img_per_seq, replace=self.allow_duplicate_img + ) + + annos = [metadata[i] for i in ids] + + target_image_shape = self.get_target_shape(aspect_ratio) + + images = [] + depths = [] + cam_points = [] + world_points = [] + point_masks = [] + extrinsics = [] + intrinsics = [] + image_paths = [] + original_sizes = [] + + for anno in annos: + filepath = anno["filepath"] + + image_path = osp.join(self.WILDRGBD_DIR, filepath) + image = read_image_cv2(image_path) + + if self.load_depth: + depth_path = osp.join(self.WILDRGBD_DIR, anno["depthpath"]) + depth = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH) + depth_map = depth.astype(np.float32) / 1000.0 + + mvs_mask_path = image_path.replace( + "/rgb", "/masks" + ) + mvs_mask = cv2.imread(mvs_mask_path, cv2.IMREAD_GRAYSCALE) > 128 + 
depth_map[~mvs_mask] = 0 + + depth_map = threshold_depth_map( + depth_map, min_percentile=-1, max_percentile=98 + ) + else: + depth_map = None + + original_size = np.array(image.shape[:2]) + extri_opencv = np.array(anno["extri"]) + intri_opencv = np.array(anno["intri"]) + + ( + image, + depth_map, + extri_opencv, + intri_opencv, + world_coords_points, + cam_coords_points, + point_mask, + _, + ) = self.process_one_image( + image, + depth_map, + extri_opencv, + intri_opencv, + original_size, + target_image_shape, + filepath=filepath, + ) + + images.append(image) + depths.append(depth_map) + extrinsics.append(extri_opencv) + intrinsics.append(intri_opencv) + cam_points.append(cam_coords_points) + world_points.append(world_coords_points) + point_masks.append(point_mask) + image_paths.append(image_path) + original_sizes.append(original_size) + + set_name = "Wildrgbd" + + batch = { + "seq_name": set_name + "_" + seq_name, + "ids": ids, + "frame_num": len(extrinsics), + "images": images, + "depths": depths, + "extrinsics": extrinsics, + "intrinsics": intrinsics, + "cam_points": cam_points, + "world_points": world_points, + "point_masks": point_masks, + "original_sizes": original_sizes, + } + return batch diff --git a/training/data/preprocess/blendedmvs.py b/training/data/preprocess/blendedmvs.py new file mode 100644 index 00000000..dcc24976 --- /dev/null +++ b/training/data/preprocess/blendedmvs.py @@ -0,0 +1,57 @@ +from pathlib import Path +import json +import gzip +import numpy as np +import torch +from tqdm import tqdm + +def read_cam_file(cam_path: str): + with open(cam_path) as f: + lines = f.readlines() + + # Extrinsic (world-to-camera) + extrinsic = np.array([[float(x) for x in line.split()] for line in lines[1:5]], dtype=np.float32) + pose_w2c = extrinsic[:3, :] # 3x4 + + # Intrinsic + intrinsic = np.array([[float(x) for x in line.split()] for line in lines[7:10]], dtype=np.float32) + K = intrinsic + + # Depth range info + depth_line = [float(x) for x in 
lines[11].split()] + depth_min, depth_interval, num_depth, depth_max = depth_line + + return pose_w2c, K, (depth_min, depth_interval, num_depth, depth_max) + +# Root folder where everything starts +root = Path("/mimer/NOBACKUP/groups/3d-dl/blendedmvs_full") + +out = {} + +for scene_dir in tqdm(root.iterdir()): + frames = sorted([p.name for p in (scene_dir / "blended_images").iterdir() if p.suffix == ".jpg" and not p.name.endswith("_masked.jpg")]) + + sequence_data = [] + for frame in frames: + cams_path = scene_dir / "cams" / (frame.replace(".jpg", "_cam.txt")) + depth_path = scene_dir / "depths" / (frame.replace(".jpg", ".pfm")) + + pose_w2c, K, depth_info = read_cam_file(cams_path) + + frame_data = { + "filepath": f"{scene_dir.name}/blended_images/{frame}", + "extri": pose_w2c.tolist(), + "intri": K.tolist(), + "depthpath": f"{scene_dir.name}/rendered_depth_maps/{frame.replace('.jpg', '.pfm')}", + } + sequence_data.append(frame_data) + + out[scene_dir.name] = sequence_data + +root = "/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt" + +with gzip.open(root+"/annotations/blendedmvs/train.jgz", "wt", encoding="utf-8") as f: + json.dump(out, f, ensure_ascii=False, indent=4) + +print(f"Processed {len(out)} scenes with a total of {sum(len(v) for v in out.values())} images.") + diff --git a/training/data/preprocess/co3d.py b/training/data/preprocess/co3d.py new file mode 100644 index 00000000..29dfd1c1 --- /dev/null +++ b/training/data/preprocess/co3d.py @@ -0,0 +1,104 @@ +from pathlib import Path +import json +import gzip +import numpy as np +import torch +from tqdm import tqdm +import os.path as osp + +root = Path("/mimer/NOBACKUP/groups/3d-dl/co3d_full") + +def co3d_annotation_to_opencv_pose(frame_data): + p = frame_data['viewpoint']['principal_point'] + f = frame_data['viewpoint']['focal_length'] + h, w = frame_data['image']['size'] + K = np.eye(3) + s = (min(h, w) - 1) / 2 + K[0, 0] = f[0] * (w - 1) / 2 + K[1, 1] = f[1] * (h - 1) / 2 + K[0, 2] = -p[0] * s + (w - 
1) / 2 + K[1, 2] = -p[1] * s + (h - 1) / 2 + + R = np.asarray(frame_data['viewpoint']['R']).T # note the transpose here + T = np.asarray(frame_data['viewpoint']['T']) + pose = np.concatenate([R,T[:,None]],1) + pose = np.diag([-1,-1,1]).astype(np.float32) @ pose # flip the direction of x,y axis + + return pose, K + +out = {} +for category_dir in tqdm(root.iterdir()): + print('Processing category: ', category_dir.name) + frame_file = osp.join(category_dir, "frame_annotations.jgz") + sequence_file = osp.join(category_dir, "sequence_annotations.jgz") + + set_list = json.load(open(osp.join(category_dir, "set_lists.json"), "r")) + + train_sequences = set() + for split in ["train_known", "train_unseen"]: + if split in set_list: + for entry in set_list[split]: + sequence_id = entry[0] # first element is the sequence ID + train_sequences.add(sequence_id) + + # Convert to a sorted list if you want + train_sequences = sorted(train_sequences) + + # print('Set list: ', train_sequences) + + with gzip.open(frame_file, "r") as fin: + frame_data = json.loads(fin.read()) + with gzip.open(sequence_file, "r") as fin: + sequence_data = json.loads(fin.read()) + + frame_data_processed = {} + for f_data in frame_data: + sequence_name = f_data["sequence_name"] + frame_data_processed.setdefault(sequence_name, {})[f_data["frame_number"]] = f_data + + + for seq in train_sequences: + # print(frame_data_processed[seq]) + seq_data = frame_data_processed[seq] + scene_dir = category_dir / seq + images_dir = scene_dir / "images" + frames = sorted([p.name for p in images_dir.iterdir() if p.suffix == ".jpg"]) + out_sequence_data = [] + for i, frame in enumerate(frames): + frame_data = seq_data[i] + # viewpoint = frame_data['viewpoint'] + # R = np.array(viewpoint['R']) + # T = np.array(viewpoint['T']).reshape(3, 1) + # extrinsic = np.eye(4) + # extrinsic[:3, :3] = R + # extrinsic[:3, 3:] = T + + # fx, fy = viewpoint['focal_length'] + # cx, cy = viewpoint['principal_point'] + + # intrinsic = np.array([ 
+ # [fx, 0, cx], + # [0, fy, cy], + # [0, 0, 1] + # ]) + + extrinsic, intrinsic = co3d_annotation_to_opencv_pose(frame_data) + # extrinsic = np.vstack([extrinsic, [0, 0, 0, 1]]) + # extrinsic = np.linalg.inv(extrinsic) + + frame_data = { + "filepath": frame_data['image']['path'], + "extri": extrinsic[:3].tolist(), + "intri": intrinsic.tolist(), + } + out_sequence_data.append(frame_data) + # print('Frame data: ', frame_data) + out[category_dir.name+"_"+seq] = out_sequence_data + +root = "/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt" + +with gzip.open(root+"/annotations/co3d/train.jgz", "wt", encoding="utf-8") as f: + json.dump(out, f, ensure_ascii=False, indent=4) + +print(f"Processed {len(out)} scenes with a total of {sum(len(v) for v in out.values())} images.") + diff --git a/training/data/preprocess/co3d_clean_anno.py b/training/data/preprocess/co3d_clean_anno.py new file mode 100644 index 00000000..c7bbfdf7 --- /dev/null +++ b/training/data/preprocess/co3d_clean_anno.py @@ -0,0 +1,51 @@ +import os.path as osp +import os +import random +import gzip +import json + +data_root = "/mimer/NOBACKUP/groups/3d-dl/co3dv2" +split = "train" +all_categories = os.listdir(data_root) +annotation_dir = "/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations/co3d" + +for c in all_categories: + annotation_file = osp.join(annotation_dir, f"{c}_{split}.jgz") + + try: + # Load the annotation + with gzip.open(annotation_file, "r") as fin: + annotation = json.loads(fin.read()) + except FileNotFoundError: + print(f"Annotation file not found: {annotation_file}") + continue + + # Get existing sequences from the data directory + category_path = osp.join(data_root, c) + if not osp.isdir(category_path): + print(f"Category directory not found: {category_path}") + continue + + existing_sequences = set(os.listdir(category_path)) + print(f"Category: {c}") + print(f" Total sequences in annotation: {len(annotation)}") + print(f" Existing sequences in data: 
{len(existing_sequences)}") + + # Filter annotation to keep only existing sequences + filtered_annotation = { + seq_name: seq_data + for seq_name, seq_data in annotation.items() + if seq_name in existing_sequences + } + + removed_count = len(annotation) - len(filtered_annotation) + print(f" Removed sequences: {removed_count}") + print(f" Remaining sequences: {len(filtered_annotation)}") + + # Save the filtered annotation back + if removed_count > 0: + with gzip.open(annotation_file, "wt", encoding="utf-8") as fout: + json.dump(filtered_annotation, fout) + print(f" ✓ Saved filtered annotation") + else: + print(f" No changes needed") \ No newline at end of file diff --git a/training/data/preprocess/hypersim.py b/training/data/preprocess/hypersim.py new file mode 100644 index 00000000..62c41bb8 --- /dev/null +++ b/training/data/preprocess/hypersim.py @@ -0,0 +1,218 @@ +from glob import glob +from pathlib import Path + +import cv2 +import h5py +import numpy as np +import pandas as pd +import torch +from tqdm import tqdm +import json +import gzip + +def to_homogeneous(x: torch.Tensor) -> torch.Tensor: + return torch.cat((x, torch.ones_like(x[..., :1])), dim=-1) + +def get_pixel_grid( + B: int, + H: int, + W: int, +) -> torch.Tensor: + x1_n = torch.meshgrid( + *[torch.arange(n) + 0.5 for n in (B, H, W)], + indexing="ij", + ) + x1_n = torch.stack((x1_n[2], x1_n[1]), dim=-1).reshape(B, H, W, 2) + return x1_n + +def load_distance(distance_path) -> np.ndarray: + with h5py.File(distance_path, "r") as x: + return x["dataset"][:] # type: ignore + +def homog_pixel_grid(H: int, W: int) -> np.ndarray: + return ( + to_homogeneous( + get_pixel_grid( + 1, + H, + W, + ) + ) + .numpy() + .reshape(-1, 3) + .T + ) + +def depth_from_distance( + distance: torch.Tensor, K: torch.Tensor + ) -> torch.Tensor: + H, W = distance.shape[0], distance.shape[1] + grid = homog_pixel_grid(H, W) + rays = torch.linalg.inv(K) @ grid # 3xHW + ray_z = rays[-1] / torch.linalg.norm(rays, dim=0) + z = 
distance.reshape(-1) * ray_z + return z.reshape(H, W, 1) + +if __name__ == "__main__": + out = {} + data_root = Path("/mimer/NOBACKUP/groups/3d-dl/ml-hypersim/contrib/99991/downloads") + + metadata_camera_parameters_csv_file = ( + data_root / "metadata_camera_parameters.csv" + ) + df_camera_parameters = pd.read_csv( + metadata_camera_parameters_csv_file, index_col="scene_name" + ) + scene_names = {f"ai_{i:03d}" for i in range(61)} + + for scene_path in tqdm(list(data_root.iterdir())): + scene_name = scene_path.name + if scene_name in ["ai_024_012", "ai_026_008", "ai_026_013"]: + print("Skipping problematic scene " + scene_name) + continue + if (scene_name[:-4] not in scene_names) and (scene_name not in scene_names): + continue + df_: pd.Series = df_camera_parameters.loc[scene_name] # type: ignore + width_pixels = int(df_["settings_output_img_width"]) + height_pixels = int(df_["settings_output_img_height"]) + + M_proj = [ + [ + df_["M_proj_00"], + df_["M_proj_01"], + df_["M_proj_02"], + df_["M_proj_03"], + ], + [ + df_["M_proj_10"], + df_["M_proj_11"], + df_["M_proj_12"], + df_["M_proj_13"], + ], + [ + df_["M_proj_20"], + df_["M_proj_21"], + df_["M_proj_22"], + df_["M_proj_23"], + ], + [ + df_["M_proj_30"], + df_["M_proj_31"], + df_["M_proj_32"], + df_["M_proj_33"], + ], + ] + M_proj = np.array(M_proj) + M_screen_from_ndc = np.array( + [ + [0.5 * (width_pixels), 0, 0, 0.5 * (width_pixels)], + [0, -0.5 * (height_pixels), 0, 0.5 * (height_pixels)], + [0, 0, 0.5, 0.5], # doesn't matter + [0, 0, 0, 1.0], + ] + ) + x = (M_screen_from_ndc @ M_proj)[[0, 1, 3]] + K, R = cv2.decomposeProjectionMatrix(x)[:2] # type: ignore + K = K / K[2, 2] + + scene_root = scene_path + + metadata_scene = scene_root / "_detail" / "metadata_scene.csv" + camera_name = "cam_00" + df = pd.read_csv(metadata_scene) + meters_per_asset = df.loc[ + df["parameter_name"] == "meters_per_asset_unit", "parameter_value" + ].iloc[0] + + image_paths = sorted( + glob( + ( + scene_root + / "images" + / 
f"scene_{camera_name}_final_preview" + / "frame.*.color.jpg" + ).as_posix() + ) + ) + distance_paths = sorted( + glob( + ( + scene_root + / "images" + / f"scene_{camera_name}_geometry_hdf5" + / "frame.*.depth_meters.hdf5" + ).as_posix() + ) + ) + + distance_paths = {int(dp.split(".")[-3]): dp for dp in distance_paths} + image_paths = {int(ip.split(".")[-3]): ip for ip in image_paths} + image_ids = set(distance_paths.keys()).intersection( + image_paths.keys() + ) + + if len(image_ids) == 0: + print("No shared image/depth paths for scene" + scene_name) + continue + + camera_root = scene_root / "_detail" / camera_name + camera_positions_hdf5_file = camera_root / "camera_keyframe_positions.hdf5" + camera_orientations_hdf5_file = ( + camera_root / "camera_keyframe_orientations.hdf5" + ) + with ( + h5py.File(camera_positions_hdf5_file, "r") as h5_pos, + h5py.File(camera_orientations_hdf5_file, "r") as h5_rots, + ): # type: ignore + camera_positions: np.ndarray = h5_pos["dataset"][:] # type: ignore + rots: np.ndarray = h5_rots["dataset"][:] # type: ignore + rots = rots.transpose((0, 2, 1)) + translations = -rots @ camera_positions[..., None] + poses = np.zeros((len(rots), 4, 4)) + poses[:, 3, 3] = 1.0 + poses[:, :3, :3] = R[None] @ rots + poses[:, :3, 3:] = R[None] @ translations + + idx_to_image_id = { + idx: img_id for idx, img_id in enumerate(image_ids) + } + image_id_to_idx = { + img_id: idx for idx, img_id in enumerate(image_ids) + } + + # K_fixed = K.copy() + # K_fixed[0,2], K_fixed[1,2] = K[1,2], K[0,2] + intrinsic = torch.tensor(K).reshape(3, 3).float() + + sequence_data = [] + for img_id in image_ids: + T = torch.tensor(poses[img_id]).float() + im_path = Path(image_paths[img_id]) + depth_path = Path(distance_paths[img_id]) + # distance = ( + # torch.tensor(load_distance(depth_path)).float() + # / meters_per_asset + # ) + + # depth = depth_from_distance(distance, intrinsic).float() + # depth[depth.isnan()] = 0 + + + T_w2c = T[:3].numpy().tolist() + frame_data = { 
+ "filepath": im_path.as_posix().split('downloads/')[1], + "extri": T_w2c, + "meters_per_asset": meters_per_asset, + "intri": intrinsic.numpy().tolist(), + "depthpath": depth_path.as_posix().split('downloads/')[1], + } + sequence_data.append(frame_data) + out[scene_name] = sequence_data + +root = "/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt" + +with gzip.open(root+"/annotations/hypersim/train.jgz", "wt", encoding="utf-8") as f: + json.dump(out, f, ensure_ascii=False, indent=4) + +print(f"Processed {len(out)} scenes with a total of {sum(len(v) for v in out.values())} images.") + diff --git a/training/data/preprocess/megadepth/generate_sequences.py b/training/data/preprocess/megadepth/generate_sequences.py new file mode 100644 index 00000000..0cd40f8d --- /dev/null +++ b/training/data/preprocess/megadepth/generate_sequences.py @@ -0,0 +1,80 @@ +import numpy as np +import json +import gzip +from tqdm import tqdm + + +# See https://github.com/facebookresearch/vggt/issues/82 +# and https://github.com/facebookresearch/vggt/issues/216#issuecomment-3053586858 + +def sample_topk_sequences(overlap_matrix, image_paths, sequence_length=256, num_sequences=1000): + n_images = overlap_matrix.shape[0] + sequences = [] + + for _ in range(num_sequences): + # Randomly pick an anchor image + anchor = np.random.randint(n_images) + + overlaps = overlap_matrix[anchor] + # Exclude invalid entries (e.g., -1) + valid_mask = overlaps >= 0 + valid_mask[anchor] = False # don't include self + + valid_indices = np.where(valid_mask)[0] + if len(valid_indices) < sequence_length - 1: + continue # skip if not enough neighbors + + # Sort by overlap descending + sorted_neighbors = valid_indices[np.argsort(-overlaps[valid_indices])] + + # Pick top-k + selected_neighbors = sorted_neighbors[:sequence_length - 1] + + # Form the sequence: anchor + top neighbors + sequence = [anchor] + selected_neighbors.tolist() + + # print(image_paths[sequence]) # Access image paths for the sequence + sequence = 
[{ + "filepath": p, + "id": s + } for p, s in zip(image_paths[sequence], sequence)] + sequences.append(sequence) + + return sequences + +with open("train_scenes.txt", "r") as f: + train_scenes = [line.strip() for line in f.readlines()] +with open("valid_scenes.txt", "r") as f: + val_scenes = [line.strip() for line in f.readlines()] + +for split in ["train", "val"]: + if split == "train": + scenes = train_scenes + else: + scenes = val_scenes + + + out = {} + for scene in tqdm(scenes): + try: + data = np.load(f"/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations/megadepth/scene_info/{scene}.npz", allow_pickle=True) + print('Data keys:', data.keys()) + print('Depth paths: ', data['depth_paths']) + overlap_matrix = data['overlap_matrix'] + image_paths = data['image_paths'] + print('Data: ', data) + + sequences = sample_topk_sequences(overlap_matrix, image_paths, sequence_length=256, num_sequences=1000) + out[scene] = sequences + except FileNotFoundError: + print(f"File not found for scene {scene}. 
Skipping...") + continue + + # root = "/mimer/NOBACKUP/groups/snic2022-6-266/davnords/mv-ssl" + # with open(root+f"/annotations/megadepth/{split}.json", "w") as f: + # json.dump(out, f, indent=4) # `indent=4` makes it pretty-printed + + # with gzip.open(root+f"/annotations/megadepth/{split}.jgz", "wt", encoding="utf-8") as f: + # json.dump(out, f, ensure_ascii=False, indent=4) + + # print(f"Processed {len(out)} scenes with a total of {sum(len(v) for v in out.values())} images.") \ No newline at end of file diff --git a/training/data/preprocess/megadepth/load.py b/training/data/preprocess/megadepth/load.py new file mode 100644 index 00000000..de124456 --- /dev/null +++ b/training/data/preprocess/megadepth/load.py @@ -0,0 +1,251 @@ +import os +import numpy as np +import shutil +import json +import random +from collections import defaultdict +from typing import List, Dict, Set, Tuple +import gzip + +def check_file_exists(path: str) -> bool: + """Check if file exists and is readable""" + return os.path.exists(path) and os.path.isfile(path) + +def build_graph_from_pairs(pairs: List[Tuple[int, int]], overlaps: np.ndarray, + image_paths: List[str], depth_paths: List[str], + min_overlap: float, max_overlap: float) -> Dict[int, List[int]]: + """Build adjacency graph from valid pairs with overlap filtering""" + graph = defaultdict(list) + + for i, (idx1, idx2) in enumerate(pairs): + overlap = overlaps[i] + + # Check overlap constraints + if overlap < min_overlap or overlap > max_overlap: + continue + + # Check if files exist + img1_exists = check_file_exists(os.path.join(data_root, image_paths[idx1])) + img2_exists = check_file_exists(os.path.join(data_root, image_paths[idx2])) + depth1_exists = check_file_exists(os.path.join(data_root, depth_paths[idx1])) + depth2_exists = check_file_exists(os.path.join(data_root, depth_paths[idx2])) + + # Only add edge if both frames have valid files + if img1_exists and img2_exists and depth1_exists and depth2_exists: + 
graph[idx1].append(idx2) + graph[idx2].append(idx1) + + return graph + +def generate_sequence(graph: Dict[int, List[int]], start_node: int, + target_length: int, used_nodes: Set[int]) -> List[int]: + """Generate a sequence by random walk, avoiding already used nodes when possible""" + sequence = [start_node] + current = start_node + local_used = {start_node} + + for _ in range(target_length - 1): + if current not in graph or not graph[current]: + break + + # Get neighbors, prefer unused ones + neighbors = graph[current] + unused_neighbors = [n for n in neighbors if n not in used_nodes and n not in local_used] + + if unused_neighbors: + next_node = random.choice(unused_neighbors) + else: + # Fall back to any neighbor not in current sequence + available = [n for n in neighbors if n not in local_used] + if not available: + break + next_node = random.choice(available) + + sequence.append(next_node) + local_used.add(next_node) + current = next_node + + return sequence + +def create_sequences_for_scene(scene_info: Dict, scene_name: str, + min_overlap: float, max_overlap: float, + num_sequences: int = 1000, sequence_length: int = 24) -> List[List[Dict]]: + """Create diverse sequences for a scene""" + + image_paths = scene_info["image_paths"] + depth_paths = scene_info["depth_paths"] + intrinsics = scene_info["intrinsics"] + poses = scene_info["poses"] + pairs = scene_info["pairs"] + overlaps = scene_info["overlaps"] + + print(f'Scene {scene_name}: {len(pairs)} pairs, {overlaps.shape[0]} overlaps') + + # Build graph from valid pairs + graph = build_graph_from_pairs(pairs, overlaps, image_paths, depth_paths, + min_overlap, max_overlap) + + if not graph: + print(f"No valid pairs found for scene {scene_name}") + return [] + + print(f'Built graph with {len(graph)} nodes') + + sequences = [] + used_nodes = set() + max_attempts = num_sequences * 3 # Allow some failed attempts + attempts = 0 + + # Get nodes with good connectivity for starting points + node_degrees = [(node, 
len(neighbors)) for node, neighbors in graph.items()] + node_degrees.sort(key=lambda x: x[1], reverse=True) + good_start_nodes = [node for node, degree in node_degrees if degree >= 2] + + if not good_start_nodes: + good_start_nodes = list(graph.keys()) + + while len(sequences) < num_sequences and attempts < max_attempts: + attempts += 1 + + # Choose starting node with preference for unused nodes + unused_start_nodes = [n for n in good_start_nodes if n not in used_nodes] + if unused_start_nodes: + start_node = random.choice(unused_start_nodes) + else: + start_node = random.choice(good_start_nodes) + + # Generate sequence + sequence_indices = generate_sequence(graph, start_node, sequence_length, used_nodes) + + if len(sequence_indices) >= sequence_length // 2: # Accept if at least half the target length + # Create sequence with all metadata + sequence_frames = [] + for frame_idx in sequence_indices: + frame_data = { + 'frame_idx': int(frame_idx), + 'filepath': image_paths[frame_idx], + 'depth_path': depth_paths[frame_idx], + 'intri': intrinsics[frame_idx].tolist(), + 'extri': poses[frame_idx][:3, :].tolist() + } + sequence_frames.append(frame_data) + + sequences.append(sequence_frames) + used_nodes.update(sequence_indices) + + if len(sequences) % 100 == 0: + print(f'Generated {len(sequences)} sequences') + + # Reset used nodes occasionally to allow more diversity + if attempts % (num_sequences // 4) == 0: + used_nodes = set() + + print(f'Generated {len(sequences)} sequences for scene {scene_name}') + return sequences + +def save_sequences(sequences: List[List[Dict]], scene_out_dir: str): + """Save sequences to disk with file copying and metadata""" + + for seq_idx, seq in enumerate(sequences): + seq_dir = os.path.join(scene_out_dir, f"sequence_{seq_idx:03d}") + os.makedirs(seq_dir, exist_ok=True) + + metadata = [] + valid_frames = [] + + for frame_idx, frame_data in enumerate(seq): + # Double-check files exist before copying + img_src = frame_data["image_path"] + 
depth_src = frame_data["depth_path"] + + if not (check_file_exists(img_src) and check_file_exists(depth_src)): + print(f"Warning: Skipping frame {frame_idx} in sequence {seq_idx} - missing files") + continue + + # Copy files + img_dst = os.path.join(seq_dir, f"{len(valid_frames):03d}.jpg") + depth_dst = os.path.join(seq_dir, f"{len(valid_frames):03d}.npy") + + try: + shutil.copy(img_src, img_dst) + valid_frames.append(frame_data) + + except Exception as e: + print(f"Error copying files for sequence {seq_idx}, frame {frame_idx}: {e}") + continue + + # Save metadata + if metadata: + metadata_file = os.path.join(seq_dir, "metadata.json") + with open(metadata_file, 'w') as f: + json.dump({ + 'sequence_length': len(metadata), + 'frames': metadata + }, f, indent=2) + +from tqdm import tqdm +# Main execution +data_root = "/mimer/NOBACKUP/groups/snic2022-6-266/data/megadepth" +scene_info_root = os.path.join(data_root, "prep_scene_info") +all_scenes = os.listdir(scene_info_root) +test_scenes = ["0017.npy", "0004.npy", "0048.npy", "0013.npy"] +out_root = "sequences_out" +os.makedirs(out_root, exist_ok=True) + +split = "test" + +if split == "train": + scene_names = set(all_scenes) - set(test_scenes) + +elif split == "test": + scene_names = test_scenes + +min_overlap = 0.01 +max_overlap = 1.0 + +# Set random seed for reproducibility +random.seed(42) +np.random.seed(42) + +result = {} + +for scene_name in tqdm(scene_names): + print(f"\nProcessing scene: {scene_name}") + + try: + scene_info = np.load( + os.path.join(scene_info_root, scene_name), allow_pickle=True + ).item() + + scene_name_clean = os.path.splitext(scene_name)[0] + scene_name_out = f"{scene_name_clean}_{min_overlap}_{max_overlap}" + + # Create sequences + sequences = create_sequences_for_scene( + scene_info, scene_name_clean, min_overlap, max_overlap, + num_sequences=500, sequence_length=24 + ) + result[scene_name_clean] = sequences + print(f"Total sequences for scene {scene_name_clean}: {len(sequences)}") + # if 
sequences: + # # Create output directory + # scene_out_dir = os.path.join(out_root, scene_name_out) + # os.makedirs(scene_out_dir, exist_ok=True) + + # # Save sequences + # save_sequences(sequences, scene_out_dir) + + # print(f"Saved {len(sequences)} sequences for scene {scene_name_clean}") + # else: + # print(f"No valid sequences generated for scene {scene_name_clean}") + + except Exception as e: + print(f"Error processing scene {scene_name}: {e}") + continue + +# Save as .jgz + +with gzip.open(f"/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations/megadepth/{split}.jgz", "wt", encoding="utf-8") as f: + json.dump(result, f, ensure_ascii=False, indent=4) + +print("\nSequence generation completed!") \ No newline at end of file diff --git a/training/data/preprocess/megadepth/preprocess_scene.py b/training/data/preprocess/megadepth/preprocess_scene.py new file mode 100644 index 00000000..59d6c106 --- /dev/null +++ b/training/data/preprocess/megadepth/preprocess_scene.py @@ -0,0 +1,241 @@ +import argparse + +import numpy as np + +import os + +parser = argparse.ArgumentParser(description='MegaDepth preprocessing script') + +parser.add_argument( + '--base_path', type=str, required=True, + help='path to MegaDepth' +) +parser.add_argument( + '--scene_id', type=str, required=True, + help='scene ID' +) + +parser.add_argument( + '--output_path', type=str, required=True, + help='path to the output directory' +) + +args = parser.parse_args() + +base_path = args.base_path +# Remove the trailing / if need be. 
+if base_path[-1] in ['/', '\\']: + base_path = base_path[: - 1] +scene_id = args.scene_id + +base_depth_path = os.path.join( + base_path, 'phoenix/S6/zl548/MegaDepth_v1' +) +base_undistorted_sfm_path = os.path.join( + base_path, 'Undistorted_SfM' +) + +undistorted_sparse_path = os.path.join( + base_undistorted_sfm_path, scene_id, 'sparse-txt' +) +if not os.path.exists(undistorted_sparse_path): + exit() + +depths_path = os.path.join( + base_depth_path, scene_id, 'dense0', 'depths' +) +if not os.path.exists(depths_path): + exit() + +images_path = os.path.join( + base_undistorted_sfm_path, scene_id, 'images' +) +if not os.path.exists(images_path): + exit() + +# Process cameras.txt +with open(os.path.join(undistorted_sparse_path, 'cameras.txt'), 'r') as f: + raw = f.readlines()[3 :] # skip the header + +camera_intrinsics = {} +for camera in raw: + camera = camera.split(' ') + camera_intrinsics[int(camera[0])] = [float(elem) for elem in camera[2 :]] + +# Process points3D.txt +with open(os.path.join(undistorted_sparse_path, 'points3D.txt'), 'r') as f: + raw = f.readlines()[3 :] # skip the header + +points3D = {} +for point3D in raw: + point3D = point3D.split(' ') + points3D[int(point3D[0])] = np.array([ + float(point3D[1]), float(point3D[2]), float(point3D[3]) + ]) + +# Process images.txt +with open(os.path.join(undistorted_sparse_path, 'images.txt'), 'r') as f: + raw = f.readlines()[4 :] # skip the header + +image_id_to_idx = {} +image_names = [] +raw_pose = [] +camera = [] +points3D_id_to_2D = [] +n_points3D = [] +for idx, (image, points) in enumerate(zip(raw[:: 2], raw[1 :: 2])): + image = image.split(' ') + points = points.split(' ') + + image_id_to_idx[int(image[0])] = idx + + image_name = image[-1].strip('\n') + image_names.append(image_name) + + raw_pose.append([float(elem) for elem in image[1 : -2]]) + camera.append(int(image[-2])) + current_points3D_id_to_2D = {} + for x, y, point3D_id in zip(points[:: 3], points[1 :: 3], points[2 :: 3]): + if int(point3D_id) 
== -1: + continue + current_points3D_id_to_2D[int(point3D_id)] = [float(x), float(y)] + points3D_id_to_2D.append(current_points3D_id_to_2D) + n_points3D.append(len(current_points3D_id_to_2D)) +n_images = len(image_names) + +# Image and depthmaps paths +image_paths = [] +depth_paths = [] +for image_name in image_names: + image_path = os.path.join(images_path, image_name) + + # Path to the depth file + depth_path = os.path.join( + depths_path, '%s.h5' % os.path.splitext(image_name)[0] + ) + + if os.path.exists(depth_path): + # Check if depth map or background / foreground mask + file_size = os.stat(depth_path).st_size + # Rough estimate - 75KB might work as well + if file_size < 100 * 1024: + depth_paths.append(None) + image_paths.append(None) + else: + depth_paths.append(depth_path[len(base_path) + 1 :]) + image_paths.append(image_path[len(base_path) + 1 :]) + else: + print('ERROR: Depth path does not exist: %s' % depth_path) + depth_paths.append(None) + image_paths.append(None) + +# Camera configuration +intrinsics = [] +poses = [] +principal_axis = [] +points3D_id_to_ndepth = [] +for idx, image_name in enumerate(image_names): + if image_paths[idx] is None: + intrinsics.append(None) + poses.append(None) + principal_axis.append([0, 0, 0]) + points3D_id_to_ndepth.append({}) + continue + image_intrinsics = camera_intrinsics[camera[idx]] + K = np.zeros([3, 3]) + K[0, 0] = image_intrinsics[2] + K[0, 2] = image_intrinsics[4] + K[1, 1] = image_intrinsics[3] + K[1, 2] = image_intrinsics[5] + K[2, 2] = 1 + intrinsics.append(K) + + image_pose = raw_pose[idx] + qvec = image_pose[: 4] + qvec = qvec / np.linalg.norm(qvec) + w, x, y, z = qvec + R = np.array([ + [ + 1 - 2 * y * y - 2 * z * z, + 2 * x * y - 2 * z * w, + 2 * x * z + 2 * y * w + ], + [ + 2 * x * y + 2 * z * w, + 1 - 2 * x * x - 2 * z * z, + 2 * y * z - 2 * x * w + ], + [ + 2 * x * z - 2 * y * w, + 2 * y * z + 2 * x * w, + 1 - 2 * x * x - 2 * y * y + ] + ]) + principal_axis.append(R[2, :]) + t = image_pose[4 : 7] + # 
World-to-Camera pose + current_pose = np.zeros([4, 4]) + current_pose[: 3, : 3] = R + current_pose[: 3, 3] = t + current_pose[3, 3] = 1 + # Camera-to-World pose + # pose = np.zeros([4, 4]) + # pose[: 3, : 3] = np.transpose(R) + # pose[: 3, 3] = -np.matmul(np.transpose(R), t) + # pose[3, 3] = 1 + poses.append(current_pose) + + current_points3D_id_to_ndepth = {} + for point3D_id in points3D_id_to_2D[idx].keys(): + p3d = points3D[point3D_id] + current_points3D_id_to_ndepth[point3D_id] = (np.dot(R[2, :], p3d) + t[2]) / (.5 * (K[0, 0] + K[1, 1])) + points3D_id_to_ndepth.append(current_points3D_id_to_ndepth) +principal_axis = np.array(principal_axis) +angles = np.rad2deg(np.arccos( + np.clip( + np.dot(principal_axis, np.transpose(principal_axis)), + -1, 1 + ) +)) + +# Compute overlap score +overlap_matrix = np.full([n_images, n_images], -1.) +scale_ratio_matrix = np.full([n_images, n_images], -1.) +for idx1 in range(n_images): + if image_paths[idx1] is None or depth_paths[idx1] is None: + continue + for idx2 in range(idx1 + 1, n_images): + if image_paths[idx2] is None or depth_paths[idx2] is None: + continue + matches = ( + points3D_id_to_2D[idx1].keys() & + points3D_id_to_2D[idx2].keys() + ) + min_num_points3D = min( + len(points3D_id_to_2D[idx1]), len(points3D_id_to_2D[idx2]) + ) + overlap_matrix[idx1, idx2] = len(matches) / len(points3D_id_to_2D[idx1]) # min_num_points3D + overlap_matrix[idx2, idx1] = len(matches) / len(points3D_id_to_2D[idx2]) # min_num_points3D + if len(matches) == 0: + continue + points3D_id_to_ndepth1 = points3D_id_to_ndepth[idx1] + points3D_id_to_ndepth2 = points3D_id_to_ndepth[idx2] + nd1 = np.array([points3D_id_to_ndepth1[match] for match in matches]) + nd2 = np.array([points3D_id_to_ndepth2[match] for match in matches]) + min_scale_ratio = np.min(np.maximum(nd1 / nd2, nd2 / nd1)) + scale_ratio_matrix[idx1, idx2] = min_scale_ratio + scale_ratio_matrix[idx2, idx1] = min_scale_ratio + +np.savez( + os.path.join(args.output_path, '%s.npz' % 
scene_id), + image_paths=image_paths, + depth_paths=depth_paths, + intrinsics=intrinsics, + poses=poses, + overlap_matrix=overlap_matrix, + scale_ratio_matrix=scale_ratio_matrix, + angles=angles, + n_points3D=n_points3D, + points3D_id_to_2D=points3D_id_to_2D, + points3D_id_to_ndepth=points3D_id_to_ndepth +) \ No newline at end of file diff --git a/training/data/preprocess/megadepth/preprocess_undistorted_megadepth.sh b/training/data/preprocess/megadepth/preprocess_undistorted_megadepth.sh new file mode 100644 index 00000000..c983ee46 --- /dev/null +++ b/training/data/preprocess/megadepth/preprocess_undistorted_megadepth.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +if [[ $# != 2 ]]; then + echo 'Usage: bash preprocess_megadepth.sh /path/to/megadepth /output/path' + exit +fi + +export dataset_path=$1 +export output_path=$2 + +mkdir $output_path +echo 0 +ls $dataset_path/Undistorted_SfM | xargs -P 8 -I % sh -c 'echo %; python preprocess_scene.py --base_path $dataset_path --scene_id % --output_path $output_path' \ No newline at end of file diff --git a/training/data/preprocess/megadepth/train_scenes.txt b/training/data/preprocess/megadepth/train_scenes.txt new file mode 100644 index 00000000..83c78d3a --- /dev/null +++ b/training/data/preprocess/megadepth/train_scenes.txt @@ -0,0 +1,117 @@ +0000 +0001 +0002 +0003 +0004 +0005 +0007 +0008 +0011 +0012 +0013 +0015 +0017 +0019 +0020 +0021 +0022 +0023 +0024 +0025 +0026 +0027 +0032 +0035 +0036 +0037 +0039 +0042 +0043 +0046 +0048 +0050 +0056 +0057 +0060 +0061 +0063 +0065 +0070 +0080 +0083 +0086 +0087 +0095 +0098 +0100 +0101 +0103 +0104 +0105 +0107 +0115 +0117 +0122 +0130 +0137 +0143 +0147 +0148 +0149 +0150 +0156 +0160 +0176 +0183 +0189 +0190 +0200 +0214 +0224 +0235 +0237 +0240 +0243 +0258 +0265 +0269 +0299 +0312 +0326 +0327 +0331 +0335 +0341 +0348 +0366 +0377 +0380 +0394 +0407 +0411 +0430 +0446 +0455 +0472 +0474 +0476 +0478 +0493 +0494 +0496 +0505 +0559 +0733 +0860 +1017 +1589 +4541 +5004 +5005 +5006 +5007 +5009 +5010 +5012 
+5013 +5017 \ No newline at end of file diff --git a/training/data/preprocess/megadepth/undistort_reconstructions.py b/training/data/preprocess/megadepth/undistort_reconstructions.py new file mode 100644 index 00000000..a6b99a72 --- /dev/null +++ b/training/data/preprocess/megadepth/undistort_reconstructions.py @@ -0,0 +1,69 @@ +import argparse + +import imagesize + +import os + +import subprocess + +parser = argparse.ArgumentParser(description='MegaDepth Undistortion') + +parser.add_argument( + '--colmap_path', type=str, required=True, + help='path to colmap executable' +) +parser.add_argument( + '--base_path', type=str, required=True, + help='path to MegaDepth' +) + +args = parser.parse_args() + +sfm_path = os.path.join( + args.base_path, 'MegaDepth_v1_SfM' +) +base_depth_path = os.path.join( + args.base_path, 'phoenix/S6/zl548/MegaDepth_v1' +) +output_path = os.path.join( + args.base_path, 'Undistorted_SfM' +) + +os.mkdir(output_path) + +for scene_name in os.listdir(base_depth_path): + current_output_path = os.path.join(output_path, scene_name) + os.mkdir(current_output_path) + + image_path = os.path.join( + base_depth_path, scene_name, 'dense0', 'imgs' + ) + if not os.path.exists(image_path): + continue + + # Find the maximum image size in scene. + max_image_size = 0 + for image_name in os.listdir(image_path): + max_image_size = max( + max_image_size, + max(imagesize.get(os.path.join(image_path, image_name))) + ) + + # Undistort the images and update the reconstruction. + subprocess.call([ + os.path.join(args.colmap_path, 'colmap'), 'image_undistorter', + '--image_path', os.path.join(sfm_path, scene_name, 'images'), + '--input_path', os.path.join(sfm_path, scene_name, 'sparse', 'manhattan', '0'), + '--output_path', current_output_path, + '--max_image_size', str(max_image_size) + ]) + + # Transform the reconstruction to raw text format. 
+ sparse_txt_path = os.path.join(current_output_path, 'sparse-txt') + os.mkdir(sparse_txt_path) + subprocess.call([ + os.path.join(args.colmap_path, 'colmap'), 'model_converter', + '--input_path', os.path.join(current_output_path, 'sparse'), + '--output_path', sparse_txt_path, + '--output_type', 'TXT' + ]) \ No newline at end of file diff --git a/training/data/preprocess/megadepth/valid_scenes.txt b/training/data/preprocess/megadepth/valid_scenes.txt new file mode 100644 index 00000000..c9e35b50 --- /dev/null +++ b/training/data/preprocess/megadepth/valid_scenes.txt @@ -0,0 +1,77 @@ +0016 +0033 +0034 +0041 +0044 +0047 +0049 +0058 +0062 +0064 +0067 +0071 +0076 +0078 +0090 +0094 +0099 +0102 +0121 +0129 +0133 +0141 +0151 +0162 +0168 +0175 +0177 +0178 +0181 +0185 +0186 +0197 +0204 +0205 +0209 +0212 +0217 +0223 +0229 +0231 +0238 +0252 +0257 +0271 +0275 +0277 +0281 +0285 +0286 +0290 +0294 +0303 +0306 +0307 +0323 +0349 +0360 +0387 +0389 +0402 +0406 +0412 +0443 +0482 +0768 +1001 +3346 +5000 +5001 +5002 +5003 +5008 +5011 +5014 +5015 +5016 +5018 \ No newline at end of file diff --git a/training/data/preprocess/mvssynth.py b/training/data/preprocess/mvssynth.py new file mode 100644 index 00000000..b4d28f15 --- /dev/null +++ b/training/data/preprocess/mvssynth.py @@ -0,0 +1,77 @@ +from pathlib import Path +import json +import gzip +import numpy as np +import torch +from tqdm import tqdm + +def read_img_depth_pose(pose_path): + with open(pose_path) as f: + r_info = json.load(f) + c_x = r_info["c_x"] + c_y = r_info["c_y"] + f_x = r_info["f_x"] + f_y = r_info["f_y"] + extrinsic = np.array(r_info["extrinsic"]) + # extrinsic = inv(extrinsic) + + # This is only for GTA 540 + f_x = f_x * 810 / 1920 + + K = np.array([[f_x, 0, c_x], [0, f_y, c_y], [0,0,1]]) + return K, extrinsic + +# Root folder where everything starts +root = Path("/mimer/NOBACKUP/groups/3d-dl/MVS-Synth/GTAV_540") + +out = {} + +for scene_dir in tqdm(root.iterdir()): + + if scene_dir.name.startswith('num_images'): + 
continue + + frames = sorted([p.name for p in (scene_dir / "images").iterdir() if p.suffix == ".png"]) + sequence_data = [] + for frame in frames: + pose_path = scene_dir / "poses" / (frame.replace(".png", ".json")) + depth_path = scene_dir / "depths" / (frame.replace(".png", ".exr")) + + # with open(pose_path) as f: + # cam = json.load(f) + + K, pose_w2c = read_img_depth_pose(pose_path) + # extrinsic_4x4 = np.array(cam["extrinsic"], dtype=np.float32) + # # extrinsic_4x4 = np.linalg.inv(extrinsic_4x4) + # R = extrinsic_4x4[:3, :3] + # t = extrinsic_4x4[:3, 3] + + # if np.linalg.det(R) < 0: + # R[:, 2] *= -1 + # t[2] *= -1 + # pose_w2c = np.hstack([R, t.reshape(3, 1)]) + + # K = np.array([ + # [cam["f_x"], 0, cam["c_x"]], + # [0, cam["f_y"], cam["c_y"]], + # [0, 0, 1] + # ], dtype=np.float32) + + # pose_w2c = read_scannet_pose(pose_path) + frame_data = { + "filepath": f"{scene_dir.name}/images/{frame}", + "extri": pose_w2c[:3].tolist(), + "intri": K.tolist(), + "depthpath": f"{scene_dir.name}/depths/{frame.replace('.png', '.exr')}", + } + sequence_data.append(frame_data) + + out[scene_dir.name] = sequence_data + +root = "/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt" + +with gzip.open(root+"/annotations/mvssynth/train.jgz", "wt", encoding="utf-8") as f: + json.dump(out, f, ensure_ascii=False, indent=4) + +print(f"Processed {len(out)} scenes with a total of {sum(len(v) for v in out.values())} images.") + diff --git a/training/data/preprocess/pointodyssey.py b/training/data/preprocess/pointodyssey.py new file mode 100644 index 00000000..00fbf772 --- /dev/null +++ b/training/data/preprocess/pointodyssey.py @@ -0,0 +1,42 @@ +from pathlib import Path +import json +import gzip +import numpy as np +import torch +from tqdm import tqdm + +# Root folder where everything starts +root = Path("/mimer/NOBACKUP/groups/3d-dl/pointodyssey") + +out = {} + +for scene_dir in tqdm(root.iterdir()): + + frames = sorted([p.name for p in (scene_dir / "rgbs").iterdir() if p.suffix == 
".jpg"]) + sequence_data = [] + + # info = np.load(scene_dir / "info.npz") + anno = np.load(scene_dir / "anno.npz") + + intrinsics = anno['intrinsics'] + extrinsics = anno['extrinsics'] + + for i, frame in enumerate(frames): + depth_path = scene_dir / "depths" / (frame.replace("rgb", ".depth").replace(".jpg", ".png")) + + frame_data = { + "filepath": f"{scene_dir.name}/rgbs/{frame}", + "extri": extrinsics[i][:3].tolist(), + "intri": intrinsics[i].tolist(), + "depthpath": f"{scene_dir.name}/depths/{frame.replace('rgb', 'depth').replace('.jpg', '.png')}", + } + sequence_data.append(frame_data) + out[scene_dir.name] = sequence_data + +root = "/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt" + +with gzip.open(root+"/annotations/pointodyssey.jgz", "wt", encoding="utf-8") as f: + json.dump(out, f, ensure_ascii=False, indent=4) + +print(f"Processed {len(out)} scenes with a total of {sum(len(v) for v in out.values())} images.") + diff --git a/training/data/preprocess/prepare_eth3d.py b/training/data/preprocess/prepare_eth3d.py new file mode 100644 index 00000000..68a0e9c7 --- /dev/null +++ b/training/data/preprocess/prepare_eth3d.py @@ -0,0 +1,249 @@ +# The scripts provided here are for reference only. Please ensure you have obtained the necessary licenses from the original dataset providers before proceeding. 
+ +import os +import os.path as osp +import cv2 +import numpy as np + +from PIL import Image +from scipy.spatial.transform import Rotation as R +from tqdm import tqdm + +def read_cameras_txt(path): + cameras = {} + with open(path, "r") as f: + for line in f: + # skip comment + if line.startswith("#"): + continue + parts = line.strip().split() + camera_id = int(parts[0]) + model = parts[1] + width = int(parts[2]) + height = int(parts[3]) + + # parse camera parameters + if model == "SIMPLE_PINHOLE": + # f, cx, cy + params = np.array(list(map(float, parts[4:]))) + fx = fy = params[0] + cx = params[1] + cy = params[2] + elif model == "PINHOLE": + # fx, fy, cx, cy + params = np.array(list(map(float, parts[4:]))) + fx = params[0] + fy = params[1] + cx = params[2] + cy = params[3] + elif model == "THIN_PRISM_FISHEYE": + # fx, fy, cx, cy, k1, k2, p1, p2, k3, k4, sx1, sy1 + params = np.array(list(map(float, parts[4:]))) + fx, fy, cx, cy = params[0], params[1], params[2], params[3] + dist_params = { + 'k1': params[4], 'k2': params[5], + 'p1': params[6], 'p2': params[7], + 'k3': params[8], 'k4': params[9], + 'sx1': params[10], 'sy1': params[11] + } + else: + print(f"Warning: camera model {model} is not supported yet") + continue + + K = np.array([ + [fx, 0, cx], + [0, fy, cy], + [0, 0, 1] + ]) + + cameras[camera_id] = { + 'K': K, + 'dist_params': dist_params, + 'model': model, + 'width': width, + 'height': height + } + return cameras + + +def read_images_txt(path): + images = {} + with open(path, "r") as f: + lines = f.readlines() + for i in range(0, len(lines), 2): + # skip comment lines + if lines[i].startswith("#"): + if "Number of images" in lines[i]: + i = 2 + else: + continue + + # first line: extrinsics + line1_parts = lines[i].strip().split() + image_id = int(line1_parts[0]) + # (qw, qx, qy, qz) + qvec = np.array(list(map(float, line1_parts[1:5]))) + # (tx, ty, tz) + tvec = np.array(list(map(float, line1_parts[5:8]))) + camera_id = int(line1_parts[8]) + image_name = 
line1_parts[9]

            # COLMAP (W, X, Y, Z)
            # Scipy Rotation (X, Y, Z, W)
            # Reorder the quaternion components before handing them to scipy.
            rotation = R.from_quat([qvec[1], qvec[2], qvec[3], qvec[0]])

            # get rotation matrix R and translation T, w2c
            # P_camera = R * P_world + T
            R_matrix = rotation.as_matrix()

            # One entry per image: world-to-camera rotation/translation plus
            # the camera id (index into cameras.txt) and relative image name.
            images[image_id] = {
                'R': R_matrix,
                'T': tvec,
                'camera_id': camera_id,
                'name': image_name
            }
    return images


if __name__ == '__main__':
    data_root = '/mimer/NOBACKUP/groups/3d-dl/eth3d'
    # sequences = [seq for seq in os.listdir('data/eth3d') if os.path.isdir(os.path.join('data/eth3d', seq))]
    # print(sequences)
    # Hard-coded list of the ETH3D DSLR training scenes to process.
    sequences = ["courtyard", "delivery_area", "electro", "facade", "kicker", "meadow", "office", "pipes", "playground", "relief", "relief_2", "terrace", "terrains"]

    # setup_debug()

    for seq in tqdm(sequences, desc="Processing sequences"):
        # Parse the COLMAP calibration text files shipped with each scene.
        cameras_intrinsics = read_cameras_txt(osp.join(data_root, seq, 'dslr_calibration_jpg', 'cameras.txt'))
        images_extrinsics = read_images_txt(osp.join(data_root, seq, 'dslr_calibration_jpg', 'images.txt'))

        idxs = sorted(list(images_extrinsics.keys()))

        # Undistorted outputs are written next to the originals under
        # 'custom_undistorted' subdirectories.
        output_image_dir = os.path.join(data_root, seq, 'images', 'custom_undistorted')
        output_depth_dir = os.path.join(data_root, seq, 'ground_truth_depth', 'custom_undistorted')

        output_camera_dir = os.path.join(data_root, seq, 'custom_undistorted_cam')
        os.makedirs(output_image_dir, exist_ok=True)
        os.makedirs(output_depth_dir, exist_ok=True)
        os.makedirs(output_camera_dir, exist_ok=True)

        for idx in tqdm(idxs, desc=f"Processing images in {seq}"):
            meta = images_extrinsics[idx]

            # Skip images that were already undistorted in a previous run.
            # NOTE(review): meta['name'].split('/')[1] assumes names are always
            # 'subdir/file.JPG' with exactly one slash — confirm for all scenes.
            output_impath = os.path.join(output_image_dir, meta['name'].split('/')[1])
            if os.path.exists(output_impath):
                continue

            # Fix the depth map path error: idxs is a list, should use meta['name'] or similar index
            # Assume that the depth map and RGB image file names are similar, just with different extensions
            impath = os.path.join(data_root, seq, 'images', meta['name'])
            depthpath =
os.path.join(data_root, seq, 'ground_truth_depth', meta['name'])  # assumed to be a raw .bin-style file — TODO confirm

            # load image and depth
            rgb_image = np.array(Image.open(impath))
            height, width = rgb_image.shape[:2]
            # Depth is a raw float32 dump with one value per pixel, same
            # resolution as the RGB image.
            depthmap = np.fromfile(depthpath, dtype=np.float32).reshape(height, width)

            # load camera params for undistortion
            intrinsic = cameras_intrinsics[meta['camera_id']]['K'].astype(np.float32)
            dist_params_dict = cameras_intrinsics[meta['camera_id']]['dist_params']

            # ##################################################################
            # ### TODO 1: Undistort Image ###
            # ##################################################################
            print(f"Undistorting image {meta['name']}...")

            # Note: cv2.fisheye model primarily uses k1, k2, k3, k4. It ignores tangential (p1, p2) and thin prism (sx1, sy1) distortions.
            # This is an approximation, but it usually works well in practice.
            K = intrinsic
            D = np.array([
                dist_params_dict['k1'],
                dist_params_dict['k2'],
                dist_params_dict['k3'],
                dist_params_dict['k4']
            ])

            # Calculate the undistortion mapping.
            # K_new can be the same as K, or optimized through the balance parameter.
            K_new = K.copy()
            map1, map2 = cv2.fisheye.initUndistortRectifyMap(K, D, np.eye(3), K_new, (width, height), cv2.CV_16SC2)

            # Apply mapping
            rgb_image_undistorted = cv2.remap(
                rgb_image, map1, map2,
                interpolation=cv2.INTER_LINEAR,
                borderMode=cv2.BORDER_CONSTANT
            )

            # ##################################################################
            # ### TODO 2: Undistort Depth ###
            # ##################################################################
            print(f"Undistorting depth for {meta['name']}...")

            # Core idea: For each pixel (u_d, v_d, depth) in the distorted depth map,
            # we back-project it to 3D space, then re-project it onto the undistorted image plane.

            # 1.
Create a grid of pixel coordinates for the distorted image
            v_dist, u_dist = np.indices((height, width))
            pixels_dist = np.stack([u_dist.ravel(), v_dist.ravel()], axis=-1).astype(np.float32)
            pixels_dist = pixels_dist.reshape(-1, 1, 2)  # shape (N, 1, 2), as required by cv2.fisheye.undistortPoints

            # 2. Calculate normalized coordinates in the undistorted camera frame
            # `undistortPoints` will apply the inverse transformation of the fisheye model
            normalized_coords_undistorted = cv2.fisheye.undistortPoints(pixels_dist, K, D)

            # 3. Multiply the normalized coordinates by the depth to get 3D points in camera coordinates
            # (x', y') = normalized_coords_undistorted
            # X = x' * depth, Y = y' * depth, Z = depth
            # NOTE(review): this treats the stored depth as the Z coordinate
            # (plane depth), not ray length — confirm against the dataset spec.
            depth_values = depthmap.ravel()

            # filter out invalid depth values
            valid_mask = np.logical_and(depth_values > 0, np.isfinite(depth_values))

            # undistortPoints returns (N, 1, 2); after ravel() the flat layout
            # alternates x', y', so even indices are x' and odd indices are y'.
            points_3D_X = normalized_coords_undistorted.ravel()[0::2][valid_mask] * depth_values[valid_mask]
            points_3D_Y = normalized_coords_undistorted.ravel()[1::2][valid_mask] * depth_values[valid_mask]
            points_3D_Z = depth_values[valid_mask]

            # 4. Project the 3D points back to the undistorted image plane
            fx_new, fy_new = K_new[0, 0], K_new[1, 1]
            cx_new, cy_new = K_new[0, 2], K_new[1, 2]

            # Standard pinhole projection with the (identical) new intrinsics.
            u_new = (points_3D_X * fx_new / points_3D_Z) + cx_new
            v_new = (points_3D_Y * fy_new / points_3D_Z) + cy_new

            # 5.
Create a sparse depth map + depthmap_undistorted_sparse = np.zeros((height, width), dtype=np.float32) + u_new_int = np.round(u_new).astype(int) + v_new_int = np.round(v_new).astype(int) + + # filter out points that are out of bounds + valid_mask = (u_new_int >= 0) & (u_new_int < width) & \ + (v_new_int >= 0) & (v_new_int < height) + + u_target = u_new_int[valid_mask] + v_target = v_new_int[valid_mask] + z_target = points_3D_Z[valid_mask] + + depthmap_undistorted_sparse[v_target, u_target] = z_target + depthmap_undistorted = depthmap_undistorted_sparse + + output_impath = os.path.join(output_image_dir, meta['name'].split('/')[1]) + output_depthpath = os.path.join(output_depth_dir, meta['name'].split('/')[1]) + + print(f" -> Save Image to: {output_impath}") + Image.fromarray(rgb_image_undistorted).save(output_impath) + + print(f" -> Save Depth Map to: {output_depthpath}") + depthmap_undistorted.astype(np.float32).tofile(output_depthpath) + + extrinsic = np.eye(4) + extrinsic[:3, :3] = meta['R'] + extrinsic[:3, 3] = meta['T'] + + output_cam_path = os.path.join(output_camera_dir, meta['name'].split('/')[1].replace('JPG', 'npz')) + + np.savez( + output_cam_path, + intrinsics=K_new, + extrinsics=extrinsic + ) \ No newline at end of file diff --git a/training/data/preprocess/prepare_eth3d.sh b/training/data/preprocess/prepare_eth3d.sh new file mode 100644 index 00000000..b5557184 --- /dev/null +++ b/training/data/preprocess/prepare_eth3d.sh @@ -0,0 +1,19 @@ + +mkdir -p /mimer/NOBACKUP/groups/3d-dl/eth3d +cd /mimer/NOBACKUP/groups/3d-dl/eth3d + +wget https://www.eth3d.net/data/multi_view_training_dslr_jpg.7z +# install 7zip or p7zip on your system if not already installed +7z x multi_view_training_dslr_jpg.7z -bsp1 +rm multi_view_training_dslr_jpg.7z + +scenes=("courtyard" "delivery_area" "electro" "facade" "kicker" "meadow" "office" "pipes" "playground" "relief" "relief_2" "terrace" "terrains") +for scene in "${scenes[@]}"; do + wget -c 
https://www.eth3d.net/data/${scene}_dslr_depth.7z + 7z x ${scene}_dslr_depth.7z -bsp1 + rm ${scene}_dslr_depth.7z +done + +cd /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt + +# python training/data/preprocess/prepare_eth3d.py \ No newline at end of file diff --git a/training/data/preprocess/scannet.py b/training/data/preprocess/scannet.py new file mode 100644 index 00000000..6a5a513a --- /dev/null +++ b/training/data/preprocess/scannet.py @@ -0,0 +1,100 @@ +from pathlib import Path +import json +import gzip +import numpy as np +import torch +from tqdm import tqdm + +def read_scannet_pose(path): + """ Read ScanNet's Camera2World pose and transform it to World2Camera. + + Returns: + pose_w2c (np.ndarray): (4, 4) + """ + cam2world = np.loadtxt(path, delimiter=' ') + + if not np.isfinite(cam2world).all(): + return None + + world2cam = np.linalg.inv(cam2world) + return world2cam + + +def read_scannet_intrinsic(path): + """ Read ScanNet's intrinsic matrix and return the 3x3 matrix. + """ + intrinsic = np.loadtxt(path, delimiter=' ') + return torch.tensor(intrinsic[:-1, :-1], dtype = torch.float) + +# Root folder where everything starts +root = Path("/mimer/NOBACKUP/groups/3d-dl/scannet/scans/scans_train") + +out = {} + +chunk_size = 24 + +valid_frames = 0 +invalid_frames = 0 +for scene_dir in tqdm(root.iterdir()): + + intrinsics = read_scannet_intrinsic(scene_dir / "intrinsic/intrinsic_color.txt") + + frames = sorted([p.name for p in (scene_dir / "color").iterdir() if p.suffix == ".jpg"]) + + # Maybe resized undistorted images are too high resolution? 
+ num_frames = len(frames) + + # Since the images are taken in a sequence we will just chunk up the sequences + + sequences = [] + # Calculate how many full chunks we can take, stopping before the last chunk + num_full_chunks = (num_frames - 1) // chunk_size # leave room for overflow in last chunk + + for i in range(num_full_chunks - 1): + sequences.append(frames[i * chunk_size: (i + 1) * chunk_size]) + + # Last chunk gets the rest of the frames + sequences.append(frames[(num_full_chunks - 1) * chunk_size:]) + + + for i, seq in enumerate(sequences): + sequence_data = [] + for frame in seq: + pose_path = scene_dir / "pose" / (frame.replace(".jpg", ".txt")) + pose_w2c = read_scannet_pose(pose_path) + if pose_w2c is None: + print(f"Warning: Pose contains NaN, skipping frame {pose_path}") + invalid_frames += 1 + continue + valid_frames += 1 + R = pose_w2c[:3, :3] + assert not np.isnan(pose_w2c).any(), f"Pose contains NaN: {pose_w2c}" + # print('Determinant of R: ', np.linalg.det(R)) + # assert np.allclose(np.linalg.det(R), 1.0, atol=1e-3), f"Rotation matrix determinant is not 1 but {np.linalg.det(R)}, R is {R}" + + frame_data = { + "filepath": f"{scene_dir.name}/color/{frame}", + "extri": pose_w2c[:3].tolist(), + "intri": intrinsics.tolist(), + "depthpath": f"{scene_dir.name}/depth/{frame.replace('.jpg', '.png')}", + } + # Sanity check + assert len(pose_w2c) == 4 and len(pose_w2c[0]) == 4 + assert len(intrinsics) == 3 and len(intrinsics[0]) == 3 + + sequence_data.append(frame_data) + + out[scene_dir.name+"_"+str(i)] = sequence_data + + print(f" Created {len(sequences)} sequences for {scene_dir.name}") + + +print('Valid frames: ', valid_frames) +print('Invalid frames: ', invalid_frames) +root = "/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt" + +with gzip.open(root+"/annotations/scannet/train.jgz", "wt", encoding="utf-8") as f: + json.dump(out, f, ensure_ascii=False, indent=4) + +print(f"Processed {len(out)} scenes with a total of {sum(len(v) for v in 
def depth_to_points(depth, K, extrinsic, stride=8):
    """Backprojects depth map to 3D world coordinates.

    Args:
        depth: (H, W) depth map; zero entries are treated as invalid.
        K: (3, 3) camera intrinsic matrix.
        extrinsic: (4, 4) transform applied to homogeneous camera points
            (camera-to-world here, given how the result is used).
        stride: sample every ``stride``-th pixel in both directions.

    Returns:
        (N, 3) array of 3D points for the valid samples.
    """
    h, w = depth.shape
    # Regular pixel grid; meshgrid's default 'xy' indexing keeps the same
    # row-major flattening order as the original implementation.
    u_grid, v_grid = np.meshgrid(np.arange(0, w, stride), np.arange(0, h, stride))
    z = depth[v_grid, u_grid]
    keep = z > 0

    homog_pix = np.stack(
        [u_grid[keep], v_grid[keep], np.ones_like(u_grid[keep])], axis=-1
    )

    rays = np.linalg.inv(K) @ homog_pix.T      # (3, N) unit-depth camera rays
    cam = rays * z[keep]                       # scale each ray by its sampled depth
    cam_h = np.vstack((cam, np.ones((1, cam.shape[1]))))
    world = extrinsic @ cam_h
    return world[:3].T
def main():
    """Sanity-check CO3D-style annotations: plot camera poses and one backprojected depth map.

    Side effects: writes cameras.png and scene_with_depth.png to the working directory.
    """
    # === Adjust these paths ===
    root = Path("/mimer/NOBACKUP/groups/3d-dl/co3d_full")
    path = root / "189_20379_35626"
    frame_idx = 0  # which frame of the sequence to backproject

    frame_file = path / "frame_annotations.jgz"
    sequence_file = path / "sequence_annotations.jgz"

    with gzip.open(frame_file, "r") as fin:
        frame_data = json.loads(fin.read())
    with gzip.open(sequence_file, "r") as fin:
        sequence_data = json.loads(fin.read())  # kept for manual inspection

    # BUG was: `seq_data = data[category_sequence]` referenced two undefined names.
    # NOTE(review): this assumes `frame_data` is the *processed* annotation format
    # (dict of sequence_name -> list of {"extri", "intri", "filepath"} entries),
    # not the raw CO3D frame annotations -- confirm against the preprocessing script.
    seq_name = next(iter(frame_data))
    seq_data = frame_data[seq_name]

    # Collect camera extrinsics and intrinsics
    extrinsics = []
    intrinsics = []
    for f in seq_data:
        E = np.array(f["extri"])
        if E.shape == (3, 4):
            # Promote 3x4 [R|t] to a full homogeneous 4x4.
            E = np.vstack([E, [0, 0, 0, 1]])
        extrinsics.append(E)
        intrinsics.append(np.array(f["intri"]))

    # === Visualize camera frustums ===
    print("Visualizing camera poses...")
    plot_scene(extrinsics)
    # NOTE(review): plot_scene() calls plt.show(); with interactive backends the
    # figure may already be closed here, yielding an empty PNG -- consider moving
    # the savefig call inside plot_scene (before show).
    plt.savefig("cameras.png")

    # === Load one depth map and backproject ===
    frame = seq_data[frame_idx]
    # BUG was: the annotation filepath is relative; resolve it against the dataset root.
    img_path = root / frame["filepath"]
    depth_path = Path(str(img_path).replace("/images", "/depths") + ".geometric.png")

    if not depth_path.exists():
        print(f"Depth map not found at {depth_path}")
        return

    print(f"Loading depth: {depth_path}")
    # BUG was: passed scale=1.0, but _load_16big_png_depth takes only the path.
    depth = _load_16big_png_depth(depth_path)
    points_3d = depth_to_points(depth, intrinsics[frame_idx], extrinsics[frame_idx])

    # === Plot cameras + point cloud ===
    print("Rendering 3D scene...")
    plot_scene(extrinsics, points_3d)
    plt.savefig("scene_with_depth.png")
b/training/data/preprocess/wildrgbd.py @@ -0,0 +1,66 @@ +from pathlib import Path +import json +import gzip +import numpy as np +import torch +from tqdm import tqdm + +# Root folder where everything starts +root = Path("/mimer/NOBACKUP/groups/3d-dl/wildrgbd") + +out = {} + +def load_cam_poses(path): + poses = [] + with open(path, "r") as f: + for line in f: + tokens = line.strip().split() + frame_id = int(tokens[0]) + mat = np.array([float(x) for x in tokens[1:]]).reshape(4, 4) + poses.append((frame_id, mat)) + return poses + + +for category_dir in tqdm(root.iterdir()): + if category_dir.name.endswith('.py') or category_dir.name.endswith('.zip') or category_dir.name.startswith('.') or category_dir.name == "chair": + print('Skipping', category_dir.name) + continue + category = category_dir.name + print(f"Processing category: {category}") + for scene_dir in (category_dir / "scenes").iterdir(): + + poses = load_cam_poses(scene_dir / "cam_poses.txt") + + with open(scene_dir / "metadata", "r") as f: + meta = json.load(f) + + # Get the intrinsic matrix + K_flat = meta["K"] # list of 9 numbers + K = np.array(K_flat).reshape(3, 3).T + + frames = sorted([p.name for p in (scene_dir / "rgb").iterdir() if p.suffix == ".png"]) + sequence_data = [] + for i, frame in enumerate(frames): + frame_id, pose = poses[i] + pose = np.linalg.inv(pose) # to world to cam + assert frame_id == i + + depth_path = scene_dir / "depth" / frame + frame_data = { + "filepath": f"{category}/scenes/{scene_dir.name}/rgb/{frame}", + "extri": pose[:3].tolist(), + "intri": K.tolist(), + "depthpath": f"{category}/scenes/{scene_dir.name}/depth/{frame}", + "maskpath": f"{category}/scenes/{scene_dir.name}/masks/{frame}", + } + sequence_data.append(frame_data) + + out[scene_dir.name] = sequence_data + +root = "/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt" + +with gzip.open(root+"/annotations/wildrgbd/train.jgz", "wt", encoding="utf-8") as f: + json.dump(out, f, ensure_ascii=False, indent=4) + 
def save_ply(points, colors, filename):
    """Write an (N, 3) point set with per-point RGB colors to an ASCII PLY file.

    Accepts torch tensors or numpy arrays; inputs are flattened to (-1, 3).
    Colors are expected in [0, 1] (open3d convention).
    """
    import open3d as o3d  # local import: open3d is only needed for debug dumps

    def _as_xyz(arr):
        # Normalize either a tensor or an ndarray to an (N, 3) numpy array.
        if torch.is_tensor(arr):
            return arr.reshape(-1, 3).cpu().numpy()
        return arr.reshape(-1, 3)

    cloud = o3d.geometry.PointCloud()
    cloud.points = o3d.utility.Vector3dVector(_as_xyz(points).astype(np.float64))
    cloud.colors = o3d.utility.Vector3dVector(_as_xyz(colors).astype(np.float64))
    o3d.io.write_point_cloud(filename, cloud, write_ascii=True)
1e-8) + + plt.figure(figsize=(10,5)) + + plt.subplot(1,2,1) + plt.imshow(img_np) + plt.title("Image") + plt.axis("off") + + plt.subplot(1,2,2) + plt.imshow(depth_vis, cmap="plasma") # "viridis", "magma" also nice + plt.title("Depth") + plt.axis("off") + + plt.savefig(f"sample_{i:04d}.png") + plt.close() + + print(f"Saved sample_{i:04d}.png") \ No newline at end of file diff --git a/training/todo.txt b/training/todo.txt new file mode 100644 index 00000000..e64eed37 --- /dev/null +++ b/training/todo.txt @@ -0,0 +1,3 @@ +* Build into the config so we can choose between MuM, CroCov2 and DINOv2 +* Add config about how large the model is and try to just train a couple of layers on top of frozen backbone +* For example one could do 4xA100 for some time and see how it works on MegaDepth \ No newline at end of file diff --git a/training/train_utils/distributed.py b/training/train_utils/distributed.py index af61e269..d97bbbed 100644 --- a/training/train_utils/distributed.py +++ b/training/train_utils/distributed.py @@ -15,6 +15,11 @@ def get_machine_local_and_dist_rank(): """ local_rank = int(os.environ.get("LOCAL_RANK", None)) distributed_rank = int(os.environ.get("RANK", None)) + + # local_rank = int(os.environ["SLURM_LOCALID"]) # 0-3 on each node + # distributed_rank = int(os.environ["SLURM_PROCID"]) + + print('I am local rank', local_rank, 'and distributed rank', distributed_rank) assert ( local_rank is not None and distributed_rank is not None ), "Please the set the RANK and LOCAL_RANK environment variables." diff --git a/training/trainer.py b/training/trainer.py index 21ffa53e..f283e172 100644 --- a/training/trainer.py +++ b/training/trainer.py @@ -5,6 +5,7 @@ # LICENSE file in the root directory of this source tree. import os +import constants # --- Environment Variable Setup for Performance and Debugging --- @@ -17,6 +18,8 @@ # Enables asynchronous error handling for NCCL, which can prevent hangs. 
os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1" +os.environ['WANDB_API_KEY'] = constants.WANDB_API_KEY + import contextlib import gc @@ -41,7 +44,7 @@ from train_utils.logging import setup_logging from train_utils.normalization import normalize_camera_extrinsics_and_points_batch from train_utils.optimizer import construct_optimizers - +import wandb class Trainer: """ @@ -77,6 +80,8 @@ def __init__( loss: Optional[Dict[str, Any]] = None, env_variables: Optional[Dict[str, Any]] = None, accum_steps: int = 1, + log_wandb: bool = False, + exp_name: str = "exp001", **kwargs, ): """ @@ -141,6 +146,9 @@ def __init__( assert is_dist_avail_and_initialized(), "Torch distributed needs to be initialized before calling the trainer." + wandb_mode = "online" if log_wandb and self.rank == 0 else "disabled" + wandb.init(project="vggt", entity="georgs-team", name=exp_name, reinit=False, mode = wandb_mode) + # Instantiate components (model, loss, etc.) self._setup_components() self._setup_dataloaders() @@ -376,6 +384,7 @@ def run(self): def run_train(self): """Runs the main training loop over all epochs.""" + print('Max epochs: ', self.max_epochs) while self.epoch < self.max_epochs: set_seeds(self.seed_value + self.epoch * 100, self.max_epochs, self.distributed_rank) @@ -428,9 +437,16 @@ def val_epoch(self, val_loader): loss_meters = { name: AverageMeter(name, self.device, ":.4f") for name in loss_names } + + iters_per_epoch = len(val_loader) + limit_val_batches = ( + iters_per_epoch + if self.limit_val_batches is None + else self.limit_val_batches + ) progress = ProgressMeter( - num_batches=len(val_loader), + num_batches=limit_val_batches, meters=[ batch_time, data_time, @@ -445,12 +461,6 @@ def val_epoch(self, val_loader): self.model.eval() end = time.time() - iters_per_epoch = len(val_loader) - limit_val_batches = ( - iters_per_epoch - if self.limit_val_batches is None - else self.limit_val_batches - ) for data_iter, batch in enumerate(val_loader): if data_iter > limit_val_batches: 
@@ -495,6 +505,13 @@ def val_epoch(self, val_loader): if data_iter % self.logging_conf.log_freq == 0: progress.display(data_iter) + avg_stats = {} + + for name, meter in loss_meters.items(): + avg_stats[f"{name}_val"] = meter.avg + + wandb.log(avg_stats) + print("Validation averages:", avg_stats) return True @@ -516,8 +533,16 @@ def train_epoch(self, train_loader): loss_meters[f"Grad/{param_names}"] = AverageMeter(f"Grad/{param_names}", self.device, ":.4f") + iters_per_epoch = len(train_loader) + limit_train_batches = ( + iters_per_epoch + if self.limit_train_batches is None + else self.limit_train_batches + ) + print('Num batches: ', limit_train_batches) + progress = ProgressMeter( - num_batches=len(train_loader), + num_batches=limit_train_batches, meters=[ batch_time, data_time, @@ -531,13 +556,6 @@ def train_epoch(self, train_loader): self.model.train() end = time.time() - - iters_per_epoch = len(train_loader) - limit_train_batches = ( - iters_per_epoch - if self.limit_train_batches is None - else self.limit_train_batches - ) if self.gradient_clipper is not None: # setup gradient clipping at the beginning of training @@ -581,9 +599,14 @@ def train_epoch(self, train_loader): logging.warning( f"Skipping scheduler update since the training is at the end, i.e, {self.where} of [0,1]." 
) - - # Log schedulers + + # + + + # Log schedulers (to W&B instead of TensorBoard) if self.steps[phase] % self.logging_conf.log_freq == 0: + wandb_dict = {} + for i, optim in enumerate(self.optims): for j, param_group in enumerate(optim.optimizer.param_groups): for option in optim.schedulers[j]: @@ -596,16 +619,13 @@ def train_epoch(self, train_loader): else "" ) ) - self.tb_writer.log( - os.path.join("Optim", f"{optim_prefix}", option), - param_group[option], - self.steps[phase], - ) - self.tb_writer.log( - os.path.join("Optim", "where"), - self.where, - self.steps[phase], - ) + key = f"Optim/{optim_prefix}{option}" + wandb_dict[key] = param_group.get(option, None) + + # Also log the scheduler position (e.g., training progress) + wandb_dict["Optim/where"] = self.where + + wandb.log(wandb_dict, step=self.steps[phase]) # Clipping gradients and detecting diverging gradients if self.gradient_clipper is not None: @@ -632,6 +652,9 @@ def train_epoch(self, train_loader): if data_iter % self.logging_conf.log_freq == 0: progress.display(data_iter) + wandb.log({ + **{name: meter.avg for name, meter in loss_meters.items()} + }) return True diff --git a/training/vggt.sh b/training/vggt.sh new file mode 100644 index 00000000..f537f9d7 --- /dev/null +++ b/training/vggt.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -e + +# ==== CONFIGURABLE VARIABLES ==== +MODEL="dinov3" +GPUS_PER_NODE=1 +NODES=1 +TIME="0-00:10:00" +# TIME="2-10:00:00" + +# ==== AUTO-DERIVED VARIABLES ==== +JOB_NAME="vggt:${MODEL}" +OUTPUT_DIR="/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/output_dir/${MODEL}" + +mkdir -p "${OUTPUT_DIR}" + +# ==== EXPORT TO MAKE AVAILABLE INSIDE SLURM JOB ==== +export MODEL +export GPUS_PER_NODE +export NODES +export OUTPUT_DIR + +# ==== SUBMIT THE JOB ==== +sbatch \ + -A NAISS2025-5-255 \ + --job-name=${JOB_NAME} \ + --nodes=${NODES} \ + --gpus-per-node=A100:${GPUS_PER_NODE} \ + --ntasks-per-node=1 \ + --time=${TIME} \ + --output=${OUTPUT_DIR}/%j/log.out \ + 
--error=${OUTPUT_DIR}/%j/log.err \ + --export=ALL,MODEL,GPUS_PER_NODE,NODES,OUTPUT_DIR \ + <<'EOF' +#!/usr/bin/env bash +set -e + +echo "Running model: ${MODEL}" +echo "GPUs per node: ${GPUS_PER_NODE}" +echo "Nodes: ${SLURM_NNODES}" + +export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +export MASTER_PORT=29501 +export WORLD_SIZE=$(($SLURM_NNODES * $GPUS_PER_NODE)) + +echo "MASTER_ADDR: $MASTER_ADDR" +echo "WORLD_SIZE: $WORLD_SIZE" + +srun torchrun \ + --nproc_per_node=${GPUS_PER_NODE} \ + --nnodes=${SLURM_NNODES} \ + --rdzv_id=${SLURM_JOB_ID} \ + --rdzv_backend=c10d \ + --rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \ + launch.py --config "${MODEL}" +EOF diff --git a/vggt/encoders/__init__.py b/vggt/encoders/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/vggt/encoders/croco/__init__.py b/vggt/encoders/croco/__init__.py new file mode 100644 index 00000000..b06ca763 --- /dev/null +++ b/vggt/encoders/croco/__init__.py @@ -0,0 +1 @@ +from .croco import CroCoNet \ No newline at end of file diff --git a/vggt/encoders/croco/blocks.py b/vggt/encoders/croco/blocks.py new file mode 100644 index 00000000..b3d02307 --- /dev/null +++ b/vggt/encoders/croco/blocks.py @@ -0,0 +1,240 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    """
    if not training or drop_prob == 0.:
        # Identity at eval time or when dropping is disabled.
        return x
    keep_prob = 1 - drop_prob
    # One Bernoulli draw per sample, broadcast over all remaining dims
    # (works for any tensor rank, not just 2D ConvNets).
    mask_shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    keep_mask = x.new_empty(mask_shape).bernoulli_(keep_prob)
    if scale_by_keep and keep_prob > 0.0:
        # Rescale so the expected activation magnitude is unchanged.
        keep_mask.div_(keep_prob)
    return x * keep_mask
+ """ + def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + self.scale_by_keep = scale_by_keep + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training, self.scale_by_keep) + + def extra_repr(self): + return f'drop_prob={round(self.drop_prob,3):0.3f}' + +class Mlp(nn.Module): + """ MLP as used in Vision Transformer, MLP-Mixer and related networks""" + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, bias=True, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + bias = to_2tuple(bias) + drop_probs = to_2tuple(drop) + + self.fc1 = nn.Linear(in_features, hidden_features, bias=bias[0]) + self.act = act_layer() + self.drop1 = nn.Dropout(drop_probs[0]) + self.fc2 = nn.Linear(hidden_features, out_features, bias=bias[1]) + self.drop2 = nn.Dropout(drop_probs[1]) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop1(x) + x = self.fc2(x) + x = self.drop2(x) + return x + +class Attention(nn.Module): + + def __init__(self, dim, rope=None, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim ** -0.5 + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.rope = rope + + def forward(self, x, xpos): + B, N, C = x.shape + + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).transpose(1,3) + q, k, v = [qkv[:,:,i] for i in range(3)] + # q,k,v = qkv.unbind(2) # make torchscript happy (cannot use tensor as tuple) + + if self.rope is not None: + q = self.rope(q, xpos) + k = self.rope(k, xpos) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = 
class Block(nn.Module):
    """Pre-norm transformer encoder block: self-attention + MLP, each on a residual path."""

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, rope=None):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias,
                              attn_drop=attn_drop, proj_drop=drop)
        # Stochastic depth on the residual branches (identity when drop_path == 0).
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio),
                       act_layer=act_layer, drop=drop)

    def forward(self, x, xpos):
        # Pre-norm residual self-attention; `xpos` carries patch positions for RoPE.
        x = x + self.drop_path(self.attn(self.norm1(x), xpos))
        # Pre-norm residual MLP.
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x
class PositionGetter(object):
    """ return positions of patches

    Yields a (b, h*w, 2) tensor of (row, col) indices for an h x w patch grid.
    Grids are cached per (h, w, device) so repeated calls reuse the index tensor.
    """

    def __init__(self):
        self.cache_positions = {}

    def __call__(self, b, h, w, device):
        # BUG fix: the cache was keyed on (h, w) only, so a second call with the
        # same grid size but a *different* device returned a tensor living on the
        # wrong device. Key the cache on the device as well.
        key = (h, w, device)
        if key not in self.cache_positions:
            x = torch.arange(w, device=device)
            y = torch.arange(h, device=device)
            # cartesian_prod(y, x) enumerates row-major: (0,0), (0,1), ..., (h-1,w-1)
            self.cache_positions[key] = torch.cartesian_prod(y, x)  # (h*w, 2)
        pos = self.cache_positions[key].view(1, h * w, 2).expand(b, -1, 2).clone()
        return pos
norm_layer=None, flatten=True): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + self.img_size = img_size + self.patch_size = patch_size + self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + self.flatten = flatten + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + self.position_getter = PositionGetter() + + def forward(self, x): + B, C, H, W = x.shape + # torch._assert(H == self.img_size[0], f"Input image height ({H}) doesn't match model ({self.img_size[0]}).") + # torch._assert(W == self.img_size[1], f"Input image width ({W}) doesn't match model ({self.img_size[1]}).") + x = self.proj(x) + pos = self.position_getter(B, x.size(2), x.size(3), x.device) + if self.flatten: + x = x.flatten(2).transpose(1, 2) # BCHW -> BNC + x = self.norm(x) + return x, pos + + def _init_weights(self): + w = self.proj.weight.data + torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1])) \ No newline at end of file diff --git a/vggt/encoders/croco/criterion.py b/vggt/encoders/croco/criterion.py new file mode 100644 index 00000000..b1ef1b3f --- /dev/null +++ b/vggt/encoders/croco/criterion.py @@ -0,0 +1,37 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
class MaskedMSE(torch.nn.Module):
    """Pixel-reconstruction MSE criterion for CroCo pretraining.

    Constructor args:
        norm_pix_loss: normalize each target patch by its own mean and variance
            before computing the loss (as in MAE).
        masked: average the loss over masked patches only; otherwise average
            over all patches.
    """

    def __init__(self, norm_pix_loss=False, masked=True):
        super().__init__()
        self.norm_pix_loss = norm_pix_loss
        self.masked = masked

    def forward(self, pred, mask, target):
        if self.norm_pix_loss:
            # Per-patch normalization of the regression target.
            mu = target.mean(dim=-1, keepdim=True)
            var = target.var(dim=-1, keepdim=True)
            target = (target - mu) / (var + 1.e-6) ** .5

        per_patch = ((pred - target) ** 2).mean(dim=-1)  # [N, L], mean loss per patch
        if self.masked:
            # Mean over masked patches only.
            return (per_patch * mask).sum() / mask.sum()
        return per_patch.mean()
+ + +# -------------------------------------------------------- +# CroCo model during pretraining +# -------------------------------------------------------- + + + +import torch +import torch.nn as nn +torch.backends.cuda.matmul.allow_tf32 = True # for gpu >= Ampere and pytorch >= 1.12 +from functools import partial + +from .blocks import Block, DecoderBlock, PatchEmbed +from .pos_embed import get_2d_sincos_pos_embed, RoPE2D +from .masking import RandomMask + + +class CroCoNet(nn.Module): + + def __init__(self, + img_size=224, # input image size + patch_size=16, # patch_size + mask_ratio=0.9, # ratios of masked tokens + enc_embed_dim=768, # encoder feature dimension + enc_depth=12, # encoder depth + enc_num_heads=12, # encoder number of heads in the transformer block + dec_embed_dim=512, # decoder feature dimension + dec_depth=8, # decoder depth + dec_num_heads=16, # decoder number of heads in the transformer block + mlp_ratio=4, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + norm_im2_in_dec=True, # whether to apply normalization of the 'memory' = (second image) in the decoder + pos_embed='cosine', # positional embedding (either cosine or RoPE100) + ): + + super(CroCoNet, self).__init__() + + self.patch_size = 16 + + # patch embeddings (with initialization done as in MAE) + self._set_patch_embed(img_size, patch_size, enc_embed_dim) + + # mask generations + self._set_mask_generator(self.patch_embed.num_patches, mask_ratio) + + self.pos_embed = pos_embed + if pos_embed=='cosine': + # positional embedding of the encoder + enc_pos_embed = get_2d_sincos_pos_embed(enc_embed_dim, self.patch_embed.grid_size, n_cls_token=0) + self.register_buffer('enc_pos_embed', torch.from_numpy(enc_pos_embed).float()) + # positional embedding of the decoder + dec_pos_embed = get_2d_sincos_pos_embed(dec_embed_dim, self.patch_embed.grid_size, n_cls_token=0) + self.register_buffer('dec_pos_embed', torch.from_numpy(dec_pos_embed).float()) + # pos embedding in each block + self.rope = None # 
nothing for cosine + elif pos_embed.startswith('RoPE'): # eg RoPE100 + self.enc_pos_embed = None # nothing to add in the encoder with RoPE + self.dec_pos_embed = None # nothing to add in the decoder with RoPE + if RoPE2D is None: raise ImportError("Cannot find cuRoPE2D, please install it following the README instructions") + freq = float(pos_embed[len('RoPE'):]) + self.rope = RoPE2D(freq=freq) + else: + raise NotImplementedError('Unknown pos_embed '+pos_embed) + + # transformer for the encoder + self.enc_depth = enc_depth + self.enc_embed_dim = enc_embed_dim + self.enc_blocks = nn.ModuleList([ + Block(enc_embed_dim, enc_num_heads, mlp_ratio, qkv_bias=True, norm_layer=norm_layer, rope=self.rope) + for i in range(enc_depth)]) + self.enc_norm = norm_layer(enc_embed_dim) + + # masked tokens + self._set_mask_token(dec_embed_dim) + + # decoder + self._set_decoder(enc_embed_dim, dec_embed_dim, dec_num_heads, dec_depth, mlp_ratio, norm_layer, norm_im2_in_dec) + + # prediction head + self._set_prediction_head(dec_embed_dim, patch_size) + + # initializer weights + self.initialize_weights() + + def _set_patch_embed(self, img_size=224, patch_size=16, enc_embed_dim=768): + self.patch_embed = PatchEmbed(img_size, patch_size, 3, enc_embed_dim) + + def _set_mask_generator(self, num_patches, mask_ratio): + self.mask_generator = RandomMask(num_patches, mask_ratio) + + def _set_mask_token(self, dec_embed_dim): + self.mask_token = nn.Parameter(torch.zeros(1, 1, dec_embed_dim)) + + def _set_decoder(self, enc_embed_dim, dec_embed_dim, dec_num_heads, dec_depth, mlp_ratio, norm_layer, norm_im2_in_dec): + self.dec_depth = dec_depth + self.dec_embed_dim = dec_embed_dim + # transfer from encoder to decoder + self.decoder_embed = nn.Linear(enc_embed_dim, dec_embed_dim, bias=True) + # transformer for the decoder + self.dec_blocks = nn.ModuleList([ + DecoderBlock(dec_embed_dim, dec_num_heads, mlp_ratio=mlp_ratio, qkv_bias=True, norm_layer=norm_layer, norm_mem=norm_im2_in_dec, rope=self.rope) + 
for i in range(dec_depth)]) + # final norm layer + self.dec_norm = norm_layer(dec_embed_dim) + + def _set_prediction_head(self, dec_embed_dim, patch_size): + self.prediction_head = nn.Linear(dec_embed_dim, patch_size**2 * 3, bias=True) + + + def initialize_weights(self): + # patch embed + self.patch_embed._init_weights() + # mask tokens + if self.mask_token is not None: torch.nn.init.normal_(self.mask_token, std=.02) + # linears and layer norms + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + # we use xavier_uniform following official JAX ViT: + torch.nn.init.xavier_uniform_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def _encode_image(self, image, do_mask=False, return_all_blocks=False): + """ + image has B x 3 x img_size x img_size + do_mask: whether to perform masking or not + return_all_blocks: if True, return the features at the end of every block + instead of just the features from the last block (eg for some prediction heads) + """ + # embed the image into patches (x has size B x Npatches x C) + # and get position if each return patch (pos has size B x Npatches x 2) + x, pos = self.patch_embed(image) + # add positional embedding without cls token + if self.enc_pos_embed is not None: + x = x + self.enc_pos_embed[None,...] 
+ # apply masking + B,N,C = x.size() + if do_mask: + masks = self.mask_generator(x) + x = x[~masks].view(B, -1, C) + posvis = pos[~masks].view(B, -1, 2) + else: + B,N,C = x.size() + masks = torch.zeros((B,N), dtype=bool) + posvis = pos + # now apply the transformer encoder and normalization + if return_all_blocks: + out = [] + for blk in self.enc_blocks: + x = blk(x, posvis) + out.append(x) + out[-1] = self.enc_norm(out[-1]) + return out, pos, masks + else: + for blk in self.enc_blocks: + x = blk(x, posvis) + x = self.enc_norm(x) + return x, pos, masks + + def _decoder(self, feat1, pos1, masks1, feat2, pos2, return_all_blocks=False): + """ + return_all_blocks: if True, return the features at the end of every block + instead of just the features from the last block (eg for some prediction heads) + + masks1 can be None => assume image1 fully visible + """ + # encoder to decoder layer + visf1 = self.decoder_embed(feat1) + f2 = self.decoder_embed(feat2) + # append masked tokens to the sequence + B,Nenc,C = visf1.size() + if masks1 is None: # downstreams + f1_ = visf1 + else: # pretraining + Ntotal = masks1.size(1) + f1_ = self.mask_token.repeat(B, Ntotal, 1).to(dtype=visf1.dtype) + f1_[~masks1] = visf1.view(B * Nenc, C) + # add positional embedding + if self.dec_pos_embed is not None: + f1_ = f1_ + self.dec_pos_embed + f2 = f2 + self.dec_pos_embed + # apply Transformer blocks + out = f1_ + out2 = f2 + if return_all_blocks: + _out, out = out, [] + for blk in self.dec_blocks: + _out, out2 = blk(_out, out2, pos1, pos2) + out.append(_out) + out[-1] = self.dec_norm(out[-1]) + else: + for blk in self.dec_blocks: + out, out2 = blk(out, out2, pos1, pos2) + out = self.dec_norm(out) + return out + + def patchify(self, imgs): + """ + imgs: (B, 3, H, W) + x: (B, L, patch_size**2 *3) + """ + p = self.patch_embed.patch_size[0] + assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % p == 0 + + h = w = imgs.shape[2] // p + x = imgs.reshape(shape=(imgs.shape[0], 3, h, p, w, p)) + x 
= torch.einsum('nchpwq->nhwpqc', x) + x = x.reshape(shape=(imgs.shape[0], h * w, p**2 * 3)) + + return x + + def unpatchify(self, x, channels=3): + """ + x: (N, L, patch_size**2 *channels) + imgs: (N, 3, H, W) + """ + patch_size = self.patch_embed.patch_size[0] + h = w = int(x.shape[1]**.5) + assert h * w == x.shape[1] + x = x.reshape(shape=(x.shape[0], h, w, patch_size, patch_size, channels)) + x = torch.einsum('nhwpqc->nchpwq', x) + imgs = x.reshape(shape=(x.shape[0], channels, h * patch_size, h * patch_size)) + return imgs + + def forward(self, img1, img2): + """ + img1: tensor of size B x 3 x img_size x img_size + img2: tensor of size B x 3 x img_size x img_size + + out will be B x N x (3*patch_size*patch_size) + masks are also returned as B x N just in case + """ + # encoder of the masked first image + feat1, pos1, mask1 = self._encode_image(img1, do_mask=True) + # encoder of the second image + feat2, pos2, _ = self._encode_image(img2, do_mask=False) + # decoder + decfeat = self._decoder(feat1, pos1, mask1, feat2, pos2) + # prediction head + out = self.prediction_head(decfeat) + # get target + target = self.patchify(img1) + return out, mask1, target + + def forward_features(self, x, masks=None): + x_norm, pos1, mask1 = self._encode_image(x, do_mask=False) + return { + "x_norm_patchtokens": x_norm, + } + + @property + def device(self): + return self.mask_token.device \ No newline at end of file diff --git a/vggt/encoders/croco/masking.py b/vggt/encoders/croco/masking.py new file mode 100644 index 00000000..493b203f --- /dev/null +++ b/vggt/encoders/croco/masking.py @@ -0,0 +1,25 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
class RandomMask(nn.Module):
    """Uniformly random patch masking with a fixed masked count.

    Each call draws independent uniform scores per (batch, patch) and marks
    the positions holding the ``num_mask`` smallest permutation indices, so
    every sample always hides exactly ``int(mask_ratio * num_patches)``
    patches at uniformly random positions.
    """

    def __init__(self, num_patches, mask_ratio):
        super().__init__()
        self.num_patches = num_patches
        # Fixed number of masked patches, derived once from the ratio.
        self.num_mask = int(mask_ratio * self.num_patches)

    def __call__(self, x):
        # One uniform score per (batch, patch); argsort turns the scores into
        # a random permutation, and thresholding its values yields a boolean
        # mask with exactly `num_mask` True entries per row.
        scores = torch.rand(x.size(0), self.num_patches, device=x.device)
        ranks = torch.argsort(scores, dim=1)
        return ranks < self.num_mask
def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    """Build a 2D sine-cosine embedding by splitting channels between axes.

    Half of ``embed_dim`` encodes the first grid axis (grid[0]) and half the
    second axis (grid[1]); the two halves are concatenated per position.

    grid: array of shape (2, ...) holding the two coordinate planes
    returns: (N, embed_dim) where N is the number of grid positions
    """
    assert embed_dim % 2 == 0
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)
    return np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """Classic transformer sinusoidal embedding for scalar positions.

    embed_dim: output dimension per position (must be even)
    pos: positions of any shape; flattened to (M,)
    returns: (M, embed_dim) — first half sines, second half cosines
    """
    assert embed_dim % 2 == 0
    half = embed_dim // 2
    # Geometric frequency ladder: 10000 ** (-i / (D/2)) for i in [0, D/2).
    freqs = 1.0 / 10000 ** (np.arange(half, dtype=float) / half)  # (D/2,)
    angles = np.outer(np.asarray(pos).reshape(-1), freqs)  # (M, D/2)
    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)
try:
    from models.curope import cuRoPE2D
    RoPE2D = cuRoPE2D
except ImportError:
    print('Warning, cannot find cuda-compiled version of RoPE2D, using a slow pytorch version instead')

    class RoPE2D(torch.nn.Module):
        """Pure-PyTorch fallback implementing 2D rotary position embeddings.

        The feature dimension is split in two halves: the first half is
        rotated by each token's y coordinate and the second half by its x
        coordinate. cos/sin tables are built lazily and cached per
        (dim, sequence length, device, dtype).
        """

        def __init__(self, freq=100.0, F0=1.0):
            super().__init__()
            self.base = freq
            self.F0 = F0
            self.cache = {}

        def get_cos_sin(self, D, seq_len, device, dtype):
            # Memoize the lookup tables; rebuilding them per call would
            # dominate the cost of this slow fallback.
            key = (D, seq_len, device, dtype)
            if key not in self.cache:
                inv_freq = self.F0 / (self.base ** (torch.arange(0, D, 2).float().to(device) / D))
                t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype)
                angles = torch.einsum("i,j->ij", t, inv_freq).to(dtype)
                # Duplicate so each frequency pairs with both rotated halves.
                angles = torch.cat((angles, angles), dim=-1)
                self.cache[key] = (angles.cos(), angles.sin())  # (Seq, Dim)
            return self.cache[key]

        @staticmethod
        def rotate_half(x):
            # [a, b] -> [-b, a] on the two halves of the last dimension.
            first, second = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
            return torch.cat((-second, first), dim=-1)

        def apply_rope1d(self, tokens, pos1d, cos, sin):
            assert pos1d.ndim == 2
            # Gather per-position rows, then broadcast over the head axis.
            cos_sel = torch.nn.functional.embedding(pos1d, cos)[:, None, :, :]
            sin_sel = torch.nn.functional.embedding(pos1d, sin)[:, None, :, :]
            return (tokens * cos_sel) + (self.rotate_half(tokens) * sin_sel)

        def forward(self, tokens, positions):
            """
            input:
                * tokens: batch_size x nheads x ntokens x dim
                * positions: batch_size x ntokens x 2 (y and x position of each token)
            output:
                * tokens after applying RoPE2D (batch_size x nheads x ntokens x dim)
            """
            assert tokens.size(3) % 2 == 0, "number of dimensions should be a multiple of two"
            D = tokens.size(3) // 2
            assert positions.ndim == 3 and positions.shape[-1] == 2  # Batch, Seq, 2
            cos, sin = self.get_cos_sin(D, int(positions.max()) + 1, tokens.device, tokens.dtype)
            # Rotate each half of the features with its own 1D coordinate.
            y, x = tokens.chunk(2, dim=-1)
            y = self.apply_rope1d(y, positions[:, :, 0], cos, sin)
            x = self.apply_rope1d(x, positions[:, :, 1], cos, sin)
            return torch.cat((y, x), dim=-1)
# RoPE-related functions:
def rope_rotate_half(x: Tensor) -> Tensor:
    """Rotate the two halves of the last dim: [x0..x2, x3..x5] -> [-x3..-x5, x0..x2]."""
    first, second = x.chunk(2, dim=-1)
    return torch.cat([-second, first], dim=-1)


def rope_apply(x: Tensor, sin: Tensor, cos: Tensor) -> Tensor:
    """Apply a rotary embedding: element-wise rotation by the (sin, cos) angles.

    sin/cos are laid out with each frequency repeated across both halves,
    e.g. [sin0, sin1, sin2, sin0, sin1, sin2], matching rope_rotate_half.
    """
    return (x * cos) + (rope_rotate_half(x) * sin)


class LinearKMaskedBias(nn.Linear):
    """Linear layer whose bias is element-wise gated by a registered buffer.

    out_features must be divisible by 3 — this targets a packed q/k/v
    projection (the name suggests the mask zeroes the key-bias slice;
    confirm against the caller). ``bias_mask`` is deliberately initialized
    to NaN so a forgotten mask assignment surfaces immediately in outputs.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.out_features % 3 == 0
        if self.bias is not None:
            # NaN fill makes any missing mask initialization loudly visible.
            self.register_buffer("bias_mask", torch.full_like(self.bias, fill_value=math.nan))

    def forward(self, input: Tensor) -> Tensor:
        if self.bias is None:
            return F.linear(input, self.weight, None)
        gated_bias = self.bias * self.bias_mask.to(self.bias.dtype)
        return F.linear(input, self.weight, gated_bias)
int, + num_heads: int = 8, + qkv_bias: bool = False, + proj_bias: bool = True, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + mask_k_bias: bool = False, + device=None, + ) -> None: + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + linear_class = LinearKMaskedBias if mask_k_bias else nn.Linear + self.qkv = linear_class(dim, dim * 3, bias=qkv_bias, device=device) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim, bias=proj_bias, device=device) + self.proj_drop = nn.Dropout(proj_drop) + + def apply_rope(self, q: Tensor, k: Tensor, rope: Tensor | Tuple[Tensor, Tensor]) -> Tuple[Tensor, Tensor]: + # All operations will use the dtype of rope, the output is cast back to the dtype of q and k + q_dtype = q.dtype + k_dtype = k.dtype + sin, cos = rope + rope_dtype = sin.dtype + q = q.to(dtype=rope_dtype) + k = k.to(dtype=rope_dtype) + N = q.shape[-2] + prefix = N - sin.shape[-2] + assert prefix >= 0 + q_prefix = q[:, :, :prefix, :] + + q = rope_apply(q[:, :, prefix:, :], sin, cos) # [B, head, hw, D//head] + q = torch.cat((q_prefix, q), dim=-2) # [B, head, N, D//head] + k_prefix = k[:, :, :prefix, :] + k = rope_apply(k[:, :, prefix:, :], sin, cos) # [B, head, hw, D//head] + k = torch.cat((k_prefix, k), dim=-2) # [B, head, N, D//head] + q = q.to(dtype=q_dtype) + k = k.to(dtype=k_dtype) + return q, k + + def forward(self, x: Tensor, attn_bias=None, rope: Tensor = None) -> Tensor: + qkv = self.qkv(x) + attn_v = self.compute_attention(qkv=qkv, attn_bias=attn_bias, rope=rope) + x = self.proj(attn_v) + x = self.proj_drop(x) + return x + + def forward_list(self, x_list, attn_bias=None, rope_list=None) -> List[Tensor]: + assert len(x_list) == len(rope_list) # should be enforced by the Block + x_flat, shapes, num_tokens = cat_keep_shapes(x_list) + qkv_flat = self.qkv(x_flat) + qkv_list = uncat_with_shapes(qkv_flat, shapes, num_tokens) + att_out = [] + for _, (qkv, _, rope) in 
enumerate(zip(qkv_list, shapes, rope_list)): + att_out.append(self.compute_attention(qkv, attn_bias=attn_bias, rope=rope)) + x_flat, shapes, num_tokens = cat_keep_shapes(att_out) + x_flat = self.proj(x_flat) + return uncat_with_shapes(x_flat, shapes, num_tokens) + + def compute_attention(self, qkv: Tensor, attn_bias=None, rope=None) -> Tensor: + assert attn_bias is None + B, N, _ = qkv.shape + C = self.qkv.in_features + + qkv = qkv.reshape(B, N, 3, self.num_heads, C // self.num_heads) + q, k, v = torch.unbind(qkv, 2) + q, k, v = [t.transpose(1, 2) for t in [q, k, v]] + if rope is not None: + q, k = self.apply_rope(q, k, rope) + + # self._last_q = q.detach().cpu() + # self._last_k = k.detach().cpu() + x = torch.nn.functional.scaled_dot_product_attention(q, k, v) + x = x.transpose(1, 2) + return x.reshape([B, N, C]) + + +class CausalSelfAttention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + proj_bias: bool = True, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + ) -> None: + super().__init__() + self.dim = dim + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = attn_drop + self.proj = nn.Linear(dim, dim, bias=proj_bias) + self.proj_drop = nn.Dropout(proj_drop) + + def init_weights( + self, init_attn_std: float | None = None, init_proj_std: float | None = None, factor: float = 1.0 + ) -> None: + init_attn_std = init_attn_std or (self.dim**-0.5) + init_proj_std = init_proj_std or init_attn_std * factor + nn.init.normal_(self.qkv.weight, std=init_attn_std) + nn.init.normal_(self.proj.weight, std=init_proj_std) + if self.qkv.bias is not None: + nn.init.zeros_(self.qkv.bias) + if self.proj.bias is not None: + nn.init.zeros_(self.proj.bias) + + def forward(self, x: Tensor, is_causal: bool = True) -> Tensor: + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) + q, k, 
v = torch.unbind(qkv, 2) + q, k, v = [t.transpose(1, 2) for t in [q, k, v]] + x = torch.nn.functional.scaled_dot_product_attention( + q, k, v, attn_mask=None, dropout_p=self.attn_drop if self.training else 0, is_causal=is_causal + ) + x = x.transpose(1, 2).contiguous().view(B, N, C) + x = self.proj_drop(self.proj(x)) + return x diff --git a/vggt/encoders/mum/layers/block.py b/vggt/encoders/mum/layers/block.py new file mode 100644 index 00000000..21189d00 --- /dev/null +++ b/vggt/encoders/mum/layers/block.py @@ -0,0 +1,273 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This software may be used and distributed in accordance with +# the terms of the DINOv3 License Agreement. + +from typing import Callable, List, Optional + +import torch +from torch import Tensor, nn + +from ..utils import cat_keep_shapes, uncat_with_shapes + +from .attention import CausalSelfAttention, SelfAttention +from .ffn_layers import Mlp +from .layer_scale import LayerScale # , DropPath + +torch._dynamo.config.automatic_dynamic_shapes = False +torch._dynamo.config.accumulated_cache_size_limit = 1024 + + +class SelfAttentionBlock(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + ffn_ratio: float = 4.0, + qkv_bias: bool = False, + proj_bias: bool = True, + ffn_bias: bool = True, + drop: float = 0.0, + attn_drop: float = 0.0, + init_values=None, + drop_path: float = 0.0, + act_layer: Callable[..., nn.Module] = nn.GELU, + norm_layer: Callable[..., nn.Module] = nn.LayerNorm, + attn_class: Callable[..., nn.Module] = SelfAttention, + ffn_layer: Callable[..., nn.Module] = Mlp, + mask_k_bias: bool = False, + device=None, + ) -> None: + super().__init__() + # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}") + self.norm1 = norm_layer(dim) + self.attn = attn_class( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + proj_bias=proj_bias, + attn_drop=attn_drop, + proj_drop=drop, + mask_k_bias=mask_k_bias, + device=device, + ) + self.ls1 = 
LayerScale(dim, init_values=init_values, device=device) if init_values else nn.Identity() + + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * ffn_ratio) + self.mlp = ffn_layer( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + bias=ffn_bias, + device=device, + ) + self.ls2 = LayerScale(dim, init_values=init_values, device=device) if init_values else nn.Identity() + + self.sample_drop_ratio = drop_path + + @staticmethod + def _maybe_index_rope(rope: tuple[Tensor, Tensor] | None, indices: Tensor) -> tuple[Tensor, Tensor] | None: + if rope is None: + return None + + sin, cos = rope + assert sin.ndim == cos.ndim + if sin.ndim == 4: + # If the rope embedding has a batch dimension (is different for each batch element), index into it + return sin[indices], cos[indices] # [batch, heads, patches, embed_dim] + else: + # No batch dimension, do not index + return sin, cos # [heads, patches, embed_dim] or [patches, embed_dim] + + def _forward(self, x: Tensor, rope=None) -> Tensor: + """ + This is the reference implementation for a single tensor, matching what is done below for a list. + We call the list op on [x] instead of this function. 
+ """ + b, _, _ = x.shape + sample_subset_size = max(int(b * (1 - self.sample_drop_ratio)), 1) + residual_scale_factor = b / sample_subset_size + + if self.training and self.sample_drop_ratio > 0.0: + indices_1 = (torch.randperm(b, device=x.device))[:sample_subset_size] + + x_subset_1 = x[indices_1] + rope_subset = self._maybe_index_rope(rope, indices_1) + residual_1 = self.attn(self.norm1(x_subset_1), rope=rope_subset) + + x_attn = torch.index_add( + x, + dim=0, + source=self.ls1(residual_1), + index=indices_1, + alpha=residual_scale_factor, + ) + + indices_2 = (torch.randperm(b, device=x.device))[:sample_subset_size] + + x_subset_2 = x_attn[indices_2] + residual_2 = self.mlp(self.norm2(x_subset_2)) + + x_ffn = torch.index_add( + x_attn, + dim=0, + source=self.ls2(residual_2), + index=indices_2, + alpha=residual_scale_factor, + ) + else: + x_attn = x + self.ls1(self.attn(self.norm1(x), rope=rope)) + x_ffn = x_attn + self.ls2(self.mlp(self.norm2(x_attn))) + + return x_ffn + + def _forward_list(self, x_list: List[Tensor], rope_list=None) -> List[Tensor]: + """ + This list operator concatenates the tokens from the list of inputs together to save + on the elementwise operations. Torch-compile memory-planning allows hiding the overhead + related to concat ops. 
+ """ + b_list = [x.shape[0] for x in x_list] + sample_subset_sizes = [max(int(b * (1 - self.sample_drop_ratio)), 1) for b in b_list] + residual_scale_factors = [b / sample_subset_size for b, sample_subset_size in zip(b_list, sample_subset_sizes)] + + if self.training and self.sample_drop_ratio > 0.0: + indices_1_list = [ + (torch.randperm(b, device=x.device))[:sample_subset_size] + for x, b, sample_subset_size in zip(x_list, b_list, sample_subset_sizes) + ] + x_subset_1_list = [x[indices_1] for x, indices_1 in zip(x_list, indices_1_list)] + + if rope_list is not None: + rope_subset_list = [ + self._maybe_index_rope(rope, indices_1) for rope, indices_1 in zip(rope_list, indices_1_list) + ] + else: + rope_subset_list = rope_list + + flattened, shapes, num_tokens = cat_keep_shapes(x_subset_1_list) + norm1 = uncat_with_shapes(self.norm1(flattened), shapes, num_tokens) + residual_1_list = self.attn.forward_list(norm1, rope_list=rope_subset_list) + + residual_1_list = [r.to(dtype=x_list[0].dtype) for r in residual_1_list] + + x_attn_list = [ + torch.index_add( + x, + dim=0, + source=self.ls1(residual_1), + index=indices_1, + alpha=residual_scale_factor, + ) + for x, residual_1, indices_1, residual_scale_factor in zip( + x_list, residual_1_list, indices_1_list, residual_scale_factors + ) + ] + + indices_2_list = [ + (torch.randperm(b, device=x.device))[:sample_subset_size] + for x, b, sample_subset_size in zip(x_list, b_list, sample_subset_sizes) + ] + x_subset_2_list = [x[indices_2] for x, indices_2 in zip(x_attn_list, indices_2_list)] + flattened, shapes, num_tokens = cat_keep_shapes(x_subset_2_list) + norm2_flat = self.norm2(flattened) + norm2_list = uncat_with_shapes(norm2_flat, shapes, num_tokens) + + residual_2_list = self.mlp.forward_list(norm2_list) + + residual_2_list = [r.to(dtype=x_attn_list[0].dtype) for r in residual_2_list] + + x_ffn = [ + torch.index_add( + x_attn, + dim=0, + source=self.ls2(residual_2), + index=indices_2, + alpha=residual_scale_factor, + 
) + for x_attn, residual_2, indices_2, residual_scale_factor in zip( + x_attn_list, residual_2_list, indices_2_list, residual_scale_factors + ) + ] + else: + x_out = [] + for x, rope in zip(x_list, rope_list): + x_attn = x + self.ls1(self.attn(self.norm1(x), rope=rope)) + x_ffn = x_attn + self.ls2(self.mlp(self.norm2(x_attn))) + x_out.append(x_ffn) + x_ffn = x_out + + return x_ffn + + def forward(self, x_or_x_list, rope_or_rope_list=None) -> List[Tensor]: + if isinstance(x_or_x_list, Tensor): + # for reference: + # return self._forward(x_or_x_list, rope=rope_or_rope_list) + # in order to match implementations we call the list op: + return self._forward_list([x_or_x_list], rope_list=[rope_or_rope_list])[0] + elif isinstance(x_or_x_list, list): + if rope_or_rope_list is None: + rope_or_rope_list = [None for x in x_or_x_list] + # return [self._forward(x, rope=rope) for x, rope in zip(x_or_x_list, rope_or_rope_list)] + return self._forward_list(x_or_x_list, rope_list=rope_or_rope_list) + else: + raise AssertionError + + +class CausalSelfAttentionBlock(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + ffn_ratio: float = 4.0, + ls_init_value: Optional[float] = None, + is_causal: bool = True, + act_layer: Callable = nn.GELU, + norm_layer: Callable = nn.LayerNorm, + dropout_prob: float = 0.0, + ): + super().__init__() + + self.dim = dim + self.is_causal = is_causal + self.ls1 = LayerScale(dim, init_values=ls_init_value) if ls_init_value else nn.Identity() + self.attention_norm = norm_layer(dim) + self.attention = CausalSelfAttention(dim, num_heads, attn_drop=dropout_prob, proj_drop=dropout_prob) + + self.ffn_norm = norm_layer(dim) + ffn_hidden_dim = int(dim * ffn_ratio) + self.feed_forward = Mlp( + in_features=dim, + hidden_features=ffn_hidden_dim, + drop=dropout_prob, + act_layer=act_layer, + ) + + self.ls2 = LayerScale(dim, init_values=ls_init_value) if ls_init_value else nn.Identity() + + def init_weights( + self, + init_attn_std: float | None = None, 
+ init_proj_std: float | None = None, + init_fc_std: float | None = None, + factor: float = 1.0, + ) -> None: + init_attn_std = init_attn_std or (self.dim**-0.5) + init_proj_std = init_proj_std or init_attn_std * factor + init_fc_std = init_fc_std or (2 * self.dim) ** -0.5 + self.attention.init_weights(init_attn_std, init_proj_std) + self.attention_norm.reset_parameters() + nn.init.normal_(self.feed_forward.fc1.weight, std=init_fc_std) + nn.init.normal_(self.feed_forward.fc2.weight, std=init_proj_std) + self.ffn_norm.reset_parameters() + + def forward( + self, + x: torch.Tensor, + ): + + x_attn = x + self.ls1(self.attention(self.attention_norm(x), self.is_causal)) + x_ffn = x_attn + self.ls2(self.feed_forward(self.ffn_norm(x_attn))) + return x_ffn diff --git a/vggt/encoders/mum/layers/dino_head.py b/vggt/encoders/mum/layers/dino_head.py new file mode 100644 index 00000000..bb71f35f --- /dev/null +++ b/vggt/encoders/mum/layers/dino_head.py @@ -0,0 +1,67 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This software may be used and distributed in accordance with +# the terms of the DINOv3 License Agreement. 
class DINOHead(nn.Module):
    """DINO projection head: MLP bottleneck, L2 normalization, then a final linear.

    forward(x) computes ``last_layer(normalize(mlp(x)))`` by default; the
    ``no_last_layer`` / ``only_last_layer`` flags run either stage alone.
    """

    def __init__(
        self,
        in_dim,
        out_dim,
        use_bn=False,
        nlayers=3,
        hidden_dim=2048,
        bottleneck_dim=256,
        mlp_bias=True,
    ):
        super().__init__()
        nlayers = max(nlayers, 1)
        self.mlp = _build_mlp(
            nlayers,
            in_dim,
            bottleneck_dim,
            hidden_dim=hidden_dim,
            use_bn=use_bn,
            bias=mlp_bias,
        )
        # Prototype projection; bias-free by construction.
        self.last_layer = nn.Linear(bottleneck_dim, out_dim, bias=False)

    def init_weights(self) -> None:
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            trunc_normal_(module.weight, std=0.02)
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)

    def forward(self, x, no_last_layer=False, only_last_layer=False):
        if not only_last_layer:
            x = self.mlp(x)
            # float16 needs a larger eps to avoid underflow in the norm.
            eps = 1e-6 if x.dtype == torch.float16 else 1e-12
            x = nn.functional.normalize(x, dim=-1, p=2, eps=eps)
        if not no_last_layer:
            x = self.last_layer(x)
        return x


def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True):
    """Stack `nlayers` linears (GELU + optional BatchNorm between) ending at the bottleneck."""
    if nlayers == 1:
        return nn.Linear(in_dim, bottleneck_dim, bias=bias)
    dims = [in_dim] + [hidden_dim] * (nlayers - 1) + [bottleneck_dim]
    layers = []
    for i in range(nlayers):
        layers.append(nn.Linear(dims[i], dims[i + 1], bias=bias))
        if i < nlayers - 1:  # no activation after the final projection
            if use_bn:
                layers.append(nn.BatchNorm1d(hidden_dim))
            layers.append(nn.GELU())
    return nn.Sequential(*layers)
class ListForwardMixin(object):
    """Adds a list-of-tensors forward that runs the module once on a fused batch.

    Inputs are concatenated with ``cat_keep_shapes``, pushed through a single
    ``forward`` call, and split back with ``uncat_with_shapes``.
    """

    def forward(self, x: Tensor):
        raise NotImplementedError

    def forward_list(self, x_list: List[Tensor]) -> List[Tensor]:
        flat, shapes, token_counts = cat_keep_shapes(x_list)
        return uncat_with_shapes(self.forward(flat), shapes, token_counts)


class Mlp(nn.Module, ListForwardMixin):
    """Standard transformer MLP: fc1 -> activation -> dropout -> fc2 -> dropout."""

    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        drop: float = 0.0,
        bias: bool = True,
        device=None,
    ) -> None:
        super().__init__()
        # Hidden/output widths default to the input width when unspecified.
        hidden_features = hidden_features or in_features
        out_features = out_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias, device=device)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias, device=device)
        self.drop = nn.Dropout(drop)

    def forward(self, x: Tensor) -> Tensor:
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
bias=bias, device=device) + + def forward(self, x: Tensor) -> Tensor: + x1 = self.w1(x) + x2 = self.w2(x) + hidden = F.silu(x1) * x2 + return self.w3(hidden) diff --git a/vggt/encoders/mum/layers/fp8_linear.py b/vggt/encoders/mum/layers/fp8_linear.py new file mode 100644 index 00000000..0fff8a0e --- /dev/null +++ b/vggt/encoders/mum/layers/fp8_linear.py @@ -0,0 +1,141 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This software may be used and distributed in accordance with +# the terms of the DINOv3 License Agreement. + +import re + +import torch + +from .attention import LinearKMaskedBias +from ..utils import named_replace + +# avoid division by zero when calculating scale +EPS = 1e-12 + + +def scale(t, amax_t): + max_v = torch.finfo(torch.float8_e4m3fn).max + scale_t = torch.clamp(amax_t.float(), min=EPS) / max_v + t_fp8 = (t / scale_t).to(torch.float8_e4m3fn) + return t_fp8, scale_t + + +def matmul(first, amax_first, second_t, amax_second_t, bias): + first_fp8, scale_first = scale(first, amax_first) + second_t_fp8, scale_second_t = scale(second_t, amax_second_t) + # PyTorch's row-wise scaled matmul kernel is based on CUTLASS and is quite + # slow. Hence we fall back to an "unscaled" matmul, which uses cuBLAS, and + # apply the scale manually afterwards. 
+ output = torch._scaled_mm( + first_fp8, + second_t_fp8.t(), + scale_a=scale_first.new_ones((1, 1)), + scale_b=scale_second_t.t().new_ones((1, 1)), + bias=None, + out_dtype=torch.bfloat16, + use_fast_accum=False, + ) + output = (output * scale_first * scale_second_t.t()).to(torch.bfloat16) + if bias is not None: + output = output + bias + return output + + +@torch.compiler.allow_in_graph +class Fp8LinearFn(torch.autograd.Function): + @staticmethod + def forward(ctx, a, b_t, bias): + amax_a = a.abs().amax(dim=-1, keepdim=True) + amax_b_t = b_t.abs().amax(dim=-1, keepdim=True) + out = matmul(a, amax_a, b_t, amax_b_t, bias) + + ctx.a_requires_grad = a.requires_grad + ctx.b_requires_grad = b_t.requires_grad + ctx.bias_requires_grad = bias.requires_grad if bias is not None else False + + ctx.save_for_backward(a, b_t, amax_b_t.max()) + + return out + + @staticmethod + def backward(ctx, grad_out): + a, b_t, amax_b = ctx.saved_tensors + + if ctx.a_requires_grad: + b = b_t.t().contiguous() + amax_grad_out = grad_out.abs().amax(dim=-1, keepdim=True) + amax_b = amax_b.repeat(b.shape[0], 1) + grad_a = matmul(grad_out, amax_grad_out, b, amax_b, None) + else: + grad_a = None + if ctx.b_requires_grad: + grad_b = grad_out.t() @ a + else: + grad_b = None + if ctx.bias_requires_grad: + grad_bias = grad_out.sum(dim=0) + else: + grad_bias = None + + return grad_a, grad_b, grad_bias + + +class Fp8Linear(torch.nn.Linear): + def forward(self, input: torch.Tensor) -> torch.Tensor: + out = Fp8LinearFn.apply(input.flatten(end_dim=-2), self.weight, self.bias) + out = out.unflatten(0, input.shape[:-1]) + return out + + +class Fp8LinearKMaskedBias(LinearKMaskedBias): + def forward(self, input: torch.Tensor) -> torch.Tensor: + masked_bias = self.bias * self.bias_mask if self.bias is not None else None + out = Fp8LinearFn.apply(input.flatten(end_dim=-2), self.weight, masked_bias) + out = out.unflatten(0, input.shape[:-1]) + return out + + +def convert_linears_to_fp8(root_module: 
torch.nn.Module, *, filter: str) -> torch.nn.Module: + filter_re = re.compile(filter) + total_count = 0 + + def replace(module: torch.nn.Module, name: str) -> torch.nn.Module: + nonlocal total_count + if not isinstance(module, torch.nn.Linear) or not filter_re.search(name): + return module + if type(module) == torch.nn.Linear: + new_cls = Fp8Linear + elif type(module) == LinearKMaskedBias: + new_cls = Fp8LinearKMaskedBias + else: + assert False, str(type(module)) + if module.in_features % 64 != 0 or module.out_features % 64 != 0: + # This is not a strict requirement, but H100 TensorCores for fp8 + # operate on tiles of 64 elements anyways, and Inductor sometimes + # pads inner dims to become multiples of 64. Also, if one day we + # switch back to cuBLAS, it artificially requires dims to be + # multiples of 16. + raise RuntimeError( + "fp8 requires all dimensions to be multiples of 64 " "(consider using ffn_layer=swiglu64 or higher)" + ) + new_module = new_cls( + in_features=module.in_features, + out_features=module.out_features, + bias=module.bias is not None, + dtype=module.weight.dtype, + device=module.weight.device, + ) + new_module.weight = module.weight + new_module.bias = module.bias + total_count += 1 + return new_module + + out = named_replace(replace, root_module) + assert total_count > 0, "fp8: no layer found to convert" + # Force re-compile everything + torch._dynamo.reset_code_caches() + from torch._inductor.cudagraph_trees import reset_cudagraph_trees + + reset_cudagraph_trees() + return out diff --git a/vggt/encoders/mum/layers/layer_scale.py b/vggt/encoders/mum/layers/layer_scale.py new file mode 100644 index 00000000..0b72b7c6 --- /dev/null +++ b/vggt/encoders/mum/layers/layer_scale.py @@ -0,0 +1,29 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This software may be used and distributed in accordance with +# the terms of the DINOv3 License Agreement. 
# --- vggt/encoders/mum/layers/layer_scale.py ---
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This software may be used and distributed in accordance with
# the terms of the DINOv3 License Agreement.

from typing import Union

import torch
from torch import Tensor, nn


class LayerScale(nn.Module):
    """Learnable per-channel scaling (CaiT-style LayerScale).

    `gamma` is deliberately allocated with `torch.empty`; it is filled by
    `reset_parameters`, which the owning model invokes through its
    weight-initialization pass.
    """

    def __init__(
        self,
        dim: int,
        init_values: Union[float, Tensor] = 1e-5,
        inplace: bool = False,
        device=None,
    ) -> None:
        super().__init__()
        self.inplace = inplace
        # Left uninitialized on purpose; see class docstring.
        self.gamma = nn.Parameter(torch.empty(dim, device=device))
        self.init_values = init_values

    def reset_parameters(self):
        nn.init.constant_(self.gamma, self.init_values)

    def forward(self, x: Tensor) -> Tensor:
        # Inplace multiply saves memory but mutates the caller's tensor.
        return x.mul_(self.gamma) if self.inplace else x * self.gamma


# --- vggt/encoders/mum/layers/patch_embed.py ---
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This software may be used and distributed in accordance with
# the terms of the DINOv3 License Agreement.

import math
from typing import Callable, Tuple, Union
import torch
from torch import Tensor, nn


def make_2tuple(x):
    # Normalize an int or 2-tuple into a 2-tuple; reject anything else.
    if isinstance(x, tuple):
        assert len(x) == 2
        return x

    assert isinstance(x, int)
    return (x, x)


class PatchEmbed(nn.Module):
    """
    2D image to patch embedding: (B,C,H,W) -> (B,N,D)

    Args:
        img_size: Image size.
        patch_size: Patch token size.
        in_chans: Number of input image channels.
        embed_dim: Number of linear projection output channels.
        norm_layer: Normalization layer.
    """

    def __init__(
        self,
        img_size: Union[int, Tuple[int, int]] = 224,
        patch_size: Union[int, Tuple[int, int]] = 16,
        in_chans: int = 3,
        embed_dim: int = 768,
        norm_layer: Callable | None = None,
        flatten_embedding: bool = True,
    ) -> None:
        super().__init__()

        image_HW = make_2tuple(img_size)
        patch_HW = make_2tuple(patch_size)
        patch_grid_size = (
            image_HW[0] // patch_HW[0],
            image_HW[1] // patch_HW[1],
        )

        self.img_size = image_HW
        self.patch_size = patch_HW
        self.patches_resolution = patch_grid_size
        self.num_patches = patch_grid_size[0] * patch_grid_size[1]

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        self.flatten_embedding = flatten_embedding

        # Non-overlapping conv: kernel == stride == patch size.
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x: Tensor) -> Tensor:
        _, _, H, W = x.shape
        # NOTE(review): divisibility asserts are commented out upstream, so a
        # non-multiple input is silently truncated by the conv.
        # patch_H, patch_W = self.patch_size
        # assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
        # assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"

        x = self.proj(x)  # B C H W
        H, W = x.size(2), x.size(3)
        x = x.flatten(2).transpose(1, 2)  # B HW C
        x = self.norm(x)
        if not self.flatten_embedding:
            x = x.reshape(-1, H, W, self.embed_dim)  # B H W C
        return x

    def flops(self) -> float:
        # Conv cost per output token plus (optional) norm cost.
        Ho, Wo = self.patches_resolution
        flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
        if self.norm is not None:
            flops += Ho * Wo * self.embed_dim
        return flops

    def reset_parameters(self):
        # Uniform init matching nn.Conv2d's default fan-in scaling.
        k = 1 / (self.in_chans * (self.patch_size[0] ** 2))
        nn.init.uniform_(self.proj.weight, -math.sqrt(k), math.sqrt(k))
        if self.proj.bias is not None:
            nn.init.uniform_(self.proj.bias, -math.sqrt(k), math.sqrt(k))


class DINOv3PatchEmbed(nn.Module):
    """Patch embedding backed by a frozen DINOv3 ViT-L/16 backbone.

    The backbone is wrapped in a plain Python list so it is NOT registered as
    a submodule: it stays out of `state_dict()`, the optimizer, and `.to()`
    calls (hence the lazy device move in `forward`).

    NOTE(review): the hub repo and checkpoint paths are hard-coded to a
    specific cluster filesystem — this will fail anywhere else; consider
    making them configurable.
    """

    def __init__(
        self,
        img_size: Union[int, Tuple[int, int]] = 224,
        patch_size: Union[int, Tuple[int, int]] = 16,
        in_chans: int = 3,
        embed_dim: int = 768,
        norm_layer: Callable | None = None,
        flatten_embedding: bool = True,
        **kwargs,
    ):
        super().__init__()

        image_HW = make_2tuple(img_size)
        patch_HW = make_2tuple(patch_size)
        patch_grid_size = (
            image_HW[0] // patch_HW[0],
            image_HW[1] // patch_HW[1],
        )

        self.img_size = image_HW
        self.patch_size = patch_HW
        self.patches_resolution = patch_grid_size
        self.num_patches = patch_grid_size[0] * patch_grid_size[1]

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        self.flatten_embedding = flatten_embedding

        # List wrapper keeps the frozen backbone off the module tree.
        self.backbone = [torch.hub.load("/mimer/NOBACKUP/groups/snic2022-6-266/davnords/dinov3", "dinov3_vitl16", source='local', weights="/mimer/NOBACKUP/groups/snic2022-6-266/davnords/mv-ssl/pretrained_models/dinov3_vitl16_pretrain_lvd1689m-8aa4cbdd.pth")]
        # Project to the requested width only if the backbone width differs.
        self.proj = nn.Linear(self.backbone[0].embed_dim, embed_dim) if self.backbone[0].embed_dim != embed_dim else nn.Identity()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        _, _, H, W = x.shape
        p = self.patch_size[0]
        with torch.no_grad():
            # Lazy move: the backbone is not a submodule, so .to() on the
            # parent never reaches it. NOTE(review): dtype is only synced when
            # the device differs — a dtype-only mismatch is not handled.
            if next(self.backbone[0].parameters()).device != x.device:
                self.backbone[0] = self.backbone[0].to(x.device).to(x.dtype)
            x = self.backbone[0].forward_features(x)['x_norm_patchtokens']
        x = self.proj(x)
        if not self.flatten_embedding:
            x = x.reshape(-1, H//p, W//p, self.embed_dim)
        return x
import torch
from torch import Tensor, nn


class RMSNorm(nn.Module):
    """Root-mean-square layer norm: rescales by 1/RMS over the last dim and
    applies a learned per-channel gain. No mean subtraction, no bias."""

    def __init__(self, dim: int, eps: float = 1e-5):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def reset_parameters(self) -> None:
        # Restore the identity gain used at construction time.
        nn.init.constant_(self.weight, 1)

    def _norm(self, x: Tensor) -> Tensor:
        # eps inside the sqrt guards against all-zero rows.
        mean_square = x.pow(2).mean(dim=-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.eps)
        return x * inv_rms

    def forward(self, x: Tensor) -> Tensor:
        # Normalize in fp32 for numerical stability, cast back, then rescale.
        normalized = self._norm(x.float()).type_as(x)
        return normalized * self.weight
class RopePositionEmbedding(nn.Module):
    """Axial RoPE over a 2D patch grid, with no learnable weights.

    Parametrized either by `base` (classic RoPE geometric frequencies) or by
    an explicit `min_period`/`max_period` range — exactly one of the two.
    During training, optional coordinate augmentations (shift / jitter /
    rescale) can be applied.
    """

    def __init__(
        self,
        embed_dim: int,
        *,
        num_heads: int,
        base: float | None = 100.0,
        min_period: float | None = None,
        max_period: float | None = None,
        normalize_coords: Literal["min", "max", "separate"] = "separate",
        shift_coords: float | None = None,
        jitter_coords: float | None = None,
        rescale_coords: float | None = None,
        dtype: torch.dtype | None = None,
        device: torch.device | None = None,
    ):
        super().__init__()
        # 4 = 2 spatial axes x (sin, cos) halves per head dimension.
        assert embed_dim % (4 * num_heads) == 0
        both_periods = min_period is not None and max_period is not None
        if (base is None and not both_periods) or (base is not None and both_periods):
            raise ValueError("Either `base` or `min_period`+`max_period` must be provided.")

        D_head = embed_dim // num_heads
        self.base = base
        self.min_period = min_period
        self.max_period = max_period
        self.D_head = D_head
        self.normalize_coords = normalize_coords
        self.shift_coords = shift_coords
        self.jitter_coords = jitter_coords
        self.rescale_coords = rescale_coords

        # Needs persistent=True because we do teacher.load_state_dict(student.state_dict()) to initialize the teacher
        self.dtype = dtype  # Don't rely on self.periods.dtype
        self.register_buffer(
            "periods",
            torch.empty(D_head // 4, device=device, dtype=dtype),
            persistent=True,
        )
        self._init_weights()

    def forward(self, *, H: int, W: int) -> tuple[Tensor, Tensor]:
        """Return (sin, cos), each of shape [H*W, D_head], for an HxW grid."""
        device = self.periods.device
        dtype = self.dtype
        dd = {"device": device, "dtype": dtype}

        # Prepare coords in range [-1, +1]
        if self.normalize_coords == "max":
            max_HW = max(H, W)
            coords_h = torch.arange(0.5, H, **dd) / max_HW  # [H]
            coords_w = torch.arange(0.5, W, **dd) / max_HW  # [W]
        elif self.normalize_coords == "min":
            min_HW = min(H, W)
            coords_h = torch.arange(0.5, H, **dd) / min_HW  # [H]
            coords_w = torch.arange(0.5, W, **dd) / min_HW  # [W]
        elif self.normalize_coords == "separate":
            # Each axis normalized by its own extent (aspect ratio discarded).
            coords_h = torch.arange(0.5, H, **dd) / H  # [H]
            coords_w = torch.arange(0.5, W, **dd) / W  # [W]
        else:
            raise ValueError(f"Unknown normalize_coords: {self.normalize_coords}")
        coords = torch.stack(torch.meshgrid(coords_h, coords_w, indexing="ij"), dim=-1)  # [H, W, 2]
        coords = coords.flatten(0, 1)  # [HW, 2]
        coords = 2.0 * coords - 1.0  # Shift range [0, 1] to [-1, +1]

        # Shift coords by adding a uniform value in [-shift, shift]
        if self.training and self.shift_coords is not None:
            shift_hw = torch.empty(2, **dd).uniform_(-self.shift_coords, self.shift_coords)
            coords += shift_hw[None, :]

        # Jitter coords by multiplying the range [-1, 1] by a log-uniform value in [1/jitter, jitter]
        if self.training and self.jitter_coords is not None:
            jitter_max = np.log(self.jitter_coords)
            jitter_min = -jitter_max
            jitter_hw = torch.empty(2, **dd).uniform_(jitter_min, jitter_max).exp()
            coords *= jitter_hw[None, :]

        # Rescale coords by multiplying the range [-1, 1] by a log-uniform value in [1/rescale, rescale]
        # (single factor shared by both axes, unlike jitter)
        if self.training and self.rescale_coords is not None:
            rescale_max = np.log(self.rescale_coords)
            rescale_min = -rescale_max
            rescale_hw = torch.empty(1, **dd).uniform_(rescale_min, rescale_max).exp()
            coords *= rescale_hw

        # Prepare angles and sin/cos
        angles = 2 * math.pi * coords[:, :, None] / self.periods[None, None, :]  # [HW, 2, D//4]
        angles = angles.flatten(1, 2)  # [HW, D//2]
        angles = angles.tile(2)  # [HW, D]
        cos = torch.cos(angles)  # [HW, D]
        sin = torch.sin(angles)  # [HW, D]

        return (sin, cos)  # 2 * [HW, D]

    def _init_weights(self):
        # Fill the `periods` buffer; called at construction and again by the
        # owning model's init_weights().
        device = self.periods.device
        dtype = self.dtype
        if self.base is not None:
            # Classic RoPE: geometric progression of frequencies from `base`.
            periods = self.base ** (
                2 * torch.arange(self.D_head // 4, device=device, dtype=dtype) / (self.D_head // 2)
            )  # [D//4]
        else:
            # Explicit period range: log-uniform between min and max period.
            base = self.max_period / self.min_period
            exponents = torch.linspace(0, 1, self.D_head // 4, device=device, dtype=dtype)  # [D//4] range [0, 1]
            periods = base**exponents  # range [1, max_period / min_period]
            periods = periods / base  # range [min_period / max_period, 1]
            periods = periods * self.max_period  # range [min_period, max_period]
        self.periods.data = periods
# --- vggt/encoders/mum/model.py ---
import logging
from functools import partial
from typing import Any, Literal, Callable

import torch
import torch.nn.init
from torch import nn

from .layers import LayerScale, Mlp, PatchEmbed, RMSNorm, RopePositionEmbedding, SelfAttentionBlock, SwiGLUFFN, DINOv3PatchEmbed
from .utils import named_apply

logger = logging.getLogger("dinov3")

# FFN variants selectable by config string.
ffn_layer_dict = {
    "mlp": Mlp,
    "swiglu": SwiGLUFFN,
    "swiglu32": partial(SwiGLUFFN, align_to=32),
    "swiglu64": partial(SwiGLUFFN, align_to=64),
    "swiglu128": partial(SwiGLUFFN, align_to=128),
}

# Normalization variants selectable by config string.
norm_layer_dict = {
    "layernorm": partial(nn.LayerNorm, eps=1e-6),
    "layernormbf16": partial(nn.LayerNorm, eps=1e-5),
    "rmsnorm": RMSNorm,
}

# Config string -> torch dtype (used for the RoPE buffers).
dtype_dict = {
    "fp32": torch.float32,
    "fp16": torch.float16,
    "bf16": torch.bfloat16,
}


def init_weights_vit(module: nn.Module, name: str = ""):
    """Per-module initializer applied via `named_apply`: trunc-normal for
    Linear layers, `reset_parameters` for norm / LayerScale / PatchEmbed."""
    if isinstance(module, nn.Linear):
        torch.nn.init.trunc_normal_(module.weight, std=0.02)
        if module.bias is not None:
            nn.init.zeros_(module.bias)
    if isinstance(module, nn.LayerNorm):
        module.reset_parameters()
    if isinstance(module, LayerScale):
        module.reset_parameters()
    if isinstance(module, PatchEmbed):
        module.reset_parameters()
    if isinstance(module, RMSNorm):
        module.reset_parameters()

def build_model(cfg):
    """Build the model named by `cfg.model.name` (must be one of the factory
    functions in this module, e.g. vit_large) and initialize its weights.
    NOTE(review): device is hard-coded to 'cuda'."""
    vit_kwargs = dict(**cfg.model)
    vit_kwargs['device'] = 'cuda'
    # model = model_file.__dict__[cfg.model.name](**vit_kwargs)
    model = globals()[cfg.model.name](**vit_kwargs)
    model.init_weights()
    return model

class MultiViewMaskedAutoEncoder(nn.Module):
    """MAE-style masked autoencoder over multiple views of a scene.

    The encoder processes each frame independently (masked patches dropped);
    the decoder alternates frame-wise and global (cross-frame) attention
    before predicting the masked pixels.
    """

    def __init__(
        self,
        *,
        img_size: int = 224,
        patch_size: int = 16,
        in_chans: int = 3,
        embed_dim: int = 1024,
        depth: int = 24,
        num_heads: int = 16,
        decoder_embed_dim: int = 512,
        decoder_depth: int = 8,
        decoder_num_heads: int = 16,
        norm_pix_loss: bool = True,
        patch_embed: Literal['conv', 'dinov3'] = 'conv',

        pos_embed_rope_base: float = 100.0,
        pos_embed_rope_min_period: float | None = None,
        pos_embed_rope_max_period: float | None = None,
        pos_embed_rope_normalize_coords: Literal["min", "max", "separate"] = "separate",
        pos_embed_rope_shift_coords: float | None = None,
        pos_embed_rope_jitter_coords: float | None = None,
        pos_embed_rope_rescale_coords: float | None = None,
        pos_embed_rope_dtype: str = "bf16",

        ffn_ratio: float = 4.0,
        qkv_bias: bool = True,
        drop_path_rate: float = 0.0,
        layerscale_init: float | None = None,
        norm_layer: str = "layernorm",
        ffn_layer: str = "mlp",
        ffn_bias: bool = True,
        proj_bias: bool = True,
        n_storage_tokens: int = 0,
        mask_k_bias: bool = False,
        device: Any | None = None,
        **ignored_kwargs,
    ):
        super().__init__()
        # Tolerate (but report) unknown config keys.
        if len(ignored_kwargs) > 0:
            logger.warning(f"Ignored kwargs: {ignored_kwargs}")
        del ignored_kwargs

        norm_layer_cls = norm_layer_dict[norm_layer]

        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        self.n_blocks = depth
        self.num_heads = num_heads
        self.patch_size = patch_size
        self.norm_pix_loss = norm_pix_loss

        # --------------------------------------------------------------------------
        # MAE encoder specifics
        if patch_embed == "conv":
            patch_embed_cls = PatchEmbed
        elif patch_embed == "dinov3":
            patch_embed_cls = DINOv3PatchEmbed
        else:
            raise ValueError(f"Unknown patch embedding type: {patch_embed}")
        # flatten_embedding=False: patch embed returns [B, H', W', C] so the
        # grid size is available for RoPE.
        self.patch_embed = patch_embed_cls(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            flatten_embedding=False,
        )
        self.cls_token = nn.Parameter(torch.empty(1, 1, embed_dim, device=device))
        self.n_storage_tokens = n_storage_tokens
        if self.n_storage_tokens > 0:
            self.storage_tokens = nn.Parameter(torch.empty(1, n_storage_tokens, embed_dim, device=device))
        logger.info(f"using base={pos_embed_rope_base} for rope new")
        logger.info(f"using min_period={pos_embed_rope_min_period} for rope new")
        logger.info(f"using max_period={pos_embed_rope_max_period} for rope new")
        logger.info(f"using normalize_coords={pos_embed_rope_normalize_coords} for rope new")
        logger.info(f"using shift_coords={pos_embed_rope_shift_coords} for rope new")
        logger.info(f"using rescale_coords={pos_embed_rope_rescale_coords} for rope new")
        logger.info(f"using jitter_coords={pos_embed_rope_jitter_coords} for rope new")
        logger.info(f"using dtype={pos_embed_rope_dtype} for rope new")

        # Shared RoPE factory: encoder and decoder instances differ only in
        # embed_dim / num_heads.
        rope_cls = partial(
            RopePositionEmbedding,
            base=pos_embed_rope_base,
            min_period=pos_embed_rope_min_period,
            max_period=pos_embed_rope_max_period,
            normalize_coords=pos_embed_rope_normalize_coords,
            shift_coords=pos_embed_rope_shift_coords,
            jitter_coords=pos_embed_rope_jitter_coords,
            rescale_coords=pos_embed_rope_rescale_coords,
            dtype=dtype_dict[pos_embed_rope_dtype],
            device=device,
        )
        self.rope_embed = rope_cls(
            embed_dim=embed_dim,
            num_heads=num_heads,
        )
        logger.info(f"using {ffn_layer} layer as FFN")
        ffn_layer_cls = ffn_layer_dict[ffn_layer]

        block_cls = partial(SelfAttentionBlock,
            ffn_ratio=ffn_ratio,
            qkv_bias=qkv_bias,
            proj_bias=proj_bias,
            ffn_bias=ffn_bias,
            drop_path=drop_path_rate,
            norm_layer=norm_layer_cls,
            act_layer=nn.GELU,
            ffn_layer=ffn_layer_cls,
            init_values=layerscale_init,
            mask_k_bias=mask_k_bias,
            device=device,
        )
        self.blocks = nn.ModuleList([block_cls(dim=embed_dim, num_heads=num_heads) for i in range(depth)])
        self.norm = norm_layer_cls(embed_dim)

        # --------------------------------------------------------------------------
        # MAE decoder specifics
        self.rope_embed_decoder = rope_cls(
            embed_dim=decoder_embed_dim,
            num_heads=decoder_num_heads,
        )
        self.decoder_embed = nn.Linear(embed_dim, decoder_embed_dim, bias=True, device=device)
        self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim, device=device))

        # Alternating-attention decoder: half the depth is frame-local blocks,
        # the other half is global (all frames jointly) blocks.
        self.decoder_frame_blocks = nn.ModuleList([
            block_cls(dim=decoder_embed_dim, num_heads=decoder_num_heads)
            for i in range(decoder_depth//2)])

        self.decoder_global_blocks = nn.ModuleList([
            block_cls(dim=decoder_embed_dim, num_heads=decoder_num_heads)
            for i in range(decoder_depth//2)])

        self.decoder_norm = norm_layer_cls(decoder_embed_dim)
        self.decoder_pred = nn.Linear(decoder_embed_dim, patch_size**2 * in_chans, bias=True, device=device)  # decoder to patch
        # --------------------------------------------------------------------------

    def init_weights(self):
        """Initialize all weights (RoPE periods, tokens, then per-module init)."""
        self.rope_embed._init_weights()
        nn.init.normal_(self.cls_token, std=0.02)
        if self.n_storage_tokens > 0:
            nn.init.normal_(self.storage_tokens, std=0.02)
        # nn.init.zeros_(self.mask_token)
        nn.init.normal_(self.mask_token, std=.02)
        named_apply(init_weights_vit, self)

    def patchify(self, imgs):
        """
        imgs: (N, 3, H, W)
        x: (N, L, patch_size**2 *3)
        """
        p = self.patch_embed.patch_size[0]
        # assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % p == 0
        assert imgs.shape[2] % p == 0 and imgs.shape[3] % p == 0

        h, w = imgs.shape[2] // p, imgs.shape[3] // p
        x = imgs.reshape(shape=(imgs.shape[0], 3, h, p, w, p))
        x = torch.einsum('nchpwq->nhwpqc', x)
        x = x.reshape(shape=(imgs.shape[0], h * w, p**2 * 3))
        return x

    def unpatchify(self, x):
        """
        x: (N, L, patch_size**2 *3)
        imgs: (N, 3, H, W)

        NOTE(review): assumes a SQUARE patch grid (h == w == sqrt(L)); the
        final reshape uses h for both spatial dims.
        """
        p = self.patch_embed.patch_size[0]
        h = w = int(x.shape[1]**.5)
        assert h * w == x.shape[1]

        x = x.reshape(shape=(x.shape[0], h, w, p, p, 3))
        x = torch.einsum('nhwpqc->nchpwq', x)
        imgs = x.reshape(shape=(x.shape[0], 3, h * p, h * p))
        return imgs

    def random_masking(self, x, mask_ratio):
        """
        Perform per-sample random masking by per-sample shuffling.
        Per-sample shuffling is done by argsort random noise.
        x: [N, L, D], sequence
        Returns (x_masked, mask, ids_restore, ids_keep) where mask is 0 for
        kept and 1 for removed patches, in the ORIGINAL patch order.
        """
        N, L, D = x.shape  # batch, length, dim
        len_keep = int(L * (1 - mask_ratio))

        noise = torch.rand(N, L, device=x.device)  # noise in [0, 1]

        # sort noise for each sample
        ids_shuffle = torch.argsort(noise, dim=1)  # ascend: small is keep, large is remove
        ids_restore = torch.argsort(ids_shuffle, dim=1)

        # keep the first subset
        ids_keep = ids_shuffle[:, :len_keep]
        x_masked = torch.gather(x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, D))

        # generate the binary mask: 0 is keep, 1 is remove
        mask = torch.ones([N, L], device=x.device)
        mask[:, :len_keep] = 0
        # unshuffle to get the binary mask
        mask = torch.gather(mask, dim=1, index=ids_restore)

        return x_masked, mask, ids_restore, ids_keep

    def forward_encoder(self, x, mask_ratio, return_all_blocks=False):
        """Encode frames. With return_all_blocks=True no masking is applied
        and the (un-normalized) output of every block is returned as a list;
        otherwise returns (latent, mask, ids_restore)."""
        # embed patches
        SB, C_in, H, W = x.shape
        x = self.patch_embed(x)
        # Patch embed returns [SB, H', W', C]; rope needs the grid size.
        rope_sincos = self.rope_embed(H=x.shape[1], W=x.shape[2])
        x = x.flatten(1, 2)  # [SB, L, C], with L=H*W

        # masking: length -> length * mask_ratio
        if not return_all_blocks:
            x, mask, ids_restore, ids_keep = self.random_masking(x, mask_ratio)

            # Let's just drop the masked patches in the rope
            sin, cos = rope_sincos
            sin_vis, cos_vis = sin[ids_keep], cos[ids_keep]  # [B, N_vis, D_head]
            sin_vis, cos_vis = sin_vis.unsqueeze(1).repeat(1, self.num_heads, 1, 1), cos_vis.unsqueeze(1).repeat(1, self.num_heads, 1, 1)

            rope_sincos = (sin_vis, cos_vis)

        # append cls token
        # NOTE(review): rope_sincos has one entry per PATCH token only; the
        # attention block presumably skips rope for the prepended cls token —
        # confirm against SelfAttentionBlock.
        cls_tokens = self.cls_token.expand(x.shape[0], -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)

        # apply Transformer blocks
        if return_all_blocks:
            out = []
            for blk in self.blocks:
                x = blk(x, rope_sincos)
                out.append(x)
            return out
        else:
            for blk in self.blocks:
                x = blk(x, rope_sincos)
            x = self.norm(x)
            return x, mask, ids_restore

    def forward_decoder(self, x, ids_restore, B: int, S: int, H=None, W=None):
        """Decode: re-insert mask tokens, run alternating frame/global
        attention, and predict per-patch pixels. Returns [B*S, L, p*p*3]."""
        # embed tokens
        x = self.decoder_embed(x)
        rope_sincos = self.rope_embed_decoder(H=H//self.patch_size, W=W//self.patch_size)
        # append mask tokens to sequence
        mask_tokens = self.mask_token.repeat(x.shape[0], ids_restore.shape[1] + 1 - x.shape[1], 1)
        x_ = torch.cat([x[:, 1:, :], mask_tokens], dim=1)  # no cls token
        x_ = torch.gather(x_, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, x.shape[2]))  # unshuffle
        x = torch.cat([x[:, :1, :], x_], dim=1)  # append cls token

        _, P, C = x.shape

        # apply alternating attention
        # NOTE(review): the same per-frame rope_sincos is passed to the global
        # blocks over S*P tokens — relies on the block's internal handling;
        # confirm against SelfAttentionBlock.
        for frame_block, global_block in zip(self.decoder_frame_blocks, self.decoder_global_blocks):
            # Frame-wise attention
            if x.shape != (B * S, P, C):
                x = x.view(B, S, P, C).view(B * S, P, C)
            x = frame_block(x, rope_sincos)

            # Global attention
            x = x.view(B, S, P, C).view(B, S * P, C)
            x = global_block(x, rope_sincos)

        x = x.view(B, S, P, C).view(B*S, P, C)
        x = self.decoder_norm(x)

        # predictor projection
        x = self.decoder_pred(x)

        # remove cls token
        x = x[:, 1:, :]

        return x

    def forward_loss(self, imgs, pred, mask):
        """
        imgs: [N, 3, H, W]
        pred: [N, L, p*p*3]
        mask: [N, L], 0 is keep, 1 is remove,
        Mean squared error averaged over the REMOVED patches only.
        """
        target = self.patchify(imgs)
        if self.norm_pix_loss:
            # Normalize each target patch to zero mean / unit variance.
            mean = target.mean(dim=-1, keepdim=True)
            var = target.var(dim=-1, keepdim=True)
            target = (target - mean) / (var + 1.e-6)**.5

        loss = (pred - target) ** 2
        loss = loss.mean(dim=-1)  # [N, L], mean loss per patch

        loss = (loss * mask).sum() / mask.sum()  # mean loss on removed patches
        return loss

    def forward(self, imgs, mask_ratio=0.75):
        """imgs: [B, S, C, H, W] (B scenes x S views). Returns (loss, pred, mask)."""
        B, S, C_in, H, W = imgs.shape
        imgs = imgs.view(B*S, C_in, H, W)  # [B*S, C, H, W]
        latent, mask, ids_restore = self.forward_encoder(imgs, mask_ratio)
        pred = self.forward_decoder(latent, ids_restore, B, S, H=H, W=W)  # [N, L, p*p*3]
        loss = self.forward_loss(imgs, pred, mask)
        return loss, pred, mask

    def forward_features(self, x, masks=None):
        """Feature-extraction entry point (no masking): normalized patch
        tokens from the last encoder block, cls token dropped."""
        out = self.forward_encoder(x, 0, return_all_blocks=True)[-1]
        x_norm = self.norm(out)
        return {
            "x_norm_patchtokens": x_norm[:, 1:],
        }

    @property
    def device(self):
        # Any registered parameter works; mask_token is always present.
        return self.mask_token.device

def vit_base(patch_size=16, **kwargs):
    """ViT-B encoder (768/12/12) with a 512-wide, 8-deep decoder."""
    model = MultiViewMaskedAutoEncoder(
        patch_size=patch_size,
        embed_dim=768,
        depth=12,
        num_heads=12,
        decoder_embed_dim=512,
        decoder_depth=8,
        decoder_num_heads=16,
        **kwargs,
    )
    return model


def vit_large(patch_size=16, **kwargs):
    """ViT-L encoder (1024/24/16) with a 768-wide, 12-deep decoder."""
    model = MultiViewMaskedAutoEncoder(
        patch_size=patch_size,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        decoder_embed_dim=768,
        decoder_depth=12,
        decoder_num_heads=16,
        **kwargs,
    )
    return model

def vit_huge(patch_size=16, **kwargs):
    """ViT-H encoder (1280/32/16) with a 1024-wide, 24-deep decoder."""
    model = MultiViewMaskedAutoEncoder(
        patch_size=patch_size,
        embed_dim=1280,
        depth=32,
        num_heads=16,
        decoder_embed_dim=1024,
        decoder_depth=24,
        decoder_num_heads=16,
        **kwargs,
    )
    return model
import torch
from torch import Tensor, nn
from typing import Callable, List, Optional, Tuple


def named_apply(
    fn: Callable,
    module: nn.Module,
    name: str = "",
    depth_first: bool = True,
    include_root: bool = False,
) -> nn.Module:
    """Recursively call `fn(module=..., name=...)` on a module tree, passing
    dotted qualified names. `depth_first` controls pre- vs post-order for the
    root; children are always visited with include_root=True."""
    if not depth_first and include_root:
        fn(module=module, name=name)
    for child_name, child in module.named_children():
        qualified = f"{name}.{child_name}" if name else child_name
        named_apply(
            fn=fn,
            module=child,
            name=qualified,
            depth_first=depth_first,
            include_root=True,
        )
    if depth_first and include_root:
        fn(module=module, name=name)
    return module


def cat_keep_shapes(x_list: List[Tensor]) -> Tuple[Tensor, List[Tuple[int]], List[int]]:
    """Flatten each tensor to 2-D (tokens x channels) and concatenate along
    the token dim, remembering the original shapes and token counts so the
    operation can be undone by `uncat_with_shapes`."""
    shapes: List[Tuple[int]] = []
    num_tokens: List[int] = []
    flat_parts: List[Tensor] = []
    for x in x_list:
        shapes.append(x.shape)
        # Token count = product of every dim except the channel (last) dim.
        num_tokens.append(x.shape[:-1].numel())
        flat_parts.append(x.flatten(0, -2))
    return torch.cat(flat_parts), shapes, num_tokens


def uncat_with_shapes(flattened: Tensor, shapes: List[Tuple[int]], num_tokens: List[int]) -> List[Tensor]:
    """Inverse of `cat_keep_shapes`: split along the token dim and restore the
    original leading dims (the channel dim may have changed in between)."""
    chunks = torch.split_with_sizes(flattened, num_tokens, dim=0)
    channels = flattened.shape[-1]
    return [chunk.reshape(*shape[:-1], channels) for chunk, shape in zip(chunks, shapes)]


def named_replace(
    fn: Callable,
    module: nn.Module,
    name: str = "",
    depth_first: bool = True,
    include_root: bool = False,
) -> nn.Module:
    """Like `named_apply`, but `fn` RETURNS a (possibly new) module that
    replaces the visited one in its parent via setattr."""
    if not depth_first and include_root:
        module = fn(module=module, name=name)
    # Snapshot children: the loop mutates the module via setattr.
    for child_name, child in list(module.named_children()):
        qualified = f"{name}.{child_name}" if name else child_name
        replacement = named_replace(
            fn=fn,
            module=child,
            name=qualified,
            depth_first=depth_first,
            include_root=True,
        )
        setattr(module, child_name, replacement)

    if depth_first and include_root:
        module = fn(module=module, name=name)
    return module
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import logging
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.checkpoint import checkpoint
from typing import Optional, Tuple, Union, List, Dict, Any

from vggt.layers import PatchEmbed
from vggt.layers.block import Block
from vggt.layers.rope import RotaryPositionEmbedding2D, PositionGetter
from vggt.layers.vision_transformer import vit_small, vit_base, vit_large, vit_giant2
from typing import Literal

logger = logging.getLogger(__name__)

# ImageNet normalization constants applied to inputs before the frozen backbone.
_RESNET_MEAN = [0.485, 0.456, 0.406]
_RESNET_STD = [0.229, 0.224, 0.225]


class Aggregator(nn.Module):
    """
    The Aggregator applies alternating-attention over input frames,
    as described in VGGT: Visual Geometry Grounded Transformer.

    Remember to set model.train() to enable gradient checkpointing to reduce memory usage.

    Args:
        img_size (int): Image size in pixels.
        patch_size (int): Size of each patch for PatchEmbed.
        embed_dim (int): Dimension of the token embeddings.
        depth (int): Number of blocks.
        num_heads (int): Number of attention heads.
        mlp_ratio (float): Ratio of MLP hidden dim to embedding dim.
        num_register_tokens (int): Number of register tokens.
        block_fn (nn.Module): The block type used for attention (Block by default).
        qkv_bias (bool): Whether to include bias in QKV projections.
        proj_bias (bool): Whether to include bias in the output projection.
        ffn_bias (bool): Whether to include bias in MLP layers.
        patch_embed (str): Type of patch embed. One of "mum", "dinov3", "crocov2", "dinov2".
        aa_order (list[str]): The order of alternating attention, e.g. ["frame", "global"].
        aa_block_size (int): How many blocks to group under each attention type before switching. If not necessary, set to 1.
        qk_norm (bool): Whether to apply QK normalization.
        rope_freq (int): Base frequency for rotary embedding. -1 to disable.
        init_values (float): Init scale for layer scale.
    """

    def __init__(
        self,
        img_size=512,
        patch_size=16,
        embed_dim=384,
        depth=6,
        num_heads=6,
        mlp_ratio=4.0,
        num_register_tokens=0,
        block_fn=Block,
        qkv_bias=True,
        proj_bias=True,
        ffn_bias=True,
        aa_order=["frame", "global"],
        aa_block_size=1,
        qk_norm=True,
        rope_freq=100,
        init_values=0.01,
        patch_embed: Literal["mum", "dinov3", "crocov2", "dinov2"] = "dinov3",
    ):
        super().__init__()

        self.__build_patch_embed__(patch_embed, img_size, patch_size, num_register_tokens, embed_dim=embed_dim)

        # Initialize rotary position embedding if frequency > 0
        self.rope = RotaryPositionEmbedding2D(frequency=rope_freq) if rope_freq > 0 else None
        self.position_getter = PositionGetter() if self.rope is not None else None

        # Per-frame (local) attention blocks.
        self.frame_blocks = nn.ModuleList(
            [
                block_fn(
                    dim=embed_dim,
                    num_heads=num_heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    proj_bias=proj_bias,
                    ffn_bias=ffn_bias,
                    init_values=init_values,
                    qk_norm=qk_norm,
                    rope=self.rope,
                )
                for _ in range(depth)
            ]
        )

        # Cross-frame (global) attention blocks.
        self.global_blocks = nn.ModuleList(
            [
                block_fn(
                    dim=embed_dim,
                    num_heads=num_heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    proj_bias=proj_bias,
                    ffn_bias=ffn_bias,
                    init_values=init_values,
                    qk_norm=qk_norm,
                    rope=self.rope,
                )
                for _ in range(depth)
            ]
        )

        self.depth = depth
        self.aa_order = aa_order
        self.patch_size = patch_size
        self.aa_block_size = aa_block_size

        # Validate that depth is divisible by aa_block_size
        if self.depth % self.aa_block_size != 0:
            raise ValueError(f"depth ({depth}) must be divisible by aa_block_size ({aa_block_size})")

        self.aa_block_num = self.depth // self.aa_block_size

        # Note: We have two camera tokens, one for the first frame and one for the rest
        # The same applies for register tokens
        self.camera_token = nn.Parameter(torch.randn(1, 2, 1, embed_dim))
        self.register_token = nn.Parameter(torch.randn(1, 2, num_register_tokens, embed_dim))

        # The patch tokens start after the camera and register tokens
        self.patch_start_idx = 1 + num_register_tokens

        # Initialize parameters with small values
        nn.init.normal_(self.camera_token, std=1e-6)
        nn.init.normal_(self.register_token, std=1e-6)

        # Register normalization constants as buffers
        for name, value in (("_resnet_mean", _RESNET_MEAN), ("_resnet_std", _RESNET_STD)):
            self.register_buffer(name, torch.FloatTensor(value).view(1, 1, 3, 1, 1), persistent=False)

        self.use_reentrant = False  # hardcoded to False

    def __build_patch_embed__(
        self,
        patch_embed,
        img_size,
        patch_size,
        num_register_tokens,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        block_chunks=0,
        init_values=1.0,
        embed_dim=1024,
    ):
        """
        Build the (frozen) patch-embedding backbone selected by `patch_embed`
        and a linear projection from the backbone width to `embed_dim`.

        Raises:
            NotImplementedError: For an unrecognized `patch_embed` value.
        """
        if patch_embed == "dinov2":
            # FIX: previously this branch looked up `vit_models[patch_embed]` in a
            # dict keyed by full model names ("dinov2_vitl14_reg", ...), while
            # `patch_embed` can only be the literal "dinov2" — the lookup always
            # raised KeyError. Use vit_large directly; it matches the hardcoded
            # 1024-dim `patch_embed_dim` below (all supported backbones are ViT-L).
            patch_embed_vit = vit_large(
                img_size=img_size,
                patch_size=patch_size,
                num_register_tokens=num_register_tokens,
                interpolate_antialias=interpolate_antialias,
                interpolate_offset=interpolate_offset,
                block_chunks=block_chunks,
                init_values=init_values,
            )
        elif patch_embed == "dinov3":
            # NOTE(review): cluster-specific absolute paths — should come from
            # configuration so the model is loadable outside this environment.
            patch_embed_vit = torch.hub.load("/mimer/NOBACKUP/groups/snic2022-6-266/davnords/dinov3", "dinov3_vitl16", source='local', weights="/mimer/NOBACKUP/groups/snic2022-6-266/davnords/mv-ssl/pretrained_models/dinov3_vitl16_pretrain_lvd1689m-8aa4cbdd.pth")
            patch_embed_vit.patch_size = 16
            patch_embed_vit.device = patch_embed_vit.cls_token.device
        elif patch_embed == "mum":
            from vggt.encoders.mum import vit_large
            patch_embed_vit = vit_large().eval()
            # NOTE(review): hardcoded checkpoint path — see note above.
            pretrained_weights = "/mimer/NOBACKUP/groups/snic2022-6-266/davnords/mv-ssl/pretrained_models/MuM_ViTLarge_BaseDecoder_500k.pth"
            ckpt = torch.load(pretrained_weights, map_location='cpu', weights_only=False)
            patch_embed_vit.load_state_dict(ckpt['model'], strict=True)
        elif patch_embed == "crocov2":
            from vggt.encoders.croco import CroCoNet
            # NOTE(review): hardcoded checkpoint path — see note above.
            ckpt = torch.load('/mimer/NOBACKUP/groups/snic2022-6-266/davnords/mv-ssl/pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth', 'cpu')
            patch_embed_vit = CroCoNet(**ckpt.get('croco_kwargs', {})).eval()
            patch_embed_vit.load_state_dict(ckpt['model'], strict=True)
        else:
            raise NotImplementedError("Invalid value for patch_embed")

        # The backbone is used as a frozen feature extractor.
        for param in patch_embed_vit.parameters():
            param.requires_grad = False

        self.patch_embed = patch_embed_vit

        # All supported backbones emit 1024-dim tokens (ViT-Large); project down
        # to the aggregator width when they differ.
        patch_embed_dim = 1024
        self.proj = nn.Linear(patch_embed_dim, embed_dim) if patch_embed_dim != embed_dim else nn.Identity()

    def forward(self, images: torch.Tensor) -> Tuple[List[torch.Tensor], int]:
        """
        Args:
            images (torch.Tensor): Input images with shape [B, S, 3, H, W], in range [0, 1].
                B: batch size, S: sequence length, 3: RGB channels, H: height, W: width

        Returns:
            (list[torch.Tensor], int):
                The list of outputs from the attention blocks,
                and the patch_start_idx indicating where patch tokens begin.
        """
        B, S, C_in, H, W = images.shape

        if C_in != 3:
            raise ValueError(f"Expected 3 input channels, got {C_in}")

        # Normalize images and reshape for patch embed
        images = (images - self._resnet_mean) / self._resnet_std

        # Reshape to [B*S, C, H, W] for patch embedding
        images = images.view(B * S, C_in, H, W)
        patch_tokens = self.patch_embed.forward_features(images)

        # DINO-style backbones return a dict; others return the token tensor directly.
        if isinstance(patch_tokens, dict):
            patch_tokens = patch_tokens["x_norm_patchtokens"]
        patch_tokens = self.proj(patch_tokens)

        _, P, C = patch_tokens.shape

        # Expand camera and register tokens to match batch size and sequence length
        camera_token = slice_expand_and_flatten(self.camera_token, B, S)
        register_token = slice_expand_and_flatten(self.register_token, B, S)

        # Concatenate special tokens with patch tokens
        tokens = torch.cat([camera_token, register_token, patch_tokens], dim=1)

        pos = None
        if self.rope is not None:
            pos = self.position_getter(B * S, H // self.patch_size, W // self.patch_size, device=images.device)

        if self.patch_start_idx > 0:
            # do not use position embedding for special tokens (camera and register tokens)
            # so set pos to 0 for the special tokens
            pos = pos + 1
            pos_special = torch.zeros(B * S, self.patch_start_idx, 2).to(images.device).to(pos.dtype)
            pos = torch.cat([pos_special, pos], dim=1)

        # update P because we added special tokens
        _, P, C = tokens.shape

        frame_idx = 0
        global_idx = 0
        output_list = []

        for _ in range(self.aa_block_num):
            for attn_type in self.aa_order:
                if attn_type == "frame":
                    tokens, frame_idx, frame_intermediates = self._process_frame_attention(
                        tokens, B, S, P, C, frame_idx, pos=pos
                    )
                elif attn_type == "global":
                    tokens, global_idx, global_intermediates = self._process_global_attention(
                        tokens, B, S, P, C, global_idx, pos=pos
                    )
                else:
                    raise ValueError(f"Unknown attention type: {attn_type}")

            # NOTE(review): this pairing assumes aa_order contains both "frame"
            # and "global" each cycle; with a different aa_order the variables
            # below would be stale or unbound.
            for i in range(len(frame_intermediates)):
                # concat frame and global intermediates, [B x S x P x 2C]
                concat_inter = torch.cat([frame_intermediates[i], global_intermediates[i]], dim=-1)
                output_list.append(concat_inter)

        # Drop references early to reduce peak memory before heads run.
        del concat_inter
        del frame_intermediates
        del global_intermediates
        return output_list, self.patch_start_idx

    def _process_frame_attention(self, tokens, B, S, P, C, frame_idx, pos=None):
        """
        Process frame attention blocks. We keep tokens in shape (B*S, P, C).
        """
        # If needed, reshape tokens or positions:
        if tokens.shape != (B * S, P, C):
            tokens = tokens.view(B, S, P, C).view(B * S, P, C)

        if pos is not None and pos.shape != (B * S, P, 2):
            pos = pos.view(B, S, P, 2).view(B * S, P, 2)

        intermediates = []

        # by default, self.aa_block_size=1, which processes one block at a time
        for _ in range(self.aa_block_size):
            if self.training:
                # Gradient checkpointing trades compute for memory during training.
                tokens = checkpoint(self.frame_blocks[frame_idx], tokens, pos, use_reentrant=self.use_reentrant)
            else:
                tokens = self.frame_blocks[frame_idx](tokens, pos=pos)
            frame_idx += 1
            intermediates.append(tokens.view(B, S, P, C))

        return tokens, frame_idx, intermediates

    def _process_global_attention(self, tokens, B, S, P, C, global_idx, pos=None):
        """
        Process global attention blocks. We keep tokens in shape (B, S*P, C).
        """
        if tokens.shape != (B, S * P, C):
            tokens = tokens.view(B, S, P, C).view(B, S * P, C)

        if pos is not None and pos.shape != (B, S * P, 2):
            pos = pos.view(B, S, P, 2).view(B, S * P, 2)

        intermediates = []

        # by default, self.aa_block_size=1, which processes one block at a time
        for _ in range(self.aa_block_size):
            if self.training:
                tokens = checkpoint(self.global_blocks[global_idx], tokens, pos, use_reentrant=self.use_reentrant)
            else:
                tokens = self.global_blocks[global_idx](tokens, pos=pos)
            global_idx += 1
            intermediates.append(tokens.view(B, S, P, C))

        return tokens, global_idx, intermediates


def slice_expand_and_flatten(token_tensor, B, S):
    """
    Processes specialized tokens with shape (1, 2, X, C) for multi-frame processing:
    1) Uses the first position (index=0) for the first frame only
    2) Uses the second position (index=1) for all remaining frames (S-1 frames)
    3) Expands both to match batch size B
    4) Concatenates to form (B, S, X, C) where each sequence has 1 first-position token
       followed by (S-1) second-position tokens
    5) Flattens to (B*S, X, C) for processing

    Returns:
        torch.Tensor: Processed tokens with shape (B*S, X, C)
    """

    # Slice out the "query" tokens => shape (1, 1, ...)
    query = token_tensor[:, 0:1, ...].expand(B, 1, *token_tensor.shape[2:])
    # Slice out the "other" tokens => shape (1, S-1, ...)
    others = token_tensor[:, 1:, ...].expand(B, S - 1, *token_tensor.shape[2:])
    # Concatenate => shape (B, S, ...)
    combined = torch.cat([query, others], dim=1)

    # Finally flatten => shape (B*S, ...)
    combined = combined.view(B * S, *combined.shape[2:])
    return combined
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import torch
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin  # used for model hub

from vggt.models.aggregator_small import Aggregator
from vggt.heads.camera_head import CameraHead
from vggt.heads.dpt_head import DPTHead
from vggt.heads.track_head import TrackHead
from typing import Literal

class VGGT(nn.Module, PyTorchModelHubMixin):
    # Small VGGT variant: a compact Aggregator backbone (embed_dim=384, depth=6
    # by default) with the same optional camera/depth/point/track heads as the
    # full model. Each head can be disabled via its enable_* flag, in which
    # case the corresponding attribute is None and its predictions are omitted.
    def __init__(self, img_size=512, patch_size=16, embed_dim=384, depth=6, num_heads=6,
        enable_camera=True, enable_point=True, enable_depth=True, enable_track=True, patch_embed: Literal["mum", "dinov3", "crocov2", "dinov2"]="dinov3",
        ):
        super().__init__()
        self.aggregator = Aggregator(img_size=img_size, patch_size=patch_size, embed_dim=embed_dim, patch_embed=patch_embed, depth=depth, num_heads=num_heads)

        # Heads consume concatenated frame+global tokens, hence dim_in = 2 * embed_dim.
        self.camera_head = CameraHead(dim_in=2 * embed_dim) if enable_camera else None
        # NOTE(review): intermediate_layer_idx=[2,3,4,5] is tied to the default
        # depth=6 (aggregator emits one output per depth step) — confirm these
        # indices remain valid if a caller passes a different depth.
        self.point_head = DPTHead(dim_in=2 * embed_dim, output_dim=4, activation="inv_log", conf_activation="expp1", patch_size=patch_size,
            intermediate_layer_idx=[2,3,4,5]) if enable_point else None
        self.depth_head = DPTHead(dim_in=2 * embed_dim, output_dim=2, activation="exp", conf_activation="expp1", patch_size=patch_size,
            intermediate_layer_idx=[2,3,4,5]) if enable_depth else None
        self.track_head = TrackHead(dim_in=2 * embed_dim, patch_size=patch_size) if enable_track else None

    def forward(self, images: torch.Tensor, query_points: torch.Tensor = None):
        """
        Forward pass of the VGGT model.

        Args:
            images (torch.Tensor): Input images with shape [S, 3, H, W] or [B, S, 3, H, W], in range [0, 1].
                B: batch size, S: sequence length, 3: RGB channels, H: height, W: width
            query_points (torch.Tensor, optional): Query points for tracking, in pixel coordinates.
                Shape: [N, 2] or [B, N, 2], where N is the number of query points.
                Default: None

        Returns:
            dict: A dictionary containing the following predictions:
                - pose_enc (torch.Tensor): Camera pose encoding with shape [B, S, 9] (from the last iteration)
                - depth (torch.Tensor): Predicted depth maps with shape [B, S, H, W, 1]
                - depth_conf (torch.Tensor): Confidence scores for depth predictions with shape [B, S, H, W]
                - world_points (torch.Tensor): 3D world coordinates for each pixel with shape [B, S, H, W, 3]
                - world_points_conf (torch.Tensor): Confidence scores for world points with shape [B, S, H, W]
                - images (torch.Tensor): Original input images, preserved for visualization

            If query_points is provided, also includes:
                - track (torch.Tensor): Point tracks with shape [B, S, N, 2] (from the last iteration), in pixel coordinates
                - vis (torch.Tensor): Visibility scores for tracked points with shape [B, S, N]
                - conf (torch.Tensor): Confidence scores for tracked points with shape [B, S, N]
        """
        # If without batch dimension, add it
        if len(images.shape) == 4:
            images = images.unsqueeze(0)

        if query_points is not None and len(query_points.shape) == 2:
            query_points = query_points.unsqueeze(0)

        aggregated_tokens_list, patch_start_idx = self.aggregator(images)

        predictions = {}

        # Heads run in full precision even under autocast, for numeric stability.
        # NOTE(review): torch.cuda.amp.autocast is deprecated in recent torch in
        # favor of torch.amp.autocast("cuda", ...) — consider migrating once the
        # minimum supported torch version allows it.
        with torch.cuda.amp.autocast(enabled=False):
            if self.camera_head is not None:
                pose_enc_list = self.camera_head(aggregated_tokens_list)
                predictions["pose_enc"] = pose_enc_list[-1]  # pose encoding of the last iteration
                predictions["pose_enc_list"] = pose_enc_list

            if self.depth_head is not None:
                depth, depth_conf = self.depth_head(
                    aggregated_tokens_list, images=images, patch_start_idx=patch_start_idx
                )
                predictions["depth"] = depth
                predictions["depth_conf"] = depth_conf

            if self.point_head is not None:
                pts3d, pts3d_conf = self.point_head(
                    aggregated_tokens_list, images=images, patch_start_idx=patch_start_idx
                )
                predictions["world_points"] = pts3d
                predictions["world_points_conf"] = pts3d_conf

        # Tracking runs only when both the head exists and query points are given.
        if self.track_head is not None and query_points is not None:
            track_list, vis, conf = self.track_head(
                aggregated_tokens_list, images=images, patch_start_idx=patch_start_idx, query_points=query_points
            )
            predictions["track"] = track_list[-1]  # track of the last iteration
            predictions["vis"] = vis
            predictions["conf"] = conf

        if not self.training:
            predictions["images"] = images  # store the images for visualization during inference

        return predictions