diff --git a/.gitignore b/.gitignore index 9d7352ea..8273e1f9 100644 --- a/.gitignore +++ b/.gitignore @@ -147,3 +147,13 @@ skyseg.onnx # pixi environments .pixi *.egg-info + +slurm_outs/ +logs/ +.vscode/ +annotations/ +output_dir/ +constants.py +wandb/ +/data/ +outputs/ \ No newline at end of file diff --git a/co3d.py b/co3d.py new file mode 100644 index 00000000..88c77a81 --- /dev/null +++ b/co3d.py @@ -0,0 +1,101 @@ +from training.data.dataset_util import * +from pathlib import Path +import torch +import numpy as np +import os.path as osp +import json +import gzip + +def save_ply(points, filename): + import open3d as o3d + if torch.is_tensor(points): + points_visual = points.reshape(-1, 3).cpu().numpy() + else: + points_visual = points.reshape(-1, 3) + pcd = o3d.geometry.PointCloud() + pcd.points = o3d.utility.Vector3dVector(points_visual.astype(np.float64)) + # pcd.colors = o3d.utility.Vector3dVector(points_visual_rgb.astype(np.float64)) + o3d.io.write_point_cloud(filename, pcd, write_ascii=True) + + +def co3d_annotation_to_opencv_pose(frame_data): + p = frame_data['viewpoint']['principal_point'] + f = frame_data['viewpoint']['focal_length'] + h, w = frame_data['image']['size'] + K = np.eye(3) + s = (min(h, w) - 1) / 2 + K[0, 0] = f[0] * (w - 1) / 2 + K[1, 1] = f[1] * (h - 1) / 2 + K[0, 2] = -p[0] * s + (w - 1) / 2 + K[1, 2] = -p[1] * s + (h - 1) / 2 + + R = np.asarray(frame_data['viewpoint']['R']).T # note the transpose here + T = np.asarray(frame_data['viewpoint']['T']) + pose = np.concatenate([R,T[:,None]],1) + # pose = np.diag([-1,-1,1]).astype(np.float32) @ pose # flip the direction of x,y axis + + return pose, K + +def _load_16big_png_depth(depth_png): + with Image.open(depth_png) as depth_pil: + # the image is stored with 16-bit depth but PIL reads it as I (32 bit). 
+ # we cast it to uint16, then reinterpret as float16, then cast to float32 + depth = ( + np.frombuffer(np.array(depth_pil, dtype=np.uint16), dtype=np.float16) + .astype(np.float32) + .reshape((depth_pil.size[1], depth_pil.size[0])) + ) + return depth + +root = Path("/mimer/NOBACKUP/groups/3d-dl/co3d_full/apple") + +frame_file = osp.join(root, "frame_annotations.jgz") + +with gzip.open(frame_file, "r") as fin: + frame_data = json.loads(fin.read()) + +frame_data_processed = {} +for f_data in frame_data: + sequence_name = f_data["sequence_name"] + frame_data_processed.setdefault(sequence_name, {})[f_data["frame_number"]] = f_data + +seq_name = "12_90_489" +seq_data = frame_data_processed[seq_name] + +seq_dir = root / seq_name +images_dir = seq_dir / "images" +frames = sorted([p.name for p in images_dir.iterdir() if p.suffix == ".jpg"]) + + +total_world_points = [] +for i, frame in enumerate(frames[:10]): + frame_data = seq_data[i] + + extrinsic, intrinsic = co3d_annotation_to_opencv_pose(frame_data) + + filepath= frame_data['image']['path'] + image_path = osp.join("/mimer/NOBACKUP/groups/3d-dl/co3d_full", filepath) + depth_path = image_path.replace("/images", "/depths") + ".geometric.png" + + # extrinsic = np.vstack([extrinsic, [0, 0, 0, 1]]) + # extrinsic = np.linalg.inv(extrinsic) + + extri_opencv = np.array(extrinsic[:3].tolist()) + intri_opencv = np.array(intrinsic.tolist()) + + depth_map = _load_16big_png_depth(depth_path) + + depth_map = cv2.resize(depth_map, (1024//4, 1896//4), interpolation=cv2.INTER_NEAREST) + + world_coords_points, cam_coords_points, point_mask = ( + depth_to_world_coords_points(depth_map, extri_opencv, intri_opencv) + ) + total_world_points.append(world_coords_points) + +total_world_points = np.concatenate(total_world_points, axis=0) +print('Total points: ', total_world_points.shape) + +save_ply( + total_world_points.reshape(-1, 3), + f"yum.ply" +) diff --git a/evaluation/configs/eval.yaml b/evaluation/configs/eval.yaml new file mode 
100644 index 00000000..4a2f7f6f --- /dev/null +++ b/evaluation/configs/eval.yaml @@ -0,0 +1,55 @@ +# @package _global_ + + +name: mv_recon +work_dir: ${hydra:runtime.cwd} +output_dir: ${work_dir}/outputs/${name} + +hydra: + run: + dir: ${output_dir}/hydra/${now:%Y-%m-%d_%H-%M-%S} + +debug: false + +pi3: + pretrained_model_name_or_path: yyfz233/Pi3 + # pretrained_model_name_or_path: checkpoints/Pi3 + +eval_datasets: + # - DTU + - ETH3D + +no_crop: True +load_img_size: 518 + +device: cuda + +verbose: False + +seed: 42 + +save_suffix: null + +data: + DTU: + cfg: + _target_: evaluation.datasets.dtu.DTU + split: test + DTU_DIR: /mimer/NOBACKUP/groups/3d-dl/dtu_test_mvsnet_release + load_img_size: ${load_img_size} + cache_file: data/dataset_cache/dtu_mv_recon_cache.npy + sampling: + strategy: stride + kf_every: 5 + seq_id_map: evaluation/datasets/seq-id-maps/DTU_mv-recon_seq-id-map-kf5.json + + ETH3D: + cfg: + _target_: evaluation.datasets.eth3d.ETH3D + ETH3D_DIR: /mimer/NOBACKUP/groups/3d-dl/eth3d + load_img_size: ${load_img_size} + cache_file: data/dataset_cache/eth3d_mv_recon_cache.npy + sampling: + strategy: stride + kf_every: 5 + seq_id_map: evaluation/datasets/seq-id-maps/ETH3D_mv-recon_seq-id-map-kf5.json \ No newline at end of file diff --git a/evaluation/datasets/dtu.py b/evaluation/datasets/dtu.py new file mode 100644 index 00000000..47075a9b --- /dev/null +++ b/evaluation/datasets/dtu.py @@ -0,0 +1,231 @@ +import os.path as osp +import os +import numpy as np +import torch +import cv2 +import torchvision.transforms as tvf + +from typing import Optional, Union, List +from PIL import Image, ImageFile +from torch.utils.data import Dataset +from tqdm import tqdm +from evaluation.utils.geometry import unproject_depth_map_to_point_map +from evaluation.utils.cropping import resize_image, resize_image_depth_and_intrinsic + +Image.MAX_IMAGE_PIXELS = None +ImageFile.LOAD_TRUNCATED_IMAGES = True +to_tensor = tvf.ToTensor() + +def load_cam_mvsnet(words, interval_scale=1): 
+ """read camera txt file""" + cam = np.zeros((2, 4, 4)) + # words = file.read().split() + words = words.split() + # read extrinsic + for i in range(0, 4): + for j in range(0, 4): + extrinsic_index = 4 * i + j + 1 + cam[0][i][j] = words[extrinsic_index] + + # read intrinsic + for i in range(0, 3): + for j in range(0, 3): + intrinsic_index = 3 * i + j + 18 + cam[1][i][j] = words[intrinsic_index] + + if len(words) == 29: + cam[1][3][0] = words[27] + cam[1][3][1] = float(words[28]) * interval_scale + cam[1][3][2] = 192 + cam[1][3][3] = cam[1][3][0] + cam[1][3][1] * cam[1][3][2] + elif len(words) == 30: + cam[1][3][0] = words[27] + cam[1][3][1] = float(words[28]) * interval_scale + cam[1][3][2] = words[29] + cam[1][3][3] = cam[1][3][0] + cam[1][3][1] * cam[1][3][2] + elif len(words) == 31: + cam[1][3][0] = words[27] + cam[1][3][1] = float(words[28]) * interval_scale + cam[1][3][2] = words[29] + cam[1][3][3] = words[30] + else: + cam[1][3][0] = 0 + cam[1][3][1] = 0 + cam[1][3][2] = 0 + cam[1][3][3] = 0 + + extrinsic = cam[0].astype(np.float32) + intrinsic = cam[1].astype(np.float32) + + return intrinsic, extrinsic + +class DTU(Dataset): + def __init__( + self, + DTU_DIR: str, + split: str = "test", + load_img_size: int = 518, + cache_file: str = "data/dataset_cache/dtu_mv_recon_cache.npy", + ): + + self.DTU_DIR = DTU_DIR + if DTU_DIR == None: + raise NotImplementedError + print(f"DTU_DIR is {DTU_DIR}") + + self.split = split + assert split == 'test', "Only test set preprocessed." 
+ if self.split == 'train': + seq_numbers = [ + 2, 6, 7, 8, 14, 16, 18, 19, 20, 22, 30, 31, 36, 39, 41, 42, 44, + 45, 46, 47, 50, 51, 52, 53, 55, 57, 58, 60, 61, 63, 64, 65, 68, 69, 70, 71, 72, + 74, 76, 83, 84, 85, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, + 101, 102, 103, 104, 105, 107, 108, 109, 111, 112, 113, 115, 116, 119, 120, + 121, 122, 123, 124, 125, 126, 127, 128 + ] + elif self.split == 'valid': + seq_numbers = [3, 5, 17, 21, 28, 35, 37, 38, 40, 43, 56, 59, 66, 67, 82, 86, 106, 117] + elif self.split == 'test': + seq_numbers = [1, 4, 9, 10, 11, 12, 13, 15, 23, 24, 29, 32, 33, 34, 48, 49, 62, 75, 77, 110, 114, 118] + else: + raise ValueError(f"Invalid split: {self.split}. Must be 'train', 'valid' or 'test'.") + + if osp.exists(cache_file): + print(f"[DTU] Loading from cache file: {cache_file}") + self.metadata = np.load(cache_file, allow_pickle=True).item() + self.sequence_list = sorted(list(self.metadata.keys())) + else: + print(f"[DTU] Cache file not found, loading from {DTU_DIR}") + + self.sequence_list = [f"scan{num}" for num in seq_numbers] + + self.metadata = {} + for seq in tqdm(self.sequence_list): + rgb_root = osp.join(DTU_DIR, seq, 'images') + + all_imgs = sorted([d for d in os.listdir(rgb_root) if d.endswith('.jpg')]) + + all_img_numbers = [int(imgname.split('.')[0]) for imgname in all_imgs] + if all_img_numbers[0] != 0 or all_img_numbers[-1] + 1 != len(all_img_numbers): + raise ValueError(f"Image number not regular, with first image {all_imgs[0]} and last image {all_imgs[-1]} but number of images {len(all_imgs)}") + + self.metadata[seq] = len(all_imgs) + + np.save(cache_file, self.metadata) + + self.load_img_size = load_img_size + print(f"[DTU] Data size: {len(self)}") + + def __len__(self): + return len(self.sequence_list) + + def get_seq_framenum(self, index: Optional[int] = None, sequence_name: Optional[str] = None): + if sequence_name is None: + if index is None: + raise ValueError("Please specify either index or 
sequence_name") + sequence_name = self.sequence_list[index] + return self.metadata[sequence_name] + + def __getitem__(self, idx_N): + """Fetch item by index and a dynamic variable n_per_seq.""" + + # Different from most pytorch datasets, + # here we not only get index, but also a dynamic variable n_per_seq + # supported by DynamicBatchSampler + + index, n_per_seq = idx_N + sequence_name = self.sequence_list[index] + metadata = self.metadata[sequence_name] + ids = np.random.choice(len(metadata), n_per_seq, replace=False) + return self.get_data(index=index, ids=ids) + + def get_data( + self, + index: Optional[int] = None, + sequence_name: Optional[str] = None, + ids: Union[List[int], np.ndarray, None] = None, + ): + if sequence_name is None: + if index is None: + raise ValueError("Please specify either index or sequence_name") + sequence_name: str = self.sequence_list[index] + seq_len: int = self.metadata[sequence_name] + + if ids is None: + ids = np.arange(seq_len).tolist() + elif isinstance(ids, np.ndarray): + assert ids.ndim == 1, f"ids should be a 1D array, but got {ids.ndim}D" + ids = ids.tolist() + + image_path = osp.join(self.DTU_DIR, sequence_name, "images") + depth_path = osp.join(self.DTU_DIR, sequence_name, "depths") + mask_path = osp.join(self.DTU_DIR, sequence_name, "binary_masks") + cam_path = osp.join(self.DTU_DIR, sequence_name, "cams") + + image_paths: list = [""] * len(ids) + images: list = [0] * len(ids) + depths: list = [0] * len(ids) + extrinsics: np.ndarray = np.zeros((len(ids), 3, 4)) + intrinsics: np.ndarray = np.zeros((len(ids), 3, 3)) + + for id_index, id in enumerate(ids): + impath = osp.join(image_path, f"{id:08d}.jpg") + depthpath = osp.join(depth_path, f"{id:08d}.npy") + campath = osp.join(cam_path, f"{id:08d}_cam.txt") + maskpath = osp.join(mask_path, f"{id:08d}.png") + + rgb_image: Image.Image = Image.open(impath) + depthmap: np.ndarray = np.load(depthpath) + rgb_image: Image.Image = resize_image(rgb_image, (depthmap.shape[1], 
depthmap.shape[0])) + + depthmap = np.nan_to_num(depthmap.astype(np.float32), 0.0) + + mask = cv2.imread(maskpath, cv2.IMREAD_UNCHANGED) / 255.0 + mask = mask.astype(np.float32) + + mask[mask > 0.5] = 1.0 + mask[mask < 0.5] = 0.0 + + mask = cv2.resize( + mask, + (depthmap.shape[1], depthmap.shape[0]), + interpolation=cv2.INTER_NEAREST, + ) + kernel = np.ones((10, 10), np.uint8) # Define the erosion kernel + mask = cv2.erode(mask, kernel, iterations=1) + depthmap = depthmap * mask + + cur_intrinsics, extrinsic = load_cam_mvsnet(open(campath, "r").read()) + intrinsic = cur_intrinsics[:3, :3] + + rgb_image, depthmap, intrinsic = resize_image_depth_and_intrinsic( + image=rgb_image, + depth_map=depthmap, + intrinsic=intrinsic, + output_width=self.load_img_size, # finally width = 518, height = 388 + ) + + image_paths[id_index] = impath + images[id_index] = to_tensor(rgb_image) + depths[id_index] = depthmap + intrinsics[id_index] = intrinsic + extrinsics[id_index] = extrinsic[:3, :] + + depths = np.array(depths) # (S, H, W) + pointclouds = unproject_depth_map_to_point_map( + depth_map=depths[..., None], + intrinsics_cam=intrinsics, + extrinsics_cam=extrinsics + ) + + batch = {"seq_id": sequence_name, "seq_len": seq_len, "ind": torch.tensor(ids)} + batch['image_paths'] = image_paths # list of str + batch['images'] = torch.stack(images, dim=0) + batch['pointclouds'] = pointclouds # in numpy + batch['valid_mask'] = depths > 1e-4 + # batch["extrs"] = extrinsics + # batch["intrs"] = intrinsics + # batch["w"] = metadata["w"] + # batch["h"] = metadata["h"] + + return batch \ No newline at end of file diff --git a/evaluation/datasets/eth3d.py b/evaluation/datasets/eth3d.py new file mode 100644 index 00000000..4a82831f --- /dev/null +++ b/evaluation/datasets/eth3d.py @@ -0,0 +1,147 @@ +import os.path as osp +import os +import numpy as np +import torch +import torchvision.transforms as tvf + +from typing import Optional, Union, List +from PIL import Image, ImageFile +from 
torch.utils.data import Dataset +from evaluation.utils.geometry import unproject_depth_map_to_point_map +from evaluation.utils.cropping import resize_image_depth_and_intrinsic + +Image.MAX_IMAGE_PIXELS = None +ImageFile.LOAD_TRUNCATED_IMAGES = True +to_tensor = tvf.ToTensor() + +class ETH3D(Dataset): + def __init__( + self, + ETH3D_DIR: str, + load_img_size: int = 518, + cache_file: str = "data/dataset_cache/eth3d_mv_recon_cache.npy", + ): + + self.ETH3D_DIR = ETH3D_DIR + if ETH3D_DIR == None: + raise NotImplementedError + print(f"ETH3D_DIR is {ETH3D_DIR}") + + if osp.exists(cache_file): + print(f"[ETH3D] Loading from cache file: {cache_file}") + self.metadata = np.load(cache_file, allow_pickle=True).item() + self.sequence_list = sorted(self.metadata.keys()) + else: + print(f"[ETH3D] Cache file not found, loading from {ETH3D_DIR}") + + self.sequence_list = [seq for seq in os.listdir(ETH3D_DIR) if os.path.isdir(osp.join(ETH3D_DIR, seq))] + self.sequence_list = sorted(self.sequence_list) + + self.metadata = {} + for seq in self.sequence_list: + seq_image_root = osp.join(ETH3D_DIR, seq, 'images', 'custom_undistorted') + image_list = [imgname for imgname in os.listdir(seq_image_root) if imgname.endswith('.JPG')] + image_list = sorted(image_list) + + self.metadata[seq] = image_list + + np.save(cache_file, self.metadata) + + self.load_img_size = load_img_size + print(f"[ETH3D] Data size: {len(self)}") + + def __len__(self): + return len(self.sequence_list) + + def get_seq_framenum(self, index: Optional[int] = None, sequence_name: Optional[str] = None): + if sequence_name is None: + if index is None: + raise ValueError("Please specify either index or sequence_name") + sequence_name = self.sequence_list[index] + return len(self.metadata[sequence_name]) + + def __getitem__(self, idx_N): + """Fetch item by index and a dynamic variable n_per_seq.""" + + # Different from most pytorch datasets, + # here we not only get index, but also a dynamic variable n_per_seq + # supported 
by DynamicBatchSampler + + index, n_per_seq = idx_N + sequence_name = self.sequence_list[index] + metadata = self.metadata[sequence_name] + ids = np.random.choice(len(metadata), n_per_seq, replace=False) + return self.get_data(index=index, ids=ids) + + def get_data( + self, + index: Optional[int] = None, + sequence_name: Optional[str] = None, + ids: Union[List[int], np.ndarray, None] = None, + ): + if sequence_name is None: + if index is None: + raise ValueError("Please specify either index or sequence_name") + sequence_name: str = self.sequence_list[index] + image_list: list = self.metadata[sequence_name] + seq_len: int = len(image_list) + + if ids is None: + ids = np.arange(seq_len).tolist() + elif isinstance(ids, np.ndarray): + assert ids.ndim == 1, f"ids should be a 1D array, but got {ids.ndim}D" + ids = ids.tolist() + + image_paths: list = [""] * len(ids) + images: list = [0] * len(ids) + depths: list = [0] * len(ids) + extrinsics: np.ndarray = np.zeros((len(ids), 3, 4)) + intrinsics: np.ndarray = np.zeros((len(ids), 3, 3)) + + for id_index, id in enumerate(ids): + img_name = image_list[id] + impath = os.path.join(self.ETH3D_DIR, sequence_name, 'images', 'custom_undistorted', img_name) + depthpath = os.path.join(self.ETH3D_DIR, sequence_name, 'ground_truth_depth', 'custom_undistorted', img_name) + cam_path = os.path.join(self.ETH3D_DIR, sequence_name, 'custom_undistorted_cam', img_name.replace('JPG', 'npz')) + + cam = np.load(cam_path) + intrinsic = cam['intrinsics'] + extrinsic = cam['extrinsics'] + + # load image and depth + rgb_image: Image.Image = Image.open(impath) + width, height = rgb_image.size + depthmap: np.ndarray = np.fromfile(depthpath, dtype=np.float32).reshape(height, width) + depthmap[~np.isfinite(depthmap)] = -1 + + rgb_image, depthmap, intrinsic = resize_image_depth_and_intrinsic( + image=rgb_image, + depth_map=depthmap, + intrinsic=intrinsic, + output_width=self.load_img_size, # finally width = 518, height = 388 + ) + + image_paths[id_index] 
= impath + images[id_index] = to_tensor(rgb_image) + depths[id_index] = depthmap + intrinsics[id_index] = intrinsic + extrinsics[id_index] = extrinsic[:3, :] + + depths = np.array(depths) # (S, H, W) + pointclouds = unproject_depth_map_to_point_map( + depth_map=depths[..., None], + intrinsics_cam=intrinsics, + extrinsics_cam=extrinsics + ) + + batch = {"seq_id": sequence_name, "seq_len": seq_len, "ind": torch.tensor(ids)} + batch['image_paths'] = image_paths # list of str + batch['images'] = torch.stack(images, dim=0) + batch['pointclouds'] = pointclouds # in numpy + batch['valid_mask'] = depths > 1e-4 + # batch["extrs"] = extrinsics + # batch["intrs"] = intrinsics + # batch["w"] = metadata["w"] + # batch["h"] = metadata["h"] + + return batch \ No newline at end of file diff --git a/evaluation/datasets/seq-id-maps/DTU_mv-recon_seq-id-map-kf5.json b/evaluation/datasets/seq-id-maps/DTU_mv-recon_seq-id-map-kf5.json new file mode 100644 index 00000000..12f53a28 --- /dev/null +++ b/evaluation/datasets/seq-id-maps/DTU_mv-recon_seq-id-map-kf5.json @@ -0,0 +1,266 @@ +{ + "scan1": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan10": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan11": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan110": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan114": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan118": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan12": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan13": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan15": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan23": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan24": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan29": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + 
"scan32": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan33": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan34": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan4": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan48": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan49": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan62": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan75": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan77": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "scan9": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ] +} \ No newline at end of file diff --git a/evaluation/datasets/seq-id-maps/ETH3D_mv-recon_seq-id-map-kf5.json b/evaluation/datasets/seq-id-maps/ETH3D_mv-recon_seq-id-map-kf5.json new file mode 100644 index 00000000..02b19b58 --- /dev/null +++ b/evaluation/datasets/seq-id-maps/ETH3D_mv-recon_seq-id-map-kf5.json @@ -0,0 +1,125 @@ +{ + "courtyard": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35 + ], + "delivery_area": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40 + ], + "electro": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40 + ], + "facade": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45, + 50, + 55, + 60, + 65, + 70, + 75 + ], + "kicker": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30 + ], + "meadow": [ + 0, + 5, + 10 + ], + "office": [ + 0, + 5, + 10, + 15, + 20, + 25 + ], + "pipes": [ + 0, + 5, + 10 + ], + "playground": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35 + ], + "relief": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30 + ], + "relief_2": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30 + ], + "terrace": [ + 0, + 5, + 10, + 15, + 20 + ], + "terrains": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40 + ] +} \ No newline at end of file diff --git a/evaluation/pointcloud.py b/evaluation/pointcloud.py new 
file mode 100644 index 00000000..5be18ac3 --- /dev/null +++ b/evaluation/pointcloud.py @@ -0,0 +1,203 @@ +# https://github.com/facebookresearch/vggt/issues/208 +# https://github.com/doubleZ0108/GeoMVSNet +# https://github.com/yyfz/Pi3/blob/evaluation/mv_recon/eval.py + +import os +import json +import torch +import numpy as np +import open3d as o3d +import os.path as osp +import hydra + +from omegaconf import DictConfig +from evaluation.utils.interfaces import infer_mv_pointclouds +from evaluation.utils.mv_recon import umeyama, accuracy, completion +from evaluation.utils.messages import set_default_arg, write_csv +from evaluation.utils.vis_utils import save_image_grid_auto +from evaluation.utils import load_model + + +@hydra.main(version_base="1.2", config_path="./configs", config_name="eval") +def main(hydra_cfg: DictConfig): + + all_eval_datasets: DictConfig = hydra_cfg.eval_datasets # see configs/evaluation/mv_recon.yaml + all_data_info: DictConfig = hydra_cfg.data # see configs/data + pretrained_model_name_or_path: str = hydra_cfg.pi3.pretrained_model_name_or_path # see configs/evaluation/relpose-angular.yaml + + # 0. 
create model + # model = VGGT.from_pretrained(pretrained_model_name_or_path).to(hydra_cfg.device).eval() + model = load_model( + device=hydra_cfg.device, + # model_path="/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/pretrained_models/model_tracker_fixed_e20.pt", + # big_model=True, + # model_path="/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/training/logs/mum_exp004/ckpts/checkpoint.pt", + # model_path="/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/training/logs/dinov3_exp004/ckpts/checkpoint.pt", + model_path="/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/training/logs/crocov2_exp001/ckpts/checkpoint.pt", + big_model=False, + # encoder="dinov3" + # encoder="mum" + encoder="crocov2" + ) + print(f"Loaded VGGT from {pretrained_model_name_or_path}") + + for idx_dataset, dataset_name in enumerate(all_eval_datasets, start=1): + # 1.1 look up dataset config from configs/data, decide the dataset name, and load the dataset + if dataset_name not in all_data_info: + raise ValueError(f"Unknown dataset in global data information: {dataset_name}") + dataset_info = all_data_info[dataset_name] + dataset = hydra.utils.instantiate(dataset_info.cfg) + + # 1.2 ready for output directory & metrics + output_root = osp.join(hydra_cfg.output_dir, dataset_name) + os.makedirs(output_root, exist_ok=True) + all_data_dict = { + "Acc-mean": 0.0, "Acc-med": 0.0, + "Comp-mean": 0.0, "Comp-med": 0.0, + "NC-mean": 0.0, "NC-med": 0.0, + "NC1-mean": 0.0, "NC1-med": 0.0, + "NC2-mean": 0.0, "NC2-med": 0.0, + } + + # 1.3 load pre-sampled seq-id-map + print(f"[{idx_dataset}/{len(all_eval_datasets)}] Evaluating Multi-View Pointcloud Reconstruction of Pi3 on dataset {dataset_name}...") + sample_config: DictConfig = dataset_info.sampling + print(f"Sampling strategy: {sample_config.strategy}") + with open(dataset_info.seq_id_map, "r") as f: + seq_id_map: dict = json.load(f) + + if osp.exists(osp.join(output_root, "_all_samples.csv")): + os.remove(osp.join(output_root, 
"_all_samples.csv")) # remove old csv file + for seq_idx, (seq_name, ids) in enumerate(seq_id_map.items(), start=1): + # 2. load data, choose specific ids of a sequence + data = dataset.get_data(sequence_name=seq_name, ids=ids) + filelist: list = data['image_paths'] # [str] * N + images: torch.Tensor = data['images'] # (N, 3, H, W) + gt_pts: np.ndarray = data['pointclouds'] # (N, H, W, 3) + valid_mask: np.ndarray = data['valid_mask'] # (N, H, W) + + # 3. real inference, predicted pointcloud aligned to ground truth (data_h, data_w) + data_h, data_w = images.shape[-2:] + pred_pts: np.ndarray = infer_mv_pointclouds(filelist, model, hydra_cfg, (data_h, data_w)) # (N, H, W, 3) + assert pred_pts.shape == gt_pts.shape, f"Predicted points shape {pred_pts.shape} does not match ground truth shape {gt_pts.shape}." + + # 4. save input images + seq_name = seq_name.replace("/", "-") + save_image_grid_auto(images, osp.join(output_root, f"{seq_name}.png")) + colors = images.permute(0, 2, 3, 1)[valid_mask].cpu().numpy().reshape(-1, 3) + + # 5. coarse align + c, R, t = umeyama(pred_pts[valid_mask].T, gt_pts[valid_mask].T) + pred_pts = c * np.einsum('nhwj, ij -> nhwi', pred_pts, R) + t.T + + # 6. filter invalid points + pred_pts = pred_pts[valid_mask].reshape(-1, 3) + gt_pts = gt_pts[valid_mask].reshape(-1, 3) + + pred_pts = np.ascontiguousarray(pred_pts, dtype=np.float64) + gt_pts = np.ascontiguousarray(gt_pts, dtype=np.float64) + colors = np.ascontiguousarray(colors, dtype=np.float64) + + # 7. save predicted & ground truth point clouds + pcd = o3d.geometry.PointCloud() + pcd.points = o3d.utility.Vector3dVector(pred_pts) + pcd.colors = o3d.utility.Vector3dVector(colors) + o3d.io.write_point_cloud(osp.join(output_root, f"{seq_name}-pred.ply"), pcd) + + pcd_gt = o3d.geometry.PointCloud() + pcd_gt.points = o3d.utility.Vector3dVector(gt_pts) + pcd_gt.colors = o3d.utility.Vector3dVector(colors) + o3d.io.write_point_cloud(osp.join(output_root, f"{seq_name}-gt.ply"), pcd_gt) + + # 8. 
ICP align refinement + if "DTU" in dataset_name: + threshold = 100 + else: + threshold = 0.1 + + trans_init = np.eye(4) + reg_p2p = o3d.pipelines.registration.registration_icp( + pcd, + pcd_gt, + threshold, + trans_init, + o3d.pipelines.registration.TransformationEstimationPointToPoint(), + ) + + transformation = reg_p2p.transformation + pcd = pcd.transform(transformation) + + # 9. estimate normals + pcd.estimate_normals() + pcd_gt.estimate_normals() + pred_normal = np.asarray(pcd.normals) + gt_normal = np.asarray(pcd_gt.normals) + + # o3d.io.write_point_cloud( + # os.path.join( + # save_path, f"{seq.replace('/', '_')}-mask-icp.ply" + # ), + # pcd, + # ) + + # 10. compute metrics + acc, acc_med, nc1, nc1_med = accuracy( + pcd_gt.points, pcd.points, gt_normal, pred_normal + ) + comp, comp_med, nc2, nc2_med = completion( + pcd_gt.points, pcd.points, gt_normal, pred_normal + ) + print( + f"[{dataset_name} {seq_idx}/{len(dataset.sequence_list)}] Seq: {seq_name}, Acc: {acc}, Comp: {comp}, NC1: {nc1}, NC2: {nc2} - Acc_med: {acc_med}, Compc_med: {comp_med}, NC1c_med: {nc1_med}, NC2c_med: {nc2_med}" + ) + + # 11. 
save metrics to csv + write_csv(osp.join(output_root, f"_all_samples.csv"), { + "seq": seq_name, + "Acc-mean": acc, + "Acc-med": acc_med, + "Comp-mean": comp, + "Comp-med": comp_med, + "NC1-mean": nc1, + "NC1-med": nc1_med, + "NC2-mean": nc2, + "NC2-med": nc2_med, + }) + all_data_dict["Acc-mean"] += acc + all_data_dict["Acc-med"] += acc_med + all_data_dict["Comp-mean"] += comp + all_data_dict["Comp-med"] += comp_med + all_data_dict["NC-mean"] += (nc1 + nc2) / 2 + all_data_dict["NC-med"] += (nc1_med + nc2_med) / 2 + all_data_dict["NC1-mean"] += nc1 + all_data_dict["NC1-med"] += nc1_med + all_data_dict["NC2-mean"] += nc2 + all_data_dict["NC2-med"] += nc2_med + + # release cuda memory + torch.cuda.empty_cache() + + num_samples = len(dataset) + metric_dict = { + metric: value / num_samples + for metric, value in all_data_dict.items() + if metric != "model" + } + + statistics_file = osp.join(hydra_cfg.output_dir, f"{dataset_name}-metric") # + ".csv" + if getattr(hydra_cfg, "save_suffix", None) is not None: + statistics_file += f"-{hydra_cfg.save_suffix}" + statistics_file += ".csv" + print(metric_dict) + write_csv(statistics_file, metric_dict) + + del model + torch.cuda.empty_cache() + print(f"Finished evaluating Pi3 on all datasets.") + + +if __name__ == "__main__": + # set_default_arg("evaluation", "mv_recon") + os.environ["HYDRA_FULL_ERROR"] = '1' + with torch.no_grad(): + main() \ No newline at end of file diff --git a/evaluation/preprocess/download_re10k.py b/evaluation/preprocess/download_re10k.py new file mode 100644 index 00000000..fcda03d0 --- /dev/null +++ b/evaluation/preprocess/download_re10k.py @@ -0,0 +1,234 @@ +""" +References: +[cashiwamochi/RealEstate10K_Downloader](https://github.com/cashiwamochi/RealEstate10K_Downloader/blob/master/generate_dataset.py) +The scripts provided here are for reference only. Please ensure you have obtained the necessary licenses from the original dataset providers before proceeding. 
+ +datasets/sequences/re10k_test_1800.txt: test sequences chosen by [PoseDiffusion](https://github.com/facebookresearch/PoseDiffusion/blob/main/pose_diffusion/datasets/re10k_test_1800.txt) +However, some of the youtube videos are not available now, so we evaluate [Pi3](https://github.com/yyfz/Pi3) on datasets/sequences/re10k_test_1719.txt + +You may run into 403 error when downloading youtube videos, please refer to original pytube/pytubefix repo for help or use other downloader like yt-dlp. +However, this script works for us when doing evaluation for [Pi3](https://github.com/yyfz/Pi3). + +For resolutions, most sequences are (640, 360), with a few exceptions: +3b0b55657925fb34: (640, 272) +3e034bde9426ae9f: (640, 338) +2c2cfc0ac780a3aa: (640, 338) +""" + +import os +import os.path as osp +import glob + +from pytubefix import YouTube + +class Data: + def __init__(self, url, seqname, list_timestamps): + self.url = url + self.list_seqnames = [] + self.list_list_timestamps = [] + + self.list_seqnames.append(seqname) + self.list_list_timestamps.append(list_timestamps) + + def add(self, seqname, list_timestamps): + self.list_seqnames.append(seqname) + self.list_list_timestamps.append(list_timestamps) + + def __len__(self): + return len(self.list_seqnames) + + +def process(data, seq_id, videoname, output_root): + seqname = data.list_seqnames[seq_id] + image_dir = os.path.join(output_root, seqname, "images") + if not os.path.exists(image_dir): + os.makedirs(image_dir) + else: + print("[INFO] Something Wrong, stop process") + return True + + list_str_timestamps = [] + for timestamp in data.list_list_timestamps[seq_id]: + timestamp = int(timestamp/1000) + str_hour = str(int(timestamp/3600000)).zfill(2) + str_min = str(int(int(timestamp%3600000)/60000)).zfill(2) + str_sec = str(int(int(int(timestamp%3600000)%60000)/1000)).zfill(2) + str_mill = str(int(int(int(timestamp%3600000)%60000)%1000)).zfill(3) + _str_timestamp = str_hour+":"+str_min+":"+str_sec+"."+str_mill + 
list_str_timestamps.append(_str_timestamp) + + # extract frames from a video + for idx, str_timestamp in enumerate(list_str_timestamps): + command = 'ffmpeg -ss '+str_timestamp+' -i '+videoname+' -vframes 1 -f image2 '+image_dir+'/'+str(data.list_list_timestamps[seq_id][idx])+'.png' + # print("current command is {}".format(command)) + os.system(command) + + png_files = sorted( + glob.glob(os.path.join(image_dir, "*.png")), + key=lambda x: int(os.path.splitext(os.path.basename(x))[0]) + ) + + for idx, old_path in enumerate(png_files): + new_name = f"{idx:04d}.png" + new_path = os.path.join(image_dir, new_name) + os.rename(old_path, new_path) + print(f"Renamed: {os.path.basename(old_path)} -> {new_name}") + + return False + +def wrap_process(list_args): + return process(*list_args) + +class DataDownloader: + def __init__ ( + self, + meta_root: str, + output_root: str, + sequence_list: list, # end with .txt + mode: str = "test", + ): + print("[Re10k Downloader] Loading data list ... ") + self.meta_root = meta_root + all_seqnames = glob.glob(osp.join(meta_root, mode, '*.txt')) + all_seqnames = sorted([osp.basename(x) for x in all_seqnames]) + all_seqnames = set(all_seqnames) + + the_other_mode = "train" if mode == "test" else "test" + assert mode == "test", "Currently only support test mode, please set mode to 'test'" + all_seq_exists = True + seq_not_exists = [] + all_other_seqnames = {} + for seqname in sequence_list: + if seqname not in all_seqnames: + if all_seq_exists: + all_other_seqnames = sorted(glob.glob(osp.join(meta_root, the_other_mode, '*.txt'))) + all_other_seqnames = set(all_other_seqnames) + + if seqname not in all_other_seqnames: + print(f"[Error] {seqname} not in bote train and test meta") + else: + print(f"[Warning] {seqname} not in {mode} meta, but in {the_other_mode} meta") + seq_not_exists.append(seqname) + all_seq_exists = False + if not all_seq_exists: + print("---------------------------------------------") + print(seq_not_exists) + raise 
ValueError(f"{mode} meta not exists, please check the path") + print(f"[Re10k Downloader] {len(sequence_list)} sequences are to download in {mode} mode") + + self.output_root = output_root + os.makedirs(self.output_root, exist_ok=True) + self.mode = mode + # self.sequence_list = sequence_list + + self.isDone = False + + self.list_data = [] + for txt_file in sequence_list: + seq_name = txt_file.split('.')[0] + if osp.exists(osp.join(output_root, seq_name)): + print(f"[Re10k Downloader] {seq_name} already exists, skip") + continue + + # extract info from txt + txt_path = osp.join(self.meta_root, self.mode, txt_file) + with open(txt_path, "r") as seq_file: + lines = seq_file.readlines() + youtube_url = "" + list_timestamps= [] + for idx, line in enumerate(lines): + if idx == 0: + youtube_url = line.strip() + else: + timestamp = int(line.split(' ')[0]) + list_timestamps.append(timestamp) + + isRegistered = False + for i in range(len(self.list_data)): + if youtube_url == self.list_data[i].url: + isRegistered = True + self.list_data[i].add(seq_name, list_timestamps) + else: + pass + + if not isRegistered: + self.list_data.append(Data(youtube_url, seq_name, list_timestamps)) + + print("[Re10k Downloader] {} movies are used in {} mode".format(len(self.list_data), self.mode)) + + + def Run(self, tmp_dir): + print("[Re10k Downloader] Start downloading {} movies".format(len(self.list_data))) + + os.makedirs(tmp_dir, exist_ok=True) + for global_count, data in enumerate(self.list_data): + print("[Re10k Downloader] Downloading {} ".format(data.url)) + try : + # sometimes this fails because of known issues of pytube and unknown factors + yt = YouTube(data.url) + stream = yt.streams.first() + stream.download(tmp_dir, data.url.split("=")[-1]) + except : + failure_log = open(osp.join(self.output_root, 'failed_videos.txt'), 'a') + for seqname in data.list_seqnames: + failure_log.writelines(seqname + '\n') + failure_log.close() + continue + + videoname = osp.join(tmp_dir, 
data.url.split("=")[-1]) + if len(data) == 1: # len(data) is len(data.list_seqnames) + process(data, 0, videoname, self.output_root) + else: + for seq_id in range(len(data)): + process(data, seq_id, videoname, self.output_root) + print("Process {} done".format(seq_id)) + + # remove videos + command = "rm " + videoname + os.system(command) + + if self.isDone: + return False + + return True + + def Show(self): + print("########################################") + global_count = 0 + for data in self.list_data: + print(" URL : {}".format(data.url)) + for idx in range(len(data)): + print(" SEQ_{} : {}".format(idx, data.list_seqnames[idx])) + print(" LEN_{} : {}".format(idx, len(data.list_list_timestamps[idx]))) + global_count = global_count + 1 + print("----------------------------------------") + + print("TOTAL : {} sequnces".format(global_count)) + +if __name__ == "__main__": + # setup_debug(True, 10033) + MODE = "test" + RE10K_METAROOT = "data/re10k/metadata" + OUTPUT_ROOT = "data/re10k" + SEQUENCE_LIST_FILE = "evaluation/preprocess/re10k_test_1800.txt" + TMP_DIR = osp.join(OUTPUT_ROOT, "tmp") + + with open(SEQUENCE_LIST_FILE, "r") as f: + sequence_list = f.read().splitlines() + for idx, seq in enumerate(sequence_list): + sequence_list[idx] = seq + '.txt' if seq[-4:] != '.txt' else seq + + downloader = DataDownloader( + meta_root = RE10K_METAROOT, + output_root = OUTPUT_ROOT, + sequence_list = sequence_list, + mode = MODE, + ) + + downloader.Show() + isOK = downloader.Run(tmp_dir=TMP_DIR) + + if isOK: + print("Done!") + else: + print("Failed") \ No newline at end of file diff --git a/evaluation/preprocess/prepare_re10k.py b/evaluation/preprocess/prepare_re10k.py new file mode 100644 index 00000000..509217a1 --- /dev/null +++ b/evaluation/preprocess/prepare_re10k.py @@ -0,0 +1,90 @@ +import os.path as osp +import json +from typing import List, Tuple +from PIL import Image +from tqdm import tqdm +import gzip +import numpy as np + +def load_seq_cameras(example_path: 
import gzip
import json
import os.path as osp
from typing import List, Tuple


def load_seq_cameras(example_path: str) -> Tuple[List[List[float]], List[List[List[float]]]]:
    """Parse one RealEstate10K per-sequence metadata txt file.

    Each line after the first (the video URL) is:
        timestamp fx fy cx cy k1 k2 e00 e01 ... e23
    where fx/fy/cx/cy are normalized intrinsics and the final 12 values are a
    row-major 3x4 world-to-camera extrinsic matrix
    (see https://google.github.io/realestate10k/download.html).

    Returns:
        (intrinsic_list, extrinsic_list) with one entry per frame:
        intrinsics as [fx, fy, cx, cy], extrinsics as a 3x4 nested list.
    """
    with open(example_path, "r") as f:
        lines = f.read().splitlines()

    intrinsic_list = []
    extrinsic_list = []
    for line in lines[1:]:
        _timestamp, *camera = line.split(" ")
        camera = [float(param) for param in camera]
        intrinsic = camera[:4]  # fx, fy, cx, cy (normalized)
        flat_extrinsic = camera[6:]  # indices 4-5 are unused distortion slots
        extrinsic = [
            flat_extrinsic[i:i + 4]
            for i in range(0, len(flat_extrinsic), 4)
        ]
        intrinsic_list.append(intrinsic)
        extrinsic_list.append(extrinsic)

    return intrinsic_list, extrinsic_list


def main():
    """Convert RealEstate10K metadata into one gzipped JSON annotation file.

    For every sequence: read the first frame to get the image size,
    denormalize the intrinsics into an OpenCV-style 3x3 K, and store the
    3x4 W2C extrinsics per frame. Fix: wrapped in main() so the module can
    be imported without side effects.
    """
    # Imported lazily: only needed when run as a script.
    from PIL import Image
    from tqdm import tqdm

    MODE = "test"
    RE10K_METAROOT = "data/re10k/metadata"
    OUTPUT_ROOT = "data/re10k"
    # SEQUENCE_LIST_FILE = "evaluation/preprocess/re10k_test_1719.txt"
    SEQUENCE_LIST_FILE = "evaluation/preprocess/re10k_test.txt"

    with open(SEQUENCE_LIST_FILE, "r") as f:
        sequence_list = f.read().splitlines()

    out = {}
    for seq in tqdm(sequence_list):
        # Image size is needed to denormalize the intrinsics.
        first_image_path = osp.join(OUTPUT_ROOT, seq, "images", "0000.png")
        width, height = Image.open(first_image_path).size

        seq_meta_file = osp.join(RE10K_METAROOT, MODE, f"{seq}.txt")
        intrinsic_list, extrinsic_list = load_seq_cameras(seq_meta_file)
        seq_info_standard = []
        for idx, (intrinsics, extrinsics) in enumerate(zip(intrinsic_list, extrinsic_list)):
            # Denormalized OpenCV-style 3x3 K.
            fx, fy, cx, cy = intrinsics
            intrinsics = [
                [width * fx, 0, width * cx],
                [0, height * fy, height * cy],
                [0, 0, 1],
            ]
            seq_info_standard.append({
                "filepath": osp.join(seq, "images", f"{idx:04d}.png"),
                "extri": extrinsics,  # OpenCV-style W2C 3x4
                "intri": intrinsics,
            })

        out[seq] = [seq_info_standard]

    root = "/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt"
    with gzip.open(root + "/annotations/re10k/test.jgz", "wt", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    main()
import argparse
import gzip
import json
import os
import os.path as osp

# fmt: off
# Subset of the full 51-category CO3Dv2 list used for evaluation.
CATEGORIES = [
    "apple", "bench", "bowl", "cellphone", "frisbee", "hotdog", "keyboard", "parkingmeter", "teddybear", "toybus",
    "backpack", "book", "car", "donut", "handbag", "hydrant", "motorcycle", "pizza", "stopsign", "toaster", "tv"
]
# fmt: on


def get_parser():
    """Build the CLI parser for CO3Dv2 annotation preprocessing."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--category", type=str, default="apple")
    parser.add_argument("--output_dir", type=str, default="annotations/co3d_v2_annotations")
    parser.add_argument("--co3d_v2_dir", type=str, default="data/co3d_v2")
    parser.add_argument(
        "--min_quality",
        type=float,
        default=0.5,
        help="Minimum viewpoint quality score.",
    )
    return parser


def process_poses(co3d_dir, category, output_dir, min_quality):
    """Extract per-frame camera annotations for one CO3Dv2 category.

    Reads the category's frame/sequence annotations and the few-view subset
    lists, keeps only sequences whose viewpoint quality score exceeds
    `min_quality`, and writes {category}_{train,test}.jgz into `output_dir`.
    """
    category_dir = osp.join(co3d_dir, category)
    print("Processing category:", category)
    frame_file = osp.join(category_dir, "frame_annotations.jgz")
    sequence_file = osp.join(category_dir, "sequence_annotations.jgz")
    subset_lists_file = osp.join(category_dir, "set_lists/set_lists_fewview_dev.json")

    with open(subset_lists_file) as f:
        subset_lists_data = json.load(f)

    with gzip.open(sequence_file, "r") as fin:
        sequence_data = json.loads(fin.read())

    with gzip.open(frame_file, "r") as fin:
        frame_data = json.loads(fin.read())

    # Index frame annotations by (sequence name, frame number) for O(1) lookup.
    frame_data_processed = {}
    for f_data in frame_data:
        sequence_name = f_data["sequence_name"]
        frame_data_processed.setdefault(sequence_name, {})[f_data["frame_number"]] = f_data

    good_quality_sequences = set()
    for seq_data in sequence_data:
        if seq_data["viewpoint_quality_score"] > min_quality:
            good_quality_sequences.add(seq_data["sequence_name"])

    # fix: output_dir may not exist yet; gzip.open would fail below otherwise.
    os.makedirs(output_dir, exist_ok=True)

    for subset in ["train", "test"]:
        category_data = {}  # {sequence_name: [{filepath, R, T, ...}]}
        for seq_name, frame_number, filepath in subset_lists_data[subset]:
            if seq_name not in good_quality_sequences:
                continue

            if seq_name not in category_data:
                category_data[seq_name] = []

            # Renamed from `frame_data` to avoid shadowing the full list above.
            frame_anno = frame_data_processed[seq_name][frame_number]
            category_data[seq_name].append(
                {
                    "filepath": filepath,
                    "R": frame_anno["viewpoint"]["R"],
                    "T": frame_anno["viewpoint"]["T"],
                    "focal_length": frame_anno["viewpoint"]["focal_length"],
                    "principal_point": frame_anno["viewpoint"]["principal_point"],
                }
            )

        output_file = osp.join(output_dir, f"{category}_{subset}.jgz")
        with gzip.open(output_file, "w") as f:
            f.write(json.dumps(category_data).encode("utf-8"))


if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()
    if args.category == "all":
        categories = CATEGORIES
    else:
        categories = [args.category]
    for category in categories:
        process_poses(
            co3d_dir=args.co3d_v2_dir,
            category=category,
            output_dir=args.output_dir,
            min_quality=args.min_quality,
        )
from pathlib import Path
import json
import gzip
import numpy as np
import torch


def read_scannet_pose(path):
    """Read ScanNet's Camera2World pose and return the World2Camera inverse.

    Returns:
        pose_w2c (np.ndarray): (4, 4), or None when the file contains
        non-finite values (some ScanNet pose files are degenerate).
    """
    cam2world = np.loadtxt(path, delimiter=' ')

    if not np.isfinite(cam2world).all():
        return None

    return np.linalg.inv(cam2world)


def read_scannet_intrinsic(path):
    """Read ScanNet's 4x4 intrinsic file; return the top-left 3x3 K as a float tensor."""
    intrinsic = np.loadtxt(path, delimiter=' ')
    return torch.tensor(intrinsic[:-1, :-1], dtype=torch.float)


def main():
    """Collect per-frame W2C poses and intrinsics for ScanNet-1500 into one jgz.

    Fix: wrapped in main() with an import guard so the module can be imported
    without touching the dataset paths.
    """
    from tqdm import tqdm  # lazy: only needed when run as a script

    # Root folder where everything starts.
    root = Path("/mimer/NOBACKUP/groups/3d-dl/scannet/scannet_test_1500")

    out = {}
    valid_frames = 0
    invalid_frames = 0
    for scene_dir in tqdm(root.iterdir()):
        # Skip the auxiliary files/folders living next to the scene dirs.
        if scene_dir.name in ["scannet_indices", "scannet_indices.tar", "intrinsics.npz", "test.npz"]:
            print(f"Skipping {scene_dir.name}")
            continue

        intrinsics = read_scannet_intrinsic(scene_dir / "intrinsic/intrinsic_color.txt")
        frames = sorted([p.name for p in (scene_dir / "color").iterdir() if p.suffix == ".jpg"])
        num_frames = len(frames)

        # The images form one continuous sequence per scene.
        sequence_data = []
        for frame in frames:
            pose_path = scene_dir / "pose" / (frame.replace(".jpg", ".txt"))
            pose_w2c = read_scannet_pose(pose_path)
            if pose_w2c is None:
                print(f"Warning: Pose contains NaN, skipping frame {pose_path}")
                invalid_frames += 1
                continue
            valid_frames += 1

            frame_data = {
                "filepath": f"{scene_dir.name}/color/{frame}",
                "extri": pose_w2c[:3].tolist(),  # W2C, 3x4
                "intri": intrinsics.tolist(),    # 3x3 K
            }
            # Sanity checks on matrix shapes before serialization.
            assert len(pose_w2c) == 4 and len(pose_w2c[0]) == 4
            assert len(intrinsics) == 3 and len(intrinsics[0]) == 3

            sequence_data.append(frame_data)

        out[scene_dir.name] = [sequence_data]
        print(f" Created a sequences for {scene_dir.name} with {len(sequence_data)} frames (out of {num_frames} original frames).")

    print('Valid frames: ', valid_frames)
    print('Invalid frames: ', invalid_frames)
    root = "/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt"

    with gzip.open(root + "/annotations/scannet/scannet_test_1500.jgz", "wt", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=4)

    print(f"Processed {len(out)} scenes with a total of {sum(len(v) for v in out.values())} images.")


if __name__ == "__main__":
    main()
+22598e2596e6bae7 +4938177a0e6e2fbe +1fe394077e7c3de0 +0bd7e6e9f0185aa3 +431795f999dc215f +33288d55dde83e72 +389f65e97bd902b2 +0131c9aed0fb3940 +0c4c5d5f751aabf5 +0de8a88480533be6 +0f1f245fa1c181ae +14bfd05497764243 +45dce690caec2917 +454197dc5b50b45f +47573da5cb0e5e44 +05c423623c9f6f56 +1fd5f9af785e6e5c +05b77cb7c0f79f0f +039cc34e9cdbcf8f +0f47577ab3441480 +47c88dcfb1134255 +3fd084afa49b6499 +15aba05919bae167 +3862411e9bf455cd +30140756550dc38e +16da9b4fdfe4883b +3dc0058dce3828d9 +16a2c55f96e6aa18 +168e0f4071d2fe58 +2fca5797ae48529b +325ff82707386438 +298276fb3c0330e5 +2cd1705407546b72 +33f7565ccb685cb7 +49c758aa3c35ed86 +4c502551adddea8b +2e35fc35559543f2 +31838e9542a906be +23808c0cfcc72e72 +0baa633d2094d2c1 +22552c9a2a2a2ce7 +0ed8b86b87a30d38 +0e728af85650dcb3 +17f552ef56d85c55 +41936ce6152fee64 +05a6149f1fcee38d +4c0ef61c55467706 +46200541f9943d16 +3b7443b24830d388 +04b580ea1f4df0a5 +33baf3e18e5d7256 +01be77405b16df11 +477df7ad0c2e7fdb +2c3a96fb820e1ab7 +3e1af0b953407ef7 +28a5318660ab60ba +3317c40fd3e0a7b7 +293e02c7c1fa31a8 +3dba1838ed366ab5 +0e8995dcbdd22f48 +3814a3a8046c8af3 +32991f419c96ea0e +0068e97c1c1f61aa +242fc4972c7bb385 +0f5c5385dbcd96df +2a1fed061b29b25b +335794d48a9b168a +446f557155994097 +39074bca3524418b +46e2ddf094d0c3a1 +0f18fb6736efb1c2 +3c44d53659dbe4fe +389851bf0ac38227 +0de78cb98105f8c2 +2dc3af70d25d3043 +411c4dd047c49cde +1be80ff36848e758 +2ad09b7837010330 +375cff10cab07955 +0387ef3895b1393c +308de3d523189c72 +297b57a9296052ce +483c2b4c67e32c19 +0c0f298ace7c875b +0a6680fe6e8e09d7 +372f324a1f4d6898 +200ad247448c5577 +03ef5f13e0a30864 +01842c6b21e1d679 +210fc445b9e254f5 +3b1f9cedfc40b06c +03cf40616d79cb6a +03e756bff92d49dd +3c64a373bc1c53bd +04ca03945611febb +0145c694b53b120d +2d524e9324228d6e +3349f3089ea84d6b +1695a74d194b65c0 +3115672ced7c5694 +0c3d3b45ff4a4326 +3bceca99e87d64c5 +4bdfa30358809038 +431e6542fde13130 +0aa8646901d156e4 +1798c9640d8875e6 +3a488ff3afa463e2 +17dba5fa8138ed92 +46fb9c990b6f8114 +0485a8528fa72698 +04433dcf217ad9a0 
+1dad44855584a4ea +052430ff6e2c07c4 +01cf55ae3e378faf +339c95e2e709d044 +0cd63c88350eef60 +46fdfa2a16c7c811 +0e2c96cd97e73a38 +3953f37661087a95 +3d2486ac8822da47 +4c4fa41951e37e78 +14cdf4aa7a2de14b +16cd4f1cb2a467c1 +020991bdfbdbe504 +2b8f367d01df3601 +0d8fd962cbfc81b7 +36e94b0ad3a62c7d +32a2c04fd8321bbb +20100b779d28b6d5 +414e2bf42ee45cc4 +43fee307c6339b5e +1ca4db19711258a0 +45064c8142f3a360 +0aa49c0b75e51ba4 +2d16da80d7e3b64b +454fc1e32db7cc41 +1f7770ac5cbb41eb +3115ce06e0160828 +18ce480be0ececbd +49c9324758b5e867 +2cf1a544b179b1a7 +03f1781c4cc126e6 +3f33ed2971149ea0 +4161944d7d592071 +196ffec2c68cafd5 +45cb862034851efe +1cddaac7be8ecfa5 +3cb1489b614e5f39 +46ea97f6f3757209 +1defdda324307269 +1c11709814a1a2f2 +0f64c5e4fead6cf2 +0c8438d86bb28f7d +2b4a934049f932d0 +1cc1ff58dc89d230 +20d0e788abca4aa9 +22085848f943c2c6 +3bae42d603be2266 +1e847ebd7cd1174e +4c2383e60aaf26cc +12cb4aa3f5b59ad6 +02f801e372d67cfd +165696025b477097 +06954737f53b8688 +464d97e527dd5f8a +19ec130ecea98d5e +3f4f553239e96d90 +306e2b7785657539 +29c8267c1d10b23e +17519e763b34fe14 +1cd3638cceebed08 +3c19657356e9e229 +41f3291d82fc4d93 +0e41af1514f92887 +173a82eeea56aed3 +41649e3e8f9a4be0 +0a4cf8d9b81b4c6e +284efc2041b1d1d8 +29832cbdb4144601 +15bc7fa1ed5567cc +075654f497170f90 +2b625e92f2cf9de4 +40f6d540b9b16531 +2b4c1f50687b2bcf +1d8017cad8dc1d56 +27daccf898b206de +2482c4388b32f225 +22c8b35c589276c4 +21a6081709444ebf +3b6d8db52c54b174 +4b457a008376cd73 +3d0a0fecfbdada35 +2b81fbc1af01f0ad +45166f266dd609a3 +3a86a812a1eaa20e +2347b1e3e70842ac +2b0b2259a7216762 +17a75f0b036c9cf8 +27f772c12c97b594 +332294213fa15c56 +2b973e6f676eb243 +1ce68f950e7cdf8c +2a8ef9e44f580d13 +44be029ec85609c5 +37d4e43b2b029a80 +309bed43e4406d72 +3c9c37132583a3d2 +314c584ff3842715 +17fc81293f337cc3 +0c2b3463c27c5ac3 +0ce3839aa5b66e3f +08868143749f321b +0282160b901229a7 +32d28b7513f873be +31afa3dcf3b737fa +474afb2d4641a228 +3232f7457b27dbbc +036f135766f38f78 +069a4c442912c405 +05a0ad1e2aa632e7 +2e64a2d17f9a76f7 +127884736471b631 
+1214f2a11a9fc1ed +19310eb8261d4bb2 +45108618c40e26a7 +182054e13eaf58fc +05bacb6d6a4741b0 +3d2f4958db5aefbf +07ba5489b56b7d62 +3cc4c306db84c6fe +01cefee9563f691a +46df36a031f50a04 +1be5a5c98a51b1cc +361d722ef5009e09 +3d5114f5d7496cdd +1648b3e7471e3766 +0c9d930d226d6bd6 +1fe9f9bc178a1778 +41c600ed9f88871b +33a5a85f06fcc77b +3ce90c0ea2537c48 +4422b38e60e3bc2f +00969c45a093d43c +489254d4b26a04c6 +3f45b8234504020c +28760a14d0a5ff3a +3a79b9aefafb0b8d +11e62395c85c250d +430d7b5b77861810 +3d8d753f0851bf3b +2e06abf6286040e2 +1be3758972b35151 +154072be49bb3c1d +128bf83073de3ba1 +19b0cd79a126e8bd +055e4612c1ea70f9 +0e2f2538b26a179c +1837b8eebc9c2457 +0eadbf8806794990 +368fa2dd830843c7 +1c29b1d8fd1dab3f +0972074fece891f2 +44b78f9fcb5cd8d8 +4b41d03353967b40 +0c916bcc9351521e +15b902774d67a394 +12b7562944c06836 +252a24d25a1ea81d +0cdfa29561cb24e0 +32d2163aa65c0e8a +08c9e7365f0707a4 +28190e57702bcfbc +247bc2e47eb7f6fa +15bc9eb752c6dcbd +049a98f70ce9f471 +3ea8d9787998f70a +1058fe0400a873e0 +21d9134faec148f2 +11491a312c6b8f58 +4cbb82a6bab25a0f +3a0328ed13dd8c8b +0ed4c9cac4a615fd +07842cc567e9beec +10ad4fc499c48b38 +36056faee50a621f +0752baf20fbc2285 +004e9db3337e8206 +1150003196de2529 +2313ea0fb17cbed6 +0de79d4f3d7a9171 +1259726fc1f8e966 +30bd5b88e47f6d0e +42b88f7ee71a7ba9 +2f5af4b429b2c992 +107b78daf075d371 +4aa1973c40d2eb93 +08718fb99eaafea7 +4a6a057fc644624e +1d7af31482baf61c +002ae53df0e0afe2 +3b273cb40c55db95 +2c02607ae436a9fb +379884ce61c4daa7 +4ce58504b055463e +0b211a1457076450 +390ddb7ee9b716ca +1bc87c160d1dc982 +2f4bcd593fe37158 +0c8c4363e0dca250 +489f9441d513634e +45a4515834848010 +460455f96fa1a1d6 +30127e00a789ed7c +30029fac7c5621de +007876f71baf453f +4cc4c8a8cfa8e944 +375f9c448cf31ccb +114d9c301b847239 +02ee66b3efbf3b0a +2868a1b43e9eceff +3aaed2e6422d7d57 +41d8b4350913ca64 +4b7071b34e8cc67d +1beb8e6662d36ee6 +0bf152ef84195293 +07d0229847bd7408 +04fe4ec70781a0e8 +4b3644bfc6083588 +118f563fe2ed4998 +2d6d5e82bda0611c +43759ced44693671 +485f996ecf360da7 +3b59c7d97b900724 
+2b4e1061f6415a4a +2e86767798c005df +2af206730de6f439 +06eba57d1c333a3c +4bf9ef4705f35e8f +395caf9235fde098 +4526c5ac1bfebcfb +4b5a6dd314bebe88 +1d704b9365e9c86b +36d85599a9cbb6b3 +180542b70f713d5b +10c8e54590f715f7 +4bc1c3a888a8bfba +2c805f56d92a2e22 +1db274e904e3fb07 +06f4bfa5f9d5fe0e +0c1012a308ee2788 +3af70052616f7fa1 +382a5736d9134153 +12497730f691d00c +04ef6a410f034514 +2eb515bff528d3c3 +1e01b910ceba4573 +2c6e76b362eed8e4 +45ac5168bda9d3e2 +1f084607245e4462 +24548ce6c15bc2cf +403951b5d632b5ab +2defb4625a3ccb54 +1ec7e7dce1175aee +145708c0216a06a7 +01eca393f86d37c5 +497d2450ed65a678 +32ce9b303717a29d +18c6473be3bd827a +1227d00562c106e2 +0ccb28128213f19d +407eefe8017f6070 +0bcef9ed1c18f74d +059058768c222bd6 +29f52c76f269ae48 +177d39d72e983b69 +202a627de66ad397 +401a94bd9d84f501 +1eab4db6941be725 +236f9dc6456cf32c +2a8cd9f87b3c9a2b +0d01d4d6c5d5297e +1da33647873725fd +39b58270c2e99310 +2cecdd7df86ff8d3 +4b6e9a02975ef9a4 +3289ad7a811d2348 +3a4d7cbcf0c84668 +28c9d20b865f5d56 +4acd145b0c133dca +4bc7ca44cc62b8b1 +17ca3b8ad5815b35 +072a60bc7e0b0dfd +061f829d3dd2e46e +1203cc23b881ab8d +31d903660e1647c5 +172c99489b18e0e6 +46e653208e529783 +058c67085c217b96 +33a3fc21efdc8547 +4a736d7c30ae9280 +2b3e3c5d30c17bd9 +388cb2f0ff1a6cad +3a52947c66de5920 +3c33566bacd602f2 +45823117f0acb627 +223b5e20753d4fa3 +3c83e9817c9e022d +1e2a2be2df033527 +3e8363be673dafa2 +37d09bda74c92a93 +0fd536fc3c8fdf19 +146581180e89666d +232aee1c62a1cd8b +2177ca3a775a9ee9 +015631b21f792a12 +338ff9f6c02a6a40 +2bcb26e95f5d152b +22b16c2f5af0f3ef +05ac37966de4e7fb +0ecf489d873b7f52 +1b747b8eba6f7b45 +21005252fe2383ba +3f7b6f511421e395 +2a058bafbecaccf9 +45122648522d4180 +2c48ab563b92a1d5 +4766f2062abaaf74 +4308efab35deb3ec +3cc97c3d778975f2 +1c8d34a791deaaf1 +36afe96c11a8211c +0e7b68884ac4d959 +436a235ed74c3d89 +0720bead0cc7cbbf +2e619c31122ee40f +311db764bdc7f537 +3eb718c3170fcd8e +1f4279d98e283206 +48bb743178166598 +2ebedb0f027df101 +0c6b149da098b121 +46067fb6d992860e +0a5eeb4466dd19bb +00ca5123d8ff6f83 
+241aa9bcdbdc7ac6 +283059a56e7f3e75 +38f7ba7fd9a83069 +333c649f75c3c7bc +2f8e1946600c65d4 +1c144e2404c5da89 +2341162bce213f2e +171b3a4c2f95f981 +29791bf60e718c6b +36fb4d41b00581c4 +0d22ced53b1db7d3 +2cc330488326fd4f +1f1a76ed6db1dae5 +19694d2dc528d75d +1efbd8f8949b15e8 +1cca8650a292e7b0 +41aa58e688a04336 +43db8c6515021c01 +2aca85b3bdf90a09 +3172feb32990cf09 +18e659699338f835 +22274e48b847c860 +077d42bb51ee2793 +39c662fa32a3b5c6 +33e5bb3820c171a5 +0db2394602b8b81c +02b59cd60efb924e +4c791225522d45ba +17535adda2aa3b90 +473e6ec61583d90f +10351dc7a37a44c1 +440b5d1587251680 +30006eb23f62aa57 +2d4e81e66ce80039 +2b8778726c1f2fe4 +1df20a29cdec61a4 +03e141d7afac53e4 +3970859f54703c88 +0b79ada01eb45be9 +1c9772d765e0679e +1c73def8a62301a9 +04f5153fc5255516 +4c943ac66f6c277e +2e2ad99d45033d6a +004dd4b46a06e5be +365cc620c2fcbb05 +3b9e04113b202116 +17a489f3cec39fea +2ad50852a84faf51 +1bce163cad1e1d20 +21b9d476c5a49c63 +1047cc04fa16e0d7 +471abe46b812be64 +4260ca20e2430c67 +11337164b772b7c9 +2ca8e34592e0c415 +3edff71624eac3ee +0bb99505a71035cc +2fdfa70413053b84 +2c6fb46edb748fe5 +2a89b2a52cee9f5f +008cd8c450342e49 +14e540cf0ff7ff91 +13c4059da4e56a8c +45a0fe252a89e008 +193c3bd339eb0a75 +20384754a6e5d1b0 +371c9182ffd46ce3 +0cda9adbbedd7948 +2d29ff162920db5e +3c84329b60bfa7cb +3af43fe8d514f7c5 +4aa594c0ad661f28 +2d60837ef2e52abd +0368abd976e8d82e +3f265c5edb13f00d +18be7c1b9895691f +2d62a5d66d6e4931 +446626a2bd617d24 +179ff8424ec7ad13 +040a26b288e7bda4 +45e5fcd5c8978342 +316035dd285c5e27 +3b650a9e2ebdfde2 +20e7a3651ec30386 +2c5e21f9f91e2e09 +0e01be9445403642 +0f7267e7e369b7d6 +4b0fdb10ae15684b +3984d005557cbd6f +24fea6c2c7caa434 +3e4057a188e15ac3 +1cdfd3abfcc3a64e +01b08e2f20321127 +07e8ffa32746c7ce +043c48135c5e8cc2 +29a09527214b3dc5 +09265a9e57075e7c +12d4ca1236a2cf26 +03fe94a439456692 +44d2532c5b5296a1 +024152256b6bcac7 +23a6c9168abdb38e +379470c7d22c498a +098263de57257005 +2f18f5579583e648 +2bec33eeeab0bb9d +28f59b68509ce59d +0af60a9ffd747a1c +15ca8e1fce488c19 +3fcf6c1b81b14af5 
+2f98ee24d3fc43a2 +2ef881551a7fda22 +44ab295bc3092c28 +406bdec5b68b1a71 +204fc9ff2c7ff92c +2a2d971fd44ae258 +4b85062505816744 +3c31d7b9f2792ed9 +4ce642bc93f1bb5a +4221bc1d4aea1a02 +08138c1a3ba1ce8d +037e8191b3985142 +0d46043105cf3185 +20422596003ac855 +42742db2633d2eb5 +3b434b5302dea908 +0588138dfec165a1 +0d68a05801d48984 +40c517d28a412a5a +2f7f2369486cc959 +2afb8a0a98e15155 +01fa6190cd47d125 +3290731e5f908b92 +4b86587ecd3325f4 +33842f4b169e4145 +0f6206df8a8e440a +038137c9569c60eb +1d125b16063c96c4 +03906f66d3bca71a +01f7915dce639515 +04db26572a791881 +171fdcc554d303d8 +0f68374b76390082 +1c5514d49d61bafc +195ca8350ff27f6f +44138776bdbfe28c +45f5a75e63afd4a2 +1b881261742799f7 +2d3f982ada31489c +0a7c052273895bb3 +0667d5bedfdbc555 +0c356641df7c72b8 +42f761f7e655bdce +3c5163ede747b187 +31a642fe4cd1e232 +223c9627b978d127 +0954f5a326941fb1 +1512ea7a9754ac34 +040895d45bf4e580 +05ec2d3e4c027220 +0aa284f8166e19e4 +29f0d7c051d80035 +1d04977fb85a8b3c +36fc018c7b62b997 +0277b87a9c943ed5 +02c0c9192fb9a6a1 +111356766833a7df +09754b77eeea6dae +0a6c499522efa0d5 +0d06be83296cf911 +48a3049cabb54c0d +01e18dbbf22ff263 +44d12349e0609ba3 +10f9d6f46e438d36 +188e6f96fa74ebe7 +0951fdaf9d399411 +08f82d3899d6b726 +179a1357a581ad51 +01cf2d900cb03afb +068140e09ae5ae8f +20f8c6738e22e764 +33dbdc4396938ae1 +44f9aac9faaac569 +1ebb496a04a1bd76 +0b04644621e97d30 +03b3f603a1001de0 +1bf49251fdd23cc0 +3cc40f129447cb31 +3714123f055e06a1 +38d44ebf460ac132 +378cb83947bf2d23 +4054d32655ba5eda +444d7d8445cd444e +3f0d9e856d93b8b3 +2fe5274c4baf665f +25b3854e6efb747d +3824335ebd7a4097 +2a41503583d146ed +452fff658953aab1 +0fb6678e63316201 +397037082e6eb839 +0a0027a48d9ff2ae +1fcfd8a36e171639 +4b4c0c27204604a3 +17a39d87a22ac1ec +47191aa41a979900 +07d1d5769e8d797c +1bf4fe9301893904 +41be40880094d8d1 +0d76da0fcac26af8 +125c92c36a04a68a +39f444469cf39006 +025192166c704a39 +246579087204ba8f +4596160a24b1af1f +2e96b2142fd337dd +1e1e13de4ebea05a +3ad4793daf6adc19 +01570ac1c73e9ca2 +1424acd0007d40b5 +2060cce4dad6f988 
+330b925cef643b3f +1d4d81f8629b119e +286393e1e797cdad +3928ea8b8c134846 +0a72a3fd46a88ef6 +32eba3e4cfb61f93 +00620c2b77518524 +3b1c57027302837f +365fc12b4f33ada3 +0d4de33c6888a754 +487d83675a8d1574 +237ef12cd69e2aa0 +20b38e0a985506ba +289ce0f2b82dcd0e +088e115752ce9e56 +2d3e1349898addb6 +0d5112a7eb22d61a +21e97eb0cfbff775 +22a0db80d91128e4 +32dc25ef78b564a1 +1f0e06e4388dd600 +322261824c4a3003 +0ef68550315f57c7 +06a2911e9add96c6 +0807e84457d5ef58 +4ace59951acbae4e +46b2a13f6ab0be05 +0ede2c8fbe52c1d3 +37697c41773d597a +01497290d8b93a9b +0d5a4dcdf8ec9d36 +0e00a382b62667c0 +2a1769dddc1dbf8d +23df266716914368 +094fd37f09dc318c +2326e9820982ad81 +3ed3ffd0ae9c3224 +20764a96cc70fe46 +166818269e4e2568 +37d0f351f07ee925 +3087828bc27bc4c7 +06a8196a66e125af +49e48d66787ecb8f +0f5bb0704084e290 +3eb795302924e912 +2d0bd035f7df86b0 +2aa1e311e4bc039b +0bcde26e5a802638 +0c5ed899789e60ad +0ac6adb37a92f549 +0122933cf8ab3317 +19391676f0fc7982 +3a9c883b11e86530 +23e428c0dc43f046 +4317016b336431be +37dbb9846f2fdc01 +004334c94bbc8bd5 +0992802044bf665d +1f5df6019b0bb73c +0871e2540b0a6804 +0c609c435b1f7114 +322d03d487fc0f01 +1e9b1dc1c096d68d +1bf668db0194cf83 +08d5cde674e47324 +2a3baeaa72b86812 +0aacb1732fee7a3c +04cb1526cf3c43cc +3a6dc09185951ae3 +3fdaa028b8baad4a +24b4e8ff5a9a6439 +0a3b5fb184936a83 +0b3674ffb90b641a +3c35b868a8ec3433 +44b13221c50914d9 +02cb3a4fd80ee0cc +41fafa6144b58c39 +45e81a557d2dd78b +4ae4456267802484 +3e07add8413f8157 +4c96e034f8af77d6 +38b7a1d23745fbf3 +1c58aa75858147f1 +2e9d6a76c40b707f +362be988d0b68a4c +3ab70559ec30a57c +1433f61e9591ea9b +139a615209ee09ac +4811a66f87b0dd6a +2b1e0225f0952a09 +3db49ddb3f470436 +018f7907401f2fef +0e241f40ce0cd802 +04145b4b73b2d313 +1f73124222492f1c +24fe21f0a899701f +4a1d79baac733df7 +06db5bb2465ae58e +466150f780ad7b80 +1e1742072c0b2d6b +4955f54f807ef5aa +42565e9d863220ae +35d5a242ba40f31f +15e0783c6b9683be +45e6fa48ddd00e87 +396065ee739ea046 +280443260e3dced9 +10c4c9600bfaa4d8 +0181b66a65650830 +1fd615fea825fe87 +15c9a45c9c3d73f2 
+33ac471b97ddf5c5 +3f2bf7371b72e40f +0a45b99f42fb0ecb +2ddca94aefd55b8f +00a54225c5cb1913 +0d6a534d75f20921 +072b4bb46d80484e +4a046d13e389b505 +07559b44fa10672c +07479835711d6f8d +4242fb49c775710c +15b93cbe9fc5220d +10f4acad3ed87288 +453c980210335f26 +291db63458af0613 +49b8f80c849dc341 +05ce34e3cd48c449 +15d324ad8ff2dd83 +3cc1fcf538c81442 +0ab14ffa7e541b0b +4043989a4ae95a01 +4c9030a5917a1328 +21b548b570c0b415 +3c44f7a30e0ad967 +1b88fb5063cd8916 +4c69bf407b142b93 +3059f523501dfd97 +4118895a33890c5a +30b1d229ad4c6353 +18cde229723f22df +000db54a47bd43fe +0e2653d00e3fc05a +3e6d44a66c0d7a0a +4319ddbd5e8f20e1 +459a954b63f98d8a +4a177be7db12edcd +497f507a5901bf4b +25aa5f50072ce7f4 +2276982dbc5a23e2 +47a66fa042406908 +2a7387e017c241a4 +0ff896ed26db5da3 +464e3851f923f8d0 +1e969786d2a8c7a2 +45f2d7abb5fafaa8 +3dcef43736468b29 +2a9d8ba86290db0e +21f3cc00e0cfe8bc +0ef054fbdabce0cd +3784777516b00247 +103777494841b376 +2ffe00ad70fe9c00 +289bcb973ed702a1 +186f18684ed4b516 +0d7f00ff38b135f7 +11a680776863b321 +21b26eda16f7cb88 +285aef90afaaf565 +0ff193c92d415b18 +49f6f14c580b71b9 +1b65111d34f57bb0 +015d8a2a2834d38c +3d394fbabe0e733d +4263257ec6099434 +39b3e7b2a8bf30a6 +0326e5c562bdf1b5 +2dd67f5e68c8d72b +27d163d7046d36b5 +3094afb27266ee6a +3c90ad3bb72adcf8 +374fa34fe701a30d +225ae1d37a7fa519 +1c742e548e8698c9 +16a7b5a41f31feb8 +2245fcd7f76c2ecd +196069a792ebbf31 +0ab163a1b88f1128 +15d8b7e256ffd066 +05c57211be152630 +3b9420585a1e66fc +1f56ccfabbfce568 +0498c9066256055c +2d5e1c16ba1f89c2 +37ff2186f55b3fd8 +0e5b9dcdb891b82b +0fe2286088ece98e +0e714d042fa59506 +3e5f747d06bc84a3 +0f540553fc30f16d +01866b81c3b90f2c +3105d330651c2726 +0ed25f15cbccd939 +234271629f7099df +2e109379f53bb221 +3efda95897eb23d1 +1c7b9f93752085f4 +240e89ef33ff15b6 +2b41e71d509a8b8f +31bf989cb15492d4 +36e69606a7599644 +07b667b34838336d +0b4d5beb7d3bd867 +3fc2c221557a205c +4908fab97c9bcec7 +3b3880eb01373479 +33a29a351c1d9800 +1c919c7e4ec601de +138bb7b0b25e4669 +2e715d4c5bf6c45e +2bff9ec89ca982c9 +167c2e0c6e9ffa5d 
+1fcc400e42725a95 +24668d960406587f +0cb83cef3177a006 +0d0a99d7f22aab71 +11e9cb1ccb9abe9f +4a7f0556fb58a5cc +24d95746999b7f7e +4560f57598efe5ab +16930ebf3f0f6b84 +14f0b962fabfaae2 +4bae8b3980bf32d0 +12dce44829d88985 +3c85dbda51f7e9b3 +49cc37f7f96be5a9 +2e4cac06a4f92261 +31607cc68ada0108 +0f7e8bf1137abcac +20171db88f887218 +3cfb6cb5052ce744 +0bd819cb30a432c6 +4089ef1b1bdb1d36 +283334520a3f8a43 +472e2674ece00632 +17d35e133dc3ce90 +09c1b7a0876c08df +3ce8f87fcfc988a8 +28ed97894371982f +2b43428fa1cf1a7e +0302fcf06bfba582 +30b1dffa5f783ecb +2e261d7661282e40 +29927d9ef0b472d7 +228ed4b87c8a6ea7 +2bd7cee1fa9c8996 +33e23b97daf5d9f0 +46de062e5ff787c5 +027c8c3fc3e7d056 +39f1b33acc70ad7b +0b07051f912592d7 +4675ea4e00c2544d +0ecde93bfa1f08d9 +214df1c2863d2959 +2c52d9d606a3ece2 +4bd922d1e75cc936 +000c3ab189999a83 +389df03f3c2d7291 +0c9b371cc6225682 +2c5249093fc26fde +1930a64d9a119b13 +2007d4829b187feb +19267b6a68d2701b +0539bcecbc483dfe +4a7e531e1a35d424 +22da7610855d6b9d +40954e72e02dc771 +2ed398e8368e0c6f +3e00b129b656fbce +1526707312c94a92 +0553c19e8933374a +33a3f65849195eba +1593596b99e2dde9 +118ffef2ad3950f5 +3d7a1ebc77f683b4 +374b2f4abac6dbb9 +32db375ab51d77a4 +1d2cab92bdcc1453 +33a93f85d5713a71 +4a1b9fb940541809 +150b45a39c57623d +1fa073504b4facaf +20e9bf845c4bd9e7 +3176f1532a468cbd +021575237abe0684 +3260a42ccfba8973 +0bb7da710cbf4bb9 +23beb4b246e236fb +2473b5003a95628c +1564900dd040c718 +41c2ed4944dec77f +132bbf5a9e9626ce +46502a6038bd288f +20fba1d53c349851 +2412e9f45282fd15 +430c6d1f8676fabc +16a75920e79c3710 +2cc5f95fbe24ffe5 +37ff932a6a608c24 +00e8df74b6805da7 +233d7cca6c4c628b +0286d2ed56e8f107 +1735d8d1b4015669 +225c5f2cdcd2753c +069a9416dc6a373b +203c8a4d66c74338 +397bbed49e1ee8dc +1ca02bf1c0b65675 +166bb958d7f4798a +02c485bd207116d4 +369f3639d9605255 +392c21ee30b21459 +0d08611c8b251e15 +1eca36ec55b88fe4 +2ff40df261e17697 +2e715b2e0162f768 +15d4a976e4e7d3dd +1d46e25b06eef337 +23d7f14af4b7ba08 +42cab73e14195475 +24bcb936908f3a31 +4ade6d5fe4b32738 +10a3511b61f40243 
+1d9254a5cb93d4ff +3fc266558ec5c07b +2064e46352532375 +3f6e7ee98174056b +07225d96742d2a6c +17c234c2eec050a0 +2b1a013698fea3a1 +0190fe72a727c853 +11eb02d24a3241a9 +0efeb5654da456c6 +0c824455996db331 +14f477e7d5af5b91 +232a0b3133326242 +1bc87f52eba89cc4 +1402fce28722610b +172bd46c6ddda95d +2c55c5a96b50ab36 +31262e902165f348 +0e3951bf1db22064 +0404d32e97ec1cdb +126067199873816a +095fb57435b7d890 +195074aeac8bbd76 +1910e79a60d57aa7 +0a78dcb828c506f1 +2b4f6fdcabf53d59 +2b1da1fbe7f18f7e +05c48ff6535fbf55 +06dcbfe7cd79bb66 +0bc9fd5c8e50d0ee +18a3593609eb3269 +35b994780d720894 +15c28f4ada02cf91 +15d4131d721f1b5e +0c52996355b23d76 +0c72eaf6bbb7c681 +22d6e3fefb1ee7fe +0f620bfafa25fcf5 +388b75bd17c5332e +3b8167415736169c +08072d6cc8e8711d +178cef169356d4a5 +1072aae07584e091 +18a86c01aaaaaa8b +2422f760ea77551d +21840c44aed0ae43 +10ea3faaa29f4a88 +18502a6651367e71 +31c79c843555c2c6 +43c329f7c0b40258 +00beb03ef95dc637 +2b0cbc443e6c3c6d +2e554e99d045b484 +15ffcb1f98d41218 +39c41be5e76c79c1 +452e9c4e4729ddff +1fcf851e236dec35 +32615afea87b52dd +1f079ee70c21002d +2e04dfa4a1671292 +088b93f15ca8745d +180bf845cc8cada3 +20d86cff490c0c42 +293c7c1ccaa6861a +43db35d743e6be54 +308681a294d1417b +4393f3c42606c573 +16ef89980e2ceef7 +368fae6f3bc0b0f7 +2cc8ef9e5319d5d7 +3a642c6d0e43510b +4636beec02aa8dce +4091c41c6909da3b +203b7543bb3387ce +095441304a817fe9 +05ddd2fee689399c +457a1ed78b1ddb01 +04422a07336e32da +01aaf4ebb084dc16 +27f3ccfb3199499e +1c840c855f0c8421 +3a9fa6535917a07a +04e2be0415136fa9 +09e4d5e8eef7b9c1 +00a5a2af678f37d5 +1c36b2b8144d29ec +171403db6cb88926 +0b970b3417969c89 +0c788e368d993870 +3faac9603907b329 +078b8bdc29565cac +06058474f164c53d +0516a5d959b58cb3 +4634124b21b763f6 +10fbe4690dec6258 +4879ebbcfd888f5b +16b48792e910cf49 +0ece034988793847 +12a5cf6bbe330edb +3bb70a92a0d384e1 +3cf461bc6d626ed0 +007b4ae7c05f2ea2 +36c5e00f55c4f217 +4b341307a872487c +2a695d52faf1949e +1b87f55dec310243 +22b86e38854ad186 +30fceb3b40ec062f +15f8a54e4822f355 +02b618a34bc12ff9 +104c9a27980f9bb7 
+16b667d681f8cf25 +46889bb1803c5cd7 +3eac186b3e7badb2 +02e6fb86b0172f0b +2ac712ac8d2fd488 +45a00d135c5388fc +2c2cfc0ac780a3aa +0c25287b812367cd +3f9b08ed34ec795a +35f89d3ac607bd5a +4949361d0831c838 +03c61595d13e121c +3d8d29bf0d9f24a4 +145da324f69d1c6b +3cfb4c69b14a1970 +4b461c1ec52a3076 +0dd9e020b6d9d687 +16b3eea75ad753ef +450cf402f042bfd2 +21a23c81331b0027 +1e80fd6e7507e3be +3e746126204810a4 +3613c77d8c234008 +310cbf1c65c52fa7 +2cd27189549897bf +36977643258aa392 +37c99410741fbca5 +33f4eeb64d0c9c1a +4cb669bd62a4ffb8 +0c11dbe781b1c11c +234c14a79d4da1ff +493bb055f33cb256 +3dfaa97cd48a0332 +375a7fbf80d09c92 +42cc82972397863b +03aa0437e5d62d58 +20d1b02740ef1124 +2f878176347bcf9e +06c7c747b4542273 +4b5619958277861f +04e4c841b349bf5c +0f12b97e0e4c7e21 +1225476a1221ce08 +0f61837b9749da34 +0712476a67734dee +3185302b2275b009 +0445459f7afb0f48 +2e7ffcba51990c93 +1515d37824dd6b22 +4ba7caa04cea37b7 +47d11d4bee6608e0 +37a6b3200493fac0 +1dcd8aee9a39a61a +3ac32347c3ff7d38 +11d5f4e7b0b17565 +41210bec1c0c87e8 +4a566b7e6eeaf9b4 +005dd9a58df1ba3c +4c2ed13774ae4613 +418ad7b9e78208cb +424f597efdad3067 +401e10a4352fba1c +1de1b73fe4d6aa77 +0f2197967bb7fa43 +2ac7eecd3cd0252f +33517e9838fe5f20 +144c2c2c52734f15 +047c29e9138af233 +0f59c103684c0437 +0542630de1d734de +02b406d1e5e31d5c +494f87170e713843 +47b5d62899ea4869 +024908906fadb408 +4cc48509585e4157 +304dd8f38dbeef0f +433742b23712bf06 +2f3ec1f2335489d2 +443b1691d94c1b3a +318cc6a39c0acc71 +1247b2ac5986205b +3b84ac07fd85bb3f +24d97ac8e96e7a5e +39f03d5fb1807102 +2da38ca64192354f +464d63c227f26d09 +28b23ec38ac5c0b1 +20de87f0b3f2d136 +3b58206d99feb4b7 +41abd737e0228c1a +18d9631f5eb45b87 +4cda491521679291 +3776e900791c1553 +3a10eb9788bcdfa1 +48d4444b94c2a2c0 +40284c1baec06ac9 +1ce8503fd200fed2 +0a8f10a9a68236f7 +00cf0a94235771bb +0ebaed7e3d044bc2 +36bc6918e9fc837c +1db5a4df1ab8b8e7 +03482c3bd66de195 +156b422215789c18 +3e034bde9426ae9f +3fe783b9c7c8f492 +3f68a1e365e94eb4 +17428a1f23edf411 +47cae76cd53c752a +00d83c48cb78ec83 +040a7af97273204f 
+3d410da4d7fd9f64 +0555b07fe6239b4a +4c144eb40a09a0dd +0c89e266974e8b90 +12e3d03a933c2eb4 +48f9cd996f80c34d +28b06f7087798198 +2a39e3b6061dd887 +24895a02057db66e +42c2c85060ab5233 +4a582ee23dd05a8d +2c9b5e69fe7f0338 +2c16104a0ed6c8aa +1a04733e4ee45c90 +11eb4e9eec5048f2 +0869b66f912b845d +33c1bb87a88e59f1 +4ac044dcaa428723 +2e6876c6c1e40652 +13fcc228c40a0e67 +23dcee801bca67bf +075f0d808a621ae2 +298c394f21c62ae6 +0181d3b41c2cf87c +3bad929f21fc4336 +107d3d674fdebd31 +48e49bdd1aa706e1 +1ec6fa3de6fa1bf0 +0f4f779411b45b6f +220d718317f7a025 +31b8eb8bbebed9b8 +3d4645318868a4f3 +37a6ae1e1c6eff66 +4c169a41e66b6599 +1513c8f030f4cbe2 +460e2066b64b2a40 +1cbab4f69b2d48ce +14e3fec07ba502d7 +2ab72e30a616dd21 +2d7d7fb53d960909 +0757b4bc82bf26b1 +195fbef2c08715e9 +0894f0072a8c5fd0 +10ac0ae67d317d11 +48638883e537ccda +373ab0a1009e0316 +12a70416c92a9483 +14417ac810f2024f +3f1c5b36d217d345 +1fb651cd12893f99 +455964aa4ead1e2f +4c8fd5318ae8d467 +0043978734eec081 +404043fe2f398440 +0f97153fcaafc80d +36cb04da872d3bce +1778e784d47e035e +1840ae9e2494443e +498688760312447b +2ceec371086f5d82 +1f2153b5fb50d41a +071e8c0978097efd +0d0f4080d36dfc68 +31b5667e16de1d94 +424397db4b1cf634 +377769942e6a748d +13adf913ea857ddf +0a9f2831a3e73de8 +46a4d49d61a86d37 +24f06a46ea08c03c +420bdc53a6928b32 +41e428b3c7a16695 +219825f542e6ee4b +3d6ed8b43655929b +0afdc571e4667a44 +4be8bf31940bd475 +2552fd444d04ef21 +44c16554a21aa6af +1d36cd02a549e244 +0223924f43297881 +31ee8cabc96b9a62 +397b050e345d73fd +03a78406de1d0993 +218ba0fa826d3eea +2945a940639798ce +1b74274269c75c8d +084fe29cd9d008db +38615248b52e2834 +0b429a4733089487 +01a628e2c509b823 +36f4df3e0a1ade5f +22666111b2180af9 +4c76898a3d535741 +0c4a239e265ae1c3 +0b530eea368f626e +0ba1cadcb191dc0a +13ac6a6a3a4f5e5d +4227369e7d0e735a +15729869d1862b7f +3ced9d0b56769bb7 +2ad4ea800caafe09 +02b2358ff02d3ce8 +35e8c6c2168dd087 +3858fc4475d10c78 +12f45658983d380d +46eccb4820f5a4eb +082087c82daa295d +08c87b4b6b23895f +3fb3327a177a0175 +08291107fc9e9849 +0e060f89ae0a469a 
+1961bb85524de229 +2ccd2d98696c87e0 +2beffa088960f673 +39424be692a88364 +3d60041ab79f46fc +18089956e2be2289 +37de8da2580d0c1d +47a1f1f01e2b7be6 +3de41ace235a3a13 +4881a65d7476d6dd +482ce5c63038e5b4 +42b086af2a1e5d98 +3dcdffe3b9c6235b +2deacec5c281fbac +21e794f71e31becb +2e470f6e2c83566a +4175cb4c71c984ec +49d5b942442449b4 +1eb03f0e3088edf0 +4c2d32a7f2b62657 +1f925fcf391591df +0e4f56edbc3d8cd7 +42c75d578535b0fe +329aba411f341398 +41a55418bee59b11 +0278b3d8abd9654d +0425df3e42ba0de3 +0b1e61c69c98026b +47a07f51fd3fef77 +4500d9faefff3a41 +066c35b1abc706be +4a2d6753676df096 +2a3bd0a2ac422822 +0b1b293ffb0e2f51 +1edc6b95e84127b6 +23cfdadab7cc51a1 +391bc7d21641283e +10c551ef9644ea03 +2be655d4137e6e29 +49ec7608e51f7ee2 +3e577a3be646152b +17d841670d2da942 diff --git a/evaluation/preprocess/re10k_test_1719.txt b/evaluation/preprocess/re10k_test_1719.txt new file mode 100644 index 00000000..90ec9de5 --- /dev/null +++ b/evaluation/preprocess/re10k_test_1719.txt @@ -0,0 +1,1719 @@ +000c3ab189999a83 +000db54a47bd43fe +0017ce4c6a39d122 +002ae53df0e0afe2 +004334c94bbc8bd5 +0043978734eec081 +004dd4b46a06e5be +004e9db3337e8206 +005dd9a58df1ba3c +00620c2b77518524 +0068e97c1c1f61aa +007876f71baf453f +007b4ae7c05f2ea2 +008cd8c450342e49 +0090cc64d7b7bb24 +00969c45a093d43c +00a54225c5cb1913 +00a5a2af678f37d5 +00beb03ef95dc637 +00ca5123d8ff6f83 +00cf0a94235771bb +00cfc0ecd345deb4 +00d83c48cb78ec83 +00e12e215c028984 +00e8df74b6805da7 +0122933cf8ab3317 +0131c9aed0fb3940 +0134d6a876481ed8 +0145c694b53b120d +01497290d8b93a9b +015631b21f792a12 +01570ac1c73e9ca2 +015d8a2a2834d38c +0181b66a65650830 +0181d3b41c2cf87c +01842c6b21e1d679 +01866b81c3b90f2c +018f7907401f2fef +0190fe72a727c853 +0196dedebec3dad2 +01a2277ee817b310 +01a5cc3805e94c21 +01a628e2c509b823 +01aaf4ebb084dc16 +01b08e2f20321127 +01be77405b16df11 +01cefee9563f691a +01cf2d900cb03afb +01cf55ae3e378faf +01e18dbbf22ff263 +01eca393f86d37c5 +01f7915dce639515 +01fa6190cd47d125 +01fe225e2f261d1a +020991bdfbdbe504 +020a41f988981396 
+021575237abe0684 +0223924f43297881 +022a21a897f2a904 +024152256b6bcac7 +024908906fadb408 +025192166c704a39 +02679535c5f06a19 +0277b87a9c943ed5 +0278b3d8abd9654d +027c8c3fc3e7d056 +0282160b901229a7 +0286d2ed56e8f107 +02b2358ff02d3ce8 +02b406d1e5e31d5c +02b59cd60efb924e +02b618a34bc12ff9 +02c0c9192fb9a6a1 +02c485bd207116d4 +02cb3a4fd80ee0cc +02e6fb86b0172f0b +02ee66b3efbf3b0a +02f801e372d67cfd +0302fcf06bfba582 +0326e5c562bdf1b5 +034677cf3d80162d +03482c3bd66de195 +0362399a61c18ad5 +0368abd976e8d82e +036f135766f38f78 +037e8191b3985142 +038137c9569c60eb +0387ef3895b1393c +03906f66d3bca71a +039b153af4fbfba7 +039cc34e9cdbcf8f +03a78406de1d0993 +03aa0437e5d62d58 +03b3f603a1001de0 +03b440db4696d8e7 +03bcb03930ff1ace +03c61595d13e121c +03cf40616d79cb6a +03de2844d3c8314e +03e141d7afac53e4 +03e756bff92d49dd +03ef5f13e0a30864 +03f1781c4cc126e6 +03f551fc4abedc08 +03fe94a439456692 +0404d32e97ec1cdb +040895d45bf4e580 +040a26b288e7bda4 +040a7af97273204f +040de715f9303ba5 +04145b4b73b2d313 +0425df3e42ba0de3 +043c48135c5e8cc2 +04422a07336e32da +04433dcf217ad9a0 +0445459f7afb0f48 +0463d74358aca878 +047c29e9138af233 +0485a8528fa72698 +0498c9066256055c +049a98f70ce9f471 +04b580ea1f4df0a5 +04c441c7ce273dcc +04ca03945611febb +04cb1526cf3c43cc +04db26572a791881 +04e2be0415136fa9 +04e4c841b349bf5c +04ec725465dc5329 +04ef6a410f034514 +04f5153fc5255516 +04fe4ec70781a0e8 +0516a5d959b58cb3 +052430ff6e2c07c4 +0539bcecbc483dfe +053e78d3134437a5 +0542630de1d734de +0553c19e8933374a +0555b07fe6239b4a +055e4612c1ea70f9 +0588138dfec165a1 +058c67085c217b96 +059058768c222bd6 +0598fec76ecc7bd6 +05a0ad1e2aa632e7 +05a6149f1fcee38d +05ac37966de4e7fb +05b1462991e38e4d +05b77cb7c0f79f0f +05bacb6d6a4741b0 +05c423623c9f6f56 +05c48ff6535fbf55 +05c57211be152630 +05ce34e3cd48c449 +05ddd2fee689399c +05ec2d3e4c027220 +05ef56b2656c9318 +06058474f164c53d +061e49ba3a5386c7 +061f829d3dd2e46e +063b857e6470addb +0667d5bedfdbc555 +066c35b1abc706be +068140e09ae5ae8f +06954737f53b8688 +069597e1fe899530 +069a4c442912c405 
+069a9416dc6a373b +06a2911e9add96c6 +06a8196a66e125af +06c71ce295284689 +06c7c747b4542273 +06ca8f480c91e9eb +06d8995be6aa4db6 +06db5bb2465ae58e +06dcbfe7cd79bb66 +06e499374ddafbff +06eba57d1c333a3c +06f4bfa5f9d5fe0e +0712476a67734dee +0718f733a326d65f +071e8c0978097efd +0720bead0cc7cbbf +07225d96742d2a6c +072a60bc7e0b0dfd +072b4bb46d80484e +074653ff3928b9fe +07479835711d6f8d +075278a4d0af74f7 +0752baf20fbc2285 +07559b44fa10672c +075654f497170f90 +0757b4bc82bf26b1 +075f0d808a621ae2 +077d42bb51ee2793 +07842cc567e9beec +078b8bdc29565cac +07b667b34838336d +07ba5489b56b7d62 +07d0229847bd7408 +07d1d5769e8d797c +07d449efdb66c20d +07e8ffa32746c7ce +08072d6cc8e8711d +0807e84457d5ef58 +08138c1a3ba1ce8d +082087c82daa295d +08291107fc9e9849 +084fe29cd9d008db +0869b66f912b845d +08718fb99eaafea7 +0871e2540b0a6804 +0871e5f582cd933c +08868143749f321b +088b93f15ca8745d +088e115752ce9e56 +0894f0072a8c5fd0 +0896b4819e39caf2 +08b8b63abbec8780 +08c87b4b6b23895f +08c9e7365f0707a4 +08d5cde674e47324 +08f82d3899d6b726 +090c672e7e394397 +0915a60e1ae6a826 +09265a9e57075e7c +094fd37f09dc318c +0951fdaf9d399411 +095441304a817fe9 +0954f5a326941fb1 +095fb57435b7d890 +0972074fece891f2 +09754b77eeea6dae +098263de57257005 +0992802044bf665d +09b505bb829c1d12 +09c1b7a0876c08df +09d860b12f6604cb +09e4d5e8eef7b9c1 +0a0027a48d9ff2ae +0a10d55239d83d99 +0a3b5fb184936a83 +0a45b99f42fb0ecb +0a4cf8d9b81b4c6e +0a5eeb4466dd19bb +0a6680fe6e8e09d7 +0a6c499522efa0d5 +0a72a3fd46a88ef6 +0a78dcb828c506f1 +0a7c052273895bb3 +0a8f10a9a68236f7 +0a9f2831a3e73de8 +0aa284f8166e19e4 +0aa49c0b75e51ba4 +0aa8646901d156e4 +0aacb1732fee7a3c +0ab14ffa7e541b0b +0ab163a1b88f1128 +0ac6adb37a92f549 +0af60a9ffd747a1c +0afdc571e4667a44 +0b04644621e97d30 +0b07051f912592d7 +0b1b293ffb0e2f51 +0b1e61c69c98026b +0b211a1457076450 +0b3674ffb90b641a +0b429a4733089487 +0b4d5beb7d3bd867 +0b530eea368f626e +0b55abc1ca2fe909 +0b79ada01eb45be9 +0b970b3417969c89 +0ba1cadcb191dc0a +0baa633d2094d2c1 +0bb7da710cbf4bb9 +0bb99505a71035cc +0bc9fd5c8e50d0ee 
+0bcde26e5a802638 +0bcef9ed1c18f74d +0bd7e6e9f0185aa3 +0bd819cb30a432c6 +0be9a0dcbfe032f1 +0beae06611ead92b +0bf152ef84195293 +0c061512de79b744 +0c0f298ace7c875b +0c1012a308ee2788 +0c11dbe781b1c11c +0c209edeb7637dff +0c24590c68af865f +0c25287b812367cd +0c2b3463c27c5ac3 +0c356641df7c72b8 +0c3d3b45ff4a4326 +0c4a239e265ae1c3 +0c4c5d5f751aabf5 +0c52996355b23d76 +0c5ed899789e60ad +0c609c435b1f7114 +0c6b149da098b121 +0c72eaf6bbb7c681 +0c788e368d993870 +0c824455996db331 +0c8438d86bb28f7d +0c884aee4b01366f +0c89e266974e8b90 +0c8b534612a0a776 +0c8c4363e0dca250 +0c916bcc9351521e +0c9b371cc6225682 +0c9c387ae23d090a +0c9d930d226d6bd6 +0c9ea3bf67254e95 +0cb83cef3177a006 +0cbbc98eec80360a +0cca84503a86574c +0ccb28128213f19d +0cd63c88350eef60 +0cda9adbbedd7948 +0cdfa29561cb24e0 +0ce3839aa5b66e3f +0cf444aef3ba16bd +0d01d4d6c5d5297e +0d06be83296cf911 +0d08611c8b251e15 +0d0a99d7f22aab71 +0d0f4080d36dfc68 +0d1aa0f47c9d2f6d +0d20062086f6d05c +0d22ced53b1db7d3 +0d46043105cf3185 +0d4b941f4678267b +0d4de33c6888a754 +0d5112a7eb22d61a +0d5a4dcdf8ec9d36 +0d68a05801d48984 +0d6a534d75f20921 +0d76da0fcac26af8 +0d7f00ff38b135f7 +0d82dba8f137e3da +0d8fd962cbfc81b7 +0da6a36b24eaf5db +0dad7f2ef3496f13 +0db2394602b8b81c +0dd9e020b6d9d687 +0de6bc7da518fcae +0de78cb98105f8c2 +0de79d4f3d7a9171 +0de8a88480533be6 +0deb1b80eb8481c6 +0e00a382b62667c0 +0e01be9445403642 +0e060f89ae0a469a +0e16d64d961fe855 +0e1a7abc82b1afb2 +0e241f40ce0cd802 +0e2653d00e3fc05a +0e2c96cd97e73a38 +0e2f2538b26a179c +0e3951bf1db22064 +0e41af1514f92887 +0e4f56edbc3d8cd7 +0e512d350465a63c +0e5b9dcdb891b82b +0e5e7fbe8914352c +0e6f8d0eb4103baf +0e714d042fa59506 +0e728af85650dcb3 +0e7b68884ac4d959 +0e8995dcbdd22f48 +0e8a52a174610350 +0eadbf8806794990 +0ebaed7e3d044bc2 +0ebb04534d7f2ba7 +0ecdc87c3391ce98 +0ecde93bfa1f08d9 +0ece034988793847 +0ecf489d873b7f52 +0ed25f15cbccd939 +0ed4c9cac4a615fd +0ed8b86b87a30d38 +0ede2c8fbe52c1d3 +0ef054fbdabce0cd +0ef15055b44649e3 +0ef68550315f57c7 +0efeb5654da456c6 +0f12b97e0e4c7e21 +0f18fb6736efb1c2 
+0f1f245fa1c181ae +0f2197967bb7fa43 +0f25241e37e16f56 +0f47577ab3441480 +0f4f779411b45b6f +0f540553fc30f16d +0f59c103684c0437 +0f5bb0704084e290 +0f5c5385dbcd96df +0f61837b9749da34 +0f6206df8a8e440a +0f620bfafa25fcf5 +0f64c5e4fead6cf2 +0f68374b76390082 +0f7061acbeed50dd +0f7267e7e369b7d6 +0f7e8bf1137abcac +0f97153fcaafc80d +0fb6678e63316201 +0fbe6d76015f75d4 +0fd536fc3c8fdf19 +0fe2286088ece98e +0ff193c92d415b18 +0ff896ed26db5da3 +0ffd1083a70c6968 +10002e18d04c3d93 +1025622b7f308760 +10351dc7a37a44c1 +103777494841b376 +1047cc04fa16e0d7 +104c9a27980f9bb7 +1058fe0400a873e0 +1072aae07584e091 +107b78daf075d371 +107d3d674fdebd31 +10a3511b61f40243 +10ac0ae67d317d11 +10ad4fc499c48b38 +10c4c9600bfaa4d8 +10c551ef9644ea03 +10c8e54590f715f7 +10ea3faaa29f4a88 +10f4acad3ed87288 +10f9d6f46e438d36 +10fbe4690dec6258 +111356766833a7df +11337164b772b7c9 +11491a312c6b8f58 +114d9c301b847239 +1150003196de2529 +115fa3a1923b7c9f +117335f5d67368ca +118f563fe2ed4998 +118ffef2ad3950f5 +11a680776863b321 +11bf4b38f88bfe9b +11d5f4e7b0b17565 +11e62395c85c250d +11e9cb1ccb9abe9f +11eb02d24a3241a9 +11eb4e9eec5048f2 +11fcbcdb1dfefb38 +1203cc23b881ab8d +1214f2a11a9fc1ed +1225476a1221ce08 +1227d00562c106e2 +12293b264f68673d +122cb7d5ea4a99df +1247b2ac5986205b +12497730f691d00c +1250e369ad1e2fbc +1259726fc1f8e966 +125c92c36a04a68a +126067199873816a +12627e75d51b372e +12691b0622a823ba +127884736471b631 +128bf83073de3ba1 +129b5e9b80cc4b4b +12a5cf6bbe330edb +12a70416c92a9483 +12b7562944c06836 +12cb4aa3f5b59ad6 +12d4ca1236a2cf26 +12dc074fab6ada73 +12dce44829d88985 +12e3b1ad12a752f6 +12e3d03a933c2eb4 +12e6ba92e82c7ca4 +12e985eaa4b79298 +12f45658983d380d +131773f46d989860 +132bbf5a9e9626ce +133ee6e537353604 +134d7e5a74497a82 +13686755488a9d51 +138bb7b0b25e4669 +139055b26734436f +139a615209ee09ac +13ac6a6a3a4f5e5d +13adf913ea857ddf +13c4059da4e56a8c +13c510a7403f8231 +13fcc228c40a0e67 +1402fce28722610b +1424acd0007d40b5 +1433f61e9591ea9b +14417ac810f2024f +144b95c0c3fbe3b0 +144c2c2c52734f15 +145708c0216a06a7 
+145da324f69d1c6b +146581180e89666d +14a5b002ce46d4d3 +14bfd05497764243 +14cdf4aa7a2de14b +14cf1f92ca13d605 +14e3fec07ba502d7 +14e540cf0ff7ff91 +14f0b962fabfaae2 +14f477e7d5af5b91 +150b45a39c57623d +1512ea7a9754ac34 +1513c8f030f4cbe2 +1515d37824dd6b22 +1526707312c94a92 +154072be49bb3c1d +154813fc1d6820dc +1564900dd040c718 +156b422215789c18 +156f4c7dca878ff2 +15729869d1862b7f +15762acaba295de1 +1593596b99e2dde9 +15a138312ad94718 +15aba05919bae167 +15ac594106229c62 +15b902774d67a394 +15b93cbe9fc5220d +15bc7fa1ed5567cc +15bc9eb752c6dcbd +15c28f4ada02cf91 +15c9a45c9c3d73f2 +15ca8e1fce488c19 +15d324ad8ff2dd83 +15d4131d721f1b5e +15d4a976e4e7d3dd +15d8b7e256ffd066 +15e0783c6b9683be +15f8a54e4822f355 +15ffcb1f98d41218 +1648b3e7471e3766 +165696025b477097 +166818269e4e2568 +166bb958d7f4798a +167c2e0c6e9ffa5d +168c85ce00de0c6b +168e0f4071d2fe58 +1692fab166811028 +16930ebf3f0f6b84 +1695a74d194b65c0 +16a23081fd821e92 +16a2c55f96e6aa18 +16a75920e79c3710 +16a7b5a41f31feb8 +16b3eea75ad753ef +16b48792e910cf49 +16b667d681f8cf25 +16b8ab24bd231f9a +16c4484a4093e2f6 +16ca2db2f920c1b4 +16cd4f1cb2a467c1 +16da9b4fdfe4883b +16ed45d1ce9017df +16ef89980e2ceef7 +171403db6cb88926 +171b3a4c2f95f981 +171fdcc554d303d8 +172bd46c6ddda95d +172c99489b18e0e6 +1735d8d1b4015669 +173a82eeea56aed3 +17428a1f23edf411 +17519e763b34fe14 +17535adda2aa3b90 +1778e784d47e035e +177d39d72e983b69 +177ff3969577b8de +178cef169356d4a5 +1798c9640d8875e6 +179a1357a581ad51 +179ff8424ec7ad13 +17a39d87a22ac1ec +17a3e731c4fe0aaf +17a489f3cec39fea +17a75f0b036c9cf8 +17c234c2eec050a0 +17ca3b8ad5815b35 +17d35e133dc3ce90 +17d841670d2da942 +17d9303ee77c3a3d +17dba5fa8138ed92 +17f552ef56d85c55 +17fc81293f337cc3 +180542b70f713d5b +18089956e2be2289 +180bf845cc8cada3 +182054e13eaf58fc +18297e1f8e25d3ee +1837b8eebc9c2457 +1839244b04a05e5a +1840ae9e2494443e +18502a6651367e71 +1859766466c069b1 +186f18684ed4b516 +188e6f96fa74ebe7 +18a3593609eb3269 +18a86c01aaaaaa8b +18be7c1b9895691f +18c6473be3bd827a +18cde229723f22df +18ce480be0ececbd 
+18d9631f5eb45b87 +18e659699338f835 +1910e79a60d57aa7 +19267b6a68d2701b +1930a64d9a119b13 +19310eb8261d4bb2 +19391676f0fc7982 +193c3bd339eb0a75 +195074aeac8bbd76 +195ca8350ff27f6f +195fbef2c08715e9 +196069a792ebbf31 +1961bb85524de229 +19694d2dc528d75d +196ffec2c68cafd5 +19a28e5c25feb31f +19b0cd79a126e8bd +19ec130ecea98d5e +1a04733e4ee45c90 +1b65111d34f57bb0 +1b74274269c75c8d +1b747b8eba6f7b45 +1b87f55dec310243 +1b881261742799f7 +1b88fb5063cd8916 +1bc87c160d1dc982 +1bc87f52eba89cc4 +1bce163cad1e1d20 +1bdf9dd7628ddb0b +1be3758972b35151 +1be5a5c98a51b1cc +1be80ff36848e758 +1beb8e6662d36ee6 +1bf49251fdd23cc0 +1bf4fe9301893904 +1bf668db0194cf83 +1c11709814a1a2f2 +1c144e2404c5da89 +1c1b2e56952040cc +1c29b1d8fd1dab3f +1c36b2b8144d29ec +1c375830155fe6dc +1c5514d49d61bafc +1c58aa75858147f1 +1c73def8a62301a9 +1c742e548e8698c9 +1c7b3ccd5482f834 +1c7b9f93752085f4 +1c840c855f0c8421 +1c8d34a791deaaf1 +1c919c7e4ec601de +1c9772d765e0679e +1ca02bf1c0b65675 +1ca4db19711258a0 +1cbab4f69b2d48ce +1cc1ff58dc89d230 +1cca8650a292e7b0 +1cd3638cceebed08 +1cddaac7be8ecfa5 +1cdfd3abfcc3a64e +1ce68f950e7cdf8c +1ce8503fd200fed2 +1d04977fb85a8b3c +1d125b16063c96c4 +1d1cafa3e27da040 +1d2cab92bdcc1453 +1d36cd02a549e244 +1d46e25b06eef337 +1d4d81f8629b119e +1d704b9365e9c86b +1d748783383ad977 +1d7af31482baf61c +1d8017cad8dc1d56 +1d9254a5cb93d4ff +1da33647873725fd +1dad44855584a4ea +1db274e904e3fb07 +1db5a4df1ab8b8e7 +1dcd8aee9a39a61a +1dd869f66c3c9497 +1de1b73fe4d6aa77 +1deec169175eb15b +1defdda324307269 +1df20a29cdec61a4 +1e01b910ceba4573 +1e0f97ec8f5aa374 +1e1742072c0b2d6b +1e1e13de4ebea05a +1e2a2be2df033527 +1e5548d951e91a40 +1e80fd6e7507e3be +1e847ebd7cd1174e +1e969786d2a8c7a2 +1e9b1dc1c096d68d +1eab4db6941be725 +1eb03f0e3088edf0 +1ebb496a04a1bd76 +1ec6fa3de6fa1bf0 +1ec7e7dce1175aee +1eca36ec55b88fe4 +1ed5e32330ec25e8 +1edc6b95e84127b6 +1ef9f5dfa615fafe +1efbd8f8949b15e8 +1f079ee70c21002d +1f084607245e4462 +1f0e06e4388dd600 +1f0f5e82e9d0f9ee +1f1a76ed6db1dae5 +1f2153b5fb50d41a +1f4279d98e283206 
+1f56ccfabbfce568 +1f5df6019b0bb73c +1f62165dfec00c3e +1f73124222492f1c +1f7770ac5cbb41eb +1f925fcf391591df +1fa073504b4facaf +1fb651cd12893f99 +1fcc400e42725a95 +1fcf851e236dec35 +1fcfd8a36e171639 +1fd5f9af785e6e5c +1fd615fea825fe87 +1fe394077e7c3de0 +1fe9f9bc178a1778 +1fef543188ace6e5 +2007d4829b187feb +200ad247448c5577 +20100b779d28b6d5 +20171db88f887218 +202a627de66ad397 +20384754a6e5d1b0 +203b7543bb3387ce +203c8a4d66c74338 +20422596003ac855 +204fc9ff2c7ff92c +2060cce4dad6f988 +2064e46352532375 +206d9828d717139e +2075a2388413899d +20764a96cc70fe46 +20b38e0a985506ba +20b541350492e3ad +20bab82d0268c877 +20d0e788abca4aa9 +20d1b02740ef1124 +20d86cff490c0c42 +20de87f0b3f2d136 +20e7a3651ec30386 +20e9bf845c4bd9e7 +20f8c6738e22e764 +20fba1d53c349851 +21005252fe2383ba +210fc445b9e254f5 +213286ef58a8c73d +214c597029aebf9f +214df1c2863d2959 +2177ca3a775a9ee9 +21840c44aed0ae43 +218ba0fa826d3eea +219825f542e6ee4b +21a23c81331b0027 +21a6081709444ebf +21b26eda16f7cb88 +21b548b570c0b415 +21b9d476c5a49c63 +21d9134faec148f2 +21e794f71e31becb +21e97eb0cfbff775 +21f3cc00e0cfe8bc +22085848f943c2c6 +220d718317f7a025 +221205ddf59c5156 +2217c43ddaa29027 +22274e48b847c860 +2236eab1c5c86fc4 +223b5e20753d4fa3 +223c9627b978d127 +2245fcd7f76c2ecd +224e9686747afbb5 +22552c9a2a2a2ce7 +22598e2596e6bae7 +225ae1d37a7fa519 +225c5f2cdcd2753c +226646771975e6db +22666111b2180af9 +2276982dbc5a23e2 +227c21a8dd87a153 +227d63ed9a678fb1 +227e06087cbffb2b +228ed4b87c8a6ea7 +22a0db80d91128e4 +22b16c2f5af0f3ef +22b86e38854ad186 +22c8b35c589276c4 +22d6e3fefb1ee7fe +22da7610855d6b9d +22e6c736f2f7227b +23099812f662b3ec +2313ea0fb17cbed6 +2326e9820982ad81 +232a0b3133326242 +232aee1c62a1cd8b +233d7cca6c4c628b +2341162bce213f2e +234271629f7099df +2347b1e3e70842ac +234c14a79d4da1ff +236f9dc6456cf32c +237ef12cd69e2aa0 +23808c0cfcc72e72 +23a6c9168abdb38e +23beb4b246e236fb +23cfdadab7cc51a1 +23d7f14af4b7ba08 +23dcee801bca67bf +23df266716914368 +23e428c0dc43f046 +240e89ef33ff15b6 +2412e9f45282fd15 +241aa9bcdbdc7ac6 
+2422f760ea77551d +242fc4972c7bb385 +2445756494ef6e3d +24548ce6c15bc2cf +24598987691df957 +246579087204ba8f +24668d960406587f +2473b5003a95628c +247bc2e47eb7f6fa +2482c4388b32f225 +24895a02057db66e +249fd0890d439aa9 +24ad46fd2b26b208 +24b4e8ff5a9a6439 +24bcb936908f3a31 +24d1c4f7497f8e77 +24d95746999b7f7e +24d97ac8e96e7a5e +24f06a46ea08c03c +24fe21f0a899701f +24fea6c2c7caa434 +2516b6023683fc3e +252a24d25a1ea81d +25468a86d9cd851e +2552fd444d04ef21 +259f6f1f002d2d94 +25aa5f50072ce7f4 +25ad11c04b852de5 +25b3854e6efb747d +27d163d7046d36b5 +27daccf898b206de +27f3ccfb3199499e +27f772c12c97b594 +280443260e3dced9 +2807e5ac66c140cc +281452e730c39fd0 +28190e57702bcfbc +282ad05cb0113543 +283059a56e7f3e75 +283334520a3f8a43 +2837dd5c75e026f5 +284efc2041b1d1d8 +285aef90afaaf565 +286393e1e797cdad +2864dc6c129cf3cc +2868a1b43e9eceff +28742766eb882cd2 +28760a14d0a5ff3a +288dd40199dee268 +289bcb973ed702a1 +289ce0f2b82dcd0e +28a5318660ab60ba +28b06f7087798198 +28b23ec38ac5c0b1 +28c9d20b865f5d56 +28e8300e004ab30b +28ed97894371982f +28f59b68509ce59d +28f5ebf3c3e2fe54 +291bc22350620114 +291db63458af0613 +293c7c1ccaa6861a +293e02c7c1fa31a8 +2945a940639798ce +29460ac4580ea232 +296241182c2df900 +296c87d370b03f17 +296ec0d98f4d4151 +29791bf60e718c6b +297b57a9296052ce +298276fb3c0330e5 +29832cbdb4144601 +298c394f21c62ae6 +29927d9ef0b472d7 +29a09527214b3dc5 +29c8267c1d10b23e +29e0bfbad00f0d5e +29f0d7c051d80035 +29f52c76f269ae48 +2a058bafbecaccf9 +2a1769dddc1dbf8d +2a1fed061b29b25b +2a2d971fd44ae258 +2a30c5309018e00f +2a39e3b6061dd887 +2a3baeaa72b86812 +2a3bd0a2ac422822 +2a41503583d146ed +2a4d835a6e023621 +2a695d52faf1949e +2a7387e017c241a4 +2a89b2a52cee9f5f +2a8cd9f87b3c9a2b +2a8ef9e44f580d13 +2a9d8ba86290db0e +2aa1e311e4bc039b +2ab72e30a616dd21 +2ac712ac8d2fd488 +2ac7eecd3cd0252f +2aca85b3bdf90a09 +2ad09b7837010330 +2ad4ea800caafe09 +2ad50852a84faf51 +2af206730de6f439 +2afb8a0a98e15155 +2b0b2259a7216762 +2b0cbc443e6c3c6d +2b1303680b081ccb +2b1a013698fea3a1 +2b1da1fbe7f18f7e +2b1e0225f0952a09 
+2b38cc883c900d33 +2b3e3c5d30c17bd9 +2b41e71d509a8b8f +2b43428fa1cf1a7e +2b4a934049f932d0 +2b4c1f50687b2bcf +2b4e1061f6415a4a +2b4f6fdcabf53d59 +2b5b5b4f4fc526ba +2b625e92f2cf9de4 +2b81fbc1af01f0ad +2b8778726c1f2fe4 +2b8f367d01df3601 +2b973e6f676eb243 +2bcb26e95f5d152b +2bd7cee1fa9c8996 +2be655d4137e6e29 +2bec33eeeab0bb9d +2beffa088960f673 +2bff9ec89ca982c9 +2c02607ae436a9fb +2c04a38ae16197b2 +2c16104a0ed6c8aa +2c2cfc0ac780a3aa +2c3a96fb820e1ab7 +2c48ab563b92a1d5 +2c5249093fc26fde +2c52d9d606a3ece2 +2c55c5a96b50ab36 +2c596a5abfd67267 +2c5e21f9f91e2e09 +2c6e76b362eed8e4 +2c6fb46edb748fe5 +2c805f56d92a2e22 +2c80f9eb0d3b2bb4 +2c9018ef57c6b061 +2c9b5e69fe7f0338 +2ca8e34592e0c415 +2cb9869cb05a9a01 +2cc330488326fd4f +2cc5f95fbe24ffe5 +2cc8ef9e5319d5d7 +2ccd2d98696c87e0 +2cd1705407546b72 +2cd27189549897bf +2cecdd7df86ff8d3 +2ceec371086f5d82 +2cf1a544b179b1a7 +2d0bd035f7df86b0 +2d0e6766c725becc +2d16da80d7e3b64b +2d29ff162920db5e +2d39f39fb8254c27 +2d3e1349898addb6 +2d3f982ada31489c +2d4e81e66ce80039 +2d524e9324228d6e +2d5e1c16ba1f89c2 +2d60837ef2e52abd +2d62a5d66d6e4931 +2d6d5e82bda0611c +2d6f9fa00dcee664 +2d7d7fb53d960909 +2d815b3e5e9bb237 +2d8f1ccdb70c156a +2d99ba7951695f79 +2da082dfb7a66b4d +2da2eeb966bc0ef8 +2da38ca64192354f +2dc3af70d25d3043 +2dd67f5e68c8d72b +2ddca94aefd55b8f +2deacec5c281fbac +2defb4625a3ccb54 +2e04dfa4a1671292 +2e06abf6286040e2 +2e109379f53bb221 +2e261d7661282e40 +2e2ad99d45033d6a +2e30ae101611cca8 +2e35fc35559543f2 +2e4013ea92d04301 +2e470f6e2c83566a +2e4c69143b09033c +2e4cac06a4f92261 +2e554e99d045b484 +2e619c31122ee40f +2e64a2d17f9a76f7 +2e6876c6c1e40652 +2e715b2e0162f768 +2e715d4c5bf6c45e +2e7b5f8836ab642c +2e7ffcba51990c93 +2e86767798c005df +2e96b2142fd337dd +2e9d6a76c40b707f +2ea3133861ebde3b +2eb515bff528d3c3 +2ebedb0f027df101 +2ed398e8368e0c6f +2ef881551a7fda22 +2f18f5579583e648 +2f25826f0d0ef09a +2f3ec1f2335489d2 +2f4bcd593fe37158 +2f5af4b429b2c992 +2f7f2369486cc959 +2f878176347bcf9e +2f8e1946600c65d4 +2f98ee24d3fc43a2 +2fbb94b8cf388ba7 
+2fca5797ae48529b +2fdfa70413053b84 +2fe5274c4baf665f +2ff40df261e17697 +2ffe00ad70fe9c00 +30006eb23f62aa57 +30029fac7c5621de +30127e00a789ed7c +30140756550dc38e +3015a3eab4b6d042 +3018aa8ad3eb5dca +3021337b3fbdb2f3 +304dd8f38dbeef0f +3052da93ceeda447 +3059f523501dfd97 +3069cc190d78e55e +306d435307fef477 +306e2b7785657539 +308681a294d1417b +3087828bc27bc4c7 +308de3d523189c72 +3094afb27266ee6a +309bed43e4406d72 +30abedc6c413510e +30b1d229ad4c6353 +30b1dffa5f783ecb +30bd5b88e47f6d0e +30efdcef9f38568b +30fc5fc78c5a716e +30fceb3b40ec062f +3105d330651c2726 +310cbf1c65c52fa7 +3115672ced7c5694 +3115ce06e0160828 +311d8515bd115aba +311db764bdc7f537 +31262e902165f348 +312c8e1f1f9f3594 +314c56aad151508d +314c584ff3842715 +316035dd285c5e27 +31607cc68ada0108 +3164f4c30188d403 +316ed1b489ff8f40 +3172feb32990cf09 +3176f1532a468cbd +31838e9542a906be +3185302b2275b009 +318cc6a39c0acc71 +31a087ee5b1976da +31a642fe4cd1e232 +31afa3dcf3b737fa +31b5667e16de1d94 +31b8eb8bbebed9b8 +31bf989cb15492d4 +31c79c843555c2c6 +31d903660e1647c5 +31ee8cabc96b9a62 +321911ae4a16f038 +322261824c4a3003 +32294ad73efca3db +322d03d487fc0f01 +3232f7457b27dbbc +325ff82707386438 +3260a42ccfba8973 +32615afea87b52dd +3289ad7a811d2348 +3290731e5f908b92 +32991f419c96ea0e +329aba411f341398 +329abf340d23b0b9 +32a2c04fd8321bbb +32cccd10c84b4529 +32ce9b303717a29d +32d2163aa65c0e8a +32d28b7513f873be +32db375ab51d77a4 +32dc25ef78b564a1 +32dfef9109202812 +32eba3e4cfb61f93 +330b925cef643b3f +3317c40fd3e0a7b7 +332294213fa15c56 +33288d55dde83e72 +333c649f75c3c7bc +3349f3089ea84d6b +33517e9838fe5f20 +335794d48a9b168a +33842f4b169e4145 +338ff9f6c02a6a40 +339c95e2e709d044 +33a29a351c1d9800 +33a3f65849195eba +33a3fc21efdc8547 +33a5a85f06fcc77b +33a93f85d5713a71 +33ac471b97ddf5c5 +33afdeba3cd5af05 +33baf3e18e5d7256 +33be1ba5aec86c96 +33c1bb87a88e59f1 +33dbdc4396938ae1 +33e23b97daf5d9f0 +33e5bb3820c171a5 +33f1be3a9ccf4e4b +33f242465f51563c +33f4eeb64d0c9c1a +33f7565ccb685cb7 +35b15e97674edec3 +35b994780d720894 +35d5a242ba40f31f 
+35e8c6c2168dd087 +35eeb3ce1b3dd01d +35f89d3ac607bd5a +36056faee50a621f +3613c77d8c234008 +361d722ef5009e09 +3628ec0337eae7be +362be988d0b68a4c +365cc620c2fcbb05 +365fc12b4f33ada3 +3662b5f2916b470c +368fa2dd830843c7 +368fae6f3bc0b0f7 +36977643258aa392 +369f3639d9605255 +36afe96c11a8211c +36bc6918e9fc837c +36bd5d3f3ee292cc +36c5e00f55c4f217 +36cb04da872d3bce +36d85599a9cbb6b3 +36e69606a7599644 +36e94b0ad3a62c7d +36f4df3e0a1ade5f +36fb4d41b00581c4 +36fc018c7b62b997 +3714123f055e06a1 +371c9182ffd46ce3 +372bb866143b6e35 +372f324a1f4d6898 +373ab0a1009e0316 +374b2f4abac6dbb9 +374fa34fe701a30d +375a7fbf80d09c92 +375cff10cab07955 +375f9c448cf31ccb +37697c41773d597a +3776e900791c1553 +377769942e6a748d +3783162ee796a21c +3784777516b00247 +378cb83947bf2d23 +379470c7d22c498a +379884ce61c4daa7 +37a6ae1e1c6eff66 +37a6b3200493fac0 +37a960afb176c485 +37b0b70a1a0c25d3 +37bbec9e46c8c1a9 +37c99410741fbca5 +37ce88a12d77382b +37d09bda74c92a93 +37d0f351f07ee925 +37d4e43b2b029a80 +37dbb9846f2fdc01 +37de8da2580d0c1d +37ff2186f55b3fd8 +37ff932a6a608c24 +3814a3a8046c8af3 +3824335ebd7a4097 +382a5736d9134153 +3858fc4475d10c78 +385f9444d20eb160 +38615248b52e2834 +3862411e9bf455cd +388b75bd17c5332e +388cb2f0ff1a6cad +388ed39170b69946 +389851bf0ac38227 +389df03f3c2d7291 +389f65e97bd902b2 +38a9a0f5e76103d2 +38b7a1d23745fbf3 +38d44ebf460ac132 +38f7ba7fd9a83069 +39074bca3524418b +390ddb7ee9b716ca +391bc7d21641283e +3928ea8b8c134846 +392c21ee30b21459 +394037c064421c3e +39424be692a88364 +3953f37661087a95 +395caf9235fde098 +396065ee739ea046 +397037082e6eb839 +3970859f54703c88 +397b050e345d73fd +397bbed49e1ee8dc +397da8e32c2edd65 +3984d005557cbd6f +398c4688209874c9 +39987911e4cf003c +399cfd9cfacc0499 +39b3e7b2a8bf30a6 +39b58270c2e99310 +39c41be5e76c79c1 +39c662fa32a3b5c6 +39e5f256790c3343 +39f03d5fb1807102 +39f1b33acc70ad7b +39f444469cf39006 +3a0328ed13dd8c8b +3a10eb9788bcdfa1 +3a126bd9702ee8f7 +3a3bc11b9ebb7d44 +3a488ff3afa463e2 +3a48dfbd2f0977f9 +3a4d7cbcf0c84668 +3a52947c66de5920 +3a642c6d0e43510b 
+3a6dc09185951ae3 +3a79b9aefafb0b8d +3a86a812a1eaa20e +3a9c883b11e86530 +3a9fa6535917a07a +3aae131a319acd17 +3aaed2e6422d7d57 +3ab70559ec30a57c +3ac32347c3ff7d38 +3ad4793daf6adc19 +3af43fe8d514f7c5 +3af4c4e5a8ced21e +3af70052616f7fa1 +3b0b55657925fb34 +3b122e1becb5fcb7 +3b1c57027302837f +3b1f9cedfc40b06c +3b273cb40c55db95 +3b3880eb01373479 +3b434b5302dea908 +3b58206d99feb4b7 +3b59c7d97b900724 +3b650a9e2ebdfde2 +3b67613d97aac1df +3b676f25b54dcc1c +3b6d8db52c54b174 +3b7443b24830d388 +3b8167415736169c +3b84ac07fd85bb3f +3b9420585a1e66fc +3b9e04113b202116 +3bad929f21fc4336 +3bae42d603be2266 +3bb1007fcf0e03ff +3bb70a92a0d384e1 +3bceca99e87d64c5 +3be029de36008afc +3c054be9bdb304ee +3c19657356e9e229 +3c31d7b9f2792ed9 +3c33566bacd602f2 +3c35b868a8ec3433 +3c44d53659dbe4fe +3c44f7a30e0ad967 +3c5163ede747b187 +3c64a373bc1c53bd +3c83e9817c9e022d +3c84329b60bfa7cb +3c85dbda51f7e9b3 +3c90ad3bb72adcf8 +3c9c37132583a3d2 +3cb1489b614e5f39 +3cc1fcf538c81442 +3cc40f129447cb31 +3cc4c306db84c6fe +3cc97c3d778975f2 +3ce8f87fcfc988a8 +3ce90c0ea2537c48 +3ced9d0b56769bb7 +3cf461bc6d626ed0 +3cfb4c69b14a1970 +3cfb6cb5052ce744 +3d0a0fecfbdada35 +3d2486ac8822da47 +3d2f4958db5aefbf +3d394fbabe0e733d +3d410da4d7fd9f64 +3d4645318868a4f3 +3d5114f5d7496cdd +3d5125567924e37b +3d584707e2f3ccf3 +3d60041ab79f46fc +3d6e04af63ebfea4 +3d6ed8b43655929b +3d7a1ebc77f683b4 +3d8d29bf0d9f24a4 +3d8d753f0851bf3b +3d986ec2fd6d210d +3db49ddb3f470436 +3dba1838ed366ab5 +3dba9cb74bfb79b2 +3dc0058dce3828d9 +3dcdffe3b9c6235b +3dcef43736468b29 +3dd211f3865fc234 +3de41ace235a3a13 +3dfaa97cd48a0332 +3e00b129b656fbce +3e034bde9426ae9f +3e07add8413f8157 +3e1236935a5f70ae +3e1af0b953407ef7 +3e3d858083d20eab +3e4057a188e15ac3 +3e577a3be646152b +3e5f747d06bc84a3 +3e68931874661724 +3e6d44a66c0d7a0a +3e746126204810a4 +3e8363be673dafa2 +3e8dd5a6930ecb92 +3e94e6706fcdccfa +3ea8d9787998f70a +3eac186b3e7badb2 +3eac742acbd69adf +3eb185c04280412d +3eb718c3170fcd8e +3eb795302924e912 +3ed3ffd0ae9c3224 +3edff71624eac3ee +3ee30754edfbdb3f 
+3eef492bf5120757 +3efda95897eb23d1 +3f0d9e856d93b8b3 +3f1c5b36d217d345 +3f265c5edb13f00d +3f2bf7371b72e40f +3f33ed2971149ea0 +3f45b8234504020c +3f4f553239e96d90 +3f5454f2f53e2103 +3f68a1e365e94eb4 +3f6e7ee98174056b +3f79dc32d575bcdc +3f7b6f511421e395 +3f89e23583c36441 +3f8d1edf59e70df3 +3f9b08ed34ec795a +3faac9603907b329 +3fb3327a177a0175 +3fc266558ec5c07b +3fc2c221557a205c +3fcf6c1b81b14af5 +3fd084afa49b6499 +3fdaa028b8baad4a +3fe382b2ae6c9361 +3fe783b9c7c8f492 +401a94bd9d84f501 +401e10a4352fba1c +40284c1baec06ac9 +403951b5d632b5ab +404043fe2f398440 +4043989a4ae95a01 +4054d32655ba5eda +406bdec5b68b1a71 +407eefe8017f6070 +4089ef1b1bdb1d36 +40904cd4b9e0579d +4091c41c6909da3b +40954e72e02dc771 +40c517d28a412a5a +40ca76de44a6e1a9 +40f6d540b9b16531 +40f92f1e65a5e1dd +41016527728cae5b +4118895a33890c5a +411c4dd047c49cde +41210bec1c0c87e8 +413062cf685711e7 +4130aefaca885090 +414e2bf42ee45cc4 +4161944d7d592071 +41649e3e8f9a4be0 +4175cb4c71c984ec +418ad7b9e78208cb +41936ce6152fee64 +41a3a167ea5d9e88 +41a55418bee59b11 +41aa58e688a04336 +41abd737e0228c1a +41be40880094d8d1 +41c2ed4944dec77f +41c600ed9f88871b +41d8b4350913ca64 +41e428b3c7a16695 +41f3291d82fc4d93 +41f438dd19aae981 +41fafa6144b58c39 +4200282fe9b4015a +420bdc53a6928b32 +4213b6b3b673f9b5 +4221bc1d4aea1a02 +4227369e7d0e735a +422d976591ab629e +4242fb49c775710c +424397db4b1cf634 +4246a11f0971a231 +424f597efdad3067 +42565e9d863220ae +4260ca20e2430c67 +4263257ec6099434 +42742db2633d2eb5 +427c035484d45682 +42b086af2a1e5d98 +42b218cc2f794026 +42b88f7ee71a7ba9 +42c2c85060ab5233 +42c75d578535b0fe +42cab73e14195475 +42cc82972397863b +42d8b53a15001cd5 +42f700b22cb0be39 +42f761f7e655bdce +4308efab35deb3ec +430c6d1f8676fabc +430d7b5b77861810 +430d8fece8e0f7e5 +4317016b336431be +431795f999dc215f +4319ddbd5e8f20e1 +431e6542fde13130 +432a9cfcf53ef717 +432fb354aa710e62 +43361dbc0c5a2808 +433742b23712bf06 +436a235ed74c3d89 +43759ced44693671 +4393f3a15ed6fb9a +4393f3c42606c573 +43c329f7c0b40258 +43db35d743e6be54 +43db8c6515021c01 
+43fee307c6339b5e +44095e87bee5475e +440b5d1587251680 +44138776bdbfe28c +4422b38e60e3bc2f +442ad5ba8e834889 +443b1691d94c1b3a +443e5a7e679e3e94 +444d7d8445cd444e +446626a2bd617d24 +446f557155994097 +449c34eaea295942 +44a85c75cf4a6da8 +44ab295bc3092c28 +44accffce93c7e87 +44adc8d00568380f +44b13221c50914d9 +44b78f9fcb5cd8d8 +44be029ec85609c5 +44c16554a21aa6af +44d12349e0609ba3 +44d2532c5b5296a1 +44f9aac9faaac569 +4500d9faefff3a41 +45064c8142f3a360 +450cf402f042bfd2 +45108618c40e26a7 +45122648522d4180 +45166f266dd609a3 +4526c5ac1bfebcfb +452e9c4e4729ddff +452fff658953aab1 +453c980210335f26 +454197dc5b50b45f +454fc1e32db7cc41 +45536907ffef7585 +45592a7f307bccd0 +455964aa4ead1e2f +4560f57598efe5ab +457a1ed78b1ddb01 +45823117f0acb627 +4596160a24b1af1f +459a954b63f98d8a +45a00d135c5388fc +45a0fe252a89e008 +45a4515834848010 +45ac5168bda9d3e2 +45cb862034851efe +45dce690caec2917 +45e5fcd5c8978342 +45e6fa48ddd00e87 +45e81a557d2dd78b +45f2d7abb5fafaa8 +45f5a75e63afd4a2 +460455f96fa1a1d6 +46067fb6d992860e +460e2066b64b2a40 +4615277ffb68ca9d +46200541f9943d16 +4634124b21b763f6 +4636beec02aa8dce +464d63c227f26d09 +464d97e527dd5f8a +464e3851f923f8d0 +46502a6038bd288f +466150f780ad7b80 +4675ea4e00c2544d +46889bb1803c5cd7 +46a4d49d61a86d37 +46b2a13f6ab0be05 +46c9e2d86e7d4c41 +46de062e5ff787c5 +46df36a031f50a04 +46e0654ccb5d88cf +46e2ddf094d0c3a1 +46e653208e529783 +46ea97f6f3757209 +46eccb4820f5a4eb +46f840365cee9c44 +46fb9c990b6f8114 +46fdfa2a16c7c811 +47191aa41a979900 +471abe46b812be64 +472e2674ece00632 +473e6ec61583d90f +474afb2d4641a228 +474d403238a41315 +47573da5cb0e5e44 +4766f2062abaaf74 +4773f5327489d57a +477df7ad0c2e7fdb +47a07f51fd3fef77 +47a1f1f01e2b7be6 +47a66fa042406908 +47a76ca10546fe8f +47b5d62899ea4869 +47c88dcfb1134255 +47cae76cd53c752a +47d11d4bee6608e0 +47d9493675e58f3b +4803cf5deca2b38a +4811a66f87b0dd6a +4828fb60e4a871ee +482c3e92080f18c5 +482ce5c63038e5b4 +4833b3d2a8184313 +483c2b4c67e32c19 +485f996ecf360da7 +48614bc62c3acbf8 +48638883e537ccda +486970d685c0b746 
+4879ebbcfd888f5b +487d83675a8d1574 +4881a65d7476d6dd +489254d4b26a04c6 +489f9441d513634e +48a3049cabb54c0d +48aff7218b00d843 +48b1808c546c7e87 +48bb743178166598 +48d4444b94c2a2c0 +48e49bdd1aa706e1 +48f9cd996f80c34d +4905bc8817511dd2 +4908fab97c9bcec7 +49235d402cbb8895 +4938177a0e6e2fbe +493bb055f33cb256 +4949361d0831c838 +494f87170e713843 +4953a5140e2f439b +4955f54f807ef5aa +495f50f0997e986e +497364635884d8aa +497d2450ed65a678 +497f507a5901bf4b +498688760312447b +4989f6cc2b43d2c6 +49b8f80c849dc341 +49be5de41d619cb1 +49c758aa3c35ed86 +49c9324758b5e867 +49cc37f7f96be5a9 +49d4a7288f6b5dac +49d5b942442449b4 +49e48d66787ecb8f +49ec7608e51f7ee2 +49f6f14c580b71b9 +4a046d13e389b505 +4a177be7db12edcd +4a1920283e3087de +4a1b9fb940541809 +4a1d79baac733df7 +4a2d6753676df096 +4a566b7e6eeaf9b4 +4a582ee23dd05a8d +4a6a057fc644624e +4a736d7c30ae9280 +4a763e1b87e495a7 +4a7e531e1a35d424 +4a7f0556fb58a5cc +4a8f9d6889992fd9 +4aa1973c40d2eb93 +4aa594c0ad661f28 +4ac044dcaa428723 +4acd145b0c133dca +4ace59951acbae4e +4ade6d5fe4b32738 +4ae4456267802484 +4b0fdb10ae15684b +4b341307a872487c +4b3644bfc6083588 +4b41d03353967b40 +4b457a008376cd73 +4b461c1ec52a3076 +4b4c0c27204604a3 +4b5619958277861f +4b5a6dd314bebe88 +4b6e9a02975ef9a4 +4b7071b34e8cc67d +4b7e7de9132f4149 +4b85062505816744 +4b86587ecd3325f4 +4ba7caa04cea37b7 +4bae8b3980bf32d0 +4bc1c3a888a8bfba +4bc203e17758f3a0 +4bc47dc7f8781812 +4bc7ca44cc62b8b1 +4bd922d1e75cc936 +4bdb70500b99c91f +4bdfa30358809038 +4be8bf31940bd475 +4befac16ffdf8489 +4bf9ef4705f35e8f +4c0ef61c55467706 +4c144eb40a09a0dd +4c169a41e66b6599 +4c2383e60aaf26cc +4c2d32a7f2b62657 +4c2ed13774ae4613 +4c4fa41951e37e78 +4c502551adddea8b +4c5fd496905b91ce +4c69bf407b142b93 +4c76898a3d535741 +4c791225522d45ba +4c8fd5318ae8d467 +4c9030a5917a1328 +4c943ac66f6c277e +4c96e034f8af77d6 +4ca952ede2af6578 +4cb669bd62a4ffb8 +4cbb82a6bab25a0f +4cc48509585e4157 +4cc4c8a8cfa8e944 +4cda491521679291 +4ce58504b055463e +4ce642bc93f1bb5a +4cf74ffa5bfd5904 \ No newline at end of file diff 
--git a/evaluation/preprocess/re10k_test_1800.txt b/evaluation/preprocess/re10k_test_1800.txt new file mode 100644 index 00000000..c8446e9e --- /dev/null +++ b/evaluation/preprocess/re10k_test_1800.txt @@ -0,0 +1,1832 @@ +1839244b04a05e5a +2da38ca64192354f +4308efab35deb3ec +06db5bb2465ae58e +3f265c5edb13f00d +37d4e43b2b029a80 +30127e00a789ed7c +1db274e904e3fb07 +1d46e25b06eef337 +0f12b97e0e4c7e21 +007876f71baf453f +0f59c103684c0437 +133ee6e537353604 +2dd67f5e68c8d72b +4175cb4c71c984ec +0ede2c8fbe52c1d3 +13211c9e31fa4b14 +21a23c81331b0027 +2dc3af70d25d3043 +095fb57435b7d890 +07ba5489b56b7d62 +2b4c1f50687b2bcf +0954f5a326941fb1 +02b59cd60efb924e +1840ae9e2494443e +3b3880eb01373479 +16ef89980e2ceef7 +31b8eb8bbebed9b8 +08d5cde674e47324 +2cc8ef9e5319d5d7 +46de062e5ff787c5 +0f5bb0704084e290 +3176f1532a468cbd +03906f66d3bca71a +18502a6651367e71 +31262e902165f348 +19694d2dc528d75d +3824335ebd7a4097 +0807e84457d5ef58 +138bb7b0b25e4669 +4091c41c6909da3b +068140e09ae5ae8f +4675ea4e00c2544d +10fbe4690dec6258 +1da33647873725fd +44c16554a21aa6af +2e554e99d045b484 +4b86587ecd3325f4 +4757ffa4f1da98cb +365fc12b4f33ada3 +36977643258aa392 +1593596b99e2dde9 +2c16104a0ed6c8aa +1526707312c94a92 +23beb4b246e236fb +4b41d03353967b40 +318cc6a39c0acc71 +2ab72e30a616dd21 +10c4c9600bfaa4d8 +2347b1e3e70842ac +1513c8f030f4cbe2 +17a489f3cec39fea +4cb669bd62a4ffb8 +03b3f603a1001de0 +04fe4ec70781a0e8 +4a0368a338dbde67 +1b881261742799f7 +0e5b9dcdb891b82b +0a45b99f42fb0ecb +444d7d8445cd444e +0ab14ffa7e541b0b +3c31d7b9f2792ed9 +3776e900791c1553 +0e48b9ee438238f1 +03e756bff92d49dd +2cf32e2408107ea7 +20e7a3651ec30386 +41aa58e688a04336 +1b88fb5063cd8916 +4ce58504b055463e +0e3951bf1db22064 +16cd4f1cb2a467c1 +455964aa4ead1e2f +3d6ed8b43655929b +1fb651cd12893f99 +4a736d7c30ae9280 +33a93f85d5713a71 +09265a9e57075e7c +45e81a557d2dd78b +15ffcb1f98d41218 +3c84329b60bfa7cb +05ce34e3cd48c449 +1c840c855f0c8421 +3a10eb9788bcdfa1 +403951b5d632b5ab +0a78dcb828c506f1 +36e94b0ad3a62c7d +25ad11c04b852de5 
+4636beec02aa8dce +2e64a2d17f9a76f7 +2af206730de6f439 +0cd63c88350eef60 +4c502551adddea8b +0d5a4dcdf8ec9d36 +3115672ced7c5694 +1bc87f52eba89cc4 +1564900dd040c718 +3b1f9cedfc40b06c +30bd5b88e47f6d0e +4b3644bfc6083588 +1efbd8f8949b15e8 +2a3c7ba09ed503d5 +31a642fe4cd1e232 +040a26b288e7bda4 +30b1d229ad4c6353 +4260ca20e2430c67 +145708c0216a06a7 +07559b44fa10672c +3a0328ed13dd8c8b +1f4279d98e283206 +3edff71624eac3ee +428fc13fba69054c +0c6b149da098b121 +0b211a1457076450 +4043989a4ae95a01 +2d16da80d7e3b64b +454fc1e32db7cc41 +0286d2ed56e8f107 +3f6c97f1ac96dada +33a5a85f06fcc77b +0387ef3895b1393c +08291107fc9e9849 +1fd5f9af785e6e5c +37d09bda74c92a93 +330b925cef643b3f +0c4c5d5f751aabf5 +00ae21ab50209282 +2c6e76b362eed8e4 +45108618c40e26a7 +082087c82daa295d +407eefe8017f6070 +487e2f9d93c162c9 +460455f96fa1a1d6 +2ac7eecd3cd0252f +47c88dcfb1134255 +146ef4db9655fd67 +333c649f75c3c7bc +17a39d87a22ac1ec +0712476a67734dee +2e715d4c5bf6c45e +240e89ef33ff15b6 +3d8d29bf0d9f24a4 +37d0f351f07ee925 +1225476a1221ce08 +2beffa088960f673 +16b667d681f8cf25 +05dec89f80cabf23 +232a0b3133326242 +4c69bf407b142b93 +1fcf851e236dec35 +2a1fed061b29b25b +40c517d28a412a5a +29f52c76f269ae48 +06954737f53b8688 +24fea6c2c7caa434 +0c1012a308ee2788 +3dfaa97cd48a0332 +1837b8eebc9c2457 +0d01d4d6c5d5297e +37a6b3200493fac0 +2dd86d1f9e2d3474 +3e1af0b953407ef7 +452fff658953aab1 +0a5e107e1961d01d +41c600ed9f88871b +36f4df3e0a1ade5f +1be5a5c98a51b1cc +08c87b4b6b23895f +289ce0f2b82dcd0e +196069a792ebbf31 +1e969786d2a8c7a2 +047c29e9138af233 +464d63c227f26d09 +08138c1a3ba1ce8d +1f084607245e4462 +29927d9ef0b472d7 +0a6680fe6e8e09d7 +2bd7cee1fa9c8996 +154072be49bb3c1d +043c48135c5e8cc2 +452e9c4e4729ddff +0c8438d86bb28f7d +2e4cac06a4f92261 +1fe394077e7c3de0 +46200541f9943d16 +3faac9603907b329 +0e728af85650dcb3 +3c44f7a30e0ad967 +0be9a0dcbfe032f1 +0eadbf8806794990 +07225d96742d2a6c +47573da5cb0e5e44 +4c4fa41951e37e78 +42cab73e14195475 +2e261d7661282e40 +02b2358ff02d3ce8 +2e86767798c005df +4aa594c0ad661f28 +3bad929f21fc4336 
+10ea3faaa29f4a88 +01497290d8b93a9b +395f5ae56e94e344 +4a582ee23dd05a8d +49d5b942442449b4 +3e02762a89de4c7d +4c791225522d45ba +252a24d25a1ea81d +07479835711d6f8d +0043978734eec081 +374fa34fe701a30d +4a1d79baac733df7 +0e2653d00e3fc05a +0c356641df7c72b8 +0b564b685315d8ff +04e4c841b349bf5c +3dcdffe3b9c6235b +2ed398e8368e0c6f +0a4cf8d9b81b4c6e +0ff193c92d415b18 +4526c5ac1bfebcfb +018f7907401f2fef +46502a6038bd288f +210fc445b9e254f5 +11ee8f1e1bdf1c2c +4a7e531e1a35d424 +16a75920e79c3710 +391bc7d21641283e +362be988d0b68a4c +223c9627b978d127 +1f925fcf391591df +47cae76cd53c752a +3e5f747d06bc84a3 +0bc9fd5c8e50d0ee +16b3eea75ad753ef +01e18dbbf22ff263 +00a54225c5cb1913 +365cc620c2fcbb05 +44be029ec85609c5 +2cc330488326fd4f +06058474f164c53d +2bff9ec89ca982c9 +0277b87a9c943ed5 +1ca02bf1c0b65675 +44d2532c5b5296a1 +18a86c01aaaaaa8b +29a09527214b3dc5 +1c36b2b8144d29ec +1c742e548e8698c9 +430d7b5b77861810 +2e04dfa4a1671292 +3d2fdcb64b0352ff +4aa1973c40d2eb93 +0b1b293ffb0e2f51 +02b618a34bc12ff9 +086e9118bec887be +1c29b1d8fd1dab3f +2ca8e34592e0c415 +1623f47d7e74e848 +4c447e587919525c +2a3baeaa72b86812 +2b41e71d509a8b8f +462f64859631a099 +2c9b5e69fe7f0338 +4c144eb40a09a0dd +0ece034988793847 +2bec33eeeab0bb9d +0f4f779411b45b6f +0404d32e97ec1cdb +4a7f0556fb58a5cc +39f444469cf39006 +1fcfd8a36e171639 +1f079ee70c21002d +24895a02057db66e +2228cda919976437 +14f0b962fabfaae2 +2b625e92f2cf9de4 +3094afb27266ee6a +3087828bc27bc4c7 +3e4057a188e15ac3 +17519e763b34fe14 +220a9caffae81adc +0efeb5654da456c6 +3953f37661087a95 +46067fb6d992860e +0de78cb98105f8c2 +0e00a382b62667c0 +39c662fa32a3b5c6 +4319ddbd5e8f20e1 +0555b07fe6239b4a +20fba1d53c349851 +236f9dc6456cf32c +283334520a3f8a43 +477a0a9f77c00480 +0869b66f912b845d +293e02c7c1fa31a8 +1778e784d47e035e +2a39e3b6061dd887 +2ebedb0f027df101 +3ced9d0b56769bb7 +33e23b97daf5d9f0 +2cecdd7df86ff8d3 +3fb3327a177a0175 +08f82d3899d6b726 +35d5a242ba40f31f +498688760312447b +373ab0a1009e0316 +2c5e21f9f91e2e09 +2e06abf6286040e2 +0bd819cb30a432c6 +12d4ca1236a2cf26 
+474afb2d4641a228 +4bae8b3980bf32d0 +0ecf489d873b7f52 +2b4e1061f6415a4a +00beb03ef95dc637 +0ef68550315f57c7 +3e8363be673dafa2 +36707b9a2d7c344a +32615afea87b52dd +46df36a031f50a04 +379884ce61c4daa7 +1e1e13de4ebea05a +2341162bce213f2e +0c824455996db331 +2e7ffcba51990c93 +1ebb496a04a1bd76 +0425df3e42ba0de3 +179ff8424ec7ad13 +464e3851f923f8d0 +0f64c5e4fead6cf2 +13c4059da4e56a8c +283059a56e7f3e75 +0a3b5fb184936a83 +31607cc68ada0108 +332294213fa15c56 +0445459f7afb0f48 +126067199873816a +11e62395c85c250d +472e2674ece00632 +2be655d4137e6e29 +00cf0a94235771bb +28190e57702bcfbc +15b93cbe9fc5220d +2a41503583d146ed +460e2066b64b2a40 +3fd084afa49b6499 +2b1a013698fea3a1 +025192166c704a39 +10e931268a81f228 +08868143749f321b +0c8c4363e0dca250 +01fa6190cd47d125 +05c57211be152630 +0e2f2538b26a179c +1d2cab92bdcc1453 +41fafa6144b58c39 +2ff40df261e17697 +457a1ed78b1ddb01 +0b81411e1b5ec798 +15f8a54e4822f355 +1203cc23b881ab8d +1648b3e7471e3766 +3fc2c221557a205c +31ee8cabc96b9a62 +14417ac810f2024f +0ba1cadcb191dc0a +0972074fece891f2 +07842cc567e9beec +173a82eeea56aed3 +1f73124222492f1c +1eab4db6941be725 +166818269e4e2568 +0d5112a7eb22d61a +3a52947c66de5920 +0baa633d2094d2c1 +000db54a47bd43fe +24668d960406587f +04f5153fc5255516 +05a0ad1e2aa632e7 +0b4d5beb7d3bd867 +00a5a2af678f37d5 +0708389923510354 +14bfd05497764243 +453c980210335f26 +193c3bd339eb0a75 +1d704b9365e9c86b +0f540553fc30f16d +0ff896ed26db5da3 +0e060f89ae0a469a +3c9c37132583a3d2 +02ee66b3efbf3b0a +3fcf6c1b81b14af5 +008cd8c450342e49 +3af43fe8d514f7c5 +24f06a46ea08c03c +132bbf5a9e9626ce +30b1dffa5f783ecb +02f801e372d67cfd +2a89b2a52cee9f5f +3f33ed2971149ea0 +45ac5168bda9d3e2 +27f3ccfb3199499e +12497730f691d00c +18a3593609eb3269 +25aa5f50072ce7f4 +0ebaed7e3d044bc2 +3c85dbda51f7e9b3 +219825f542e6ee4b +4560f57598efe5ab +227c21a8dd87a153 +33a3fc21efdc8547 +1fcc400e42725a95 +29c8267c1d10b23e +004dd4b46a06e5be +32db375ab51d77a4 +32a2c04fd8321bbb +1db5a4df1ab8b8e7 +3d4645318868a4f3 +44f9aac9faaac569 +371c9182ffd46ce3 +13fcc228c40a0e67 
+3f11a54c6d0703a0 +1c144e2404c5da89 +2482c4388b32f225 +1c9772d765e0679e +218ba0fa826d3eea +3cf461bc6d626ed0 +2e109379f53bb221 +43c329f7c0b40258 +1beb8e6662d36ee6 +388cb2f0ff1a6cad +4a2d6753676df096 +22d6e3fefb1ee7fe +46fb9c990b6f8114 +2cd1705407546b72 +05c48ff6535fbf55 +095441304a817fe9 +10ac0ae67d317d11 +329aba411f341398 +072a60bc7e0b0dfd +31a087ee5b1976da +446f557155994097 +2f3ec1f2335489d2 +1cd3638cceebed08 +3c5163ede747b187 +06f4bfa5f9d5fe0e +3b8167415736169c +30029fac7c5621de +2a3bd0a2ac422822 +0f61837b9749da34 +406bdec5b68b1a71 +3a86a812a1eaa20e +41be40880094d8d1 +2b1e0225f0952a09 +06e499374ddafbff +0c11dbe781b1c11c +232aee1c62a1cd8b +414e2bf42ee45cc4 +3260a42ccfba8973 +24ac24abf3057732 +4938177a0e6e2fbe +45e6fa48ddd00e87 +38615248b52e2834 +17535adda2aa3b90 +0e714d042fa59506 +0ecde93bfa1f08d9 +41649e3e8f9a4be0 +49e48d66787ecb8f +3b9e04113b202116 +1edc6b95e84127b6 +378cb83947bf2d23 +3b84ac07fd85bb3f +171b3a4c2f95f981 +316035dd285c5e27 +38f7ba7fd9a83069 +46eccb4820f5a4eb +3efda95897eb23d1 +233d7cca6c4c628b +114d9c301b847239 +05330a153a103386 +0c52996355b23d76 +225c5f2cdcd2753c +1047cc04fa16e0d7 +1f1a76ed6db1dae5 +497364635884d8aa +2fe5274c4baf665f +0cda9adbbedd7948 +1be80ff36848e758 +3de41ace235a3a13 +03a78406de1d0993 +2b43428fa1cf1a7e +3d5114f5d7496cdd +15d4a976e4e7d3dd +3232f7457b27dbbc +375a7fbf80d09c92 +05b77cb7c0f79f0f +375f9c448cf31ccb +43759ced44693671 +41f3291d82fc4d93 +00620c2b77518524 +2d524e9324228d6e +17428a1f23edf411 +2f4bcd593fe37158 +107b78daf075d371 +39b3e7b2a8bf30a6 +16a2c55f96e6aa18 +2cc5f95fbe24ffe5 +382a5736d9134153 +0f18fb6736efb1c2 +4596160a24b1af1f +14e540cf0ff7ff91 +01eca393f86d37c5 +3f2bf7371b72e40f +0c4a239e265ae1c3 +3f1c5b36d217d345 +2b3e3c5d30c17bd9 +107d3d674fdebd31 +027c8c3fc3e7d056 +11a680776863b321 +0a6c499522efa0d5 +4b85062505816744 +2f98ee24d3fc43a2 +390ddb7ee9b716ca +45dce690caec2917 +29f0d7c051d80035 +459a954b63f98d8a +377769942e6a748d +18ce480be0ececbd +08b8b63abbec8780 +4c0ef61c55467706 +4161944d7d592071 +04b580ea1f4df0a5 
+424f597efdad3067 +2c55c5a96b50ab36 +45166f266dd609a3 +3ce8f87fcfc988a8 +32d28b7513f873be +01aaf4ebb084dc16 +3a88be7c404596ad +1ec6fa3de6fa1bf0 +36d85599a9cbb6b3 +004334c94bbc8bd5 +0d08611c8b251e15 +2b4a934049f932d0 +01866b81c3b90f2c +24fe21f0a899701f +39b58270c2e99310 +21005252fe2383ba +03482c3bd66de195 +0dd9e020b6d9d687 +021575237abe0684 +17a75f0b036c9cf8 +3d394fbabe0e733d +0cbbc98eec80360a +0871e5f582cd933c +2e96b2142fd337dd +431e6542fde13130 +3cfb6cb5052ce744 +4393f3c42606c573 +1227d00562c106e2 +35b994780d720894 +2b0b2259a7216762 +3ab70559ec30a57c +36afe96c11a8211c +0223924f43297881 +145da324f69d1c6b +04db26572a791881 +02e6fb86b0172f0b +058c67085c217b96 +48638883e537ccda +3a9fa6535917a07a +3af70052616f7fa1 +0bcde26e5a802638 +2c2cfc0ac780a3aa +19ec130ecea98d5e +1c7b9f93752085f4 +1fe9f9bc178a1778 +1cca8650a292e7b0 +20d1b02740ef1124 +3b1c57027302837f +4c943ac66f6c277e +0553c19e8933374a +2e35fc35559543f2 +484cfd0b6334c43a +4b341307a872487c +1f0e06e4388dd600 +4bdfa30358809038 +42565e9d863220ae +203c8a4d66c74338 +23df266716914368 +10f9d6f46e438d36 +2fffc623e6a34e23 +23808c0cfcc72e72 +0d0a99d7f22aab71 +10eebcbb9021f437 +01842c6b21e1d679 +11d5f4e7b0b17565 +247bc2e47eb7f6fa +46e653208e529783 +0b194567eff966f5 +1de1b73fe4d6aa77 +45a00d135c5388fc +3ed3ffd0ae9c3224 +2060cce4dad6f988 +040a7af97273204f +4bc1c3a888a8bfba +1bf668db0194cf83 +1bf4fe9301893904 +2d3e1349898addb6 +22085848f943c2c6 +2c5249093fc26fde +3858fc4475d10c78 +388b75bd17c5332e +203b7543bb3387ce +21b9d476c5a49c63 +4227369e7d0e735a +39074bca3524418b +07e8ffa32746c7ce +20b38e0a985506ba +12b7562944c06836 +4c169a41e66b6599 +3c44d53659dbe4fe +471abe46b812be64 +195ca8350ff27f6f +0db2394602b8b81c +44138776bdbfe28c +4ba7caa04cea37b7 +372f324a1f4d6898 +361d722ef5009e09 +41c2ed4944dec77f +12a70416c92a9483 +1b87f55dec310243 +15d8b7e256ffd066 +241aa9bcdbdc7ac6 +036f135766f38f78 +17dba5fa8138ed92 +21f88a42bf424000 +0a8f10a9a68236f7 +2d60837ef2e52abd +33c37aeeda88c3bf +0a0027a48d9ff2ae +21e97eb0cfbff775 +32eba3e4cfb61f93 
+118f563fe2ed4998 +0095ddd83beb3b8d +49b8f80c849dc341 +088e115752ce9e56 +43db35d743e6be54 +3d2486ac8822da47 +000c3ab189999a83 +037e8191b3985142 +41d8b4350913ca64 +0368abd976e8d82e +08718fb99eaafea7 +04422a07336e32da +049a98f70ce9f471 +2f7f2369486cc959 +128bf83073de3ba1 +0326e5c562bdf1b5 +104c9a27980f9bb7 +24548ce6c15bc2cf +039b153af4fbfba7 +20384754a6e5d1b0 +430c6d1f8676fabc +1e80fd6e7507e3be +49d4a7288f6b5dac +0d7f00ff38b135f7 +024908906fadb408 +25468a86d9cd851e +45a4515834848010 +0afdc571e4667a44 +11eb4e9eec5048f2 +308681a294d1417b +27daccf898b206de +3fe382b2ae6c9361 +368fa2dd830843c7 +23e428c0dc43f046 +483c2b4c67e32c19 +47191aa41a979900 +15ca8e1fce488c19 +0718f733a326d65f +38d44ebf460ac132 +024152256b6bcac7 +4634124b21b763f6 +0b970b3417969c89 +436a235ed74c3d89 +3f0d9e856d93b8b3 +004e9db3337e8206 +397bbed49e1ee8dc +4c9030a5917a1328 +40f6d540b9b16531 +3628ec0337eae7be +24d97ac8e96e7a5e +12f45658983d380d +08e076c11a67b54b +4881a65d7476d6dd +2ad50852a84faf51 +30abedc6c413510e +432a9cfcf53ef717 +3d16e4b4719bc256 +31838e9542a906be +18c6473be3bd827a +1ef9f5dfa615fafe +396065ee739ea046 +2b38cc883c900d33 +11337164b772b7c9 +0362399a61c18ad5 +3a488ff3afa463e2 +11357e0934f26aed +2deacec5c281fbac +29832cbdb4144601 +05ddd2fee689399c +4b461c1ec52a3076 +1424acd0007d40b5 +16b8ab24bd231f9a +2313ea0fb17cbed6 +31c79c843555c2c6 +45f5a75e63afd4a2 +30efdcef9f38568b +32dc25ef78b564a1 +02c0c9192fb9a6a1 +3b676f25b54dcc1c +1d8017cad8dc1d56 +420bdc53a6928b32 +2c80f9eb0d3b2bb4 +214bae1626cba843 +304dd8f38dbeef0f +0af60a9ffd747a1c +3f89e23583c36441 +06a8196a66e125af +0aa8646901d156e4 +0a72a3fd46a88ef6 +12e6ba92e82c7ca4 +397037082e6eb839 +23341c3a0b420e54 +08c9e7365f0707a4 +13ac6a6a3a4f5e5d +3eb718c3170fcd8e +2d0bd035f7df86b0 +186f18684ed4b516 +0bcef9ed1c18f74d +30140756550dc38e +0b3674ffb90b641a +0e512d350465a63c +3970859f54703c88 +3e3d858083d20eab +3d0a0fecfbdada35 +2ea3133861ebde3b +3f4f553239e96d90 +1150003196de2529 +493bb055f33cb256 +1072aae07584e091 +066ac822934d52d6 +3317c40fd3e0a7b7 
+36fb4d41b00581c4 +2eb515bff528d3c3 +2d7d7fb53d960909 +4a1b9fb940541809 +0de8a88480533be6 +38b7a1d23745fbf3 +0871e2540b0a6804 +06d8995be6aa4db6 +3b434b5302dea908 +0f97153fcaafc80d +28f59b68509ce59d +03c61595d13e121c +3185302b2275b009 +3e94e6706fcdccfa +2d6f9fa00dcee664 +214c597029aebf9f +2b23144adffe2e49 +4bd922d1e75cc936 +38a9a0f5e76103d2 +0fbe6d76015f75d4 +0190fe72a727c853 +33e5bb3820c171a5 +2cb589c80f31524f +36bd5d3f3ee292cc +2fbb94b8cf388ba7 +03cf40616d79cb6a +227d63ed9a678fb1 +0d4de33c6888a754 +439762b81d4b908c +1eb03f0e3088edf0 +164e33fbbcb5d223 +0d06be83296cf911 +168e0f4071d2fe58 +06c7c747b4542273 +0588138dfec165a1 +242fc4972c7bb385 +2552fd444d04ef21 +226646771975e6db +09754b77eeea6dae +03bcb03930ff1ace +213286ef58a8c73d +171403db6cb88926 +4393f3a15ed6fb9a +29791bf60e718c6b +31d903660e1647c5 +4bc203e17758f3a0 +1bf49251fdd23cc0 +39c41be5e76c79c1 +44ab295bc3092c28 +3a6dc09185951ae3 +1859766466c069b1 +11dc36548bbd85da +2075a2388413899d +4989f6cc2b43d2c6 +02b406d1e5e31d5c +4c2383e60aaf26cc +1f0f5e82e9d0f9ee +4118895a33890c5a +33288d55dde83e72 +05ef56b2656c9318 +055e4612c1ea70f9 +07b667b34838336d +308de3d523189c72 +3e8dd5a6930ecb92 +20d86cff490c0c42 +37c99410741fbca5 +2e7b5f8836ab642c +3fc266558ec5c07b +002ae53df0e0afe2 +2236eab1c5c86fc4 +36c5e00f55c4f217 +1402fce28722610b +14cdf4aa7a2de14b +0beae06611ead92b +2e2ad99d45033d6a +0da6a36b24eaf5db +139055b26734436f +43fee307c6339b5e +424980541ccdb10d +28b23ec38ac5c0b1 +2c04a38ae16197b2 +44d12349e0609ba3 +3115ce06e0160828 +098263de57257005 +202a627de66ad397 +28b06f7087798198 +07d1d5769e8d797c +220d718317f7a025 +0d1aa0f47c9d2f6d +3105d330651c2726 +379470c7d22c498a +47a66fa042406908 +12691b0622a823ba +4879ebbcfd888f5b +1e847ebd7cd1174e +1f56ccfabbfce568 +0e41af1514f92887 +28742766eb882cd2 +1d9254a5cb93d4ff +17fc81293f337cc3 +246579087204ba8f +0c2b3463c27c5ac3 +15ac594106229c62 +0d0f4080d36dfc68 +06eba57d1c333a3c +069597e1fe899530 +401e10a4352fba1c +020a41f988981396 +16c4484a4093e2f6 +0ab163a1b88f1128 +23a6c9168abdb38e 
+4766f2062abaaf74 +15aba05919bae167 +063b857e6470addb +23099812f662b3ec +00cfc0ecd345deb4 +03e141d7afac53e4 +0c9c387ae23d090a +1d1cafa3e27da040 +1ec7e7dce1175aee +36cb04da872d3bce +4bdb70500b99c91f +2b8f367d01df3601 +20e9bf845c4bd9e7 +12dce44829d88985 +3e577a3be646152b +37ff2186f55b3fd8 +2f5af4b429b2c992 +2ffe00ad70fe9c00 +0e1a7abc82b1afb2 +35eeb3ce1b3dd01d +3289ad7a811d2348 +22b86e38854ad186 +41a55418bee59b11 +0c0f298ace7c875b +0ed4c9cac4a615fd +200ad247448c5577 +195074aeac8bbd76 +443e5a7e679e3e94 +2d6d5e82bda0611c +485f996ecf360da7 +0d20062086f6d05c +443b1691d94c1b3a +2ce75a0f430e2387 +2c596a5abfd67267 +3f9b08ed34ec795a +4213b6b3b673f9b5 +0992802044bf665d +450cf402f042bfd2 +00969c45a093d43c +1d4d81f8629b119e +413062cf685711e7 +206d9828d717139e +19a28e5c25feb31f +00d83c48cb78ec83 +156b422215789c18 +1dd869f66c3c9497 +2e470f6e2c83566a +37dbb9846f2fdc01 +3f7b6f511421e395 +4be8bf31940bd475 +129b5e9b80cc4b4b +37a6ae1e1c6eff66 +3d986ec2fd6d210d +0b79ada01eb45be9 +4200282fe9b4015a +385f9444d20eb160 +4a653d46e89d4202 +20171db88f887218 +49be5de41d619cb1 +072b4bb46d80484e +2868a1b43e9eceff +322d03d487fc0f01 +2afb8a0a98e15155 +32294ad73efca3db +4cc4c8a8cfa8e944 +4773f5327489d57a +14f477e7d5af5b91 +233dda1ab6796b0a +2c02607ae436a9fb +12e3d03a933c2eb4 +1ce68f950e7cdf8c +45536907ffef7585 +2a695d52faf1949e +3cc40f129447cb31 +296c87d370b03f17 +45f2d7abb5fafaa8 +296241182c2df900 +1025622b7f308760 +0b07051f912592d7 +2111cd087a82344b +4a6a057fc644624e +3dd211f3865fc234 +389f65e97bd902b2 +3069cc190d78e55e +33ac471b97ddf5c5 +4803cf5deca2b38a +2837dd5c75e026f5 +1e2a2be2df033527 +44b13221c50914d9 +1e0f97ec8f5aa374 +12a5cf6bbe330edb +179a1357a581ad51 +3a79b9aefafb0b8d +06dcbfe7cd79bb66 +01be77405b16df11 +2d0e6766c725becc +071e8c0978097efd +29e0bfbad00f0d5e +1c5514d49d61bafc +33a3f65849195eba +0d76da0fcac26af8 +1ed5e32330ec25e8 +28f5ebf3c3e2fe54 +0b1e61c69c98026b +00ca5123d8ff6f83 +1d748783383ad977 +0bb99505a71035cc +103777494841b376 +291bc22350620114 +440b5d1587251680 +146581180e89666d 
+22e6c736f2f7227b +4b6e9a02975ef9a4 +24bcb936908f3a31 +3cb1489b614e5f39 +3eac186b3e7badb2 +36bc6918e9fc837c +0ac6adb37a92f549 +3b6d8db52c54b174 +2c48ab563b92a1d5 +03f551fc4abedc08 +3349f3089ea84d6b +4a1920283e3087de +298c394f21c62ae6 +30fceb3b40ec062f +395caf9235fde098 +3bb70a92a0d384e1 +09b92d14a157d130 +0463d74358aca878 +47b5d62899ea4869 +32991f419c96ea0e +46ea97f6f3757209 +41a3a167ea5d9e88 +4befac16ffdf8489 +09d860b12f6604cb +37697c41773d597a +01a2277ee817b310 +4b7071b34e8cc67d +2a7387e017c241a4 +3ac32347c3ff7d38 +433742b23712bf06 +12293b264f68673d +24b4e8ff5a9a6439 +1695a74d194b65c0 +1bce163cad1e1d20 +154365ab5a4e067b +2a1769dddc1dbf8d +0f68374b76390082 +3e746126204810a4 +35b15e97674edec3 +20de87f0b3f2d136 +297b57a9296052ce +3021337b3fbdb2f3 +1eca36ec55b88fe4 +1b74274269c75c8d +11eb02d24a3241a9 +0915a60e1ae6a826 +10c8e54590f715f7 +0deb1b80eb8481c6 +0c609c435b1f7114 +0fb6678e63316201 +0c25287b812367cd +0896b4819e39caf2 +19267b6a68d2701b +10a3511b61f40243 +3f6e7ee98174056b +4c2ed13774ae4613 +411c4dd047c49cde +0e8a52a174610350 +3e6d44a66c0d7a0a +1c7b3ccd5482f834 +0f7061acbeed50dd +32dfef9109202812 +17d9303ee77c3a3d +466150f780ad7b80 +2b0cbc443e6c3c6d +4130aefaca885090 +0498c9066256055c +3d7a1ebc77f683b4 +389df03f3c2d7291 +43db8c6515021c01 +432fb354aa710e62 +1e9b1dc1c096d68d +180542b70f713d5b +2d4efa3897a4ba2d +0894f0072a8c5fd0 +21e794f71e31becb +1247b2ac5986205b +282ad05cb0113543 +2807e5ac66c140cc +0c9cc2f6d62336f1 +3c5bb5694853c36a +3a9c883b11e86530 +3e034bde9426ae9f +30fc5fc78c5a716e +11bf4b38f88bfe9b +04145b4b73b2d313 +416f82fdbad68e21 +12627e75d51b372e +0720ee62bf014834 +15d324ad8ff2dd83 +1deec169175eb15b +4953a5140e2f439b +1214f2a11a9fc1ed +09e4d5e8eef7b9c1 +171fdcc554d303d8 +3172ad0d099430da +3f79dc32d575bcdc +1defdda324307269 +418ad7b9e78208cb +1fa073504b4facaf +31afa3dcf3b737fa +0c884aee4b01366f +2e15ad61e7078fdb +3bceca99e87d64c5 +422d976591ab629e +196ffec2c68cafd5 +111356766833a7df +2cb9869cb05a9a01 +0122933cf8ab3317 +3bb1007fcf0e03ff +0516a5d959b58cb3 
+01fe225e2f261d1a +42b086af2a1e5d98 +0c788e368d993870 +33a29a351c1d9800 +061f829d3dd2e46e +2473b5003a95628c +0485a8528fa72698 +03f1781c4cc126e6 +3b7443b24830d388 +2defb4625a3ccb54 +4b7e7de9132f4149 +249fd0890d439aa9 +2c6fb46edb748fe5 +19310eb8261d4bb2 +4acd145b0c133dca +01cf55ae3e378faf +3dcef43736468b29 +0c5ed899789e60ad +3172feb32990cf09 +4500d9faefff3a41 +48f9cd996f80c34d +1e1742072c0b2d6b +16ca2db2f920c1b4 +39987911e4cf003c +04ec725465dc5329 +10002e18d04c3d93 +2c805f56d92a2e22 +22b16c2f5af0f3ef +4cda491521679291 +394037c064421c3e +2e619c31122ee40f +33f242465f51563c +18d9631f5eb45b87 +224e9686747afbb5 +3916390b35258215 +4cc48509585e4157 +00e12e215c028984 +0951fdaf9d399411 +3290731e5f908b92 +3b273cb40c55db95 +1cdfd3abfcc3a64e +094fd37f09dc318c +44b2ab5292c06a7e +33f4eeb64d0c9c1a +322c4bf5043ffd95 +10c551ef9644ea03 +392c21ee30b21459 +298276fb3c0330e5 +19b0cd79a126e8bd +12f6faddcc88bcc5 +431795f999dc215f +39e5f256790c3343 +15e0783c6b9683be +3ea8d9787998f70a +2c3a96fb820e1ab7 +131773f46d989860 +39f31c4461ede05f +3c90ad3bb72adcf8 +37ce88a12d77382b +0134d6a876481ed8 +3c19657356e9e229 +2516b6023683fc3e +2fdfa70413053b84 +16da9b4fdfe4883b +4bc7ca44cc62b8b1 +15b902774d67a394 +20100b779d28b6d5 +314c584ff3842715 +1cddaac7be8ecfa5 +1ca4db19711258a0 +01a628e2c509b823 +4422b38e60e3bc2f +40f92f1e65a5e1dd +02241c9f162966e3 +312c8e1f1f9f3594 +0e6f8d0eb4103baf +0e5e7fbe8914352c +221205ddf59c5156 +3f5454f2f53e2103 +446626a2bd617d24 +16a23081fd821e92 +374b2f4abac6dbb9 +188e6f96fa74ebe7 +4ac044dcaa428723 +404043fe2f398440 +2ef881551a7fda22 +372bb866143b6e35 +015d8a2a2834d38c +3c35b868a8ec3433 +3d5125567924e37b +23d7f14af4b7ba08 +172c99489b18e0e6 +3c64a373bc1c53bd +1cc1ff58dc89d230 +4317016b336431be +3928ea8b8c134846 +234271629f7099df +154813fc1d6820dc +449ee1308d0710b5 +0e4f56edbc3d8cd7 +286393e1e797cdad +2b4f6fdcabf53d59 +3af4c4e5a8ced21e +36e69606a7599644 +0196dedebec3dad2 +214df1c2863d2959 +05ac37966de4e7fb +3c91fa87850fd1a8 +42b218cc2f794026 +3eac742acbd69adf +2bcb26e95f5d152b 
+0e2c96cd97e73a38 +2aca85b3bdf90a09 +059058768c222bd6 +15a138312ad94718 +12dc074fab6ada73 +46f840365cee9c44 +0302fcf06bfba582 +2aa1e311e4bc039b +3b0b55657925fb34 +2864dc6c129cf3cc +20f1c3ade1608b4d +15d4131d721f1b5e +3d2f4958db5aefbf +0b55abc1ca2fe909 +22274e48b847c860 +0c3d3b45ff4a4326 +1fef543188ace6e5 +42f761f7e655bdce +39f1b33acc70ad7b +05b1462991e38e4d +2a4d835a6e023621 +419b773d04a986b7 +3015a3eab4b6d042 +01b08e2f20321127 +4b0fdb10ae15684b +33afdeba3cd5af05 +369f3639d9605255 +17c234c2eec050a0 +42d8b53a15001cd5 +4ae4456267802484 +0bd7e6e9f0185aa3 +125c92c36a04a68a +167c2e0c6e9ffa5d +4949361d0831c838 +01a5cc3805e94c21 +04c441c7ce273dcc +47d9493675e58f3b +497f507a5901bf4b +2276982dbc5a23e2 +2a8ef9e44f580d13 +0ed7ecf45f945ead +321911ae4a16f038 +24ad46fd2b26b208 +4c7df9d3840b2d63 +2ddca94aefd55b8f +0181b66a65650830 +22a0db80d91128e4 +2a2d971fd44ae258 +084fe29cd9d008db +339c95e2e709d044 +0c8b534612a0a776 +1f2153b5fb50d41a +4c8fd5318ae8d467 +33baf3e18e5d7256 +24d1c4f7497f8e77 +36fc018c7b62b997 +234c14a79d4da1ff +039cc34e9cdbcf8f +177d39d72e983b69 +3fdaa028b8baad4a +2e9d6a76c40b707f +3d8d753f0851bf3b +27f772c12c97b594 +3ebaecd85db14943 +1a04733e4ee45c90 +0e01be9445403642 +4ade6d5fe4b32738 +3cc4c306db84c6fe +1b747b8eba6f7b45 +052430ff6e2c07c4 +053e78d3134437a5 +20b541350492e3ad +1fd615fea825fe87 +0d82dba8f137e3da +49c9324758b5e867 +05bacb6d6a4741b0 +007b4ae7c05f2ea2 +3cc97c3d778975f2 +0d4b941f4678267b +06ca8f480c91e9eb +42f700b22cb0be39 +4571dd58b16ba385 +14a5b002ce46d4d3 +09c1b7a0876c08df +2b1da1fbe7f18f7e +3784777516b00247 +2ceec371086f5d82 +090c672e7e394397 +285aef90afaaf565 +37a960afb176c485 +2e715b2e0162f768 +0720bead0cc7cbbf +1d36cd02a549e244 +4b5a6dd314bebe88 +0a5eeb4466dd19bb +1512ea7a9754ac34 +22666111b2180af9 +09b505bb829c1d12 +442ad5ba8e834889 +325ff82707386438 +0a10d55239d83d99 +4bf9ef4705f35e8f +038137c9569c60eb +41210bec1c0c87e8 +21d9134faec148f2 +18089956e2be2289 +0cb83cef3177a006 +204fc9ff2c7ff92c +489f9441d513634e +2d39f39fb8254c27 +45e5fcd5c8978342 
+32cccd10c84b4529 +117335f5d67368ca +46e2ddf094d0c3a1 +2a8cd9f87b3c9a2b +0a9f2831a3e73de8 +3d584707e2f3ccf3 +32d2163aa65c0e8a +075278a4d0af74f7 +0145c694b53b120d +17ca3b8ad5815b35 +0f25241e37e16f56 +48bb743178166598 +0181d3b41c2cf87c +005dd9a58df1ba3c +42c2c85060ab5233 +49cc37f7f96be5a9 +1e5548d951e91a40 +3a4d7cbcf0c84668 +2990bbc008d9fa82 +3ad4793daf6adc19 +4828fb60e4a871ee +47a76ca10546fe8f +477df7ad0c2e7fdb +0d6a534d75f20921 +427c035484d45682 +49d21e0a4c3eace5 +3be029de36008afc +17f552ef56d85c55 +49235d402cbb8895 +3f8d1edf59e70df3 +0cf444aef3ba16bd +02c485bd207116d4 +02cb3a4fd80ee0cc +139a615209ee09ac +0f1f245fa1c181ae +1be3758972b35151 +03aa0437e5d62d58 +2d4e81e66ce80039 +4221bc1d4aea1a02 +13686755488a9d51 +04ef6a410f034514 +46a4d49d61a86d37 +03ef5f13e0a30864 +28e8300e004ab30b +3059f523501dfd97 +0d46043105cf3185 +482c3e92080f18c5 +41e428b3c7a16695 +18e659699338f835 +0aacb1732fee7a3c +237ef12cd69e2aa0 +31bf989cb15492d4 +2f8e1946600c65d4 +494f87170e713843 +4c2d32a7f2b62657 +4c96e034f8af77d6 +306e2b7785657539 +2ad4ea800caafe09 +43361dbc0c5a2808 +487d83675a8d1574 +2bd43375196ce1a7 +1b65111d34f57bb0 +0278b3d8abd9654d +3714123f055e06a1 +4cf74ffa5bfd5904 +0598fec76ecc7bd6 +2c52d9d606a3ece2 +180bf845cc8cada3 +2245fcd7f76c2ecd +41abd737e0228c1a +2d62a5d66d6e4931 +05a6149f1fcee38d +3a126bd9702ee8f7 +291db63458af0613 +478e22bb4c242aa9 +1c11709814a1a2f2 +2b1303680b081ccb +32ce9b303717a29d +04cb1526cf3c43cc +3eb185c04280412d +0c209edeb7637dff +1c58aa75858147f1 +4833b3d2a8184313 +0f7e8bf1137abcac +40904cd4b9e0579d +3aae131a319acd17 +4ca952ede2af6578 +489254d4b26a04c6 +02679535c5f06a19 +3a48dfbd2f0977f9 +04ed1812719e05f0 +2fe4e7ab61b23c85 +1798c9640d8875e6 +329abf340d23b0b9 +48d4444b94c2a2c0 +4cbb82a6bab25a0f +48614bc62c3acbf8 +0f7267e7e369b7d6 +33dbdc4396938ae1 +398c4688209874c9 +1433f61e9591ea9b +040de715f9303ba5 +3dba1838ed366ab5 +0de79d4f3d7a9171 +33f1be3a9ccf4e4b +25b3854e6efb747d +182054e13eaf58fc +310cbf1c65c52fa7 +037bdac76bdcd7c6 +18be7c1b9895691f +03fe94a439456692 
+2b121397791014af +44095e87bee5475e +39424be692a88364 +1df20a29cdec61a4 +49c758aa3c35ed86 +04ca03945611febb +134d7e5a74497a82 +2b81fbc1af01f0ad +2e4c69143b09033c +0ffd1083a70c6968 +1930a64d9a119b13 +14e3fec07ba502d7 +07d0229847bd7408 +37b0b70a1a0c25d3 +18cde229723f22df +24d95746999b7f7e +47d11d4bee6608e0 +0f47577ab3441480 +401a94bd9d84f501 +259f6f1f002d2d94 +3ce90c0ea2537c48 +1ffd3b706d708774 +1f62165dfec00c3e +066c35b1abc706be +06c71ce295284689 +0ecdc87c3391ce98 +034677cf3d80162d +0e7b68884ac4d959 +05ec2d3e4c027220 +23dcee801bca67bf +0131c9aed0fb3940 +22552c9a2a2a2ce7 +4263257ec6099434 +1c8d34a791deaaf1 +21b548b570c0b415 +11491a312c6b8f58 +482ce5c63038e5b4 +17d841670d2da942 +0ef054fbdabce0cd +45823117f0acb627 +0c89e266974e8b90 +115fa3a1923b7c9f +21840c44aed0ae43 +430d8fece8e0f7e5 +335794d48a9b168a +0757b4bc82bf26b1 +172bd46c6ddda95d +3d6e04af63ebfea4 +3cfb4c69b14a1970 +41016527728cae5b +195fbef2c08715e9 +078b8bdc29565cac +40284c1baec06ac9 +3613c77d8c234008 +3662b5f2916b470c +48aff7218b00d843 +368fae6f3bc0b0f7 +29460ac4580ea232 +0068e97c1c1f61aa +33f7565ccb685cb7 +00e8df74b6805da7 +316ed1b489ff8f40 +22c8b35c589276c4 +06ffaa9ffc2eea95 +3d410da4d7fd9f64 +3e07add8413f8157 +2a9d8ba86290db0e +375cff10cab07955 +2da082dfb7a66b4d +17a3e731c4fe0aaf +4811a66f87b0dd6a +1c375830155fe6dc +15729869d1862b7f +130cbf8f12764687 +015631b21f792a12 +12e3b1ad12a752f6 +040895d45bf4e580 +0e16d64d961fe855 +2d5e1c16ba1f89c2 +10351dc7a37a44c1 +2cf1a544b179b1a7 +3bae42d603be2266 +156f4c7dca878ff2 +2d3f982ada31489c +3a642c6d0e43510b +03de2844d3c8314e +08072d6cc8e8711d +39f03d5fb1807102 +21cfc9b1266a6bd0 +311d8515bd115aba +0c9d930d226d6bd6 +0dad7f2ef3496f13 +12bed927641025a0 +15bc9eb752c6dcbd +2177ca3a775a9ee9 +289bcb973ed702a1 +0f2197967bb7fa43 +33842f4b169e4145 +397b050e345d73fd +0542630de1d734de +0752baf20fbc2285 +3ee30754edfbdb3f +0c061512de79b744 +3eef492bf5120757 +45a0fe252a89e008 +36056faee50a621f +46b2a13f6ab0be05 +311db764bdc7f537 +074653ff3928b9fe +2d815b3e5e9bb237 +21b26eda16f7cb88 
+12cb4aa3f5b59ad6 +21f3cc00e0cfe8bc +088b93f15ca8745d +15762acaba295de1 +3c33566bacd602f2 +27d163d7046d36b5 +284efc2041b1d1d8 +2ad09b7837010330 +40954e72e02dc771 +03b440db4696d8e7 +0ce3839aa5b66e3f +47a07f51fd3fef77 +35e8c6c2168dd087 +11fcbcdb1dfefb38 +178cef169356d4a5 +122cb7d5ea4a99df +42742db2633d2eb5 +077d42bb51ee2793 +3dc0058dce3828d9 +44adc8d00568380f +0ccb28128213f19d +3862411e9bf455cd +3eb795302924e912 +2445756494ef6e3d +4c5fd496905b91ce +46e0654ccb5d88cf +16b48792e910cf49 +2e4013ea92d04301 +2ac712ac8d2fd488 +4ce642bc93f1bb5a +3052da93ceeda447 +2b973e6f676eb243 +46889bb1803c5cd7 +4a8f9d6889992fd9 +424397db4b1cf634 +0fe2286088ece98e +04433dcf217ad9a0 +37bbec9e46c8c1a9 +12e985eaa4b79298 +367446f773123e02 +397da8e32c2edd65 +069a4c442912c405 +1bc87c160d1dc982 +2c45f54d9432d25c +388ed39170b69946 +1058fe0400a873e0 +3e1236935a5f70ae +2e4c0705bde13d32 +449c34eaea295942 +306d435307fef477 +0d68a05801d48984 +0c916bcc9351521e +35f89d3ac607bd5a +1dad44855584a4ea +18297e1f8e25d3ee +0aa49c0b75e51ba4 +10f4acad3ed87288 +0ebb04534d7f2ba7 +0b04644621e97d30 +4246a11f0971a231 +127884736471b631 +367345ad0b29f8a3 +46fdfa2a16c7c811 +01cefee9563f691a +1250e369ad1e2fbc +1f5df6019b0bb73c +22598e2596e6bae7 +07d449efdb66c20d +2326e9820982ad81 +2e30ae101611cca8 +40ca76de44a6e1a9 +281452e730c39fd0 +33c1bb87a88e59f1 +45064c8142f3a360 +2b8778726c1f2fe4 +44accffce93c7e87 +1735d8d1b4015669 +2d99ba7951695f79 +3cc1fcf538c81442 +3018aa8ad3eb5dca +33be1ba5aec86c96 +2fca5797ae48529b +4054d32655ba5eda +1ce8503fd200fed2 +4a763e1b87e495a7 +1f7770ac5cbb41eb +3b59c7d97b900724 +2b5b5b4f4fc526ba +3e68931874661724 +3fe783b9c7c8f492 +0722819bdf5b2737 +20f8c6738e22e764 +48e49bdd1aa706e1 +3b650a9e2ebdfde2 +0667d5bedfdbc555 +33517e9838fe5f20 +227e06087cbffb2b +0f5c5385dbcd96df +0bf152ef84195293 +296ec0d98f4d4151 +293c7c1ccaa6861a +31b5667e16de1d94 +1d04977fb85a8b3c +01570ac1c73e9ca2 +28760a14d0a5ff3a +15c9a45c9c3d73f2 +288dd40199dee268 +0e101fbd21daf79c +2f878176347bcf9e +0282160b901229a7 +1c919c7e4ec601de 
+3a3bc11b9ebb7d44 +19391676f0fc7982 +322261824c4a3003 +2064e46352532375 +44a85c75cf4a6da8 +47a1f1f01e2b7be6 +49f6f14c580b71b9 +05c423623c9f6f56 +4214dda1f9a61b1b +1e01b910ceba4573 +3783162ee796a21c +0c9b371cc6225682 +1910e79a60d57aa7 +2422f760ea77551d +2e6876c6c1e40652 +3f68a1e365e94eb4 +1bdf9dd7628ddb0b +13c510a7403f8231 +42b88f7ee71a7ba9 +399cfd9cfacc0499 +4615277ffb68ca9d +2da2eeb966bc0ef8 +1c1b2e56952040cc +20bab82d0268c877 +0cca84503a86574c +45cb862034851efe +16930ebf3f0f6b84 +2a058bafbecaccf9 +28ed97894371982f +4955f54f807ef5aa +0fd536fc3c8fdf19 +20764a96cc70fe46 +0ed25f15cbccd939 +37de8da2580d0c1d +0932a5bf82eb2f5c +0e8995dcbdd22f48 +0d8fd962cbfc81b7 +3e00b129b656fbce +13adf913ea857ddf +22da7610855d6b9d +2aab03e1aec0222c +0d22ced53b1db7d3 +3b122e1becb5fcb7 +3f45b8234504020c +15bc7fa1ed5567cc +11e9cb1ccb9abe9f +150b45a39c57623d +309bed43e4406d72 +14cf1f92ca13d605 +3984d005557cbd6f +0c24590c68af865f +144b95c0c3fbe3b0 +2945a940639798ce +1515d37824dd6b22 +495f50f0997e986e +48b1808c546c7e87 +075654f497170f90 +2ccd2d98696c87e0 +144c2c2c52734f15 +15c28f4ada02cf91 +3164f4c30188d403 +389851bf0ac38227 +4c76898a3d535741 +0f620bfafa25fcf5 +28c9d20b865f5d56 +069a9416dc6a373b +28a5318660ab60ba +10ad4fc499c48b38 +3b67613d97aac1df +118ffef2ad3950f5 +3c83e9817c9e022d +3c054be9bdb304ee +1dcd8aee9a39a61a +2f18f5579583e648 +0539bcecbc483dfe +3814a3a8046c8af3 +1fa7929f55b1fdd5 +0f6206df8a8e440a +0017ce4c6a39d122 +16a7b5a41f31feb8 +0de6bc7da518fcae +1d125b16063c96c4 +2c7e35b25a2e4b8b +329177dabfe2951d +45122648522d4180 +2d8f1ccdb70c156a +1c73def8a62301a9 +2cd27189549897bf +49ec7608e51f7ee2 +486970d685c0b746 +0583c7a746238f79 +06a2911e9add96c6 +3b9420585a1e66fc +1692fab166811028 +3aaed2e6422d7d57 +20422596003ac855 +0ed8b86b87a30d38 +075f0d808a621ae2 +48a3049cabb54c0d +0c9ea3bf67254e95 +022a21a897f2a904 +3d60041ab79f46fc +21a6081709444ebf +24598987691df957 +41936ce6152fee64 +4a177be7db12edcd +225ae1d37a7fa519 +4b4c0c27204604a3 +4b5619958277861f +16ed45d1ce9017df +061e49ba3a5386c7 
+19c6e67783781a43 +3b58206d99feb4b7 +31812ed5877b73ab +0b530eea368f626e +37ff932a6a608c24 +2a30c5309018e00f +42c75d578535b0fe +177ff3969577b8de +0761e0a3a4e25c53 +473e6ec61583d90f +42cc82972397863b +4908fab97c9bcec7 +04e2be0415136fa9 +0cdfa29561cb24e0 +23cfdadab7cc51a1 +474d403238a41315 +20d0e788abca4aa9 +0bb7da710cbf4bb9 +4ace59951acbae4e +2d29ff162920db5e +2f25826f0d0ef09a +0b429a4733089487 +45592a7f307bccd0 +280443260e3dced9 +4905bc8817511dd2 +338ff9f6c02a6a40 +44b78f9fcb5cd8d8 +4bc47dc7f8781812 +2217c43ddaa29027 +0aa284f8166e19e4 +1d7af31482baf61c +01cf2d900cb03afb +46c9e2d86e7d4c41 +4a566b7e6eeaf9b4 +166bb958d7f4798a +01f7915dce639515 +1259726fc1f8e966 +314c56aad151508d +17d35e133dc3ce90 +223b5e20753d4fa3 +1cbab4f69b2d48ce +464d97e527dd5f8a +093b1fe6bc1fd024 +2007d4829b187feb +165696025b477097 +2412e9f45282fd15 +0ef15055b44649e3 +4089ef1b1bdb1d36 +0898d467d34ff7b2 +4a046d13e389b505 +497d2450ed65a678 +0090cc64d7b7bb24 +1961bb85524de229 +2c9018ef57c6b061 +3dba9cb74bfb79b2 +168c85ce00de0c6b +0c72eaf6bbb7c681 +384fc7a71b9faca7 +0a7c052273895bb3 +4b457a008376cd73 +021d7121906d6cab +228ed4b87c8a6ea7 +0e241f40ce0cd802 +3db49ddb3f470436 +30006eb23f62aa57 +454197dc5b50b45f +020991bdfbdbe504 +4242fb49c775710c +41f438dd19aae981 \ No newline at end of file diff --git a/evaluation/preprocess_co3d.py b/evaluation/preprocess_co3d.py new file mode 100644 index 00000000..e55fd380 --- /dev/null +++ b/evaluation/preprocess_co3d.py @@ -0,0 +1,131 @@ +# Modified from https://github.com/amyxlase/relpose-plus-plus/blob/main/preprocess/preprocess_co3d.py + + +""" +Usage: + python -m preprocess.preprocess_co3d --category all \ + --co3d_v2_dir /path/to/co3d_v2 +""" +import argparse +import gzip +import json +import os +import os.path as osp + +import matplotlib.pyplot as plt +import numpy as np +from tqdm.auto import tqdm + +# fmt: off +# CATEGORIES = [ +# "apple", "backpack", "ball", "banana", "baseballbat", "baseballglove", +# "bench", "bicycle", "book", "bottle", "bowl", "broccoli", 
"cake", "car", "carrot", +# "cellphone", "chair", "couch", "cup", "donut", "frisbee", "hairdryer", "handbag", +# "hotdog", "hydrant", "keyboard", "kite", "laptop", "microwave", "motorcycle", +# "mouse", "orange", "parkingmeter", "pizza", "plant", "remote", "sandwich", +# "skateboard", "stopsign", "suitcase", "teddybear", "toaster", "toilet", "toybus", +# "toyplane", "toytrain", "toytruck", "tv", "umbrella", "vase", "wineglass", +# ] +CATEGORIES = [ + "apple", "bench", "bowl", "cellphone", "frisbee", "hotdog", "keyboard", "parkingmeter", "teddybear", "toybus", + "backpack", "book", "car", "donut", "handbag", "hydrant", "motorcycle", "pizza", "stopsign", "toaster", "tv" +] +# fmt: on + + +def get_parser(): + parser = argparse.ArgumentParser() + parser.add_argument("--category", type=str, default="apple") + parser.add_argument("--output_dir", type=str, default="annotations/co3d_v2_annotations") + parser.add_argument("--co3d_v2_dir", type=str, default="data/co3d_v2") + parser.add_argument( + "--min_quality", + type=float, + default=0.5, + help="Minimum viewpoint quality score.", + ) + return parser + + + + +def process_poses(co3d_dir, category, output_dir, min_quality): + category_dir = osp.join(co3d_dir, category) + print("Processing category:", category) + frame_file = osp.join(category_dir, "frame_annotations.jgz") + sequence_file = osp.join(category_dir, "sequence_annotations.jgz") + subset_lists_file = osp.join(category_dir, "set_lists/set_lists_fewview_dev.json") + + # bbox_file = osp.join(output_dir, f"{category}_bbox.jgz") + + with open(subset_lists_file) as f: + subset_lists_data = json.load(f) + + with gzip.open(sequence_file, "r") as fin: + sequence_data = json.loads(fin.read()) + + with gzip.open(frame_file, "r") as fin: + frame_data = json.loads(fin.read()) + + # with gzip.open(bbox_file, "r") as fin: + # bbox_data = json.loads(fin.read()) + + frame_data_processed = {} + for f_data in frame_data: + sequence_name = f_data["sequence_name"] + if sequence_name 
not in frame_data_processed: + frame_data_processed[sequence_name] = {} + frame_data_processed[sequence_name][f_data["frame_number"]] = f_data + + good_quality_sequences = set() + for seq_data in sequence_data: + if seq_data["viewpoint_quality_score"] > min_quality: + good_quality_sequences.add(seq_data["sequence_name"]) + + for subset in ["train", "test"]: + category_data = {} # {sequence_name: [{filepath, R, T}]} + for seq_name, frame_number, filepath in subset_lists_data[subset]: + if seq_name not in good_quality_sequences: + continue + + if seq_name not in category_data: + category_data[seq_name] = [] + + # mask_path = filepath.replace("images", "masks").replace(".jpg", ".png") + # bbox = bbox_data[mask_path] + # if bbox == []: + # Mask did not include any object. + # continue + + frame_data = frame_data_processed[seq_name][frame_number] + category_data[seq_name].append( + { + "filepath": filepath, + "R": frame_data["viewpoint"]["R"], + "T": frame_data["viewpoint"]["T"], + "focal_length": frame_data["viewpoint"]["focal_length"], + "principal_point": frame_data["viewpoint"]["principal_point"], + # "bbox": bbox, + } + ) + + output_file = osp.join(output_dir, f"{category}_{subset}.jgz") + with gzip.open(output_file, "w") as f: + f.write(json.dumps(category_data).encode("utf-8")) + + + +if __name__ == "__main__": + parser = get_parser() + args = parser.parse_args() + if args.category == "all": + categories = CATEGORIES + else: + categories = [args.category] + for category in categories: + process_poses( + co3d_dir=args.co3d_v2_dir, + category=category, + output_dir=args.output_dir, + min_quality=args.min_quality, + ) \ No newline at end of file diff --git a/evaluation/re10k.py b/evaluation/re10k.py new file mode 100644 index 00000000..570d51db --- /dev/null +++ b/evaluation/re10k.py @@ -0,0 +1,3 @@ +# https://github.com/facebookresearch/vggt/issues/45#top +# https://github.com/facebookresearch/PoseDiffusion/blob/main/pose_diffusion/datasets/re10k_test_1800.txt +# 
https://github.com/yyfz/Pi3/tree/evaluation \ No newline at end of file diff --git a/evaluation/test_co3d.py b/evaluation/test_co3d.py new file mode 100644 index 00000000..61dbe511 --- /dev/null +++ b/evaluation/test_co3d.py @@ -0,0 +1,476 @@ +import os +import torch +import numpy as np +import gzip +import json +import random +import logging +import warnings +from vggt.models.vggt import VGGT +from vggt.models.vggt_small import VGGT as VGGTsmall +from vggt.utils.rotation import mat_to_quat +from vggt.utils.load_fn import load_and_preprocess_images +from vggt.utils.pose_enc import pose_encoding_to_extri_intri +from vggt.utils.geometry import closed_form_inverse_se3 +import argparse +# python test_co3d.py --model_path ../pretrained_models/model_tracker_fixed_e20.pt --co3d_dir /mimer/NOBACKUP/groups/3d-dl/co3dv2 co3d_anno_dir ../annotations/co3d_v2_annotations +# python test_co3d.py --model_path ../training/logs/dinov3_exp004/ckpts/checkpoint.pt --co3d_dir /mimer/NOBACKUP/groups/3d-dl/co3dv2 co3d_anno_dir ../annotations/co3d_v2_annotations --encoder dinov3 + + +# Suppress DINO v2 logs +logging.getLogger("dinov2").setLevel(logging.WARNING) +warnings.filterwarnings("ignore", message="xFormers is available") +warnings.filterwarnings("ignore", message="dinov2") + +# Set computation precision +torch.set_float32_matmul_precision('highest') +torch.backends.cudnn.allow_tf32 = False + + +def convert_pt3d_RT_to_opencv(Rot, Trans): + """ + Convert Point3D extrinsic matrices to OpenCV convention. 
+ + Args: + Rot: 3D rotation matrix in Point3D format + Trans: 3D translation vector in Point3D format + + Returns: + extri_opencv: 3x4 extrinsic matrix in OpenCV format + """ + rot_pt3d = np.array(Rot) + trans_pt3d = np.array(Trans) + + trans_pt3d[:2] *= -1 + rot_pt3d[:, :2] *= -1 + rot_pt3d = rot_pt3d.transpose(1, 0) + extri_opencv = np.hstack((rot_pt3d, trans_pt3d[:, None])) + return extri_opencv + + +def build_pair_index(N, B=1): + """ + Build indices for all possible pairs of frames. + + Args: + N: Number of frames + B: Batch size + + Returns: + i1, i2: Indices for all possible pairs + """ + i1_, i2_ = torch.combinations(torch.arange(N), 2, with_replacement=False).unbind(-1) + i1, i2 = [(i[None] + torch.arange(B)[:, None] * N).reshape(-1) for i in [i1_, i2_]] + return i1, i2 + + +def rotation_angle(rot_gt, rot_pred, batch_size=None, eps=1e-15): + """ + Calculate rotation angle error between ground truth and predicted rotations. + + Args: + rot_gt: Ground truth rotation matrices + rot_pred: Predicted rotation matrices + batch_size: Batch size for reshaping the result + eps: Small value to avoid numerical issues + + Returns: + Rotation angle error in degrees + """ + q_pred = mat_to_quat(rot_pred) + q_gt = mat_to_quat(rot_gt) + + loss_q = (1 - (q_pred * q_gt).sum(dim=1) ** 2).clamp(min=eps) + err_q = torch.arccos(1 - 2 * loss_q) + + rel_rangle_deg = err_q * 180 / np.pi + + if batch_size is not None: + rel_rangle_deg = rel_rangle_deg.reshape(batch_size, -1) + + return rel_rangle_deg + + +def translation_angle(tvec_gt, tvec_pred, batch_size=None, ambiguity=True): + """ + Calculate translation angle error between ground truth and predicted translations. 
+ + Args: + tvec_gt: Ground truth translation vectors + tvec_pred: Predicted translation vectors + batch_size: Batch size for reshaping the result + ambiguity: Whether to handle direction ambiguity + + Returns: + Translation angle error in degrees + """ + rel_tangle_deg = compare_translation_by_angle(tvec_gt, tvec_pred) + rel_tangle_deg = rel_tangle_deg * 180.0 / np.pi + + if ambiguity: + rel_tangle_deg = torch.min(rel_tangle_deg, (180 - rel_tangle_deg).abs()) + + if batch_size is not None: + rel_tangle_deg = rel_tangle_deg.reshape(batch_size, -1) + + return rel_tangle_deg + + +def compare_translation_by_angle(t_gt, t, eps=1e-15, default_err=1e6): + """ + Normalize the translation vectors and compute the angle between them. + + Args: + t_gt: Ground truth translation vectors + t: Predicted translation vectors + eps: Small value to avoid division by zero + default_err: Default error value for invalid cases + + Returns: + Angular error between translation vectors in radians + """ + t_norm = torch.norm(t, dim=1, keepdim=True) + t = t / (t_norm + eps) + + t_gt_norm = torch.norm(t_gt, dim=1, keepdim=True) + t_gt = t_gt / (t_gt_norm + eps) + + loss_t = torch.clamp_min(1.0 - torch.sum(t * t_gt, dim=1) ** 2, eps) + err_t = torch.acos(torch.sqrt(1 - loss_t)) + + err_t[torch.isnan(err_t) | torch.isinf(err_t)] = default_err + return err_t + + +def calculate_auc_np(r_error, t_error, max_threshold=30): + """ + Calculate the Area Under the Curve (AUC) for the given error arrays using NumPy. 
+ + Args: + r_error: numpy array representing R error values (Degree) + t_error: numpy array representing T error values (Degree) + max_threshold: Maximum threshold value for binning the histogram + + Returns: + AUC value and the normalized histogram + """ + error_matrix = np.concatenate((r_error[:, None], t_error[:, None]), axis=1) + max_errors = np.max(error_matrix, axis=1) + bins = np.arange(max_threshold + 1) + histogram, _ = np.histogram(max_errors, bins=bins) + num_pairs = float(len(max_errors)) + normalized_histogram = histogram.astype(float) / num_pairs + return np.mean(np.cumsum(normalized_histogram)), normalized_histogram + + +def se3_to_relative_pose_error(pred_se3, gt_se3, num_frames): + """ + Compute rotation and translation errors between predicted and ground truth poses. + This function assumes the input poses are world-to-camera (w2c) transformations. + + Args: + pred_se3: Predicted SE(3) transformations (w2c), shape (N, 4, 4) + gt_se3: Ground truth SE(3) transformations (w2c), shape (N, 4, 4) + num_frames: Number of frames (N) + + Returns: + Rotation and translation angle errors in degrees + """ + pair_idx_i1, pair_idx_i2 = build_pair_index(num_frames) + + relative_pose_gt = gt_se3[pair_idx_i1].bmm( + closed_form_inverse_se3(gt_se3[pair_idx_i2]) + ) + relative_pose_pred = pred_se3[pair_idx_i1].bmm( + closed_form_inverse_se3(pred_se3[pair_idx_i2]) + ) + + rel_rangle_deg = rotation_angle( + relative_pose_gt[:, :3, :3], relative_pose_pred[:, :3, :3] + ) + rel_tangle_deg = translation_angle( + relative_pose_gt[:, :3, 3], relative_pose_pred[:, :3, 3] + ) + + return rel_rangle_deg, rel_tangle_deg + + +def setup_args(): + """Set up command-line arguments for the CO3D evaluation script.""" + parser = argparse.ArgumentParser(description='Test VGGT on CO3D dataset') + parser.add_argument('--debug', action='store_true', help='Enable debug mode (only test on specific category)') + parser.add_argument('--fast_eval', action='store_true', default=False, 
help='Only evaluate 10 sequences per category') + + parser.add_argument('--big_model', action='store_true', default=False, help='If to load the original VGGT') + parser.add_argument('--encoder', type=str, default="dinov3", help='Encoder to use in VGGTsmall') + + parser.add_argument('--min_num_images', type=int, default=50, help='Minimum number of images for a sequence') + parser.add_argument('--num_frames', type=int, default=10, help='Number of frames to use for testing') + parser.add_argument('--co3d_dir', type=str, required=True, help='Path to CO3D dataset') + parser.add_argument('--co3d_anno_dir', type=str, required=True, help='Path to CO3D annotations') + parser.add_argument('--seed', type=int, default=0, help='Random seed for reproducibility') + parser.add_argument('--model_path', type=str, required=True, help='Path to the VGGT model checkpoint') + return parser.parse_args() + + +def load_model(device, model_path, big_model=False, encoder="dinov3"): + """ + Load the VGGT model. + + Args: + device: Device to load the model on + model_path: Path to the model checkpoint + + Returns: + Loaded VGGT model + """ + print("Initializing and loading VGGT model...") + if not big_model: + model = VGGTsmall( + img_size=336, + embed_dim=768, + depth=6, + num_heads=12, + patch_size=16, + patch_embed=encoder, + enable_camera=True, + enable_depth=True, + enable_point=True, + enable_track=False, + ) + else: + model = VGGT() + print(f"USING {model_path}") + model.load_state_dict(torch.load(model_path)['model'], strict=True) + model.eval() + model = model.to(device) + return model + + + +def set_random_seeds(seed): + """ + Set random seeds for reproducibility. 
+ + Args: + seed: Random seed value + """ + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def process_sequence(model, seq_name, seq_data, category, co3d_dir, min_num_images, num_frames, device, dtype): + """ + Process a single sequence and compute pose errors. + + Args: + model: VGGT model + seq_name: Sequence name + seq_data: Sequence data + category: Category name + co3d_dir: CO3D dataset directory + min_num_images: Minimum number of images required + num_frames: Number of frames to sample + device: Device to run on + dtype: Data type for model inference + + Returns: + rError: Rotation errors + tError: Translation errors + """ + if len(seq_data) < min_num_images: + return None, None + + metadata = [] + for data in seq_data: + # Make sure translations are not ridiculous + if data["T"][0] + data["T"][1] + data["T"][2] > 1e5: + return None, None + extri_opencv = convert_pt3d_RT_to_opencv(data["R"], data["T"]) + metadata.append({ + "filepath": data["filepath"], + "extri": extri_opencv, + }) + + # Random sample num_frames images + ids = np.random.choice(len(metadata), num_frames, replace=False) + print("Image ids", ids) + + image_names = [os.path.join(co3d_dir, metadata[i]["filepath"]) for i in ids] + gt_extri = [np.array(metadata[i]["extri"]) for i in ids] + gt_extri = np.stack(gt_extri, axis=0) + + images = load_and_preprocess_images(image_names).to(device) + + + with torch.no_grad(): + with torch.cuda.amp.autocast(dtype=dtype): + predictions = model(images) + with torch.cuda.amp.autocast(dtype=torch.float64): + extrinsic, intrinsic = pose_encoding_to_extri_intri(predictions["pose_enc"], images.shape[-2:]) + pred_extrinsic = extrinsic[0] + + with torch.cuda.amp.autocast(dtype=torch.float64): + gt_extrinsic = torch.from_numpy(gt_extri).to(device) + add_row = torch.tensor([0, 0, 0, 1], device=device).expand(pred_extrinsic.size(0), 1, 4) + + pred_se3 = torch.cat((pred_extrinsic, 
add_row), dim=1) + gt_se3 = torch.cat((gt_extrinsic, add_row), dim=1) + + rel_rangle_deg, rel_tangle_deg = se3_to_relative_pose_error(pred_se3, gt_se3, num_frames) + + + Racc_5 = (rel_rangle_deg < 5).float().mean().item() + Tacc_5 = (rel_tangle_deg < 5).float().mean().item() + + print(f"{category} sequence {seq_name} R_ACC@5: {Racc_5:.4f}") + print(f"{category} sequence {seq_name} T_ACC@5: {Tacc_5:.4f}") + + return rel_rangle_deg.cpu().numpy(), rel_tangle_deg.cpu().numpy() + + +def main(): + """Main function to evaluate VGGT on CO3D dataset.""" + # Parse command-line arguments + args = setup_args() + + # Setup device and data type + device = "cuda" if torch.cuda.is_available() else "cpu" + dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16 + + # Load model + model = load_model(device, model_path=args.model_path, big_model=args.big_model, encoder=args.encoder) + + # Set random seeds + set_random_seeds(args.seed) + + # Categories to evaluate + SEEN_CATEGORIES = [ + "apple", + "bench", + "bowl", "cellphone", "frisbee", "hotdog", + # "keyboard", + "parkingmeter", + "teddybear", + "toybus", + "backpack", + "book", + "car", + "donut", + "handbag", + "hydrant", + "motorcycle", + "pizza", + "stopsign", + "toaster", + "tv" + ] + + if args.debug: + SEEN_CATEGORIES = ["parkingmeter"] + + per_category_results = {} + + for category in SEEN_CATEGORIES: + print(f"Loading annotation for {category} test set") + annotation_file = os.path.join(args.co3d_anno_dir, f"{category}_test.jgz") + + try: + with gzip.open(annotation_file, "r") as fin: + annotation = json.loads(fin.read()) + except FileNotFoundError: + print(f"Annotation file not found for {category}, skipping") + continue + + rError = [] + tError = [] + + seq_names = sorted(list(annotation.keys())) + if args.fast_eval and len(seq_names)>=10: + seq_names = random.sample(seq_names, 10) + seq_names = sorted(seq_names) + + + print("Testing Sequences: ") + 
print(seq_names) + + for seq_name in seq_names: + seq_data = annotation[seq_name] + print("-" * 50) + print(f"Processing {seq_name} for {category} test set") + if args.debug and not os.path.exists(os.path.join(args.co3d_dir, category, seq_name)): + print(f"Skipping {seq_name} (not found)") + continue + + try: + seq_rError, seq_tError = process_sequence( + model, seq_name, seq_data, category, args.co3d_dir, + args.min_num_images, args.num_frames, device, dtype, + ) + except Exception as e: + print(f"Error processing {seq_name}: {e}") + continue + + print("-" * 50) + + if seq_rError is not None and seq_tError is not None: + rError.extend(seq_rError) + tError.extend(seq_tError) + + if not rError: + print(f"No valid sequences found for {category}, skipping") + continue + + rError = np.array(rError) + tError = np.array(tError) + + Auc_30, _ = calculate_auc_np(rError, tError, max_threshold=30) + Auc_15, _ = calculate_auc_np(rError, tError, max_threshold=15) + Auc_5, _ = calculate_auc_np(rError, tError, max_threshold=5) + Auc_3, _ = calculate_auc_np(rError, tError, max_threshold=3) + + per_category_results[category] = { + "rError": rError, + "tError": tError, + "Auc_30": Auc_30, + "Auc_15": Auc_15, + "Auc_5": Auc_5, + "Auc_3": Auc_3 + } + + print("="*80) + # Print results with colors + GREEN = "\033[92m" + RED = "\033[91m" + BLUE = "\033[94m" + BOLD = "\033[1m" + RESET = "\033[0m" + + print(f"{BOLD}{BLUE}AUC of {category} test set:{RESET} {GREEN}{Auc_30:.4f} (AUC@30), {Auc_15:.4f} (AUC@15), {Auc_5:.4f} (AUC@5), {Auc_3:.4f} (AUC@3){RESET}") + mean_AUC_30_by_now = np.mean([per_category_results[category]["Auc_30"] for category in per_category_results]) + mean_AUC_15_by_now = np.mean([per_category_results[category]["Auc_15"] for category in per_category_results]) + mean_AUC_5_by_now = np.mean([per_category_results[category]["Auc_5"] for category in per_category_results]) + mean_AUC_3_by_now = np.mean([per_category_results[category]["Auc_3"] for category in 
per_category_results]) + print(f"{BOLD}{BLUE}Mean AUC of categories by now:{RESET} {RED}{mean_AUC_30_by_now:.4f} (AUC@30), {mean_AUC_15_by_now:.4f} (AUC@15), {mean_AUC_5_by_now:.4f} (AUC@5), {mean_AUC_3_by_now:.4f} (AUC@3){RESET}") + print("="*80) + + # Print summary results + print("\nSummary of AUC results:") + print("-"*50) + for category in sorted(per_category_results.keys()): + print(f"{category:<15}: {per_category_results[category]['Auc_30']:.4f} (AUC@30), {per_category_results[category]['Auc_15']:.4f} (AUC@15), {per_category_results[category]['Auc_5']:.4f} (AUC@5), {per_category_results[category]['Auc_3']:.4f} (AUC@3)") + + if per_category_results: + mean_AUC_30 = np.mean([per_category_results[category]["Auc_30"] for category in per_category_results]) + mean_AUC_15 = np.mean([per_category_results[category]["Auc_15"] for category in per_category_results]) + mean_AUC_5 = np.mean([per_category_results[category]["Auc_5"] for category in per_category_results]) + mean_AUC_3 = np.mean([per_category_results[category]["Auc_3"] for category in per_category_results]) + print("-"*50) + print(f"Mean AUC: {mean_AUC_30:.4f} (AUC@30), {mean_AUC_15:.4f} (AUC@15), {mean_AUC_5:.4f} (AUC@5), {mean_AUC_3:.4f} (AUC@3)") + print(args.model_path) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/evaluation/test_relpose.py b/evaluation/test_relpose.py new file mode 100644 index 00000000..f22f8f86 --- /dev/null +++ b/evaluation/test_relpose.py @@ -0,0 +1,458 @@ +from math import e +import os +import torch +import numpy as np +import gzip +import json +import random +import logging +import warnings +from vggt.models.vggt_small import VGGT as VGGTsmall +from vggt.models.vggt import VGGT +from vggt.utils.rotation import mat_to_quat +from vggt.utils.load_fn import load_and_preprocess_images +from vggt.utils.pose_enc import pose_encoding_to_extri_intri +from vggt.utils.geometry import closed_form_inverse_se3 +import argparse +import gzip +import json +import 
os +import logging +from PIL import Image + +# python evaluation/test_megadepth.py --data_dir /mimer/NOBACKUP/groups/snic2022-6-266/data/megadepth --anno_dir /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations/megadepth/test.jgz +# CUDA_VISIBLE_DEVICES=1 python evaluation/test_megadepth.py --data_dir /mimer/NOBACKUP/groups/snic2022-6-266/data/megadepth --anno_dir /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations/megadepth/test.jgz --model_path /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/training/logs/dinov3_exp001/ckpts/checkpoint_15.pt + + +# python evaluation/test_megadepth.py --data_dir /mimer/NOBACKUP/groups/snic2022-6-266/data/megadepth --anno_dir /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations/megadepth/test.jgz --model_path /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/training/logs/dinov3_exp001/ckpts/checkpoint_15.pt --fast_eval + + +# For running MegaDepth-1500: +# * python test_relpose.py --data_dir /mimer/NOBACKUP/groups/snic2022-6-266/data/megadepth --anno_dir /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations/megadepth/test.jgz --model_path ../pretrained_models/model_tracker_fixed_e20.pt --fast_eval + +# For running ScanNet-1500: +# python test_relpose.py --data_dir /mimer/NOBACKUP/groups/3d-dl/scannet/scannet_test_1500 --anno_dir /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations/scannet/scannet_test_1500.jgz --model_path ../pretrained_models/model_tracker_fixed_e20.pt --fast_eval + +# Example on how to evaluate MuM on MegaDepth: +# python test_relpose.py --data_dir /mimer/NOBACKUP/groups/snic2022-6-266/data/megadepth --anno_dir /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations/megadepth/test.jgz --model_path ../training/logs/mum_exp001/ckpts/checkpoint.pt --fast_eval --encoder mum +# python test_relpose.py --data_dir /mimer/NOBACKUP/groups/3d-dl/scannet/scannet_test_1500 --anno_dir 
/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations/scannet/scannet_test_1500.jgz --model_path ../training/logs/dinov3_exp001/ckpts/checkpoint.pt --fast_eval --encoder dinov3 +# python test_co3d.py --model_path ../training/logs/mum_exp001/ckpts/checkpoint.pt --fast_eval --encoder mum --co3d_dir /mimer/NOBACKUP/groups/3d-dl/co3dv2 --co3d_anno_dir ../annotations/co3d_v2_annotations +# CUDA_VISIBLE_DEVICES=2 python test_relpose.py --data_dir /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/data/re10k/ --anno_dir /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations/re10k/test.jgz --model_path ../training/logs/mum_exp004/ckpts/checkpoint.pt --fast_eval --encoder mum + +# Suppress DINO v2 logs +logging.getLogger("dinov2").setLevel(logging.WARNING) +warnings.filterwarnings("ignore", message="xFormers is available") +warnings.filterwarnings("ignore", message="dinov2") + +# Set computation precision +torch.set_float32_matmul_precision('highest') +torch.backends.cudnn.allow_tf32 = False + + +def convert_pt3d_RT_to_opencv(Rot, Trans): + """ + Convert Point3D extrinsic matrices to OpenCV convention. + + Args: + Rot: 3D rotation matrix in Point3D format + Trans: 3D translation vector in Point3D format + + Returns: + extri_opencv: 3x4 extrinsic matrix in OpenCV format + """ + rot_pt3d = np.array(Rot) + trans_pt3d = np.array(Trans) + + trans_pt3d[:2] *= -1 + rot_pt3d[:, :2] *= -1 + rot_pt3d = rot_pt3d.transpose(1, 0) + extri_opencv = np.hstack((rot_pt3d, trans_pt3d[:, None])) + return extri_opencv + + +def build_pair_index(N, B=1): + """ + Build indices for all possible pairs of frames. 
+ + Args: + N: Number of frames + B: Batch size + + Returns: + i1, i2: Indices for all possible pairs + """ + i1_, i2_ = torch.combinations(torch.arange(N), 2, with_replacement=False).unbind(-1) + i1, i2 = [(i[None] + torch.arange(B)[:, None] * N).reshape(-1) for i in [i1_, i2_]] + return i1, i2 + + +def rotation_angle(rot_gt, rot_pred, batch_size=None, eps=1e-15): + """ + Calculate rotation angle error between ground truth and predicted rotations. + + Args: + rot_gt: Ground truth rotation matrices + rot_pred: Predicted rotation matrices + batch_size: Batch size for reshaping the result + eps: Small value to avoid numerical issues + + Returns: + Rotation angle error in degrees + """ + q_pred = mat_to_quat(rot_pred) + q_gt = mat_to_quat(rot_gt) + + loss_q = (1 - (q_pred * q_gt).sum(dim=1) ** 2).clamp(min=eps) + err_q = torch.arccos(1 - 2 * loss_q) + + rel_rangle_deg = err_q * 180 / np.pi + + if batch_size is not None: + rel_rangle_deg = rel_rangle_deg.reshape(batch_size, -1) + + return rel_rangle_deg + + +def translation_angle(tvec_gt, tvec_pred, batch_size=None, ambiguity=True): + """ + Calculate translation angle error between ground truth and predicted translations. + + Args: + tvec_gt: Ground truth translation vectors + tvec_pred: Predicted translation vectors + batch_size: Batch size for reshaping the result + ambiguity: Whether to handle direction ambiguity + + Returns: + Translation angle error in degrees + """ + rel_tangle_deg = compare_translation_by_angle(tvec_gt, tvec_pred) + rel_tangle_deg = rel_tangle_deg * 180.0 / np.pi + + if ambiguity: + rel_tangle_deg = torch.min(rel_tangle_deg, (180 - rel_tangle_deg).abs()) + + if batch_size is not None: + rel_tangle_deg = rel_tangle_deg.reshape(batch_size, -1) + + return rel_tangle_deg + + +def compare_translation_by_angle(t_gt, t, eps=1e-15, default_err=1e6): + """ + Normalize the translation vectors and compute the angle between them. 
+ + Args: + t_gt: Ground truth translation vectors + t: Predicted translation vectors + eps: Small value to avoid division by zero + default_err: Default error value for invalid cases + + Returns: + Angular error between translation vectors in radians + """ + t_norm = torch.norm(t, dim=1, keepdim=True) + t = t / (t_norm + eps) + + t_gt_norm = torch.norm(t_gt, dim=1, keepdim=True) + t_gt = t_gt / (t_gt_norm + eps) + + loss_t = torch.clamp_min(1.0 - torch.sum(t * t_gt, dim=1) ** 2, eps) + err_t = torch.acos(torch.sqrt(1 - loss_t)) + + err_t[torch.isnan(err_t) | torch.isinf(err_t)] = default_err + return err_t + + +def calculate_auc_np(r_error, t_error, max_threshold=30): + """ + Calculate the Area Under the Curve (AUC) for the given error arrays using NumPy. + + Args: + r_error: numpy array representing R error values (Degree) + t_error: numpy array representing T error values (Degree) + max_threshold: Maximum threshold value for binning the histogram + + Returns: + AUC value and the normalized histogram + """ + error_matrix = np.concatenate((r_error[:, None], t_error[:, None]), axis=1) + max_errors = np.max(error_matrix, axis=1) + bins = np.arange(max_threshold + 1) + histogram, _ = np.histogram(max_errors, bins=bins) + num_pairs = float(len(max_errors)) + normalized_histogram = histogram.astype(float) / num_pairs + return np.mean(np.cumsum(normalized_histogram)), normalized_histogram + + +def se3_to_relative_pose_error(pred_se3, gt_se3, num_frames): + """ + Compute rotation and translation errors between predicted and ground truth poses. + This function assumes the input poses are world-to-camera (w2c) transformations. 
+ + Args: + pred_se3: Predicted SE(3) transformations (w2c), shape (N, 4, 4) + gt_se3: Ground truth SE(3) transformations (w2c), shape (N, 4, 4) + num_frames: Number of frames (N) + + Returns: + Rotation and translation angle errors in degrees + """ + pair_idx_i1, pair_idx_i2 = build_pair_index(num_frames) + + relative_pose_gt = gt_se3[pair_idx_i1].bmm( + closed_form_inverse_se3(gt_se3[pair_idx_i2]) + ) + relative_pose_pred = pred_se3[pair_idx_i1].bmm( + closed_form_inverse_se3(pred_se3[pair_idx_i2]) + ) + + rel_rangle_deg = rotation_angle( + relative_pose_gt[:, :3, :3], relative_pose_pred[:, :3, :3] + ) + rel_tangle_deg = translation_angle( + relative_pose_gt[:, :3, 3], relative_pose_pred[:, :3, 3] + ) + + return rel_rangle_deg, rel_tangle_deg + + +def setup_args(): + """Set up command-line arguments for the CO3D evaluation script.""" + parser = argparse.ArgumentParser(description='Test VGGT on CO3D dataset') + parser.add_argument('--debug', action='store_true', help='Enable debug mode (only test on specific category)') + parser.add_argument('--fast_eval', action='store_true', default=False, help='Only evaluate 10 sequences per category') + + parser.add_argument('--big_model', action='store_true', default=False, help='If to load the original VGGT') + parser.add_argument('--encoder', type=str, default="dinov3", help='Encoder to use in VGGTsmall') + + parser.add_argument('--min_num_images', type=int, default=10, help='Minimum number of images for a sequence') + parser.add_argument('--num_frames', type=int, default=10, help='Number of frames to use for testing') + parser.add_argument('--data_dir', type=str, required=True, help='Path to CO3D dataset') + parser.add_argument('--anno_dir', type=str, required=True, help='Path to CO3D annotations') + parser.add_argument('--seed', type=int, default=0, help='Random seed for reproducibility') + parser.add_argument('--model_path', type=str, required=True, help='Path to the VGGT model checkpoint') + return parser.parse_args() + 
def load_model(device, model_path, big_model=False, encoder="dinov3"):
    """
    Load the VGGT model and move it to `device` in eval mode.

    Args:
        device: Device to load the model on (e.g. "cuda" or "cpu").
        model_path: Path to the model checkpoint; expected to be a dict with
            the weights stored under the "model" key.
        big_model: If True, load the original full-size VGGT instead of VGGTsmall.
        encoder: Patch-embedding backbone for VGGTsmall (ignored for big_model).

    Returns:
        The loaded model in eval mode on `device`.
    """
    print("Initializing and loading VGGT model...")
    if not big_model:
        model = VGGTsmall(
            img_size=336,
            embed_dim=768,
            depth=6,
            num_heads=12,
            patch_size=16,
            patch_embed=encoder,
            enable_camera=True,
            enable_depth=True,
            enable_point=True,
            enable_track=False,
        )
    else:
        model = VGGT()
    print(f"USING {model_path}")
    # Bug fix: without map_location, a checkpoint saved on CUDA cannot be
    # loaded on a CPU-only machine. Load onto CPU first, then move to device.
    state_dict = torch.load(model_path, map_location="cpu")['model']
    model.load_state_dict(state_dict, strict=True)
    model.eval()
    model = model.to(device)
    return model

def set_random_seeds(seed):
    """
    Seed every RNG used by the evaluation (python `random`, NumPy and torch,
    including all CUDA devices) for reproducibility.

    Args:
        seed: Random seed value
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    Args:
        model: VGGT model
        seq_name: Sequence name (NOTE(review): the caller in main() passes the
            integer sequence index here, not the name — confirm intent)
        seq_data: Sequence data
        category: Category name
        data_dir: CO3D dataset directory
        min_num_images: Minimum number of images required
        num_frames: Number of frames to sample
        device: Device to run on
        dtype: Data type for model inference (float16 or bfloat16)

    Returns:
        rError: Rotation errors
        tError: Translation errors
    """
    # Skip sequences that are too short to sample from reliably.
    if len(seq_data) < min_num_images:
        return None, None

    # Keep only the fields needed for evaluation: image path + GT extrinsic.
    metadata = []
    for data in seq_data:
        metadata.append({
            "filepath": data["filepath"],
            "extri": data["extri"],
        })

    # Random sample num_frames images
    ids = np.random.choice(len(metadata), num_frames, replace=False)

    image_names = [os.path.join(data_dir, metadata[i]["filepath"]) for i in ids]
    gt_extri = [np.array(metadata[i]["extri"]) for i in ids]
    gt_extri = np.stack(gt_extri, axis=0)

    images = load_and_preprocess_images(image_names).to(device)

    with torch.no_grad():
        # Forward pass in reduced precision.
        with torch.cuda.amp.autocast(dtype=dtype):
            predictions = model(images)
        # NOTE(review): torch.cuda.amp.autocast officially supports only
        # float16/bfloat16; a float64 autocast context is likely a no-op or an
        # error on recent torch versions — confirm this runs as intended.
        with torch.cuda.amp.autocast(dtype=torch.float64):
            extrinsic, intrinsic = pose_encoding_to_extri_intri(predictions["pose_enc"], images.shape[-2:])
            pred_extrinsic = extrinsic[0]

    with torch.cuda.amp.autocast(dtype=torch.float64):
        gt_extrinsic = torch.from_numpy(gt_extri).to(device)
        # Homogeneous bottom row [0, 0, 0, 1], broadcast over frames, appended
        # to both 3x4 extrinsics so relative poses compose as full SE(3).
        add_row = torch.tensor([0, 0, 0, 1], device=device).expand(pred_extrinsic.size(0), 1, 4)

        pred_se3 = torch.cat((pred_extrinsic, add_row), dim=1)
        gt_se3 = torch.cat((gt_extrinsic, add_row), dim=1)

        rel_rangle_deg, rel_tangle_deg = se3_to_relative_pose_error(pred_se3, gt_se3, num_frames)

    # Per-sequence accuracy at the 5-degree threshold (printed for monitoring).
    Racc_5 = (rel_rangle_deg < 5).float().mean().item()
    Tacc_5 = \
(rel_tangle_deg < 5).float().mean().item() + + print(f"{category} sequence {seq_name} R_ACC@5: {Racc_5:.4f}") + print(f"{category} sequence {seq_name} T_ACC@5: {Tacc_5:.4f}") + + return rel_rangle_deg.cpu().numpy(), rel_tangle_deg.cpu().numpy() + + +def main(): + """Main function to evaluate VGGT on CO3D dataset.""" + # Parse command-line arguments + args = setup_args() + + # Setup device and data type + device = "cuda" if torch.cuda.is_available() else "cpu" + dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16 + + # Load model + model = load_model(device, model_path=args.model_path, big_model=args.big_model, encoder=args.encoder) + + # Set random seeds + set_random_seeds(args.seed) + + per_category_results = {} + + with gzip.open(args.anno_dir, "r") as fin: + annotation = json.loads(fin.read()) + + for scene_name, scene_data in annotation.items(): + category = scene_name + print(f"Loading annotation for {scene_name} test set") + + rError = [] + tError = [] + + if args.fast_eval and len(scene_data)>=10: + # scene_data = random.sample(scene_data, 10) + scene_data = scene_data[:10] + + for i, seq_data in enumerate(scene_data): + print("-" * 50) + seq_rError, seq_tError = process_sequence( + model, i, seq_data, category, args.data_dir, + args.min_num_images, args.num_frames, device, dtype, + ) + + print("-" * 50) + + if seq_rError is not None and seq_tError is not None: + rError.extend(seq_rError) + tError.extend(seq_tError) + + if not rError: + print(f"No valid sequences found for {category}, skipping") + continue + + rError = np.array(rError) + tError = np.array(tError) + + Auc_30, _ = calculate_auc_np(rError, tError, max_threshold=30) + Auc_15, _ = calculate_auc_np(rError, tError, max_threshold=15) + Auc_5, _ = calculate_auc_np(rError, tError, max_threshold=5) + Auc_3, _ = calculate_auc_np(rError, tError, max_threshold=3) + + per_category_results[category] = { + "rError": rError, + "tError": tError, 
+ "Auc_30": Auc_30, + "Auc_15": Auc_15, + "Auc_5": Auc_5, + "Auc_3": Auc_3 + } + + print("="*80) + # Print results with colors + GREEN = "\033[92m" + RED = "\033[91m" + BLUE = "\033[94m" + BOLD = "\033[1m" + RESET = "\033[0m" + + print(f"{BOLD}{BLUE}AUC of {category} test set:{RESET} {GREEN}{Auc_30:.4f} (AUC@30), {Auc_15:.4f} (AUC@15), {Auc_5:.4f} (AUC@5), {Auc_3:.4f} (AUC@3){RESET}") + mean_AUC_30_by_now = np.mean([per_category_results[category]["Auc_30"] for category in per_category_results]) + mean_AUC_15_by_now = np.mean([per_category_results[category]["Auc_15"] for category in per_category_results]) + mean_AUC_5_by_now = np.mean([per_category_results[category]["Auc_5"] for category in per_category_results]) + mean_AUC_3_by_now = np.mean([per_category_results[category]["Auc_3"] for category in per_category_results]) + print(f"{BOLD}{BLUE}Mean AUC of categories by now:{RESET} {RED}{mean_AUC_30_by_now:.4f} (AUC@30), {mean_AUC_15_by_now:.4f} (AUC@15), {mean_AUC_5_by_now:.4f} (AUC@5), {mean_AUC_3_by_now:.4f} (AUC@3){RESET}") + print("="*80) + + # Print summary results + print("\nSummary of AUC results:") + print("-"*50) + for category in sorted(per_category_results.keys()): + print(f"{category:<15}: {per_category_results[category]['Auc_30']:.4f} (AUC@30), {per_category_results[category]['Auc_15']:.4f} (AUC@15), {per_category_results[category]['Auc_5']:.4f} (AUC@5), {per_category_results[category]['Auc_3']:.4f} (AUC@3)") + + if per_category_results: + mean_AUC_30 = np.mean([per_category_results[category]["Auc_30"] for category in per_category_results]) + mean_AUC_15 = np.mean([per_category_results[category]["Auc_15"] for category in per_category_results]) + mean_AUC_5 = np.mean([per_category_results[category]["Auc_5"] for category in per_category_results]) + mean_AUC_3 = np.mean([per_category_results[category]["Auc_3"] for category in per_category_results]) + print("-"*50) + print(f"Mean AUC: {mean_AUC_30:.4f} (AUC@30), {mean_AUC_15:.4f} (AUC@15), {mean_AUC_5:.4f} 
# --- GEOMETRY ---
def estimate_pose(kpts0, kpts1, K0, K1, norm_thresh, conf=0.99999):
    """
    Estimate the relative pose (R, t) between two views from matched keypoints.

    Args:
        kpts0, kpts1: (N, 2) matched keypoint pixel coordinates.
        K0, K1: (3, 3) intrinsic matrices of the two cameras.
        norm_thresh: RANSAC inlier threshold in normalized image coordinates.
        conf: RANSAC success confidence.

    Returns:
        (R, t, inlier_mask) for the candidate with the most inliers, or None
        when fewer than 5 matches are given or no essential matrix is found.
    """
    if len(kpts0) < 5:
        return None
    K0inv = np.linalg.inv(K0[:2, :2])
    K1inv = np.linalg.inv(K1[:2, :2])

    # Move keypoints to normalized camera coordinates: K^-1 @ (x - principal_point).
    kpts0 = (K0inv @ (kpts0 - K0[None, :2, 2]).T).T
    kpts1 = (K1inv @ (kpts1 - K1[None, :2, 2]).T).T
    E, mask = cv2.findEssentialMat(
        kpts0, kpts1, np.eye(3), threshold=norm_thresh, prob=conf
    )

    ret = None
    if E is not None:
        best_num_inliers = 0

        # cv2 may return several vertically stacked 3x3 candidates. Bug fix:
        # use integer division — `len(E) / 3` is a float section count, which
        # only works through an implicit cast inside np.split.
        for _E in np.split(E, len(E) // 3):
            n, R, t, _ = cv2.recoverPose(_E, kpts0, kpts1, np.eye(3), 1e9, mask=mask)
            if n > best_num_inliers:
                best_num_inliers = n
                ret = (R, t, mask.ravel() > 0)
    return ret

def pose_auc(errors, thresholds):
    """
    Area under the recall curve of pose errors, one value per threshold.

    Args:
        errors: array of per-pair pose errors (degrees).
        thresholds: list of error thresholds (degrees).

    Returns:
        List of AUC values normalized to [0, 1], one per threshold.
    """
    # Compat fix: np.trapz was removed in NumPy 2.0 (renamed np.trapezoid);
    # resolve whichever is available.
    trapezoid = getattr(np, "trapezoid", None) or np.trapz
    sort_idx = np.argsort(errors)
    errors = np.array(errors.copy())[sort_idx]
    recall = (np.arange(len(errors)) + 1) / len(errors)
    errors = np.r_[0.0, errors]
    recall = np.r_[0.0, recall]
    aucs = []
    for t in thresholds:
        last_index = np.searchsorted(errors, t)
        r = np.r_[recall[:last_index], recall[last_index - 1]]
        e = np.r_[errors[:last_index], t]
        aucs.append(trapezoid(r, x=e).item() / t)
    return aucs

def angle_error_vec(v1, v2):
    """Angle between two vectors in degrees (nan if either vector is zero)."""
    n = np.linalg.norm(v1) * np.linalg.norm(v2)
    return np.rad2deg(np.arccos(np.clip(np.dot(v1, v2) / n, -1.0, 1.0)))

def angle_error_mat(R1, R2):
    """Geodesic angle between two rotation matrices, in degrees."""
    cos = (np.trace(np.dot(R1.T, R2)) - 1) / 2
    cos = np.clip(cos, -1.0, 1.0)  # numerical errors can make it out of bounds
    return np.rad2deg(np.abs(np.arccos(cos)))

def compute_pose_error(T_0to1, R, t):
    """
    Rotation / translation angular errors of an estimated pose vs ground truth.

    Args:
        T_0to1: (4, 4) or (3, 4) ground-truth relative transform.
        R: (3, 3) estimated rotation.
        t: (3,) or (3, 1) estimated translation (direction only is compared).

    Returns:
        (error_t, error_R) in degrees. The translation error is direction-only
        and sign-ambiguous, since E fixes t only up to sign and scale.
    """
    R_gt = T_0to1[:3, :3]
    t_gt = T_0to1[:3, 3]
    error_t = angle_error_vec(t.squeeze(), t_gt)
    error_t = np.minimum(error_t, 180 - error_t)  # ambiguity of E estimation
    error_R = angle_error_mat(R, R_gt)
    return error_t, error_R
+ w1 * (kpts1[:, 0] + 1) / 2 - offset, + h1 * (kpts1[:, 1] + 1) / 2 - offset, + ), + axis=-1, + ) + ) + kpts2 = sparse_matches[:, 2:] + kpts2 = ( + np.stack( + ( + w2 * (kpts2[:, 0] + 1) / 2 - offset, + h2 * (kpts2[:, 1] + 1) / 2 - offset, + ), + axis=-1, + ) + ) + for _ in range(5): + shuffling = np.random.permutation(np.arange(len(kpts1))) + kpts1 = kpts1[shuffling] + kpts2 = kpts2[shuffling] + try: + norm_threshold = 0.5 / ( + np.mean(np.abs(K1[:2, :2])) + np.mean(np.abs(K2[:2, :2]))) + R_est, t_est, mask = estimate_pose( + kpts1, + kpts2, + K1, + K2, + norm_threshold, + conf=0.99999, + ) + T1_to_2_est = np.concatenate((R_est, t_est), axis=-1) # + e_t, e_R = compute_pose_error(T1_to_2_est, R, t) + e_pose = max(e_t, e_R) + except Exception as e: + print(repr(e)) + e_t, e_R = 90, 90 + e_pose = max(e_t, e_R) + tot_e_t.append(e_t) + tot_e_R.append(e_R) + tot_e_pose.append(e_pose) + tot_e_t.append(e_t) + tot_e_R.append(e_R) + tot_e_pose.append(e_pose) + tot_e_pose = np.array(tot_e_pose) + thresholds = [5, 10, 20] + auc = pose_auc(tot_e_pose, thresholds) + acc_5 = (tot_e_pose < 5).mean() + acc_10 = (tot_e_pose < 10).mean() + acc_15 = (tot_e_pose < 15).mean() + acc_20 = (tot_e_pose < 20).mean() + map_5 = acc_5 + map_10 = np.mean([acc_5, acc_10]) + map_20 = np.mean([acc_5, acc_10, acc_15, acc_20]) + return { + "auc_5": auc[0], + "auc_10": auc[1], + "auc_20": auc[2], + "map_5": map_5, + "map_10": map_10, + "map_20": map_20, + } + +def load_model(device, model_path): + """ + Load the VGGT model. 
+ + Args: + device: Device to load the model on + model_path: Path to the model checkpoint + + Returns: + Loaded VGGT model + """ + print("Initializing and loading VGGT model...") + model = VGGT() + # _URL = "https://huggingface.co/facebook/VGGT-1B/resolve/main/model.pt" + # model.load_state_dict(torch.hub.load_state_dict_from_url(_URL)) + print(f"USING {model_path}") + model.load_state_dict(torch.load(model_path)) + model.eval() + model = model.to(device) + return model + +if __name__ == "__main__": + model = load_model("cuda", "../pretrained_models/model_tracker_fixed_e20.pt") + pass \ No newline at end of file diff --git a/evaluation/utils/__init__.py b/evaluation/utils/__init__.py new file mode 100644 index 00000000..66734d93 --- /dev/null +++ b/evaluation/utils/__init__.py @@ -0,0 +1,38 @@ +from vggt.models.vggt_small import VGGT as VGGTsmall +from vggt.models.vggt import VGGT +import torch + +def load_model(device, model_path, big_model=False, encoder="dinov3"): + """ + Load the VGGT model. 
# Pillow >= 9.1 moved the resampling filters under Image.Resampling; fall
# back to the legacy module-level constants for older Pillow versions.
try:
    lanczos = Image.Resampling.LANCZOS
    bicubic = Image.Resampling.BICUBIC
except AttributeError:
    lanczos = Image.LANCZOS
    bicubic = Image.BICUBIC

def resize_image(image: Image.Image, output_resolution: Tuple[int, int]) -> Image.Image:
    """Resize `image` to `output_resolution` (W, H), using Lanczos when
    shrinking and bicubic when enlarging (Lanczos is sharper for downscaling)."""
    scale_w = output_resolution[0] / image.size[0]
    scale_h = output_resolution[1] / image.size[1]
    shrinking = max(scale_w, scale_h) < 1
    return image.resize(output_resolution, resample=lanczos if shrinking else bicubic)
round(input_resolution[1] * (output_width / input_resolution[0]) / 14) * 14]) + + image = resize_image(image, tuple(output_resolution)) + + depth_map = cv2.resize( + depth_map, + output_resolution, + interpolation = cv2.INTER_NEAREST, + ) + + intrinsic = np.copy(intrinsic) + + if pixel_center: + intrinsic[0, 2] = intrinsic[0, 2] + 0.5 + intrinsic[1, 2] = intrinsic[1, 2] + 0.5 + + resize_scale = np.max(output_resolution / input_resolution) + intrinsic[:2, :] = intrinsic[:2, :] * resize_scale + + if pixel_center: + intrinsic[0, 2] = intrinsic[0, 2] - 0.5 + intrinsic[1, 2] = intrinsic[1, 2] - 0.5 + + assert image.size == depth_map.shape[::-1], f"Image size {image.size} does not match depth map shape {depth_map.shape[::-1]}" + return image, depth_map, intrinsic \ No newline at end of file diff --git a/evaluation/utils/geometry.py b/evaluation/utils/geometry.py new file mode 100644 index 00000000..acdb178a --- /dev/null +++ b/evaluation/utils/geometry.py @@ -0,0 +1,296 @@ +# From https://github.com/facebookresearch/vggt/blob/main/vggt/utils/geometry.py, https://github.com/facebookresearch/vggt/blob/main/vggt/utils/rotation.py + +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import numpy as np +import torch.nn.functional as F + + +def quat_to_mat(quaternions: torch.Tensor) -> torch.Tensor: + """ + Quaternion Order: XYZW or say ijkr, scalar-last + + Convert rotations given as quaternions to rotation matrices. + Args: + quaternions: quaternions with real part last, + as tensor of shape (..., 4). + + Returns: + Rotation matrices as tensor of shape (..., 3, 3). + """ + i, j, k, r = torch.unbind(quaternions, -1) + # pyre-fixme[58]: `/` is not supported for operand types `float` and `Tensor`. 
    # Scale factor 2/|q|^2: folds the normalization into the formula so the
    # input quaternion does not need to be unit length.
    two_s = 2.0 / (quaternions * quaternions).sum(-1)

    # Entries of the standard quaternion-to-rotation-matrix formula, row-major.
    o = torch.stack(
        (
            1 - two_s * (j * j + k * k),
            two_s * (i * j - k * r),
            two_s * (i * k + j * r),
            two_s * (i * j + k * r),
            1 - two_s * (i * i + k * k),
            two_s * (j * k - i * r),
            two_s * (i * k - j * r),
            two_s * (j * k + i * r),
            1 - two_s * (i * i + j * j),
        ),
        -1,
    )
    return o.reshape(quaternions.shape[:-1] + (3, 3))


def mat_to_quat(matrix: torch.Tensor) -> torch.Tensor:
    """
    Convert rotations given as rotation matrices to quaternions.

    Args:
        matrix: Rotation matrices as tensor of shape (..., 3, 3).

    Returns:
        quaternions with real part last, as tensor of shape (..., 4).
        Quaternion Order: XYZW or say ijkr, scalar-last
    """
    if matrix.size(-1) != 3 or matrix.size(-2) != 3:
        raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")

    batch_dim = matrix.shape[:-2]
    # Unbind the 9 matrix entries into separate (...,)-shaped tensors.
    m00, m01, m02, m10, m11, m12, m20, m21, m22 = torch.unbind(matrix.reshape(batch_dim + (9,)), dim=-1)

    # |r|, |i|, |j|, |k| (up to sign), from the four trace combinations.
    q_abs = _sqrt_positive_part(
        torch.stack(
            [
                1.0 + m00 + m11 + m22,
                1.0 + m00 - m11 - m22,
                1.0 - m00 + m11 - m22,
                1.0 - m00 - m11 + m22,
            ],
            dim=-1,
        )
    )

    # we produce the desired quaternion multiplied by each of r, i, j, k
    quat_by_rijk = torch.stack(
        [
            torch.stack([q_abs[..., 0] ** 2, m21 - m12, m02 - m20, m10 - m01], dim=-1),
            torch.stack([m21 - m12, q_abs[..., 1] ** 2, m10 + m01, m02 + m20], dim=-1),
            torch.stack([m02 - m20, m10 + m01, q_abs[..., 2] ** 2, m12 + m21], dim=-1),
            torch.stack([m10 - m01, m20 + m02, m21 + m12, q_abs[..., 3] ** 2], dim=-1),
        ],
        dim=-2,
    )

    # We floor here at 0.1 but the exact level is not important; if q_abs is small,
    # the candidate won't be picked.
    flr = torch.tensor(0.1).to(dtype=q_abs.dtype, device=q_abs.device)
    quat_candidates = quat_by_rijk / (2.0 * q_abs[..., None].max(flr))

    # if not for numerical problems, quat_candidates[i] should be same (up to a sign),
    # forall i; we pick the best-conditioned one (with the largest denominator)
    out = quat_candidates[F.one_hot(q_abs.argmax(dim=-1), num_classes=4) > 0.5, :].reshape(batch_dim + (4,))

    # Convert from rijk to ijkr
    out = out[..., [1, 2, 3, 0]]

    # Canonicalize the sign so the scalar part is non-negative.
    out = standardize_quaternion(out)

    return out


def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor:
    """
    Returns torch.sqrt(torch.max(0, x))
    but with a zero subgradient where x is 0.
    """
    ret = torch.zeros_like(x)
    positive_mask = x > 0
    # Masked assignment keeps the subgradient at exactly 0 where x == 0 when
    # autograd is recording; torch.where is sufficient in inference mode.
    if torch.is_grad_enabled():
        ret[positive_mask] = torch.sqrt(x[positive_mask])
    else:
        ret = torch.where(positive_mask, torch.sqrt(x), ret)
    return ret


def standardize_quaternion(quaternions: torch.Tensor) -> torch.Tensor:
    """
    Convert a unit quaternion to a standard form: one in which the real
    part is non negative.

    Args:
        quaternions: Quaternions with real part last,
            as tensor of shape (..., 4).

    Returns:
        Standardized quaternions as tensor of shape (..., 4).
    """
    return torch.where(quaternions[..., 3:4] < 0, -quaternions, quaternions)



def unproject_depth_map_to_point_map(
    depth_map: np.ndarray, extrinsics_cam: np.ndarray, intrinsics_cam: np.ndarray
) -> np.ndarray:
    """
    Unproject a batch of depth maps to 3D world coordinates.
def depth_to_world_coords_points(
    depth_map: np.ndarray,
    extrinsic: np.ndarray,
    intrinsic: np.ndarray,
    eps=1e-8,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Convert a depth map to world coordinates.

    Args:
        depth_map (np.ndarray): Depth map of shape (H, W).
        extrinsic (np.ndarray): Camera extrinsic matrix of shape (3, 4),
            OpenCV convention (cam from world).
        intrinsic (np.ndarray): Camera intrinsic matrix of shape (3, 3).
        eps: Depths <= eps are marked invalid in the returned mask.

    Returns:
        tuple: (world_points (H, W, 3), cam_points (H, W, 3), valid_mask (H, W)),
        or (None, None, None) when depth_map is None.
    """
    if depth_map is None:
        return None, None, None

    # Valid depth mask
    point_mask = depth_map > eps

    # Pixel grid -> camera-frame 3D points.
    cam_coords_points = depth_to_cam_coords_points(depth_map, intrinsic)

    # closed_form_inverse_se3 is batched: add/strip a leading axis to invert
    # the single world-to-camera extrinsic into a camera-to-world transform.
    cam_to_world_extrinsic = closed_form_inverse_se3(extrinsic[None])[0]

    R_cam_to_world = cam_to_world_extrinsic[:3, :3]
    t_cam_to_world = cam_to_world_extrinsic[:3, 3]

    # Rotate, then translate, every camera-frame point into the world frame.
    world_coords_points = np.dot(cam_coords_points, R_cam_to_world.T) + t_cam_to_world  # HxWx3

    return world_coords_points, cam_coords_points, point_mask


def depth_to_cam_coords_points(depth_map: np.ndarray, intrinsic: np.ndarray) -> np.ndarray:
    """
    Back-project a depth map into camera-frame 3D points (pinhole model).

    Args:
        depth_map (np.ndarray): Depth map of shape (H, W).
        intrinsic (np.ndarray): Camera intrinsic matrix of shape (3, 3),
            required to have zero skew.

    Returns:
        np.ndarray: Camera coordinates of shape (H, W, 3), float32.
        (Fix: the original annotation/docstring wrongly claimed a tuple of two
        arrays was returned; a single array is returned.)
    """
    H, W = depth_map.shape
    assert intrinsic.shape == (3, 3), "Intrinsic matrix must be 3x3"
    assert intrinsic[0, 1] == 0 and intrinsic[1, 0] == 0, "Intrinsic matrix must have zero skew"

    # Intrinsic parameters
    fu, fv = intrinsic[0, 0], intrinsic[1, 1]
    cu, cv = intrinsic[0, 2], intrinsic[1, 2]

    # Pixel grid ('xy' indexing: u varies along columns, v along rows).
    u, v = np.meshgrid(np.arange(W), np.arange(H))

    # Inverse pinhole projection.
    x_cam = (u - cu) * depth_map / fu
    y_cam = (v - cv) * depth_map / fv
    z_cam = depth_map

    # Stack to form camera coordinates
    cam_coords = np.stack((x_cam, y_cam, z_cam), axis=-1).astype(np.float32)

    return cam_coords
+ + Shapes: + se3: (N, 4, 4) + R: (N, 3, 3) + T: (N, 3, 1) + """ + # Check if se3 is a numpy array or a torch tensor + is_numpy = isinstance(se3, np.ndarray) + + # Validate shapes + if se3.shape[-2:] != (4, 4) and se3.shape[-2:] != (3, 4): + raise ValueError(f"se3 must be of shape (N,4,4), got {se3.shape}.") + + # Extract R and T if not provided + if R is None: + R = se3[:, :3, :3] # (N,3,3) + if T is None: + T = se3[:, :3, 3:] # (N,3,1) + + # Transpose R + if is_numpy: + # Compute the transpose of the rotation for NumPy + R_transposed = np.transpose(R, (0, 2, 1)) + # -R^T t for NumPy + top_right = -np.matmul(R_transposed, T) + inverted_matrix = np.tile(np.eye(4), (len(R), 1, 1)) + else: + R_transposed = R.transpose(1, 2) # (N,3,3) + top_right = -torch.bmm(R_transposed, T) # (N,3,1) + inverted_matrix = torch.eye(4, 4)[None].repeat(len(R), 1, 1) + inverted_matrix = inverted_matrix.to(R.dtype).to(R.device) + + inverted_matrix[:, :3, :3] = R_transposed + inverted_matrix[:, :3, 3:] = top_right + + return inverted_matrix \ No newline at end of file diff --git a/evaluation/utils/interfaces.py b/evaluation/utils/interfaces.py new file mode 100644 index 00000000..0c4d6b16 --- /dev/null +++ b/evaluation/utils/interfaces.py @@ -0,0 +1,145 @@ +import math +import torch +import torch.nn.functional as F +import torchvision.transforms as tvf +import time + +from typing import List, Optional, Tuple +from omegaconf import DictConfig +from PIL import Image + + +def load_images(filelist: List[str], PIXEL_LIMIT: int = 255000, new_width: Optional[int] = None, verbose: bool = False): + """ + Loads images from a directory or video, resizes them to a uniform size, + then converts and stacks them into a single [N, 3, H, W] PyTorch tensor. + """ + sources = [] + + # --- 1. 
Load image paths or video frames --- + for img_path in filelist: + try: + sources.append(Image.open(img_path).convert('RGB')) + except Exception as e: + print(f"Could not load image {img_path}: {e}") + + if not sources: + print("No images found or loaded.") + return torch.empty(0) + + if verbose: + print(f"Found {len(sources)} images/frames. Processing...") + + # --- 2. Determine a uniform target size for all images based on the first image --- + # This is necessary to ensure all tensors have the same dimensions for stacking. + first_img = sources[0] + W_orig, H_orig = first_img.size + if new_width is None: + scale = math.sqrt(PIXEL_LIMIT / (W_orig * H_orig)) if W_orig * H_orig > 0 else 1 + W_target, H_target = W_orig * scale, H_orig * scale + k, m = round(W_target / 14), round(H_target / 14) + while (k * 14) * (m * 14) > PIXEL_LIMIT: + if k / m > W_target / H_target: k -= 1 + else: m -= 1 + TARGET_W, TARGET_H = max(1, k) * 14, max(1, m) * 14 + else: + TARGET_W, TARGET_H = new_width, round(H_orig * (new_width / W_orig) / 14) * 14 + if verbose: + print(f"All images will be resized to a uniform size: ({TARGET_W}, {TARGET_H})") + + # --- 3. Resize images and convert them to tensors in the [0, 1] range --- + tensor_list = [] + # Define a transform to convert a PIL Image to a CxHxW tensor and normalize to [0,1] + to_tensor_transform = tvf.ToTensor() + + for img_pil in sources: + try: + # Resize to the uniform target size + resized_img = img_pil.resize((TARGET_W, TARGET_H), Image.Resampling.LANCZOS) + # Convert to tensor + img_tensor = to_tensor_transform(resized_img) + tensor_list.append(img_tensor) + except Exception as e: + print(f"Error processing an image: {e}") + + if not tensor_list: + print("No images were successfully processed.") + return torch.empty(0) + + # --- 4. 
Stack the list of tensors into a single [N, C, H, W] batch tensor --- + return torch.stack(tensor_list, dim=0) + + +def load_and_resize14(filelist: List[str], new_width: int, device: str, verbose: bool): + imgs = load_images(filelist, new_width=new_width, verbose=verbose).to(device) + + ori_h, ori_w = imgs.shape[-2:] + patch_h, patch_w = ori_h // 14, ori_w // 14 + # (N, 3, h, w) -> (1, N, 3, h_14, w_14) + imgs = F.interpolate(imgs, (patch_h * 14, patch_w * 14), mode="bilinear", align_corners=False, antialias=True).unsqueeze(0) + return imgs + +def infer_monodepth(file: str, model, hydra_cfg: DictConfig): + + imgs = load_and_resize14([file], new_width=hydra_cfg.load_img_size, device=hydra_cfg.device, verbose=hydra_cfg.verbose) + + dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16 + with torch.no_grad(): + with torch.amp.autocast(hydra_cfg.device, dtype=dtype): + pred = model(imgs) + + points = pred['local_points'][0] # (1, h_14, w_14, 3) + depth_map = points[0, ..., -1].detach() # (h_14, w_14) + return depth_map # torch.Tensor + + +def infer_videodepth(filelist: str, model, hydra_cfg: DictConfig): + + imgs = load_and_resize14(filelist, new_width=hydra_cfg.load_img_size, device=hydra_cfg.device, verbose=hydra_cfg.verbose) + + dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16 + + start = time.time() + with torch.no_grad(): + with torch.amp.autocast(hydra_cfg.device, dtype=dtype): + pred = model(imgs) + end = time.time() + + depth_map = pred['local_points'][0, ..., -1] # (N, h_14, w_14) + depth_conf = pred['conf'][0, ..., 0] # (N, h_14, w_14) + return end - start, depth_map, depth_conf + + + +def infer_cameras_c2w(filelist: str, model, hydra_cfg: DictConfig): + + imgs = load_and_resize14(filelist, new_width=hydra_cfg.load_img_size, device=hydra_cfg.device, verbose=hydra_cfg.verbose) + + dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16 + + with 
torch.no_grad(): + with torch.amp.autocast(hydra_cfg.device, dtype=dtype): + pred = model(imgs) + + poses_c2w_all = pred['camera_poses'].cpu() + + return poses_c2w_all[0], None + +def infer_mv_pointclouds(filelist: str, model, hydra_cfg: DictConfig, data_size: Tuple[int, int]): + + imgs = load_and_resize14(filelist, new_width=hydra_cfg.load_img_size, device=hydra_cfg.device, verbose=hydra_cfg.verbose) + + dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16 + + with torch.no_grad(): + with torch.amp.autocast(hydra_cfg.device, dtype=dtype): + pred = model(imgs) + + # global_points = pred['points'][0] # (N, h, w, 3) + global_points = pred['world_points'][0] # (N, h, w, 3) + global_points = F.interpolate( + global_points.permute(0, 3, 1, 2), data_size, + mode="bilinear", align_corners=False, antialias=True + ).permute(0, 2, 3, 1) # align to gt + + return global_points.cpu().numpy() \ No newline at end of file diff --git a/evaluation/utils/messages.py b/evaluation/utils/messages.py new file mode 100644 index 00000000..ea9997be --- /dev/null +++ b/evaluation/utils/messages.py @@ -0,0 +1,87 @@ +import os +import os.path as osp +import sys +import pandas as pd + +from typing import List + +def set_default_arg(key: str, default_value: str): + """ + check if `key` in arguments, else append default value `key=value` to argument list + """ + has_key = any(arg.startswith(f"{key}=") for arg in sys.argv) + if not has_key: + sys.argv.append(f"{key}={default_value}") + +def make_csvsdir_and_remove_history_csvs(input_root: str, seqs_csv_file: str): + """ + Make the input directory for CSV files and remove any existing history CSV files. 
+ """ + if osp.isfile(seqs_csv_file): + os.remove(seqs_csv_file) + os.makedirs(input_root, exist_ok=True) + for file in os.listdir(input_root): + if file.endswith(".csv"): + os.remove(osp.join(input_root, file)) + +def gather_csv_and_write(input_root: str, output_file: str): + """ + Gather all CSV files in the input directory, concatenate them, and write to the output file. + If the input directory contains multiple rows in a CSV file, only the last row will be saved. + If the output file already exists, it will be overwritten. + """ + seq_dfs = [] + for seq_csv_file in sorted(os.listdir(input_root)): + if seq_csv_file.endswith(".csv"): + df = pd.read_csv(osp.join(input_root, seq_csv_file)) + if len(df) > 1: + print(f"Warning: {osp.join(input_root, seq_csv_file)} has more than one row, only the last row will be saved.") + df = df.tail(1) + seq_dfs.append(df) + + if len(seq_dfs) == 0: + raise ValueError(f"No CSV files found in {input_root}. Returning an empty DataFrame.") + + df = pd.concat(seq_dfs, ignore_index=True) + if osp.isfile(output_file): + print(f"Warning: {output_file} already exists, data will be overwritten.") + df.to_csv(output_file, index=False) + return df + +def write_csv(file_path: str, data_dict: dict): + # transform data of one row to DataFrame + new_row = pd.DataFrame([data_dict]) + + # directly save when the file does not exist; else we just append + if not osp.isfile(file_path): + new_row.to_csv(file_path, index=False) + else: + existing_data = pd.read_csv(file_path) + updated_data = pd.concat([existing_data, new_row], ignore_index=True) + updated_data.to_csv(file_path, index=False) + +def format_matrix_str(matrix): + def format_float(num, total_width=20, decimal_places=12): + # strip all right 0s, then strip '.' + s = f"{num:{total_width}.{decimal_places}f}".rstrip("0").rstrip(".") + # add space to the left + return f"{s:>{total_width}}" + formatted = [ + [ + # f"{num:20.12f}".rstrip("0").rstrip(".") if "." 
in f"{num}" else f"{num:20}" + # f"{num:20.12f}" if "." in f"{num}" else f"{num:20}" + format_float(num, total_width=15, decimal_places=8) + for num in row + ] + for row in matrix + ] + rows = [ + f" [{', '.join(num for num in row)}]" + for row in formatted + ] + return " [\n" + ",\n".join(rows) + "\n ]" + +def save_list_of_matrices(matrices_tosave: List[List[List[float]]], save_path: str) -> None: + json_str = "[\n" + ",\n".join(format_matrix_str(mat) for mat in matrices_tosave) + "\n]" + with open(save_path, "w") as f: + f.write(json_str) \ No newline at end of file diff --git a/evaluation/utils/mv_recon.py b/evaluation/utils/mv_recon.py new file mode 100644 index 00000000..ab5d763e --- /dev/null +++ b/evaluation/utils/mv_recon.py @@ -0,0 +1,98 @@ +# Reference: https://github.com/CUT3R/CUT3R/blob/main/eval/mv_recon/utils.py + +import numpy as np +from scipy.spatial import cKDTree as KDTree + + +def umeyama(X, Y): + """ + Estimates the Sim(3) transformation between `X` and `Y` point sets. + + Estimates c, R and t such as c * R @ X + t ~ Y. + + Parameters + ---------- + X : numpy.array + (m, n) shaped numpy array. m is the dimension of the points, + n is the number of points in the point set. + Y : numpy.array + (m, n) shaped numpy array. Indexes should be consistent with `X`. + That is, Y[:, i] must be the point corresponding to X[:, i]. + + Returns + ------- + c : float + Scale factor. + R : numpy.array + (3, 3) shaped rotation matrix. + t : numpy.array + (3, 1) shaped translation vector. 
+ """ + mu_x = X.mean(axis=1).reshape(-1, 1) + mu_y = Y.mean(axis=1).reshape(-1, 1) + var_x = np.square(X - mu_x).sum(axis=0).mean() + cov_xy = ((Y - mu_y) @ (X - mu_x).T) / X.shape[1] + U, D, VH = np.linalg.svd(cov_xy) + S = np.eye(X.shape[0]) + if np.linalg.det(U) * np.linalg.det(VH) < 0: + S[-1, -1] = -1 + c = np.trace(np.diag(D) @ S) / var_x + R = U @ S @ VH + t = mu_y - c * R @ mu_x + return c, R, t + + +def completion_ratio(gt_points, rec_points, dist_th=0.05): + gen_points_kd_tree = KDTree(rec_points) + distances, _ = gen_points_kd_tree.query(gt_points) + comp_ratio = np.mean((distances < dist_th).astype(np.float32)) + return comp_ratio + + +def accuracy(gt_points, rec_points, gt_normals=None, rec_normals=None): + gt_points_kd_tree = KDTree(gt_points) + distances, idx = gt_points_kd_tree.query(rec_points, workers=-1) + acc = np.mean(distances) + + acc_median = np.median(distances) + + if gt_normals is not None and rec_normals is not None: + normal_dot = np.sum(gt_normals[idx] * rec_normals, axis=-1) + normal_dot = np.abs(normal_dot) + + return acc, acc_median, np.mean(normal_dot), np.median(normal_dot) + + return acc, acc_median + + +def completion(gt_points, rec_points, gt_normals=None, rec_normals=None): + gt_points_kd_tree = KDTree(rec_points) + distances, idx = gt_points_kd_tree.query(gt_points, workers=-1) + comp = np.mean(distances) + comp_median = np.median(distances) + + if gt_normals is not None and rec_normals is not None: + normal_dot = np.sum(gt_normals * rec_normals[idx], axis=-1) + normal_dot = np.abs(normal_dot) + + return comp, comp_median, np.mean(normal_dot), np.median(normal_dot) + + return comp, comp_median + + +def compute_iou(pred_vox, target_vox): + # Get voxel indices + v_pred_indices = [voxel.grid_index for voxel in pred_vox.get_voxels()] + v_target_indices = [voxel.grid_index for voxel in target_vox.get_voxels()] + + # Convert to sets for set operations + v_pred_filled = set(tuple(np.round(x, 4)) for x in v_pred_indices) + 
v_target_filled = set(tuple(np.round(x, 4)) for x in v_target_indices) + + # Compute intersection and union + intersection = v_pred_filled & v_target_filled + union = v_pred_filled | v_target_filled + + # Compute IoU + iou = len(intersection) / len(union) + return iou \ No newline at end of file diff --git a/evaluation/utils/vis_utils.py b/evaluation/utils/vis_utils.py new file mode 100644 index 00000000..14cfc295 --- /dev/null +++ b/evaluation/utils/vis_utils.py @@ -0,0 +1,41 @@ +import math +import numpy as np +import torch + +from typing import Union +from PIL import Image + + +def save_image_grid(images: np.ndarray, grid_shape: tuple, save_path: str): + """ + images: numpy array of shape (N, H, W, 3) + grid_shape: (rows, cols) + """ + H, W = images.shape[1], images.shape[2] + grid = np.zeros((grid_shape[0]*H, grid_shape[1]*W, 3), dtype=np.uint8) + + for i in range(min(len(images), grid_shape[0]*grid_shape[1])): + row = i // grid_shape[1] + col = i % grid_shape[1] + grid[row*H:(row+1)*H, col*W:(col+1)*W] = images[i] + + Image.fromarray(grid).save(save_path) + + +def save_image_grid_auto(images: Union[np.ndarray, torch.Tensor], save_path: str): + """ + images: np.ndarray of shape (N, H, W, 3) in [0, 255] or torch.Tensor of shape (N, 3, H, W) in range [0, 1] + """ + if isinstance(images, torch.Tensor): + assert images.ndim == 4 and (images.shape[1] == 3 or images.shape[-1] == 3), f"images must be a 4D torch tensor with shape (N, 3, H, W) or (N, H, W, 3)" + if images.shape[1] == 3: + images = images.permute(0, 2, 3, 1) + images = (images.detach().cpu().numpy() * 255).astype(np.uint8) + elif isinstance(images, np.ndarray): + assert images.ndim == 4 and images.shape[3] == 3, f"images must be a 4D numpy array with shape (N, H, W, 3)" + else: + raise ValueError(f"images must be a numpy array or a torch tensor, but got {type(images)}") + + rows = math.floor(math.sqrt(len(images))) + cols = math.ceil(len(images) / rows) + save_image_grid(images, (rows, cols), save_path) 
\ No newline at end of file diff --git a/pretrained_models b/pretrained_models new file mode 120000 index 00000000..fb5bc8d0 --- /dev/null +++ b/pretrained_models @@ -0,0 +1 @@ +../mv-ssl/pretrained_models \ No newline at end of file diff --git a/run_eval.sh b/run_eval.sh new file mode 100644 index 00000000..d632e1a3 --- /dev/null +++ b/run_eval.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash +#SBATCH -A NAISS2024-5-609 +#SBATCH -o /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/evaluation/slurm_outs/%x_%j.out +#SBATCH -t 0-02:00:00 +#SBATCH --gpus-per-node=A100:1 +#SBATCH --nodes 1 + +python evaluation/dtu.py \ No newline at end of file diff --git a/test.sh b/test.sh new file mode 100644 index 00000000..a0a4ed36 --- /dev/null +++ b/test.sh @@ -0,0 +1,6 @@ +find ./data/re10k -type d -name images | while read imgdir; do + count=$(find "$imgdir" -maxdepth 1 -type f | wc -l) + if [ "$count" -gt 10 ]; then + basename "$(dirname "$imgdir")" + fi +done > folders_with_many_images.txt \ No newline at end of file diff --git a/todo.txt b/todo.txt new file mode 100644 index 00000000..8a391903 --- /dev/null +++ b/todo.txt @@ -0,0 +1,18 @@ +Klara (genererar snygga ply plots): +* MegaDepth +* VKITTI +* BlendedMVS +* PointOdyssey +* ScanNet +* Hypersim + + +Datsets +- [x] MegaDepth +- [x] ScanNet +- [x] VKITTI +- [ ] CO3D (downloading) +- [ ] BlendedMVS (downloading) +- [ ] WildRGB +- [ ] PointOdyssey +- [ ] MVS-Synth \ No newline at end of file diff --git a/training/config/crocov2.yaml b/training/config/crocov2.yaml new file mode 100644 index 00000000..f1b76a08 --- /dev/null +++ b/training/config/crocov2.yaml @@ -0,0 +1,6 @@ +defaults: + - default + - _self_ +exp_name: crocov2_exp002 +model: + patch_embed: crocov2 \ No newline at end of file diff --git a/training/config/default.yaml b/training/config/default.yaml index 8bec7a73..5fdbd1b0 100644 --- a/training/config/default.yaml +++ b/training/config/default.yaml @@ -2,17 +2,17 @@ defaults: - default_dataset.yaml exp_name: exp001 
-img_size: 518 +log_wandb: true +img_size: 512 num_workers: 8 seed_value: 42 -accum_steps: 2 # We did not use gradient accumulation in our training, while if you suffer from OOM, you can try to use it. -patch_size: 14 -val_epoch_freq: 5 +accum_steps: 1 # We did not use gradient accumulation in our training, while if you suffer from OOM, you can try to use it. +patch_size: 16 +val_epoch_freq: 10 max_img_per_gpu: 48 -limit_train_batches: 800 -limit_val_batches: 400 - +limit_train_batches: 1000 +limit_val_batches: 100 data: # The code for data still looks too complicated. I should refactor this again (do I have time?...) @@ -28,10 +28,51 @@ data: dataset: _target_: data.composed_dataset.ComposedDataset dataset_configs: - - _target_: data.datasets.co3d.Co3dDataset + - _target_: data.datasets.megadepth.MegadepthDataset + split: train + MEGADEPTH_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/data/megadepth + MEGADEPTH_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + len_train: 100000 + - _target_: data.datasets.scannet.ScanNetDataset + split: train + SCANNET_DIR: /mimer/NOBACKUP/groups/3d-dl/scannet/scans/scans_train + SCANNET_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + len_train: 100000 + - _target_: data.datasets.vkitti.VKittiDataset + split: train + VKitti_DIR: /mimer/NOBACKUP/groups/3d-dl/vkitti + len_train: 100000 + expand_ratio: 8 + - _target_: data.datasets.mvssynth.MVSSynthDataset + split: train + MVSSYNTH_DIR: /mimer/NOBACKUP/groups/3d-dl/MVS-Synth/GTAV_540 + MVSSYNTH_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + len_train: 100000 + - _target_: data.datasets.blendedmvs.BlendedMVSDataset split: train - CO3D_DIR: /YOUR/PATH/TO/CO3D - CO3D_ANNOTATION_DIR: /YOUR/PATH/TO/CO3D_ANNOTATION + BLENDEDMVS_DIR: /mimer/NOBACKUP/groups/3d-dl/blendedmvs_full + BLENDEDMVS_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + len_train: 100000 + - 
_target_: data.datasets.pointodyssey.PointOdysseyDataset + split: train + POINTODYSSEY_DIR: /mimer/NOBACKUP/groups/3d-dl/pointodyssey + POINTODYSSEY_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + len_train: 100000 + - _target_: data.datasets.hypersim.HypersimDataset + split: train + HYPERSIM_DIR: /mimer/NOBACKUP/groups/3d-dl/ml-hypersim/contrib/99991/downloads + HYPERSIM_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + len_train: 100000 + - _target_: data.datasets.wildrgbd.WildrgbdDataset + split: train + WILDRGBD_DIR: /mimer/NOBACKUP/groups/3d-dl/wildrgbd + WILDRGBD_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + len_train: 100000 + - _target_: data.datasets.co3dv2.Co3dDataset + split: train + CO3D_DIR: /mimer/NOBACKUP/groups/3d-dl/co3dv2 + CO3D_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations/co3d + len_train: 100000 val: _target_: data.dynamic_dataloader.DynamicTorchDataset num_workers: ${num_workers} @@ -43,16 +84,15 @@ data: dataset: _target_: data.composed_dataset.ComposedDataset dataset_configs: - - _target_: data.datasets.co3d.Co3dDataset + - _target_: data.datasets.megadepth.MegadepthDataset split: test - CO3D_DIR: /YOUR/PATH/TO/CO3D - CO3D_ANNOTATION_DIR: /YOUR/PATH/TO/CO3D_ANNOTATION - + MEGADEPTH_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/data/megadepth + MEGADEPTH_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations logging: log_dir: logs log_visuals: False - log_freq: 1 + log_freq: 5 log_level_primary: DEBUG log_level_secondary: WARNING all_ranks: False @@ -85,8 +125,8 @@ logging: checkpoint: save_dir: logs/${exp_name}/ckpts - save_freq: 5 - resume_checkpoint_path: /YOUR/PATH/TO/CKPT + save_freq: 20 + resume_checkpoint_path: # /YOUR/PATH/TO/CKPT strict: False @@ -99,27 +139,24 @@ loss: weight: 1.0 gradient_loss_fn: "grad" valid_range: 0.98 - point: null + # point: null # If you want to enable 
point, use the following config - # point: - # weight: 1.0 - # gradient_loss_fn: "normal" - # valid_range: 0.98 + point: + weight: 1.0 + gradient_loss_fn: "normal" + valid_range: 0.98 track: null - - - optim: param_group_modifiers: False optimizer: _target_: torch.optim.AdamW - lr: 5e-5 + lr: 2e-4 # 5e-5 weight_decay: 0.05 frozen_module_names: - - "*aggregator*" # example, freeze the aggregator + # - "*aggregator*" # example, freeze the aggregator amp: enabled: True @@ -136,6 +173,9 @@ optim: - module_name: ["camera"] max_norm: 1.0 # feel free to reduce this if you see instabilities norm_type: 2 + - module_name: ["point"] + max_norm: 1.0 # feel free to reduce this if you see instabilities + norm_type: 2 options: lr: - scheduler: @@ -143,9 +183,9 @@ optim: schedulers: - _target_: fvcore.common.param_scheduler.LinearParamScheduler start_value: 1e-8 - end_value: 5e-5 + end_value: ${optim.optimizer.lr} - _target_: fvcore.common.param_scheduler.CosineParamScheduler - start_value: 5e-5 + start_value: ${optim.optimizer.lr} end_value: 1e-8 lengths: [0.05, 0.95] interval_scaling: ['rescaled', 'rescaled'] @@ -154,18 +194,31 @@ optim: _target_: fvcore.common.param_scheduler.ConstantParamScheduler value: 0.05 +max_epochs: 100 +# Base: +# embed_dim=768 +# depth=12 +# num_heads=12 - -max_epochs: 20 +# Large: +# embed_dim=1024 +# depth=24 +# num_heads=16 model: - _target_: vggt.models.vggt.VGGT + _target_: vggt.models.vggt_small.VGGT + img_size: ${img_size} + embed_dim: 1024 + depth: 6 + num_heads: 16 + enable_camera: True enable_depth: True - enable_point: False + enable_point: True enable_track: False - + patch_size: ${patch_size} + patch_embed: dinov3 # crocov2 # mum # dinov3 distributed: # check https://docs.pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html for options diff --git a/training/config/default_old.yaml b/training/config/default_old.yaml new file mode 100644 index 00000000..606295ff --- /dev/null +++ b/training/config/default_old.yaml 
@@ -0,0 +1,237 @@ +defaults: + - default_dataset.yaml + +exp_name: exp001 +log_wandb: true +img_size: 336 # 400 # 480 # 512 +num_workers: 8 +seed_value: 42 +accum_steps: 1 # We did not use gradient accumulation in our training, while if you suffer from OOM, you can try to use it. +patch_size: 16 +val_epoch_freq: 10 +max_img_per_gpu: 48 + +limit_train_batches: 1000 +limit_val_batches: 100 + +data: + # The code for data still looks too complicated. I should refactor this again (do I have time?...) + train: + _target_: data.dynamic_dataloader.DynamicTorchDataset + num_workers: ${num_workers} + max_img_per_gpu: ${max_img_per_gpu} + common_config: + img_size: ${img_size} + patch_size: ${patch_size} + debug: False + repeat_batch: False + dataset: + _target_: data.composed_dataset.ComposedDataset + dataset_configs: + - _target_: data.datasets.megadepth.MegadepthDataset + split: train + MEGADEPTH_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/data/megadepth + MEGADEPTH_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + len_train: 100000 + - _target_: data.datasets.scannet.ScanNetDataset + split: train + SCANNET_DIR: /mimer/NOBACKUP/groups/3d-dl/scannet/scans/scans_train + SCANNET_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + len_train: 100000 + - _target_: data.datasets.vkitti.VKittiDataset + split: train + VKitti_DIR: /mimer/NOBACKUP/groups/3d-dl/vkitti + len_train: 100000 + expand_ratio: 8 + - _target_: data.datasets.mvssynth.MVSSynthDataset + split: train + MVSSYNTH_DIR: /mimer/NOBACKUP/groups/3d-dl/MVS-Synth/GTAV_540 + MVSSYNTH_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + len_train: 100000 + - _target_: data.datasets.blendedmvs.BlendedMVSDataset + split: train + BLENDEDMVS_DIR: /mimer/NOBACKUP/groups/3d-dl/blendedmvs_full + BLENDEDMVS_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + len_train: 100000 + - _target_: 
data.datasets.pointodyssey.PointOdysseyDataset + split: train + POINTODYSSEY_DIR: /mimer/NOBACKUP/groups/3d-dl/pointodyssey + POINTODYSSEY_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + len_train: 100000 + - _target_: data.datasets.hypersim.HypersimDataset + split: train + HYPERSIM_DIR: /mimer/NOBACKUP/groups/3d-dl/ml-hypersim/contrib/99991/downloads + HYPERSIM_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + len_train: 100000 + - _target_: data.datasets.wildrgbd.WildrgbdDataset + split: train + WILDRGBD_DIR: /mimer/NOBACKUP/groups/3d-dl/wildrgbd + WILDRGBD_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + len_train: 100000 + + - _target_: data.datasets.co3dv2.Co3dDataset + split: train + CO3D_DIR: /mimer/NOBACKUP/groups/3d-dl/co3dv2 + CO3D_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations/co3d + len_train: 100000 + val: + _target_: data.dynamic_dataloader.DynamicTorchDataset + num_workers: ${num_workers} + max_img_per_gpu: ${max_img_per_gpu} + common_config: + img_size: ${img_size} + patch_size: ${patch_size} + debug: False + dataset: + _target_: data.composed_dataset.ComposedDataset + dataset_configs: + - _target_: data.datasets.megadepth.MegadepthDataset + split: test + MEGADEPTH_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/data/megadepth + MEGADEPTH_ANNOTATION_DIR: /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations + +logging: + log_dir: logs + log_visuals: False + log_freq: 5 + log_level_primary: DEBUG + log_level_secondary: WARNING + all_ranks: False + tensorboard_writer: + _target_: train_utils.tb_writer.TensorBoardLogger + path: ${logging.log_dir}/tensorboard + scalar_keys_to_log: + train: + keys_to_log: + - loss_objective + - loss_camera + - loss_T + - loss_R + - loss_FL + - loss_conf_depth + - loss_reg_depth + - loss_grad_depth + val: + keys_to_log: + - loss_objective + - loss_camera + - loss_T + - loss_R + - loss_FL 
+ - loss_conf_depth + - loss_reg_depth + - loss_grad_depth + + + +checkpoint: + save_dir: logs/${exp_name}/ckpts + save_freq: 20 + resume_checkpoint_path: # /YOUR/PATH/TO/CKPT + strict: False + + +loss: + _target_: loss.MultitaskLoss + camera: + weight: 5.0 + loss_type: "l1" # The paper uses smooth l1 loss, but we found l1 loss is more stable than smooth l1 and l2 loss. + depth: + weight: 1.0 + gradient_loss_fn: "grad" + valid_range: 0.98 + # point: null + # If you want to enable point, use the following config + point: + weight: 1.0 + gradient_loss_fn: "normal" + valid_range: 0.98 + track: null + +optim: + param_group_modifiers: False + + optimizer: + _target_: torch.optim.AdamW + lr: 1e-4 # 5e-5 + weight_decay: 0.05 + + frozen_module_names: + # - "*aggregator*" # example, freeze the aggregator + + amp: + enabled: True + amp_dtype: bfloat16 + gradient_clip: + _target_: train_utils.gradient_clip.GradientClipper + configs: + - module_name: ["aggregator"] + max_norm: 1.0 # feel free to reduce this if you see instabilities + norm_type: 2 + - module_name: ["depth"] + max_norm: 1.0 # feel free to reduce this if you see instabilities + norm_type: 2 + - module_name: ["camera"] + max_norm: 1.0 # feel free to reduce this if you see instabilities + norm_type: 2 + - module_name: ["point"] + max_norm: 1.0 # feel free to reduce this if you see instabilities + norm_type: 2 + options: + lr: + - scheduler: + _target_: fvcore.common.param_scheduler.CompositeParamScheduler + schedulers: + - _target_: fvcore.common.param_scheduler.LinearParamScheduler + start_value: 1e-8 + end_value: ${optim.optimizer.lr} + - _target_: fvcore.common.param_scheduler.CosineParamScheduler + start_value: ${optim.optimizer.lr} + end_value: 1e-8 + lengths: [0.05, 0.95] + interval_scaling: ['rescaled', 'rescaled'] + weight_decay: + - scheduler: + _target_: fvcore.common.param_scheduler.ConstantParamScheduler + value: 0.05 + +max_epochs: 100 + +# Base: +# embed_dim=768 +# depth=12 +# num_heads=12 + +# Large: 
+# embed_dim=1024 +# depth=24 +# num_heads=16 + +model: + _target_: vggt.models.vggt_small.VGGT + img_size: ${img_size} + embed_dim: 768 + depth: 6 + num_heads: 12 + + enable_camera: True + enable_depth: True + enable_point: True + enable_track: False + patch_size: ${patch_size} + patch_embed: dinov3 # crocov2 # mum # dinov3 + +distributed: + # check https://docs.pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html for options + backend: nccl + comms_dtype: None + find_unused_parameters: False + timeout_mins: 30 + gradient_as_bucket_view: True # Less memory used + bucket_cap_mb: 25 + broadcast_buffers: True + +cuda: + cudnn_deterministic: False + cudnn_benchmark: False + allow_tf32: True diff --git a/training/config/dinov3.yaml b/training/config/dinov3.yaml new file mode 100644 index 00000000..e57075dc --- /dev/null +++ b/training/config/dinov3.yaml @@ -0,0 +1,6 @@ +defaults: + - default + - _self_ +exp_name: dinov3_exp005 +model: + patch_embed: dinov3 \ No newline at end of file diff --git a/training/config/mum.yaml b/training/config/mum.yaml new file mode 100644 index 00000000..990b7037 --- /dev/null +++ b/training/config/mum.yaml @@ -0,0 +1,6 @@ +defaults: + - default + - _self_ +exp_name: mum_exp005 +model: + patch_embed: mum \ No newline at end of file diff --git a/training/data/dataset_util.py b/training/data/dataset_util.py index 542af78f..e1935da9 100644 --- a/training/data/dataset_util.py +++ b/training/data/dataset_util.py @@ -708,4 +708,4 @@ def load_16big_png_depth(depth_png: str) -> np.ndarray: .astype(np.float32) .reshape((depth_pil.size[1], depth_pil.size[0])) ) - return depth + return depth \ No newline at end of file diff --git a/training/data/datasets/blendedmvs.py b/training/data/datasets/blendedmvs.py new file mode 100644 index 00000000..89d045e4 --- /dev/null +++ b/training/data/datasets/blendedmvs.py @@ -0,0 +1,264 @@ +import re +import gzip +import json +import os.path as osp +import os +import logging + +import 
cv2 +import random +import numpy as np +import h5py + +from data.dataset_util import * +from data.base_dataset import BaseDataset + +import numpy as np +import torch +import cv2 + +def read_pfm(filename): + file = open(filename, 'rb') + color = None + width = None + height = None + scale = None + endian = None + + header = file.readline().decode('utf-8').rstrip() + if header == 'PF': + color = True + elif header == 'Pf': + color = False + else: + raise Exception('Not a PFM file.') + + dim_match = re.match(r'^(\d+)\s(\d+)\s$', file.readline().decode('utf-8')) + if dim_match: + width, height = map(int, dim_match.groups()) + else: + raise Exception('Malformed PFM header.') + + scale = float(file.readline().rstrip()) + if scale < 0: # little-endian + endian = '<' + scale = -scale + else: + endian = '>' # big-endian + + data = np.fromfile(file, endian + 'f') + shape = (height, width, 3) if color else (height, width) + + data = np.reshape(data, shape) + data = np.flipud(data) + file.close() + return data, scale + +class BlendedMVSDataset(BaseDataset): + def __init__( + self, + common_conf, + split: str = "train", + BLENDEDMVS_DIR: str = None, + BLENDEDMVS_ANNOTATION_DIR: str = None, + min_num_images: int = 24, + len_train: int = 100000, + len_test: int = 10000, + ): + """ + Initialize the BlendedMVSDataset. + + Args: + common_conf: Configuration object with common settings. + split (str): Dataset split, either 'train' or 'test'. + BLENDEDMVS_DIR (str): Directory path to BlendedMVS data. + BLENDEDMVS_ANNOTATION_DIR (str): Directory path to BlendedMVS annotations. + min_num_images (int): Minimum number of images per sequence. + len_train (int): Length of the training dataset. + len_test (int): Length of the test dataset. + Raises: + ValueError: If BLENDEDMVS_DIR or BLENDEDMVS_ANNOTATION_DIR is not specified. 
+ """ + super().__init__(common_conf=common_conf) + + self.debug = common_conf.debug + self.training = common_conf.training + self.get_nearby = common_conf.get_nearby + self.load_depth = common_conf.load_depth + self.inside_random = common_conf.inside_random + self.allow_duplicate_img = common_conf.allow_duplicate_img + + if BLENDEDMVS_DIR is None or BLENDEDMVS_ANNOTATION_DIR is None: + raise ValueError("Both BLENDEDMVS_DIR and BLENDEDMVS_ANNOTATION_DIR must be specified.") + + if split == "train": + split_name = "train.jgz" + self.len_train = len_train + elif split == "test": + split_name = "test.jgz" + self.len_train = len_test + else: + raise ValueError(f"Invalid split: {split}") + + self.invalid_sequence = [] # set any invalid sequence names here + + + self.category_map = {} + self.data_store = {} + self.seqlen = None + self.min_num_images = min_num_images + + logging.info(f"BLENDEDMVS_DIR is {BLENDEDMVS_DIR}") + + self.BLENDEDMVS_DIR = BLENDEDMVS_DIR + self.BLENDEDMVS_ANNOTATION_DIR = BLENDEDMVS_ANNOTATION_DIR + + annotation_file = osp.join( + self.BLENDEDMVS_ANNOTATION_DIR, "blendedmvs", split_name + ) + + try: + with gzip.open(annotation_file, "r") as fin: + annotation = json.loads(fin.read()) + except FileNotFoundError: + logging.error(f"Annotation file not found: {annotation_file}") + total_frame_num = 0 + + for seq_name, seq_data in annotation.items(): + if seq_name in self.invalid_sequence: + continue + + if len(seq_data) < min_num_images: + continue + total_frame_num += len(seq_data) + self.data_store[seq_name] = seq_data + self.sequence_list = list(self.data_store.keys()) + self.sequence_list_len = len(self.sequence_list) + self.total_frame_num = total_frame_num + + status = "Training" if self.training else "Testing" + logging.info(f"{status}: BlendedMVS Data size: {self.sequence_list_len}") + logging.info(f"{status}: BlendedMVS Data dataset length: {len(self)}") + + def get_data( + self, + seq_index: int = None, + img_per_seq: int = None, + seq_name: 
str = None, + ids: list = None, + aspect_ratio: float = 1.0, + ) -> dict: + """ + Retrieve data for a specific sequence. + + Args: + seq_index (int): Index of the sequence to retrieve. + img_per_seq (int): Number of images per sequence. + seq_name (str): Name of the sequence. + ids (list): Specific IDs to retrieve. + aspect_ratio (float): Aspect ratio for image processing. + + Returns: + dict: A batch of data including images, depths, and other metadata. + """ + if self.inside_random: + seq_index = random.randint(0, self.sequence_list_len - 1) + + if seq_name is None: + seq_name = self.sequence_list[seq_index] + + metadata = self.data_store[seq_name] + + if ids is None: + ids = np.random.choice( + len(metadata), img_per_seq, replace=self.allow_duplicate_img + ) + + annos = [metadata[i] for i in ids] + + target_image_shape = self.get_target_shape(aspect_ratio) + + images = [] + depths = [] + cam_points = [] + world_points = [] + point_masks = [] + extrinsics = [] + intrinsics = [] + image_paths = [] + original_sizes = [] + + for anno in annos: + filepath = anno["filepath"] + + image_path = osp.join(self.BLENDEDMVS_DIR, filepath) + image = read_image_cv2(image_path) + + if self.load_depth: + depth_path = osp.join(self.BLENDEDMVS_DIR, anno["depthpath"]) + + depth_map, _ = read_pfm(depth_path) + depth_map = threshold_depth_map(depth_map, max_percentile=98, min_percentile=-1) + # depth_path = image_path.replace("/images", "/depths") + ".geometric.png" + + # mvs_mask_path = image_path.replace( + # "/images", "/depth_masks" + # ).replace(".jpg", ".png") + # mvs_mask = cv2.imread(mvs_mask_path, cv2.IMREAD_GRAYSCALE) > 128 + # depth_map[~mvs_mask] = 0 + + # depth_map = threshold_depth_map( + # depth_map, min_percentile=-1, max_percentile=98 + # ) + else: + depth_map = None + + original_size = np.array(image.shape[:2]) + extri_opencv = np.array(anno["extri"]) + intri_opencv = np.array(anno["intri"]) + + ( + image, + depth_map, + extri_opencv, + intri_opencv, + 
world_coords_points, + cam_coords_points, + point_mask, + _, + ) = self.process_one_image( + image, + depth_map, + extri_opencv, + intri_opencv, + original_size, + target_image_shape, + filepath=filepath, + ) + + images.append(image) + depths.append(depth_map) + extrinsics.append(extri_opencv) + intrinsics.append(intri_opencv) + cam_points.append(cam_coords_points) + world_points.append(world_coords_points) + point_masks.append(point_mask) + image_paths.append(image_path) + original_sizes.append(original_size) + + set_name = "BlendedMVS" + + batch = { + "seq_name": set_name + "_" + seq_name, + "ids": ids, + "frame_num": len(extrinsics), + "images": images, + "depths": depths, + "extrinsics": extrinsics, + "intrinsics": intrinsics, + "cam_points": cam_points, + "world_points": world_points, + "point_masks": point_masks, + "original_sizes": original_sizes, + } + return batch diff --git a/training/data/datasets/co3d.py b/training/data/datasets/co3d.py index 5636626d..69662867 100644 --- a/training/data/datasets/co3d.py +++ b/training/data/datasets/co3d.py @@ -1,8 +1,3 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. import gzip import json @@ -13,58 +8,28 @@ import cv2 import random import numpy as np - +import h5py from data.dataset_util import * from data.base_dataset import BaseDataset +import numpy as np +import torch +import cv2 + + +def _load_16big_png_depth(depth_png): + with Image.open(depth_png) as depth_pil: + # the image is stored with 16-bit depth but PIL reads it as I (32 bit). 
+ # we cast it to uint16, then reinterpret as float16, then cast to float32 + depth = ( + np.frombuffer(np.array(depth_pil, dtype=np.uint16), dtype=np.float16) + .astype(np.float32) + .reshape((depth_pil.size[1], depth_pil.size[0])) + ) + return depth -SEEN_CATEGORIES = [ - "apple", - "backpack", - "banana", - "baseballbat", - "baseballglove", - "bench", - "bicycle", - "bottle", - "bowl", - "broccoli", - "cake", - "car", - "carrot", - "cellphone", - "chair", - "cup", - "donut", - "hairdryer", - "handbag", - "hydrant", - "keyboard", - "laptop", - "microwave", - "motorcycle", - "mouse", - "orange", - "parkingmeter", - "pizza", - "plant", - "stopsign", - "teddybear", - "toaster", - "toilet", - "toybus", - "toyplane", - "toytrain", - "toytruck", - "tv", - "umbrella", - "vase", - "wineglass", -] - - -class Co3dDataset(BaseDataset): +class CO3DDataset(BaseDataset): def __init__( self, common_conf, @@ -76,7 +41,7 @@ def __init__( len_test: int = 10000, ): """ - Initialize the Co3dDataset. + Initialize the CO3DDataset. Args: common_conf: Configuration object with common settings. 
@@ -101,16 +66,11 @@ def __init__( if CO3D_DIR is None or CO3D_ANNOTATION_DIR is None: raise ValueError("Both CO3D_DIR and CO3D_ANNOTATION_DIR must be specified.") - category = sorted(SEEN_CATEGORIES) - - if self.debug: - category = ["apple"] - if split == "train": - split_name_list = ["train"] + split_name = "train.jgz" self.len_train = len_train elif split == "test": - split_name_list = ["test"] + split_name = "test.jgz" self.len_train = len_test else: raise ValueError(f"Invalid split: {split}") @@ -128,36 +88,32 @@ def __init__( self.CO3D_DIR = CO3D_DIR self.CO3D_ANNOTATION_DIR = CO3D_ANNOTATION_DIR - total_frame_num = 0 + annotation_file = osp.join( + self.CO3D_ANNOTATION_DIR, "co3d", split_name + ) - for c in category: - for split_name in split_name_list: - annotation_file = osp.join( - self.CO3D_ANNOTATION_DIR, f"{c}_{split_name}.jgz" - ) + try: + with gzip.open(annotation_file, "r") as fin: + annotation = json.loads(fin.read()) + except FileNotFoundError: + logging.error(f"Annotation file not found: {annotation_file}") + total_frame_num = 0 - try: - with gzip.open(annotation_file, "r") as fin: - annotation = json.loads(fin.read()) - except FileNotFoundError: - logging.error(f"Annotation file not found: {annotation_file}") - continue - - for seq_name, seq_data in annotation.items(): - if len(seq_data) < min_num_images: - continue - if seq_name in self.invalid_sequence: - continue - total_frame_num += len(seq_data) - self.data_store[seq_name] = seq_data + for seq_name, seq_data in annotation.items(): + if seq_name in self.invalid_sequence: + continue + if len(seq_data) < min_num_images: + continue + total_frame_num += len(seq_data) + self.data_store[seq_name] = seq_data self.sequence_list = list(self.data_store.keys()) self.sequence_list_len = len(self.sequence_list) self.total_frame_num = total_frame_num status = "Training" if self.training else "Testing" - logging.info(f"{status}: Co3D Data size: {self.sequence_list_len}") - logging.info(f"{status}: Co3D 
Data dataset length: {len(self)}") + logging.info(f"{status}: CO3D Data size: {self.sequence_list_len}") + logging.info(f"{status}: CO3D Data dataset length: {len(self)}") def get_data( self, @@ -216,12 +172,12 @@ def get_data( if self.load_depth: depth_path = image_path.replace("/images", "/depths") + ".geometric.png" depth_map = read_depth(depth_path, 1.0) - - mvs_mask_path = image_path.replace( - "/images", "/depth_masks" - ).replace(".jpg", ".png") - mvs_mask = cv2.imread(mvs_mask_path, cv2.IMREAD_GRAYSCALE) > 128 - depth_map[~mvs_mask] = 0 + + # mvs_mask_path = image_path.replace( + # "/images", "/depth_masks" + # ).replace(".jpg", ".png") + # mvs_mask = cv2.imread(mvs_mask_path, cv2.IMREAD_GRAYSCALE) > 128 + # depth_map[~mvs_mask] = 0 depth_map = threshold_depth_map( depth_map, min_percentile=-1, max_percentile=98 @@ -262,7 +218,7 @@ def get_data( image_paths.append(image_path) original_sizes.append(original_size) - set_name = "co3d" + set_name = "CO3D" batch = { "seq_name": set_name + "_" + seq_name, diff --git a/training/data/datasets/co3dv2.py b/training/data/datasets/co3dv2.py new file mode 100644 index 00000000..6b6104ac --- /dev/null +++ b/training/data/datasets/co3dv2.py @@ -0,0 +1,277 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
# ---- b/training/data/datasets/co3dv2.py (new file) ----

import gzip
import json
import os.path as osp
import os
import logging

import cv2
import random
import numpy as np


from data.dataset_util import *
from data.base_dataset import BaseDataset

# Candidate categories considered during curation (kept for reference):
# TV, donut, frisbee, toybus, bowl, book, car, toaster, hydrant, keyboard,
# parkingmeter, hotdog, handbag, motorcycle, pizza, teddybear, remote,
# backpack, cellphone, bench, stopsign


SEEN_CATEGORIES = [
    "apple",
    "bowl",
    "book",
    "car",
    "donut",
    "hydrant",
    "keyboard",
    "parkingmeter",
    "toaster",
    "toybus",
    "tv",
    "frisbee",
]


class Co3dDataset(BaseDataset):
    def __init__(
        self,
        common_conf,
        split: str = "train",
        CO3D_DIR: str = None,
        CO3D_ANNOTATION_DIR: str = None,
        min_num_images: int = 24,
        len_train: int = 100000,
        len_test: int = 10000,
    ):
        """
        Initialize the Co3dDataset.

        Args:
            common_conf: Configuration object with common settings.
            split (str): Dataset split, either 'train' or 'test'.
            CO3D_DIR (str): Directory path to CO3D data.
            CO3D_ANNOTATION_DIR (str): Directory path to CO3D annotations.
            min_num_images (int): Minimum number of images per sequence.
            len_train (int): Length of the training dataset.
            len_test (int): Length of the test dataset.
        Raises:
            ValueError: If CO3D_DIR or CO3D_ANNOTATION_DIR is not specified.
        """
        super().__init__(common_conf=common_conf)

        self.debug = common_conf.debug
        self.training = common_conf.training
        self.get_nearby = common_conf.get_nearby
        self.load_depth = common_conf.load_depth
        self.inside_random = common_conf.inside_random
        self.allow_duplicate_img = common_conf.allow_duplicate_img

        if CO3D_DIR is None or CO3D_ANNOTATION_DIR is None:
            raise ValueError("Both CO3D_DIR and CO3D_ANNOTATION_DIR must be specified.")

        # category = sorted(SEEN_CATEGORIES)
        # Only keep directories: CO3D_DIR may also contain plain files, which
        # would otherwise generate spurious "annotation file not found" errors
        # in the loop below.
        category = sorted(
            c for c in os.listdir(CO3D_DIR) if osp.isdir(osp.join(CO3D_DIR, c))
        )

        if self.debug:
            category = ["apple"]

        if split == "train":
            split_name_list = ["train"]
            self.len_train = len_train
        elif split == "test":
            split_name_list = ["test"]
            self.len_train = len_test
        else:
            raise ValueError(f"Invalid split: {split}")

        self.invalid_sequence = []  # set any invalid sequence names here

        self.category_map = {}
        self.data_store = {}
        self.seqlen = None
        self.min_num_images = min_num_images

        logging.info(f"CO3D_DIR is {CO3D_DIR}")

        self.CO3D_DIR = CO3D_DIR
        self.CO3D_ANNOTATION_DIR = CO3D_ANNOTATION_DIR

        total_frame_num = 0

        for c in category:
            for split_name in split_name_list:
                annotation_file = osp.join(
                    self.CO3D_ANNOTATION_DIR, f"{c}_{split_name}.jgz"
                )

                try:
                    with gzip.open(annotation_file, "r") as fin:
                        annotation = json.loads(fin.read())
                except FileNotFoundError:
                    logging.error(f"Annotation file not found: {annotation_file}")
                    continue

                # Keep only sequences that are long enough and not blacklisted.
                for seq_name, seq_data in annotation.items():
                    if len(seq_data) < min_num_images:
                        continue
                    if seq_name in self.invalid_sequence:
                        continue
                    total_frame_num += len(seq_data)
                    self.data_store[seq_name] = seq_data

        self.sequence_list = list(self.data_store.keys())
        self.sequence_list_len = len(self.sequence_list)
        self.total_frame_num = total_frame_num

        status = "Training" if self.training else "Testing"
        logging.info(f"{status}: Co3D Data size: {self.sequence_list_len}")
        logging.info(f"{status}: Co3D Data dataset length: {len(self)}")

    def get_data(
        self,
        seq_index: int = None,
        img_per_seq: int = None,
        seq_name: str = None,
        ids: list = None,
        aspect_ratio: float = 1.0,
    ) -> dict:
        """
        Retrieve data for a specific sequence.

        Args:
            seq_index (int): Index of the sequence to retrieve.
            img_per_seq (int): Number of images per sequence.
            seq_name (str): Name of the sequence.
            ids (list): Specific IDs to retrieve.
            aspect_ratio (float): Aspect ratio for image processing.

        Returns:
            dict: A batch of data including images, depths, and other metadata.
        """
        if self.inside_random:
            seq_index = random.randint(0, self.sequence_list_len - 1)

        if seq_name is None:
            seq_name = self.sequence_list[seq_index]

        metadata = self.data_store[seq_name]

        if ids is None:
            ids = np.random.choice(
                len(metadata), img_per_seq, replace=self.allow_duplicate_img
            )

        annos = [metadata[i] for i in ids]

        target_image_shape = self.get_target_shape(aspect_ratio)

        images = []
        depths = []
        cam_points = []
        world_points = []
        point_masks = []
        extrinsics = []
        intrinsics = []
        image_paths = []
        original_sizes = []

        for anno in annos:
            filepath = anno["filepath"]

            image_path = osp.join(self.CO3D_DIR, filepath)
            image = read_image_cv2(image_path)

            if self.load_depth:
                depth_path = image_path.replace("/images", "/depths") + ".geometric.png"
                depth_map = read_depth(depth_path, 1.0)

                mvs_mask_path = image_path.replace(
                    "/images", "/depth_masks"
                ).replace(".jpg", ".png")
                mvs_mask = cv2.imread(mvs_mask_path, cv2.IMREAD_GRAYSCALE)
                # cv2.imread returns None when the mask file is missing;
                # only zero out invalid depths when a mask was actually loaded.
                if mvs_mask is not None:
                    mvs_mask = mvs_mask > 128
                    depth_map[~mvs_mask] = 0

                depth_map = threshold_depth_map(
                    depth_map, min_percentile=-1, max_percentile=98
                )
            else:
                depth_map = None

            original_size = np.array(image.shape[:2])
            extri_opencv = np.array(anno["extri"])
            intri_opencv = np.array(anno["intri"])

            (
                image,
                depth_map,
                extri_opencv,
                intri_opencv,
                world_coords_points,
                cam_coords_points,
                point_mask,
                _,
            ) = self.process_one_image(
                image,
                depth_map,
                extri_opencv,
                intri_opencv,
                original_size,
                target_image_shape,
                filepath=filepath,
            )

            images.append(image)
            depths.append(depth_map)
            extrinsics.append(extri_opencv)
            intrinsics.append(intri_opencv)
            cam_points.append(cam_coords_points)
            world_points.append(world_coords_points)
            point_masks.append(point_mask)
            image_paths.append(image_path)
            original_sizes.append(original_size)

        set_name = "co3d"

        batch = {
            "seq_name": set_name + "_" + seq_name,
            "ids": ids,
            "frame_num": len(extrinsics),
            "images": images,
            "depths": depths,
            "extrinsics": extrinsics,
            "intrinsics": intrinsics,
            "cam_points": cam_points,
            "world_points": world_points,
            "point_masks": point_masks,
            "original_sizes": original_sizes,
        }
        return batch


# ---- b/training/data/datasets/hypersim.py (new file, module head) ----

import gzip
import json
import os.path as osp
import logging

import cv2
import random
import numpy as np

from data.dataset_util import *
from data.base_dataset import BaseDataset

import torch
import h5py


def to_homogeneous(x: torch.Tensor) -> torch.Tensor:
    """Append a constant 1 along the last dim (Cartesian -> homogeneous)."""
    return torch.cat((x, torch.ones_like(x[..., :1])), dim=-1)


def get_pixel_grid(
    B: int,
    H: int,
    W: int,
) -> torch.Tensor:
    """Return a (B, H, W, 2) grid of (x, y) pixel-center coordinates (+0.5)."""
    x1_n = torch.meshgrid(
        *[torch.arange(n) + 0.5 for n in (B, H, W)],
        indexing="ij",
    )
    # stack as (x, y): index 2 is the width axis, index 1 the height axis
    x1_n = torch.stack((x1_n[2], x1_n[1]), dim=-1).reshape(B, H, W, 2)
    return x1_n


def load_distance(distance_path) -> np.ndarray:
    """Read a Hypersim HDF5 distance map (stored under the 'dataset' key)."""
    with h5py.File(distance_path, "r") as x:
        return x["dataset"][:]  # type: ignore


def homog_pixel_grid(H: int, W: int) -> np.ndarray:
    """Return homogeneous pixel-center coordinates as a (3, H*W) numpy array."""
    return (
        to_homogeneous(
            get_pixel_grid(
                1,
                H,
                W,
            )
        )
        .numpy()
        .reshape(-1, 3)
        .T
    )
def depth_from_distance(
    distance: torch.Tensor, K: torch.Tensor
) -> torch.Tensor:
    """Convert a per-pixel Euclidean distance map to a planar z-depth map.

    Hypersim stores distance from the camera center; z-depth is obtained by
    scaling each distance by the z-component of the corresponding unit ray.

    Args:
        distance: (H, W) tensor of distances along camera rays.
        K: (3, 3) camera intrinsics tensor.

    Returns:
        (H, W, 1) tensor of z-depths.
    """
    H, W = distance.shape[0], distance.shape[1]
    # homog_pixel_grid returns a numpy (3, H*W) array; convert it to a tensor
    # before the matmul (the original mixed numpy and torch operands here,
    # which fails at runtime).
    grid = torch.from_numpy(homog_pixel_grid(H, W)).to(K.dtype)
    rays = torch.linalg.inv(K) @ grid  # 3xHW
    ray_z = rays[-1] / torch.linalg.norm(rays, dim=0)
    z = distance.reshape(-1) * ray_z
    return z.reshape(H, W, 1)


class HypersimDataset(BaseDataset):
    def __init__(
        self,
        common_conf,
        split: str = "train",
        HYPERSIM_DIR: str = None,
        HYPERSIM_ANNOTATION_DIR: str = None,
        min_num_images: int = 24,
        len_train: int = 100000,
        len_test: int = 10000,
    ):
        """
        Initialize the HypersimDataset.

        Args:
            common_conf: Configuration object with common settings.
            split (str): Dataset split, either 'train' or 'test'.
            HYPERSIM_DIR (str): Directory path to Hypersim data.
            HYPERSIM_ANNOTATION_DIR (str): Directory path to Hypersim annotations.
            min_num_images (int): Minimum number of images per sequence.
            len_train (int): Length of the training dataset.
            len_test (int): Length of the test dataset.
        Raises:
            ValueError: If HYPERSIM_DIR or HYPERSIM_ANNOTATION_DIR is not specified.
        """
        super().__init__(common_conf=common_conf)

        self.debug = common_conf.debug
        self.training = common_conf.training
        self.get_nearby = common_conf.get_nearby
        self.load_depth = common_conf.load_depth
        self.inside_random = common_conf.inside_random
        self.allow_duplicate_img = common_conf.allow_duplicate_img

        if HYPERSIM_DIR is None or HYPERSIM_ANNOTATION_DIR is None:
            raise ValueError("Both HYPERSIM_DIR and HYPERSIM_ANNOTATION_DIR must be specified.")

        if split == "train":
            split_name = "train.jgz"
            self.len_train = len_train
        elif split == "test":
            split_name = "test.jgz"
            self.len_train = len_test
        else:
            raise ValueError(f"Invalid split: {split}")

        self.invalid_sequence = []  # set any invalid sequence names here

        self.category_map = {}
        self.data_store = {}
        self.seqlen = None
        self.min_num_images = min_num_images

        logging.info(f"HYPERSIM_DIR is {HYPERSIM_DIR}")

        self.HYPERSIM_DIR = HYPERSIM_DIR
        self.HYPERSIM_ANNOTATION_DIR = HYPERSIM_ANNOTATION_DIR

        annotation_file = osp.join(
            self.HYPERSIM_ANNOTATION_DIR, "hypersim", split_name
        )

        try:
            with gzip.open(annotation_file, "r") as fin:
                annotation = json.loads(fin.read())
        except FileNotFoundError:
            logging.error(f"Annotation file not found: {annotation_file}")
            # Fall through with an empty dataset instead of raising NameError
            # when `annotation` is used below (the original left it undefined).
            annotation = {}

        total_frame_num = 0

        for seq_name, seq_data in annotation.items():
            if seq_name in self.invalid_sequence:
                continue

            if len(seq_data) < min_num_images:
                continue
            total_frame_num += len(seq_data)
            self.data_store[seq_name] = seq_data
        self.sequence_list = list(self.data_store.keys())
        self.sequence_list_len = len(self.sequence_list)
        self.total_frame_num = total_frame_num

        status = "Training" if self.training else "Testing"
        logging.info(f"{status}: Hypersim Data size: {self.sequence_list_len}")
        logging.info(f"{status}: Hypersim Data dataset length: {len(self)}")

    def get_data(
        self,
        seq_index: int = None,
        img_per_seq: int = None,
        seq_name: str = None,
        ids: list = None,
        aspect_ratio: float = 1.0,
        max_retries: int = 10,
    ) -> dict:
        """
        Retrieve data for a specific sequence, retrying with a different
        random sequence when the sampled one has an off-center principal
        point (see the cx/cy check below).

        Args:
            seq_index (int): Index of the sequence to retrieve.
            img_per_seq (int): Number of images per sequence.
            seq_name (str): Name of the sequence.
            ids (list): Specific IDs to retrieve.
            aspect_ratio (float): Aspect ratio for image processing.
            max_retries (int): Maximum number of retry attempts.

        Returns:
            dict: A batch of data including images, depths, and other metadata.

        Raises:
            RuntimeError: If no valid sequence is found within max_retries.
        """
        original_seq_index = seq_index
        original_seq_name = seq_name

        for attempt in range(max_retries):
            # Force a new random sequence on retry
            if attempt > 0 or self.inside_random:
                seq_index = random.randint(0, self.sequence_list_len - 1)
                seq_name = None  # Reset seq_name to force using the new index

            if seq_name is None:
                seq_name = self.sequence_list[seq_index]

            metadata = self.data_store[seq_name]

            if ids is None or attempt > 0:  # Also resample IDs on retry
                ids = np.random.choice(
                    len(metadata), img_per_seq, replace=self.allow_duplicate_img
                )

            annos = [metadata[i] for i in ids]
            target_image_shape = self.get_target_shape(aspect_ratio)

            images = []
            depths = []
            cam_points = []
            world_points = []
            point_masks = []
            extrinsics = []
            intrinsics = []
            image_paths = []
            original_sizes = []

            valid_sequence = True

            for anno in annos:
                filepath = anno["filepath"]
                image_path = osp.join(self.HYPERSIM_DIR, filepath)
                image = read_image_cv2(image_path)

                if self.load_depth:
                    # Hypersim distances are stored in asset units; rescale to
                    # meters, then convert ray distance to planar z-depth.
                    meters_per_asset = anno["meters_per_asset"]
                    depth_path = osp.join(self.HYPERSIM_DIR, anno["depthpath"])
                    distance = (
                        torch.tensor(load_distance(depth_path)).float() / meters_per_asset
                    )
                    intrinsic = torch.tensor(anno["intri"]).reshape(3, 3).float()

                    depth = depth_from_distance(distance, intrinsic).float()
                    depth[depth.isnan()] = 0

                    depth_map = depth.squeeze(-1).numpy()
                    depth_map = threshold_depth_map(
                        depth_map, min_percentile=-1, max_percentile=98
                    )
                else:
                    depth_map = None

                original_size = np.array(image.shape[:2])
                extri_opencv = np.array(anno["extri"])
                intri_opencv = np.array(anno["intri"])
                cx = intri_opencv[0, 2]
                cy = intri_opencv[1, 2]

                # Reject frames whose principal point falls outside the
                # expected 1024x768 image bounds.
                if cy > 768 or cx > 1024:
                    valid_sequence = False
                    break  # Break and try a different sequence

                # Setting zero skew
                intri_opencv[0, 1] = 0.0

                (
                    image,
                    depth_map,
                    extri_opencv,
                    intri_opencv,
                    world_coords_points,
                    cam_coords_points,
                    point_mask,
                    _,
                ) = self.process_one_image(
                    image,
                    depth_map,
                    extri_opencv,
                    intri_opencv,
                    original_size,
                    target_image_shape,
                    filepath=filepath,
                )

                images.append(image)
                depths.append(depth_map)
                extrinsics.append(extri_opencv)
                intrinsics.append(intri_opencv)
                cam_points.append(cam_coords_points)
                world_points.append(world_coords_points)
                point_masks.append(point_mask)
                image_paths.append(image_path)
                original_sizes.append(original_size)

            if valid_sequence:
                set_name = "Hypersim"
                batch = {
                    "seq_name": set_name + "_" + seq_name,
                    "ids": ids,
                    "frame_num": len(extrinsics),
                    "images": images,
                    "depths": depths,
                    "extrinsics": extrinsics,
                    "intrinsics": intrinsics,
                    "cam_points": cam_points,
                    "world_points": world_points,
                    "point_masks": point_masks,
                    "original_sizes": original_sizes,
                }
                return batch

            # Reset for next attempt
            seq_index = original_seq_index
            seq_name = original_seq_name

        raise RuntimeError(f"Failed to find valid sequence after {max_retries} attempts")


# ---- b/training/data/datasets/mapillary.py (new file) ----
# https://www.mapillary.com/dataset/metropolis
# https://github.com/mapillary/metropolis_sdk/blob/main/FORMAT.md
# https://github.com/mapillary/metropolis_sdk/blob/main/SENSORS.md
# ---- b/training/data/datasets/megadepth.py (new file) ----

import gzip
import json
import os.path as osp
import os
import logging

import cv2
import random
import numpy as np
import h5py

from data.dataset_util import *
from data.base_dataset import BaseDataset


class MegadepthDataset(BaseDataset):
    def __init__(
        self,
        common_conf,
        split: str = "train",
        MEGADEPTH_DIR: str = None,
        MEGADEPTH_ANNOTATION_DIR: str = None,
        min_num_images: int = 24,
        len_train: int = 100000,
        len_test: int = 10000,
    ):
        """
        Initialize the MegadepthDataset.

        Args:
            common_conf: Configuration object with common settings.
            split (str): Dataset split, either 'train' or 'test'.
            MEGADEPTH_DIR (str): Directory path to Megadepth data.
            MEGADEPTH_ANNOTATION_DIR (str): Directory path to Megadepth annotations.
            min_num_images (int): Minimum number of images per sequence.
            len_train (int): Length of the training dataset.
            len_test (int): Length of the test dataset.
        Raises:
            ValueError: If MEGADEPTH_DIR or MEGADEPTH_ANNOTATION_DIR is not specified.
        """
        super().__init__(common_conf=common_conf)

        self.debug = common_conf.debug
        self.training = common_conf.training
        self.get_nearby = common_conf.get_nearby
        self.load_depth = common_conf.load_depth
        self.inside_random = common_conf.inside_random
        self.allow_duplicate_img = common_conf.allow_duplicate_img

        if MEGADEPTH_DIR is None or MEGADEPTH_ANNOTATION_DIR is None:
            raise ValueError("Both MEGADEPTH_DIR and MEGADEPTH_ANNOTATION_DIR must be specified.")

        if split == "train":
            split_name = "train.jgz"
            self.len_train = len_train
        elif split == "test":
            split_name = "test.jgz"
            self.len_train = len_test
        else:
            raise ValueError(f"Invalid split: {split}")

        self.invalid_sequence = []  # set any invalid sequence names here

        self.category_map = {}
        self.data_store = {}
        self.seqlen = None
        self.min_num_images = min_num_images

        logging.info(f"MEGADEPTH_DIR is {MEGADEPTH_DIR}")

        self.MEGADEPTH_DIR = MEGADEPTH_DIR
        self.MEGADEPTH_ANNOTATION_DIR = MEGADEPTH_ANNOTATION_DIR

        annotation_file = osp.join(
            self.MEGADEPTH_ANNOTATION_DIR, "megadepth", split_name
        )

        try:
            with gzip.open(annotation_file, "r") as fin:
                annotation = json.loads(fin.read())
        except FileNotFoundError:
            logging.error(f"Annotation file not found: {annotation_file}")
            # Fall through with an empty dataset instead of raising NameError
            # when `annotation` is iterated below (the original left it undefined).
            annotation = {}

        total_frame_num = 0

        # Megadepth groups frames per scene; each scene contains several
        # sub-sequences, stored under "<scene>_<i>" keys.
        for scene_name, scene_data in annotation.items():
            if scene_name in self.invalid_sequence:
                continue

            for i, seq_data in enumerate(scene_data):
                if len(seq_data) < min_num_images:
                    continue
                total_frame_num += len(seq_data)
                self.data_store[f"{scene_name}_{i}"] = seq_data

        self.sequence_list = list(self.data_store.keys())
        self.sequence_list_len = len(self.sequence_list)
        self.total_frame_num = total_frame_num

        status = "Training" if self.training else "Testing"
        logging.info(f"{status}: Megadepth Data size: {self.sequence_list_len}")
        logging.info(f"{status}: Megadepth Data dataset length: {len(self)}")

    def get_data(
        self,
        seq_index: int = None,
        img_per_seq: int = None,
        seq_name: str = None,
        ids: list = None,
        aspect_ratio: float = 1.0,
    ) -> dict:
        """
        Retrieve data for a specific sequence.

        Args:
            seq_index (int): Index of the sequence to retrieve.
            img_per_seq (int): Number of images per sequence.
            seq_name (str): Name of the sequence.
            ids (list): Specific IDs to retrieve.
            aspect_ratio (float): Aspect ratio for image processing.

        Returns:
            dict: A batch of data including images, depths, and other metadata.
        """
        if self.inside_random:
            seq_index = random.randint(0, self.sequence_list_len - 1)

        if seq_name is None:
            seq_name = self.sequence_list[seq_index]

        metadata = self.data_store[seq_name]

        if ids is None:
            ids = np.random.choice(
                len(metadata), img_per_seq, replace=self.allow_duplicate_img
            )

        annos = [metadata[i] for i in ids]

        target_image_shape = self.get_target_shape(aspect_ratio)

        images = []
        depths = []
        cam_points = []
        world_points = []
        point_masks = []
        extrinsics = []
        intrinsics = []
        image_paths = []
        original_sizes = []

        for anno in annos:
            filepath = anno["filepath"]

            image_path = osp.join(self.MEGADEPTH_DIR, filepath)
            image = read_image_cv2(image_path)

            if self.load_depth:
                # Megadepth depths are stored as HDF5 under the "depth" key.
                depth_path = osp.join(self.MEGADEPTH_DIR, anno["depth_path"])
                depth_map = np.array(h5py.File(depth_path, "r")["depth"])
                depth_map = threshold_depth_map(depth_map, max_percentile=98, min_percentile=-1)

            else:
                depth_map = None

            original_size = np.array(image.shape[:2])
            extri_opencv = np.array(anno["extri"])
            intri_opencv = np.array(anno["intri"])

            (
                image,
                depth_map,
                extri_opencv,
                intri_opencv,
                world_coords_points,
                cam_coords_points,
                point_mask,
                _,
            ) = self.process_one_image(
                image,
                depth_map,
                extri_opencv,
                intri_opencv,
                original_size,
                target_image_shape,
                filepath=filepath,
            )

            images.append(image)
            depths.append(depth_map)
            extrinsics.append(extri_opencv)
intrinsics.append(intri_opencv) + cam_points.append(cam_coords_points) + world_points.append(world_coords_points) + point_masks.append(point_mask) + image_paths.append(image_path) + original_sizes.append(original_size) + + set_name = "Megadepth" + + batch = { + "seq_name": set_name + "_" + seq_name, + "ids": ids, + "frame_num": len(extrinsics), + "images": images, + "depths": depths, + "extrinsics": extrinsics, + "intrinsics": intrinsics, + "cam_points": cam_points, + "world_points": world_points, + "point_masks": point_masks, + "original_sizes": original_sizes, + } + return batch diff --git a/training/data/datasets/mvssynth.py b/training/data/datasets/mvssynth.py new file mode 100644 index 00000000..4a1ab4d1 --- /dev/null +++ b/training/data/datasets/mvssynth.py @@ -0,0 +1,227 @@ + +import gzip +import json +import os.path as osp +import os +import logging + +import cv2 +import random +import numpy as np +import h5py +import imageio + +from data.dataset_util import * +from data.base_dataset import BaseDataset + +import numpy as np +import torch +import cv2 + +# https://github.com/phuang17/DeepMVS/issues/13 + +def read_img_depth_pose(depth_path): + raw_depth = np.asarray(imageio.imread(depth_path)[:]) + raw_depth = np.clip(raw_depth, 0.1, 1000.0) + # print('Raw depth shape:', raw_depth.shape) + return raw_depth[:, :, 0] + +class MVSSynthDataset(BaseDataset): + def __init__( + self, + common_conf, + split: str = "train", + MVSSYNTH_DIR: str = None, + MVSSYNTH_ANNOTATION_DIR: str = None, + min_num_images: int = 24, + len_train: int = 100000, + len_test: int = 10000, + ): + """ + Initialize the MVSSynthDataset. + + Args: + common_conf: Configuration object with common settings. + split (str): Dataset split, either 'train' or 'test'. + MVSSYNTH_DIR (str): Directory path to MVSSynth data. + MVSSYNTH_ANNOTATION_DIR (str): Directory path to MVSSynth annotations. + min_num_images (int): Minimum number of images per sequence. 
+ len_train (int): Length of the training dataset. + len_test (int): Length of the test dataset. + Raises: + ValueError: If MVSSYNTH_DIR or MVSSYNTH_ANNOTATION_DIR is not specified. + """ + super().__init__(common_conf=common_conf) + + self.debug = common_conf.debug + self.training = common_conf.training + self.get_nearby = common_conf.get_nearby + self.load_depth = common_conf.load_depth + self.inside_random = common_conf.inside_random + self.allow_duplicate_img = common_conf.allow_duplicate_img + + if MVSSYNTH_DIR is None or MVSSYNTH_ANNOTATION_DIR is None: + raise ValueError("Both MVSSYNTH_DIR and MVSSYNTH_ANNOTATION_DIR must be specified.") + + if split == "train": + split_name = "train.jgz" + self.len_train = len_train + elif split == "test": + split_name = "test.jgz" + self.len_train = len_test + else: + raise ValueError(f"Invalid split: {split}") + + self.invalid_sequence = [] # set any invalid sequence names here + + + self.category_map = {} + self.data_store = {} + self.seqlen = None + self.min_num_images = min_num_images + + logging.info(f"MVSSYNTH_DIR is {MVSSYNTH_DIR}") + + self.MVSSYNTH_DIR = MVSSYNTH_DIR + self.MVSSYNTH_ANNOTATION_DIR = MVSSYNTH_ANNOTATION_DIR + + annotation_file = osp.join( + self.MVSSYNTH_ANNOTATION_DIR, "mvssynth", split_name + ) + + try: + with gzip.open(annotation_file, "r") as fin: + annotation = json.loads(fin.read()) + except FileNotFoundError: + logging.error(f"Annotation file not found: {annotation_file}") + total_frame_num = 0 + + for seq_name, seq_data in annotation.items(): + if seq_name in self.invalid_sequence: + continue + + if len(seq_data) < min_num_images: + continue + total_frame_num += len(seq_data) + self.data_store[seq_name] = seq_data + self.sequence_list = list(self.data_store.keys()) + self.sequence_list_len = len(self.sequence_list) + self.total_frame_num = total_frame_num + + status = "Training" if self.training else "Testing" + logging.info(f"{status}: MVSSynth Data size: {self.sequence_list_len}") + 
logging.info(f"{status}: MVSSynth Data dataset length: {len(self)}") + + def get_data( + self, + seq_index: int = None, + img_per_seq: int = None, + seq_name: str = None, + ids: list = None, + aspect_ratio: float = 1.0, + ) -> dict: + """ + Retrieve data for a specific sequence. + + Args: + seq_index (int): Index of the sequence to retrieve. + img_per_seq (int): Number of images per sequence. + seq_name (str): Name of the sequence. + ids (list): Specific IDs to retrieve. + aspect_ratio (float): Aspect ratio for image processing. + + Returns: + dict: A batch of data including images, depths, and other metadata. + """ + if self.inside_random: + seq_index = random.randint(0, self.sequence_list_len - 1) + + if seq_name is None: + seq_name = self.sequence_list[seq_index] + + metadata = self.data_store[seq_name] + + if ids is None: + ids = np.random.choice( + len(metadata), img_per_seq, replace=self.allow_duplicate_img + ) + + annos = [metadata[i] for i in ids] + + target_image_shape = self.get_target_shape(aspect_ratio) + + images = [] + depths = [] + cam_points = [] + world_points = [] + point_masks = [] + extrinsics = [] + intrinsics = [] + image_paths = [] + original_sizes = [] + + for anno in annos: + filepath = anno["filepath"] + + image_path = osp.join(self.MVSSYNTH_DIR, filepath) + image = read_image_cv2(image_path) + + if self.load_depth: + depth_path = osp.join(self.MVSSYNTH_DIR, anno["depthpath"]) + d = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH) + d[d > 1e9] = 0.0 + d[~np.isfinite(d)] = 0.0 + depth_map = d + depth_map = threshold_depth_map(depth_map, max_percentile=98, min_percentile=-1) + else: + depth_map = None + + original_size = np.array(image.shape[:2]) + extri_opencv = np.array(anno["extri"]) + intri_opencv = np.array(anno["intri"]) + + ( + image, + depth_map, + extri_opencv, + intri_opencv, + world_coords_points, + cam_coords_points, + point_mask, + _, + ) = self.process_one_image( + image, + depth_map, + extri_opencv, + intri_opencv, + original_size, 
+ target_image_shape, + filepath=filepath, + ) + + images.append(image) + depths.append(depth_map) + extrinsics.append(extri_opencv) + intrinsics.append(intri_opencv) + cam_points.append(cam_coords_points) + world_points.append(world_coords_points) + point_masks.append(point_mask) + image_paths.append(image_path) + original_sizes.append(original_size) + + set_name = "MVSSynth" + + batch = { + "seq_name": set_name + "_" + seq_name, + "ids": ids, + "frame_num": len(extrinsics), + "images": images, + "depths": depths, + "extrinsics": extrinsics, + "intrinsics": intrinsics, + "cam_points": cam_points, + "world_points": world_points, + "point_masks": point_masks, + "original_sizes": original_sizes, + } + return batch diff --git a/training/data/datasets/pointodyssey.py b/training/data/datasets/pointodyssey.py new file mode 100644 index 00000000..a8a2634f --- /dev/null +++ b/training/data/datasets/pointodyssey.py @@ -0,0 +1,225 @@ +# Depth should be divided by 1000 + + +import gzip +import json +import os.path as osp +import logging + +import cv2 +import random +import numpy as np + +from data.dataset_util import * +from data.base_dataset import BaseDataset + +import numpy as np +import cv2 + +class PointOdysseyDataset(BaseDataset): + def __init__( + self, + common_conf, + split: str = "train", + POINTODYSSEY_DIR: str = None, + POINTODYSSEY_ANNOTATION_DIR: str = None, + min_num_images: int = 24, + len_train: int = 100000, + len_test: int = 10000, + ): + """ + Initialize the PointOdysseyDataset. + + Args: + common_conf: Configuration object with common settings. + split (str): Dataset split, either 'train' or 'test'. + POINTODYSSEY_DIR (str): Directory path to PointOdyssey data. + POINTODYSSEY_ANNOTATION_DIR (str): Directory path to PointOdyssey annotations. + min_num_images (int): Minimum number of images per sequence. + len_train (int): Length of the training dataset. + len_test (int): Length of the test dataset. 
+ Raises: + ValueError: If POINTODYSSEY_DIR or POINTODYSSEY_ANNOTATION_DIR is not specified. + """ + super().__init__(common_conf=common_conf) + + self.debug = common_conf.debug + self.training = common_conf.training + self.get_nearby = common_conf.get_nearby + self.load_depth = common_conf.load_depth + self.inside_random = common_conf.inside_random + self.allow_duplicate_img = common_conf.allow_duplicate_img + + if POINTODYSSEY_DIR is None or POINTODYSSEY_ANNOTATION_DIR is None: + raise ValueError("Both POINTODYSSEY_DIR and POINTODYSSEY_ANNOTATION_DIR must be specified.") + + if split == "train": + split_name = "train.jgz" + self.len_train = len_train + elif split == "test": + split_name = "test.jgz" + self.len_train = len_test + else: + raise ValueError(f"Invalid split: {split}") + + self.invalid_sequence = [] # set any invalid sequence names here + + + self.category_map = {} + self.data_store = {} + self.seqlen = None + self.min_num_images = min_num_images + + logging.info(f"POINTODYSSEY_DIR is {POINTODYSSEY_DIR}") + + self.POINTODYSSEY_DIR = POINTODYSSEY_DIR + self.POINTODYSSEY_ANNOTATION_DIR = POINTODYSSEY_ANNOTATION_DIR + + annotation_file = osp.join( + self.POINTODYSSEY_ANNOTATION_DIR, "pointodyssey", split_name + ) + + try: + with gzip.open(annotation_file, "r") as fin: + annotation = json.loads(fin.read()) + except FileNotFoundError: + logging.error(f"Annotation file not found: {annotation_file}") + total_frame_num = 0 + + for seq_name, seq_data in annotation.items(): + if seq_name in self.invalid_sequence: + continue + + if len(seq_data) < min_num_images: + continue + total_frame_num += len(seq_data) + self.data_store[seq_name] = seq_data + self.sequence_list = list(self.data_store.keys()) + self.sequence_list_len = len(self.sequence_list) + self.total_frame_num = total_frame_num + + status = "Training" if self.training else "Testing" + logging.info(f"{status}: PointOdyssey Data size: {self.sequence_list_len}") + logging.info(f"{status}: PointOdyssey Data 
dataset length: {len(self)}") + + def get_data( + self, + seq_index: int = None, + img_per_seq: int = None, + seq_name: str = None, + ids: list = None, + aspect_ratio: float = 1.0, + ) -> dict: + """ + Retrieve data for a specific sequence. + + Args: + seq_index (int): Index of the sequence to retrieve. + img_per_seq (int): Number of images per sequence. + seq_name (str): Name of the sequence. + ids (list): Specific IDs to retrieve. + aspect_ratio (float): Aspect ratio for image processing. + + Returns: + dict: A batch of data including images, depths, and other metadata. + """ + if self.inside_random: + seq_index = random.randint(0, self.sequence_list_len - 1) + + if seq_name is None: + seq_name = self.sequence_list[seq_index] + + metadata = self.data_store[seq_name] + + if ids is None: + ids = np.random.choice( + len(metadata), img_per_seq, replace=self.allow_duplicate_img + ) + + annos = [metadata[i] for i in ids] + + target_image_shape = self.get_target_shape(aspect_ratio) + + images = [] + depths = [] + cam_points = [] + world_points = [] + point_masks = [] + extrinsics = [] + intrinsics = [] + image_paths = [] + original_sizes = [] + + for anno in annos: + filepath = anno["filepath"] + + image_path = osp.join(self.POINTODYSSEY_DIR, filepath) + image = read_image_cv2(image_path) + + if self.load_depth: + depth_path = osp.join(self.POINTODYSSEY_DIR, anno["depthpath"]) + # depth_map = read_depth(depth_path, 1.0) * 1000 + depth_16bit = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH) + depth_map = depth_16bit.astype(np.float32) / 65535.0 * 1000.0 + + # mvs_mask_path = image_path.replace( + # "/rgbs", "/masks" + # ).replace("rgb", "mask").replace(".jpg", ".png") + # mvs_mask = cv2.imread(mvs_mask_path, cv2.IMREAD_GRAYSCALE) > 128 + # depth_map[~mvs_mask] = 0 + + depth_map = threshold_depth_map( + depth_map, min_percentile=-1, max_percentile=98 + ) + else: + depth_map = None + + original_size = np.array(image.shape[:2]) + extri_opencv = np.array(anno["extri"]) + 
intri_opencv = np.array(anno["intri"]) + + ( + image, + depth_map, + extri_opencv, + intri_opencv, + world_coords_points, + cam_coords_points, + point_mask, + _, + ) = self.process_one_image( + image, + depth_map, + extri_opencv, + intri_opencv, + original_size, + target_image_shape, + filepath=filepath, + ) + + images.append(image) + depths.append(depth_map) + extrinsics.append(extri_opencv) + intrinsics.append(intri_opencv) + cam_points.append(cam_coords_points) + world_points.append(world_coords_points) + point_masks.append(point_mask) + image_paths.append(image_path) + original_sizes.append(original_size) + + set_name = "PointOdyssey" + + batch = { + "seq_name": set_name + "_" + seq_name, + "ids": ids, + "frame_num": len(extrinsics), + "images": images, + "depths": depths, + "extrinsics": extrinsics, + "intrinsics": intrinsics, + "cam_points": cam_points, + "world_points": world_points, + "point_masks": point_masks, + "original_sizes": original_sizes, + } + return batch diff --git a/training/data/datasets/scannet.py b/training/data/datasets/scannet.py new file mode 100644 index 00000000..66f19e7b --- /dev/null +++ b/training/data/datasets/scannet.py @@ -0,0 +1,231 @@ + +import gzip +import json +import os.path as osp +import os +import logging + +import cv2 +import random +import numpy as np +import h5py + +from data.dataset_util import * +from data.base_dataset import BaseDataset + +import numpy as np +import torch +import cv2 + +class ScanNetDataset(BaseDataset): + def __init__( + self, + common_conf, + split: str = "train", + SCANNET_DIR: str = None, + SCANNET_ANNOTATION_DIR: str = None, + min_num_images: int = 24, + len_train: int = 100000, + len_test: int = 10000, + ): + """ + Initialize the ScanNetDataset. + + Args: + common_conf: Configuration object with common settings. + split (str): Dataset split, either 'train' or 'test'. + SCANNET_DIR (str): Directory path to ScanNet data. + SCANNET_ANNOTATION_DIR (str): Directory path to ScanNet annotations. 
+ min_num_images (int): Minimum number of images per sequence. + len_train (int): Length of the training dataset. + len_test (int): Length of the test dataset. + Raises: + ValueError: If SCANNET_DIR or SCANNET_ANNOTATION_DIR is not specified. + """ + super().__init__(common_conf=common_conf) + + self.debug = common_conf.debug + self.training = common_conf.training + self.get_nearby = common_conf.get_nearby + self.load_depth = common_conf.load_depth + self.inside_random = common_conf.inside_random + self.allow_duplicate_img = common_conf.allow_duplicate_img + + if SCANNET_DIR is None or SCANNET_ANNOTATION_DIR is None: + raise ValueError("Both SCANNET_DIR and SCANNET_ANNOTATION_DIR must be specified.") + + if split == "train": + split_name = "train.jgz" + self.len_train = len_train + elif split == "test": + split_name = "test.jgz" + self.len_train = len_test + else: + raise ValueError(f"Invalid split: {split}") + + self.invalid_sequence = [] # set any invalid sequence names here + + + self.category_map = {} + self.data_store = {} + self.seqlen = None + self.min_num_images = min_num_images + + logging.info(f"SCANNET_DIR is {SCANNET_DIR}") + + self.SCANNET_DIR = SCANNET_DIR + self.SCANNET_ANNOTATION_DIR = SCANNET_ANNOTATION_DIR + + annotation_file = osp.join( + self.SCANNET_ANNOTATION_DIR, "scannet", split_name + ) + + try: + with gzip.open(annotation_file, "r") as fin: + annotation = json.loads(fin.read()) + except FileNotFoundError: + logging.error(f"Annotation file not found: {annotation_file}") + total_frame_num = 0 + + for seq_name, seq_data in annotation.items(): + if seq_name in self.invalid_sequence: + continue + + if len(seq_data) < min_num_images: + continue + total_frame_num += len(seq_data) + self.data_store[seq_name] = seq_data + self.sequence_list = list(self.data_store.keys()) + self.sequence_list_len = len(self.sequence_list) + self.total_frame_num = total_frame_num + + status = "Training" if self.training else "Testing" + logging.info(f"{status}: 
ScanNet Data size: {self.sequence_list_len}") + logging.info(f"{status}: ScanNet Data dataset length: {len(self)}") + + def get_data( + self, + seq_index: int = None, + img_per_seq: int = None, + seq_name: str = None, + ids: list = None, + aspect_ratio: float = 1.0, + ) -> dict: + """ + Retrieve data for a specific sequence. + + Args: + seq_index (int): Index of the sequence to retrieve. + img_per_seq (int): Number of images per sequence. + seq_name (str): Name of the sequence. + ids (list): Specific IDs to retrieve. + aspect_ratio (float): Aspect ratio for image processing. + + Returns: + dict: A batch of data including images, depths, and other metadata. + """ + if self.inside_random: + seq_index = random.randint(0, self.sequence_list_len - 1) + + if seq_name is None: + seq_name = self.sequence_list[seq_index] + + metadata = self.data_store[seq_name] + + if ids is None: + ids = np.random.choice( + len(metadata), img_per_seq, replace=self.allow_duplicate_img + ) + + annos = [metadata[i] for i in ids] + + target_image_shape = self.get_target_shape(aspect_ratio) + + images = [] + depths = [] + cam_points = [] + world_points = [] + point_masks = [] + extrinsics = [] + intrinsics = [] + image_paths = [] + original_sizes = [] + + for anno in annos: + filepath = anno["filepath"] + + image_path = osp.join(self.SCANNET_DIR, filepath) + image = read_image_cv2(image_path) + + if self.load_depth: + depth_path = osp.join(self.SCANNET_DIR, anno["depthpath"]) + depth = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED) + depth_map = depth / 1000 + + # depth_map = self.load_depth_image(depth_path) + # depth_map = read_depth(depth_path, 1.0) / 1000 + depth_map = threshold_depth_map(depth_map, max_percentile=98, min_percentile=-1) + # depth_path = image_path.replace("/images", "/depths") + ".geometric.png" + + # mvs_mask_path = image_path.replace( + # "/images", "/depth_masks" + # ).replace(".jpg", ".png") + # mvs_mask = cv2.imread(mvs_mask_path, cv2.IMREAD_GRAYSCALE) > 128 + # 
depth_map[~mvs_mask] = 0 + + # depth_map = threshold_depth_map( + # depth_map, min_percentile=-1, max_percentile=98 + # ) + else: + depth_map = None + + original_size = np.array(image.shape[:2]) + extri_opencv = np.array(anno["extri"]) + # print('Extri: ', extri_opencv) + intri_opencv = np.array(anno["intri"]) + # print('Intri: ', intri_opencv) + ( + image, + depth_map, + extri_opencv, + intri_opencv, + world_coords_points, + cam_coords_points, + point_mask, + _, + ) = self.process_one_image( + image, + depth_map, + extri_opencv, + intri_opencv, + original_size, + target_image_shape, + filepath=filepath, + ) + + images.append(image) + depths.append(depth_map) + extrinsics.append(extri_opencv) + intrinsics.append(intri_opencv) + cam_points.append(cam_coords_points) + world_points.append(world_coords_points) + point_masks.append(point_mask) + image_paths.append(image_path) + original_sizes.append(original_size) + + set_name = "ScanNet" + + batch = { + "seq_name": set_name + "_" + seq_name, + "ids": ids, + "frame_num": len(extrinsics), + "images": images, + "depths": depths, + "extrinsics": extrinsics, + "intrinsics": intrinsics, + "cam_points": cam_points, + "world_points": world_points, + "point_masks": point_masks, + "original_sizes": original_sizes, + } + return batch diff --git a/training/data/datasets/wildrgbd.py b/training/data/datasets/wildrgbd.py new file mode 100644 index 00000000..14c1f653 --- /dev/null +++ b/training/data/datasets/wildrgbd.py @@ -0,0 +1,222 @@ + +import gzip +import json +import os.path as osp +import logging + +import cv2 +import random +import numpy as np + +from data.dataset_util import * +from data.base_dataset import BaseDataset + +import numpy as np +import cv2 + +class WildrgbdDataset(BaseDataset): + def __init__( + self, + common_conf, + split: str = "train", + WILDRGBD_DIR: str = None, + WILDRGBD_ANNOTATION_DIR: str = None, + min_num_images: int = 24, + len_train: int = 100000, + len_test: int = 10000, + ): + """ + Initialize the 
WildrgbdDataset. + + Args: + common_conf: Configuration object with common settings. + split (str): Dataset split, either 'train' or 'test'. + WILDRGBD_DIR (str): Directory path to Wildrgbd data. + WILDRGBD_ANNOTATION_DIR (str): Directory path to Wildrgbd annotations. + min_num_images (int): Minimum number of images per sequence. + len_train (int): Length of the training dataset. + len_test (int): Length of the test dataset. + Raises: + ValueError: If WILDRGBD_DIR or WILDRGBD_ANNOTATION_DIR is not specified. + """ + super().__init__(common_conf=common_conf) + + self.debug = common_conf.debug + self.training = common_conf.training + self.get_nearby = common_conf.get_nearby + self.load_depth = common_conf.load_depth + self.inside_random = common_conf.inside_random + self.allow_duplicate_img = common_conf.allow_duplicate_img + + if WILDRGBD_DIR is None or WILDRGBD_ANNOTATION_DIR is None: + raise ValueError("Both WILDRGBD_DIR and WILDRGBD_ANNOTATION_DIR must be specified.") + + if split == "train": + split_name = "train.jgz" + self.len_train = len_train + elif split == "test": + split_name = "test.jgz" + self.len_train = len_test + else: + raise ValueError(f"Invalid split: {split}") + + self.invalid_sequence = [] # set any invalid sequence names here + + + self.category_map = {} + self.data_store = {} + self.seqlen = None + self.min_num_images = min_num_images + + logging.info(f"WILDRGBD_DIR is {WILDRGBD_DIR}") + + self.WILDRGBD_DIR = WILDRGBD_DIR + self.WILDRGBD_ANNOTATION_DIR = WILDRGBD_ANNOTATION_DIR + + annotation_file = osp.join( + self.WILDRGBD_ANNOTATION_DIR, "wildrgbd", split_name + ) + + try: + with gzip.open(annotation_file, "r") as fin: + annotation = json.loads(fin.read()) + except FileNotFoundError: + logging.error(f"Annotation file not found: {annotation_file}") + total_frame_num = 0 + + for seq_name, seq_data in annotation.items(): + if seq_name in self.invalid_sequence: + continue + + if len(seq_data) < min_num_images: + continue + total_frame_num += 
len(seq_data) + self.data_store[seq_name] = seq_data + self.sequence_list = list(self.data_store.keys()) + self.sequence_list_len = len(self.sequence_list) + self.total_frame_num = total_frame_num + + status = "Training" if self.training else "Testing" + logging.info(f"{status}: Wildrgbd Data size: {self.sequence_list_len}") + logging.info(f"{status}: Wildrgbd Data dataset length: {len(self)}") + + def get_data( + self, + seq_index: int = None, + img_per_seq: int = None, + seq_name: str = None, + ids: list = None, + aspect_ratio: float = 1.0, + ) -> dict: + """ + Retrieve data for a specific sequence. + + Args: + seq_index (int): Index of the sequence to retrieve. + img_per_seq (int): Number of images per sequence. + seq_name (str): Name of the sequence. + ids (list): Specific IDs to retrieve. + aspect_ratio (float): Aspect ratio for image processing. + + Returns: + dict: A batch of data including images, depths, and other metadata. + """ + if self.inside_random: + seq_index = random.randint(0, self.sequence_list_len - 1) + + if seq_name is None: + seq_name = self.sequence_list[seq_index] + + metadata = self.data_store[seq_name] + + if ids is None: + ids = np.random.choice( + len(metadata), img_per_seq, replace=self.allow_duplicate_img + ) + + annos = [metadata[i] for i in ids] + + target_image_shape = self.get_target_shape(aspect_ratio) + + images = [] + depths = [] + cam_points = [] + world_points = [] + point_masks = [] + extrinsics = [] + intrinsics = [] + image_paths = [] + original_sizes = [] + + for anno in annos: + filepath = anno["filepath"] + + image_path = osp.join(self.WILDRGBD_DIR, filepath) + image = read_image_cv2(image_path) + + if self.load_depth: + depth_path = osp.join(self.WILDRGBD_DIR, anno["depthpath"]) + depth = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH) + depth_map = depth.astype(np.float32) / 1000.0 + + mvs_mask_path = image_path.replace( + "/rgb", "/masks" + ) + mvs_mask = cv2.imread(mvs_mask_path, cv2.IMREAD_GRAYSCALE) > 128 + 
depth_map[~mvs_mask] = 0 + + depth_map = threshold_depth_map( + depth_map, min_percentile=-1, max_percentile=98 + ) + else: + depth_map = None + + original_size = np.array(image.shape[:2]) + extri_opencv = np.array(anno["extri"]) + intri_opencv = np.array(anno["intri"]) + + ( + image, + depth_map, + extri_opencv, + intri_opencv, + world_coords_points, + cam_coords_points, + point_mask, + _, + ) = self.process_one_image( + image, + depth_map, + extri_opencv, + intri_opencv, + original_size, + target_image_shape, + filepath=filepath, + ) + + images.append(image) + depths.append(depth_map) + extrinsics.append(extri_opencv) + intrinsics.append(intri_opencv) + cam_points.append(cam_coords_points) + world_points.append(world_coords_points) + point_masks.append(point_mask) + image_paths.append(image_path) + original_sizes.append(original_size) + + set_name = "Wildrgbd" + + batch = { + "seq_name": set_name + "_" + seq_name, + "ids": ids, + "frame_num": len(extrinsics), + "images": images, + "depths": depths, + "extrinsics": extrinsics, + "intrinsics": intrinsics, + "cam_points": cam_points, + "world_points": world_points, + "point_masks": point_masks, + "original_sizes": original_sizes, + } + return batch diff --git a/training/data/preprocess/blendedmvs.py b/training/data/preprocess/blendedmvs.py new file mode 100644 index 00000000..dcc24976 --- /dev/null +++ b/training/data/preprocess/blendedmvs.py @@ -0,0 +1,57 @@ +from pathlib import Path +import json +import gzip +import numpy as np +import torch +from tqdm import tqdm + +def read_cam_file(cam_path: str): + with open(cam_path) as f: + lines = f.readlines() + + # Extrinsic (world-to-camera) + extrinsic = np.array([[float(x) for x in line.split()] for line in lines[1:5]], dtype=np.float32) + pose_w2c = extrinsic[:3, :] # 3x4 + + # Intrinsic + intrinsic = np.array([[float(x) for x in line.split()] for line in lines[7:10]], dtype=np.float32) + K = intrinsic + + # Depth range info + depth_line = [float(x) for x in 
lines[11].split()] + depth_min, depth_interval, num_depth, depth_max = depth_line + + return pose_w2c, K, (depth_min, depth_interval, num_depth, depth_max) + +# Root folder where everything starts +root = Path("/mimer/NOBACKUP/groups/3d-dl/blendedmvs_full") + +out = {} + +for scene_dir in tqdm(root.iterdir()): + frames = sorted([p.name for p in (scene_dir / "blended_images").iterdir() if p.suffix == ".jpg" and not p.name.endswith("_masked.jpg")]) + + sequence_data = [] + for frame in frames: + cams_path = scene_dir / "cams" / (frame.replace(".jpg", "_cam.txt")) + depth_path = scene_dir / "depths" / (frame.replace(".jpg", ".pfm")) + + pose_w2c, K, depth_info = read_cam_file(cams_path) + + frame_data = { + "filepath": f"{scene_dir.name}/blended_images/{frame}", + "extri": pose_w2c.tolist(), + "intri": K.tolist(), + "depthpath": f"{scene_dir.name}/rendered_depth_maps/{frame.replace('.jpg', '.pfm')}", + } + sequence_data.append(frame_data) + + out[scene_dir.name] = sequence_data + +root = "/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt" + +with gzip.open(root+"/annotations/blendedmvs/train.jgz", "wt", encoding="utf-8") as f: + json.dump(out, f, ensure_ascii=False, indent=4) + +print(f"Processed {len(out)} scenes with a total of {sum(len(v) for v in out.values())} images.") + diff --git a/training/data/preprocess/co3d.py b/training/data/preprocess/co3d.py new file mode 100644 index 00000000..29dfd1c1 --- /dev/null +++ b/training/data/preprocess/co3d.py @@ -0,0 +1,104 @@ +from pathlib import Path +import json +import gzip +import numpy as np +import torch +from tqdm import tqdm +import os.path as osp + +root = Path("/mimer/NOBACKUP/groups/3d-dl/co3d_full") + +def co3d_annotation_to_opencv_pose(frame_data): + p = frame_data['viewpoint']['principal_point'] + f = frame_data['viewpoint']['focal_length'] + h, w = frame_data['image']['size'] + K = np.eye(3) + s = (min(h, w) - 1) / 2 + K[0, 0] = f[0] * (w - 1) / 2 + K[1, 1] = f[1] * (h - 1) / 2 + K[0, 2] = -p[0] * s + (w - 
1) / 2 + K[1, 2] = -p[1] * s + (h - 1) / 2 + + R = np.asarray(frame_data['viewpoint']['R']).T # note the transpose here + T = np.asarray(frame_data['viewpoint']['T']) + pose = np.concatenate([R,T[:,None]],1) + pose = np.diag([-1,-1,1]).astype(np.float32) @ pose # flip the direction of x,y axis + + return pose, K + +out = {} +for category_dir in tqdm(root.iterdir()): + print('Processing category: ', category_dir.name) + frame_file = osp.join(category_dir, "frame_annotations.jgz") + sequence_file = osp.join(category_dir, "sequence_annotations.jgz") + + set_list = json.load(open(osp.join(category_dir, "set_lists.json"), "r")) + + train_sequences = set() + for split in ["train_known", "train_unseen"]: + if split in set_list: + for entry in set_list[split]: + sequence_id = entry[0] # first element is the sequence ID + train_sequences.add(sequence_id) + + # Convert to a sorted list if you want + train_sequences = sorted(train_sequences) + + # print('Set list: ', train_sequences) + + with gzip.open(frame_file, "r") as fin: + frame_data = json.loads(fin.read()) + with gzip.open(sequence_file, "r") as fin: + sequence_data = json.loads(fin.read()) + + frame_data_processed = {} + for f_data in frame_data: + sequence_name = f_data["sequence_name"] + frame_data_processed.setdefault(sequence_name, {})[f_data["frame_number"]] = f_data + + + for seq in train_sequences: + # print(frame_data_processed[seq]) + seq_data = frame_data_processed[seq] + scene_dir = category_dir / seq + images_dir = scene_dir / "images" + frames = sorted([p.name for p in images_dir.iterdir() if p.suffix == ".jpg"]) + out_sequence_data = [] + for i, frame in enumerate(frames): + frame_data = seq_data[i] + # viewpoint = frame_data['viewpoint'] + # R = np.array(viewpoint['R']) + # T = np.array(viewpoint['T']).reshape(3, 1) + # extrinsic = np.eye(4) + # extrinsic[:3, :3] = R + # extrinsic[:3, 3:] = T + + # fx, fy = viewpoint['focal_length'] + # cx, cy = viewpoint['principal_point'] + + # intrinsic = np.array([ 
+ # [fx, 0, cx], + # [0, fy, cy], + # [0, 0, 1] + # ]) + + extrinsic, intrinsic = co3d_annotation_to_opencv_pose(frame_data) + # extrinsic = np.vstack([extrinsic, [0, 0, 0, 1]]) + # extrinsic = np.linalg.inv(extrinsic) + + frame_data = { + "filepath": frame_data['image']['path'], + "extri": extrinsic[:3].tolist(), + "intri": intrinsic.tolist(), + } + out_sequence_data.append(frame_data) + # print('Frame data: ', frame_data) + out[category_dir.name+"_"+seq] = out_sequence_data + +root = "/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt" + +with gzip.open(root+"/annotations/co3d/train.jgz", "wt", encoding="utf-8") as f: + json.dump(out, f, ensure_ascii=False, indent=4) + +print(f"Processed {len(out)} scenes with a total of {sum(len(v) for v in out.values())} images.") + diff --git a/training/data/preprocess/co3d_clean_anno.py b/training/data/preprocess/co3d_clean_anno.py new file mode 100644 index 00000000..c7bbfdf7 --- /dev/null +++ b/training/data/preprocess/co3d_clean_anno.py @@ -0,0 +1,51 @@ +import os.path as osp +import os +import random +import gzip +import json + +data_root = "/mimer/NOBACKUP/groups/3d-dl/co3dv2" +split = "train" +all_categories = os.listdir(data_root) +annotation_dir = "/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations/co3d" + +for c in all_categories: + annotation_file = osp.join(annotation_dir, f"{c}_{split}.jgz") + + try: + # Load the annotation + with gzip.open(annotation_file, "r") as fin: + annotation = json.loads(fin.read()) + except FileNotFoundError: + print(f"Annotation file not found: {annotation_file}") + continue + + # Get existing sequences from the data directory + category_path = osp.join(data_root, c) + if not osp.isdir(category_path): + print(f"Category directory not found: {category_path}") + continue + + existing_sequences = set(os.listdir(category_path)) + print(f"Category: {c}") + print(f" Total sequences in annotation: {len(annotation)}") + print(f" Existing sequences in data: 
{len(existing_sequences)}") + + # Filter annotation to keep only existing sequences + filtered_annotation = { + seq_name: seq_data + for seq_name, seq_data in annotation.items() + if seq_name in existing_sequences + } + + removed_count = len(annotation) - len(filtered_annotation) + print(f" Removed sequences: {removed_count}") + print(f" Remaining sequences: {len(filtered_annotation)}") + + # Save the filtered annotation back + if removed_count > 0: + with gzip.open(annotation_file, "wt", encoding="utf-8") as fout: + json.dump(filtered_annotation, fout) + print(f" ✓ Saved filtered annotation") + else: + print(f" No changes needed") \ No newline at end of file diff --git a/training/data/preprocess/hypersim.py b/training/data/preprocess/hypersim.py new file mode 100644 index 00000000..62c41bb8 --- /dev/null +++ b/training/data/preprocess/hypersim.py @@ -0,0 +1,218 @@ +from glob import glob +from pathlib import Path + +import cv2 +import h5py +import numpy as np +import pandas as pd +import torch +from tqdm import tqdm +import json +import gzip + +def to_homogeneous(x: torch.Tensor) -> torch.Tensor: + return torch.cat((x, torch.ones_like(x[..., :1])), dim=-1) + +def get_pixel_grid( + B: int, + H: int, + W: int, +) -> torch.Tensor: + x1_n = torch.meshgrid( + *[torch.arange(n) + 0.5 for n in (B, H, W)], + indexing="ij", + ) + x1_n = torch.stack((x1_n[2], x1_n[1]), dim=-1).reshape(B, H, W, 2) + return x1_n + +def load_distance(distance_path) -> np.ndarray: + with h5py.File(distance_path, "r") as x: + return x["dataset"][:] # type: ignore + +def homog_pixel_grid(H: int, W: int) -> np.ndarray: + return ( + to_homogeneous( + get_pixel_grid( + 1, + H, + W, + ) + ) + .numpy() + .reshape(-1, 3) + .T + ) + +def depth_from_distance( + distance: torch.Tensor, K: torch.Tensor + ) -> torch.Tensor: + H, W = distance.shape[0], distance.shape[1] + grid = homog_pixel_grid(H, W) + rays = torch.linalg.inv(K) @ grid # 3xHW + ray_z = rays[-1] / torch.linalg.norm(rays, dim=0) + z = 
distance.reshape(-1) * ray_z + return z.reshape(H, W, 1) + +if __name__ == "__main__": + out = {} + data_root = Path("/mimer/NOBACKUP/groups/3d-dl/ml-hypersim/contrib/99991/downloads") + + metadata_camera_parameters_csv_file = ( + data_root / "metadata_camera_parameters.csv" + ) + df_camera_parameters = pd.read_csv( + metadata_camera_parameters_csv_file, index_col="scene_name" + ) + scene_names = {f"ai_{i:03d}" for i in range(61)} + + for scene_path in tqdm(list(data_root.iterdir())): + scene_name = scene_path.name + if scene_name in ["ai_024_012", "ai_026_008", "ai_026_013"]: + print("Skipping problematic scene " + scene_name) + continue + if (scene_name[:-4] not in scene_names) and (scene_name not in scene_names): + continue + df_: pd.Series = df_camera_parameters.loc[scene_name] # type: ignore + width_pixels = int(df_["settings_output_img_width"]) + height_pixels = int(df_["settings_output_img_height"]) + + M_proj = [ + [ + df_["M_proj_00"], + df_["M_proj_01"], + df_["M_proj_02"], + df_["M_proj_03"], + ], + [ + df_["M_proj_10"], + df_["M_proj_11"], + df_["M_proj_12"], + df_["M_proj_13"], + ], + [ + df_["M_proj_20"], + df_["M_proj_21"], + df_["M_proj_22"], + df_["M_proj_23"], + ], + [ + df_["M_proj_30"], + df_["M_proj_31"], + df_["M_proj_32"], + df_["M_proj_33"], + ], + ] + M_proj = np.array(M_proj) + M_screen_from_ndc = np.array( + [ + [0.5 * (width_pixels), 0, 0, 0.5 * (width_pixels)], + [0, -0.5 * (height_pixels), 0, 0.5 * (height_pixels)], + [0, 0, 0.5, 0.5], # doesn't matter + [0, 0, 0, 1.0], + ] + ) + x = (M_screen_from_ndc @ M_proj)[[0, 1, 3]] + K, R = cv2.decomposeProjectionMatrix(x)[:2] # type: ignore + K = K / K[2, 2] + + scene_root = scene_path + + metadata_scene = scene_root / "_detail" / "metadata_scene.csv" + camera_name = "cam_00" + df = pd.read_csv(metadata_scene) + meters_per_asset = df.loc[ + df["parameter_name"] == "meters_per_asset_unit", "parameter_value" + ].iloc[0] + + image_paths = sorted( + glob( + ( + scene_root + / "images" + / 
f"scene_{camera_name}_final_preview" + / "frame.*.color.jpg" + ).as_posix() + ) + ) + distance_paths = sorted( + glob( + ( + scene_root + / "images" + / f"scene_{camera_name}_geometry_hdf5" + / "frame.*.depth_meters.hdf5" + ).as_posix() + ) + ) + + distance_paths = {int(dp.split(".")[-3]): dp for dp in distance_paths} + image_paths = {int(ip.split(".")[-3]): ip for ip in image_paths} + image_ids = set(distance_paths.keys()).intersection( + image_paths.keys() + ) + + if len(image_ids) == 0: + print("No shared image/depth paths for scene" + scene_name) + continue + + camera_root = scene_root / "_detail" / camera_name + camera_positions_hdf5_file = camera_root / "camera_keyframe_positions.hdf5" + camera_orientations_hdf5_file = ( + camera_root / "camera_keyframe_orientations.hdf5" + ) + with ( + h5py.File(camera_positions_hdf5_file, "r") as h5_pos, + h5py.File(camera_orientations_hdf5_file, "r") as h5_rots, + ): # type: ignore + camera_positions: np.ndarray = h5_pos["dataset"][:] # type: ignore + rots: np.ndarray = h5_rots["dataset"][:] # type: ignore + rots = rots.transpose((0, 2, 1)) + translations = -rots @ camera_positions[..., None] + poses = np.zeros((len(rots), 4, 4)) + poses[:, 3, 3] = 1.0 + poses[:, :3, :3] = R[None] @ rots + poses[:, :3, 3:] = R[None] @ translations + + idx_to_image_id = { + idx: img_id for idx, img_id in enumerate(image_ids) + } + image_id_to_idx = { + img_id: idx for idx, img_id in enumerate(image_ids) + } + + # K_fixed = K.copy() + # K_fixed[0,2], K_fixed[1,2] = K[1,2], K[0,2] + intrinsic = torch.tensor(K).reshape(3, 3).float() + + sequence_data = [] + for img_id in image_ids: + T = torch.tensor(poses[img_id]).float() + im_path = Path(image_paths[img_id]) + depth_path = Path(distance_paths[img_id]) + # distance = ( + # torch.tensor(load_distance(depth_path)).float() + # / meters_per_asset + # ) + + # depth = depth_from_distance(distance, intrinsic).float() + # depth[depth.isnan()] = 0 + + + T_w2c = T[:3].numpy().tolist() + frame_data = { 
+ "filepath": im_path.as_posix().split('downloads/')[1], + "extri": T_w2c, + "meters_per_asset": meters_per_asset, + "intri": intrinsic.numpy().tolist(), + "depthpath": depth_path.as_posix().split('downloads/')[1], + } + sequence_data.append(frame_data) + out[scene_name] = sequence_data + +root = "/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt" + +with gzip.open(root+"/annotations/hypersim/train.jgz", "wt", encoding="utf-8") as f: + json.dump(out, f, ensure_ascii=False, indent=4) + +print(f"Processed {len(out)} scenes with a total of {sum(len(v) for v in out.values())} images.") + diff --git a/training/data/preprocess/megadepth/generate_sequences.py b/training/data/preprocess/megadepth/generate_sequences.py new file mode 100644 index 00000000..0cd40f8d --- /dev/null +++ b/training/data/preprocess/megadepth/generate_sequences.py @@ -0,0 +1,80 @@ +import numpy as np +import json +import gzip +from tqdm import tqdm + + +# See https://github.com/facebookresearch/vggt/issues/82 +# and https://github.com/facebookresearch/vggt/issues/216#issuecomment-3053586858 + +def sample_topk_sequences(overlap_matrix, image_paths, sequence_length=256, num_sequences=1000): + n_images = overlap_matrix.shape[0] + sequences = [] + + for _ in range(num_sequences): + # Randomly pick an anchor image + anchor = np.random.randint(n_images) + + overlaps = overlap_matrix[anchor] + # Exclude invalid entries (e.g., -1) + valid_mask = overlaps >= 0 + valid_mask[anchor] = False # don't include self + + valid_indices = np.where(valid_mask)[0] + if len(valid_indices) < sequence_length - 1: + continue # skip if not enough neighbors + + # Sort by overlap descending + sorted_neighbors = valid_indices[np.argsort(-overlaps[valid_indices])] + + # Pick top-k + selected_neighbors = sorted_neighbors[:sequence_length - 1] + + # Form the sequence: anchor + top neighbors + sequence = [anchor] + selected_neighbors.tolist() + + # print(image_paths[sequence]) # Access image paths for the sequence + sequence = 
[{ + "filepath": p, + "id": s + } for p, s in zip(image_paths[sequence], sequence)] + sequences.append(sequence) + + return sequences + +with open("train_scenes.txt", "r") as f: + train_scenes = [line.strip() for line in f.readlines()] +with open("valid_scenes.txt", "r") as f: + val_scenes = [line.strip() for line in f.readlines()] + +for split in ["train", "val"]: + if split == "train": + scenes = train_scenes + else: + scenes = val_scenes + + + out = {} + for scene in tqdm(scenes): + try: + data = np.load(f"/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations/megadepth/scene_info/{scene}.npz", allow_pickle=True) + print('Data keys:', data.keys()) + print('Depth paths: ', data['depth_paths']) + overlap_matrix = data['overlap_matrix'] + image_paths = data['image_paths'] + print('Data: ', data) + + sequences = sample_topk_sequences(overlap_matrix, image_paths, sequence_length=256, num_sequences=1000) + out[scene] = sequences + except FileNotFoundError: + print(f"File not found for scene {scene}. 
Skipping...") + continue + + # root = "/mimer/NOBACKUP/groups/snic2022-6-266/davnords/mv-ssl" + # with open(root+f"/annotations/megadepth/{split}.json", "w") as f: + # json.dump(out, f, indent=4) # `indent=4` makes it pretty-printed + + # with gzip.open(root+f"/annotations/megadepth/{split}.jgz", "wt", encoding="utf-8") as f: + # json.dump(out, f, ensure_ascii=False, indent=4) + + # print(f"Processed {len(out)} scenes with a total of {sum(len(v) for v in out.values())} images.") \ No newline at end of file diff --git a/training/data/preprocess/megadepth/load.py b/training/data/preprocess/megadepth/load.py new file mode 100644 index 00000000..de124456 --- /dev/null +++ b/training/data/preprocess/megadepth/load.py @@ -0,0 +1,251 @@ +import os +import numpy as np +import shutil +import json +import random +from collections import defaultdict +from typing import List, Dict, Set, Tuple +import gzip + +def check_file_exists(path: str) -> bool: + """Check if file exists and is readable""" + return os.path.exists(path) and os.path.isfile(path) + +def build_graph_from_pairs(pairs: List[Tuple[int, int]], overlaps: np.ndarray, + image_paths: List[str], depth_paths: List[str], + min_overlap: float, max_overlap: float) -> Dict[int, List[int]]: + """Build adjacency graph from valid pairs with overlap filtering""" + graph = defaultdict(list) + + for i, (idx1, idx2) in enumerate(pairs): + overlap = overlaps[i] + + # Check overlap constraints + if overlap < min_overlap or overlap > max_overlap: + continue + + # Check if files exist + img1_exists = check_file_exists(os.path.join(data_root, image_paths[idx1])) + img2_exists = check_file_exists(os.path.join(data_root, image_paths[idx2])) + depth1_exists = check_file_exists(os.path.join(data_root, depth_paths[idx1])) + depth2_exists = check_file_exists(os.path.join(data_root, depth_paths[idx2])) + + # Only add edge if both frames have valid files + if img1_exists and img2_exists and depth1_exists and depth2_exists: + 
graph[idx1].append(idx2) + graph[idx2].append(idx1) + + return graph + +def generate_sequence(graph: Dict[int, List[int]], start_node: int, + target_length: int, used_nodes: Set[int]) -> List[int]: + """Generate a sequence by random walk, avoiding already used nodes when possible""" + sequence = [start_node] + current = start_node + local_used = {start_node} + + for _ in range(target_length - 1): + if current not in graph or not graph[current]: + break + + # Get neighbors, prefer unused ones + neighbors = graph[current] + unused_neighbors = [n for n in neighbors if n not in used_nodes and n not in local_used] + + if unused_neighbors: + next_node = random.choice(unused_neighbors) + else: + # Fall back to any neighbor not in current sequence + available = [n for n in neighbors if n not in local_used] + if not available: + break + next_node = random.choice(available) + + sequence.append(next_node) + local_used.add(next_node) + current = next_node + + return sequence + +def create_sequences_for_scene(scene_info: Dict, scene_name: str, + min_overlap: float, max_overlap: float, + num_sequences: int = 1000, sequence_length: int = 24) -> List[List[Dict]]: + """Create diverse sequences for a scene""" + + image_paths = scene_info["image_paths"] + depth_paths = scene_info["depth_paths"] + intrinsics = scene_info["intrinsics"] + poses = scene_info["poses"] + pairs = scene_info["pairs"] + overlaps = scene_info["overlaps"] + + print(f'Scene {scene_name}: {len(pairs)} pairs, {overlaps.shape[0]} overlaps') + + # Build graph from valid pairs + graph = build_graph_from_pairs(pairs, overlaps, image_paths, depth_paths, + min_overlap, max_overlap) + + if not graph: + print(f"No valid pairs found for scene {scene_name}") + return [] + + print(f'Built graph with {len(graph)} nodes') + + sequences = [] + used_nodes = set() + max_attempts = num_sequences * 3 # Allow some failed attempts + attempts = 0 + + # Get nodes with good connectivity for starting points + node_degrees = [(node, 
len(neighbors)) for node, neighbors in graph.items()] + node_degrees.sort(key=lambda x: x[1], reverse=True) + good_start_nodes = [node for node, degree in node_degrees if degree >= 2] + + if not good_start_nodes: + good_start_nodes = list(graph.keys()) + + while len(sequences) < num_sequences and attempts < max_attempts: + attempts += 1 + + # Choose starting node with preference for unused nodes + unused_start_nodes = [n for n in good_start_nodes if n not in used_nodes] + if unused_start_nodes: + start_node = random.choice(unused_start_nodes) + else: + start_node = random.choice(good_start_nodes) + + # Generate sequence + sequence_indices = generate_sequence(graph, start_node, sequence_length, used_nodes) + + if len(sequence_indices) >= sequence_length // 2: # Accept if at least half the target length + # Create sequence with all metadata + sequence_frames = [] + for frame_idx in sequence_indices: + frame_data = { + 'frame_idx': int(frame_idx), + 'filepath': image_paths[frame_idx], + 'depth_path': depth_paths[frame_idx], + 'intri': intrinsics[frame_idx].tolist(), + 'extri': poses[frame_idx][:3, :].tolist() + } + sequence_frames.append(frame_data) + + sequences.append(sequence_frames) + used_nodes.update(sequence_indices) + + if len(sequences) % 100 == 0: + print(f'Generated {len(sequences)} sequences') + + # Reset used nodes occasionally to allow more diversity + if attempts % (num_sequences // 4) == 0: + used_nodes = set() + + print(f'Generated {len(sequences)} sequences for scene {scene_name}') + return sequences + +def save_sequences(sequences: List[List[Dict]], scene_out_dir: str): + """Save sequences to disk with file copying and metadata""" + + for seq_idx, seq in enumerate(sequences): + seq_dir = os.path.join(scene_out_dir, f"sequence_{seq_idx:03d}") + os.makedirs(seq_dir, exist_ok=True) + + metadata = [] + valid_frames = [] + + for frame_idx, frame_data in enumerate(seq): + # Double-check files exist before copying + img_src = frame_data["image_path"] + 
depth_src = frame_data["depth_path"] + + if not (check_file_exists(img_src) and check_file_exists(depth_src)): + print(f"Warning: Skipping frame {frame_idx} in sequence {seq_idx} - missing files") + continue + + # Copy files + img_dst = os.path.join(seq_dir, f"{len(valid_frames):03d}.jpg") + depth_dst = os.path.join(seq_dir, f"{len(valid_frames):03d}.npy") + + try: + shutil.copy(img_src, img_dst) + valid_frames.append(frame_data) + + except Exception as e: + print(f"Error copying files for sequence {seq_idx}, frame {frame_idx}: {e}") + continue + + # Save metadata + if metadata: + metadata_file = os.path.join(seq_dir, "metadata.json") + with open(metadata_file, 'w') as f: + json.dump({ + 'sequence_length': len(metadata), + 'frames': metadata + }, f, indent=2) + +from tqdm import tqdm +# Main execution +data_root = "/mimer/NOBACKUP/groups/snic2022-6-266/data/megadepth" +scene_info_root = os.path.join(data_root, "prep_scene_info") +all_scenes = os.listdir(scene_info_root) +test_scenes = ["0017.npy", "0004.npy", "0048.npy", "0013.npy"] +out_root = "sequences_out" +os.makedirs(out_root, exist_ok=True) + +split = "test" + +if split == "train": + scene_names = set(all_scenes) - set(test_scenes) + +elif split == "test": + scene_names = test_scenes + +min_overlap = 0.01 +max_overlap = 1.0 + +# Set random seed for reproducibility +random.seed(42) +np.random.seed(42) + +result = {} + +for scene_name in tqdm(scene_names): + print(f"\nProcessing scene: {scene_name}") + + try: + scene_info = np.load( + os.path.join(scene_info_root, scene_name), allow_pickle=True + ).item() + + scene_name_clean = os.path.splitext(scene_name)[0] + scene_name_out = f"{scene_name_clean}_{min_overlap}_{max_overlap}" + + # Create sequences + sequences = create_sequences_for_scene( + scene_info, scene_name_clean, min_overlap, max_overlap, + num_sequences=500, sequence_length=24 + ) + result[scene_name_clean] = sequences + print(f"Total sequences for scene {scene_name_clean}: {len(sequences)}") + # if 
sequences: + # # Create output directory + # scene_out_dir = os.path.join(out_root, scene_name_out) + # os.makedirs(scene_out_dir, exist_ok=True) + + # # Save sequences + # save_sequences(sequences, scene_out_dir) + + # print(f"Saved {len(sequences)} sequences for scene {scene_name_clean}") + # else: + # print(f"No valid sequences generated for scene {scene_name_clean}") + + except Exception as e: + print(f"Error processing scene {scene_name}: {e}") + continue + +# Save as .jgz + +with gzip.open(f"/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/annotations/megadepth/{split}.jgz", "wt", encoding="utf-8") as f: + json.dump(result, f, ensure_ascii=False, indent=4) + +print("\nSequence generation completed!") \ No newline at end of file diff --git a/training/data/preprocess/megadepth/preprocess_scene.py b/training/data/preprocess/megadepth/preprocess_scene.py new file mode 100644 index 00000000..59d6c106 --- /dev/null +++ b/training/data/preprocess/megadepth/preprocess_scene.py @@ -0,0 +1,241 @@ +import argparse + +import numpy as np + +import os + +parser = argparse.ArgumentParser(description='MegaDepth preprocessing script') + +parser.add_argument( + '--base_path', type=str, required=True, + help='path to MegaDepth' +) +parser.add_argument( + '--scene_id', type=str, required=True, + help='scene ID' +) + +parser.add_argument( + '--output_path', type=str, required=True, + help='path to the output directory' +) + +args = parser.parse_args() + +base_path = args.base_path +# Remove the trailing / if need be. 
+if base_path[-1] in ['/', '\\']: + base_path = base_path[: - 1] +scene_id = args.scene_id + +base_depth_path = os.path.join( + base_path, 'phoenix/S6/zl548/MegaDepth_v1' +) +base_undistorted_sfm_path = os.path.join( + base_path, 'Undistorted_SfM' +) + +undistorted_sparse_path = os.path.join( + base_undistorted_sfm_path, scene_id, 'sparse-txt' +) +if not os.path.exists(undistorted_sparse_path): + exit() + +depths_path = os.path.join( + base_depth_path, scene_id, 'dense0', 'depths' +) +if not os.path.exists(depths_path): + exit() + +images_path = os.path.join( + base_undistorted_sfm_path, scene_id, 'images' +) +if not os.path.exists(images_path): + exit() + +# Process cameras.txt +with open(os.path.join(undistorted_sparse_path, 'cameras.txt'), 'r') as f: + raw = f.readlines()[3 :] # skip the header + +camera_intrinsics = {} +for camera in raw: + camera = camera.split(' ') + camera_intrinsics[int(camera[0])] = [float(elem) for elem in camera[2 :]] + +# Process points3D.txt +with open(os.path.join(undistorted_sparse_path, 'points3D.txt'), 'r') as f: + raw = f.readlines()[3 :] # skip the header + +points3D = {} +for point3D in raw: + point3D = point3D.split(' ') + points3D[int(point3D[0])] = np.array([ + float(point3D[1]), float(point3D[2]), float(point3D[3]) + ]) + +# Process images.txt +with open(os.path.join(undistorted_sparse_path, 'images.txt'), 'r') as f: + raw = f.readlines()[4 :] # skip the header + +image_id_to_idx = {} +image_names = [] +raw_pose = [] +camera = [] +points3D_id_to_2D = [] +n_points3D = [] +for idx, (image, points) in enumerate(zip(raw[:: 2], raw[1 :: 2])): + image = image.split(' ') + points = points.split(' ') + + image_id_to_idx[int(image[0])] = idx + + image_name = image[-1].strip('\n') + image_names.append(image_name) + + raw_pose.append([float(elem) for elem in image[1 : -2]]) + camera.append(int(image[-2])) + current_points3D_id_to_2D = {} + for x, y, point3D_id in zip(points[:: 3], points[1 :: 3], points[2 :: 3]): + if int(point3D_id) 
== -1: + continue + current_points3D_id_to_2D[int(point3D_id)] = [float(x), float(y)] + points3D_id_to_2D.append(current_points3D_id_to_2D) + n_points3D.append(len(current_points3D_id_to_2D)) +n_images = len(image_names) + +# Image and depthmaps paths +image_paths = [] +depth_paths = [] +for image_name in image_names: + image_path = os.path.join(images_path, image_name) + + # Path to the depth file + depth_path = os.path.join( + depths_path, '%s.h5' % os.path.splitext(image_name)[0] + ) + + if os.path.exists(depth_path): + # Check if depth map or background / foreground mask + file_size = os.stat(depth_path).st_size + # Rough estimate - 75KB might work as well + if file_size < 100 * 1024: + depth_paths.append(None) + image_paths.append(None) + else: + depth_paths.append(depth_path[len(base_path) + 1 :]) + image_paths.append(image_path[len(base_path) + 1 :]) + else: + print('ERROR: Depth path does not exist: %s' % depth_path) + depth_paths.append(None) + image_paths.append(None) + +# Camera configuration +intrinsics = [] +poses = [] +principal_axis = [] +points3D_id_to_ndepth = [] +for idx, image_name in enumerate(image_names): + if image_paths[idx] is None: + intrinsics.append(None) + poses.append(None) + principal_axis.append([0, 0, 0]) + points3D_id_to_ndepth.append({}) + continue + image_intrinsics = camera_intrinsics[camera[idx]] + K = np.zeros([3, 3]) + K[0, 0] = image_intrinsics[2] + K[0, 2] = image_intrinsics[4] + K[1, 1] = image_intrinsics[3] + K[1, 2] = image_intrinsics[5] + K[2, 2] = 1 + intrinsics.append(K) + + image_pose = raw_pose[idx] + qvec = image_pose[: 4] + qvec = qvec / np.linalg.norm(qvec) + w, x, y, z = qvec + R = np.array([ + [ + 1 - 2 * y * y - 2 * z * z, + 2 * x * y - 2 * z * w, + 2 * x * z + 2 * y * w + ], + [ + 2 * x * y + 2 * z * w, + 1 - 2 * x * x - 2 * z * z, + 2 * y * z - 2 * x * w + ], + [ + 2 * x * z - 2 * y * w, + 2 * y * z + 2 * x * w, + 1 - 2 * x * x - 2 * y * y + ] + ]) + principal_axis.append(R[2, :]) + t = image_pose[4 : 7] + # 
World-to-Camera pose + current_pose = np.zeros([4, 4]) + current_pose[: 3, : 3] = R + current_pose[: 3, 3] = t + current_pose[3, 3] = 1 + # Camera-to-World pose + # pose = np.zeros([4, 4]) + # pose[: 3, : 3] = np.transpose(R) + # pose[: 3, 3] = -np.matmul(np.transpose(R), t) + # pose[3, 3] = 1 + poses.append(current_pose) + + current_points3D_id_to_ndepth = {} + for point3D_id in points3D_id_to_2D[idx].keys(): + p3d = points3D[point3D_id] + current_points3D_id_to_ndepth[point3D_id] = (np.dot(R[2, :], p3d) + t[2]) / (.5 * (K[0, 0] + K[1, 1])) + points3D_id_to_ndepth.append(current_points3D_id_to_ndepth) +principal_axis = np.array(principal_axis) +angles = np.rad2deg(np.arccos( + np.clip( + np.dot(principal_axis, np.transpose(principal_axis)), + -1, 1 + ) +)) + +# Compute overlap score +overlap_matrix = np.full([n_images, n_images], -1.) +scale_ratio_matrix = np.full([n_images, n_images], -1.) +for idx1 in range(n_images): + if image_paths[idx1] is None or depth_paths[idx1] is None: + continue + for idx2 in range(idx1 + 1, n_images): + if image_paths[idx2] is None or depth_paths[idx2] is None: + continue + matches = ( + points3D_id_to_2D[idx1].keys() & + points3D_id_to_2D[idx2].keys() + ) + min_num_points3D = min( + len(points3D_id_to_2D[idx1]), len(points3D_id_to_2D[idx2]) + ) + overlap_matrix[idx1, idx2] = len(matches) / len(points3D_id_to_2D[idx1]) # min_num_points3D + overlap_matrix[idx2, idx1] = len(matches) / len(points3D_id_to_2D[idx2]) # min_num_points3D + if len(matches) == 0: + continue + points3D_id_to_ndepth1 = points3D_id_to_ndepth[idx1] + points3D_id_to_ndepth2 = points3D_id_to_ndepth[idx2] + nd1 = np.array([points3D_id_to_ndepth1[match] for match in matches]) + nd2 = np.array([points3D_id_to_ndepth2[match] for match in matches]) + min_scale_ratio = np.min(np.maximum(nd1 / nd2, nd2 / nd1)) + scale_ratio_matrix[idx1, idx2] = min_scale_ratio + scale_ratio_matrix[idx2, idx1] = min_scale_ratio + +np.savez( + os.path.join(args.output_path, '%s.npz' % 
scene_id), + image_paths=image_paths, + depth_paths=depth_paths, + intrinsics=intrinsics, + poses=poses, + overlap_matrix=overlap_matrix, + scale_ratio_matrix=scale_ratio_matrix, + angles=angles, + n_points3D=n_points3D, + points3D_id_to_2D=points3D_id_to_2D, + points3D_id_to_ndepth=points3D_id_to_ndepth +) \ No newline at end of file diff --git a/training/data/preprocess/megadepth/preprocess_undistorted_megadepth.sh b/training/data/preprocess/megadepth/preprocess_undistorted_megadepth.sh new file mode 100644 index 00000000..c983ee46 --- /dev/null +++ b/training/data/preprocess/megadepth/preprocess_undistorted_megadepth.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +if [[ $# != 2 ]]; then + echo 'Usage: bash preprocess_megadepth.sh /path/to/megadepth /output/path' + exit +fi + +export dataset_path=$1 +export output_path=$2 + +mkdir $output_path +echo 0 +ls $dataset_path/Undistorted_SfM | xargs -P 8 -I % sh -c 'echo %; python preprocess_scene.py --base_path $dataset_path --scene_id % --output_path $output_path' \ No newline at end of file diff --git a/training/data/preprocess/megadepth/train_scenes.txt b/training/data/preprocess/megadepth/train_scenes.txt new file mode 100644 index 00000000..83c78d3a --- /dev/null +++ b/training/data/preprocess/megadepth/train_scenes.txt @@ -0,0 +1,117 @@ +0000 +0001 +0002 +0003 +0004 +0005 +0007 +0008 +0011 +0012 +0013 +0015 +0017 +0019 +0020 +0021 +0022 +0023 +0024 +0025 +0026 +0027 +0032 +0035 +0036 +0037 +0039 +0042 +0043 +0046 +0048 +0050 +0056 +0057 +0060 +0061 +0063 +0065 +0070 +0080 +0083 +0086 +0087 +0095 +0098 +0100 +0101 +0103 +0104 +0105 +0107 +0115 +0117 +0122 +0130 +0137 +0143 +0147 +0148 +0149 +0150 +0156 +0160 +0176 +0183 +0189 +0190 +0200 +0214 +0224 +0235 +0237 +0240 +0243 +0258 +0265 +0269 +0299 +0312 +0326 +0327 +0331 +0335 +0341 +0348 +0366 +0377 +0380 +0394 +0407 +0411 +0430 +0446 +0455 +0472 +0474 +0476 +0478 +0493 +0494 +0496 +0505 +0559 +0733 +0860 +1017 +1589 +4541 +5004 +5005 +5006 +5007 +5009 +5010 +5012 
+5013 +5017 \ No newline at end of file diff --git a/training/data/preprocess/megadepth/undistort_reconstructions.py b/training/data/preprocess/megadepth/undistort_reconstructions.py new file mode 100644 index 00000000..a6b99a72 --- /dev/null +++ b/training/data/preprocess/megadepth/undistort_reconstructions.py @@ -0,0 +1,69 @@ +import argparse + +import imagesize + +import os + +import subprocess + +parser = argparse.ArgumentParser(description='MegaDepth Undistortion') + +parser.add_argument( + '--colmap_path', type=str, required=True, + help='path to colmap executable' +) +parser.add_argument( + '--base_path', type=str, required=True, + help='path to MegaDepth' +) + +args = parser.parse_args() + +sfm_path = os.path.join( + args.base_path, 'MegaDepth_v1_SfM' +) +base_depth_path = os.path.join( + args.base_path, 'phoenix/S6/zl548/MegaDepth_v1' +) +output_path = os.path.join( + args.base_path, 'Undistorted_SfM' +) + +os.mkdir(output_path) + +for scene_name in os.listdir(base_depth_path): + current_output_path = os.path.join(output_path, scene_name) + os.mkdir(current_output_path) + + image_path = os.path.join( + base_depth_path, scene_name, 'dense0', 'imgs' + ) + if not os.path.exists(image_path): + continue + + # Find the maximum image size in scene. + max_image_size = 0 + for image_name in os.listdir(image_path): + max_image_size = max( + max_image_size, + max(imagesize.get(os.path.join(image_path, image_name))) + ) + + # Undistort the images and update the reconstruction. + subprocess.call([ + os.path.join(args.colmap_path, 'colmap'), 'image_undistorter', + '--image_path', os.path.join(sfm_path, scene_name, 'images'), + '--input_path', os.path.join(sfm_path, scene_name, 'sparse', 'manhattan', '0'), + '--output_path', current_output_path, + '--max_image_size', str(max_image_size) + ]) + + # Transform the reconstruction to raw text format. 
+ sparse_txt_path = os.path.join(current_output_path, 'sparse-txt') + os.mkdir(sparse_txt_path) + subprocess.call([ + os.path.join(args.colmap_path, 'colmap'), 'model_converter', + '--input_path', os.path.join(current_output_path, 'sparse'), + '--output_path', sparse_txt_path, + '--output_type', 'TXT' + ]) \ No newline at end of file diff --git a/training/data/preprocess/megadepth/valid_scenes.txt b/training/data/preprocess/megadepth/valid_scenes.txt new file mode 100644 index 00000000..c9e35b50 --- /dev/null +++ b/training/data/preprocess/megadepth/valid_scenes.txt @@ -0,0 +1,77 @@ +0016 +0033 +0034 +0041 +0044 +0047 +0049 +0058 +0062 +0064 +0067 +0071 +0076 +0078 +0090 +0094 +0099 +0102 +0121 +0129 +0133 +0141 +0151 +0162 +0168 +0175 +0177 +0178 +0181 +0185 +0186 +0197 +0204 +0205 +0209 +0212 +0217 +0223 +0229 +0231 +0238 +0252 +0257 +0271 +0275 +0277 +0281 +0285 +0286 +0290 +0294 +0303 +0306 +0307 +0323 +0349 +0360 +0387 +0389 +0402 +0406 +0412 +0443 +0482 +0768 +1001 +3346 +5000 +5001 +5002 +5003 +5008 +5011 +5014 +5015 +5016 +5018 \ No newline at end of file diff --git a/training/data/preprocess/mvssynth.py b/training/data/preprocess/mvssynth.py new file mode 100644 index 00000000..b4d28f15 --- /dev/null +++ b/training/data/preprocess/mvssynth.py @@ -0,0 +1,77 @@ +from pathlib import Path +import json +import gzip +import numpy as np +import torch +from tqdm import tqdm + +def read_img_depth_pose(pose_path): + with open(pose_path) as f: + r_info = json.load(f) + c_x = r_info["c_x"] + c_y = r_info["c_y"] + f_x = r_info["f_x"] + f_y = r_info["f_y"] + extrinsic = np.array(r_info["extrinsic"]) + # extrinsic = inv(extrinsic) + + # This is only for GTA 540 + f_x = f_x * 810 / 1920 + + K = np.array([[f_x, 0, c_x], [0, f_y, c_y], [0,0,1]]) + return K, extrinsic + +# Root folder where everything starts +root = Path("/mimer/NOBACKUP/groups/3d-dl/MVS-Synth/GTAV_540") + +out = {} + +for scene_dir in tqdm(root.iterdir()): + + if scene_dir.name.startswith('num_images'): + 
continue + + frames = sorted([p.name for p in (scene_dir / "images").iterdir() if p.suffix == ".png"]) + sequence_data = [] + for frame in frames: + pose_path = scene_dir / "poses" / (frame.replace(".png", ".json")) + depth_path = scene_dir / "depths" / (frame.replace(".png", ".exr")) + + # with open(pose_path) as f: + # cam = json.load(f) + + K, pose_w2c = read_img_depth_pose(pose_path) + # extrinsic_4x4 = np.array(cam["extrinsic"], dtype=np.float32) + # # extrinsic_4x4 = np.linalg.inv(extrinsic_4x4) + # R = extrinsic_4x4[:3, :3] + # t = extrinsic_4x4[:3, 3] + + # if np.linalg.det(R) < 0: + # R[:, 2] *= -1 + # t[2] *= -1 + # pose_w2c = np.hstack([R, t.reshape(3, 1)]) + + # K = np.array([ + # [cam["f_x"], 0, cam["c_x"]], + # [0, cam["f_y"], cam["c_y"]], + # [0, 0, 1] + # ], dtype=np.float32) + + # pose_w2c = read_scannet_pose(pose_path) + frame_data = { + "filepath": f"{scene_dir.name}/images/{frame}", + "extri": pose_w2c[:3].tolist(), + "intri": K.tolist(), + "depthpath": f"{scene_dir.name}/depths/{frame.replace('.png', '.exr')}", + } + sequence_data.append(frame_data) + + out[scene_dir.name] = sequence_data + +root = "/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt" + +with gzip.open(root+"/annotations/mvssynth/train.jgz", "wt", encoding="utf-8") as f: + json.dump(out, f, ensure_ascii=False, indent=4) + +print(f"Processed {len(out)} scenes with a total of {sum(len(v) for v in out.values())} images.") + diff --git a/training/data/preprocess/pointodyssey.py b/training/data/preprocess/pointodyssey.py new file mode 100644 index 00000000..00fbf772 --- /dev/null +++ b/training/data/preprocess/pointodyssey.py @@ -0,0 +1,42 @@ +from pathlib import Path +import json +import gzip +import numpy as np +import torch +from tqdm import tqdm + +# Root folder where everything starts +root = Path("/mimer/NOBACKUP/groups/3d-dl/pointodyssey") + +out = {} + +for scene_dir in tqdm(root.iterdir()): + + frames = sorted([p.name for p in (scene_dir / "rgbs").iterdir() if p.suffix == 
".jpg"]) + sequence_data = [] + + # info = np.load(scene_dir / "info.npz") + anno = np.load(scene_dir / "anno.npz") + + intrinsics = anno['intrinsics'] + extrinsics = anno['extrinsics'] + + for i, frame in enumerate(frames): + depth_path = scene_dir / "depths" / (frame.replace("rgb", ".depth").replace(".jpg", ".png")) + + frame_data = { + "filepath": f"{scene_dir.name}/rgbs/{frame}", + "extri": extrinsics[i][:3].tolist(), + "intri": intrinsics[i].tolist(), + "depthpath": f"{scene_dir.name}/depths/{frame.replace('rgb', 'depth').replace('.jpg', '.png')}", + } + sequence_data.append(frame_data) + out[scene_dir.name] = sequence_data + +root = "/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt" + +with gzip.open(root+"/annotations/pointodyssey.jgz", "wt", encoding="utf-8") as f: + json.dump(out, f, ensure_ascii=False, indent=4) + +print(f"Processed {len(out)} scenes with a total of {sum(len(v) for v in out.values())} images.") + diff --git a/training/data/preprocess/prepare_eth3d.py b/training/data/preprocess/prepare_eth3d.py new file mode 100644 index 00000000..68a0e9c7 --- /dev/null +++ b/training/data/preprocess/prepare_eth3d.py @@ -0,0 +1,249 @@ +# The scripts provided here are for reference only. Please ensure you have obtained the necessary licenses from the original dataset providers before proceeding. 
+ +import os +import os.path as osp +import cv2 +import numpy as np + +from PIL import Image +from scipy.spatial.transform import Rotation as R +from tqdm import tqdm + +def read_cameras_txt(path): + cameras = {} + with open(path, "r") as f: + for line in f: + # skip comment + if line.startswith("#"): + continue + parts = line.strip().split() + camera_id = int(parts[0]) + model = parts[1] + width = int(parts[2]) + height = int(parts[3]) + + # parse camera parameters + if model == "SIMPLE_PINHOLE": + # f, cx, cy + params = np.array(list(map(float, parts[4:]))) + fx = fy = params[0] + cx = params[1] + cy = params[2] + elif model == "PINHOLE": + # fx, fy, cx, cy + params = np.array(list(map(float, parts[4:]))) + fx = params[0] + fy = params[1] + cx = params[2] + cy = params[3] + elif model == "THIN_PRISM_FISHEYE": + # fx, fy, cx, cy, k1, k2, p1, p2, k3, k4, sx1, sy1 + params = np.array(list(map(float, parts[4:]))) + fx, fy, cx, cy = params[0], params[1], params[2], params[3] + dist_params = { + 'k1': params[4], 'k2': params[5], + 'p1': params[6], 'p2': params[7], + 'k3': params[8], 'k4': params[9], + 'sx1': params[10], 'sy1': params[11] + } + else: + print(f"Warning: camera model {model} is not supported yet") + continue + + K = np.array([ + [fx, 0, cx], + [0, fy, cy], + [0, 0, 1] + ]) + + cameras[camera_id] = { + 'K': K, + 'dist_params': dist_params, + 'model': model, + 'width': width, + 'height': height + } + return cameras + + +def read_images_txt(path): + images = {} + with open(path, "r") as f: + lines = f.readlines() + for i in range(0, len(lines), 2): + # skip comment lines + if lines[i].startswith("#"): + if "Number of images" in lines[i]: + i = 2 + else: + continue + + # first line: extrinsics + line1_parts = lines[i].strip().split() + image_id = int(line1_parts[0]) + # (qw, qx, qy, qz) + qvec = np.array(list(map(float, line1_parts[1:5]))) + # (tx, ty, tz) + tvec = np.array(list(map(float, line1_parts[5:8]))) + camera_id = int(line1_parts[8]) + image_name = 
line1_parts[9]

            # COLMAP (W, X, Y, Z)
            # Scipy Rotation (X, Y, Z, W)
            # Reorder the quaternion components before handing them to scipy.
            rotation = R.from_quat([qvec[1], qvec[2], qvec[3], qvec[0]])

            # get rotation matrix R and translation T, w2c
            # P_camera = R * P_world + T
            R_matrix = rotation.as_matrix()

            # One entry per image: world-to-camera rotation/translation plus
            # the camera id (index into cameras.txt) and relative image name.
            images[image_id] = {
                'R': R_matrix,
                'T': tvec,
                'camera_id': camera_id,
                'name': image_name
            }
    return images


if __name__ == '__main__':
    data_root = '/mimer/NOBACKUP/groups/3d-dl/eth3d'
    # sequences = [seq for seq in os.listdir('data/eth3d') if os.path.isdir(os.path.join('data/eth3d', seq))]
    # print(sequences)
    # Hard-coded list of the ETH3D DSLR training scenes to process.
    sequences = ["courtyard", "delivery_area", "electro", "facade", "kicker", "meadow", "office", "pipes", "playground", "relief", "relief_2", "terrace", "terrains"]

    # setup_debug()

    for seq in tqdm(sequences, desc="Processing sequences"):
        # Parse the COLMAP calibration text files shipped with each scene.
        cameras_intrinsics = read_cameras_txt(osp.join(data_root, seq, 'dslr_calibration_jpg', 'cameras.txt'))
        images_extrinsics = read_images_txt(osp.join(data_root, seq, 'dslr_calibration_jpg', 'images.txt'))

        idxs = sorted(list(images_extrinsics.keys()))

        # Undistorted outputs are written next to the originals under
        # 'custom_undistorted' subdirectories.
        output_image_dir = os.path.join(data_root, seq, 'images', 'custom_undistorted')
        output_depth_dir = os.path.join(data_root, seq, 'ground_truth_depth', 'custom_undistorted')

        output_camera_dir = os.path.join(data_root, seq, 'custom_undistorted_cam')
        os.makedirs(output_image_dir, exist_ok=True)
        os.makedirs(output_depth_dir, exist_ok=True)
        os.makedirs(output_camera_dir, exist_ok=True)

        for idx in tqdm(idxs, desc=f"Processing images in {seq}"):
            meta = images_extrinsics[idx]

            # Skip images that were already undistorted in a previous run.
            # NOTE(review): meta['name'].split('/')[1] assumes names are always
            # 'subdir/file.JPG' with exactly one slash — confirm for all scenes.
            output_impath = os.path.join(output_image_dir, meta['name'].split('/')[1])
            if os.path.exists(output_impath):
                continue

            # Fix the depth map path error: idxs is a list, should use meta['name'] or similar index
            # Assume that the depth map and RGB image file names are similar, just with different extensions
            impath = os.path.join(data_root, seq, 'images', meta['name'])
            depthpath =
os.path.join(data_root, seq, 'ground_truth_depth', meta['name'])  # assumed to be a raw .bin-style file — TODO confirm

            # load image and depth
            rgb_image = np.array(Image.open(impath))
            height, width = rgb_image.shape[:2]
            # Depth is a raw float32 dump with one value per pixel, same
            # resolution as the RGB image.
            depthmap = np.fromfile(depthpath, dtype=np.float32).reshape(height, width)

            # load camera params for undistortion
            intrinsic = cameras_intrinsics[meta['camera_id']]['K'].astype(np.float32)
            dist_params_dict = cameras_intrinsics[meta['camera_id']]['dist_params']

            # ##################################################################
            # ### TODO 1: Undistort Image ###
            # ##################################################################
            print(f"Undistorting image {meta['name']}...")

            # Note: cv2.fisheye model primarily uses k1, k2, k3, k4. It ignores tangential (p1, p2) and thin prism (sx1, sy1) distortions.
            # This is an approximation, but it usually works well in practice.
            K = intrinsic
            D = np.array([
                dist_params_dict['k1'],
                dist_params_dict['k2'],
                dist_params_dict['k3'],
                dist_params_dict['k4']
            ])

            # Calculate the undistortion mapping.
            # K_new can be the same as K, or optimized through the balance parameter.
            K_new = K.copy()
            map1, map2 = cv2.fisheye.initUndistortRectifyMap(K, D, np.eye(3), K_new, (width, height), cv2.CV_16SC2)

            # Apply mapping
            rgb_image_undistorted = cv2.remap(
                rgb_image, map1, map2,
                interpolation=cv2.INTER_LINEAR,
                borderMode=cv2.BORDER_CONSTANT
            )

            # ##################################################################
            # ### TODO 2: Undistort Depth ###
            # ##################################################################
            print(f"Undistorting depth for {meta['name']}...")

            # Core idea: For each pixel (u_d, v_d, depth) in the distorted depth map,
            # we back-project it to 3D space, then re-project it onto the undistorted image plane.

            # 1.
Create a grid of pixel coordinates for the distorted image
            v_dist, u_dist = np.indices((height, width))
            pixels_dist = np.stack([u_dist.ravel(), v_dist.ravel()], axis=-1).astype(np.float32)
            pixels_dist = pixels_dist.reshape(-1, 1, 2)  # shape (N, 1, 2), as required by cv2.fisheye.undistortPoints

            # 2. Calculate normalized coordinates in the undistorted camera frame
            # `undistortPoints` will apply the inverse transformation of the fisheye model
            normalized_coords_undistorted = cv2.fisheye.undistortPoints(pixels_dist, K, D)

            # 3. Multiply the normalized coordinates by the depth to get 3D points in camera coordinates
            # (x', y') = normalized_coords_undistorted
            # X = x' * depth, Y = y' * depth, Z = depth
            # NOTE(review): this treats the stored depth as the Z coordinate
            # (plane depth), not ray length — confirm against the dataset spec.
            depth_values = depthmap.ravel()

            # filter out invalid depth values
            valid_mask = np.logical_and(depth_values > 0, np.isfinite(depth_values))

            # undistortPoints returns (N, 1, 2); after ravel() the flat layout
            # alternates x', y', so even indices are x' and odd indices are y'.
            points_3D_X = normalized_coords_undistorted.ravel()[0::2][valid_mask] * depth_values[valid_mask]
            points_3D_Y = normalized_coords_undistorted.ravel()[1::2][valid_mask] * depth_values[valid_mask]
            points_3D_Z = depth_values[valid_mask]

            # 4. Project the 3D points back to the undistorted image plane
            fx_new, fy_new = K_new[0, 0], K_new[1, 1]
            cx_new, cy_new = K_new[0, 2], K_new[1, 2]

            # Standard pinhole projection with the (identical) new intrinsics.
            u_new = (points_3D_X * fx_new / points_3D_Z) + cx_new
            v_new = (points_3D_Y * fy_new / points_3D_Z) + cy_new

            # 5.
Create a sparse depth map + depthmap_undistorted_sparse = np.zeros((height, width), dtype=np.float32) + u_new_int = np.round(u_new).astype(int) + v_new_int = np.round(v_new).astype(int) + + # filter out points that are out of bounds + valid_mask = (u_new_int >= 0) & (u_new_int < width) & \ + (v_new_int >= 0) & (v_new_int < height) + + u_target = u_new_int[valid_mask] + v_target = v_new_int[valid_mask] + z_target = points_3D_Z[valid_mask] + + depthmap_undistorted_sparse[v_target, u_target] = z_target + depthmap_undistorted = depthmap_undistorted_sparse + + output_impath = os.path.join(output_image_dir, meta['name'].split('/')[1]) + output_depthpath = os.path.join(output_depth_dir, meta['name'].split('/')[1]) + + print(f" -> Save Image to: {output_impath}") + Image.fromarray(rgb_image_undistorted).save(output_impath) + + print(f" -> Save Depth Map to: {output_depthpath}") + depthmap_undistorted.astype(np.float32).tofile(output_depthpath) + + extrinsic = np.eye(4) + extrinsic[:3, :3] = meta['R'] + extrinsic[:3, 3] = meta['T'] + + output_cam_path = os.path.join(output_camera_dir, meta['name'].split('/')[1].replace('JPG', 'npz')) + + np.savez( + output_cam_path, + intrinsics=K_new, + extrinsics=extrinsic + ) \ No newline at end of file diff --git a/training/data/preprocess/prepare_eth3d.sh b/training/data/preprocess/prepare_eth3d.sh new file mode 100644 index 00000000..b5557184 --- /dev/null +++ b/training/data/preprocess/prepare_eth3d.sh @@ -0,0 +1,19 @@ + +mkdir -p /mimer/NOBACKUP/groups/3d-dl/eth3d +cd /mimer/NOBACKUP/groups/3d-dl/eth3d + +wget https://www.eth3d.net/data/multi_view_training_dslr_jpg.7z +# install 7zip or p7zip on your system if not already installed +7z x multi_view_training_dslr_jpg.7z -bsp1 +rm multi_view_training_dslr_jpg.7z + +scenes=("courtyard" "delivery_area" "electro" "facade" "kicker" "meadow" "office" "pipes" "playground" "relief" "relief_2" "terrace" "terrains") +for scene in "${scenes[@]}"; do + wget -c 
https://www.eth3d.net/data/${scene}_dslr_depth.7z + 7z x ${scene}_dslr_depth.7z -bsp1 + rm ${scene}_dslr_depth.7z +done + +cd /mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt + +# python training/data/preprocess/prepare_eth3d.py \ No newline at end of file diff --git a/training/data/preprocess/scannet.py b/training/data/preprocess/scannet.py new file mode 100644 index 00000000..6a5a513a --- /dev/null +++ b/training/data/preprocess/scannet.py @@ -0,0 +1,100 @@ +from pathlib import Path +import json +import gzip +import numpy as np +import torch +from tqdm import tqdm + +def read_scannet_pose(path): + """ Read ScanNet's Camera2World pose and transform it to World2Camera. + + Returns: + pose_w2c (np.ndarray): (4, 4) + """ + cam2world = np.loadtxt(path, delimiter=' ') + + if not np.isfinite(cam2world).all(): + return None + + world2cam = np.linalg.inv(cam2world) + return world2cam + + +def read_scannet_intrinsic(path): + """ Read ScanNet's intrinsic matrix and return the 3x3 matrix. + """ + intrinsic = np.loadtxt(path, delimiter=' ') + return torch.tensor(intrinsic[:-1, :-1], dtype = torch.float) + +# Root folder where everything starts +root = Path("/mimer/NOBACKUP/groups/3d-dl/scannet/scans/scans_train") + +out = {} + +chunk_size = 24 + +valid_frames = 0 +invalid_frames = 0 +for scene_dir in tqdm(root.iterdir()): + + intrinsics = read_scannet_intrinsic(scene_dir / "intrinsic/intrinsic_color.txt") + + frames = sorted([p.name for p in (scene_dir / "color").iterdir() if p.suffix == ".jpg"]) + + # Maybe resized undistorted images are too high resolution? 
+ num_frames = len(frames) + + # Since the images are taken in a sequence we will just chunk up the sequences + + sequences = [] + # Calculate how many full chunks we can take, stopping before the last chunk + num_full_chunks = (num_frames - 1) // chunk_size # leave room for overflow in last chunk + + for i in range(num_full_chunks - 1): + sequences.append(frames[i * chunk_size: (i + 1) * chunk_size]) + + # Last chunk gets the rest of the frames + sequences.append(frames[(num_full_chunks - 1) * chunk_size:]) + + + for i, seq in enumerate(sequences): + sequence_data = [] + for frame in seq: + pose_path = scene_dir / "pose" / (frame.replace(".jpg", ".txt")) + pose_w2c = read_scannet_pose(pose_path) + if pose_w2c is None: + print(f"Warning: Pose contains NaN, skipping frame {pose_path}") + invalid_frames += 1 + continue + valid_frames += 1 + R = pose_w2c[:3, :3] + assert not np.isnan(pose_w2c).any(), f"Pose contains NaN: {pose_w2c}" + # print('Determinant of R: ', np.linalg.det(R)) + # assert np.allclose(np.linalg.det(R), 1.0, atol=1e-3), f"Rotation matrix determinant is not 1 but {np.linalg.det(R)}, R is {R}" + + frame_data = { + "filepath": f"{scene_dir.name}/color/{frame}", + "extri": pose_w2c[:3].tolist(), + "intri": intrinsics.tolist(), + "depthpath": f"{scene_dir.name}/depth/{frame.replace('.jpg', '.png')}", + } + # Sanity check + assert len(pose_w2c) == 4 and len(pose_w2c[0]) == 4 + assert len(intrinsics) == 3 and len(intrinsics[0]) == 3 + + sequence_data.append(frame_data) + + out[scene_dir.name+"_"+str(i)] = sequence_data + + print(f" Created {len(sequences)} sequences for {scene_dir.name}") + + +print('Valid frames: ', valid_frames) +print('Invalid frames: ', invalid_frames) +root = "/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt" + +with gzip.open(root+"/annotations/scannet/train.jgz", "wt", encoding="utf-8") as f: + json.dump(out, f, ensure_ascii=False, indent=4) + +print(f"Processed {len(out)} scenes with a total of {sum(len(v) for v in 
def depth_to_points(depth, K, extrinsic, stride=8):
    """Backprojects depth map to 3D world coordinates.

    Args:
        depth: (H, W) depth map; zero entries are treated as invalid.
        K: (3, 3) camera intrinsic matrix.
        extrinsic: (4, 4) transform applied to homogeneous camera points
            (camera-to-world here, given how the result is used).
        stride: sample every ``stride``-th pixel in both directions.

    Returns:
        (N, 3) array of 3D points for the valid samples.
    """
    h, w = depth.shape
    # Regular pixel grid; meshgrid's default 'xy' indexing keeps the same
    # row-major flattening order as the original implementation.
    u_grid, v_grid = np.meshgrid(np.arange(0, w, stride), np.arange(0, h, stride))
    z = depth[v_grid, u_grid]
    keep = z > 0

    homog_pix = np.stack(
        [u_grid[keep], v_grid[keep], np.ones_like(u_grid[keep])], axis=-1
    )

    rays = np.linalg.inv(K) @ homog_pix.T      # (3, N) unit-depth camera rays
    cam = rays * z[keep]                       # scale each ray by its sampled depth
    cam_h = np.vstack((cam, np.ones((1, cam.shape[1]))))
    world = extrinsic @ cam_h
    return world[:3].T
def main():
    """Sanity-check CO3D-style annotations: plot camera poses and one backprojected depth map.

    Side effects: writes cameras.png and scene_with_depth.png to the working directory.
    """
    # === Adjust these paths ===
    root = Path("/mimer/NOBACKUP/groups/3d-dl/co3d_full")
    path = root / "189_20379_35626"
    frame_idx = 0  # which frame of the sequence to backproject

    frame_file = path / "frame_annotations.jgz"
    sequence_file = path / "sequence_annotations.jgz"

    with gzip.open(frame_file, "r") as fin:
        frame_data = json.loads(fin.read())
    with gzip.open(sequence_file, "r") as fin:
        sequence_data = json.loads(fin.read())  # kept for manual inspection

    # BUG was: `seq_data = data[category_sequence]` referenced two undefined names.
    # NOTE(review): this assumes `frame_data` is the *processed* annotation format
    # (dict of sequence_name -> list of {"extri", "intri", "filepath"} entries),
    # not the raw CO3D frame annotations -- confirm against the preprocessing script.
    seq_name = next(iter(frame_data))
    seq_data = frame_data[seq_name]

    # Collect camera extrinsics and intrinsics
    extrinsics = []
    intrinsics = []
    for f in seq_data:
        E = np.array(f["extri"])
        if E.shape == (3, 4):
            # Promote 3x4 [R|t] to a full homogeneous 4x4.
            E = np.vstack([E, [0, 0, 0, 1]])
        extrinsics.append(E)
        intrinsics.append(np.array(f["intri"]))

    # === Visualize camera frustums ===
    print("Visualizing camera poses...")
    plot_scene(extrinsics)
    # NOTE(review): plot_scene() calls plt.show(); with interactive backends the
    # figure may already be closed here, yielding an empty PNG -- consider moving
    # the savefig call inside plot_scene (before show).
    plt.savefig("cameras.png")

    # === Load one depth map and backproject ===
    frame = seq_data[frame_idx]
    # BUG was: the annotation filepath is relative; resolve it against the dataset root.
    img_path = root / frame["filepath"]
    depth_path = Path(str(img_path).replace("/images", "/depths") + ".geometric.png")

    if not depth_path.exists():
        print(f"Depth map not found at {depth_path}")
        return

    print(f"Loading depth: {depth_path}")
    # BUG was: passed scale=1.0, but _load_16big_png_depth takes only the path.
    depth = _load_16big_png_depth(depth_path)
    points_3d = depth_to_points(depth, intrinsics[frame_idx], extrinsics[frame_idx])

    # === Plot cameras + point cloud ===
    print("Rendering 3D scene...")
    plot_scene(extrinsics, points_3d)
    plt.savefig("scene_with_depth.png")
b/training/data/preprocess/wildrgbd.py @@ -0,0 +1,66 @@ +from pathlib import Path +import json +import gzip +import numpy as np +import torch +from tqdm import tqdm + +# Root folder where everything starts +root = Path("/mimer/NOBACKUP/groups/3d-dl/wildrgbd") + +out = {} + +def load_cam_poses(path): + poses = [] + with open(path, "r") as f: + for line in f: + tokens = line.strip().split() + frame_id = int(tokens[0]) + mat = np.array([float(x) for x in tokens[1:]]).reshape(4, 4) + poses.append((frame_id, mat)) + return poses + + +for category_dir in tqdm(root.iterdir()): + if category_dir.name.endswith('.py') or category_dir.name.endswith('.zip') or category_dir.name.startswith('.') or category_dir.name == "chair": + print('Skipping', category_dir.name) + continue + category = category_dir.name + print(f"Processing category: {category}") + for scene_dir in (category_dir / "scenes").iterdir(): + + poses = load_cam_poses(scene_dir / "cam_poses.txt") + + with open(scene_dir / "metadata", "r") as f: + meta = json.load(f) + + # Get the intrinsic matrix + K_flat = meta["K"] # list of 9 numbers + K = np.array(K_flat).reshape(3, 3).T + + frames = sorted([p.name for p in (scene_dir / "rgb").iterdir() if p.suffix == ".png"]) + sequence_data = [] + for i, frame in enumerate(frames): + frame_id, pose = poses[i] + pose = np.linalg.inv(pose) # to world to cam + assert frame_id == i + + depth_path = scene_dir / "depth" / frame + frame_data = { + "filepath": f"{category}/scenes/{scene_dir.name}/rgb/{frame}", + "extri": pose[:3].tolist(), + "intri": K.tolist(), + "depthpath": f"{category}/scenes/{scene_dir.name}/depth/{frame}", + "maskpath": f"{category}/scenes/{scene_dir.name}/masks/{frame}", + } + sequence_data.append(frame_data) + + out[scene_dir.name] = sequence_data + +root = "/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt" + +with gzip.open(root+"/annotations/wildrgbd/train.jgz", "wt", encoding="utf-8") as f: + json.dump(out, f, ensure_ascii=False, indent=4) + 
def save_ply(points, colors, filename):
    """Write an (N, 3) point set with per-point RGB colors to an ASCII PLY file.

    Accepts torch tensors or numpy arrays; inputs are flattened to (-1, 3).
    Colors are expected in [0, 1] (open3d convention).
    """
    import open3d as o3d  # local import: open3d is only needed for debug dumps

    def _as_xyz(arr):
        # Normalize either a tensor or an ndarray to an (N, 3) numpy array.
        if torch.is_tensor(arr):
            return arr.reshape(-1, 3).cpu().numpy()
        return arr.reshape(-1, 3)

    cloud = o3d.geometry.PointCloud()
    cloud.points = o3d.utility.Vector3dVector(_as_xyz(points).astype(np.float64))
    cloud.colors = o3d.utility.Vector3dVector(_as_xyz(colors).astype(np.float64))
    o3d.io.write_point_cloud(filename, cloud, write_ascii=True)
1e-8) + + plt.figure(figsize=(10,5)) + + plt.subplot(1,2,1) + plt.imshow(img_np) + plt.title("Image") + plt.axis("off") + + plt.subplot(1,2,2) + plt.imshow(depth_vis, cmap="plasma") # "viridis", "magma" also nice + plt.title("Depth") + plt.axis("off") + + plt.savefig(f"sample_{i:04d}.png") + plt.close() + + print(f"Saved sample_{i:04d}.png") \ No newline at end of file diff --git a/training/todo.txt b/training/todo.txt new file mode 100644 index 00000000..e64eed37 --- /dev/null +++ b/training/todo.txt @@ -0,0 +1,3 @@ +* Build into the config so we can choose between MuM, CroCov2 and DINOv2 +* Add config about how large the model is and try to just train a couple of layers on top of frozen backbone +* For example one could do 4xA100 for some time and see how it works on MegaDepth \ No newline at end of file diff --git a/training/train_utils/distributed.py b/training/train_utils/distributed.py index af61e269..d97bbbed 100644 --- a/training/train_utils/distributed.py +++ b/training/train_utils/distributed.py @@ -15,6 +15,11 @@ def get_machine_local_and_dist_rank(): """ local_rank = int(os.environ.get("LOCAL_RANK", None)) distributed_rank = int(os.environ.get("RANK", None)) + + # local_rank = int(os.environ["SLURM_LOCALID"]) # 0-3 on each node + # distributed_rank = int(os.environ["SLURM_PROCID"]) + + print('I am local rank', local_rank, 'and distributed rank', distributed_rank) assert ( local_rank is not None and distributed_rank is not None ), "Please the set the RANK and LOCAL_RANK environment variables." diff --git a/training/trainer.py b/training/trainer.py index 21ffa53e..f283e172 100644 --- a/training/trainer.py +++ b/training/trainer.py @@ -5,6 +5,7 @@ # LICENSE file in the root directory of this source tree. import os +import constants # --- Environment Variable Setup for Performance and Debugging --- @@ -17,6 +18,8 @@ # Enables asynchronous error handling for NCCL, which can prevent hangs. 
os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1" +os.environ['WANDB_API_KEY'] = constants.WANDB_API_KEY + import contextlib import gc @@ -41,7 +44,7 @@ from train_utils.logging import setup_logging from train_utils.normalization import normalize_camera_extrinsics_and_points_batch from train_utils.optimizer import construct_optimizers - +import wandb class Trainer: """ @@ -77,6 +80,8 @@ def __init__( loss: Optional[Dict[str, Any]] = None, env_variables: Optional[Dict[str, Any]] = None, accum_steps: int = 1, + log_wandb: bool = False, + exp_name: str = "exp001", **kwargs, ): """ @@ -141,6 +146,9 @@ def __init__( assert is_dist_avail_and_initialized(), "Torch distributed needs to be initialized before calling the trainer." + wandb_mode = "online" if log_wandb and self.rank == 0 else "disabled" + wandb.init(project="vggt", entity="georgs-team", name=exp_name, reinit=False, mode = wandb_mode) + # Instantiate components (model, loss, etc.) self._setup_components() self._setup_dataloaders() @@ -376,6 +384,7 @@ def run(self): def run_train(self): """Runs the main training loop over all epochs.""" + print('Max epochs: ', self.max_epochs) while self.epoch < self.max_epochs: set_seeds(self.seed_value + self.epoch * 100, self.max_epochs, self.distributed_rank) @@ -428,9 +437,16 @@ def val_epoch(self, val_loader): loss_meters = { name: AverageMeter(name, self.device, ":.4f") for name in loss_names } + + iters_per_epoch = len(val_loader) + limit_val_batches = ( + iters_per_epoch + if self.limit_val_batches is None + else self.limit_val_batches + ) progress = ProgressMeter( - num_batches=len(val_loader), + num_batches=limit_val_batches, meters=[ batch_time, data_time, @@ -445,12 +461,6 @@ def val_epoch(self, val_loader): self.model.eval() end = time.time() - iters_per_epoch = len(val_loader) - limit_val_batches = ( - iters_per_epoch - if self.limit_val_batches is None - else self.limit_val_batches - ) for data_iter, batch in enumerate(val_loader): if data_iter > limit_val_batches: 
@@ -495,6 +505,13 @@ def val_epoch(self, val_loader): if data_iter % self.logging_conf.log_freq == 0: progress.display(data_iter) + avg_stats = {} + + for name, meter in loss_meters.items(): + avg_stats[f"{name}_val"] = meter.avg + + wandb.log(avg_stats) + print("Validation averages:", avg_stats) return True @@ -516,8 +533,16 @@ def train_epoch(self, train_loader): loss_meters[f"Grad/{param_names}"] = AverageMeter(f"Grad/{param_names}", self.device, ":.4f") + iters_per_epoch = len(train_loader) + limit_train_batches = ( + iters_per_epoch + if self.limit_train_batches is None + else self.limit_train_batches + ) + print('Num batches: ', limit_train_batches) + progress = ProgressMeter( - num_batches=len(train_loader), + num_batches=limit_train_batches, meters=[ batch_time, data_time, @@ -531,13 +556,6 @@ def train_epoch(self, train_loader): self.model.train() end = time.time() - - iters_per_epoch = len(train_loader) - limit_train_batches = ( - iters_per_epoch - if self.limit_train_batches is None - else self.limit_train_batches - ) if self.gradient_clipper is not None: # setup gradient clipping at the beginning of training @@ -581,9 +599,14 @@ def train_epoch(self, train_loader): logging.warning( f"Skipping scheduler update since the training is at the end, i.e, {self.where} of [0,1]." 
) - - # Log schedulers + + # + + + # Log schedulers (to W&B instead of TensorBoard) if self.steps[phase] % self.logging_conf.log_freq == 0: + wandb_dict = {} + for i, optim in enumerate(self.optims): for j, param_group in enumerate(optim.optimizer.param_groups): for option in optim.schedulers[j]: @@ -596,16 +619,13 @@ def train_epoch(self, train_loader): else "" ) ) - self.tb_writer.log( - os.path.join("Optim", f"{optim_prefix}", option), - param_group[option], - self.steps[phase], - ) - self.tb_writer.log( - os.path.join("Optim", "where"), - self.where, - self.steps[phase], - ) + key = f"Optim/{optim_prefix}{option}" + wandb_dict[key] = param_group.get(option, None) + + # Also log the scheduler position (e.g., training progress) + wandb_dict["Optim/where"] = self.where + + wandb.log(wandb_dict, step=self.steps[phase]) # Clipping gradients and detecting diverging gradients if self.gradient_clipper is not None: @@ -632,6 +652,9 @@ def train_epoch(self, train_loader): if data_iter % self.logging_conf.log_freq == 0: progress.display(data_iter) + wandb.log({ + **{name: meter.avg for name, meter in loss_meters.items()} + }) return True diff --git a/training/vggt.sh b/training/vggt.sh new file mode 100644 index 00000000..f537f9d7 --- /dev/null +++ b/training/vggt.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -e + +# ==== CONFIGURABLE VARIABLES ==== +MODEL="dinov3" +GPUS_PER_NODE=1 +NODES=1 +TIME="0-00:10:00" +# TIME="2-10:00:00" + +# ==== AUTO-DERIVED VARIABLES ==== +JOB_NAME="vggt:${MODEL}" +OUTPUT_DIR="/mimer/NOBACKUP/groups/snic2022-6-266/davnords/vggt/output_dir/${MODEL}" + +mkdir -p "${OUTPUT_DIR}" + +# ==== EXPORT TO MAKE AVAILABLE INSIDE SLURM JOB ==== +export MODEL +export GPUS_PER_NODE +export NODES +export OUTPUT_DIR + +# ==== SUBMIT THE JOB ==== +sbatch \ + -A NAISS2025-5-255 \ + --job-name=${JOB_NAME} \ + --nodes=${NODES} \ + --gpus-per-node=A100:${GPUS_PER_NODE} \ + --ntasks-per-node=1 \ + --time=${TIME} \ + --output=${OUTPUT_DIR}/%j/log.out \ + 
--error=${OUTPUT_DIR}/%j/log.err \ + --export=ALL,MODEL,GPUS_PER_NODE,NODES,OUTPUT_DIR \ + <<'EOF' +#!/usr/bin/env bash +set -e + +echo "Running model: ${MODEL}" +echo "GPUs per node: ${GPUS_PER_NODE}" +echo "Nodes: ${SLURM_NNODES}" + +export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +export MASTER_PORT=29501 +export WORLD_SIZE=$(($SLURM_NNODES * $GPUS_PER_NODE)) + +echo "MASTER_ADDR: $MASTER_ADDR" +echo "WORLD_SIZE: $WORLD_SIZE" + +srun torchrun \ + --nproc_per_node=${GPUS_PER_NODE} \ + --nnodes=${SLURM_NNODES} \ + --rdzv_id=${SLURM_JOB_ID} \ + --rdzv_backend=c10d \ + --rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \ + launch.py --config "${MODEL}" +EOF diff --git a/vggt/encoders/__init__.py b/vggt/encoders/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/vggt/encoders/croco/__init__.py b/vggt/encoders/croco/__init__.py new file mode 100644 index 00000000..b06ca763 --- /dev/null +++ b/vggt/encoders/croco/__init__.py @@ -0,0 +1 @@ +from .croco import CroCoNet \ No newline at end of file diff --git a/vggt/encoders/croco/blocks.py b/vggt/encoders/croco/blocks.py new file mode 100644 index 00000000..b3d02307 --- /dev/null +++ b/vggt/encoders/croco/blocks.py @@ -0,0 +1,240 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    """
    if not training or drop_prob == 0.:
        # Identity at eval time or when dropping is disabled.
        return x
    keep_prob = 1 - drop_prob
    # One Bernoulli draw per sample, broadcast over all remaining dims
    # (works for any tensor rank, not just 2D ConvNets).
    mask_shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    keep_mask = x.new_empty(mask_shape).bernoulli_(keep_prob)
    if scale_by_keep and keep_prob > 0.0:
        # Rescale so the expected activation magnitude is unchanged.
        keep_mask.div_(keep_prob)
    return x * keep_mask
+ """ + def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + self.scale_by_keep = scale_by_keep + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training, self.scale_by_keep) + + def extra_repr(self): + return f'drop_prob={round(self.drop_prob,3):0.3f}' + +class Mlp(nn.Module): + """ MLP as used in Vision Transformer, MLP-Mixer and related networks""" + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, bias=True, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + bias = to_2tuple(bias) + drop_probs = to_2tuple(drop) + + self.fc1 = nn.Linear(in_features, hidden_features, bias=bias[0]) + self.act = act_layer() + self.drop1 = nn.Dropout(drop_probs[0]) + self.fc2 = nn.Linear(hidden_features, out_features, bias=bias[1]) + self.drop2 = nn.Dropout(drop_probs[1]) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop1(x) + x = self.fc2(x) + x = self.drop2(x) + return x + +class Attention(nn.Module): + + def __init__(self, dim, rope=None, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim ** -0.5 + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.rope = rope + + def forward(self, x, xpos): + B, N, C = x.shape + + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).transpose(1,3) + q, k, v = [qkv[:,:,i] for i in range(3)] + # q,k,v = qkv.unbind(2) # make torchscript happy (cannot use tensor as tuple) + + if self.rope is not None: + q = self.rope(q, xpos) + k = self.rope(k, xpos) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = 
class Block(nn.Module):
    """Pre-norm transformer encoder block: self-attention + MLP, each on a residual path."""

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, rope=None):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias,
                              attn_drop=attn_drop, proj_drop=drop)
        # Stochastic depth on the residual branches (identity when drop_path == 0).
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio),
                       act_layer=act_layer, drop=drop)

    def forward(self, x, xpos):
        # Pre-norm residual self-attention; `xpos` carries patch positions for RoPE.
        x = x + self.drop_path(self.attn(self.norm1(x), xpos))
        # Pre-norm residual MLP.
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x
class PositionGetter(object):
    """ return positions of patches

    Yields a (b, h*w, 2) tensor of (row, col) indices for an h x w patch grid.
    Grids are cached per (h, w, device) so repeated calls reuse the index tensor.
    """

    def __init__(self):
        self.cache_positions = {}

    def __call__(self, b, h, w, device):
        # BUG fix: the cache was keyed on (h, w) only, so a second call with the
        # same grid size but a *different* device returned a tensor living on the
        # wrong device. Key the cache on the device as well.
        key = (h, w, device)
        if key not in self.cache_positions:
            x = torch.arange(w, device=device)
            y = torch.arange(h, device=device)
            # cartesian_prod(y, x) enumerates row-major: (0,0), (0,1), ..., (h-1,w-1)
            self.cache_positions[key] = torch.cartesian_prod(y, x)  # (h*w, 2)
        pos = self.cache_positions[key].view(1, h * w, 2).expand(b, -1, 2).clone()
        return pos
norm_layer=None, flatten=True): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + self.img_size = img_size + self.patch_size = patch_size + self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + self.flatten = flatten + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + self.position_getter = PositionGetter() + + def forward(self, x): + B, C, H, W = x.shape + # torch._assert(H == self.img_size[0], f"Input image height ({H}) doesn't match model ({self.img_size[0]}).") + # torch._assert(W == self.img_size[1], f"Input image width ({W}) doesn't match model ({self.img_size[1]}).") + x = self.proj(x) + pos = self.position_getter(B, x.size(2), x.size(3), x.device) + if self.flatten: + x = x.flatten(2).transpose(1, 2) # BCHW -> BNC + x = self.norm(x) + return x, pos + + def _init_weights(self): + w = self.proj.weight.data + torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1])) \ No newline at end of file diff --git a/vggt/encoders/croco/criterion.py b/vggt/encoders/croco/criterion.py new file mode 100644 index 00000000..b1ef1b3f --- /dev/null +++ b/vggt/encoders/croco/criterion.py @@ -0,0 +1,37 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
class MaskedMSE(torch.nn.Module):
    """Pixel-reconstruction MSE criterion for CroCo pretraining.

    Constructor args:
        norm_pix_loss: normalize each target patch by its own mean and variance
            before computing the loss (as in MAE).
        masked: average the loss over masked patches only; otherwise average
            over all patches.
    """

    def __init__(self, norm_pix_loss=False, masked=True):
        super().__init__()
        self.norm_pix_loss = norm_pix_loss
        self.masked = masked

    def forward(self, pred, mask, target):
        if self.norm_pix_loss:
            # Per-patch normalization of the regression target.
            mu = target.mean(dim=-1, keepdim=True)
            var = target.var(dim=-1, keepdim=True)
            target = (target - mu) / (var + 1.e-6) ** .5

        per_patch = ((pred - target) ** 2).mean(dim=-1)  # [N, L], mean loss per patch
        if self.masked:
            # Mean over masked patches only.
            return (per_patch * mask).sum() / mask.sum()
        return per_patch.mean()
+ + +# -------------------------------------------------------- +# CroCo model during pretraining +# -------------------------------------------------------- + + + +import torch +import torch.nn as nn +torch.backends.cuda.matmul.allow_tf32 = True # for gpu >= Ampere and pytorch >= 1.12 +from functools import partial + +from .blocks import Block, DecoderBlock, PatchEmbed +from .pos_embed import get_2d_sincos_pos_embed, RoPE2D +from .masking import RandomMask + + +class CroCoNet(nn.Module): + + def __init__(self, + img_size=224, # input image size + patch_size=16, # patch_size + mask_ratio=0.9, # ratios of masked tokens + enc_embed_dim=768, # encoder feature dimension + enc_depth=12, # encoder depth + enc_num_heads=12, # encoder number of heads in the transformer block + dec_embed_dim=512, # decoder feature dimension + dec_depth=8, # decoder depth + dec_num_heads=16, # decoder number of heads in the transformer block + mlp_ratio=4, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + norm_im2_in_dec=True, # whether to apply normalization of the 'memory' = (second image) in the decoder + pos_embed='cosine', # positional embedding (either cosine or RoPE100) + ): + + super(CroCoNet, self).__init__() + + self.patch_size = 16 + + # patch embeddings (with initialization done as in MAE) + self._set_patch_embed(img_size, patch_size, enc_embed_dim) + + # mask generations + self._set_mask_generator(self.patch_embed.num_patches, mask_ratio) + + self.pos_embed = pos_embed + if pos_embed=='cosine': + # positional embedding of the encoder + enc_pos_embed = get_2d_sincos_pos_embed(enc_embed_dim, self.patch_embed.grid_size, n_cls_token=0) + self.register_buffer('enc_pos_embed', torch.from_numpy(enc_pos_embed).float()) + # positional embedding of the decoder + dec_pos_embed = get_2d_sincos_pos_embed(dec_embed_dim, self.patch_embed.grid_size, n_cls_token=0) + self.register_buffer('dec_pos_embed', torch.from_numpy(dec_pos_embed).float()) + # pos embedding in each block + self.rope = None # 
nothing for cosine + elif pos_embed.startswith('RoPE'): # eg RoPE100 + self.enc_pos_embed = None # nothing to add in the encoder with RoPE + self.dec_pos_embed = None # nothing to add in the decoder with RoPE + if RoPE2D is None: raise ImportError("Cannot find cuRoPE2D, please install it following the README instructions") + freq = float(pos_embed[len('RoPE'):]) + self.rope = RoPE2D(freq=freq) + else: + raise NotImplementedError('Unknown pos_embed '+pos_embed) + + # transformer for the encoder + self.enc_depth = enc_depth + self.enc_embed_dim = enc_embed_dim + self.enc_blocks = nn.ModuleList([ + Block(enc_embed_dim, enc_num_heads, mlp_ratio, qkv_bias=True, norm_layer=norm_layer, rope=self.rope) + for i in range(enc_depth)]) + self.enc_norm = norm_layer(enc_embed_dim) + + # masked tokens + self._set_mask_token(dec_embed_dim) + + # decoder + self._set_decoder(enc_embed_dim, dec_embed_dim, dec_num_heads, dec_depth, mlp_ratio, norm_layer, norm_im2_in_dec) + + # prediction head + self._set_prediction_head(dec_embed_dim, patch_size) + + # initializer weights + self.initialize_weights() + + def _set_patch_embed(self, img_size=224, patch_size=16, enc_embed_dim=768): + self.patch_embed = PatchEmbed(img_size, patch_size, 3, enc_embed_dim) + + def _set_mask_generator(self, num_patches, mask_ratio): + self.mask_generator = RandomMask(num_patches, mask_ratio) + + def _set_mask_token(self, dec_embed_dim): + self.mask_token = nn.Parameter(torch.zeros(1, 1, dec_embed_dim)) + + def _set_decoder(self, enc_embed_dim, dec_embed_dim, dec_num_heads, dec_depth, mlp_ratio, norm_layer, norm_im2_in_dec): + self.dec_depth = dec_depth + self.dec_embed_dim = dec_embed_dim + # transfer from encoder to decoder + self.decoder_embed = nn.Linear(enc_embed_dim, dec_embed_dim, bias=True) + # transformer for the decoder + self.dec_blocks = nn.ModuleList([ + DecoderBlock(dec_embed_dim, dec_num_heads, mlp_ratio=mlp_ratio, qkv_bias=True, norm_layer=norm_layer, norm_mem=norm_im2_in_dec, rope=self.rope) + 
for i in range(dec_depth)]) + # final norm layer + self.dec_norm = norm_layer(dec_embed_dim) + + def _set_prediction_head(self, dec_embed_dim, patch_size): + self.prediction_head = nn.Linear(dec_embed_dim, patch_size**2 * 3, bias=True) + + + def initialize_weights(self): + # patch embed + self.patch_embed._init_weights() + # mask tokens + if self.mask_token is not None: torch.nn.init.normal_(self.mask_token, std=.02) + # linears and layer norms + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + # we use xavier_uniform following official JAX ViT: + torch.nn.init.xavier_uniform_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def _encode_image(self, image, do_mask=False, return_all_blocks=False): + """ + image has B x 3 x img_size x img_size + do_mask: whether to perform masking or not + return_all_blocks: if True, return the features at the end of every block + instead of just the features from the last block (eg for some prediction heads) + """ + # embed the image into patches (x has size B x Npatches x C) + # and get position if each return patch (pos has size B x Npatches x 2) + x, pos = self.patch_embed(image) + # add positional embedding without cls token + if self.enc_pos_embed is not None: + x = x + self.enc_pos_embed[None,...] 
+ # apply masking + B,N,C = x.size() + if do_mask: + masks = self.mask_generator(x) + x = x[~masks].view(B, -1, C) + posvis = pos[~masks].view(B, -1, 2) + else: + B,N,C = x.size() + masks = torch.zeros((B,N), dtype=bool) + posvis = pos + # now apply the transformer encoder and normalization + if return_all_blocks: + out = [] + for blk in self.enc_blocks: + x = blk(x, posvis) + out.append(x) + out[-1] = self.enc_norm(out[-1]) + return out, pos, masks + else: + for blk in self.enc_blocks: + x = blk(x, posvis) + x = self.enc_norm(x) + return x, pos, masks + + def _decoder(self, feat1, pos1, masks1, feat2, pos2, return_all_blocks=False): + """ + return_all_blocks: if True, return the features at the end of every block + instead of just the features from the last block (eg for some prediction heads) + + masks1 can be None => assume image1 fully visible + """ + # encoder to decoder layer + visf1 = self.decoder_embed(feat1) + f2 = self.decoder_embed(feat2) + # append masked tokens to the sequence + B,Nenc,C = visf1.size() + if masks1 is None: # downstreams + f1_ = visf1 + else: # pretraining + Ntotal = masks1.size(1) + f1_ = self.mask_token.repeat(B, Ntotal, 1).to(dtype=visf1.dtype) + f1_[~masks1] = visf1.view(B * Nenc, C) + # add positional embedding + if self.dec_pos_embed is not None: + f1_ = f1_ + self.dec_pos_embed + f2 = f2 + self.dec_pos_embed + # apply Transformer blocks + out = f1_ + out2 = f2 + if return_all_blocks: + _out, out = out, [] + for blk in self.dec_blocks: + _out, out2 = blk(_out, out2, pos1, pos2) + out.append(_out) + out[-1] = self.dec_norm(out[-1]) + else: + for blk in self.dec_blocks: + out, out2 = blk(out, out2, pos1, pos2) + out = self.dec_norm(out) + return out + + def patchify(self, imgs): + """ + imgs: (B, 3, H, W) + x: (B, L, patch_size**2 *3) + """ + p = self.patch_embed.patch_size[0] + assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % p == 0 + + h = w = imgs.shape[2] // p + x = imgs.reshape(shape=(imgs.shape[0], 3, h, p, w, p)) + x 
= torch.einsum('nchpwq->nhwpqc', x) + x = x.reshape(shape=(imgs.shape[0], h * w, p**2 * 3)) + + return x + + def unpatchify(self, x, channels=3): + """ + x: (N, L, patch_size**2 *channels) + imgs: (N, 3, H, W) + """ + patch_size = self.patch_embed.patch_size[0] + h = w = int(x.shape[1]**.5) + assert h * w == x.shape[1] + x = x.reshape(shape=(x.shape[0], h, w, patch_size, patch_size, channels)) + x = torch.einsum('nhwpqc->nchpwq', x) + imgs = x.reshape(shape=(x.shape[0], channels, h * patch_size, h * patch_size)) + return imgs + + def forward(self, img1, img2): + """ + img1: tensor of size B x 3 x img_size x img_size + img2: tensor of size B x 3 x img_size x img_size + + out will be B x N x (3*patch_size*patch_size) + masks are also returned as B x N just in case + """ + # encoder of the masked first image + feat1, pos1, mask1 = self._encode_image(img1, do_mask=True) + # encoder of the second image + feat2, pos2, _ = self._encode_image(img2, do_mask=False) + # decoder + decfeat = self._decoder(feat1, pos1, mask1, feat2, pos2) + # prediction head + out = self.prediction_head(decfeat) + # get target + target = self.patchify(img1) + return out, mask1, target + + def forward_features(self, x, masks=None): + x_norm, pos1, mask1 = self._encode_image(x, do_mask=False) + return { + "x_norm_patchtokens": x_norm, + } + + @property + def device(self): + return self.mask_token.device \ No newline at end of file diff --git a/vggt/encoders/croco/masking.py b/vggt/encoders/croco/masking.py new file mode 100644 index 00000000..493b203f --- /dev/null +++ b/vggt/encoders/croco/masking.py @@ -0,0 +1,25 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
class RandomMask(nn.Module):
    """Uniformly random patch masking with a fixed masked count.

    Each call draws independent uniform scores per (batch, patch) and marks
    the positions holding the ``num_mask`` smallest permutation indices, so
    every sample always hides exactly ``int(mask_ratio * num_patches)``
    patches at uniformly random positions.
    """

    def __init__(self, num_patches, mask_ratio):
        super().__init__()
        self.num_patches = num_patches
        # Fixed number of masked patches, derived once from the ratio.
        self.num_mask = int(mask_ratio * self.num_patches)

    def __call__(self, x):
        # One uniform score per (batch, patch); argsort turns the scores into
        # a random permutation, and thresholding its values yields a boolean
        # mask with exactly `num_mask` True entries per row.
        scores = torch.rand(x.size(0), self.num_patches, device=x.device)
        ranks = torch.argsort(scores, dim=1)
        return ranks < self.num_mask
def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    """Build a 2D sine-cosine embedding by splitting channels between axes.

    Half of ``embed_dim`` encodes the first grid axis (grid[0]) and half the
    second axis (grid[1]); the two halves are concatenated per position.

    grid: array of shape (2, ...) holding the two coordinate planes
    returns: (N, embed_dim) where N is the number of grid positions
    """
    assert embed_dim % 2 == 0
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)
    return np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """Classic transformer sinusoidal embedding for scalar positions.

    embed_dim: output dimension per position (must be even)
    pos: positions of any shape; flattened to (M,)
    returns: (M, embed_dim) — first half sines, second half cosines
    """
    assert embed_dim % 2 == 0
    half = embed_dim // 2
    # Geometric frequency ladder: 10000 ** (-i / (D/2)) for i in [0, D/2).
    freqs = 1.0 / 10000 ** (np.arange(half, dtype=float) / half)  # (D/2,)
    angles = np.outer(np.asarray(pos).reshape(-1), freqs)  # (M, D/2)
    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)
try:
    from models.curope import cuRoPE2D
    RoPE2D = cuRoPE2D
except ImportError:
    print('Warning, cannot find cuda-compiled version of RoPE2D, using a slow pytorch version instead')

    class RoPE2D(torch.nn.Module):
        """Pure-PyTorch fallback implementing 2D rotary position embeddings.

        The feature dimension is split in two halves: the first half is
        rotated by each token's y coordinate and the second half by its x
        coordinate. cos/sin tables are built lazily and cached per
        (dim, sequence length, device, dtype).
        """

        def __init__(self, freq=100.0, F0=1.0):
            super().__init__()
            self.base = freq
            self.F0 = F0
            self.cache = {}

        def get_cos_sin(self, D, seq_len, device, dtype):
            # Memoize the lookup tables; rebuilding them per call would
            # dominate the cost of this slow fallback.
            key = (D, seq_len, device, dtype)
            if key not in self.cache:
                inv_freq = self.F0 / (self.base ** (torch.arange(0, D, 2).float().to(device) / D))
                t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype)
                angles = torch.einsum("i,j->ij", t, inv_freq).to(dtype)
                # Duplicate so each frequency pairs with both rotated halves.
                angles = torch.cat((angles, angles), dim=-1)
                self.cache[key] = (angles.cos(), angles.sin())  # (Seq, Dim)
            return self.cache[key]

        @staticmethod
        def rotate_half(x):
            # [a, b] -> [-b, a] on the two halves of the last dimension.
            first, second = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
            return torch.cat((-second, first), dim=-1)

        def apply_rope1d(self, tokens, pos1d, cos, sin):
            assert pos1d.ndim == 2
            # Gather per-position rows, then broadcast over the head axis.
            cos_sel = torch.nn.functional.embedding(pos1d, cos)[:, None, :, :]
            sin_sel = torch.nn.functional.embedding(pos1d, sin)[:, None, :, :]
            return (tokens * cos_sel) + (self.rotate_half(tokens) * sin_sel)

        def forward(self, tokens, positions):
            """
            input:
                * tokens: batch_size x nheads x ntokens x dim
                * positions: batch_size x ntokens x 2 (y and x position of each token)
            output:
                * tokens after applying RoPE2D (batch_size x nheads x ntokens x dim)
            """
            assert tokens.size(3) % 2 == 0, "number of dimensions should be a multiple of two"
            D = tokens.size(3) // 2
            assert positions.ndim == 3 and positions.shape[-1] == 2  # Batch, Seq, 2
            cos, sin = self.get_cos_sin(D, int(positions.max()) + 1, tokens.device, tokens.dtype)
            # Rotate each half of the features with its own 1D coordinate.
            y, x = tokens.chunk(2, dim=-1)
            y = self.apply_rope1d(y, positions[:, :, 0], cos, sin)
            x = self.apply_rope1d(x, positions[:, :, 1], cos, sin)
            return torch.cat((y, x), dim=-1)
# RoPE-related functions:
def rope_rotate_half(x: Tensor) -> Tensor:
    """Rotate the two halves of the last dim: [x0..x2, x3..x5] -> [-x3..-x5, x0..x2]."""
    first, second = x.chunk(2, dim=-1)
    return torch.cat([-second, first], dim=-1)


def rope_apply(x: Tensor, sin: Tensor, cos: Tensor) -> Tensor:
    """Apply a rotary embedding: element-wise rotation by the (sin, cos) angles.

    sin/cos are laid out with each frequency repeated across both halves,
    e.g. [sin0, sin1, sin2, sin0, sin1, sin2], matching rope_rotate_half.
    """
    return (x * cos) + (rope_rotate_half(x) * sin)


class LinearKMaskedBias(nn.Linear):
    """Linear layer whose bias is element-wise gated by a registered buffer.

    out_features must be divisible by 3 — this targets a packed q/k/v
    projection (the name suggests the mask zeroes the key-bias slice;
    confirm against the caller). ``bias_mask`` is deliberately initialized
    to NaN so a forgotten mask assignment surfaces immediately in outputs.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.out_features % 3 == 0
        if self.bias is not None:
            # NaN fill makes any missing mask initialization loudly visible.
            self.register_buffer("bias_mask", torch.full_like(self.bias, fill_value=math.nan))

    def forward(self, input: Tensor) -> Tensor:
        if self.bias is None:
            return F.linear(input, self.weight, None)
        gated_bias = self.bias * self.bias_mask.to(self.bias.dtype)
        return F.linear(input, self.weight, gated_bias)
int, + num_heads: int = 8, + qkv_bias: bool = False, + proj_bias: bool = True, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + mask_k_bias: bool = False, + device=None, + ) -> None: + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + linear_class = LinearKMaskedBias if mask_k_bias else nn.Linear + self.qkv = linear_class(dim, dim * 3, bias=qkv_bias, device=device) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim, bias=proj_bias, device=device) + self.proj_drop = nn.Dropout(proj_drop) + + def apply_rope(self, q: Tensor, k: Tensor, rope: Tensor | Tuple[Tensor, Tensor]) -> Tuple[Tensor, Tensor]: + # All operations will use the dtype of rope, the output is cast back to the dtype of q and k + q_dtype = q.dtype + k_dtype = k.dtype + sin, cos = rope + rope_dtype = sin.dtype + q = q.to(dtype=rope_dtype) + k = k.to(dtype=rope_dtype) + N = q.shape[-2] + prefix = N - sin.shape[-2] + assert prefix >= 0 + q_prefix = q[:, :, :prefix, :] + + q = rope_apply(q[:, :, prefix:, :], sin, cos) # [B, head, hw, D//head] + q = torch.cat((q_prefix, q), dim=-2) # [B, head, N, D//head] + k_prefix = k[:, :, :prefix, :] + k = rope_apply(k[:, :, prefix:, :], sin, cos) # [B, head, hw, D//head] + k = torch.cat((k_prefix, k), dim=-2) # [B, head, N, D//head] + q = q.to(dtype=q_dtype) + k = k.to(dtype=k_dtype) + return q, k + + def forward(self, x: Tensor, attn_bias=None, rope: Tensor = None) -> Tensor: + qkv = self.qkv(x) + attn_v = self.compute_attention(qkv=qkv, attn_bias=attn_bias, rope=rope) + x = self.proj(attn_v) + x = self.proj_drop(x) + return x + + def forward_list(self, x_list, attn_bias=None, rope_list=None) -> List[Tensor]: + assert len(x_list) == len(rope_list) # should be enforced by the Block + x_flat, shapes, num_tokens = cat_keep_shapes(x_list) + qkv_flat = self.qkv(x_flat) + qkv_list = uncat_with_shapes(qkv_flat, shapes, num_tokens) + att_out = [] + for _, (qkv, _, rope) in 
enumerate(zip(qkv_list, shapes, rope_list)): + att_out.append(self.compute_attention(qkv, attn_bias=attn_bias, rope=rope)) + x_flat, shapes, num_tokens = cat_keep_shapes(att_out) + x_flat = self.proj(x_flat) + return uncat_with_shapes(x_flat, shapes, num_tokens) + + def compute_attention(self, qkv: Tensor, attn_bias=None, rope=None) -> Tensor: + assert attn_bias is None + B, N, _ = qkv.shape + C = self.qkv.in_features + + qkv = qkv.reshape(B, N, 3, self.num_heads, C // self.num_heads) + q, k, v = torch.unbind(qkv, 2) + q, k, v = [t.transpose(1, 2) for t in [q, k, v]] + if rope is not None: + q, k = self.apply_rope(q, k, rope) + + # self._last_q = q.detach().cpu() + # self._last_k = k.detach().cpu() + x = torch.nn.functional.scaled_dot_product_attention(q, k, v) + x = x.transpose(1, 2) + return x.reshape([B, N, C]) + + +class CausalSelfAttention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + proj_bias: bool = True, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + ) -> None: + super().__init__() + self.dim = dim + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = attn_drop + self.proj = nn.Linear(dim, dim, bias=proj_bias) + self.proj_drop = nn.Dropout(proj_drop) + + def init_weights( + self, init_attn_std: float | None = None, init_proj_std: float | None = None, factor: float = 1.0 + ) -> None: + init_attn_std = init_attn_std or (self.dim**-0.5) + init_proj_std = init_proj_std or init_attn_std * factor + nn.init.normal_(self.qkv.weight, std=init_attn_std) + nn.init.normal_(self.proj.weight, std=init_proj_std) + if self.qkv.bias is not None: + nn.init.zeros_(self.qkv.bias) + if self.proj.bias is not None: + nn.init.zeros_(self.proj.bias) + + def forward(self, x: Tensor, is_causal: bool = True) -> Tensor: + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) + q, k, 
v = torch.unbind(qkv, 2) + q, k, v = [t.transpose(1, 2) for t in [q, k, v]] + x = torch.nn.functional.scaled_dot_product_attention( + q, k, v, attn_mask=None, dropout_p=self.attn_drop if self.training else 0, is_causal=is_causal + ) + x = x.transpose(1, 2).contiguous().view(B, N, C) + x = self.proj_drop(self.proj(x)) + return x diff --git a/vggt/encoders/mum/layers/block.py b/vggt/encoders/mum/layers/block.py new file mode 100644 index 00000000..21189d00 --- /dev/null +++ b/vggt/encoders/mum/layers/block.py @@ -0,0 +1,273 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This software may be used and distributed in accordance with +# the terms of the DINOv3 License Agreement. + +from typing import Callable, List, Optional + +import torch +from torch import Tensor, nn + +from ..utils import cat_keep_shapes, uncat_with_shapes + +from .attention import CausalSelfAttention, SelfAttention +from .ffn_layers import Mlp +from .layer_scale import LayerScale # , DropPath + +torch._dynamo.config.automatic_dynamic_shapes = False +torch._dynamo.config.accumulated_cache_size_limit = 1024 + + +class SelfAttentionBlock(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + ffn_ratio: float = 4.0, + qkv_bias: bool = False, + proj_bias: bool = True, + ffn_bias: bool = True, + drop: float = 0.0, + attn_drop: float = 0.0, + init_values=None, + drop_path: float = 0.0, + act_layer: Callable[..., nn.Module] = nn.GELU, + norm_layer: Callable[..., nn.Module] = nn.LayerNorm, + attn_class: Callable[..., nn.Module] = SelfAttention, + ffn_layer: Callable[..., nn.Module] = Mlp, + mask_k_bias: bool = False, + device=None, + ) -> None: + super().__init__() + # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}") + self.norm1 = norm_layer(dim) + self.attn = attn_class( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + proj_bias=proj_bias, + attn_drop=attn_drop, + proj_drop=drop, + mask_k_bias=mask_k_bias, + device=device, + ) + self.ls1 = 
LayerScale(dim, init_values=init_values, device=device) if init_values else nn.Identity() + + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * ffn_ratio) + self.mlp = ffn_layer( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + bias=ffn_bias, + device=device, + ) + self.ls2 = LayerScale(dim, init_values=init_values, device=device) if init_values else nn.Identity() + + self.sample_drop_ratio = drop_path + + @staticmethod + def _maybe_index_rope(rope: tuple[Tensor, Tensor] | None, indices: Tensor) -> tuple[Tensor, Tensor] | None: + if rope is None: + return None + + sin, cos = rope + assert sin.ndim == cos.ndim + if sin.ndim == 4: + # If the rope embedding has a batch dimension (is different for each batch element), index into it + return sin[indices], cos[indices] # [batch, heads, patches, embed_dim] + else: + # No batch dimension, do not index + return sin, cos # [heads, patches, embed_dim] or [patches, embed_dim] + + def _forward(self, x: Tensor, rope=None) -> Tensor: + """ + This is the reference implementation for a single tensor, matching what is done below for a list. + We call the list op on [x] instead of this function. 
+ """ + b, _, _ = x.shape + sample_subset_size = max(int(b * (1 - self.sample_drop_ratio)), 1) + residual_scale_factor = b / sample_subset_size + + if self.training and self.sample_drop_ratio > 0.0: + indices_1 = (torch.randperm(b, device=x.device))[:sample_subset_size] + + x_subset_1 = x[indices_1] + rope_subset = self._maybe_index_rope(rope, indices_1) + residual_1 = self.attn(self.norm1(x_subset_1), rope=rope_subset) + + x_attn = torch.index_add( + x, + dim=0, + source=self.ls1(residual_1), + index=indices_1, + alpha=residual_scale_factor, + ) + + indices_2 = (torch.randperm(b, device=x.device))[:sample_subset_size] + + x_subset_2 = x_attn[indices_2] + residual_2 = self.mlp(self.norm2(x_subset_2)) + + x_ffn = torch.index_add( + x_attn, + dim=0, + source=self.ls2(residual_2), + index=indices_2, + alpha=residual_scale_factor, + ) + else: + x_attn = x + self.ls1(self.attn(self.norm1(x), rope=rope)) + x_ffn = x_attn + self.ls2(self.mlp(self.norm2(x_attn))) + + return x_ffn + + def _forward_list(self, x_list: List[Tensor], rope_list=None) -> List[Tensor]: + """ + This list operator concatenates the tokens from the list of inputs together to save + on the elementwise operations. Torch-compile memory-planning allows hiding the overhead + related to concat ops. 
+ """ + b_list = [x.shape[0] for x in x_list] + sample_subset_sizes = [max(int(b * (1 - self.sample_drop_ratio)), 1) for b in b_list] + residual_scale_factors = [b / sample_subset_size for b, sample_subset_size in zip(b_list, sample_subset_sizes)] + + if self.training and self.sample_drop_ratio > 0.0: + indices_1_list = [ + (torch.randperm(b, device=x.device))[:sample_subset_size] + for x, b, sample_subset_size in zip(x_list, b_list, sample_subset_sizes) + ] + x_subset_1_list = [x[indices_1] for x, indices_1 in zip(x_list, indices_1_list)] + + if rope_list is not None: + rope_subset_list = [ + self._maybe_index_rope(rope, indices_1) for rope, indices_1 in zip(rope_list, indices_1_list) + ] + else: + rope_subset_list = rope_list + + flattened, shapes, num_tokens = cat_keep_shapes(x_subset_1_list) + norm1 = uncat_with_shapes(self.norm1(flattened), shapes, num_tokens) + residual_1_list = self.attn.forward_list(norm1, rope_list=rope_subset_list) + + residual_1_list = [r.to(dtype=x_list[0].dtype) for r in residual_1_list] + + x_attn_list = [ + torch.index_add( + x, + dim=0, + source=self.ls1(residual_1), + index=indices_1, + alpha=residual_scale_factor, + ) + for x, residual_1, indices_1, residual_scale_factor in zip( + x_list, residual_1_list, indices_1_list, residual_scale_factors + ) + ] + + indices_2_list = [ + (torch.randperm(b, device=x.device))[:sample_subset_size] + for x, b, sample_subset_size in zip(x_list, b_list, sample_subset_sizes) + ] + x_subset_2_list = [x[indices_2] for x, indices_2 in zip(x_attn_list, indices_2_list)] + flattened, shapes, num_tokens = cat_keep_shapes(x_subset_2_list) + norm2_flat = self.norm2(flattened) + norm2_list = uncat_with_shapes(norm2_flat, shapes, num_tokens) + + residual_2_list = self.mlp.forward_list(norm2_list) + + residual_2_list = [r.to(dtype=x_attn_list[0].dtype) for r in residual_2_list] + + x_ffn = [ + torch.index_add( + x_attn, + dim=0, + source=self.ls2(residual_2), + index=indices_2, + alpha=residual_scale_factor, + 
) + for x_attn, residual_2, indices_2, residual_scale_factor in zip( + x_attn_list, residual_2_list, indices_2_list, residual_scale_factors + ) + ] + else: + x_out = [] + for x, rope in zip(x_list, rope_list): + x_attn = x + self.ls1(self.attn(self.norm1(x), rope=rope)) + x_ffn = x_attn + self.ls2(self.mlp(self.norm2(x_attn))) + x_out.append(x_ffn) + x_ffn = x_out + + return x_ffn + + def forward(self, x_or_x_list, rope_or_rope_list=None) -> List[Tensor]: + if isinstance(x_or_x_list, Tensor): + # for reference: + # return self._forward(x_or_x_list, rope=rope_or_rope_list) + # in order to match implementations we call the list op: + return self._forward_list([x_or_x_list], rope_list=[rope_or_rope_list])[0] + elif isinstance(x_or_x_list, list): + if rope_or_rope_list is None: + rope_or_rope_list = [None for x in x_or_x_list] + # return [self._forward(x, rope=rope) for x, rope in zip(x_or_x_list, rope_or_rope_list)] + return self._forward_list(x_or_x_list, rope_list=rope_or_rope_list) + else: + raise AssertionError + + +class CausalSelfAttentionBlock(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + ffn_ratio: float = 4.0, + ls_init_value: Optional[float] = None, + is_causal: bool = True, + act_layer: Callable = nn.GELU, + norm_layer: Callable = nn.LayerNorm, + dropout_prob: float = 0.0, + ): + super().__init__() + + self.dim = dim + self.is_causal = is_causal + self.ls1 = LayerScale(dim, init_values=ls_init_value) if ls_init_value else nn.Identity() + self.attention_norm = norm_layer(dim) + self.attention = CausalSelfAttention(dim, num_heads, attn_drop=dropout_prob, proj_drop=dropout_prob) + + self.ffn_norm = norm_layer(dim) + ffn_hidden_dim = int(dim * ffn_ratio) + self.feed_forward = Mlp( + in_features=dim, + hidden_features=ffn_hidden_dim, + drop=dropout_prob, + act_layer=act_layer, + ) + + self.ls2 = LayerScale(dim, init_values=ls_init_value) if ls_init_value else nn.Identity() + + def init_weights( + self, + init_attn_std: float | None = None, 
+ init_proj_std: float | None = None, + init_fc_std: float | None = None, + factor: float = 1.0, + ) -> None: + init_attn_std = init_attn_std or (self.dim**-0.5) + init_proj_std = init_proj_std or init_attn_std * factor + init_fc_std = init_fc_std or (2 * self.dim) ** -0.5 + self.attention.init_weights(init_attn_std, init_proj_std) + self.attention_norm.reset_parameters() + nn.init.normal_(self.feed_forward.fc1.weight, std=init_fc_std) + nn.init.normal_(self.feed_forward.fc2.weight, std=init_proj_std) + self.ffn_norm.reset_parameters() + + def forward( + self, + x: torch.Tensor, + ): + + x_attn = x + self.ls1(self.attention(self.attention_norm(x), self.is_causal)) + x_ffn = x_attn + self.ls2(self.feed_forward(self.ffn_norm(x_attn))) + return x_ffn diff --git a/vggt/encoders/mum/layers/dino_head.py b/vggt/encoders/mum/layers/dino_head.py new file mode 100644 index 00000000..bb71f35f --- /dev/null +++ b/vggt/encoders/mum/layers/dino_head.py @@ -0,0 +1,67 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This software may be used and distributed in accordance with +# the terms of the DINOv3 License Agreement. 
class DINOHead(nn.Module):
    """DINO projection head: MLP bottleneck, L2 normalization, then a final linear.

    forward(x) computes ``last_layer(normalize(mlp(x)))`` by default; the
    ``no_last_layer`` / ``only_last_layer`` flags run either stage alone.
    """

    def __init__(
        self,
        in_dim,
        out_dim,
        use_bn=False,
        nlayers=3,
        hidden_dim=2048,
        bottleneck_dim=256,
        mlp_bias=True,
    ):
        super().__init__()
        nlayers = max(nlayers, 1)
        self.mlp = _build_mlp(
            nlayers,
            in_dim,
            bottleneck_dim,
            hidden_dim=hidden_dim,
            use_bn=use_bn,
            bias=mlp_bias,
        )
        # Prototype projection; bias-free by construction.
        self.last_layer = nn.Linear(bottleneck_dim, out_dim, bias=False)

    def init_weights(self) -> None:
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            trunc_normal_(module.weight, std=0.02)
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)

    def forward(self, x, no_last_layer=False, only_last_layer=False):
        if not only_last_layer:
            x = self.mlp(x)
            # float16 needs a larger eps to avoid underflow in the norm.
            eps = 1e-6 if x.dtype == torch.float16 else 1e-12
            x = nn.functional.normalize(x, dim=-1, p=2, eps=eps)
        if not no_last_layer:
            x = self.last_layer(x)
        return x


def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True):
    """Stack `nlayers` linears (GELU + optional BatchNorm between) ending at the bottleneck."""
    if nlayers == 1:
        return nn.Linear(in_dim, bottleneck_dim, bias=bias)
    dims = [in_dim] + [hidden_dim] * (nlayers - 1) + [bottleneck_dim]
    layers = []
    for i in range(nlayers):
        layers.append(nn.Linear(dims[i], dims[i + 1], bias=bias))
        if i < nlayers - 1:  # no activation after the final projection
            if use_bn:
                layers.append(nn.BatchNorm1d(hidden_dim))
            layers.append(nn.GELU())
    return nn.Sequential(*layers)
class ListForwardMixin(object):
    """Adds a list-of-tensors forward that runs the module once on a fused batch.

    Inputs are concatenated with ``cat_keep_shapes``, pushed through a single
    ``forward`` call, and split back with ``uncat_with_shapes``.
    """

    def forward(self, x: Tensor):
        raise NotImplementedError

    def forward_list(self, x_list: List[Tensor]) -> List[Tensor]:
        flat, shapes, token_counts = cat_keep_shapes(x_list)
        return uncat_with_shapes(self.forward(flat), shapes, token_counts)


class Mlp(nn.Module, ListForwardMixin):
    """Standard transformer MLP: fc1 -> activation -> dropout -> fc2 -> dropout."""

    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        drop: float = 0.0,
        bias: bool = True,
        device=None,
    ) -> None:
        super().__init__()
        # Hidden/output widths default to the input width when unspecified.
        hidden_features = hidden_features or in_features
        out_features = out_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias, device=device)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias, device=device)
        self.drop = nn.Dropout(drop)

    def forward(self, x: Tensor) -> Tensor:
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
bias=bias, device=device) + + def forward(self, x: Tensor) -> Tensor: + x1 = self.w1(x) + x2 = self.w2(x) + hidden = F.silu(x1) * x2 + return self.w3(hidden) diff --git a/vggt/encoders/mum/layers/fp8_linear.py b/vggt/encoders/mum/layers/fp8_linear.py new file mode 100644 index 00000000..0fff8a0e --- /dev/null +++ b/vggt/encoders/mum/layers/fp8_linear.py @@ -0,0 +1,141 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This software may be used and distributed in accordance with +# the terms of the DINOv3 License Agreement. + +import re + +import torch + +from .attention import LinearKMaskedBias +from ..utils import named_replace + +# avoid division by zero when calculating scale +EPS = 1e-12 + + +def scale(t, amax_t): + max_v = torch.finfo(torch.float8_e4m3fn).max + scale_t = torch.clamp(amax_t.float(), min=EPS) / max_v + t_fp8 = (t / scale_t).to(torch.float8_e4m3fn) + return t_fp8, scale_t + + +def matmul(first, amax_first, second_t, amax_second_t, bias): + first_fp8, scale_first = scale(first, amax_first) + second_t_fp8, scale_second_t = scale(second_t, amax_second_t) + # PyTorch's row-wise scaled matmul kernel is based on CUTLASS and is quite + # slow. Hence we fall back to an "unscaled" matmul, which uses cuBLAS, and + # apply the scale manually afterwards. 
+ output = torch._scaled_mm( + first_fp8, + second_t_fp8.t(), + scale_a=scale_first.new_ones((1, 1)), + scale_b=scale_second_t.t().new_ones((1, 1)), + bias=None, + out_dtype=torch.bfloat16, + use_fast_accum=False, + ) + output = (output * scale_first * scale_second_t.t()).to(torch.bfloat16) + if bias is not None: + output = output + bias + return output + + +@torch.compiler.allow_in_graph +class Fp8LinearFn(torch.autograd.Function): + @staticmethod + def forward(ctx, a, b_t, bias): + amax_a = a.abs().amax(dim=-1, keepdim=True) + amax_b_t = b_t.abs().amax(dim=-1, keepdim=True) + out = matmul(a, amax_a, b_t, amax_b_t, bias) + + ctx.a_requires_grad = a.requires_grad + ctx.b_requires_grad = b_t.requires_grad + ctx.bias_requires_grad = bias.requires_grad if bias is not None else False + + ctx.save_for_backward(a, b_t, amax_b_t.max()) + + return out + + @staticmethod + def backward(ctx, grad_out): + a, b_t, amax_b = ctx.saved_tensors + + if ctx.a_requires_grad: + b = b_t.t().contiguous() + amax_grad_out = grad_out.abs().amax(dim=-1, keepdim=True) + amax_b = amax_b.repeat(b.shape[0], 1) + grad_a = matmul(grad_out, amax_grad_out, b, amax_b, None) + else: + grad_a = None + if ctx.b_requires_grad: + grad_b = grad_out.t() @ a + else: + grad_b = None + if ctx.bias_requires_grad: + grad_bias = grad_out.sum(dim=0) + else: + grad_bias = None + + return grad_a, grad_b, grad_bias + + +class Fp8Linear(torch.nn.Linear): + def forward(self, input: torch.Tensor) -> torch.Tensor: + out = Fp8LinearFn.apply(input.flatten(end_dim=-2), self.weight, self.bias) + out = out.unflatten(0, input.shape[:-1]) + return out + + +class Fp8LinearKMaskedBias(LinearKMaskedBias): + def forward(self, input: torch.Tensor) -> torch.Tensor: + masked_bias = self.bias * self.bias_mask if self.bias is not None else None + out = Fp8LinearFn.apply(input.flatten(end_dim=-2), self.weight, masked_bias) + out = out.unflatten(0, input.shape[:-1]) + return out + + +def convert_linears_to_fp8(root_module: 
torch.nn.Module, *, filter: str) -> torch.nn.Module: + filter_re = re.compile(filter) + total_count = 0 + + def replace(module: torch.nn.Module, name: str) -> torch.nn.Module: + nonlocal total_count + if not isinstance(module, torch.nn.Linear) or not filter_re.search(name): + return module + if type(module) == torch.nn.Linear: + new_cls = Fp8Linear + elif type(module) == LinearKMaskedBias: + new_cls = Fp8LinearKMaskedBias + else: + assert False, str(type(module)) + if module.in_features % 64 != 0 or module.out_features % 64 != 0: + # This is not a strict requirement, but H100 TensorCores for fp8 + # operate on tiles of 64 elements anyways, and Inductor sometimes + # pads inner dims to become multiples of 64. Also, if one day we + # switch back to cuBLAS, it artificially requires dims to be + # multiples of 16. + raise RuntimeError( + "fp8 requires all dimensions to be multiples of 64 " "(consider using ffn_layer=swiglu64 or higher)" + ) + new_module = new_cls( + in_features=module.in_features, + out_features=module.out_features, + bias=module.bias is not None, + dtype=module.weight.dtype, + device=module.weight.device, + ) + new_module.weight = module.weight + new_module.bias = module.bias + total_count += 1 + return new_module + + out = named_replace(replace, root_module) + assert total_count > 0, "fp8: no layer found to convert" + # Force re-compile everything + torch._dynamo.reset_code_caches() + from torch._inductor.cudagraph_trees import reset_cudagraph_trees + + reset_cudagraph_trees() + return out diff --git a/vggt/encoders/mum/layers/layer_scale.py b/vggt/encoders/mum/layers/layer_scale.py new file mode 100644 index 00000000..0b72b7c6 --- /dev/null +++ b/vggt/encoders/mum/layers/layer_scale.py @@ -0,0 +1,29 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This software may be used and distributed in accordance with +# the terms of the DINOv3 License Agreement. 
# --- vggt/encoders/mum/layers/layer_scale.py ---
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This software may be used and distributed in accordance with
# the terms of the DINOv3 License Agreement.

from typing import Union

import torch
from torch import Tensor, nn


class LayerScale(nn.Module):
    """Learnable per-channel scaling (CaiT-style LayerScale).

    `gamma` is deliberately allocated with `torch.empty`; it is filled by
    `reset_parameters`, which the owning model invokes through its
    weight-initialization pass.
    """

    def __init__(
        self,
        dim: int,
        init_values: Union[float, Tensor] = 1e-5,
        inplace: bool = False,
        device=None,
    ) -> None:
        super().__init__()
        self.inplace = inplace
        # Left uninitialized on purpose; see class docstring.
        self.gamma = nn.Parameter(torch.empty(dim, device=device))
        self.init_values = init_values

    def reset_parameters(self):
        nn.init.constant_(self.gamma, self.init_values)

    def forward(self, x: Tensor) -> Tensor:
        # Inplace multiply saves memory but mutates the caller's tensor.
        return x.mul_(self.gamma) if self.inplace else x * self.gamma


# --- vggt/encoders/mum/layers/patch_embed.py ---
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This software may be used and distributed in accordance with
# the terms of the DINOv3 License Agreement.

import math
from typing import Callable, Tuple, Union
import torch
from torch import Tensor, nn


def make_2tuple(x):
    # Normalize an int or 2-tuple into a 2-tuple; reject anything else.
    if isinstance(x, tuple):
        assert len(x) == 2
        return x

    assert isinstance(x, int)
    return (x, x)


class PatchEmbed(nn.Module):
    """
    2D image to patch embedding: (B,C,H,W) -> (B,N,D)

    Args:
        img_size: Image size.
        patch_size: Patch token size.
        in_chans: Number of input image channels.
        embed_dim: Number of linear projection output channels.
        norm_layer: Normalization layer.
    """

    def __init__(
        self,
        img_size: Union[int, Tuple[int, int]] = 224,
        patch_size: Union[int, Tuple[int, int]] = 16,
        in_chans: int = 3,
        embed_dim: int = 768,
        norm_layer: Callable | None = None,
        flatten_embedding: bool = True,
    ) -> None:
        super().__init__()

        image_HW = make_2tuple(img_size)
        patch_HW = make_2tuple(patch_size)
        patch_grid_size = (
            image_HW[0] // patch_HW[0],
            image_HW[1] // patch_HW[1],
        )

        self.img_size = image_HW
        self.patch_size = patch_HW
        self.patches_resolution = patch_grid_size
        self.num_patches = patch_grid_size[0] * patch_grid_size[1]

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        self.flatten_embedding = flatten_embedding

        # Non-overlapping conv: kernel == stride == patch size.
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x: Tensor) -> Tensor:
        _, _, H, W = x.shape
        # NOTE(review): divisibility asserts are commented out upstream, so a
        # non-multiple input is silently truncated by the conv.
        # patch_H, patch_W = self.patch_size
        # assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
        # assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"

        x = self.proj(x)  # B C H W
        H, W = x.size(2), x.size(3)
        x = x.flatten(2).transpose(1, 2)  # B HW C
        x = self.norm(x)
        if not self.flatten_embedding:
            x = x.reshape(-1, H, W, self.embed_dim)  # B H W C
        return x

    def flops(self) -> float:
        # Conv cost per output token plus (optional) norm cost.
        Ho, Wo = self.patches_resolution
        flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
        if self.norm is not None:
            flops += Ho * Wo * self.embed_dim
        return flops

    def reset_parameters(self):
        # Uniform init matching nn.Conv2d's default fan-in scaling.
        k = 1 / (self.in_chans * (self.patch_size[0] ** 2))
        nn.init.uniform_(self.proj.weight, -math.sqrt(k), math.sqrt(k))
        if self.proj.bias is not None:
            nn.init.uniform_(self.proj.bias, -math.sqrt(k), math.sqrt(k))


class DINOv3PatchEmbed(nn.Module):
    """Patch embedding backed by a frozen DINOv3 ViT-L/16 backbone.

    The backbone is wrapped in a plain Python list so it is NOT registered as
    a submodule: it stays out of `state_dict()`, the optimizer, and `.to()`
    calls (hence the lazy device move in `forward`).

    NOTE(review): the hub repo and checkpoint paths are hard-coded to a
    specific cluster filesystem — this will fail anywhere else; consider
    making them configurable.
    """

    def __init__(
        self,
        img_size: Union[int, Tuple[int, int]] = 224,
        patch_size: Union[int, Tuple[int, int]] = 16,
        in_chans: int = 3,
        embed_dim: int = 768,
        norm_layer: Callable | None = None,
        flatten_embedding: bool = True,
        **kwargs,
    ):
        super().__init__()

        image_HW = make_2tuple(img_size)
        patch_HW = make_2tuple(patch_size)
        patch_grid_size = (
            image_HW[0] // patch_HW[0],
            image_HW[1] // patch_HW[1],
        )

        self.img_size = image_HW
        self.patch_size = patch_HW
        self.patches_resolution = patch_grid_size
        self.num_patches = patch_grid_size[0] * patch_grid_size[1]

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        self.flatten_embedding = flatten_embedding

        # List wrapper keeps the frozen backbone off the module tree.
        self.backbone = [torch.hub.load("/mimer/NOBACKUP/groups/snic2022-6-266/davnords/dinov3", "dinov3_vitl16", source='local', weights="/mimer/NOBACKUP/groups/snic2022-6-266/davnords/mv-ssl/pretrained_models/dinov3_vitl16_pretrain_lvd1689m-8aa4cbdd.pth")]
        # Project to the requested width only if the backbone width differs.
        self.proj = nn.Linear(self.backbone[0].embed_dim, embed_dim) if self.backbone[0].embed_dim != embed_dim else nn.Identity()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        _, _, H, W = x.shape
        p = self.patch_size[0]
        with torch.no_grad():
            # Lazy move: the backbone is not a submodule, so .to() on the
            # parent never reaches it. NOTE(review): dtype is only synced when
            # the device differs — a dtype-only mismatch is not handled.
            if next(self.backbone[0].parameters()).device != x.device:
                self.backbone[0] = self.backbone[0].to(x.device).to(x.dtype)
            x = self.backbone[0].forward_features(x)['x_norm_patchtokens']
        x = self.proj(x)
        if not self.flatten_embedding:
            x = x.reshape(-1, H//p, W//p, self.embed_dim)
        return x
import torch
from torch import Tensor, nn


class RMSNorm(nn.Module):
    """Root-mean-square layer norm: rescales by 1/RMS over the last dim and
    applies a learned per-channel gain. No mean subtraction, no bias."""

    def __init__(self, dim: int, eps: float = 1e-5):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def reset_parameters(self) -> None:
        # Restore the identity gain used at construction time.
        nn.init.constant_(self.weight, 1)

    def _norm(self, x: Tensor) -> Tensor:
        # eps inside the sqrt guards against all-zero rows.
        mean_square = x.pow(2).mean(dim=-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.eps)
        return x * inv_rms

    def forward(self, x: Tensor) -> Tensor:
        # Normalize in fp32 for numerical stability, cast back, then rescale.
        normalized = self._norm(x.float()).type_as(x)
        return normalized * self.weight
class RopePositionEmbedding(nn.Module):
    """Axial RoPE over a 2D patch grid, with no learnable weights.

    Parametrized either by `base` (classic RoPE geometric frequencies) or by
    an explicit `min_period`/`max_period` range — exactly one of the two.
    During training, optional coordinate augmentations (shift / jitter /
    rescale) can be applied.
    """

    def __init__(
        self,
        embed_dim: int,
        *,
        num_heads: int,
        base: float | None = 100.0,
        min_period: float | None = None,
        max_period: float | None = None,
        normalize_coords: Literal["min", "max", "separate"] = "separate",
        shift_coords: float | None = None,
        jitter_coords: float | None = None,
        rescale_coords: float | None = None,
        dtype: torch.dtype | None = None,
        device: torch.device | None = None,
    ):
        super().__init__()
        # 4 = 2 spatial axes x (sin, cos) halves per head dimension.
        assert embed_dim % (4 * num_heads) == 0
        both_periods = min_period is not None and max_period is not None
        if (base is None and not both_periods) or (base is not None and both_periods):
            raise ValueError("Either `base` or `min_period`+`max_period` must be provided.")

        D_head = embed_dim // num_heads
        self.base = base
        self.min_period = min_period
        self.max_period = max_period
        self.D_head = D_head
        self.normalize_coords = normalize_coords
        self.shift_coords = shift_coords
        self.jitter_coords = jitter_coords
        self.rescale_coords = rescale_coords

        # Needs persistent=True because we do teacher.load_state_dict(student.state_dict()) to initialize the teacher
        self.dtype = dtype  # Don't rely on self.periods.dtype
        self.register_buffer(
            "periods",
            torch.empty(D_head // 4, device=device, dtype=dtype),
            persistent=True,
        )
        self._init_weights()

    def forward(self, *, H: int, W: int) -> tuple[Tensor, Tensor]:
        """Return (sin, cos), each of shape [H*W, D_head], for an HxW grid."""
        device = self.periods.device
        dtype = self.dtype
        dd = {"device": device, "dtype": dtype}

        # Prepare coords in range [-1, +1]
        if self.normalize_coords == "max":
            max_HW = max(H, W)
            coords_h = torch.arange(0.5, H, **dd) / max_HW  # [H]
            coords_w = torch.arange(0.5, W, **dd) / max_HW  # [W]
        elif self.normalize_coords == "min":
            min_HW = min(H, W)
            coords_h = torch.arange(0.5, H, **dd) / min_HW  # [H]
            coords_w = torch.arange(0.5, W, **dd) / min_HW  # [W]
        elif self.normalize_coords == "separate":
            # Each axis normalized by its own extent (aspect ratio discarded).
            coords_h = torch.arange(0.5, H, **dd) / H  # [H]
            coords_w = torch.arange(0.5, W, **dd) / W  # [W]
        else:
            raise ValueError(f"Unknown normalize_coords: {self.normalize_coords}")
        coords = torch.stack(torch.meshgrid(coords_h, coords_w, indexing="ij"), dim=-1)  # [H, W, 2]
        coords = coords.flatten(0, 1)  # [HW, 2]
        coords = 2.0 * coords - 1.0  # Shift range [0, 1] to [-1, +1]

        # Shift coords by adding a uniform value in [-shift, shift]
        if self.training and self.shift_coords is not None:
            shift_hw = torch.empty(2, **dd).uniform_(-self.shift_coords, self.shift_coords)
            coords += shift_hw[None, :]

        # Jitter coords by multiplying the range [-1, 1] by a log-uniform value in [1/jitter, jitter]
        if self.training and self.jitter_coords is not None:
            jitter_max = np.log(self.jitter_coords)
            jitter_min = -jitter_max
            jitter_hw = torch.empty(2, **dd).uniform_(jitter_min, jitter_max).exp()
            coords *= jitter_hw[None, :]

        # Rescale coords by multiplying the range [-1, 1] by a log-uniform value in [1/rescale, rescale]
        # (single factor shared by both axes, unlike jitter)
        if self.training and self.rescale_coords is not None:
            rescale_max = np.log(self.rescale_coords)
            rescale_min = -rescale_max
            rescale_hw = torch.empty(1, **dd).uniform_(rescale_min, rescale_max).exp()
            coords *= rescale_hw

        # Prepare angles and sin/cos
        angles = 2 * math.pi * coords[:, :, None] / self.periods[None, None, :]  # [HW, 2, D//4]
        angles = angles.flatten(1, 2)  # [HW, D//2]
        angles = angles.tile(2)  # [HW, D]
        cos = torch.cos(angles)  # [HW, D]
        sin = torch.sin(angles)  # [HW, D]

        return (sin, cos)  # 2 * [HW, D]

    def _init_weights(self):
        # Fill the `periods` buffer; called at construction and again by the
        # owning model's init_weights().
        device = self.periods.device
        dtype = self.dtype
        if self.base is not None:
            # Classic RoPE: geometric progression of frequencies from `base`.
            periods = self.base ** (
                2 * torch.arange(self.D_head // 4, device=device, dtype=dtype) / (self.D_head // 2)
            )  # [D//4]
        else:
            # Explicit period range: log-uniform between min and max period.
            base = self.max_period / self.min_period
            exponents = torch.linspace(0, 1, self.D_head // 4, device=device, dtype=dtype)  # [D//4] range [0, 1]
            periods = base**exponents  # range [1, max_period / min_period]
            periods = periods / base  # range [min_period / max_period, 1]
            periods = periods * self.max_period  # range [min_period, max_period]
        self.periods.data = periods
# --- vggt/encoders/mum/model.py ---
import logging
from functools import partial
from typing import Any, Literal, Callable

import torch
import torch.nn.init
from torch import nn

from .layers import LayerScale, Mlp, PatchEmbed, RMSNorm, RopePositionEmbedding, SelfAttentionBlock, SwiGLUFFN, DINOv3PatchEmbed
from .utils import named_apply

logger = logging.getLogger("dinov3")

# FFN variants selectable by config string.
ffn_layer_dict = {
    "mlp": Mlp,
    "swiglu": SwiGLUFFN,
    "swiglu32": partial(SwiGLUFFN, align_to=32),
    "swiglu64": partial(SwiGLUFFN, align_to=64),
    "swiglu128": partial(SwiGLUFFN, align_to=128),
}

# Normalization variants selectable by config string.
norm_layer_dict = {
    "layernorm": partial(nn.LayerNorm, eps=1e-6),
    "layernormbf16": partial(nn.LayerNorm, eps=1e-5),
    "rmsnorm": RMSNorm,
}

# Config string -> torch dtype (used for the RoPE buffers).
dtype_dict = {
    "fp32": torch.float32,
    "fp16": torch.float16,
    "bf16": torch.bfloat16,
}


def init_weights_vit(module: nn.Module, name: str = ""):
    """Per-module initializer applied via `named_apply`: trunc-normal for
    Linear layers, `reset_parameters` for norm / LayerScale / PatchEmbed."""
    if isinstance(module, nn.Linear):
        torch.nn.init.trunc_normal_(module.weight, std=0.02)
        if module.bias is not None:
            nn.init.zeros_(module.bias)
    if isinstance(module, nn.LayerNorm):
        module.reset_parameters()
    if isinstance(module, LayerScale):
        module.reset_parameters()
    if isinstance(module, PatchEmbed):
        module.reset_parameters()
    if isinstance(module, RMSNorm):
        module.reset_parameters()

def build_model(cfg):
    """Build the model named by `cfg.model.name` (must be one of the factory
    functions in this module, e.g. vit_large) and initialize its weights.
    NOTE(review): device is hard-coded to 'cuda'."""
    vit_kwargs = dict(**cfg.model)
    vit_kwargs['device'] = 'cuda'
    # model = model_file.__dict__[cfg.model.name](**vit_kwargs)
    model = globals()[cfg.model.name](**vit_kwargs)
    model.init_weights()
    return model

class MultiViewMaskedAutoEncoder(nn.Module):
    """MAE-style masked autoencoder over multiple views of a scene.

    The encoder processes each frame independently (masked patches dropped);
    the decoder alternates frame-wise and global (cross-frame) attention
    before predicting the masked pixels.
    """

    def __init__(
        self,
        *,
        img_size: int = 224,
        patch_size: int = 16,
        in_chans: int = 3,
        embed_dim: int = 1024,
        depth: int = 24,
        num_heads: int = 16,
        decoder_embed_dim: int = 512,
        decoder_depth: int = 8,
        decoder_num_heads: int = 16,
        norm_pix_loss: bool = True,
        patch_embed: Literal['conv', 'dinov3'] = 'conv',

        pos_embed_rope_base: float = 100.0,
        pos_embed_rope_min_period: float | None = None,
        pos_embed_rope_max_period: float | None = None,
        pos_embed_rope_normalize_coords: Literal["min", "max", "separate"] = "separate",
        pos_embed_rope_shift_coords: float | None = None,
        pos_embed_rope_jitter_coords: float | None = None,
        pos_embed_rope_rescale_coords: float | None = None,
        pos_embed_rope_dtype: str = "bf16",

        ffn_ratio: float = 4.0,
        qkv_bias: bool = True,
        drop_path_rate: float = 0.0,
        layerscale_init: float | None = None,
        norm_layer: str = "layernorm",
        ffn_layer: str = "mlp",
        ffn_bias: bool = True,
        proj_bias: bool = True,
        n_storage_tokens: int = 0,
        mask_k_bias: bool = False,
        device: Any | None = None,
        **ignored_kwargs,
    ):
        super().__init__()
        # Tolerate (but report) unknown config keys.
        if len(ignored_kwargs) > 0:
            logger.warning(f"Ignored kwargs: {ignored_kwargs}")
        del ignored_kwargs

        norm_layer_cls = norm_layer_dict[norm_layer]

        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        self.n_blocks = depth
        self.num_heads = num_heads
        self.patch_size = patch_size
        self.norm_pix_loss = norm_pix_loss

        # --------------------------------------------------------------------------
        # MAE encoder specifics
        if patch_embed == "conv":
            patch_embed_cls = PatchEmbed
        elif patch_embed == "dinov3":
            patch_embed_cls = DINOv3PatchEmbed
        else:
            raise ValueError(f"Unknown patch embedding type: {patch_embed}")
        # flatten_embedding=False: patch embed returns [B, H', W', C] so the
        # grid size is available for RoPE.
        self.patch_embed = patch_embed_cls(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            flatten_embedding=False,
        )
        self.cls_token = nn.Parameter(torch.empty(1, 1, embed_dim, device=device))
        self.n_storage_tokens = n_storage_tokens
        if self.n_storage_tokens > 0:
            self.storage_tokens = nn.Parameter(torch.empty(1, n_storage_tokens, embed_dim, device=device))
        logger.info(f"using base={pos_embed_rope_base} for rope new")
        logger.info(f"using min_period={pos_embed_rope_min_period} for rope new")
        logger.info(f"using max_period={pos_embed_rope_max_period} for rope new")
        logger.info(f"using normalize_coords={pos_embed_rope_normalize_coords} for rope new")
        logger.info(f"using shift_coords={pos_embed_rope_shift_coords} for rope new")
        logger.info(f"using rescale_coords={pos_embed_rope_rescale_coords} for rope new")
        logger.info(f"using jitter_coords={pos_embed_rope_jitter_coords} for rope new")
        logger.info(f"using dtype={pos_embed_rope_dtype} for rope new")

        # Shared RoPE factory: encoder and decoder instances differ only in
        # embed_dim / num_heads.
        rope_cls = partial(
            RopePositionEmbedding,
            base=pos_embed_rope_base,
            min_period=pos_embed_rope_min_period,
            max_period=pos_embed_rope_max_period,
            normalize_coords=pos_embed_rope_normalize_coords,
            shift_coords=pos_embed_rope_shift_coords,
            jitter_coords=pos_embed_rope_jitter_coords,
            rescale_coords=pos_embed_rope_rescale_coords,
            dtype=dtype_dict[pos_embed_rope_dtype],
            device=device,
        )
        self.rope_embed = rope_cls(
            embed_dim=embed_dim,
            num_heads=num_heads,
        )
        logger.info(f"using {ffn_layer} layer as FFN")
        ffn_layer_cls = ffn_layer_dict[ffn_layer]

        block_cls = partial(SelfAttentionBlock,
            ffn_ratio=ffn_ratio,
            qkv_bias=qkv_bias,
            proj_bias=proj_bias,
            ffn_bias=ffn_bias,
            drop_path=drop_path_rate,
            norm_layer=norm_layer_cls,
            act_layer=nn.GELU,
            ffn_layer=ffn_layer_cls,
            init_values=layerscale_init,
            mask_k_bias=mask_k_bias,
            device=device,
        )
        self.blocks = nn.ModuleList([block_cls(dim=embed_dim, num_heads=num_heads) for i in range(depth)])
        self.norm = norm_layer_cls(embed_dim)

        # --------------------------------------------------------------------------
        # MAE decoder specifics
        self.rope_embed_decoder = rope_cls(
            embed_dim=decoder_embed_dim,
            num_heads=decoder_num_heads,
        )
        self.decoder_embed = nn.Linear(embed_dim, decoder_embed_dim, bias=True, device=device)
        self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim, device=device))

        # Alternating-attention decoder: half the depth is frame-local blocks,
        # the other half is global (all frames jointly) blocks.
        self.decoder_frame_blocks = nn.ModuleList([
            block_cls(dim=decoder_embed_dim, num_heads=decoder_num_heads)
            for i in range(decoder_depth//2)])

        self.decoder_global_blocks = nn.ModuleList([
            block_cls(dim=decoder_embed_dim, num_heads=decoder_num_heads)
            for i in range(decoder_depth//2)])

        self.decoder_norm = norm_layer_cls(decoder_embed_dim)
        self.decoder_pred = nn.Linear(decoder_embed_dim, patch_size**2 * in_chans, bias=True, device=device)  # decoder to patch
        # --------------------------------------------------------------------------

    def init_weights(self):
        """Initialize all weights (RoPE periods, tokens, then per-module init)."""
        self.rope_embed._init_weights()
        nn.init.normal_(self.cls_token, std=0.02)
        if self.n_storage_tokens > 0:
            nn.init.normal_(self.storage_tokens, std=0.02)
        # nn.init.zeros_(self.mask_token)
        nn.init.normal_(self.mask_token, std=.02)
        named_apply(init_weights_vit, self)

    def patchify(self, imgs):
        """
        imgs: (N, 3, H, W)
        x: (N, L, patch_size**2 *3)
        """
        p = self.patch_embed.patch_size[0]
        # assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % p == 0
        assert imgs.shape[2] % p == 0 and imgs.shape[3] % p == 0

        h, w = imgs.shape[2] // p, imgs.shape[3] // p
        x = imgs.reshape(shape=(imgs.shape[0], 3, h, p, w, p))
        x = torch.einsum('nchpwq->nhwpqc', x)
        x = x.reshape(shape=(imgs.shape[0], h * w, p**2 * 3))
        return x

    def unpatchify(self, x):
        """
        x: (N, L, patch_size**2 *3)
        imgs: (N, 3, H, W)

        NOTE(review): assumes a SQUARE patch grid (h == w == sqrt(L)); the
        final reshape uses h for both spatial dims.
        """
        p = self.patch_embed.patch_size[0]
        h = w = int(x.shape[1]**.5)
        assert h * w == x.shape[1]

        x = x.reshape(shape=(x.shape[0], h, w, p, p, 3))
        x = torch.einsum('nhwpqc->nchpwq', x)
        imgs = x.reshape(shape=(x.shape[0], 3, h * p, h * p))
        return imgs

    def random_masking(self, x, mask_ratio):
        """
        Perform per-sample random masking by per-sample shuffling.
        Per-sample shuffling is done by argsort random noise.
        x: [N, L, D], sequence
        Returns (x_masked, mask, ids_restore, ids_keep) where mask is 0 for
        kept and 1 for removed patches, in the ORIGINAL patch order.
        """
        N, L, D = x.shape  # batch, length, dim
        len_keep = int(L * (1 - mask_ratio))

        noise = torch.rand(N, L, device=x.device)  # noise in [0, 1]

        # sort noise for each sample
        ids_shuffle = torch.argsort(noise, dim=1)  # ascend: small is keep, large is remove
        ids_restore = torch.argsort(ids_shuffle, dim=1)

        # keep the first subset
        ids_keep = ids_shuffle[:, :len_keep]
        x_masked = torch.gather(x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, D))

        # generate the binary mask: 0 is keep, 1 is remove
        mask = torch.ones([N, L], device=x.device)
        mask[:, :len_keep] = 0
        # unshuffle to get the binary mask
        mask = torch.gather(mask, dim=1, index=ids_restore)

        return x_masked, mask, ids_restore, ids_keep

    def forward_encoder(self, x, mask_ratio, return_all_blocks=False):
        """Encode frames. With return_all_blocks=True no masking is applied
        and the (un-normalized) output of every block is returned as a list;
        otherwise returns (latent, mask, ids_restore)."""
        # embed patches
        SB, C_in, H, W = x.shape
        x = self.patch_embed(x)
        # Patch embed returns [SB, H', W', C]; rope needs the grid size.
        rope_sincos = self.rope_embed(H=x.shape[1], W=x.shape[2])
        x = x.flatten(1, 2)  # [SB, L, C], with L=H*W

        # masking: length -> length * mask_ratio
        if not return_all_blocks:
            x, mask, ids_restore, ids_keep = self.random_masking(x, mask_ratio)

            # Let's just drop the masked patches in the rope
            sin, cos = rope_sincos
            sin_vis, cos_vis = sin[ids_keep], cos[ids_keep]  # [B, N_vis, D_head]
            sin_vis, cos_vis = sin_vis.unsqueeze(1).repeat(1, self.num_heads, 1, 1), cos_vis.unsqueeze(1).repeat(1, self.num_heads, 1, 1)

            rope_sincos = (sin_vis, cos_vis)

        # append cls token
        # NOTE(review): rope_sincos has one entry per PATCH token only; the
        # attention block presumably skips rope for the prepended cls token —
        # confirm against SelfAttentionBlock.
        cls_tokens = self.cls_token.expand(x.shape[0], -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)

        # apply Transformer blocks
        if return_all_blocks:
            out = []
            for blk in self.blocks:
                x = blk(x, rope_sincos)
                out.append(x)
            return out
        else:
            for blk in self.blocks:
                x = blk(x, rope_sincos)
            x = self.norm(x)
            return x, mask, ids_restore

    def forward_decoder(self, x, ids_restore, B: int, S: int, H=None, W=None):
        """Decode: re-insert mask tokens, run alternating frame/global
        attention, and predict per-patch pixels. Returns [B*S, L, p*p*3]."""
        # embed tokens
        x = self.decoder_embed(x)
        rope_sincos = self.rope_embed_decoder(H=H//self.patch_size, W=W//self.patch_size)
        # append mask tokens to sequence
        mask_tokens = self.mask_token.repeat(x.shape[0], ids_restore.shape[1] + 1 - x.shape[1], 1)
        x_ = torch.cat([x[:, 1:, :], mask_tokens], dim=1)  # no cls token
        x_ = torch.gather(x_, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, x.shape[2]))  # unshuffle
        x = torch.cat([x[:, :1, :], x_], dim=1)  # append cls token

        _, P, C = x.shape

        # apply alternating attention
        # NOTE(review): the same per-frame rope_sincos is passed to the global
        # blocks over S*P tokens — relies on the block's internal handling;
        # confirm against SelfAttentionBlock.
        for frame_block, global_block in zip(self.decoder_frame_blocks, self.decoder_global_blocks):
            # Frame-wise attention
            if x.shape != (B * S, P, C):
                x = x.view(B, S, P, C).view(B * S, P, C)
            x = frame_block(x, rope_sincos)

            # Global attention
            x = x.view(B, S, P, C).view(B, S * P, C)
            x = global_block(x, rope_sincos)

        x = x.view(B, S, P, C).view(B*S, P, C)
        x = self.decoder_norm(x)

        # predictor projection
        x = self.decoder_pred(x)

        # remove cls token
        x = x[:, 1:, :]

        return x

    def forward_loss(self, imgs, pred, mask):
        """
        imgs: [N, 3, H, W]
        pred: [N, L, p*p*3]
        mask: [N, L], 0 is keep, 1 is remove,
        Mean squared error averaged over the REMOVED patches only.
        """
        target = self.patchify(imgs)
        if self.norm_pix_loss:
            # Normalize each target patch to zero mean / unit variance.
            mean = target.mean(dim=-1, keepdim=True)
            var = target.var(dim=-1, keepdim=True)
            target = (target - mean) / (var + 1.e-6)**.5

        loss = (pred - target) ** 2
        loss = loss.mean(dim=-1)  # [N, L], mean loss per patch

        loss = (loss * mask).sum() / mask.sum()  # mean loss on removed patches
        return loss

    def forward(self, imgs, mask_ratio=0.75):
        """imgs: [B, S, C, H, W] (B scenes x S views). Returns (loss, pred, mask)."""
        B, S, C_in, H, W = imgs.shape
        imgs = imgs.view(B*S, C_in, H, W)  # [B*S, C, H, W]
        latent, mask, ids_restore = self.forward_encoder(imgs, mask_ratio)
        pred = self.forward_decoder(latent, ids_restore, B, S, H=H, W=W)  # [N, L, p*p*3]
        loss = self.forward_loss(imgs, pred, mask)
        return loss, pred, mask

    def forward_features(self, x, masks=None):
        """Feature-extraction entry point (no masking): normalized patch
        tokens from the last encoder block, cls token dropped."""
        out = self.forward_encoder(x, 0, return_all_blocks=True)[-1]
        x_norm = self.norm(out)
        return {
            "x_norm_patchtokens": x_norm[:, 1:],
        }

    @property
    def device(self):
        # Any registered parameter works; mask_token is always present.
        return self.mask_token.device

def vit_base(patch_size=16, **kwargs):
    """ViT-B encoder (768/12/12) with a 512-wide, 8-deep decoder."""
    model = MultiViewMaskedAutoEncoder(
        patch_size=patch_size,
        embed_dim=768,
        depth=12,
        num_heads=12,
        decoder_embed_dim=512,
        decoder_depth=8,
        decoder_num_heads=16,
        **kwargs,
    )
    return model


def vit_large(patch_size=16, **kwargs):
    """ViT-L encoder (1024/24/16) with a 768-wide, 12-deep decoder."""
    model = MultiViewMaskedAutoEncoder(
        patch_size=patch_size,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        decoder_embed_dim=768,
        decoder_depth=12,
        decoder_num_heads=16,
        **kwargs,
    )
    return model

def vit_huge(patch_size=16, **kwargs):
    """ViT-H encoder (1280/32/16) with a 1024-wide, 24-deep decoder."""
    model = MultiViewMaskedAutoEncoder(
        patch_size=patch_size,
        embed_dim=1280,
        depth=32,
        num_heads=16,
        decoder_embed_dim=1024,
        decoder_depth=24,
        decoder_num_heads=16,
        **kwargs,
    )
    return model
import torch
from torch import Tensor, nn
from typing import Callable, List, Optional, Tuple


def named_apply(
    fn: Callable,
    module: nn.Module,
    name: str = "",
    depth_first: bool = True,
    include_root: bool = False,
) -> nn.Module:
    """Recursively call `fn(module=..., name=...)` on a module tree, passing
    dotted qualified names. `depth_first` controls pre- vs post-order for the
    root; children are always visited with include_root=True."""
    if not depth_first and include_root:
        fn(module=module, name=name)
    for child_name, child in module.named_children():
        qualified = f"{name}.{child_name}" if name else child_name
        named_apply(
            fn=fn,
            module=child,
            name=qualified,
            depth_first=depth_first,
            include_root=True,
        )
    if depth_first and include_root:
        fn(module=module, name=name)
    return module


def cat_keep_shapes(x_list: List[Tensor]) -> Tuple[Tensor, List[Tuple[int]], List[int]]:
    """Flatten each tensor to 2-D (tokens x channels) and concatenate along
    the token dim, remembering the original shapes and token counts so the
    operation can be undone by `uncat_with_shapes`."""
    shapes: List[Tuple[int]] = []
    num_tokens: List[int] = []
    flat_parts: List[Tensor] = []
    for x in x_list:
        shapes.append(x.shape)
        # Token count = product of every dim except the channel (last) dim.
        num_tokens.append(x.shape[:-1].numel())
        flat_parts.append(x.flatten(0, -2))
    return torch.cat(flat_parts), shapes, num_tokens


def uncat_with_shapes(flattened: Tensor, shapes: List[Tuple[int]], num_tokens: List[int]) -> List[Tensor]:
    """Inverse of `cat_keep_shapes`: split along the token dim and restore the
    original leading dims (the channel dim may have changed in between)."""
    chunks = torch.split_with_sizes(flattened, num_tokens, dim=0)
    channels = flattened.shape[-1]
    return [chunk.reshape(*shape[:-1], channels) for chunk, shape in zip(chunks, shapes)]


def named_replace(
    fn: Callable,
    module: nn.Module,
    name: str = "",
    depth_first: bool = True,
    include_root: bool = False,
) -> nn.Module:
    """Like `named_apply`, but `fn` RETURNS a (possibly new) module that
    replaces the visited one in its parent via setattr."""
    if not depth_first and include_root:
        module = fn(module=module, name=name)
    # Snapshot children: the loop mutates the module via setattr.
    for child_name, child in list(module.named_children()):
        qualified = f"{name}.{child_name}" if name else child_name
        replacement = named_replace(
            fn=fn,
            module=child,
            name=qualified,
            depth_first=depth_first,
            include_root=True,
        )
        setattr(module, child_name, replacement)

    if depth_first and include_root:
        module = fn(module=module, name=name)
    return module
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import logging
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.checkpoint import checkpoint
from typing import Optional, Tuple, Union, List, Dict, Any

from vggt.layers import PatchEmbed
from vggt.layers.block import Block
from vggt.layers.rope import RotaryPositionEmbedding2D, PositionGetter
from vggt.layers.vision_transformer import vit_small, vit_base, vit_large, vit_giant2
from typing import Literal

logger = logging.getLogger(__name__)

# ImageNet normalization constants applied to inputs before the frozen backbone.
_RESNET_MEAN = [0.485, 0.456, 0.406]
_RESNET_STD = [0.229, 0.224, 0.225]


class Aggregator(nn.Module):
    """
    The Aggregator applies alternating-attention over input frames,
    as described in VGGT: Visual Geometry Grounded Transformer.

    Remember to set model.train() to enable gradient checkpointing to reduce memory usage.

    Args:
        img_size (int): Image size in pixels.
        patch_size (int): Size of each patch for PatchEmbed.
        embed_dim (int): Dimension of the token embeddings.
        depth (int): Number of blocks.
        num_heads (int): Number of attention heads.
        mlp_ratio (float): Ratio of MLP hidden dim to embedding dim.
        num_register_tokens (int): Number of register tokens.
        block_fn (nn.Module): The block type used for attention (Block by default).
        qkv_bias (bool): Whether to include bias in QKV projections.
        proj_bias (bool): Whether to include bias in the output projection.
        ffn_bias (bool): Whether to include bias in MLP layers.
        patch_embed (str): Type of patch embed. One of "mum", "dinov3", "crocov2", "dinov2".
        aa_order (list[str]): The order of alternating attention, e.g. ["frame", "global"].
        aa_block_size (int): How many blocks to group under each attention type before switching. If not necessary, set to 1.
        qk_norm (bool): Whether to apply QK normalization.
        rope_freq (int): Base frequency for rotary embedding. -1 to disable.
        init_values (float): Init scale for layer scale.
    """

    def __init__(
        self,
        img_size=512,
        patch_size=16,
        embed_dim=384,
        depth=6,
        num_heads=6,
        mlp_ratio=4.0,
        num_register_tokens=0,
        block_fn=Block,
        qkv_bias=True,
        proj_bias=True,
        ffn_bias=True,
        aa_order=["frame", "global"],
        aa_block_size=1,
        qk_norm=True,
        rope_freq=100,
        init_values=0.01,
        patch_embed: Literal["mum", "dinov3", "crocov2", "dinov2"] = "dinov3",
    ):
        super().__init__()

        self.__build_patch_embed__(patch_embed, img_size, patch_size, num_register_tokens, embed_dim=embed_dim)

        # Initialize rotary position embedding if frequency > 0
        self.rope = RotaryPositionEmbedding2D(frequency=rope_freq) if rope_freq > 0 else None
        self.position_getter = PositionGetter() if self.rope is not None else None

        # Per-frame (local) attention blocks.
        self.frame_blocks = nn.ModuleList(
            [
                block_fn(
                    dim=embed_dim,
                    num_heads=num_heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    proj_bias=proj_bias,
                    ffn_bias=ffn_bias,
                    init_values=init_values,
                    qk_norm=qk_norm,
                    rope=self.rope,
                )
                for _ in range(depth)
            ]
        )

        # Cross-frame (global) attention blocks.
        self.global_blocks = nn.ModuleList(
            [
                block_fn(
                    dim=embed_dim,
                    num_heads=num_heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    proj_bias=proj_bias,
                    ffn_bias=ffn_bias,
                    init_values=init_values,
                    qk_norm=qk_norm,
                    rope=self.rope,
                )
                for _ in range(depth)
            ]
        )

        self.depth = depth
        self.aa_order = aa_order
        self.patch_size = patch_size
        self.aa_block_size = aa_block_size

        # Validate that depth is divisible by aa_block_size
        if self.depth % self.aa_block_size != 0:
            raise ValueError(f"depth ({depth}) must be divisible by aa_block_size ({aa_block_size})")

        self.aa_block_num = self.depth // self.aa_block_size

        # Note: We have two camera tokens, one for the first frame and one for the rest
        # The same applies for register tokens
        self.camera_token = nn.Parameter(torch.randn(1, 2, 1, embed_dim))
        self.register_token = nn.Parameter(torch.randn(1, 2, num_register_tokens, embed_dim))

        # The patch tokens start after the camera and register tokens
        self.patch_start_idx = 1 + num_register_tokens

        # Initialize parameters with small values
        nn.init.normal_(self.camera_token, std=1e-6)
        nn.init.normal_(self.register_token, std=1e-6)

        # Register normalization constants as buffers
        for name, value in (("_resnet_mean", _RESNET_MEAN), ("_resnet_std", _RESNET_STD)):
            self.register_buffer(name, torch.FloatTensor(value).view(1, 1, 3, 1, 1), persistent=False)

        self.use_reentrant = False  # hardcoded to False

    def __build_patch_embed__(
        self,
        patch_embed,
        img_size,
        patch_size,
        num_register_tokens,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        block_chunks=0,
        init_values=1.0,
        embed_dim=1024,
    ):
        """
        Build the (frozen) patch-embedding backbone selected by `patch_embed`
        and a linear projection from the backbone width to `embed_dim`.

        Raises:
            NotImplementedError: For an unrecognized `patch_embed` value.
        """
        if patch_embed == "dinov2":
            # FIX: previously this branch looked up `vit_models[patch_embed]` in a
            # dict keyed by full model names ("dinov2_vitl14_reg", ...), while
            # `patch_embed` can only be the literal "dinov2" — the lookup always
            # raised KeyError. Use vit_large directly; it matches the hardcoded
            # 1024-dim `patch_embed_dim` below (all supported backbones are ViT-L).
            patch_embed_vit = vit_large(
                img_size=img_size,
                patch_size=patch_size,
                num_register_tokens=num_register_tokens,
                interpolate_antialias=interpolate_antialias,
                interpolate_offset=interpolate_offset,
                block_chunks=block_chunks,
                init_values=init_values,
            )
        elif patch_embed == "dinov3":
            # NOTE(review): cluster-specific absolute paths — should come from
            # configuration so the model is loadable outside this environment.
            patch_embed_vit = torch.hub.load("/mimer/NOBACKUP/groups/snic2022-6-266/davnords/dinov3", "dinov3_vitl16", source='local', weights="/mimer/NOBACKUP/groups/snic2022-6-266/davnords/mv-ssl/pretrained_models/dinov3_vitl16_pretrain_lvd1689m-8aa4cbdd.pth")
            patch_embed_vit.patch_size = 16
            patch_embed_vit.device = patch_embed_vit.cls_token.device
        elif patch_embed == "mum":
            from vggt.encoders.mum import vit_large
            patch_embed_vit = vit_large().eval()
            # NOTE(review): hardcoded checkpoint path — see note above.
            pretrained_weights = "/mimer/NOBACKUP/groups/snic2022-6-266/davnords/mv-ssl/pretrained_models/MuM_ViTLarge_BaseDecoder_500k.pth"
            ckpt = torch.load(pretrained_weights, map_location='cpu', weights_only=False)
            patch_embed_vit.load_state_dict(ckpt['model'], strict=True)
        elif patch_embed == "crocov2":
            from vggt.encoders.croco import CroCoNet
            # NOTE(review): hardcoded checkpoint path — see note above.
            ckpt = torch.load('/mimer/NOBACKUP/groups/snic2022-6-266/davnords/mv-ssl/pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth', 'cpu')
            patch_embed_vit = CroCoNet(**ckpt.get('croco_kwargs', {})).eval()
            patch_embed_vit.load_state_dict(ckpt['model'], strict=True)
        else:
            raise NotImplementedError("Invalid value for patch_embed")

        # The backbone is used as a frozen feature extractor.
        for param in patch_embed_vit.parameters():
            param.requires_grad = False

        self.patch_embed = patch_embed_vit

        # All supported backbones emit 1024-dim tokens (ViT-Large); project down
        # to the aggregator width when they differ.
        patch_embed_dim = 1024
        self.proj = nn.Linear(patch_embed_dim, embed_dim) if patch_embed_dim != embed_dim else nn.Identity()

    def forward(self, images: torch.Tensor) -> Tuple[List[torch.Tensor], int]:
        """
        Args:
            images (torch.Tensor): Input images with shape [B, S, 3, H, W], in range [0, 1].
                B: batch size, S: sequence length, 3: RGB channels, H: height, W: width

        Returns:
            (list[torch.Tensor], int):
                The list of outputs from the attention blocks,
                and the patch_start_idx indicating where patch tokens begin.
        """
        B, S, C_in, H, W = images.shape

        if C_in != 3:
            raise ValueError(f"Expected 3 input channels, got {C_in}")

        # Normalize images and reshape for patch embed
        images = (images - self._resnet_mean) / self._resnet_std

        # Reshape to [B*S, C, H, W] for patch embedding
        images = images.view(B * S, C_in, H, W)
        patch_tokens = self.patch_embed.forward_features(images)

        # DINO-style backbones return a dict; others return the token tensor directly.
        if isinstance(patch_tokens, dict):
            patch_tokens = patch_tokens["x_norm_patchtokens"]
        patch_tokens = self.proj(patch_tokens)

        _, P, C = patch_tokens.shape

        # Expand camera and register tokens to match batch size and sequence length
        camera_token = slice_expand_and_flatten(self.camera_token, B, S)
        register_token = slice_expand_and_flatten(self.register_token, B, S)

        # Concatenate special tokens with patch tokens
        tokens = torch.cat([camera_token, register_token, patch_tokens], dim=1)

        pos = None
        if self.rope is not None:
            pos = self.position_getter(B * S, H // self.patch_size, W // self.patch_size, device=images.device)

        if self.patch_start_idx > 0:
            # do not use position embedding for special tokens (camera and register tokens)
            # so set pos to 0 for the special tokens
            pos = pos + 1
            pos_special = torch.zeros(B * S, self.patch_start_idx, 2).to(images.device).to(pos.dtype)
            pos = torch.cat([pos_special, pos], dim=1)

        # update P because we added special tokens
        _, P, C = tokens.shape

        frame_idx = 0
        global_idx = 0
        output_list = []

        for _ in range(self.aa_block_num):
            for attn_type in self.aa_order:
                if attn_type == "frame":
                    tokens, frame_idx, frame_intermediates = self._process_frame_attention(
                        tokens, B, S, P, C, frame_idx, pos=pos
                    )
                elif attn_type == "global":
                    tokens, global_idx, global_intermediates = self._process_global_attention(
                        tokens, B, S, P, C, global_idx, pos=pos
                    )
                else:
                    raise ValueError(f"Unknown attention type: {attn_type}")

            # NOTE(review): this pairing assumes aa_order contains both "frame"
            # and "global" each cycle; with a different aa_order the variables
            # below would be stale or unbound.
            for i in range(len(frame_intermediates)):
                # concat frame and global intermediates, [B x S x P x 2C]
                concat_inter = torch.cat([frame_intermediates[i], global_intermediates[i]], dim=-1)
                output_list.append(concat_inter)

        # Drop references early to reduce peak memory before heads run.
        del concat_inter
        del frame_intermediates
        del global_intermediates
        return output_list, self.patch_start_idx

    def _process_frame_attention(self, tokens, B, S, P, C, frame_idx, pos=None):
        """
        Process frame attention blocks. We keep tokens in shape (B*S, P, C).
        """
        # If needed, reshape tokens or positions:
        if tokens.shape != (B * S, P, C):
            tokens = tokens.view(B, S, P, C).view(B * S, P, C)

        if pos is not None and pos.shape != (B * S, P, 2):
            pos = pos.view(B, S, P, 2).view(B * S, P, 2)

        intermediates = []

        # by default, self.aa_block_size=1, which processes one block at a time
        for _ in range(self.aa_block_size):
            if self.training:
                # Gradient checkpointing trades compute for memory during training.
                tokens = checkpoint(self.frame_blocks[frame_idx], tokens, pos, use_reentrant=self.use_reentrant)
            else:
                tokens = self.frame_blocks[frame_idx](tokens, pos=pos)
            frame_idx += 1
            intermediates.append(tokens.view(B, S, P, C))

        return tokens, frame_idx, intermediates

    def _process_global_attention(self, tokens, B, S, P, C, global_idx, pos=None):
        """
        Process global attention blocks. We keep tokens in shape (B, S*P, C).
        """
        if tokens.shape != (B, S * P, C):
            tokens = tokens.view(B, S, P, C).view(B, S * P, C)

        if pos is not None and pos.shape != (B, S * P, 2):
            pos = pos.view(B, S, P, 2).view(B, S * P, 2)

        intermediates = []

        # by default, self.aa_block_size=1, which processes one block at a time
        for _ in range(self.aa_block_size):
            if self.training:
                tokens = checkpoint(self.global_blocks[global_idx], tokens, pos, use_reentrant=self.use_reentrant)
            else:
                tokens = self.global_blocks[global_idx](tokens, pos=pos)
            global_idx += 1
            intermediates.append(tokens.view(B, S, P, C))

        return tokens, global_idx, intermediates


def slice_expand_and_flatten(token_tensor, B, S):
    """
    Processes specialized tokens with shape (1, 2, X, C) for multi-frame processing:
    1) Uses the first position (index=0) for the first frame only
    2) Uses the second position (index=1) for all remaining frames (S-1 frames)
    3) Expands both to match batch size B
    4) Concatenates to form (B, S, X, C) where each sequence has 1 first-position token
       followed by (S-1) second-position tokens
    5) Flattens to (B*S, X, C) for processing

    Returns:
        torch.Tensor: Processed tokens with shape (B*S, X, C)
    """

    # Slice out the "query" tokens => shape (1, 1, ...)
    query = token_tensor[:, 0:1, ...].expand(B, 1, *token_tensor.shape[2:])
    # Slice out the "other" tokens => shape (1, S-1, ...)
    others = token_tensor[:, 1:, ...].expand(B, S - 1, *token_tensor.shape[2:])
    # Concatenate => shape (B, S, ...)
    combined = torch.cat([query, others], dim=1)

    # Finally flatten => shape (B*S, ...)
    combined = combined.view(B * S, *combined.shape[2:])
    return combined
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import torch
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin  # used for model hub

from vggt.models.aggregator_small import Aggregator
from vggt.heads.camera_head import CameraHead
from vggt.heads.dpt_head import DPTHead
from vggt.heads.track_head import TrackHead
from typing import Literal

class VGGT(nn.Module, PyTorchModelHubMixin):
    # Small VGGT variant: a compact Aggregator backbone (embed_dim=384, depth=6
    # by default) with the same optional camera/depth/point/track heads as the
    # full model. Each head can be disabled via its enable_* flag, in which
    # case the corresponding attribute is None and its predictions are omitted.
    def __init__(self, img_size=512, patch_size=16, embed_dim=384, depth=6, num_heads=6,
        enable_camera=True, enable_point=True, enable_depth=True, enable_track=True, patch_embed: Literal["mum", "dinov3", "crocov2", "dinov2"]="dinov3",
        ):
        super().__init__()
        self.aggregator = Aggregator(img_size=img_size, patch_size=patch_size, embed_dim=embed_dim, patch_embed=patch_embed, depth=depth, num_heads=num_heads)

        # Heads consume concatenated frame+global tokens, hence dim_in = 2 * embed_dim.
        self.camera_head = CameraHead(dim_in=2 * embed_dim) if enable_camera else None
        # NOTE(review): intermediate_layer_idx=[2,3,4,5] is tied to the default
        # depth=6 (aggregator emits one output per depth step) — confirm these
        # indices remain valid if a caller passes a different depth.
        self.point_head = DPTHead(dim_in=2 * embed_dim, output_dim=4, activation="inv_log", conf_activation="expp1", patch_size=patch_size,
            intermediate_layer_idx=[2,3,4,5]) if enable_point else None
        self.depth_head = DPTHead(dim_in=2 * embed_dim, output_dim=2, activation="exp", conf_activation="expp1", patch_size=patch_size,
            intermediate_layer_idx=[2,3,4,5]) if enable_depth else None
        self.track_head = TrackHead(dim_in=2 * embed_dim, patch_size=patch_size) if enable_track else None

    def forward(self, images: torch.Tensor, query_points: torch.Tensor = None):
        """
        Forward pass of the VGGT model.

        Args:
            images (torch.Tensor): Input images with shape [S, 3, H, W] or [B, S, 3, H, W], in range [0, 1].
                B: batch size, S: sequence length, 3: RGB channels, H: height, W: width
            query_points (torch.Tensor, optional): Query points for tracking, in pixel coordinates.
                Shape: [N, 2] or [B, N, 2], where N is the number of query points.
                Default: None

        Returns:
            dict: A dictionary containing the following predictions:
                - pose_enc (torch.Tensor): Camera pose encoding with shape [B, S, 9] (from the last iteration)
                - depth (torch.Tensor): Predicted depth maps with shape [B, S, H, W, 1]
                - depth_conf (torch.Tensor): Confidence scores for depth predictions with shape [B, S, H, W]
                - world_points (torch.Tensor): 3D world coordinates for each pixel with shape [B, S, H, W, 3]
                - world_points_conf (torch.Tensor): Confidence scores for world points with shape [B, S, H, W]
                - images (torch.Tensor): Original input images, preserved for visualization

            If query_points is provided, also includes:
                - track (torch.Tensor): Point tracks with shape [B, S, N, 2] (from the last iteration), in pixel coordinates
                - vis (torch.Tensor): Visibility scores for tracked points with shape [B, S, N]
                - conf (torch.Tensor): Confidence scores for tracked points with shape [B, S, N]
        """
        # If without batch dimension, add it
        if len(images.shape) == 4:
            images = images.unsqueeze(0)

        if query_points is not None and len(query_points.shape) == 2:
            query_points = query_points.unsqueeze(0)

        aggregated_tokens_list, patch_start_idx = self.aggregator(images)

        predictions = {}

        # Heads run in full precision even under autocast, for numeric stability.
        # NOTE(review): torch.cuda.amp.autocast is deprecated in recent torch in
        # favor of torch.amp.autocast("cuda", ...) — consider migrating once the
        # minimum supported torch version allows it.
        with torch.cuda.amp.autocast(enabled=False):
            if self.camera_head is not None:
                pose_enc_list = self.camera_head(aggregated_tokens_list)
                predictions["pose_enc"] = pose_enc_list[-1]  # pose encoding of the last iteration
                predictions["pose_enc_list"] = pose_enc_list

            if self.depth_head is not None:
                depth, depth_conf = self.depth_head(
                    aggregated_tokens_list, images=images, patch_start_idx=patch_start_idx
                )
                predictions["depth"] = depth
                predictions["depth_conf"] = depth_conf

            if self.point_head is not None:
                pts3d, pts3d_conf = self.point_head(
                    aggregated_tokens_list, images=images, patch_start_idx=patch_start_idx
                )
                predictions["world_points"] = pts3d
                predictions["world_points_conf"] = pts3d_conf

        # Tracking runs only when both the head exists and query points are given.
        if self.track_head is not None and query_points is not None:
            track_list, vis, conf = self.track_head(
                aggregated_tokens_list, images=images, patch_start_idx=patch_start_idx, query_points=query_points
            )
            predictions["track"] = track_list[-1]  # track of the last iteration
            predictions["vis"] = vis
            predictions["conf"] = conf

        if not self.training:
            predictions["images"] = images  # store the images for visualization during inference

        return predictions