Commit 9dadff2
Author: zzjlxzr2012
1 parent 50e0e3c

feat(colmap): integrate EXIF pose prior, GPS alignment, model normalization, and geo transforms export
5 files changed: +289 additions, -35 deletions

nerfstudio/process_data/colmap_converter_to_nerfstudio_dataset.py

Lines changed: 34 additions & 3 deletions

@@ -31,10 +31,10 @@ class ColmapConverterToNerfstudioDataset(BaseConverterToNerfstudioDataset):
 
     camera_type: Literal["perspective", "fisheye", "equirectangular", "pinhole", "simple_pinhole"] = "perspective"
     """Camera model to use."""
-    matching_method: Literal["exhaustive", "sequential", "vocab_tree"] = "vocab_tree"
+    matching_method: Literal["exhaustive", "sequential", "vocab_tree", "spatial"] = "vocab_tree"
     """Feature matching method to use. Vocab tree is recommended for a balance of speed
     and accuracy. Exhaustive is slower but more accurate. Sequential is faster but
-    should only be used for videos."""
+    should only be used for videos. Spatial can leverage EXIF GPS priors for pairing."""
     sfm_tool: Literal["any", "colmap", "hloc"] = "any"
     """Structure from motion tool to use. Colmap will use sift features, hloc can use
     many modern methods such as superpoint features and superglue matcher"""
@@ -104,6 +104,26 @@ class ColmapConverterToNerfstudioDataset(BaseConverterToNerfstudioDataset):
     use_single_camera_mode: bool = True
     """Whether to assume all images taken with the same camera characteristics, set to False for multiple cameras in colmap (only works with hloc sfm_tool).
     """
+    # New options for pose priors and alignment
+    use_pose_prior: bool = False
+    """If True, use EXIF pose priors by running pose_prior_mapper and optionally align to priors."""
+    prior_position_std: float = 2.0
+    """Standard deviation (meters) for x/y/z prior used by pose_prior_mapper."""
+    overwrite_priors_covariance: bool = True
+    """Whether to overwrite priors covariance in database when running pose_prior_mapper."""
+    align_model_to_priors: bool = False
+    """If True, run model_aligner to align the reconstruction to GPS priors (writes back into sparse/0)."""
+    alignment_max_error: Optional[float] = None
+    """Max alignment error for model_aligner. Defaults to prior_position_std if not set."""
+    # Normalization options
+    normalize_model: bool = False
+    """If True, apply model_transformer to center and scale the reconstructed model for numeric stability."""
+    normalization_center: Literal["bbox", "mean"] = "bbox"
+    """How to compute the model center for normalization (bbox center or mean point)."""
+    normalization_target_diagonal: float = 4.0
+    """Target diagonal length (meters) for the normalized model if no explicit scale is provided."""
+    normalization_scale: Optional[float] = None
+    """Explicit normalization scale; if set, overrides normalization_target_diagonal."""
 
     @staticmethod
     def default_colmap_path() -> Path:
@@ -219,6 +239,15 @@ def _run_colmap(self, mask_path: Optional[Path] = None):
                 matching_method=self.matching_method,
                 refine_intrinsics=self.refine_intrinsics,
                 colmap_cmd=self.colmap_cmd,
+                use_pose_prior=self.use_pose_prior,
+                prior_position_std=self.prior_position_std,
+                overwrite_priors_covariance=self.overwrite_priors_covariance,
+                align_model_to_priors=self.align_model_to_priors,
+                alignment_max_error=self.alignment_max_error,
+                normalize_model=self.normalize_model,
+                normalization_center=self.normalization_center,
+                normalization_target_diagonal=self.normalization_target_diagonal,
+                normalization_scale=self.normalization_scale,
             )
         elif sfm_tool == "hloc":
             if mask_path is not None:
@@ -227,12 +256,14 @@ def _run_colmap(self, mask_path: Optional[Path] = None):
             assert feature_type is not None
             assert matcher_type is not None
            assert matcher_type != "NN"  # Only used for colmap.
+            # hloc does not support 'spatial' matching_method; map it to 'vocab_tree' for compatibility
+            hloc_matching_method = self.matching_method if self.matching_method != "spatial" else "vocab_tree"
             hloc_utils.run_hloc(
                 image_dir=image_dir,
                 colmap_dir=self.absolute_colmap_path,
                 camera_model=CAMERA_MODELS[self.camera_type],
                 verbose=self.verbose,
-                matching_method=self.matching_method,
+                matching_method=hloc_matching_method,
                 feature_type=feature_type,
                 matcher_type=matcher_type,
                 refine_pixsfm=self.refine_pixsfm,
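
Note (not part of the diff): the new dataclass fields above are passed straight through to colmap_utils.run_colmap inside _run_colmap. A rough usage sketch in Python; ImagesToNerfstudioDataset, its data/output_dir fields, and main() come from nerfstudio's existing processing API and are assumed here, while the new keyword names are taken from this commit:

from pathlib import Path

from nerfstudio.process_data.images_to_nerfstudio_dataset import ImagesToNerfstudioDataset

# Hedged sketch: exercise the pose-prior / normalization options end to end.
converter = ImagesToNerfstudioDataset(
    data=Path("photos/"),                 # images carrying EXIF GPS tags
    output_dir=Path("outputs/my-scene"),
    matching_method="spatial",            # new option: pair images by GPS proximity
    use_pose_prior=True,                  # run pose_prior_mapper instead of mapper
    prior_position_std=2.0,               # rough GPS accuracy in meters
    align_model_to_priors=True,           # model_aligner writes back into sparse/0
    normalize_model=True,                 # recenter/rescale via model_transformer
    normalization_target_diagonal=4.0,
)
converter.main()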

nerfstudio/process_data/colmap_utils.py

Lines changed: 98 additions & 10 deletions

@@ -73,7 +73,7 @@ def get_vocab_tree() -> Path:
     vocab_tree_filename = Path(appdirs.user_data_dir("nerfstudio")) / "vocab_tree.fbow"
 
     if not vocab_tree_filename.exists():
-        r = requests.get("https://demuc.de/colmap/vocab_tree_flickr100K_words32K.bin", stream=True)
+        r = requests.get("http://10.126.13.216:9000/root/firmware/vocab_tree_faiss_flickr100K_words256K.bin", stream=True)
         vocab_tree_filename.parent.mkdir(parents=True, exist_ok=True)
         with open(vocab_tree_filename, "wb") as f:
             total_length = r.headers.get("content-length")
@@ -96,9 +96,18 @@ def run_colmap(
     camera_mask_path: Optional[Path] = None,
     gpu: bool = True,
     verbose: bool = False,
-    matching_method: Literal["vocab_tree", "exhaustive", "sequential"] = "vocab_tree",
+    matching_method: Literal["vocab_tree", "exhaustive", "sequential", "spatial"] = "vocab_tree",
     refine_intrinsics: bool = True,
     colmap_cmd: str = "colmap",
+    use_pose_prior: bool = False,
+    prior_position_std: float = 2.0,
+    overwrite_priors_covariance: bool = True,
+    align_model_to_priors: bool = False,
+    alignment_max_error: Optional[float] = None,
+    normalize_model: bool = False,
+    normalization_center: Literal["bbox", "mean"] = "bbox",
+    normalization_target_diagonal: float = 4.0,
+    normalization_scale: Optional[float] = None,
 ) -> None:
     """Runs COLMAP on the images.
 
@@ -112,6 +121,15 @@ def run_colmap(
         matching_method: Matching method to use.
         refine_intrinsics: If True, refine intrinsics.
         colmap_cmd: Path to the COLMAP executable.
+        use_pose_prior: If True, use pose_prior_mapper to incorporate EXIF pose priors.
+        prior_position_std: Prior position standard deviation in meters for x/y/z.
+        overwrite_priors_covariance: If True, overwrite priors covariance in database when mapping.
+        align_model_to_priors: If True, run model_aligner to align the reconstruction to GPS priors.
+        alignment_max_error: Max alignment error (falls back to prior_position_std if None).
+        normalize_model: If True, apply a similarity transform to center and scale the model with model_transformer.
+        normalization_center: How to compute center (bbox center or mean point).
+        normalization_target_diagonal: Target diagonal length (meters) to scale the model to (if normalization_scale not given).
+        normalization_scale: Explicit scale factor. If provided, overrides normalization_target_diagonal.
     """
 
     colmap_version = get_colmap_version(colmap_cmd)
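
Note (illustrative, not from the diff): the same options can drive run_colmap directly. Only the new keyword arguments come from this commit; the directory layout is a placeholder, and CAMERA_MODELS is the existing mapping already used by the converter that calls into this module:

from pathlib import Path

from nerfstudio.process_data import colmap_utils
from nerfstudio.process_data.process_data_utils import CAMERA_MODELS

# Hedged sketch of a direct call with the new keyword arguments.
colmap_utils.run_colmap(
    image_dir=Path("scene/images"),        # placeholder paths, not from the diff
    colmap_dir=Path("scene/colmap"),
    camera_model=CAMERA_MODELS["perspective"],
    matching_method="spatial",
    use_pose_prior=True,
    prior_position_std=2.0,
    align_model_to_priors=True,
    normalize_model=True,
)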
@@ -126,7 +144,7 @@
         f"--image_path {image_dir}",
         "--ImageReader.single_camera 1",
         f"--ImageReader.camera_model {camera_model.value}",
-        f"--SiftExtraction.use_gpu {int(gpu)}",
+        # f"--SiftExtraction.use_gpu={bool(gpu)}",
     ]
     if camera_mask_path is not None:
         feature_extractor_cmd.append(f"--ImageReader.camera_mask_path {camera_mask_path}")
@@ -140,7 +158,7 @@
     feature_matcher_cmd = [
         f"{colmap_cmd} {matching_method}_matcher",
         f"--database_path {colmap_dir / 'database.db'}",
-        f"--SiftMatching.use_gpu {int(gpu)}",
+        # f"--SiftMatching.use_gpu={bool(gpu)}",
     ]
     if matching_method == "vocab_tree":
         vocab_tree_filename = get_vocab_tree()
@@ -150,19 +168,32 @@
         run_command(feature_matcher_cmd, verbose=verbose)
     CONSOLE.log("[bold green]:tada: Done matching COLMAP features.")
 
-    # Bundle adjustment
+    # Mapping / bundle adjustment
     sparse_dir = colmap_dir / "sparse"
     sparse_dir.mkdir(parents=True, exist_ok=True)
-    mapper_cmd = [
-        f"{colmap_cmd} mapper",
+
+    # Choose mapper variant
+    mapper_command_name = "pose_prior_mapper" if use_pose_prior else "mapper"
+
+    mapper_cmd_parts = [
+        f"{colmap_cmd} {mapper_command_name}",
         f"--database_path {colmap_dir / 'database.db'}",
         f"--image_path {image_dir}",
         f"--output_path {sparse_dir}",
     ]
-    if colmap_version >= Version("3.7"):
-        mapper_cmd.append("--Mapper.ba_global_function_tolerance=1e-6")
 
-    mapper_cmd = " ".join(mapper_cmd)
+    if not use_pose_prior and colmap_version >= Version("3.7"):
+        mapper_cmd_parts.append("--Mapper.ba_global_function_tolerance=1e-6")
+
+    if use_pose_prior:
+        # Set symmetric priors std for x/y/z and optionally overwrite covariance
+        mapper_cmd_parts.append(f"--prior_position_std_x {prior_position_std}")
+        mapper_cmd_parts.append(f"--prior_position_std_y {prior_position_std}")
+        mapper_cmd_parts.append(f"--prior_position_std_z {prior_position_std}")
+        if overwrite_priors_covariance:
+            mapper_cmd_parts.append("--overwrite_priors_covariance 1")
+
+    mapper_cmd = " ".join(mapper_cmd_parts)
 
     with status(
         msg="[bold yellow]Running COLMAP bundle adjustment... (This may take a while)",
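
Note (illustrative only): with the defaults in this hunk (prior_position_std=2.0, overwrite_priors_covariance=True) and use_pose_prior enabled, the joined mapper command assembled above comes out roughly as below; the /scene paths are placeholders, not real project paths:

# Hedged sketch of the command string produced by the pose-prior branch above.
parts = [
    "colmap pose_prior_mapper",
    "--database_path /scene/colmap/database.db",
    "--image_path /scene/images",
    "--output_path /scene/colmap/sparse",
    "--prior_position_std_x 2.0",
    "--prior_position_std_y 2.0",
    "--prior_position_std_z 2.0",
    "--overwrite_priors_covariance 1",
]
print(" ".join(parts))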
@@ -172,6 +203,63 @@
         run_command(mapper_cmd, verbose=verbose)
     CONSOLE.log("[bold green]:tada: Done COLMAP bundle adjustment.")
 
+    # Optional alignment to GPS priors; write back into sparse/0 to keep downstream unchanged
+    if align_model_to_priors:
+        align_cmd_parts = [
+            f"{colmap_cmd} model_aligner",
+            f"--input_path {sparse_dir}/0",
+            f"--output_path {sparse_dir}/0",
+            f"--database_path {colmap_dir / 'database.db'}",
+        ]
+        max_err = alignment_max_error if alignment_max_error is not None else prior_position_std
+        align_cmd_parts.append(f"--alignment_max_error {max_err}")
+        align_cmd = " ".join(align_cmd_parts)
+        with status(msg="[bold yellow]Aligning model to pose priors...", spinner="dots", verbose=verbose):
+            run_command(align_cmd, verbose=verbose)
+        CONSOLE.log("[bold green]:tada: Done aligning model to pose priors.")
+
+    # Optional normalization to human scale and centered coordinates using model_transformer
+    if normalize_model:
+        recon_dir = sparse_dir / "0"
+        try:
+            ptid_to_info = read_points3D_binary(recon_dir / "points3D.bin")
+        except Exception as e:
+            CONSOLE.print(f"[bold yellow]Warning: Could not read points3D for normalization: {e}")
+            ptid_to_info = {}
+        if len(ptid_to_info) == 0:
+            CONSOLE.print("[bold yellow]Warning: No 3D points to estimate normalization. Skipping normalization.")
+        else:
+            import numpy as np  # local import to avoid overhead unless needed
+            pts = np.array([p.xyz for p in ptid_to_info.values()], dtype=np.float64)
+            if normalization_center == "mean":
+                Cx, Cy, Cz = pts.mean(axis=0).tolist()
+            else:
+                mins = pts.min(axis=0)
+                maxs = pts.max(axis=0)
+                Cx, Cy, Cz = ((mins + maxs) * 0.5).tolist()
+            diag = float(np.linalg.norm(pts.max(axis=0) - pts.min(axis=0)))
+            if normalization_scale is not None:
+                s = float(normalization_scale)
+            else:
+                eps = 1e-9
+                s = float(normalization_target_diagonal) / max(diag, eps)
+            # Forward transform desired: x' = s * (x - C) = s*x + t, with t = -s*C
+            tx, ty, tz = (-s * Cx, -s * Cy, -s * Cz)
+            # Write transform in format: scale qw qx qy qz tx ty tz (identity rotation)
+            transform_path = recon_dir / "normalization_transform.txt"
+            with open(transform_path, "w", encoding="utf-8") as f:
+                f.write(f"{s:.12g} 1 0 0 0 {tx:.12g} {ty:.12g} {tz:.12g}\n")
+            transform_cmd_parts = [
+                f"{colmap_cmd} model_transformer",
+                f"--input_path {recon_dir}",
+                f"--output_path {recon_dir}",
+                f"--transform_path {transform_path}",
+            ]
+            transform_cmd = " ".join(transform_cmd_parts)
+            with status(msg="[bold yellow]Normalizing model scale and center...", spinner="dots", verbose=verbose):
+                run_command(transform_cmd, verbose=verbose)
+            CONSOLE.log("[bold green]:tada: Done normalizing model (model_transformer).")
+
     if refine_intrinsics:
         with status(msg="[bold yellow]Refine intrinsics...", spinner="dqpb", verbose=verbose):
             bundle_adjuster_cmd = [
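
Note (not from the diff): the normalization block above applies x' = s * (x - C) with identity rotation, where C is the bbox (or mean) center and s = normalization_target_diagonal / bbox_diagonal, and the translation written to the transform file is t = -s * C. A standalone NumPy sketch checking that this choice centers the bounding box at the origin and rescales its diagonal to the target:

import numpy as np

def normalization_params(pts, target_diagonal=4.0, center="bbox"):
    # Mirror the math above: return (s, t) for the similarity x' = s * x + t.
    mins, maxs = pts.min(axis=0), pts.max(axis=0)
    c = pts.mean(axis=0) if center == "mean" else (mins + maxs) * 0.5
    s = target_diagonal / max(float(np.linalg.norm(maxs - mins)), 1e-9)
    return s, -s * c

# Quick self-check on random points (illustrative only).
pts = np.random.default_rng(0).uniform(-50.0, 50.0, size=(1000, 3))
s, t = normalization_params(pts)
normed = s * pts + t
new_diag = float(np.linalg.norm(normed.max(axis=0) - normed.min(axis=0)))
assert abs(new_diag - 4.0) < 1e-6  # diagonal rescaled to the target
assert np.allclose((normed.max(axis=0) + normed.min(axis=0)) / 2, 0.0, atol=1e-9)  # bbox centered at origin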

nerfstudio/process_data/process_data_utils.py

Lines changed: 65 additions & 18 deletions

@@ -188,12 +188,7 @@ def convert_video_to_images(
     for dir in downscale_dirs:
         dir.mkdir(parents=True, exist_ok=True)
 
-    downscale_chain = (
-        f"split={num_downscales + 1}"
-        + "".join([f"[t{i}]" for i in range(num_downscales + 1)])
-        + ";"
-        + ";".join(downscale_chains)
-    )
+    # We will construct the split size and outputs later per-frame based on whether [out0] is needed
 
     ffmpeg_cmd += " -vsync vfr"
 
@@ -212,7 +207,7 @@
     ffmpeg_cmd += " -pix_fmt bgr8"
     select_cmd = ""
 
-    downscale_cmd = f' -filter_complex "{select_cmd}{crop_cmd}{downscale_chain}"' + "".join(
+    downscale_cmd = f' -filter_complex "{select_cmd}{crop_cmd}"' + "".join(
         [f' -map "[out{i}]" "{downscale_paths[i]}"' for i in range(num_downscales + 1)]
     )
 
@@ -294,19 +289,30 @@ def copy_images_list(
             pass
         copied_image_paths.append(copied_image_path)
 
+    # Early return: if there is no transformation/downscale requested, avoid re-encoding to preserve EXIF
+    no_transform_requested = (
+        num_downscales == 0
+        and crop_border_pixels is None
+        and (crop_factor == (0.0, 0.0, 0.0, 0.0))
+        and upscale_factor is None
+        and same_dimensions
+    )
+    if no_transform_requested:
+        if len(image_paths) == 0:
+            CONSOLE.log("[bold red]:skull: No usable images in the data folder.")
+        else:
+            CONSOLE.log(f"[bold green]:tada: Done copying images with prefix '{image_prefix}'.")
+        return copied_image_paths
+
     nn_flag = "" if not nearest_neighbor else ":flags=neighbor"
+    # Build downscale graph labels. We will decide later whether to emit [out0] (base) depending on whether base transform is needed.
     downscale_chains = [f"[t{i}]scale=iw/{2**i}:ih/{2**i}{nn_flag}[out{i}]" for i in range(num_downscales + 1)]
     downscale_dirs = [Path(str(image_dir) + (f"_{2**i}" if i > 0 else "")) for i in range(num_downscales + 1)]
 
     for dir in downscale_dirs:
         dir.mkdir(parents=True, exist_ok=True)
 
-    downscale_chain = (
-        f"split={num_downscales + 1}"
-        + "".join([f"[t{i}]" for i in range(num_downscales + 1)])
-        + ";"
-        + ";".join(downscale_chains)
-    )
+    # We will construct the split size and outputs later per-frame based on whether [out0] is needed
 
     num_frames = len(image_paths)
     # ffmpeg batch commands assume all images are the same dimensions.
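
Note (not from the diff): the early-return path added above deliberately skips the ffmpeg re-encode so that EXIF metadata, including the GPS tags, survives the plain copy and stays available to pose_prior_mapper later. A quick hedged check using Pillow (already a nerfstudio dependency); the helper name and the example path are ours, not part of this commit:

from pathlib import Path

from PIL import Image

def has_gps_exif(image_path: Path) -> bool:
    # GPS data lives in the GPSInfo IFD, reachable through the 0x8825 pointer tag.
    exif = Image.open(image_path).getexif()
    gps_ifd = exif.get_ifd(0x8825)  # empty dict when the image has no GPS tags
    return len(gps_ifd) > 0

# Example: confirm the copied (non-re-encoded) frames still expose GPS EXIF.
copied = sorted(Path("outputs/my-scene/images").glob("*.jpg"))  # placeholder path
print(sum(has_gps_exif(p) for p in copied), "of", len(copied), "frames have GPS EXIF")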
@@ -330,13 +336,54 @@
         if upscale_factor is not None:
             select_cmd = f"[0:v]scale=iw*{upscale_factor}:ih*{upscale_factor}:flags=neighbor[upscaled];[upscaled]"
 
-        downscale_cmd = f' -filter_complex "{select_cmd}{crop_cmd}{downscale_chain}"' + "".join(
-            [
-                f' -map "[out{i}]" -q:v 2 "{downscale_dirs[i] / f"{framename}{copied_image_paths[0].suffix}"}"'
-                for i in range(num_downscales + 1)
-            ]
+        downscale_cmd = f' -filter_complex "{select_cmd}{crop_cmd}"' + "".join(
+            [f' -map "[out{i}]" "{downscale_dirs[i] / f"{framename}{copied_image_paths[0].suffix}"}"' for i in range(num_downscales + 1)]
         )
 
+        # Decide whether to overwrite base images ([out0])
+        need_transform_base = (
+            crop_border_pixels is not None or (crop_factor != (0.0, 0.0, 0.0, 0.0)) or upscale_factor is not None or not same_dimensions
+        )
+
+        # Build filter graph: if base not needed, split only into downscaled outputs [out1..outN]; otherwise include [out0]
+        if num_downscales > 0:
+            if need_transform_base:
+                split_targets = [f"[t{i}]" for i in range(num_downscales + 1)]  # include base
+                chains = ";".join(downscale_chains)  # [out0..outN]
+                downscale_graph = (
+                    f"split={num_downscales + 1}" + "".join(split_targets) + ";" + chains
+                )
+                downscale_cmd = f' -filter_complex "{select_cmd}{crop_cmd}{downscale_graph}"'
+                mapping_entries = [
+                    f' -map "[out0]" -map_metadata 0 -q:v 2 "{downscale_dirs[0] / f"{framename}{copied_image_paths[0].suffix}"}"'
+                ]
+                for i in range(1, num_downscales + 1):
+                    mapping_entries.append(
+                        f' -map "[out{i}]" -map_metadata 0 -q:v 2 "{downscale_dirs[i] / f"{framename}{copied_image_paths[0].suffix}"}"'
+                    )
+                downscale_cmd += "".join(mapping_entries)
+            else:
+                # Only emit downscaled outputs; reindex to start from out0 to avoid gaps and empty maps
+                # Build chains for i=1..N, then relabel [out{i}] -> [out{i-1}] via mapping labels
+                split_targets = [f"[t{i}]" for i in range(1, num_downscales + 1)]
+                chains = ";".join([f"[t{i}]scale=iw/{2**i}:ih/{2**i}{nn_flag}[out{i-1}]" for i in range(1, num_downscales + 1)])
+                downscale_graph = (
+                    f"split={num_downscales}" + "".join(split_targets) + ";" + chains
+                )
+                downscale_cmd = f' -filter_complex "{select_cmd}{crop_cmd}{downscale_graph}"'
+                mapping_entries = []
+                for i in range(num_downscales):
+                    # map out{i} to images_{2**(i+1)}
+                    out_dir = downscale_dirs[i + 1]
+                    mapping_entries.append(
+                        f' -map "[out{i}]" -map_metadata 0 -q:v 2 "{out_dir / f"{framename}{copied_image_paths[0].suffix}"}"'
+                    )
+                downscale_cmd += "".join(mapping_entries)
+        else:
+            # No downscales requested but we got here due to other transforms; keep single output
+            downscale_graph = ""
+            downscale_cmd = ""
+
         ffmpeg_cmd += downscale_cmd
         if verbose:
             CONSOLE.log(f"... {ffmpeg_cmd}")
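
Note (illustrative, not from the diff): to make the branchy graph construction above concrete, these are the two -filter_complex shapes it builds for num_downscales=2 with the nearest-neighbor flag off. The else branch splits into N branches and relabels the outputs to start at [out0]; the need_transform_base branch splits into N+1 branches and keeps a full-resolution [out0]:

# Hedged sketch of the two filter-graph strings assembled above.
num_downscales = 2

# Base image left untouched: only downscaled outputs, relabeled to [out0], [out1].
no_base = (
    f"split={num_downscales}[t1][t2];"
    "[t1]scale=iw/2:ih/2[out0];"
    "[t2]scale=iw/4:ih/4[out1]"
)

# Base image also re-encoded (crop/upscale requested): full-size [out0] plus downscales.
with_base = (
    f"split={num_downscales + 1}[t0][t1][t2];"
    "[t0]scale=iw/1:ih/1[out0];"
    "[t1]scale=iw/2:ih/2[out1];"
    "[t2]scale=iw/4:ih/4[out2]"
)

print(no_base)
print(with_base)

Each labeled output is then mapped with -map "[outN]" -map_metadata 0 -q:v 2 <path>, which is the diff's way of carrying source metadata over to the re-encoded copies.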
