import os
import subprocess
import sys

import numpy as np
from pydantic import PositiveInt

import data_juicer
from data_juicer.ops.load import load_ops
from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE
from data_juicer.utils.constant import Fields, MetaKeys
from data_juicer.utils.lazy_loader import LazyLoader

from ..base_op import OPERATORS, Mapper
from ..op_fusion import LOADED_VIDEOS

OP_NAME = "video_camera_pose_mapper"

# heavy dependencies are resolved lazily on first use
cv2 = LazyLoader("cv2", "opencv-python")
torch = LazyLoader("torch")


@OPERATORS.register_module(OP_NAME)
@LOADED_VIDEOS.register_module(OP_NAME)
class VideoCameraPoseMapper(Mapper):
    """Mapper to extract camera poses from videos by leveraging MegaSaM and MoGe-2."""

    _accelerator = "cuda"

    def __init__(
        self,
        moge_model_path: str = "Ruicheng/moge-2-vitl",
        frame_num: PositiveInt = 3,
        duration: float = 0,
        tag_field_name: str = MetaKeys.video_camera_pose_tags,
        frame_dir: str = DATA_JUICER_ASSETS_CACHE,
        if_output_moge_info: bool = False,
        moge_output_info_dir: str = DATA_JUICER_ASSETS_CACHE,
        if_save_info: bool = True,
        output_info_dir: str = DATA_JUICER_ASSETS_CACHE,
        max_frames: int = 1000,
        *args,
        **kwargs,
    ):
        """
        Initialization method.

        :param moge_model_path: The path to the MoGe-2 model.
        :param frame_num: The number of frames to be extracted uniformly from
            the video. If it's 1, only the middle frame will be extracted. If
            it's 2, only the first and the last frames will be extracted. If
            it's larger than 2, in addition to the first and the last frames,
            other frames will be extracted uniformly within the video duration.
            If "duration" > 0, frame_num is the number of frames per segment.
        :param duration: The duration of each segment in seconds.
            If 0, frames are extracted from the entire video.
            If duration > 0, the video is segmented into multiple segments
            based on duration, and frames are extracted from each segment.
        :param tag_field_name: The field name to store the tags. It's
            "video_camera_pose_tags" by default.
        :param frame_dir: Output directory to save extracted frames.
        :param if_output_moge_info: Whether to save the results from MoGe-2
            to a JSON file.
        :param moge_output_info_dir: Output directory for saving the MoGe-2
            camera parameters.
        :param if_save_info: Whether to save the results to an npz file.
        :param output_info_dir: Output directory for saving the results.
        :param max_frames: Maximum number of frames to keep in the results.
        :param args: extra args
        :param kwargs: extra args
        """
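        # e.g., frame_num=3 with duration=0 extracts the first, middle, and last
        # frames of the whole video; frame_num=3 with duration=2.0 extracts 3
        # frames from every 2-second segment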

        super().__init__(*args, **kwargs)

        # arguments for the fused MoGe-2 mapper that extracts frames and produces
        # per-frame depth, mask, and intrinsics
        self.video_camera_calibration_static_moge_mapper_args = {
            "model_path": moge_model_path,
            "frame_num": frame_num,
            "duration": duration,
            "frame_dir": frame_dir,
            "if_output_points_info": False,
            "if_output_depth_info": True,
            "if_output_mask_info": True,
            "if_output_info": if_output_moge_info,
            "output_info_dir": moge_output_info_dir,
        }
        self.fused_ops = load_ops(
            [{"video_camera_calibration_static_moge_mapper": self.video_camera_calibration_static_moge_mapper_args}]
        )

        # fetch the MegaSaM repo (and its submodules) into the assets cache on first use
        megasam_repo_path = os.path.join(DATA_JUICER_ASSETS_CACHE, "mega-sam")
        if not os.path.exists(megasam_repo_path):
            subprocess.run(["git", "clone", "https://github.com/mega-sam/mega-sam.git", megasam_repo_path], check=True)
            subprocess.run(
                ["git", "submodule", "update", "--init", "--recursive"],
                cwd=os.path.join(megasam_repo_path, "base"),
                check=True,
            )

        # patch deprecated ATen calls (.type() -> .scalar_type()) in the CUDA/C++
        # extension sources so they build against recent PyTorch versions
        src_files_to_patch = [
            os.path.join(megasam_repo_path, "base", "src", "altcorr_kernel.cu"),
            os.path.join(megasam_repo_path, "base", "src", "correlation_kernels.cu"),
            os.path.join(megasam_repo_path, "base", "src", "droid_kernels.cu"),
            os.path.join(megasam_repo_path, "base", "thirdparty", "lietorch", "lietorch", "src", "lietorch_gpu.cu"),
            os.path.join(megasam_repo_path, "base", "thirdparty", "lietorch", "lietorch", "src", "lietorch_cpu.cpp"),
        ]
        for src_file in src_files_to_patch:
            with open(src_file, "r") as f:
                file_content = f.read()
            with open(src_file, "w") as f:
                f.write(file_content.replace(".type()", ".scalar_type()"))

        try:
            import droid_backends
            import lietorch
        except ImportError:
            # build and install the patched DROID/lietorch CUDA extensions, then retry
            subprocess.run(
                ["python", "setup.py", "install"], cwd=os.path.join(megasam_repo_path, "base"), check=True
            )
            import droid_backends
            import lietorch

        self.droid_backends = droid_backends
        self.lietorch = lietorch

        try:
            import torch_scatter
        except ImportError:
            # Please refer to https://github.com/rusty1s/pytorch_scatter to locate the
            # installation link that is compatible with your PyTorch and CUDA versions.
            torch_version = "2.8.0"
            cuda_version = "cu128"
            subprocess.run(
                [
                    "pip",
                    "install",
                    "torch-scatter",
                    "-f",
                    f"https://data.pyg.org/whl/torch-{torch_version}+{cuda_version}.html",
                ],
                check=True,
            )
            import torch_scatter

        self.torch_scatter = torch_scatter

        # make the MegaSaM DROID-SLAM implementation importable
        sys.path.append(os.path.join(megasam_repo_path, "base", "droid_slam"))
        from droid import Droid
        from lietorch import SE3

        self.SE3 = SE3
        self.Droid = Droid

        self.tag_field_name = tag_field_name
        self.if_save_info = if_save_info
        self.output_info_dir = output_info_dir
        self.max_frames = max_frames
        self.frame_dir = frame_dir

    def image_stream(self, frames_path, depth_list, intrinsics_list):
        """Yield resized frames, depths, masks, and rescaled intrinsics for DROID."""
        for t, (image_path, depth, intrinsics) in enumerate(zip(frames_path, depth_list, intrinsics_list)):
            image = cv2.imread(image_path)
            h0, w0, _ = image.shape
            # resize so the frame covers roughly 384 * 512 pixels, keeping the aspect ratio
            h1 = int(h0 * np.sqrt((384 * 512) / (h0 * w0)))
            w1 = int(w0 * np.sqrt((384 * 512) / (h0 * w0)))

            image = cv2.resize(image, (w1, h1), interpolation=cv2.INTER_AREA)
            # crop height and width down to multiples of 8, as DROID requires
            image = image[: h1 - h1 % 8, : w1 - w1 % 8]
            image = torch.as_tensor(image).permute(2, 0, 1)
            image = image[None]

            depth = torch.as_tensor(depth)
            depth = torch.nn.functional.interpolate(depth[None, None], (h1, w1), mode="nearest-exact").squeeze()
            depth = depth[: h1 - h1 % 8, : w1 - w1 % 8]

            mask = torch.ones_like(depth)

            # flatten the 3x3 intrinsic matrix to [fx, fy, cx, cy] and rescale to the new size
            intrinsics = torch.as_tensor([intrinsics[0][0], intrinsics[1][1], intrinsics[0][2], intrinsics[1][2]])
            intrinsics[0::2] *= w1 / w0
            intrinsics[1::2] *= h1 / h0

            yield t, image, depth, intrinsics, mask

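    # Worked example for the rescaling in image_stream, assuming a hypothetical
    # 720x1280 input frame:
    #   scale = sqrt((384 * 512) / (720 * 1280)) ~= 0.4619
    #   h1 = int(720 * scale) = 332 -> cropped to 328 (nearest lower multiple of 8)
    #   w1 = int(1280 * scale) = 591 -> cropped to 584
    #   fx and cx are then scaled by w1 / w0 = 591 / 1280, fy and cy by h1 / h0.
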
    def process_single(self, sample=None, rank=None):
        # skip the sample if the tags have been generated already
        if self.tag_field_name in sample[Fields.meta]:
            return sample

        # there is no video in this sample
        if self.video_key not in sample or not sample[self.video_key]:
            return []

        # run the fused MoGe-2 mapper to extract frames and estimate per-frame
        # depth maps and intrinsics
        ds_list = [{"videos": sample[self.video_key]}]

        dataset = data_juicer.core.data.NestedDataset.from_list(ds_list)
        if Fields.meta not in dataset.features:
            dataset = dataset.add_column(name=Fields.meta, column=[{}] * dataset.num_rows)
        dataset = dataset.map(self.fused_ops[0].process, num_proc=1, with_rank=True)
        res_list = dataset.to_list()

        # collect the extracted frames, sorted by file name
        temp_frame_name = os.path.splitext(os.path.basename(sample[self.video_key][0]))[0]
        frames_root = os.path.join(self.frame_dir, temp_frame_name)
        frame_names = os.listdir(frames_root)
        frames_path = sorted([os.path.join(frames_root, frame_name) for frame_name in frame_names])

        depth_list = res_list[0][Fields.meta][MetaKeys.static_camera_calibration_moge_tags]["depth_list"]
        intrinsics_list = res_list[0][Fields.meta][MetaKeys.static_camera_calibration_moge_tags]["intrinsics_list"]

        valid_image_list = []
        valid_depth_list = []
        valid_intrinsics_list = []
        valid_mask_list = []

        # feed frames to DROID-SLAM for camera tracking
        for t, image, depth, intrinsics, mask in self.image_stream(frames_path, depth_list, intrinsics_list):
            valid_image_list.append(image[0])
            valid_depth_list.append(depth)
            valid_mask_list.append(mask)
            valid_intrinsics_list.append(intrinsics)

            # lazily initialize DROID once the working image size is known
            if t == 0:
                args = DroidArgs(image_size=[image.shape[2], image.shape[3]])
                droid = self.Droid(args)

            droid.track(t, image, depth, intrinsics=intrinsics, mask=mask)

            droid.track_final(t, image, depth, intrinsics=intrinsics, mask=mask)

        # final global bundle adjustment over the whole trajectory
        traj_est, depth_est, motion_prob = droid.terminate(
            self.image_stream(frames_path, depth_list, intrinsics_list),
            _opt_intr=True,
            full_ba=True,
            scene_name=temp_frame_name,
        )

        t = traj_est.shape[0]
        images = np.array(valid_image_list[:t])
        disps = 1.0 / (np.array(valid_depth_list[:t]) + 1e-6)

        poses = traj_est
        intrinsics = droid.video.intrinsics[:t].cpu().numpy()

        # DROID keeps intrinsics at 1/8 feature-map resolution; scale back to pixels
        intrinsics = intrinsics[0] * 8.0
        poses_th = torch.as_tensor(poses, device="cpu")
        # invert the estimated world-to-camera SE3 poses into camera-to-world matrices
        cam_c2w = self.SE3(poses_th).inv().matrix().numpy()

        K = np.eye(3)
        K[0, 0] = intrinsics[0]
        K[1, 1] = intrinsics[1]
        K[0, 2] = intrinsics[2]
        K[1, 2] = intrinsics[3]
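        # resulting pinhole intrinsic matrix (pixel units of the resized frames):
        #   K = [[fx, 0., cx],
        #        [0., fy, cy],
        #        [0., 0., 1.]]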

        max_frames = min(self.max_frames, images.shape[0])

        # BGR -> RGB (reverse the channel axis) and NCHW -> NHWC
        return_images = np.uint8(images[:max_frames, ::-1, ...].transpose(0, 2, 3, 1))
        return_depths = np.float32(1.0 / disps[:max_frames, ...])
        return_cam_c2w = cam_c2w[:max_frames]

        if self.if_save_info:
            os.makedirs(self.output_info_dir, exist_ok=True)

            np.savez(
                os.path.join(self.output_info_dir, f"{temp_frame_name}_droid.npz"),
                images=return_images,
                depths=return_depths,
                intrinsic=K,
                cam_c2w=return_cam_c2w,
            )

        sample[Fields.meta][self.tag_field_name] = {
            "frames_folder": frames_root,
            "frame_names": frame_names,
            "images": return_images,
            "depths": return_depths,
            "intrinsic": K,
            "cam_c2w": return_cam_c2w,
        }

        return sample


class DroidArgs:
    """Configuration mirroring the arguments expected by MegaSaM's DROID-SLAM."""

    def __init__(self, image_size):
        self.weights = os.path.join(DATA_JUICER_ASSETS_CACHE, "mega-sam", "checkpoints", "megasam_final.pth")
        self.disable_vis = True
        self.image_size = image_size
        self.buffer = 1024
        self.stereo = False
        self.filter_thresh = 2.0

        # frontend (keyframe tracking) parameters
        self.warmup = 8
        self.beta = 0.3
        self.frontend_nms = 1
        self.keyframe_thresh = 2.0
        self.frontend_window = 25
        self.frontend_thresh = 12.0
        self.frontend_radius = 2

        # backend (global bundle adjustment) parameters
        self.upsample = False
        self.backend_thresh = 16.0
        self.backend_radius = 2
        self.backend_nms = 3
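

# A minimal usage sketch, not part of the operator: "demo.mp4" is a hypothetical
# local video path, and a CUDA device, network access (for the first-run MegaSaM
# setup and MoGe-2 download), and the MegaSaM checkpoint are assumed available.
if __name__ == "__main__":
    op = VideoCameraPoseMapper(frame_num=16)
    sample = {op.video_key: ["demo.mp4"], Fields.meta: {}}
    result = op.process_single(sample)
    tags = result[Fields.meta][op.tag_field_name]
    print(tags["intrinsic"])  # 3x3 pinhole matrix K
    print(tags["cam_c2w"].shape)  # (num_frames, 4, 4) camera-to-world poses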