1 change: 1 addition & 0 deletions data_juicer/ops/mapper/__init__.py
@@ -88,6 +88,7 @@
from .video_camera_calibration_static_moge_mapper import (
VideoCameraCalibrationStaticMogeMapper,
)
from .video_camera_pose_mapper import VideoCameraPoseMapper
from .video_captioning_from_audio_mapper import VideoCaptioningFromAudioMapper
from .video_captioning_from_frames_mapper import VideoCaptioningFromFramesMapper
from .video_captioning_from_summarizer_mapper import VideoCaptioningFromSummarizerMapper
337 changes: 337 additions & 0 deletions data_juicer/ops/mapper/video_camera_pose_mapper.py
@@ -0,0 +1,337 @@
import os
import subprocess

import numpy as np
from pydantic import PositiveInt

import data_juicer
from data_juicer.ops.load import load_ops
from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE
from data_juicer.utils.constant import Fields, MetaKeys
from data_juicer.utils.lazy_loader import LazyLoader

from ..base_op import OPERATORS, Mapper
from ..op_fusion import LOADED_VIDEOS

OP_NAME = "video_camera_pose_mapper"

cv2 = LazyLoader("cv2", "opencv-python")
torch = LazyLoader("torch")


@OPERATORS.register_module(OP_NAME)
@LOADED_VIDEOS.register_module(OP_NAME)
class VideoCameraPoseMapper(Mapper):
"""Extract camera poses by leveraging MegaSaM and MoGe-2."""

_accelerator = "cuda"

def __init__(
self,
moge_model_path: str = "Ruicheng/moge-2-vitl",
frame_num: PositiveInt = 3,
duration: float = 0,
tag_field_name: str = MetaKeys.video_camera_pose_tags,
frame_dir: str = DATA_JUICER_ASSETS_CACHE,
if_output_moge_info: bool = False,
moge_output_info_dir: str = DATA_JUICER_ASSETS_CACHE,
if_save_info: bool = True,
output_info_dir: str = DATA_JUICER_ASSETS_CACHE,
max_frames: int = 1000,
*args,
**kwargs,
):
"""
Initialization method.
        :param moge_model_path: The local path or Hugging Face model ID of the
            MoGe-2 model.
:param frame_num: The number of frames to be extracted uniformly from
the video. If it's 1, only the middle frame will be extracted. If
it's 2, only the first and the last frames will be extracted. If
it's larger than 2, in addition to the first and the last frames,
other frames will be extracted uniformly within the video duration.
If "duration" > 0, frame_num is the number of frames per segment.
:param duration: The duration of each segment in seconds.
If 0, frames are extracted from the entire video.
If duration > 0, the video is segmented into multiple segments
based on duration, and frames are extracted from each segment.
:param tag_field_name: The field name to store the tags. It's
"video_camera_pose_tags" in default.
:param frame_dir: Output directory to save extracted frames.
        :param if_output_moge_info: Whether to save the results from MoGe-2
            to a JSON file.
:param moge_output_info_dir: Output directory for saving camera
parameters.
:param if_save_info: Whether to save the results to an npz file.
        :param output_info_dir: Output directory for saving the npz results.
:param max_frames: Maximum number of frames to save.
:param args: extra args
:param kwargs: extra args
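
        Illustrative example (hypothetical values):
        ``VideoCameraPoseMapper(frame_num=5, duration=2.0)`` extracts 5 frames
        from every 2-second segment and estimates camera-to-world poses for the
        extracted frames.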
"""

super().__init__(*args, **kwargs)

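        # Arguments for the fused MoGe-2 calibration op, which supplies the
        # per-frame depth maps and camera intrinsics consumed by MegaSaM below.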
self.video_camera_calibration_static_moge_mapper_args = {
"model_path": moge_model_path,
"frame_num": frame_num,
"duration": duration,
"frame_dir": frame_dir,
"if_output_points_info": False,
"if_output_depth_info": True,
"if_output_mask_info": True,
"if_output_info": if_output_moge_info,
"output_info_dir": moge_output_info_dir,
}
self.fused_ops = load_ops(
[{"video_camera_calibration_static_moge_mapper": self.video_camera_calibration_static_moge_mapper_args}]
)

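        # Clone the MegaSaM repository (and its submodules) into the assets
        # cache on first use.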
megasam_repo_path = os.path.join(DATA_JUICER_ASSETS_CACHE, "mega-sam")
if not os.path.exists(megasam_repo_path):
subprocess.run(["git", "clone", "https://github.com/mega-sam/mega-sam.git", megasam_repo_path], check=True)
            subprocess.run(
                ["git", "submodule", "update", "--init", "--recursive"],
                cwd=os.path.join(megasam_repo_path, "base"),
                check=True,
            )

        # Patch deprecated ``.type()`` calls to ``.scalar_type()`` so the MegaSaM
        # CUDA/C++ extensions compile against recent PyTorch versions.
        files_to_patch = [
            os.path.join(megasam_repo_path, "base", "src", "altcorr_kernel.cu"),
            os.path.join(megasam_repo_path, "base", "src", "correlation_kernels.cu"),
            os.path.join(megasam_repo_path, "base", "src", "droid_kernels.cu"),
            os.path.join(megasam_repo_path, "base", "thirdparty", "lietorch", "lietorch", "src", "lietorch_gpu.cu"),
            os.path.join(megasam_repo_path, "base", "thirdparty", "lietorch", "lietorch", "src", "lietorch_cpu.cpp"),
        ]
        for file_path in files_to_patch:
            with open(file_path, "r") as f:
                file_content = f.read()
            with open(file_path, "w") as f:
                f.write(file_content.replace(".type()", ".scalar_type()"))

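        # Build the MegaSaM CUDA extensions (droid_backends, lietorch) on first
        # use if they are not already importable.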
try:
import droid_backends
import lietorch

self.droid_backends = droid_backends
self.lietorch = lietorch
except ImportError:
subprocess.run(["python", "setup.py", "install"], cwd=os.path.join(megasam_repo_path, "base"))

try:
import torch_scatter

self.torch_scatter = torch_scatter
except ImportError:
""" "Please refer to https://github.com/rusty1s/pytorch_scatter to locate the
installation link that is compatible with your PyTorch and CUDA versions."""
torch_version = "2.8.0"
cuda_version = "cu128"
subprocess.run(
[
"pip",
"install",
"torch-scatter",
"-f",
f"https://data.pyg.org/whl/torch-{torch_version}+{cuda_version}.html",
],
cwd=os.path.join(megasam_repo_path, "base"),
)

import sys

sys.path.append(os.path.join(megasam_repo_path, "base", "droid_slam"))
from droid import Droid
from lietorch import SE3

self.SE3 = SE3
self.Droid = Droid

self.tag_field_name = tag_field_name
self.if_save_info = if_save_info
self.output_info_dir = output_info_dir
self.max_frames = max_frames
self.frame_dir = frame_dir

def image_stream(self, frames_path, depth_list, intrinsics_list):
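        """Yield ``(t, image, depth, intrinsics, mask)`` tuples formatted for the
        DROID tracker: each frame is resized to roughly 384x512 pixels, cropped
        to a multiple of 8 in each dimension, and the (fx, fy, cx, cy) intrinsics
        are rescaled to match."""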

for t, (image_path, depth, intrinsics) in enumerate(zip(frames_path, depth_list, intrinsics_list)):
image = cv2.imread(image_path)
h0, w0, _ = image.shape
h1 = int(h0 * np.sqrt((384 * 512) / (h0 * w0)))
w1 = int(w0 * np.sqrt((384 * 512) / (h0 * w0)))

image = cv2.resize(image, (w1, h1), interpolation=cv2.INTER_AREA)
image = image[: h1 - h1 % 8, : w1 - w1 % 8]
image = torch.as_tensor(image).permute(2, 0, 1)
image = image[None]

depth = torch.as_tensor(depth)
depth = torch.nn.functional.interpolate(depth[None, None], (h1, w1), mode="nearest-exact").squeeze()
depth = depth[: h1 - h1 % 8, : w1 - w1 % 8]

mask = torch.ones_like(depth)

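            # Pack intrinsics as (fx, fy, cx, cy) and rescale them to the
            # resized frame.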
intrinsics = torch.as_tensor([intrinsics[0][0], intrinsics[1][1], intrinsics[0][2], intrinsics[1][2]])
intrinsics[0::2] *= w1 / w0
intrinsics[1::2] *= h1 / h0

yield t, image, depth, intrinsics, mask

def process_single(self, sample=None, rank=None):
# check if it's generated already
if self.tag_field_name in sample[Fields.meta]:
return sample

# there is no video in this sample
if self.video_key not in sample or not sample[self.video_key]:
return []

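        # Run the fused MoGe-2 calibration op on a single-sample dataset; it
        # extracts frames into ``self.frame_dir`` and attaches per-frame depth
        # maps and intrinsics to the sample meta.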
ds_list = [{"videos": sample[self.video_key]}]

dataset = data_juicer.core.data.NestedDataset.from_list(ds_list)
if Fields.meta not in dataset.features:
dataset = dataset.add_column(name=Fields.meta, column=[{}] * dataset.num_rows)
dataset = dataset.map(self.fused_ops[0].process, num_proc=1, with_rank=True)
res_list = dataset.to_list()

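        # The fused op stores the extracted frames in a sub-directory named
        # after the (first) video file.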
temp_frame_name = os.path.splitext(os.path.basename(sample[self.video_key][0]))[0]
frames_root = os.path.join(self.frame_dir, temp_frame_name)
frame_names = os.listdir(frames_root)
frames_path = sorted([os.path.join(frames_root, frame_name) for frame_name in frame_names])

depth_list = res_list[0][Fields.meta][MetaKeys.static_camera_calibration_moge_tags]["depth_list"]
intrinsics_list = res_list[0][Fields.meta][MetaKeys.static_camera_calibration_moge_tags]["intrinsics_list"]

valid_image_list = []
valid_depth_list = []
valid_intrinsics_list = []
valid_mask_list = []

        for t, image, depth, intrinsics, mask in self.image_stream(frames_path, depth_list, intrinsics_list):
valid_image_list.append(image[0])
valid_depth_list.append(depth)
valid_mask_list.append(mask)
valid_intrinsics_list.append(intrinsics)

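            # The DROID tracker is created lazily from the first frame, since
            # its buffers depend on the resized image resolution.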
if t == 0:
args = droid_args(image_size=[image.shape[2], image.shape[3]])
droid = self.Droid(args)

droid.track(t, image, depth, intrinsics=intrinsics, mask=mask)

droid.track_final(t, image, depth, intrinsics=intrinsics, mask=mask)

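        # Final full bundle adjustment over all tracked frames; returns the
        # optimized trajectory, refined depths, and motion probabilities.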
traj_est, depth_est, motion_prob = droid.terminate(
self.image_stream(frames_path, depth_list, intrinsics_list),
_opt_intr=True,
full_ba=True,
scene_name=temp_frame_name,
)

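        # Keep only the frames actually covered by the estimated trajectory.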
t = traj_est.shape[0]
images = np.array(valid_image_list[:t])
disps = 1.0 / (np.array(valid_depth_list[:t]) + 1e-6)

poses = traj_est
intrinsics = droid.video.intrinsics[:t].cpu().numpy()

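        # DROID keeps intrinsics at 1/8 of the working resolution, hence the
        # rescaling; the estimated poses are world-to-camera, so invert them to
        # obtain camera-to-world matrices.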
intrinsics = intrinsics[0] * 8.0
poses_th = torch.as_tensor(poses, device="cpu")
cam_c2w = self.SE3(poses_th).inv().matrix().numpy()

K = np.eye(3)
K[0, 0] = intrinsics[0]
K[1, 1] = intrinsics[1]
K[0, 2] = intrinsics[2]
K[1, 2] = intrinsics[3]

max_frames = min(self.max_frames, images.shape[0])

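        # Convert frames back to HWC RGB (cv2 loads BGR), recover depths from
        # disparities, and truncate all outputs to at most ``max_frames`` frames.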
return_images = np.uint8(images[:max_frames, ::-1, ...].transpose(0, 2, 3, 1))
return_depths = np.float32(1.0 / disps[:max_frames, ...])
return_cam_c2w = cam_c2w[:max_frames]

if self.if_save_info:
os.makedirs(self.output_info_dir, exist_ok=True)

np.savez(
                os.path.join(self.output_info_dir, f"{temp_frame_name}_droid.npz"),
images=return_images,
depths=return_depths,
intrinsic=K,
cam_c2w=return_cam_c2w,
)

sample[Fields.meta][self.tag_field_name] = {
"frames_folder": frames_root,
"frame_names": frame_names,
"images": return_images,
"depths": return_depths,
"intrinsic": K,
"cam_c2w": return_cam_c2w,
}

return sample


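# Lightweight stand-in for the argparse-style options expected by MegaSaM's
# Droid tracker.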
class droid_args:
def __init__(self, image_size):
self.weights = os.path.join(DATA_JUICER_ASSETS_CACHE, "mega-sam", "checkpoints", "megasam_final.pth")
self.disable_vis = True
self.image_size = image_size
self.buffer = 1024
self.stereo = False
self.filter_thresh = 2.0

self.warmup = 8
self.beta = 0.3
self.frontend_nms = 1
self.keyframe_thresh = 2.0
self.frontend_window = 25
self.frontend_thresh = 12.0
self.frontend_radius = 2

self.upsample = False
self.backend_thresh = 16.0
self.backend_radius = 2
self.backend_nms = 3
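
A minimal usage sketch (illustrative only, assuming the MegaSaM/MoGe-2 dependencies are installed, a CUDA device is available, and the video path is hypothetical): it runs the new mapper on a single sample and reads the estimated camera-to-world poses from the meta field.

```python
from data_juicer.ops.mapper.video_camera_pose_mapper import VideoCameraPoseMapper
from data_juicer.utils.constant import Fields

# Hypothetical input clip; any single-scene video with camera motion.
sample = {"videos": ["/path/to/clip.mp4"], Fields.meta: {}}

op = VideoCameraPoseMapper(frame_num=5, duration=2.0)
sample = op.process_single(sample)

tags = sample[Fields.meta]["video_camera_pose_tags"]
print(tags["intrinsic"])      # 3x3 pinhole intrinsics matrix K
print(tags["cam_c2w"].shape)  # (num_frames, 4, 4) camera-to-world poses
```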
2 changes: 2 additions & 0 deletions data_juicer/utils/constant.py
@@ -92,6 +92,8 @@ class MetaKeys(object):
static_camera_calibration_moge_tags = "static_camera_calibration_moge_tags"
# # Video Undistortion Info
video_undistortion_tags = "video_undistortion_tags"
# # Camera Pose Info
video_camera_pose_tags = "video_camera_pose_tags"

# === info extraction related tags ===
# # for event extraction
3 changes: 2 additions & 1 deletion docs/Operators.md
@@ -46,7 +46,7 @@ Data-Juicer 中的算子分为以下 8 种类型。
| [filter](#filter) | 56 | Filters out low-quality samples. 过滤低质量样本。 |
| [formatter](#formatter) | 8 | Discovers, loads, and canonicalizes source data. 发现、加载、规范化原始数据。 |
| [grouper](#grouper) | 3 | Group samples to batched samples. 将样本分组,每一组组成一个批量样本。 |
| [mapper](#mapper) | 102 | Edits and transforms samples. 对数据样本进行编辑和转换。 |
| [mapper](#mapper) | 103 | Edits and transforms samples. 对数据样本进行编辑和转换。 |
| [pipeline](#pipeline) | 3 | Applies dataset-level processing; both input and output are datasets. 执行数据集级别的操作,输入和输出均为完整数据集。 |
| [selector](#selector) | 5 | Selects top samples based on ranking. 基于排序选取高质量样本。 |

@@ -259,6 +259,7 @@ All the specific operators are listed below, each featured with several capabili
| vggt_mapper | 🎬Video 🚀GPU 🟡Beta | Input a video of a single scene, and use VGGT to extract information including Camera Pose, Depth Maps, Point Maps, and 3D Point Tracks. 输入单个场景的视频,并使用VGGT提取包括相机姿态、深度图、点图和3D点轨迹的信息。 | [info](operators/mapper/vggt_mapper.md) | - |
| video_camera_calibration_static_deepcalib_mapper | 🎬Video 🚀GPU 🟡Beta | Compute the camera intrinsics and field of view (FOV) for a static camera using DeepCalib. 使用DeepCalib计算静态摄像机的摄像机内部和视场 (FOV)。 | [info](operators/mapper/video_camera_calibration_static_deepcalib_mapper.md) | - |
| video_camera_calibration_static_moge_mapper | 🎬Video 🚀GPU 🟡Beta | Compute the camera intrinsics and field of view (FOV) for a static camera using Moge-2 (more accurate than DeepCalib). 使用Moge-2 (比DeepCalib更准确) 计算静态摄像机的摄像机内部函数和视场 (FOV)。 | [info](operators/mapper/video_camera_calibration_static_moge_mapper.md) | - |
| video_camera_pose_mapper | 🎬Video 🚀GPU 🟡Beta | Extract camera poses by leveraging MegaSaM and MoGe-2. 利用MegaSaM和MoGe-2提取相机姿态。 | - | - |
| video_captioning_from_audio_mapper | 🔮Multimodal 🚀GPU 🧩HF 🟢Stable | Mapper to caption a video according to its audio streams based on Qwen-Audio model. 映射器根据基于qwen-audio模型的音频流为视频添加字幕。 | [info](operators/mapper/video_captioning_from_audio_mapper.md) | - |
| video_captioning_from_frames_mapper | 🔮Multimodal 🚀GPU 🧩HF 🟢Stable | Generates video captions from sampled frames using an image-to-text model. 使用图像到文本模型从采样帧生成视频字幕。 | [info](operators/mapper/video_captioning_from_frames_mapper.md) | - |
| video_captioning_from_summarizer_mapper | 🔮Multimodal 🚀GPU 🧩HF 🟢Stable | Mapper to generate video captions by summarizing several kinds of generated texts (captions from video/audio/frames, tags from audio/frames, ...). 映射器通过总结几种生成的文本 (来自视频/音频/帧的字幕,来自音频/帧的标签,...) 来生成视频字幕。 | [info](operators/mapper/video_captioning_from_summarizer_mapper.md) | - |