fix: properly segment videos and images

shuheng-liu · shuheng-liu · commit 6933b5dfe434 · 2026-02-26T17:10:53.000-08:00
diff --git a/src/opentau/scripts/segment_lerobot_dataset.py b/src/opentau/scripts/segment_lerobot_dataset.py
@@ -31,6 +31,7 @@
 import argparse
 import math
 import shutil
+import subprocess
 from copy import deepcopy
 from pathlib import Path
 from typing import Any, cast
@@ -42,8 +43,8 @@
 from opentau.datasets.compute_stats import compute_episode_stats
 from opentau.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDatasetMetadata
 from opentau.datasets.utils import (
+    DEFAULT_IMAGE_PATH,
     EPISODES_PATH,
-    EPISODES_STATS_PATH,
     TASKS_PATH,
     append_jsonlines,
     write_episode_stats,
@@ -125,6 +126,127 @@ def _to_numpy_for_stats(column: pa.ChunkedArray) -> np.ndarray:
     return np.asarray(column.to_pylist())
 
 
+def _trim_video_segment(src_video_path: Path, dst_video_path: Path, start_frame: int, end_frame: int) -> None:
+    """Trim a source video to the half-open frame interval ``[start_frame, end_frame)`` using ffmpeg.
+
+    Args:
+        src_video_path: Source episode video path.
+        dst_video_path: Output path for the trimmed segment video.
+        start_frame: Inclusive start frame index.
+        end_frame: Exclusive end frame index.
+
+    Raises:
+        RuntimeError: If ffmpeg is unavailable or the trim command fails.
+    """
+    if shutil.which("ffmpeg") is None:
+        raise RuntimeError("ffmpeg is required to trim segmented videos but was not found in PATH.")
+
+    # Select frames by index, then rebase timestamps (setpts) so the trimmed clip starts at zero.
+    vf = f"trim=start_frame={start_frame}:end_frame={end_frame},setpts=PTS-STARTPTS"
+    cmd = [
+        "ffmpeg",
+        "-hide_banner",
+        "-loglevel",
+        "error",  # keep stderr limited to errors; it is quoted in the RuntimeError below
+        "-y",  # overwrite dst_video_path if it already exists
+        "-i",
+        str(src_video_path),
+        "-vf",  # NOTE(review): filtering means the video is re-encoded, not stream-copied
+        vf,
+        "-an",  # drop audio from the output
+        str(dst_video_path),
+    ]
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    if result.returncode != 0:
+        raise RuntimeError(
+            f"Failed to trim video segment {start_frame}:{end_frame} from '{src_video_path}'. "
+            f"ffmpeg stderr: {result.stderr.strip()}"
+        )
+
+
+def _copy_segment_images_and_rewrite_column(
+    image_cells: list[Any],
+    input_root: Path,
+    output_root: Path,
+    image_key: str,
+    output_episode_index: int,
+    source_episode_index: int,
+    source_segment_start: int,
+) -> list[Any]:
+    """Write or copy one image file per segment row and rewrite the row's image reference.
+
+    Args:
+        image_cells: Image column values from the sliced source table.
+        input_root: Source dataset root path.
+        output_root: Output dataset root path.
+        image_key: Feature key for this image stream.
+        output_episode_index: Output episode index receiving this segment.
+        source_episode_index: Source episode index for image path fallback.
+        source_segment_start: Start frame index of this segment in source episode.
+
+    Returns:
+        One value per input cell; written/copied cells carry ``str(output_root / ...)`` as their path.
+
+    Raises:
+        FileNotFoundError: If neither the referenced path nor the canonical fallback path is a file.
+    """
+    rewritten_cells: list[Any] = []
+    for frame_index, cell in enumerate(image_cells):  # frame_index restarts at 0 for the output episode
+        rel_dst = DEFAULT_IMAGE_PATH.format(
+            image_key=image_key,
+            episode_index=output_episode_index,
+            frame_index=frame_index,
+        )
+        dst_path = output_root / rel_dst
+        dst_path.parent.mkdir(parents=True, exist_ok=True)
+
+        if isinstance(cell, dict):  # non-empty embedded bytes take priority over any stored path
+            image_bytes = cell.get("bytes")
+            if isinstance(image_bytes, (bytes, bytearray)) and len(image_bytes) > 0:
+                dst_path.write_bytes(bytes(image_bytes))
+                new_cell = dict(cell)
+                new_cell["path"] = str(dst_path)
+                rewritten_cells.append(new_cell)
+                continue
+
+        src_path: Path | None = None
+        if isinstance(cell, str):
+            src_path = Path(cell)
+        elif isinstance(cell, dict):
+            path_val = cell.get("path")
+            if isinstance(path_val, str) and path_val:
+                src_path = Path(path_val)
+
+        # No usable bytes and no usable path (e.g. a None cell): keep the cell unchanged, copy nothing.
+        if src_path is None:
+            rewritten_cells.append(cell)
+            continue
+
+        if not src_path.is_absolute():  # relative references are resolved against the source root
+            src_path = input_root / src_path
+        if not src_path.is_file():
+            # Fallback to canonical image location under input root.
+            source_frame_index = source_segment_start + frame_index
+            src_path = input_root / DEFAULT_IMAGE_PATH.format(
+                image_key=image_key,
+                episode_index=source_episode_index,
+                frame_index=source_frame_index,
+            )
+        if not src_path.is_file():
+            raise FileNotFoundError(f"Missing source image for key '{image_key}': {src_path}")
+
+        shutil.copy2(src_path, dst_path)
+
+        if isinstance(cell, str):  # preserve the cell's original shape: plain string vs dict
+            rewritten_cells.append(str(dst_path))
+        else:
+            new_cell = dict(cell)
+            new_cell["path"] = str(dst_path)
+            rewritten_cells.append(new_cell)
+
+    return rewritten_cells
+
+
 def segment_dataset(
     input_root: Path,
     output_root: Path,
@@ -139,6 +261,12 @@ def segment_dataset(
         episode_id: Source episode index to slice.
         segments: List of ``(start, end)`` frame ranges in ``[start, end)`` form.
 
+    Notes:
+        For visual features (``dtype`` in ``{"image", "video"}``), per-episode
+        statistics (``min``, ``max``, ``mean``, ``std``) are inherited from the
+        source episode statistics and only the ``count`` is updated to the segment
+        length. They are not recomputed from the segmented visual data.
+
     Raises:
         ValueError: If inputs are invalid, source files are missing, or segment
             ranges are out of bounds.
@@ -207,6 +335,26 @@ def segment_dataset(
             if col_idx >= 0:
                 seg_table = seg_table.set_column(col_idx, key, arr)
 
+        # For image-based datasets, copy only the segment frames and rewrite image references.
+        image_keys = [k for k, ft in source_meta.features.items() if ft["dtype"] == "image"]
+        for image_key in image_keys:
+            if image_key not in seg_table.column_names:
+                continue
+            col_idx = seg_table.schema.get_field_index(image_key)
+            image_cells = seg_table.column(image_key).to_pylist()
+            rewritten = _copy_segment_images_and_rewrite_column(
+                image_cells=image_cells,
+                input_root=input_root,
+                output_root=output_root,
+                image_key=image_key,
+                output_episode_index=output_episode_index,
+                source_episode_index=episode_id,
+                source_segment_start=start,
+            )
+            seg_table = seg_table.set_column(
+                col_idx, image_key, pa.array(rewritten, type=seg_table.schema.field(image_key).type)
+            )
+
         episode_chunk = output_episode_index // chunks_size
         output_parquet_path = output_root / source_meta.data_path.format(
             episode_chunk=episode_chunk,
@@ -267,15 +415,15 @@ def segment_dataset(
         src_video_path = input_root / source_meta.get_video_file_path(episode_id, video_key)
         if not src_video_path.is_file():
             raise ValueError(f"Missing source video for key '{video_key}': {src_video_path}")
-        for output_episode_index in range(len(segments)):
+        for output_episode_index, (start, end) in enumerate(segments):
             episode_chunk = output_episode_index // chunks_size
             dst_video_path = output_root / video_path_template_str.format(
                 episode_chunk=episode_chunk,
                 video_key=video_key,
                 episode_index=output_episode_index,
             )
             dst_video_path.parent.mkdir(parents=True, exist_ok=True)
-            shutil.copy2(src_video_path, dst_video_path)
+            _trim_video_segment(src_video_path, dst_video_path, start, end)
 
     for episode in output_episodes:
         append_jsonlines(episode, output_root / EPISODES_PATH)
@@ -290,9 +438,6 @@ def segment_dataset(
     info["splits"] = {"train": f"0:{total_episodes}"}
     write_json(info, output_root / "meta" / "info.json")
 
-    # Ensure expected meta files exist and are explicit outputs.
-    _ = output_root / EPISODES_STATS_PATH
-
 
 def main() -> None:
     """CLI entry point."""
diff --git a/tests/datasets/test_segment_lerobot_dataset.py b/tests/datasets/test_segment_lerobot_dataset.py
@@ -16,6 +16,7 @@
 
 from pathlib import Path
 from typing import Any
+from unittest.mock import patch
 
 import numpy as np
 import pyarrow.parquet as pq
@@ -25,6 +26,24 @@
 from opentau.scripts.segment_lerobot_dataset import segment_dataset
 
 
+def _extract_image_path(cell: Any) -> str | None:
+    """Extract image path from a parquet image cell.
+
+    Args:
+        cell: A parquet image value (string path or dict with `path`/`bytes`).
+
+    Returns:
+        The cell itself if it is a string, a dict cell's non-empty string `path`, otherwise None.
+    """
+    if isinstance(cell, str):
+        return cell
+    if isinstance(cell, dict):
+        path = cell.get("path")
+        if isinstance(path, str) and path:
+            return path
+    return None
+
+
 def test_segment_lerobot_v21_dataset(tmp_path: Path, empty_lerobot_dataset_factory: Any) -> None:
     """Validate baseline segmentation behavior for v2.1 input.
 
@@ -207,3 +226,91 @@ def test_segment_lerobot_non_consecutive_and_overlapping_ranges(
     assert [float(x) for x in ep0["state"]] == [float(i) for i in range(0, 10)]
     assert [float(x) for x in ep1["state"]] == [float(i) for i in range(18, 23)]
     assert [float(x) for x in ep2["state"]] == [float(i) for i in range(5, 15)]
+
+
+def test_segment_lerobot_copies_image_files_for_segments(
+    tmp_path: Path, empty_lerobot_dataset_factory: Any
+) -> None:
+    """Ensure segmented datasets copy and rewrite image file references.
+
+    Args:
+        tmp_path: Temporary directory fixture provided by pytest.
+        empty_lerobot_dataset_factory: Fixture that creates a writable dataset.
+    """
+    input_root = tmp_path / "source_image_dataset"
+    output_root = tmp_path / "segmented_image_dataset"
+    image_key = "observation.images.camera"
+
+    features = {
+        "state": {"dtype": "float32", "shape": (1,), "names": None},
+        "actions": {"dtype": "float32", "shape": (1,), "names": None},
+        image_key: {"dtype": "image", "shape": (3, 8, 8), "names": ["channel", "height", "width"]},
+    }
+    dataset = empty_lerobot_dataset_factory(root=input_root, features=features, use_videos=False)
+    for i in range(8):  # a single 8-frame source episode; every value is derived from the frame index
+        dataset.add_frame(
+            {
+                "state": np.array([float(i)], dtype=np.float32),
+                "actions": np.array([float(i) + 1.0], dtype=np.float32),
+                "observation.images.camera": np.full((8, 8, 3), i / 8.0, dtype=np.float32),
+                "task": "image task",
+            }
+        )
+    dataset.save_episode()
+
+    segment_dataset(
+        input_root=input_root,
+        output_root=output_root,
+        episode_id=0,
+        segments=[(1, 4), (4, 8)],  # half-open ranges -> output episodes 0 and 1 (3 and 4 frames)
+    )
+
+    out_meta = LeRobotDatasetMetadata(repo_id=output_root.name, root=output_root)
+    ep0 = pq.read_table(output_root / out_meta.get_data_file_path(0)).to_pydict()
+    ep1 = pq.read_table(output_root / out_meta.get_data_file_path(1)).to_pydict()
+
+    ep0_paths = [_extract_image_path(cell) for cell in ep0[image_key]]
+    ep1_paths = [_extract_image_path(cell) for cell in ep1[image_key]]
+    assert all(path is not None for path in ep0_paths)
+    assert all(path is not None for path in ep1_paths)
+
+    for frame_idx, path in enumerate(ep0_paths):  # frame numbering restarts at 0 per output episode
+        assert path is not None
+        expected = output_root / f"images/{image_key}/episode_000000/frame_{frame_idx:06d}.png"
+        assert Path(path) == expected
+        assert expected.is_file()
+
+    for frame_idx, path in enumerate(ep1_paths):
+        assert path is not None
+        expected = output_root / f"images/{image_key}/episode_000001/frame_{frame_idx:06d}.png"
+        assert Path(path) == expected
+        assert expected.is_file()
+
+
+def test_trim_video_segment_uses_frame_range_filter(tmp_path: Path) -> None:
+    """Ensure ffmpeg trim command uses frame-range segmentation.
+
+    Args:
+        tmp_path: Temporary directory fixture provided by pytest.
+    """
+    src = tmp_path / "src.mp4"
+    dst = tmp_path / "dst.mp4"
+    src.write_bytes(b"fake")  # never decoded: subprocess.run is mocked below
+
+    with (
+        patch("opentau.scripts.segment_lerobot_dataset.shutil.which", return_value="/usr/bin/ffmpeg"),
+        patch("opentau.scripts.segment_lerobot_dataset.subprocess.run") as run_mock,
+    ):
+        run_mock.return_value.returncode = 0  # simulate a successful ffmpeg invocation
+        run_mock.return_value.stderr = ""
+
+        from opentau.scripts.segment_lerobot_dataset import _trim_video_segment
+
+        _trim_video_segment(src, dst, 5, 15)
+
+    assert run_mock.call_count == 1
+    cmd = run_mock.call_args.args[0]  # first positional argument: the argv list handed to subprocess.run
+    assert "ffmpeg" in cmd[0]
+    assert "-vf" in cmd
+    vf_expr = cmd[cmd.index("-vf") + 1]
+    assert "trim=start_frame=5:end_frame=15" in vf_expr