31 changes: 13 additions & 18 deletions src/lerobot/datasets/dataset_tools.py
```diff
@@ -962,28 +962,23 @@ def _copy_data_with_feature_changes(
     remove_features: list[str] | None = None,
 ) -> None:
     """Copy data while adding or removing features."""
-    if dataset.meta.episodes is None:
-        dataset.meta.episodes = load_episodes(dataset.meta.root)
+    data_dir = dataset.root / "data"
+    parquet_files = sorted(data_dir.glob("*/*.parquet"))

-    # Map file paths to episode indices to extract chunk/file indices
-    file_to_episodes: dict[Path, set[int]] = {}
-    for ep_idx in range(dataset.meta.total_episodes):
-        file_path = dataset.meta.get_data_file_path(ep_idx)
-        if file_path not in file_to_episodes:
-            file_to_episodes[file_path] = set()
-        file_to_episodes[file_path].add(ep_idx)
+    if not parquet_files:
+        raise ValueError(f"No parquet files found in {data_dir}")

     frame_idx = 0

-    for src_path in tqdm(sorted(file_to_episodes.keys()), desc="Processing data files"):
-        df = pd.read_parquet(dataset.root / src_path).reset_index(drop=True)
+    for src_path in tqdm(parquet_files, desc="Processing data files"):
+        df = pd.read_parquet(src_path).reset_index(drop=True)
+
+        relative_path = src_path.relative_to(dataset.root)
```
**Copilot AI** commented (Oct 29, 2025):

Hardcoded array indexing without bounds checking. If the path structure differs from the expected `data/chunk-XXX/file-YYY.parquet`, this will raise an `IndexError`. Consider validating that `len(relative_path.parts) >= 3` before accessing the indices.

Suggested change:

```python
relative_path = src_path.relative_to(dataset.root)
if len(relative_path.parts) < 3:
    raise ValueError(
        f"Unexpected path structure for {src_path}: expected at least 3 parts, got {len(relative_path.parts)} ({relative_path.parts})"
    )
```
```diff
-        # Get chunk_idx and file_idx from the source file's first episode
-        episodes_in_file = file_to_episodes[src_path]
-        first_ep_idx = min(episodes_in_file)
-        src_ep = dataset.meta.episodes[first_ep_idx]
-        chunk_idx = src_ep["data/chunk_index"]
-        file_idx = src_ep["data/file_index"]
+        chunk_dir = relative_path.parts[1]
+        file_name = relative_path.parts[2]
+
+        chunk_idx = int(chunk_dir.split("-")[1])
+        file_idx = int(file_name.split("-")[1].split(".")[0])
```
**Copilot AI** commented on lines +980 to +981 (Oct 29, 2025):

String parsing without validation. If the file naming convention differs from `chunk-{number}` or `file-{number}.parquet`, this will raise an `IndexError` or `ValueError`. Consider adding error handling or validation to ensure the expected format before parsing, or using a regex with pattern matching for more robust extraction.
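A minimal sketch of the regex-based extraction this comment suggests. The `_parse_chunk_file_indices` helper and the compiled patterns are hypothetical names, assuming the `data/chunk-XXX/file-YYY.parquet` convention the PR relies on:

```python
import re
from pathlib import Path

# Hypothetical helper (not part of the PR): validate the expected
# data/chunk-XXX/file-YYY.parquet layout before extracting indices.
_CHUNK_RE = re.compile(r"^chunk-(\d+)$")
_FILE_RE = re.compile(r"^file-(\d+)\.parquet$")


def _parse_chunk_file_indices(relative_path: Path) -> tuple[int, int]:
    """Return (chunk_idx, file_idx), failing loudly on unexpected paths."""
    if len(relative_path.parts) < 3:
        raise ValueError(f"Unexpected path structure: {relative_path}")
    chunk_match = _CHUNK_RE.match(relative_path.parts[1])
    file_match = _FILE_RE.match(relative_path.parts[2])
    if chunk_match is None or file_match is None:
        raise ValueError(
            f"{relative_path} does not match data/chunk-XXX/file-YYY.parquet"
        )
    return int(chunk_match.group(1)), int(file_match.group(1))
```

With such a helper, the two `split("-")` lines above would collapse to `chunk_idx, file_idx = _parse_chunk_file_indices(relative_path)`, and a malformed path would raise a descriptive `ValueError` instead of a bare `IndexError`.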

```diff
         if remove_features:
             df = df.drop(columns=remove_features, errors="ignore")
@@ -1009,7 +1004,7 @@ def _copy_data_with_feature_changes(
             df[feature_name] = feature_slice
         frame_idx = end_idx

-        # Write using the preserved chunk_idx and file_idx from source
+        # Write using the same chunk/file structure as source
         dst_path = new_meta.root / DEFAULT_DATA_PATH.format(chunk_index=chunk_idx, file_index=file_idx)
         dst_path.parent.mkdir(parents=True, exist_ok=True)
```