-
Notifications
You must be signed in to change notification settings - Fork 2.9k
fix(dataset_tools) Critical bug in modify features #2342
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -962,28 +962,23 @@ def _copy_data_with_feature_changes( | |
| remove_features: list[str] | None = None, | ||
| ) -> None: | ||
| """Copy data while adding or removing features.""" | ||
| if dataset.meta.episodes is None: | ||
| dataset.meta.episodes = load_episodes(dataset.meta.root) | ||
| data_dir = dataset.root / "data" | ||
| parquet_files = sorted(data_dir.glob("*/*.parquet")) | ||
|
|
||
| # Map file paths to episode indices to extract chunk/file indices | ||
| file_to_episodes: dict[Path, set[int]] = {} | ||
| for ep_idx in range(dataset.meta.total_episodes): | ||
| file_path = dataset.meta.get_data_file_path(ep_idx) | ||
| if file_path not in file_to_episodes: | ||
| file_to_episodes[file_path] = set() | ||
| file_to_episodes[file_path].add(ep_idx) | ||
| if not parquet_files: | ||
| raise ValueError(f"No parquet files found in {data_dir}") | ||
|
|
||
| frame_idx = 0 | ||
|
|
||
| for src_path in tqdm(sorted(file_to_episodes.keys()), desc="Processing data files"): | ||
| df = pd.read_parquet(dataset.root / src_path).reset_index(drop=True) | ||
| for src_path in tqdm(parquet_files, desc="Processing data files"): | ||
| df = pd.read_parquet(src_path).reset_index(drop=True) | ||
|
|
||
| relative_path = src_path.relative_to(dataset.root) | ||
| chunk_dir = relative_path.parts[1] | ||
| file_name = relative_path.parts[2] | ||
|
|
||
| # Get chunk_idx and file_idx from the source file's first episode | ||
| episodes_in_file = file_to_episodes[src_path] | ||
| first_ep_idx = min(episodes_in_file) | ||
| src_ep = dataset.meta.episodes[first_ep_idx] | ||
| chunk_idx = src_ep["data/chunk_index"] | ||
| file_idx = src_ep["data/file_index"] | ||
| chunk_idx = int(chunk_dir.split("-")[1]) | ||
| file_idx = int(file_name.split("-")[1].split(".")[0]) | ||
|
Comment on lines
+980
to
+981
|
||
|
|
||
| if remove_features: | ||
| df = df.drop(columns=remove_features, errors="ignore") | ||
|
|
@@ -1009,7 +1004,7 @@ def _copy_data_with_feature_changes( | |
| df[feature_name] = feature_slice | ||
| frame_idx = end_idx | ||
|
|
||
| # Write using the preserved chunk_idx and file_idx from source | ||
| # Write using the same chunk/file structure as source | ||
| dst_path = new_meta.root / DEFAULT_DATA_PATH.format(chunk_index=chunk_idx, file_index=file_idx) | ||
| dst_path.parent.mkdir(parents=True, exist_ok=True) | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hardcoded array indexing without bounds checking. If the path structure differs from the expected
`data/chunk-XXX/file-YYY.parquet`, this will raise an `IndexError`. Consider validating that `len(relative_path.parts) >= 3` before accessing the indices.