Changes from 2 commits
4 changes: 2 additions & 2 deletions src/b2aiprep/commands.py
@@ -638,7 +638,7 @@ def validate_bundled_dataset(dataset_path, config_dir):
     with open(config_dir / "participants_to_remove.json") as f:
         participants_to_remove = set(json.load(f))
 
-    with open(config_dir / "sensitive_audio_tasks.json") as f:
+    with open(config_dir / "audio_tasks_to_include.json") as f:
        sensitive_tasks = set(json.load(f))
    sensitive_tasks_normalized = {
        normalize_task_label(task)
@@ -809,7 +809,7 @@ def deidentify_bids_dataset(
         - participants_to_remove.json
         - audio_to_remove.json
         - id_remapping.json
-        - sensitive_audio_tasks.json
+        - audio_tasks_to_include.json
     """
     bids_path = Path(bids_path)
     deidentify_config_dir = Path(deidentify_config_dir)
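Since the docstring above only names the expected config files, here is an illustrative sketch of a deidentify config directory under the new layout (file contents, including the task names, are hypothetical, not values from the PR):

```python
# Illustrative setup of a deidentify config dir after the rename.
# File set follows the docstring above; the task names are hypothetical.
import json
from pathlib import Path

config_dir = Path("deidentify_config")
config_dir.mkdir(exist_ok=True)

defaults = {
    "participants_to_remove.json": [],
    "audio_to_remove.json": [],
    "id_remapping.json": {},
    # Renamed from sensitive_audio_tasks.json; now an allow-list of tasks.
    "audio_tasks_to_include.json": ["Free Speech", "Rainbow Passage"],
}
for filename, content in defaults.items():
    with open(config_dir / filename, "w") as f:
        json.dump(content, f, indent=2)
```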
57 changes: 40 additions & 17 deletions src/b2aiprep/prepare/dataset.py
@@ -1701,18 +1701,18 @@ def load_audio_filestems_to_remove(publish_config_dir: Path) -> t.List[str]:
         return data
 
     @staticmethod
-    def load_sensitive_audio_tasks(deidentify_config_dir: Path) -> t.List[str]:
+    def load_audio_tasks_to_include(deidentify_config_dir: Path) -> t.List[str]:
         """Load list of audio tasks that are sensitive from JSON file."""
-        sensitive_audio_tasks_path = deidentify_config_dir / "sensitive_audio_tasks.json"
-        if not sensitive_audio_tasks_path.exists():
+        audio_tasks_to_include_path = deidentify_config_dir / "audio_tasks_to_include.json"
+        if not audio_tasks_to_include_path.exists():
             # If file doesn't exist, raise an error
-            raise FileNotFoundError(f"Sensitive audio tasks file {sensitive_audio_tasks_path} does not exist.")
+            raise FileNotFoundError(f"Sensitive audio tasks file {audio_tasks_to_include_path} does not exist.")
 
-        with open(sensitive_audio_tasks_path, 'r') as f:
+        with open(audio_tasks_to_include_path, 'r') as f:
             data = json.load(f)
 
         if not isinstance(data, list):
-            raise ValueError(f"Sensitive audio tasks file {sensitive_audio_tasks_path} should contain a list of audio task names.")
+            raise ValueError(f"Sensitive audio tasks file {audio_tasks_to_include_path} should contain a list of audio task names.")
 
         return data

@@ -1741,7 +1741,8 @@ def deidentify(self, outdir: t.Union[str, Path], deidentify_config_dir: Path, sk
         participant_ids_to_remap = BIDSDataset.load_remap_id_list(deidentify_config_dir)
         participant_ids_to_remove = BIDSDataset.load_participant_ids_to_remove(deidentify_config_dir)
         audio_filestems_to_remove = BIDSDataset.load_audio_filestems_to_remove(deidentify_config_dir)
-        sensitive_audio_tasks = BIDSDataset.load_sensitive_audio_tasks(deidentify_config_dir)
+        #sensitive_audio_tasks = BIDSDataset.load_sensitive_audio_tasks(deidentify_config_dir)
+        audio_tasks_to_include = BIDSDataset.load_audio_tasks_to_include(deidentify_config_dir)
         participant_session_id_to_remap = BIDSDataset.map_sequential_session_ids(self.data_path)
 
         # Allow users to specify filestems either in the original ID space (pre-deidentify)
@@ -1784,7 +1785,7 @@ def deidentify(self, outdir: t.Union[str, Path], deidentify_config_dir: Path, sk
                 outdir,
                 participant_ids_to_remove,
                 audio_filestems_to_remove,
-                sensitive_audio_tasks,
+                audio_tasks_to_include,
                 participant_ids_to_remap,
                 participant_session_id_to_remap,
                 max_workers=max_workers,
@@ -1799,7 +1800,7 @@ def deidentify(self, outdir: t.Union[str, Path], deidentify_config_dir: Path, sk
                 outdir,
                 participant_ids_to_remove,
                 audio_filestems_to_remove,
-                sensitive_audio_tasks,
+                audio_tasks_to_include,
                 participant_ids_to_remap,
                 participant_session_id_to_remap,
                 max_workers=max_workers,
@@ -1843,7 +1844,6 @@ def _apply_exclusion_list_to_filepaths(
         exclusion = set(exclusion_list)
         if len(exclusion) == 0:
             return paths
-
         def _canonical_recording_stem(stem: str) -> str:
             """Canonicalize a BIDS-like recording stem for matching.
 
@@ -1894,6 +1894,29 @@ def _canonical_recording_stem(stem: str) -> str:
             )
         return paths
 
+    @staticmethod
+    def _grab_filepaths_list_to_include(
+        paths: t.List[Path],
+        inclusion_list: t.List[str],
+    ) -> t.List[Path]:
+        """Grabs filepaths based on overlap with a specified inclusion list."""
+        n = len(paths)
+        inclusion = set(inclusion_list)
+        if len(inclusion) == 0:
+            return paths
+
+        new_paths = []
+        for file in paths:
+            for incl in inclusion_list:
+                if normalize_task_label(incl) in normalize_task_label(file.stem):
+                    new_paths.append(file)
Comment on lines 1902 to 1911
Contributor

high

This implementation has a few issues:

  1. It can add duplicate file paths to new_paths if a file stem matches multiple inclusion criteria.
  2. It iterates over inclusion_list instead of the inclusion set that was created, which is less efficient if the list contains duplicates.
  3. The normalization of inclusion strings and file stems is done repeatedly inside the loops, which is inefficient.

I suggest refactoring this part to be more efficient and to prevent adding duplicate paths.

Suggested change
-        n = len(paths)
-        inclusion = set(inclusion_list)
-        if len(inclusion) == 0:
-            return paths
-        new_paths = []
-        for file in paths:
-            for incl in inclusion_list:
-                if normalize_task_label(incl) in normalize_task_label(file.stem):
-                    new_paths.append(file)
+        n = len(paths)
+        inclusion_set = {normalize_task_label(i) for i in inclusion_list}
+        if not inclusion_set:
+            return paths
+        new_paths = []
+        for file in paths:
+            normalized_stem = normalize_task_label(file.stem)
+            if any(incl in normalized_stem for incl in inclusion_set):
+                new_paths.append(file)

Contributor Author

Added an early break; that should achieve the same result.

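For reference, a minimal sketch of the early-break variant described above (standalone, with `normalize_task_label` stubbed as a hypothetical normalizer, so this is not the exact committed code):

```python
# Sketch of the early-break fix: each file is appended at most once.
import typing as t
from pathlib import Path


def normalize_task_label(label: str) -> str:
    # Hypothetical stand-in for b2aiprep's normalizer.
    return "".join(ch for ch in label.lower() if ch.isalnum())


def _grab_filepaths_list_to_include(
    paths: t.List[Path],
    inclusion_list: t.List[str],
) -> t.List[Path]:
    if not inclusion_list:
        return paths
    new_paths = []
    for file in paths:
        for incl in inclusion_list:
            if normalize_task_label(incl) in normalize_task_label(file.stem):
                new_paths.append(file)
                break  # stop after the first match to avoid duplicate paths
    return new_paths
```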
Contributor

You don't do the efficiency improvements it mentions, but you do solve the possible duplicate-path error.


+
+        if len(new_paths) < n:
+            _LOGGER.info(
+                f"Removed {n - len(new_paths)} records due to not being part of inclusion list: {inclusion_list}."
+            )
+        return new_paths
+
     @staticmethod
     def _expand_filestems_for_deidentification(
         filestems: t.Iterable[str],
@@ -1996,7 +2019,7 @@ def _deidentify_audio_files(
         outdir: Path,
         exclude_participant_ids: t.List[str] = [],
         exclude_audio_filestems: t.List[str] = [],
-        sensitive_audio_task_list: t.List[str] = [],
+        audio_tasks_to_include_list: t.List[str] = [],
         participant_ids_to_remap: t.Dict[str, str] = {},
         participant_session_id_to_remap: t.Dict[str, str] = {},
         max_workers: int = 16,
@@ -2041,9 +2064,9 @@ def _deidentify_audio_files(
             audio_paths, exclusion_list=['audio-check'], exclusion_type='filestem_contains'
         )
 
-        sensitive_audio_task_list = [f"task-{normalize_task_label(task)}" for task in sensitive_audio_task_list]
-        audio_paths = BIDSDataset._apply_exclusion_list_to_filepaths(
-            audio_paths, exclusion_list=sensitive_audio_task_list, exclusion_type='filestem_contains'
+        sensitive_audio_task_list = [f"task-{normalize_task_label(task)}" for task in audio_tasks_to_include_list]
+        audio_paths = BIDSDataset._grab_filepaths_list_to_include(
+            audio_paths, inclusion_list=sensitive_audio_task_list
         )
 
         _LOGGER.info(f"Copying {len(audio_paths)} recordings.")
@@ -2085,7 +2108,7 @@ def _deidentify_feature_files(
         outdir: Path,
         exclude_participant_ids: t.List[str] = [],
         exclude_audio_filestems: t.List[str] = [],
-        sensitive_audio_task_list: t.List[str] = [],
+        audio_tasks_to_include_list: t.List[str] = [],
         participant_ids_to_remap: t.Dict[str, str] = {},
         participant_session_id_to_remap: t.Dict[str, str] = {},
         max_workers: int = 16,
@@ -2130,7 +2153,7 @@ def _deidentify_feature_files(
             paths, exclusion_list=['audio-check'], exclusion_type='filestem_contains'
         )
 
-        sensitive_task_labels = {normalize_task_label(t) for t in sensitive_audio_task_list}
+        audio_task_labels = {normalize_task_label(t) for t in audio_tasks_to_include_list}
 
         _LOGGER.info(f"Copying {len(paths)} feature files.")
 
@@ -2157,7 +2180,7 @@ def process_feature_file(features_path):
 
             # if it is not sensitive and we want to keep features, move all features over
             task_name = BIDSDataset._extract_task_name_from_path(features_path)
-            if normalize_task_label(task_name) not in sensitive_task_labels:
+            if normalize_task_label(task_name) in audio_task_labels:
                 shutil.copy(features_path, output_path)
             else:
                 features = torch.load(features_path, weights_only=False, map_location=torch.device('cpu'))
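To make the new inclusion semantics concrete, here is a self-contained sketch of how the task-prefixed inclusion strings are assumed to match recording filestems (the normalizer is again a hypothetical stub, and the paths are invented):

```python
# Self-contained sketch of the assumed matching behavior; not a test from the PR.
from pathlib import Path


def normalize_task_label(label: str) -> str:
    # Hypothetical stand-in for b2aiprep's normalizer.
    return "".join(ch for ch in label.lower() if ch.isalnum())


paths = [
    Path("sub-01_ses-1_task-Free-Speech_rec-001.wav"),
    Path("sub-01_ses-1_task-Cough_rec-001.wav"),
]
inclusion = [f"task-{normalize_task_label('Free Speech')}"]  # -> "task-freespeech"

# Mirrors the substring test in _grab_filepaths_list_to_include.
kept = [
    p for p in paths
    if any(normalize_task_label(i) in normalize_task_label(p.stem) for i in inclusion)
]
assert [p.name for p in kept] == ["sub-01_ses-1_task-Free-Speech_rec-001.wav"]
```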
2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -22,7 +22,7 @@ def setup_publish_config(tmp_path):
         "audio_filestems_to_remove.json": [],
         "id_remapping.json": {},
         "participants_to_remove.json": [],
-        "sensitive_audio_tasks.json": [],
+        "audio_tasks_to_include.json": [],
     }
 
     for filename, content in defaults.items():