diff --git a/pyproject.toml b/pyproject.toml index c7174edb..44510e93 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,7 @@ dependencies = [ "formulaic >=0.3", "sqlalchemy >=1.4.31", "bids-validator>=1.14.7", # Keep up-to-date to ensure support for recent modalities + "bidsschematools>=1.1.0", # Required for schema-based configs "num2words >=0.5.10", "click >=8.0", "universal_pathlib >=0.2.2", diff --git a/schema_integration_demo.ipynb b/schema_integration_demo.ipynb new file mode 100644 index 00000000..0109eeef --- /dev/null +++ b/schema_integration_demo.ipynb @@ -0,0 +1,437 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BIDS Schema Integration Demo\n", + "\n", + "This notebook demonstrates the new BIDS schema integration features in PyBIDS, showing how the new schema-based configuration compares to the legacy JSON-based configuration." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from bids import BIDSLayout\n", + "from bids.tests import get_test_data_path\n", + "import os" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup: Using a test dataset\n", + "\n", + "We'll use the 7T TRT dataset that comes with PyBIDS for testing." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using dataset at: /home/ashley/repos/bids/pybids/tests/data/7t_trt\n", + "total 76\n", + "drwxr-xr-x 12 ashley ashley 4096 Aug 21 08:59 .\n", + "drwxr-xr-x 8 ashley ashley 4096 Aug 21 08:59 ..\n", + "-rw-r--r-- 1 ashley ashley 0 Aug 21 08:59 README\n", + "-rw-r--r-- 1 ashley ashley 55 Aug 21 08:59 dataset_description.json\n", + "-rw-r--r-- 1 ashley ashley 482 Aug 21 08:59 participants.tsv\n", + "drwxr-xr-x 4 ashley ashley 4096 Aug 21 08:59 sub-01\n", + "drwxr-xr-x 4 ashley ashley 4096 Aug 21 08:59 sub-02\n", + "drwxr-xr-x 4 ashley ashley 4096 Aug 21 08:59 sub-03\n", + "drwxr-xr-x 4 ashley ashley 4096 Aug 21 08:59 sub-04\n" + ] + } + ], + "source": [ + "# Get path to test dataset\n", + "data_path = os.path.join(get_test_data_path(), '7t_trt')\n", + "print(f\"Using dataset at: {data_path}\")\n", + "\n", + "# List the first few files to see the structure\n", + "!ls -la {data_path} | head -10" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part 1: Legacy JSON-based Configuration\n", + "\n", + "The traditional PyBIDS approach uses JSON configuration files to define entities and their patterns." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Legacy layout initialized with 339 files\n", + "\n", + "Available entities: ['CogAtlasID', 'Columns', 'EchoTime', 'EchoTime1', 'EchoTime2', 'EffectiveEchoSpacing', 'IntendedFor', 'MTState', 'PhaseEncodingDirection', 'RepetitionTime', 'SamplingFrequency', 'SliceEncodingDirection', 'SliceTiming', 'StartTime', 'TaskName', 'acquisition', 'ceagent', 'chunk', 'datatype', 'direction', 'echo', 'extension', 'flip', 'fmap', 'inv', 'modality', 'mt', 'nucleus', 'part', 'proc', 'reconstruction', 'recording', 'run', 'sample', 'scans', 'session', 'space', 'staining', 'subject', 'suffix', 'task', 'tracer', 'tracksys', 'volume']\n", + "\n", + "Layout info:\n", + "BIDS Layout: .../bids/pybids/tests/data/7t_trt | Subjects: 10 | Sessions: 20 | Runs: 20\n" + ] + } + ], + "source": [ + "# Initialize layout with legacy config (this is the default)\n", + "layout_legacy = BIDSLayout(data_path, config=['bids'])\n", + "print(f\"Legacy layout initialized with {len(layout_legacy.get())} files\")\n", + "print(f\"\\nAvailable entities: {sorted(layout_legacy.get_entities().keys())}\")\n", + "\n", + "# Print basic layout information like in the docs tutorial\n", + "print(f\"\\nLayout info:\")\n", + "print(layout_legacy)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 6 BOLD files for subject 01:\n", + " sub-01_ses-1_task-rest_acq-fullbrain_run-1_bold.nii.gz\n", + " sub-01_ses-1_task-rest_acq-fullbrain_run-2_bold.nii.gz\n", + " sub-01_ses-1_task-rest_acq-prefrontal_bold.nii.gz\n", + " sub-01_ses-2_task-rest_acq-fullbrain_run-1_bold.nii.gz\n", + " sub-01_ses-2_task-rest_acq-fullbrain_run-2_bold.nii.gz\n", + " sub-01_ses-2_task-rest_acq-prefrontal_bold.nii.gz\n", + "\n", + "Compare return types:\n", + "return_types=[<class 'bids.layout.models.BIDSImageFile'>, <class 'bids.layout.models.BIDSImageFile'>]\n", + "return_types=[<class 'str'>, <class 'str'>]\n" + ] + } + ], + "source": [ + "# Query example: Get all BOLD files for subject 01\n", + "bold_files_legacy = layout_legacy.get(\n", + " subject='01', \n", + " suffix='bold', \n", + " extension='nii.gz',\n", + " return_type='filename' # Returns filenames instead of BIDSFile objects\n", + ")\n", + "print(f\"Found {len(bold_files_legacy)} BOLD files for subject 01:\")\n", + "for f in bold_files_legacy:\n", + " print(f\" {os.path.basename(f)}\")\n", + "\n", + "# Show the difference between return types like in docs\n", + "print(f\"\\nCompare return types:\")\n", + "bold_objects = layout_legacy.get(subject='01', suffix='bold', extension='nii.gz')[:2]\n", + "bold_filenames = layout_legacy.get(subject='01', suffix='bold', extension='nii.gz', return_type='filename')[:2]\n", + "\n", + "print(f\"return_types={[type(obj) for obj in bold_objects]}\")\n", + "print(f\"return_types={[type(fname) for fname in bold_filenames]}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part 2: New Schema-based Configuration\n", + "\n", + "The new approach uses the BIDS schema directly via `bidsschematools` to understand the dataset structure." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Schema-based layout initialized with 339 files\n", + "\n", + "Available entities: ['CogAtlasID', 'Columns', 'EchoTime', 'EchoTime1', 'EchoTime2', 'EffectiveEchoSpacing', 'IntendedFor', 'MTState', 'PhaseEncodingDirection', 'RepetitionTime', 'SamplingFrequency', 'SliceEncodingDirection', 'SliceTiming', 'StartTime', 'TaskName', 'acquisition', 'ceagent', 'chunk', 'datatype', 'density', 'description', 'direction', 'echo', 'extension', 'flip', 'hemisphere', 'inversion', 'label', 'modality', 'mtransfer', 'nucleus', 'part', 'processing', 'reconstruction', 'recording', 'resolution', 'run', 'sample', 'segmentation', 'session', 'space', 'split', 'stain', 'subject', 'suffix', 'task', 'tracer', 'tracksys', 'volume']\n", + "\n", + "Layout info:\n", + "BIDS Layout: .../bids/pybids/tests/data/7t_trt | Subjects: 10 | Sessions: 20 | Runs: 20\n" + ] + } + ], + "source": [ + "# Initialize layout with schema-based config\n", + "# This uses Config.load('bids-schema') internally when you pass 'bids-schema' in config\n", + "layout_schema = BIDSLayout(data_path, config=['bids-schema'])\n", + "print(f\"Schema-based layout initialized with {len(layout_schema.get())} files\")\n", + "print(f\"\\nAvailable entities: {sorted(layout_schema.get_entities().keys())}\")\n", + "\n", + "# Print basic layout information\n", + "print(f\"\\nLayout info:\")\n", + "print(layout_schema)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 6 BOLD files for subject 01:\n", + " sub-01_ses-1_task-rest_acq-fullbrain_run-1_bold.nii.gz\n", + " sub-01_ses-1_task-rest_acq-fullbrain_run-2_bold.nii.gz\n", + " sub-01_ses-1_task-rest_acq-prefrontal_bold.nii.gz\n", + " sub-01_ses-2_task-rest_acq-fullbrain_run-1_bold.nii.gz\n", + " sub-01_ses-2_task-rest_acq-fullbrain_run-2_bold.nii.gz\n", + " sub-01_ses-2_task-rest_acq-prefrontal_bold.nii.gz\n", + "\n", + "Compare return types:\n", + "return_types=[<class 'bids.layout.models.BIDSImageFile'>, <class 'bids.layout.models.BIDSImageFile'>]\n", + "return_types=[<class 'str'>, <class 'str'>]\n" + ] + } + ], + "source": [ + "# Same query with schema-based layout\n", + "bold_files_schema = layout_schema.get(\n", + " subject='01', \n", + " suffix='bold', \n", + " extension='nii.gz',\n", + " return_type='filename'\n", + ")\n", + "print(f\"Found {len(bold_files_schema)} BOLD files for subject 01:\")\n", + "for f in bold_files_schema:\n", + " print(f\" {os.path.basename(f)}\")\n", + "\n", + "# Show the difference between return types like in docs\n", + "print(f\"\\nCompare return types:\")\n", + "bold_objects_schema = layout_schema.get(subject='01', suffix='bold', extension='nii.gz')[:2]\n", + "bold_filenames_schema = layout_schema.get(subject='01', suffix='bold', extension='nii.gz', return_type='filename')[:2]\n", + "\n", + "print(f\"return_types={[type(obj) for obj in bold_objects_schema]}\")\n", + "print(f\"return_types={[type(fname) for fname in bold_filenames_schema]}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part 3: Comparing the Two Approaches\n", + "\n", + "Let's verify that both approaches give us the same results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Legacy layout: 339 files\n", + "Schema layout: 339 files\n", + "\n", + "Match: True\n" + ] + } + ], + "source": [ + "# Compare file counts\n", + "legacy_count = len(layout_legacy.get())\n", + "schema_count = len(layout_schema.get())\n", + "\n", + "print(f\"Legacy layout: {legacy_count} files\")\n", + "print(f\"Schema layout: {schema_count} files\")\n", + "print(f\"\\nMatch: {legacy_count == schema_count}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Legacy entities: 44\n", + "Schema entities: 49\n", + "\n", + "Entities only in legacy: ['fmap', 'inv', 'mt', 'proc', 'scans', 'staining']\n", + "\n", + "Entities only in schema: ['density', 'description', 'hemisphere', 'inversion', 'label', 'mtransfer', 'processing', 'resolution', 'segmentation', 'split', 'stain']\n" + ] + } + ], + "source": [ + "# Compare entities\n", + "legacy_entities = set(layout_legacy.get_entities().keys())\n", + "schema_entities = set(layout_schema.get_entities().keys())\n", + "\n", + "print(f\"Legacy entities: {len(legacy_entities)}\")\n", + "print(f\"Schema entities: {len(schema_entities)}\")\n", + "\n", + "# Check for differences\n", + "only_legacy = legacy_entities - schema_entities\n", + "only_schema = schema_entities - legacy_entities\n", + "\n", + "if only_legacy:\n", + " print(f\"\\nEntities only in legacy: {sorted(only_legacy)}\")\n", + "if only_schema:\n", + " print(f\"\\nEntities only in schema: {sorted(only_schema)}\")\n", + "if not only_legacy and not only_schema:\n", + " print(\"\\nAll entities match!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Query: {'suffix': 'bold', 'extension': 'nii.gz'}\n", + "Legacy: 60 files\n", + "Schema: 60 files\n", + "✓ Results match!\n", + "\n", + "Query: {'suffix': 'T1w', 'extension': 'nii.gz'}\n", + "Legacy: 10 files\n", + "Schema: 10 files\n", + "✓ Results match!\n", + "\n", + "Query: {'subject': '01', 'suffix': 'scans', 'extension': 'tsv'}\n", + "Legacy: 2 files\n", + "Schema: 2 files\n", + "✓ Results match!\n", + "\n" + ] + } + ], + "source": [ + "# Compare specific query results\n", + "def compare_queries(layout1, layout2, name1=\"Layout 1\", name2=\"Layout 2\", **kwargs):\n", + " \"\"\"Compare results from two layouts for the same query.\"\"\"\n", + " result1 = set(layout1.get(return_type='filename', **kwargs))\n", + " result2 = set(layout2.get(return_type='filename', **kwargs))\n", + " \n", + " print(f\"Query: {kwargs}\")\n", + " print(f\"{name1}: {len(result1)} files\")\n", + " print(f\"{name2}: {len(result2)} files\")\n", + " \n", + " if result1 == result2:\n", + " print(\"✓ Results match!\\n\")\n", + " else:\n", + " only1 = result1 - result2\n", + " only2 = result2 - result1\n", + " if only1:\n", + " print(f\"Only in {name1}: {[os.path.basename(f) for f in only1]}\")\n", + " if only2:\n", + " print(f\"Only in {name2}: {[os.path.basename(f) for f in only2]}\")\n", + " print()\n", + "\n", + "# Test various queries\n", + "compare_queries(layout_legacy, layout_schema, \"Legacy\", \"Schema\", \n", + " suffix='bold', extension='nii.gz')\n", + "\n", + "compare_queries(layout_legacy, layout_schema, \"Legacy\", \"Schema\",\n", + " suffix='T1w', extension='nii.gz')\n", + "\n", + "# 1. 
Test with scans TSV files (these exist)\n", + "compare_queries(layout_legacy, layout_schema, \"Legacy\", \"Schema\",\n", + "                subject='01', suffix='scans', extension='tsv')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part 4: Performance Comparison\n", + "\n", + "Let's compare the initialization time for both approaches." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Legacy initialization: 0.684 seconds\n", + "Schema initialization: 0.679 seconds\n", + "\n", + "Difference: 0.005 seconds\n", + "Schema is faster by 0.8%\n" + ] + } + ], + "source": [ + "import time\n", + "\n", + "# Time legacy initialization\n", + "start = time.time()\n", + "layout_legacy_timed = BIDSLayout(data_path, config=['bids'], reset_database=True)\n", + "legacy_time = time.time() - start\n", + "\n", + "# Time schema initialization\n", + "start = time.time()\n", + "layout_schema_timed = BIDSLayout(data_path, config=['bids-schema'], reset_database=True)\n", + "schema_time = time.time() - start\n", + "\n", + "print(f\"Legacy initialization: {legacy_time:.3f} seconds\")\n", + "print(f\"Schema initialization: {schema_time:.3f} seconds\")\n", + "print(f\"\\nDifference: {abs(legacy_time - schema_time):.3f} seconds\")\n", + "print(f\"Schema is {'faster' if schema_time < legacy_time else 'slower'} by {abs(1 - schema_time/legacy_time)*100:.1f}%\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.18" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/src/bids/layout/layout.py b/src/bids/layout/layout.py index 998d26a3..3ffad96b 100644 --- a/src/bids/layout/layout.py +++ b/src/bids/layout/layout.py @@ -76,7 +76,8 @@ class BIDSLayout: file in order to be indexed. config : str or list or None, optional Optional name(s) of configuration file(s) to use. - By default (None), uses 'bids'. + By default (None), uses 'bids'. Can also be 'bids-schema' to use + the official BIDS schema instead of JSON config files. sources : :obj:`bids.layout.BIDSLayout` or list or None, optional Optional BIDSLayout(s) from which the current BIDSLayout is derived. 
config_filename : str @@ -414,6 +415,89 @@ def get_entities(self, scope='all', metadata=None): entities.update({e.name: e for e in results}) return entities + def _find_directories_for_entity_using_schema(self, entity_name, results): + """Find directories using schema directory rules.""" + from bidsschematools import schema as bst_schema + import os + + # Load schema directory rules + bids_schema = bst_schema.load_schema() + + directory_rules = bids_schema.rules.directories.raw + directories = set() + + # Build directory paths using schema rules for each file + for f in results: + if entity_name in f.entities: + # Build path based on schema directory hierarchy + path_components = self._build_path_from_schema_rules(f.entities, directory_rules) + if path_components: + # Convert to full directory path + directory_path = os.path.join(self.root, *path_components) + directories.add(directory_path) + + return list(directories) + + def _build_path_from_schema_rules(self, entities, directory_rules): + """Build directory path components from schema rules and entity values.""" + path_components = [] + current_level = 'root' + + # Traverse schema directory hierarchy + while current_level: + rule = directory_rules.get(current_level) + if not rule: # pragma: no cover + # Schema always provides valid rules; current_level only set from schema subdirs + break + + rule_dict = dict(rule) + rule_entity = rule_dict.get('entity') + + # If this rule corresponds to an entity, add directory component + if rule_entity and rule_entity in entities: + entity_value = entities[rule_entity] + if rule_entity == 'subject': + path_components.append(f'sub-{entity_value}') + elif rule_entity == 'session': + path_components.append(f'ses-{entity_value}') + else: # pragma: no cover + # BIDS schema only defines subject/session entity directories currently + path_components.append(str(entity_value)) + elif current_level == 'datatype' and 'datatype' in entities: + # Special case: datatype rule has no entity field but we use the datatype value + path_components.append(entities['datatype']) + + # Determine next level based on subdirs and available entities + subdirs = rule_dict.get('subdirs', []) + next_level = None + + # First pass: look for entity-based subdirs across all options + entity_candidates = [] + rule_candidates = [] + + for subdir in subdirs: + if isinstance(subdir, dict) and 'oneOf' in subdir: + for option in subdir['oneOf']: + if option in entities: + entity_candidates.append(option) + elif option in directory_rules: + rule_candidates.append(option) + elif isinstance(subdir, str): + if subdir in entities: + entity_candidates.append(subdir) + elif subdir in directory_rules: + rule_candidates.append(subdir) + + # Prefer entity candidates, fallback to rule candidates + if entity_candidates: + next_level = entity_candidates[0] # Take first entity match + elif rule_candidates: + next_level = rule_candidates[0] # Take first rule match + + current_level = next_level + + return path_components + def get_files(self, scope='all'): """Get BIDSFiles for all layouts in the specified scope. @@ -678,7 +762,7 @@ def get(self, return_type='object', target=None, scope='all', '`layout.get(**filters)`.') # Ensure leading periods if extensions were passed - if 'extension' in filters and 'bids' in self.config: + if 'extension' in filters and ('bids' in self.config or any('bids-schema' in c for c in self.config)): filters['extension'] = ['.' 
+ x.lstrip('.') if isinstance(x, str) else x for x in listify(filters['extension'])] @@ -746,28 +830,37 @@ def get(self, return_type='object', target=None, scope='all', )) results = natural_sort(list(set(results))) elif return_type == 'dir': - template = entities[target].directory - if template is None: - raise ValueError('Return type set to directory, but no ' - 'directory template is defined for the ' - 'target entity (\"%s\").' % target) - # Construct regex search pattern from target directory template - # On Windows, the regex won't compile if, e.g., there is a folder starting with "U" on the path. - # Converting to a POSIX path with forward slashes solves this. - template = self._root.as_posix() + template - to_rep = re.findall(r'{(.*?)\}', template) - for ent in to_rep: - patt = entities[ent].pattern - template = template.replace('{%s}' % ent, patt) - # Avoid matching subfolders. We are working with POSIX paths here, so we explicitly use "/" - # as path separator. - template += r'[^/]*$' - matches = [ - f.dirname - for f in results - if re.search(template, f._dirname.as_posix()) - ] - results = natural_sort(list(set(matches))) + # Check if we're using schema-based config or legacy config + using_schema = any('bids-schema' in str(c) for c in listify(self.config)) + + if using_schema: + # Use schema directory rules for schema-based configs + directories = self._find_directories_for_entity_using_schema(target, results) + results = natural_sort(list(set(directories))) + else: + # Use original template-based approach for legacy configs + template = entities[target].directory + if template is None: + raise ValueError('Return type set to directory, but no ' + 'directory template is defined for the ' + 'target entity (\"%s\").' % target) + # Construct regex search pattern from target directory template + # On Windows, the regex won't compile if, e.g., there is a folder starting with "U" on the path. + # Converting to a POSIX path with forward slashes solves this. + template = self._root.as_posix() + template + to_rep = re.findall(r'{(.*?)\}', template) + for ent in to_rep: + patt = entities[ent].pattern + template = template.replace('{%s}' % ent, patt) + # Avoid matching subfolders. We are working with POSIX paths here, so we explicitly use "/" + # as path separator. + template += r'[^/]*$' + matches = [ + f.dirname + for f in results + if re.search(template, f._dirname.as_posix()) + ] + results = natural_sort(list(set(matches))) else: results = natural_sort(results, 'path') diff --git a/src/bids/layout/models.py b/src/bids/layout/models.py index 104e1f42..49ad1098 100644 --- a/src/bids/layout/models.py +++ b/src/bids/layout/models.py @@ -16,6 +16,10 @@ from sqlalchemy import Column, String, Boolean, ForeignKey, Table from sqlalchemy.orm import reconstructor, relationship, backref, object_session +from bidsschematools import schema as bst_schema +from bidsschematools import rules +import re + try: from sqlalchemy.orm import declarative_base except ImportError: # sqlalchemy < 1.4 @@ -148,7 +152,7 @@ def _init_on_load(self): self.default_path_patterns = json.loads(self._default_path_patterns) @classmethod - def load(self, config, session=None): + def load(cls, config, session=None): """Load a Config instance from the passed configuration data. 
Parameters @@ -160,6 +164,9 @@ def load(self, config, session=None): (e.g., 'bids' or 'derivatives') * A path to a JSON file containing config information * A dictionary containing config information + * 'bids-schema' to load from the official BIDS schema + * dict with 'schema_version' key to load specific BIDS version + (e.g., {'schema_version': '1.9.0'}) session : :obj:`sqlalchemy.orm.session.Session` or None An optional SQLAlchemy Session instance. If passed, the session is used to check the database for (and @@ -170,6 +177,16 @@ def load(self, config, session=None): A Config instance. """ + # Handle schema-based config loading + if config == 'bids-schema': + return cls._from_schema(session=session) + elif isinstance(config, dict) and 'schema_version' in config: + return cls._from_schema( + schema_version=config['schema_version'], + session=session + ) + + # Existing JSON/file-based loading if isinstance(config, (str, Path, UPath)): config_paths = get_option('config_paths') if config in config_paths: @@ -188,6 +205,197 @@ def load(self, config, session=None): return Config(session=session, **config) + @classmethod + def _from_schema(cls, schema_version=None, session=None): + """Load config from BIDS schema. + + Parameters + ---------- + schema_version : str, optional + Specific BIDS specification version to load (e.g., "1.9.0", "1.8.0"). + If None, uses the schema version bundled with bidsschematools. + session : :obj:`sqlalchemy.orm.session.Session` or None + An optional SQLAlchemy Session instance. + + Returns + ------- + Config + A Config instance populated with entities and patterns + extracted from the BIDS schema. + """ + from bidsschematools import rules, schema as bidsschema + from upath import UPath + + # Load schema - either default or specific version + if schema_version is None: + bids_schema = bidsschema.load_schema() + else: + # Load specific schema version from versioned URL + schema_url = UPath(f"https://bids-specification.readthedocs.io/en/v{schema_version}/schema.json") + try: + bids_schema = bidsschema.load_schema(schema_url) + except Exception as e: + raise ValueError( + f"Failed to load BIDS schema version {schema_version}. " + f"Check that the version exists at {schema_url}. 
" + f"Error: {e}" + ) + + # Collect ALL patterns from bidsschematools (keep existing collection logic) + all_patterns = [] + file_sections = { + 'raw': bids_schema.rules.files.raw, + 'deriv': bids_schema.rules.files.deriv, + 'common': bids_schema.rules.files.common + } + + for section_type, sections in file_sections.items(): + if section_type == 'raw': + for datatype_name in sections.keys(): + datatype_rules = getattr(sections, datatype_name) + regex_rules = rules.regexify_filename_rules(datatype_rules, bids_schema, level=1) + all_patterns.extend(regex_rules) + else: + for subsection_name in sections.keys(): + subsection_rules = getattr(sections, subsection_name) + regex_rules = rules.regexify_filename_rules(subsection_rules, bids_schema, level=1) + all_patterns.extend(regex_rules) + + # Store patterns directly - NO PARSING EVER + config_name = f'bids-schema-{bids_schema.bids_version}' + + # Extract entities directly from schema rules + entity_names = cls._extract_entity_names_from_rules(bids_schema) + + # Extract actual values for key entities from schema rules directly + entity_values = cls._extract_entity_values_from_rules(bids_schema, entity_names) + + # Create Entity objects + entities = cls._create_entities_from_schema(entity_names, entity_values, bids_schema) + + config = cls(name=config_name, entities=entities, session=session) + + return config + + @classmethod + def _extract_entity_names_from_rules(cls, bids_schema): + """Extract entity names directly from schema rules""" + + # Get entity names from all rule sections + entity_names = set() + file_sections = { + 'raw': bids_schema.rules.files.raw, + 'deriv': bids_schema.rules.files.deriv, + 'common': bids_schema.rules.files.common + } + + for section_type, sections in file_sections.items(): + for section_name in sections.keys(): + section_rules = getattr(sections, section_name) + for rule_name in section_rules.keys(): + rule = getattr(section_rules, rule_name) + rule_dict = dict(rule) + if 'entities' in rule_dict: + entity_names.update(rule_dict['entities'].keys()) + + # Add special constructs that appear in patterns but aren't "entities" + entity_names.update(['suffix', 'extension', 'datatype']) + + return entity_names + + @classmethod + def _create_entities_from_schema(cls, entity_names, entity_values, bids_schema): + """Create Entity objects directly from schema information.""" + entities = [] + + for entity_name in sorted(entity_names): + # Get entity info from schema if available + entity_spec = bids_schema.objects.entities.get(entity_name, {}) + + # Handle special entities that don't have BIDS prefixes + special_entities = {'suffix', 'extension', 'datatype'} + + if entity_name in special_entities: + # These entities use explicit value lists from schema rules + if entity_name in entity_values and entity_values[entity_name]: + # Filter out problematic values for extension entity + if entity_name == 'extension': + # Remove empty string and wildcard patterns + filtered_values = {v for v in entity_values[entity_name] + if v and v not in ('.*', '**', '*')} + values = '|'.join(sorted(filtered_values)) + else: + values = '|'.join(sorted(entity_values[entity_name])) + + if entity_name == 'extension': + pattern = fr'({values})\Z' # Extensions are at the end + elif entity_name == 'datatype': + pattern = fr'/({values})/' # Datatypes are directory names + else: # suffix + pattern = fr'[_/\\\\]({values})\.?' 
# Before extension, with optional dot + else: + # Regular BIDS entities - use schema format patterns directly + bids_prefix = entity_spec.get('name', entity_name) + format_type = entity_spec.get('format', 'label') + + # Get the official value pattern from schema formats + value_pattern = '[0-9a-zA-Z]+' # Default pattern + if format_type in bids_schema.objects.formats: + format_obj = dict(bids_schema.objects.formats[format_type]) + value_pattern = format_obj.get('pattern', value_pattern) + + # Determine separator based on entity type + if entity_name in ['subject', 'session']: + # Directory-level entities + pattern = fr'[/\\\\]+{bids_prefix}-({value_pattern})' + else: + # File-level entities + pattern = fr'[_/\\\\]+{bids_prefix}-({value_pattern})' + + entity_config = { + 'name': entity_name, + 'pattern': pattern, + 'dtype': 'int' if entity_spec.get('format') == 'index' else 'str' + } + + entities.append(entity_config) + + return entities + + @classmethod + def _extract_entity_values_from_rules(cls, bids_schema, entity_names): + """Extract entity values directly from schema rules""" + + entity_values = {name: set() for name in entity_names} + file_sections = { + 'raw': bids_schema.rules.files.raw, + 'deriv': bids_schema.rules.files.deriv, + 'common': bids_schema.rules.files.common + } + + for section_type, sections in file_sections.items(): + for section_name in sections.keys(): + section_rules = getattr(sections, section_name) + for rule_name in section_rules.keys(): + rule = getattr(section_rules, rule_name) + rule_dict = dict(rule) + + # Get values for special entities directly from rules + if 'suffixes' in rule_dict: + entity_values['suffix'].update(rule_dict['suffixes']) + if 'extensions' in rule_dict: + entity_values['extension'].update(rule_dict['extensions']) + if 'datatypes' in rule_dict: + entity_values['datatype'].update(rule_dict['datatypes']) + + # For regular entities, we don't extract specific values + # They use generic patterns like [0-9a-zA-Z]+ + + return entity_values + + + + def __repr__(self): return f"<Config name='{self.name}'>" diff --git a/src/bids/layout/tests/test_layout.py b/src/bids/layout/tests/test_layout.py index a9d99085..67df7ccb 100644 --- a/src/bids/layout/tests/test_layout.py +++ b/src/bids/layout/tests/test_layout.py @@ -1214,3 +1214,57 @@ def test_intended_for(temporary_dataset, intent): layout = BIDSLayout(temporary_dataset) assert layout.get_IntendedFor(subject='01') + + +def test_get_return_type_dir_with_schema_datatype(layout_7t_trt): + """Test return_type='dir' with schema config for datatype entity (covers line 463).""" + # Create a schema-based layout + layout = BIDSLayout(layout_7t_trt.root, config='bids-schema', derivatives=False) + + # Query for datatype directories - this should trigger the code path + # that handles non-subject/non-session entities (line 463) + dirs = layout.get(target='datatype', return_type='dir') + + # Should return actual datatype directories + assert len(dirs) > 0 + # Check that these are actual directories in the dataset + for d in dirs: + assert os.path.exists(d) + + +def test_get_return_type_dir_with_legacy_config_no_template(): + """Test return_type='dir' with legacy config missing directory template (covers line 842).""" + # Create a minimal config without directory templates + config_dict = { + 'name': 'test_config', + 'entities': [ + { + 'name': 'subject', + 'pattern': '[/\\\\]sub-([a-zA-Z0-9]+)', + 'directory': '/sub-{subject}/' # This has a template + }, + { + 'name': 'task', + 'pattern': '_task-([a-zA-Z0-9]+)', + # No directory template 
- this should trigger the error + } + ] + } + + # Create temporary test dataset + import tempfile + with tempfile.TemporaryDirectory() as tmpdir: + # Create minimal BIDS structure + Path(tmpdir).joinpath('dataset_description.json').write_text('{"Name": "test", "BIDSVersion": "1.6.0"}') + sub_dir = Path(tmpdir) / 'sub-01' + sub_dir.mkdir() + test_file = sub_dir / 'sub-01_task-rest_bold.nii.gz' + test_file.touch() + + # Load with our custom config + layout = BIDSLayout(tmpdir, config=config_dict, validate=False) + + # This should raise ValueError when trying to get directories for 'task' + # because task entity has no directory template + with pytest.raises(ValueError, match='Return type set to directory'): + layout.get(target='task', return_type='dir') diff --git a/src/bids/layout/tests/test_schema_config.py b/src/bids/layout/tests/test_schema_config.py new file mode 100644 index 00000000..42feef50 --- /dev/null +++ b/src/bids/layout/tests/test_schema_config.py @@ -0,0 +1,137 @@ +"""Tests for schema-based Config loading.""" + +import pytest + +from bids.layout.models import Config + + +class TestSchemaConfig: + """Test schema-based configuration loading.""" + + def test_load_bids_schema_basic(self): + """Test loading config from BIDS schema.""" + config = Config.load('bids-schema') + + # Basic checks + assert config.name.startswith('bids-schema-') + assert len(config.entities) > 0 + + # Check that standard entities are present (using full names per PyBIDS convention) + entity_names = {e.name for e in config.entities.values()} + expected_entities = {'subject', 'session', 'task', 'run'} + assert expected_entities.issubset(entity_names) + + # Check that PyBIDS-specific entities are present + pybids_entities = {'extension', 'suffix', 'datatype'} + assert pybids_entities.issubset(entity_names) + + def test_schema_entity_patterns(self): + """Test that entity patterns are correctly generated.""" + config = Config.load('bids-schema') + + # Test subject entity (directory-based) + if 'subject' in config.entities: + subject_entity = config.entities['subject'] + assert subject_entity.regex is not None + # Should match /sub-01/ or similar + match = subject_entity.regex.search('/sub-01/') + assert match is not None + assert match.group(1) == '01' + + # Test task entity (file-based) + if 'task' in config.entities: + task_entity = config.entities['task'] + assert task_entity.regex is not None + # Should match _task-rest_ or similar + match = task_entity.regex.search('_task-rest_') + assert match is not None + assert match.group(1) == 'rest' + + def test_schema_version_tracking(self): + """Test that schema version is tracked in config name.""" + config = Config.load('bids-schema') + + # Config name should include version + assert 'bids-schema-' in config.name + # Should have some version number after the dash + version_part = config.name.split('bids-schema-')[1] + assert len(version_part) > 0 + + def test_schema_version_parameter(self): + """Test loading specific schema version.""" + # Test loading BIDS v1.9.0 schema + config = Config.load({'schema_version': '1.9.0'}) + + assert config.name.startswith('bids-schema-') + assert len(config.entities) > 0 + # Should include the version in the name + assert '1.9.0' in config.name + + def test_invalid_schema_version(self): + """Test that loading invalid schema version raises appropriate error.""" + # Test with a non-existent version + with pytest.raises(ValueError, match="Failed to load BIDS schema version"): + Config.load({'schema_version': '99.99.99'}) + + + + 
def test_entity_generation(self): + """Test entity pattern generation using real schema.""" + config = Config.load('bids-schema') + + # Test that entities were generated + assert len(config.entities) > 0 + + # Test specific entity properties we care about (using full names) + assert 'subject' in config.entities + assert 'task' in config.entities + assert 'run' in config.entities + + # Test that patterns are valid regex + for entity_name, entity in config.entities.items(): + assert entity.regex is not None, f"Entity {entity_name} missing regex" + # Test that regex compiles and works + assert hasattr(entity.regex, 'search'), f"Entity {entity_name} regex invalid" + + # Test specific entity patterns work correctly + subject_entity = config.entities['subject'] + assert 'sub-' in subject_entity.pattern + + # Test subject pattern matches expected format + match = subject_entity.regex.search('/sub-01/') + assert match is not None + assert match.group(1) == '01' + + # Test task pattern matches expected format + task_entity = config.entities['task'] + match = task_entity.regex.search('_task-rest_') + assert match is not None + assert match.group(1) == 'rest' + + # Test run pattern matches expected format + run_entity = config.entities['run'] + match = run_entity.regex.search('_run-02_') + assert match is not None + assert match.group(1) == '02' + + def test_schema_directory_query(self): + """Test directory queries with schema-based config.""" + from bids import BIDSLayout + from pathlib import Path + import os + + # Use the 7t_trt test dataset + data_dir = Path(__file__).parent.parent.parent.parent.parent / 'tests' / 'data' / '7t_trt' + if not data_dir.exists(): + pytest.skip("Test dataset not available") + + # Create layout with schema config + layout = BIDSLayout(data_dir, config='bids-schema', derivatives=False) + + # Test directory query for subjects + subject_dirs = layout.get(target='subject', return_type='dir') + assert len(subject_dirs) > 0 + assert all(os.path.exists(d) for d in subject_dirs) + # Should include subject directories + assert any('sub-' in os.path.basename(d) for d in subject_dirs) + diff --git a/src/bids/layout/tests/test_schema_pattern_validation.py b/src/bids/layout/tests/test_schema_pattern_validation.py new file mode 100644 index 00000000..594825ea --- /dev/null +++ b/src/bids/layout/tests/test_schema_pattern_validation.py @@ -0,0 +1,187 @@ +"""Tests to verify schema patterns match actual schema content.""" + +from bids.layout.models import Config +from bidsschematools import schema as bst_schema + + +class TestSchemaAccuracy: + """Test that patterns accurately reflect BIDS schema content.""" + + def test_extension_patterns_match_schema(self): + """Test that extension patterns match schema extensions.""" + config = Config.load('bids-schema') + + # Load schema directly + bids_schema = bst_schema.load_schema() + + # Get extension entity + extension_entity = config.entities['extension'] + + # Test that schema extensions are matched (excluding wildcards) + schema_extensions = [ext['value'] for ext in bids_schema.objects.extensions.values() + if ext.get('value') and ext['value'].startswith('.') and ext['value'] not in ('.*',)] + + # Collect all failures + matched_extensions = [] + unmatched_extensions = [] + + test_files = [f'test{ext}' for ext in schema_extensions] + for ext, test_file in zip(schema_extensions, test_files): + match = extension_entity.regex.search(test_file) + if match is not None: + matched_extensions.append(ext) + else: + unmatched_extensions.append(ext) + + 
# Report all differences at once + coverage = len(matched_extensions) / len(schema_extensions) * 100 if schema_extensions else 0 + + error_msg = [] + error_msg.append(f"Extension coverage: {coverage:.1f}% ({len(matched_extensions)}/{len(schema_extensions)})") + if unmatched_extensions: + error_msg.append(f"Unmatched extensions ({len(unmatched_extensions)}): {unmatched_extensions[:20]}") + if len(unmatched_extensions) > 20: + error_msg.append(f"... and {len(unmatched_extensions) - 20} more") + if matched_extensions: + error_msg.append(f"Matched extensions ({len(matched_extensions)}): {matched_extensions[:10]}") + if len(matched_extensions) > 10: + error_msg.append(f"... and {len(matched_extensions) - 10} more") + + assert len(unmatched_extensions) == 0, "; ".join(error_msg) + + def test_suffix_patterns_match_schema(self): + """Test that suffix patterns match schema suffixes.""" + config = Config.load('bids-schema') + + # Load schema directly + bids_schema = bst_schema.load_schema() + + # Get suffix entity + suffix_entity = config.entities['suffix'] + + # Test that schema suffixes are matched using actual suffix values, not keys + schema_suffix_values = [obj['value'] for obj in bids_schema.objects.suffixes.values() + if obj.get('value')] + + # Collect all failures + matched_suffixes = [] + unmatched_suffixes = [] + + # Test all suffixes using their actual values + test_files = [f'_{suffix}.nii.gz' for suffix in schema_suffix_values] + for suffix, test_file in zip(schema_suffix_values, test_files): + match = suffix_entity.regex.search(test_file) + if match is not None: + matched_suffixes.append(suffix) + else: + unmatched_suffixes.append(suffix) + + # Report all differences at once + coverage = len(matched_suffixes) / len(schema_suffix_values) * 100 if schema_suffix_values else 0 + + error_msg = [] + error_msg.append(f"Suffix coverage: {coverage:.1f}% ({len(matched_suffixes)}/{len(schema_suffix_values)})") + if unmatched_suffixes: + error_msg.append(f"Unmatched suffixes ({len(unmatched_suffixes)}): {unmatched_suffixes[:20]}") + if len(unmatched_suffixes) > 20: + error_msg.append(f"... and {len(unmatched_suffixes) - 20} more") + if matched_suffixes: + error_msg.append(f"Matched suffixes ({len(matched_suffixes)}): {matched_suffixes[:10]}") + if len(matched_suffixes) > 10: + error_msg.append(f"... 
and {len(matched_suffixes) - 10} more") + + assert len(unmatched_suffixes) == 0, "; ".join(error_msg) + + def test_datatype_patterns_match_schema(self): + """Test that datatype patterns match schema datatypes.""" + config = Config.load('bids-schema') + + # Load schema directly + bids_schema = bst_schema.load_schema() + + # Get datatype entity + datatype_entity = config.entities['datatype'] + + # Test that schema datatypes are matched + schema_datatypes = list(bids_schema.objects.datatypes.keys()) + + # Collect all failures + matched_datatypes = [] + unmatched_datatypes = [] + + test_paths = [f'/{dtype}/' for dtype in schema_datatypes] + for dtype, test_path in zip(schema_datatypes, test_paths): + match = datatype_entity.regex.search(test_path) + if match is not None: + matched_datatypes.append(dtype) + else: + unmatched_datatypes.append(dtype) + + # Report all differences at once + coverage = len(matched_datatypes) / len(schema_datatypes) * 100 if schema_datatypes else 0 + + error_msg = [] + error_msg.append(f"Datatype coverage: {coverage:.1f}% ({len(matched_datatypes)}/{len(schema_datatypes)})") + if unmatched_datatypes: + error_msg.append(f"Unmatched datatypes ({len(unmatched_datatypes)}): {unmatched_datatypes}") + if matched_datatypes: + error_msg.append(f"Matched datatypes ({len(matched_datatypes)}): {matched_datatypes}") + + assert len(unmatched_datatypes) == 0, "; ".join(error_msg) + + def test_entity_format_patterns_match_schema(self): + """Test that entity patterns use actual schema format patterns.""" + config = Config.load('bids-schema') + + # Load schema directly + bids_schema = bst_schema.load_schema() + + # Test a few key entities + test_entities = [ + ('subject', 'label'), + ('run', 'index'), + ('echo', 'index'), + ] + + for entity_name, expected_format in test_entities: + if entity_name in config.entities: + entity = config.entities[entity_name] + + # Get expected pattern from schema + format_obj = bids_schema.objects.formats[expected_format] + expected_pattern = format_obj['pattern'] + + # Check that our entity pattern contains the schema pattern + assert expected_pattern in entity.pattern, \ + f"Entity {entity_name} pattern doesn't contain schema format pattern" + + def test_no_hardcoded_values(self): + """Test that we don't have hardcoded schema values anymore.""" + config = Config.load('bids-schema') + + # Load schema directly to get actual counts + bids_schema = bst_schema.load_schema() + + # We should have more entities than before because we're not missing any + assert len(config.entities) >= len(bids_schema.objects.entities) + + # Extension pattern should include schema extensions + extension_entity = config.entities['extension'] + schema_extension_count = len([ext for ext in bids_schema.objects.extensions.values() + if ext.get('value') and ext['value'].startswith('.')]) + + # Pattern should include many extensions (not just a generic catch-all) + assert len(extension_entity.pattern) > 100, "Extension pattern seems too simple" + + # Suffix pattern should include schema suffixes + suffix_entity = config.entities['suffix'] + schema_suffix_count = len(bids_schema.objects.suffixes) + + # Pattern should include many suffixes + assert len(suffix_entity.pattern) > 200, "Suffix pattern seems too simple" + + print(f"Schema extensions used: {schema_extension_count}") + print(f"Schema suffixes used: {schema_suffix_count}") + print(f"Schema datatypes used: {len(bids_schema.objects.datatypes)}") + print(f"Schema entities used: {len(bids_schema.objects.entities)}") + diff --git 
a/src/bids/layout/tests/test_schema_version_differences.py b/src/bids/layout/tests/test_schema_version_differences.py new file mode 100644 index 00000000..10b9b239 --- /dev/null +++ b/src/bids/layout/tests/test_schema_version_differences.py @@ -0,0 +1,92 @@ +"""Tests for differences between BIDS schema versions.""" + +import pytest +import tempfile +import json +import os +from pathlib import Path + +from bids import BIDSLayout +from bids.layout.models import Config + + +class TestSchemaVersionDifferences: + """Test that different schema versions handle different features correctly.""" + + def test_entity_parsing_version_differences(self): + """Test that tracksys entity (added in v1.9.0) works in newer schemas but fails in v1.8.0.""" + + with tempfile.TemporaryDirectory() as temp_dir: + dataset_dir = Path(temp_dir) / "test_dataset" + dataset_dir.mkdir() + + # Create minimal dataset with motion file using tracksys entity + with open(dataset_dir / "dataset_description.json", "w") as f: + json.dump({"Name": "Test", "BIDSVersion": "1.10.1", "Authors": ["Test"]}, f) + + motion_dir = dataset_dir / "sub-01" / "motion" + motion_dir.mkdir(parents=True) + (motion_dir / "sub-01_tracksys-imu_motion.tsv").touch() + + # Test tracksys query across schema versions + def test_tracksys_query(config): + layout = BIDSLayout(dataset_dir, config=[config]) + try: + return len(layout.get(tracksys='imu', return_type='filename')) > 0 + except Exception: + return False + + # tracksys entity was added in v1.9.0 for motion datatype + assert test_tracksys_query('bids-schema'), "Current schema should support tracksys" + assert test_tracksys_query({'schema_version': '1.9.0'}), "v1.9.0 should support tracksys (motion added)" + assert not test_tracksys_query({'schema_version': '1.8.0'}), "v1.8.0 should NOT support tracksys" + + + def test_motion_datatype_evolution(self): + """Test that motion datatype (BEP029) support was added in v1.9.0.""" + + # Motion datatype was added in v1.9.0 according to changelog: + # v1.9.0: [ENH] Extend BIDS for Motion data (BEP029) #981 + + config_v190 = Config.load({'schema_version': '1.9.0'}) + config_v180 = Config.load({'schema_version': '1.8.0'}) + + entities_v190 = {e.name for e in config_v190.entities.values()} + entities_v180 = {e.name for e in config_v180.entities.values()} + + # Motion-specific entity 'tracksys' should be in v1.9.0+ but not v1.8.0 + assert 'tracksys' in entities_v190, "tracksys entity should exist in v1.9.0 (motion datatype was added)" + assert 'tracksys' not in entities_v180, "tracksys entity should NOT exist in v1.8.0 (before motion datatype)" + + print(f"✓ Motion datatype evolution verified:") + print(f" v1.8.0 (before motion): tracksys = {'tracksys' in entities_v180}") + print(f" v1.9.0 (motion added): tracksys = {'tracksys' in entities_v190}") + + # Test specific motion-related entities that were added + motion_entities = {'tracksys'} # Could expand this list as more motion entities are identified + + for entity in motion_entities: + assert entity in entities_v190, f"Motion entity '{entity}' should exist in v1.9.0+" + assert entity not in entities_v180, f"Motion entity '{entity}' should NOT exist in v1.8.0" + + def test_schema_version_metadata_differences(self): + """Test that schema versions have different BIDS version numbers.""" + + # Load different schema versions + config_current = Config.load('bids-schema') + config_v190 = Config.load({'schema_version': '1.9.0'}) + config_v180 = Config.load({'schema_version': '1.8.0'}) + + # Check that config names reflect 
the versions + assert 'bids-schema-' in config_current.name + assert '1.9.0' in config_v190.name + assert '1.8.0' in config_v180.name + + print(f"Current config: {config_current.name}") + print(f"v1.9.0 config: {config_v190.name}") + print(f"v1.8.0 config: {config_v180.name}") + + # Verify they're different configurations + assert config_current.name != config_v190.name + assert config_v190.name != config_v180.name + diff --git a/src/bids/layout/tests/test_schema_vs_json_config.py b/src/bids/layout/tests/test_schema_vs_json_config.py new file mode 100644 index 00000000..507ea47c --- /dev/null +++ b/src/bids/layout/tests/test_schema_vs_json_config.py @@ -0,0 +1,309 @@ +"""Tests to verify schema-driven config matches old JSON config behavior.""" + +import pytest +import re +from bids.layout.models import Config + + +class TestConfigCompatibility: + """Test that new schema-driven config matches old JSON config behavior.""" + + def test_core_entities_present_in_both_configs(self): + """Test that essential BIDS entities are present in both configs.""" + old_config = Config.load('bids') + new_config = Config.load('bids-schema') + + # Test that both configs support the fundamental BIDS entities + core_entities = ['subject', 'session', 'task', 'run', 'acquisition'] + + # Check core entities exist (allowing for naming differences) + old_has_core = all(entity in old_config.entities for entity in core_entities) + new_has_core = all(entity in new_config.entities for entity in core_entities) + + assert old_has_core and new_has_core, "Both configs should support core BIDS entities" + + def test_schema_entities_are_superset_of_valid_old_entities(self): + """Test that schema includes all valid entities from old config.""" + old_config = Config.load('bids') + new_config = Config.load('bids-schema') + + # Map old config entities to their schema equivalents + entity_mapping = { + 'inv': 'inversion', + 'mt': 'mtransfer', + 'proc': 'processing', + # 'fmap', 'scans', 'staining' are invalid - exclude them + } + + valid_old_entities = set(old_config.entities.keys()) - {'fmap', 'scans', 'staining'} + missing_from_new = [] + + for old_entity in valid_old_entities: + expected_new_entity = entity_mapping.get(old_entity, old_entity) + if expected_new_entity not in new_config.entities: + missing_from_new.append(f"{old_entity} -> {expected_new_entity}") + + assert len(missing_from_new) == 0, f"Schema missing valid entities: {missing_from_new}" + + def test_new_suffixes_compatible_with_old_pattern(self): + """Test that all new schema suffixes would be accepted by old generic pattern.""" + old_config = Config.load('bids') + new_config = Config.load('bids-schema') + + # Get old pattern (generic) and new suffixes (explicit) + old_pattern = old_config.entities['suffix'].regex + new_suffixes = self._extract_suffixes_from_pattern(new_config.entities['suffix'].pattern) + + # Test that old pattern would accept all new suffixes + incompatible_suffixes = [] + for suffix in new_suffixes: + test_filename = f"sub-01_task-test_{suffix}.nii.gz" + if not old_pattern.search(test_filename): + incompatible_suffixes.append(suffix) + + assert len(incompatible_suffixes) == 0, f"Old pattern would reject these valid schema suffixes: {incompatible_suffixes}" + + def test_new_extensions_compatible_with_old_pattern(self): + """Test that all new schema extensions would be accepted by old generic pattern.""" + old_config = Config.load('bids') + new_config = Config.load('bids-schema') + + # Get old pattern (generic) and new extensions (explicit) + 
old_pattern = old_config.entities['extension'].regex + new_extensions = self._extract_extensions_from_pattern(new_config.entities['extension'].pattern) + + # Test that old pattern would accept all new extensions + incompatible_extensions = [] + directory_extensions = [] + + for extension in new_extensions: + test_filename = f"sub-01_task-test_bold{extension}" + if not old_pattern.search(test_filename): + if extension.endswith('/'): + # Directory-style extensions are a new BIDS feature not supported by old pattern + directory_extensions.append(extension) + else: + incompatible_extensions.append(extension) + + # Core file extensions should be compatible + assert len(incompatible_extensions) == 0, f"Old pattern would reject these valid schema file extensions: {incompatible_extensions}" + + # Report directory extensions as expected difference + if directory_extensions: + print(f"Note: {len(directory_extensions)} directory-style extensions are new BIDS features: {directory_extensions}") + + # Core extensions should be present + core_extensions = {'.nii.gz', '.nii', '.tsv', '.json'} + assert core_extensions.issubset(new_extensions), f"Missing core extensions from new config" + + def test_datatype_coverage_matches(self): + """Test that datatype patterns cover the same datatypes.""" + old_config = Config.load('bids') + new_config = Config.load('bids-schema') + + # Extract datatypes from patterns + old_datatypes = self._extract_datatypes_from_pattern(old_config.entities['datatype'].pattern) + new_datatypes = self._extract_datatypes_from_pattern(new_config.entities['datatype'].pattern) + + missing = old_datatypes - new_datatypes + extra = new_datatypes - old_datatypes + + error_msg = [] + if missing: + error_msg.append(f"Missing datatypes: {sorted(missing)}") + if extra: + error_msg.append(f"Extra datatypes: {sorted(extra)}") + + # Core datatypes should be preserved + core_datatypes = {'anat', 'func', 'dwi', 'fmap'} + assert core_datatypes.issubset(new_datatypes), f"Missing core datatypes; {'; '.join(error_msg)}" + + # New should have at least as many as old (schema grows) + assert len(new_datatypes) >= len(old_datatypes), f"Fewer datatypes than old config; {'; '.join(error_msg)}" + + def test_functional_filename_parsing_compatibility(self): + """Test that both configs can parse BIDS filenames and extract core entities.""" + old_config = Config.load('bids') + new_config = Config.load('bids-schema') + + # Test core entity extraction (skip extension due to pattern differences) + test_cases = [ + { + 'filename': '/dataset/sub-01/func/sub-01_task-rest_bold.nii.gz', + 'core_entities': {'subject': '01', 'task': 'rest', 'suffix': 'bold'} + }, + { + 'filename': '/dataset/sub-02/ses-pre/anat/sub-02_ses-pre_T1w.nii.gz', + 'core_entities': {'subject': '02', 'session': 'pre', 'suffix': 'T1w'} + } + ] + + for test_case in test_cases: + filename = test_case['filename'] + expected = test_case['core_entities'] + + old_entities = self._parse_filename_entities(filename, old_config) + new_entities = self._parse_filename_entities(filename, new_config) + + # Test that both configs extract the core BIDS entities + for key, expected_value in expected.items(): + old_extracted = old_entities.get(key) + new_extracted = new_entities.get(key) + + # Both should extract the same core information + assert old_extracted == expected_value, f"Old config failed to extract {key}={expected_value} from {filename}, got {old_extracted}" + assert new_extracted == expected_value, f"New config failed to extract {key}={expected_value} from 
{filename}, got {new_extracted}" + + # Test that new config has valid patterns + assert len(new_config.entities) > 20, "New config should have many entities" + assert 'subject' in new_config.entities, "New config missing subject entity" + assert 'suffix' in new_config.entities, "New config missing suffix entity" + + def test_new_config_parses_schema_compliant_filenames(self): + """Test that new config correctly parses filenames with schema entities.""" + new_config = Config.load('bids-schema') + + # Test files that use entities only available in schema (not old config) + schema_specific_cases = [ + '/dataset/sub-01/anat/sub-01_hemi-L_T1w.nii.gz', # hemisphere entity + '/dataset/sub-01/anat/sub-01_desc-preproc_T1w.nii.gz', # description entity + '/dataset/sub-01/anat/sub-01_res-native_T1w.nii.gz', # resolution entity + ] + + for filename in schema_specific_cases: + entities = self._parse_filename_entities(filename, new_config) + + # Should extract subject and suffix at minimum + assert 'subject' in entities, f"Failed to extract subject from {filename}" + assert 'suffix' in entities, f"Failed to extract suffix from {filename}" + + # Should extract the schema-specific entity + if 'hemi-' in filename: + assert 'hemisphere' in entities, f"Failed to extract hemisphere from {filename}" + if 'desc-' in filename: + assert 'description' in entities, f"Failed to extract description from {filename}" + if 'res-' in filename: + assert 'resolution' in entities, f"Failed to extract resolution from {filename}" + + def test_pattern_structure_similarity(self): + """Test that pattern structures are similar between old and new.""" + old_config = Config.load('bids') + new_config = Config.load('bids-schema') + + # Test key entities have reasonable pattern structures + key_entities = ['subject', 'session', 'task', 'run', 'suffix', 'extension'] + + for entity_name in key_entities: + if entity_name in old_config.entities and entity_name in new_config.entities: + old_pattern = old_config.entities[entity_name].pattern + new_pattern = new_config.entities[entity_name].pattern + + # Both should be non-empty + assert len(old_pattern) > 0, f"Old {entity_name} pattern is empty" + assert len(new_pattern) > 0, f"New {entity_name} pattern is empty" + + # Both should contain capturing groups + old_groups = len(re.findall(r'\([^)]*\)', old_pattern)) + new_groups = len(re.findall(r'\([^)]*\)', new_pattern)) + + assert old_groups > 0, f"Old {entity_name} pattern has no groups" + assert new_groups > 0, f"New {entity_name} pattern has no groups" + + def test_config_names_and_metadata(self): + """Test config metadata is reasonable.""" + old_config = Config.load('bids') + new_config = Config.load('bids-schema') + + # Old config should have simple name + assert old_config.name == 'bids' + + # New config should include version info + assert new_config.name.startswith('bids-schema-') + assert len(new_config.name) > len('bids-schema-') + + # Both should have reasonable entity counts + assert len(old_config.entities) > 10, "Old config has too few entities" + assert len(new_config.entities) > 10, "New config has too few entities" + + # Report entity count ratio for analysis + ratio = len(new_config.entities) / len(old_config.entities) + print(f"Entity count ratio (new/old): {ratio:.2f}") + + # Helper methods + def _extract_suffixes_from_pattern(self, pattern): + """Extract suffixes from a pattern like [_/\\]+(suffix1|suffix2|...)""" + suffixes = set() + + # Look for alternation groups + matches = re.findall(r'\(([^)]+)\)', pattern) + for 
match in matches: + if '|' in match: + suffixes.update(match.split('|')) + else: + suffixes.add(match) + + return suffixes + + def _extract_extensions_from_pattern(self, pattern): + """Extract extensions from a pattern.""" + extensions = set() + + # Look for patterns with dots + matches = re.findall(r'\\?\.\w+(?:\\?\.\w+)*', pattern) + for match in matches: + # Clean up regex escaping + clean_ext = match.replace('\\', '') + extensions.add(clean_ext) + + # Also look for alternation groups + alt_matches = re.findall(r'\(([^)]+)\)', pattern) + for match in alt_matches: + if '|' in match and '.' in match: + for item in match.split('|'): + if '.' in item: + clean_item = item.replace('\\', '') + extensions.add(clean_item) + + return extensions + + def _extract_datatypes_from_pattern(self, pattern): + """Extract datatypes from a pattern like [/\\]+(datatype1|datatype2|...)""" + datatypes = set() + + # Look for alternation groups + matches = re.findall(r'\(([^)]+)\)', pattern) + for match in matches: + if '|' in match: + datatypes.update(match.split('|')) + else: + datatypes.add(match) + + return datatypes + + def _parse_filename_entities(self, filename, config): + """Parse entities from filename using config patterns.""" + entities = {} + + for entity_name, entity in config.entities.items(): + if hasattr(entity, 'regex') and entity.regex: + match = entity.regex.search(filename) + if match and len(match.groups()) > 0: + entities[entity_name] = match.group(1) + + return entities + + def _map_entity_name_old_to_new(self, new_entity_name): + """Map new entity names to old entity names for compatibility testing.""" + # Old config sometimes used different names + name_mapping = { + 'subject': 'subject', + 'session': 'session', + 'task': 'task', + 'run': 'run', + 'acquisition': 'acquisition', + 'suffix': 'suffix', + 'extension': 'extension', + # Add other mappings as needed + } + return name_mapping.get(new_entity_name, new_entity_name) + diff --git a/uv.lock b/uv.lock index f70d36a1..2f8fb164 100644 --- a/uv.lock +++ b/uv.lock @@ -2152,6 +2152,7 @@ name = "pybids" source = { editable = "." } dependencies = [ { name = "bids-validator" }, + { name = "bidsschematools" }, { name = "click" }, { name = "formulaic" }, { name = "frozendict" }, @@ -2246,6 +2247,7 @@ requires-dist = [ { name = "altair", marker = "extra == 'model-reports'" }, { name = "altair", marker = "extra == 'test'", specifier = ">=5" }, { name = "bids-validator", specifier = ">=1.14.7" }, + { name = "bidsschematools", specifier = ">=1.1.0" }, { name = "bsmschema", marker = "extra == 'test'", specifier = ">=0.1" }, { name = "click", specifier = ">=8.0" }, { name = "coverage", extras = ["toml"], marker = "extra == 'test'", specifier = ">=7" },
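
For quick reference, here is a minimal sketch of the API surface this patch introduces, pieced together from the diff above. It assumes the 7t_trt test dataset bundled with PyBIDS (as in the demo notebook); the exact version suffix in the config name depends on the schema shipped with the installed bidsschematools.

    from bids import BIDSLayout
    from bids.layout.models import Config
    from bids.tests import get_test_data_path
    import os

    # Load a Config built from the BIDS schema bundled with bidsschematools.
    config = Config.load('bids-schema')
    print(config.name)  # 'bids-schema-<version>'; the suffix tracks bids_schema.bids_version

    # Or pin a specific specification version (fetched over the network by _from_schema).
    config_190 = Config.load({'schema_version': '1.9.0'})

    # Index a dataset with the schema-based config instead of the legacy 'bids' JSON config.
    data_path = os.path.join(get_test_data_path(), '7t_trt')
    layout = BIDSLayout(data_path, config=['bids-schema'])
    bold = layout.get(subject='01', suffix='bold', extension='nii.gz', return_type='filename')

    # Directory queries route through the schema's directory rules when a schema config is active.
    subject_dirs = layout.get(target='subject', return_type='dir')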