diff --git a/.github/workflows/1-fetch-openneuro-datasets-nemar.yml b/.github/workflows/1-fetch-openneuro-datasets-nemar.yml new file mode 100644 index 00000000..9dcf25b8 --- /dev/null +++ b/.github/workflows/1-fetch-openneuro-datasets-nemar.yml @@ -0,0 +1,130 @@ +name: Fetch OpenNeuro & NEMAR Datasets + +on: + pull_request: + branches: + - '**' + # schedule: + # # Run weekly on Monday at 00:00 UTC + # - cron: '0 0 * * 1' + workflow_dispatch: # Allow manual triggering + +jobs: + fetch-datasets: + runs-on: ubuntu-latest + permissions: + contents: write + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + token: ${{ secrets.GITHUB_TOKEN }} + ref: ${{ github.head_ref }} + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + cache: 'pip' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install gql[requests] requests + pip install -e . + + - name: Fetch OpenNeuro datasets + run: | + python scripts/ingestions/1_fetch_openneuro_datasets.py \ + --page-size 100 \ + --output consolidated/openneuro_datasets.json + + - name: Fetch NEMAR GitHub repositories + run: | + python scripts/ingestions/1_fetch_github_organization.py \ + --organization nemardatasets \ + --output consolidated/nemardatasets_repos.json + + - name: Verify OpenNeuro output + run: | + if [ -f consolidated/openneuro_datasets.json ]; then + echo "βœ“ OpenNeuro dataset file created successfully" + python -c "import json; data = json.load(open('consolidated/openneuro_datasets.json')); print(f'Total entries: {len(data)}'); modalities = set(d['modality'] for d in data); print(f'Modalities: {sorted(modalities)}')" + else + echo "βœ— OpenNeuro dataset file not created" + exit 1 + fi + + - name: Verify NEMAR output + run: | + if [ -f consolidated/nemardatasets_repos.json ]; then + echo "βœ“ NEMAR repositories file created successfully" + python -c "import json; data = json.load(open('consolidated/nemardatasets_repos.json')); print(f'Total repositories: {len(data)}'); topics = set(); [topics.update(d.get('topics', [])) for d in data]; print(f'Topics: {sorted(topics) if topics else \"None\"}')" + else + echo "βœ— NEMAR repositories file not created" + exit 1 + fi + + - name: Filter new OpenNeuro datasets + run: | + python scripts/ingestions/2_filter_new_datasets.py \ + consolidated/openneuro_datasets.json + + - name: Filter new NEMAR datasets + run: | + python scripts/ingestions/2_filter_new_datasets.py \ + consolidated/nemardatasets_repos.json + + - name: Verify filtered outputs + run: | + echo "πŸ“Š Filtering Results:" + echo "" + if [ -f consolidated/to_digest_openneuro_datasets.json ]; then + echo "βœ“ OpenNeuro filtered datasets created" + python -c "import json; data = json.load(open('consolidated/to_digest_openneuro_datasets.json')); print(f' Datasets to digest: {len(data)}')" + else + echo "βœ— OpenNeuro filtered datasets not created" + exit 1 + fi + echo "" + if [ -f consolidated/to_digest_nemardatasets_repos.json ]; then + echo "βœ“ NEMAR filtered datasets created" + python -c "import json; data = json.load(open('consolidated/to_digest_nemardatasets_repos.json')); print(f' Datasets to digest: {len(data)}')" + else + echo "βœ— NEMAR filtered datasets not created" + exit 1 + fi + + - name: Commit and push changes if datasets updated + run: | + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + + # Add all dataset files to staging + git add 
consolidated/openneuro_datasets.json + git add consolidated/nemardatasets_repos.json + git add consolidated/to_digest_openneuro_datasets.json + git add consolidated/to_digest_nemardatasets_repos.json + + # Check if there are actual changes (not just timestamp differences) + if git diff --cached --quiet; then + echo "No changes detected in dataset files, skipping commit" + else + echo "Changes detected, committing..." + git commit -m "chore: update OpenNeuro & NEMAR dataset listings and filtered to_digest files" + git push origin HEAD:${{ github.head_ref }} + echo "βœ“ Changes committed and pushed" + fi + + - name: Upload artifacts for downstream jobs + uses: actions/upload-artifact@v4 + with: + name: dataset-listings + path: | + consolidated/openneuro_datasets.json + consolidated/nemardatasets_repos.json + consolidated/to_digest_openneuro_datasets.json + consolidated/to_digest_nemardatasets_repos.json + retention-days: 7 diff --git a/.github/workflows/clone-openneuro-datasets.yml b/.github/workflows/clone-openneuro-datasets.yml new file mode 100644 index 00000000..a3a97800 --- /dev/null +++ b/.github/workflows/clone-openneuro-datasets.yml @@ -0,0 +1,98 @@ +name: Clone OpenNeuro Datasets + +on: + schedule: + # Run weekly on Monday at 02:00 UTC (after fetch completes) + - cron: '0 2 * * 1' + workflow_dispatch: # Allow manual triggering + # TODO: Add other triggers here as needed + +jobs: + clone-datasets: + runs-on: ubuntu-latest + timeout-minutes: 720 # 12 hours max for all clones + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Verify Python script and dataset listings + run: | + if [ ! -f scripts/ingestions/clone_openneuro_datasets.py ]; then + echo "Error: clone_openneuro_datasets.py not found" + exit 1 + fi + if [ ! 
-f consolidated/openneuro_datasets.json ]; then + echo "Error: consolidated/openneuro_datasets.json not found" + exit 1 + fi + DATASET_COUNT=$(jq 'length' consolidated/openneuro_datasets.json) + echo "Found $DATASET_COUNT dataset entries" + + - name: Create test_diggestion directory + run: mkdir -p test_diggestion + + - name: Clone OpenNeuro datasets + run: | + python scripts/ingestions/clone_openneuro_datasets.py \ + --output-dir test_diggestion \ + --timeout 300 \ + --datasets-file consolidated/openneuro_datasets.json + continue-on-error: true # Don't fail workflow if some clones fail + + - name: Generate clone report + if: always() + run: | + if [ -f test_diggestion/clone_results.json ]; then + echo "## Clone Results" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + jq -r '"- Success: \(.success | length)\n- Failed: \(.failed | length)\n- Timeout: \(.timeout | length)\n- Skipped: \(.skip | length)\n- Errors: \(.error | length)"' test_diggestion/clone_results.json >> $GITHUB_STEP_SUMMARY + fi + + - name: Upload clone results + if: always() + uses: actions/upload-artifact@v4 + with: + name: clone-results + path: | + test_diggestion/clone_results.json + test_diggestion/retry.json + retention-days: 30 + + - name: Create issue if clones failed + if: failure() + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + if (fs.existsSync('test_diggestion/clone_results.json')) { + const results = JSON.parse(fs.readFileSync('test_diggestion/clone_results.json')); + const failedCount = (results.failed || []).length + (results.timeout || []).length; + if (failedCount > 0) { + github.rest.issues.create({ + owner: context.repo.owner, + repo: context.repo.repo, + title: `⚠️ Dataset Cloning: ${failedCount} datasets failed`, + body: `Failed/timeout clones detected.\n\nSee artifacts for details: ${context.runId}`, + labels: ['ci', 'datasets'] + }); + } + } + + - name: Commit cloned datasets (optional) + if: success() + run: | + cd test_diggestion + git config --local user.email "action@github.com" + git config --local user.name "GitHub Action" + git add . + git commit -m "chore: update cloned OpenNeuro datasets" || echo "Nothing to commit" + git push + continue-on-error: true diff --git a/.gitignore b/.gitignore index 59819db3..ab7b092a 100644 --- a/.gitignore +++ b/.gitignore @@ -39,7 +39,6 @@ examples/data .DS_Store data/ -*.json *.isorted *.py.isorted diff --git a/eegdash/api.py b/eegdash/api.py index d867d267..194a0a5e 100644 --- a/eegdash/api.py +++ b/eegdash/api.py @@ -10,13 +10,15 @@ EEG data from S3 for matched records. """ +import json import os from pathlib import Path from typing import Any, Mapping import mne +import numpy as np +import pandas as pd from mne.utils import _soft_import -from pymongo import InsertOne, UpdateOne from .bids_eeg_metadata import ( build_query_from_kwargs, @@ -353,9 +355,18 @@ def _raise_if_conflicting_constraints( ) def add_bids_dataset( - self, dataset: str, data_dir: str, overwrite: bool = True - ) -> None: - """Scan a local BIDS dataset and upsert records into MongoDB. + self, + dataset: str, + data_dir: str, + overwrite: bool = True, + output_path: str | Path | None = None, + ) -> dict[str, Any]: + """Collect metadata for a local BIDS dataset as JSON-ready records. + + Instead of inserting records directly into MongoDB, this method scans + ``data_dir`` and returns a JSON-serializable manifest describing every + EEG recording that was discovered. 
The manifest can be written to disk + or forwarded to the EEGDash ingestion API for persistence. Parameters ---------- @@ -364,127 +375,91 @@ def add_bids_dataset( data_dir : str Path to the local BIDS dataset directory. overwrite : bool, default True - If ``True``, update existing records when encountered; otherwise, - skip records that already exist. + If ``False``, skip records that already exist in the database based + on ``data_name`` lookups. + output_path : str | Path | None, optional + If provided, the manifest is written to the given JSON file. - Raises - ------ - ValueError - If called on a public client ``(is_public=True)``. + Returns + ------- + dict + A manifest with keys ``dataset``, ``source``, ``records`` and, when + applicable, ``skipped`` or ``errors``. """ - if self.is_public: - raise ValueError("This operation is not allowed for public users") - - if not overwrite and self.exist({"dataset": dataset}): - logger.info("Dataset %s already exists in the database", dataset) - return + source_dir = Path(data_dir).expanduser() try: bids_dataset = EEGBIDSDataset( - data_dir=data_dir, + data_dir=str(source_dir), dataset=dataset, ) - except Exception as e: - logger.error("Error creating bids dataset %s: %s", dataset, str(e)) - raise e - requests = [] - for bids_file in bids_dataset.get_files(): - try: - data_id = f"{dataset}_{Path(bids_file).name}" - - if self.exist({"data_name": data_id}): - if overwrite: - eeg_attrs = load_eeg_attrs_from_bids_file( - bids_dataset, bids_file - ) - requests.append(self._update_request(eeg_attrs)) - else: - eeg_attrs = load_eeg_attrs_from_bids_file(bids_dataset, bids_file) - requests.append(self._add_request(eeg_attrs)) - except Exception as e: - logger.error("Error adding record %s", bids_file) - logger.error(str(e)) - - logger.info("Number of requests: %s", len(requests)) - - if requests: - result = self.__collection.bulk_write(requests, ordered=False) - logger.info("Inserted: %s ", result.inserted_count) - logger.info("Modified: %s ", result.modified_count) - logger.info("Deleted: %s", result.deleted_count) - logger.info("Upserted: %s", result.upserted_count) - logger.info("Errors: %s ", result.bulk_api_result.get("writeErrors", [])) - - def _add_request(self, record: dict) -> InsertOne: - """Create a MongoDB insertion request for a record. - - Parameters - ---------- - record : dict - The record to insert. - - Returns - ------- - InsertOne - A PyMongo ``InsertOne`` object. - - """ - return InsertOne(record) - - def add(self, record: dict) -> None: - """Add a single record to the MongoDB collection. - - Parameters - ---------- - record : dict - The record to add. - - """ - try: - self.__collection.insert_one(record) - except ValueError as e: - logger.error("Validation error for record: %s ", record["data_name"]) - logger.error(e) except Exception as exc: - logger.error( - "Error adding record: %s ", record.get("data_name", "") - ) - logger.debug("Add operation failed", exc_info=exc) + logger.error("Error creating BIDS dataset %s: %s", dataset, exc) + raise exc - def _update_request(self, record: dict) -> UpdateOne: - """Create a MongoDB update request for a record. + records: list[dict[str, Any]] = [] + skipped: list[str] = [] + errors: list[dict[str, str]] = [] - Parameters - ---------- - record : dict - The record to update. - - Returns - ------- - UpdateOne - A PyMongo ``UpdateOne`` object. 
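+        # Build one manifest record per recording: metadata comes from
+        # load_eeg_attrs_from_bids_file; when overwrite=False, files whose
+        # data_name already exists in the database are skipped, and extraction
+        # failures are collected under "errors" instead of aborting the scan.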
- - """ - return UpdateOne({"data_name": record["data_name"]}, {"$set": record}) + for bids_file in bids_dataset.get_files(): + data_id = f"{dataset}_{Path(bids_file).name}" + if not overwrite: + try: + if self.exist({"data_name": data_id}): + skipped.append(data_id) + continue + except Exception as exc: + logger.warning( + "Could not verify existing record %s due to: %s", + data_id, + exc, + ) - def update(self, record: dict) -> None: - """Update a single record in the MongoDB collection. + try: + eeg_attrs = load_eeg_attrs_from_bids_file(bids_dataset, bids_file) + records.append(eeg_attrs) + except Exception as exc: # log and continue collecting + logger.error("Error extracting metadata for %s", bids_file) + logger.error(str(exc)) + errors.append({"file": str(bids_file), "error": str(exc)}) + + manifest: dict[str, Any] = { + "dataset": dataset, + "source": str(source_dir.resolve()), + "record_count": len(records), + "records": records, + } + if skipped: + manifest["skipped"] = skipped + if errors: + manifest["errors"] = errors + + if output_path is not None: + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + with output_path.open("w", encoding="utf-8") as fh: + json.dump( + manifest, + fh, + indent=2, + sort_keys=True, + default=_json_default, + ) + logger.info( + "Wrote EEGDash ingestion manifest for %s to %s", + dataset, + output_path, + ) - Parameters - ---------- - record : dict - Record content to set at the matching ``data_name``. + logger.info( + "Prepared %s records for dataset %s (skipped=%s, errors=%s)", + len(records), + dataset, + len(skipped), + len(errors), + ) - """ - try: - self.__collection.update_one( - {"data_name": record["data_name"]}, {"$set": record} - ) - except Exception as exc: # log and continue - logger.error( - "Error updating record: %s", record.get("data_name", "") - ) - logger.debug("Update operation failed", exc_info=exc) + return manifest def exists(self, query: dict[str, Any]) -> bool: """Check if at least one record matches the query. @@ -504,35 +479,6 @@ def exists(self, query: dict[str, Any]) -> bool: """ return self.exist(query) - def remove_field(self, record: dict, field: str) -> None: - """Remove a field from a specific record in the MongoDB collection. - - Parameters - ---------- - record : dict - Record-identifying object with a ``data_name`` key. - field : str - The name of the field to remove. - - """ - self.__collection.update_one( - {"data_name": record["data_name"]}, {"$unset": {field: 1}} - ) - - def remove_field_from_db(self, field: str) -> None: - """Remove a field from all records in the database. - - .. warning:: - This is a destructive operation and cannot be undone. - - Parameters - ---------- - field : str - The name of the field to remove from all documents. - - """ - self.__collection.update_many({}, {"$unset": {field: 1}}) - @property def collection(self): """The underlying PyMongo ``Collection`` object. @@ -545,26 +491,38 @@ def collection(self): """ return self.__collection - def close(self) -> None: - """Close the MongoDB connection. - - .. deprecated:: 0.1 - Connections are now managed globally by :class:`MongoConnectionManager`. - This method is a no-op and will be removed in a future version. - Use :meth:`EEGDash.close_all_connections` to close all clients. 
- """ - # Individual instances no longer close the shared client - pass - @classmethod def close_all_connections(cls) -> None: """Close all MongoDB client connections managed by the singleton manager.""" MongoConnectionManager.close_all() - def __del__(self) -> None: - """Destructor; no explicit action needed due to global connection manager.""" - # No longer needed since we're using singleton pattern + +def _json_default(value: Any) -> Any: + """Fallback serializer for complex objects when exporting ingestion JSON.""" + try: + if isinstance(value, (np.generic,)): + return value.item() + if isinstance(value, np.ndarray): + return value.tolist() + except Exception: pass + try: + if value is pd.NA: + return None + if isinstance(value, (pd.Timestamp, pd.Timedelta)): + return value.isoformat() + if isinstance(value, pd.Series): + return value.to_dict() + except Exception: + pass + + if isinstance(value, Path): + return value.as_posix() + if isinstance(value, set): + return sorted(value) + + raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable") + __all__ = ["EEGDash"] diff --git a/eegdash/dataset/bids_dataset.py b/eegdash/dataset/bids_dataset.py index 1788765e..7e7c1685 100644 --- a/eegdash/dataset/bids_dataset.py +++ b/eegdash/dataset/bids_dataset.py @@ -16,14 +16,26 @@ import pandas as pd from mne_bids import BIDSPath, find_matching_paths +from mne_bids.config import ALLOWED_DATATYPE_EXTENSIONS, EPHY_ALLOWED_DATATYPES, reader + +# Known companion/sidecar files for specific formats (BIDS spec requirement) +# These files must be downloaded together with the primary file +_COMPANION_FILES = { + ".set": [".fdt"], # EEGLAB: data file + ".vhdr": [".eeg", ".vmrk"], # BrainVision: data + marker files +} class EEGBIDSDataset: - """An interface to a local BIDS dataset containing EEG recordings. + """An interface to a local BIDS dataset containing electrophysiology recordings. This class centralizes interactions with a BIDS dataset on the local filesystem, providing methods to parse metadata, find files, and - retrieve BIDS-related information. + retrieve BIDS-related information. Supports multiple modalities including + EEG, MEG, iEEG, and NIRS. + + The class uses MNE-BIDS constants to stay synchronized with the BIDS + specification and automatically supports all file formats recognized by MNE. Parameters ---------- @@ -31,28 +43,65 @@ class EEGBIDSDataset: The path to the local BIDS dataset directory. dataset : str A name for the dataset (e.g., "ds002718"). + allow_symlinks : bool, default False + If True, accept broken symlinks (e.g., git-annex) for metadata extraction. + If False, require actual readable files for data loading. + Set to True when doing metadata digestion without loading raw data. + modalities : list of str or None, default None + List of modalities to search for (e.g., ["eeg", "meg"]). + If None, defaults to all electrophysiology modalities from MNE-BIDS: + ['meg', 'eeg', 'ieeg', 'nirs']. + + Attributes + ---------- + RAW_EXTENSIONS : dict + Mapping of file extensions to their companion files, dynamically + built from mne_bids.config.reader. + files : list of str + List of all recording file paths found in the dataset. + detected_modality : str + The modality of the first file found (e.g., 'eeg', 'meg'). + + Examples + -------- + >>> # Load EEG-only dataset + >>> dataset = EEGBIDSDataset( + ... data_dir="/path/to/ds002718", + ... dataset="ds002718", + ... modalities=["eeg"] + ... 
) + + >>> # Load dataset with multiple modalities + >>> dataset = EEGBIDSDataset( + ... data_dir="/path/to/ds005810", + ... dataset="ds005810", + ... modalities=["meg", "eeg"] + ... ) + + >>> # Metadata extraction from git-annex (symlinks) + >>> dataset = EEGBIDSDataset( + ... data_dir="/path/to/dataset", + ... dataset="ds000001", + ... allow_symlinks=True + ... ) """ - ALLOWED_FILE_FORMAT = ["eeglab", "brainvision", "biosemi", "european"] + # Dynamically build from MNE-BIDS constants (mne_bids.config.reader) + # reader dict maps file extensions to MNE read functions + # This ensures compatibility with the latest BIDS specification + + # Primary extension + companions = files that must be downloaded together RAW_EXTENSIONS = { - ".set": [".set", ".fdt"], # eeglab - ".edf": [".edf"], # european - ".vhdr": [".eeg", ".vhdr", ".vmrk", ".dat", ".raw"], # brainvision - ".bdf": [".bdf"], # biosemi + ext: [ext] + _COMPANION_FILES.get(ext, []) for ext in reader.keys() } - METADATA_FILE_EXTENSIONS = [ - "eeg.json", - "channels.tsv", - "electrodes.tsv", - "events.tsv", - "events.json", - ] def __init__( self, data_dir=None, # location of bids dataset dataset="", # dataset name + allow_symlinks=False, # allow broken symlinks for digestion + modalities=None, # list of modalities to search for (e.g., ["eeg", "meg", "ieeg"]) ): if data_dir is None or not os.path.exists(data_dir): raise ValueError("data_dir must be specified and must exist") @@ -60,6 +109,15 @@ def __init__( self.bidsdir = Path(data_dir) self.dataset = dataset self.data_dir = data_dir + self.allow_symlinks = allow_symlinks + + # Set modalities to search for (default: all electrophysiology modalities from MNE-BIDS) + if modalities is None: + self.modalities = EPHY_ALLOWED_DATATYPES # ['meg', 'eeg', 'ieeg', 'nirs'] + else: + self.modalities = ( + modalities if isinstance(modalities, list) else [modalities] + ) # Accept exact dataset folder or a variant with informative suffixes # (e.g., dsXXXXX-bdf, dsXXXXX-bdf-mini) to avoid collisions. @@ -74,9 +132,12 @@ def __init__( # get all recording files in the bids directory assert len(self.files) > 0, ValueError( - "Unable to construct EEG dataset. No EEG recordings found." + f"Unable to construct dataset. No recordings found for modalities: {self.modalities}" ) - assert self.check_eeg_dataset(), ValueError("Dataset is not an EEG dataset.") + # Store the detected modality for later use + self.detected_modality = self.get_bids_file_attribute( + "modality", self.files[0] + ).lower() def check_eeg_dataset(self) -> bool: """Check if the BIDS dataset contains EEG data. @@ -87,25 +148,37 @@ def check_eeg_dataset(self) -> bool: True if the dataset's modality is EEG, False otherwise. """ - return self.get_bids_file_attribute("modality", self.files[0]).lower() == "eeg" + return self.detected_modality == "eeg" def _init_bids_paths(self) -> None: """Initialize BIDS file paths using mne_bids for fast discovery. Uses mne_bids.find_matching_paths() for efficient pattern-based file - discovery instead of heavy pybids BIDSLayout indexing. + discovery. Falls back to manual glob search if needed. + + When allow_symlinks=True, includes broken symlinks (e.g., git-annex) + for metadata extraction without requiring actual data files. + + Searches across multiple modalities (eeg, meg, ieeg) based on self.modalities. """ # Initialize cache for BIDSPath objects self._bids_path_cache = {} - # Find all EEG recordings using pattern matching (fast!) 
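+        # Search order matters: the first modality in self.modalities that
+        # yields any recordings wins, and within that modality the first
+        # matching extension wins, so each dataset is indexed under a single
+        # modality/format combination.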
+ # Find all recordings across specified modalities + # Use MNE-BIDS constants to get valid extensions per modality self.files = [] - for ext in self.RAW_EXTENSIONS.keys(): - # find_matching_paths returns BIDSPath objects - paths = find_matching_paths(self.bidsdir, datatypes="eeg", extensions=ext) - if paths: - # Convert BIDSPath objects to filename strings - self.files = [str(p.fpath) for p in paths] + for modality in self.modalities: + for ext in ALLOWED_DATATYPE_EXTENSIONS.get(modality, []): + found_files = _find_bids_files( + self.bidsdir, + ext, + modalities=[modality], + allow_symlinks=self.allow_symlinks, + ) + if found_files: + self.files = found_files + break + if self.files: break def _get_bids_path_from_file(self, data_filepath: str): @@ -127,8 +200,17 @@ def _get_bids_path_from_file(self, data_filepath: str): filepath = Path(data_filepath) filename = filepath.name + # Detect modality from the directory path + # BIDS structure: .../sub-XX/[ses-YY/]/sub-XX_... + path_parts = filepath.parts + modality = "eeg" # default + for part in path_parts: + if part in ["eeg", "meg", "ieeg", "emg"]: + modality = part + break + # Extract entities from filename using BIDS pattern - # Expected format: sub-