#!/usr/bin/env python3
"""Generate individual documentation pages for each EEGDash dataset.

This script creates one RST file per dataset, containing comprehensive
information: metadata, usage examples, and dataset statistics.
"""
| 7 | + |
| 8 | +import sys |
| 9 | +from pathlib import Path |
| 10 | + |
| 11 | +import pandas as pd |
| 12 | + |
| 13 | +# Add the parent directory to the path to import eegdash modules |
| 14 | +sys.path.insert(0, str(Path(__file__).parent.parent)) |
| 15 | + |
| 16 | +from eegdash.dataset.registry import _markdown_table |
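# _markdown_table renders one row of the dataset summary as a table suitable
# for embedding in the generated RST (behavior inferred from its use below).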


def create_dataset_page_template(dataset_id: str, row_series: pd.Series) -> str:
    """Create an RST page template for a specific dataset."""
    # Extract key metadata
    n_subjects = row_series.get("n_subjects", "Unknown")
    n_records = row_series.get("n_records", "Unknown")
    n_tasks = row_series.get("n_tasks", "Unknown")
    modality = row_series.get("modality of exp", "")
    exp_type = row_series.get("type of exp", "")
    subject_type = row_series.get("Type Subject", "")
    duration = row_series.get("duration_hours_total", "Unknown")
    size = row_series.get("size", "Unknown")

    # Build a short description from whichever metadata fields are present
    description_parts = []
    if modality and str(modality).strip():
        description_parts.append(f"**{modality}**")
    if exp_type and str(exp_type).strip():
        description_parts.append(str(exp_type))
    if subject_type and str(subject_type).strip():
        description_parts.append(f"{subject_type} subjects")

    description = (
        " | ".join(description_parts)
        if description_parts
        else "EEG dataset from OpenNeuro"
    )

    # Generate the metadata table
    table_content = _markdown_table(row_series)

    # Create the RST content
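    # (An RST section underline must be at least as long as its heading, so the
    # underline lengths below are computed from the heading text.)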
    rst_content = f'''.. _{dataset_id.lower()}:

{dataset_id.upper()}
{"=" * len(dataset_id)}

OpenNeuro Dataset {dataset_id}
{"-" * len("OpenNeuro Dataset " + dataset_id)}

{description}

This dataset contains **{n_subjects} subjects** with **{n_records} recordings** across **{n_tasks} tasks**.
Total duration: **{duration} hours**. Dataset size: **{size}**.

Dataset Overview
----------------

{table_content}

Usage Examples
--------------

Basic usage:

.. code-block:: python

    from eegdash.dataset import {dataset_id.upper()}

    # Initialize the dataset
    dataset = {dataset_id.upper()}(cache_dir="./data")

    # Check dataset size
    print(f"Number of recordings: {{len(dataset)}}")

    # Access first recording
    if len(dataset) > 0:
        recording = dataset[0]
        print(f"Recording description: {{recording.description}}")

Loading EEG data:

.. code-block:: python

    # Load raw EEG data
    if len(dataset) > 0:
        recording = dataset[0]
        raw = recording.load()

        # Inspect the data
        print(f"Sampling rate: {{raw.info['sfreq']}} Hz")
        print(f"Number of channels: {{len(raw.ch_names)}}")
        print(f"Duration: {{raw.times[-1]:.1f}} seconds")
        print(f"Channel names: {{raw.ch_names[:5]}}...")  # First 5 channels

Advanced filtering:

.. code-block:: python

    # Filter by specific criteria (if applicable)
    filtered_dataset = {dataset_id.upper()}(
        cache_dir="./data",
        query={{"task": "RestingState"}}  # Example filter
    )

    # Combine with other datasets
    from eegdash import EEGDashDataset

    # Load multiple datasets
    combined = EEGDashDataset(
        cache_dir="./data",
        dataset=["{dataset_id}", "ds002718"],  # Multiple datasets
        subject=["001", "002"]  # Specific subjects
    )

Dataset Information
-------------------

**Dataset ID**: {dataset_id}

**OpenNeuro URL**: https://openneuro.org/datasets/{dataset_id}

**NeMAR URL**: https://nemar.org/dataexplorer/detail?dataset_id={dataset_id}

**Key Statistics**:

- **Subjects**: {n_subjects}
- **Recordings**: {n_records}
- **Tasks**: {n_tasks}
- **Duration**: {duration} hours
- **Size**: {size}
- **Modality**: {modality or "EEG"}
- **Experiment Type**: {exp_type or "Not specified"}
- **Subject Type**: {subject_type or "Not specified"}

Related Documentation
---------------------

- :class:`eegdash.api.EEGDashDataset` - Main dataset class
- :doc:`../api_core` - Core API reference
- :ref:`overview` - EEGDash overview

See Also
--------

- `OpenNeuro dataset page <https://openneuro.org/datasets/{dataset_id}>`_
- `NeMAR data explorer <https://nemar.org/dataexplorer/detail?dataset_id={dataset_id}>`_
- :ref:`dataset_index` - Browse all available datasets
'''

    return rst_content

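# For illustration: create_dataset_page_template("ds002718", row) yields a page
# beginning
#
#   .. _ds002718:
#
#   DS002718
#   ========
#
# ("ds002718" is an example ID that also appears in the template above.)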

def generate_dataset_index_page(df: pd.DataFrame) -> str:
    """Generate an index page listing all datasets."""
    # Group datasets by modality for better organization; keep missing
    # modalities (dropna=False) so they can fall into the "Other" bucket below
    modalities = (
        df.groupby("modality of exp", dropna=False).size().sort_values(ascending=False)
    )
    total_datasets = len(df)
    rst_content = """.. _dataset_index:

Dataset Index
=============

EEGDash provides access to **{total_datasets} EEG datasets** from OpenNeuro. Each dataset has its own dedicated documentation page with detailed metadata, usage examples, and statistics.

Quick Statistics
----------------

- **Total Datasets**: {total_datasets}
- **Total Subjects**: {total_subjects:,}
- **Total Recordings**: {total_records:,}
- **Total Duration**: {total_duration:.1f} hours
- **Total Size**: {total_size:.1f} GB

Browse by Modality
------------------

""".format(
        total_datasets=total_datasets,
        total_subjects=df["n_subjects"].sum(),
        total_records=df["n_records"].sum(),
        total_duration=df["duration_hours_total"].sum(),
        total_size=df["size_bytes"].sum() / (1024**3),  # Convert to GB
    )
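
    # Note: the totals above assume n_subjects, n_records, duration_hours_total,
    # and size_bytes are numeric columns in dataset_summary.csv; the {:,} and
    # {:.1f} format specs would fail on non-numeric values.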

    # Add modality sections (top 10 modalities by dataset count)
    for modality, count in modalities.head(10).items():
        if pd.isna(modality) or modality == "":
            modality = "Other"

        heading = f"{modality} ({count} datasets)"
        rst_content += f"""
{heading}
{"^" * len(heading)}

"""

        # List datasets for this modality; rows with a missing or empty
        # modality fall into the "Other" bucket
        modality_datasets = (
            df[df["modality of exp"] == modality]
            if modality != "Other"
            else df[df["modality of exp"].isna() | (df["modality of exp"] == "")]
        )

        # Show ALL datasets for this modality (no truncation)
        for _, row in modality_datasets.iterrows():
            dataset_id = row["dataset"]
            n_subjects = row["n_subjects"]
            n_records = row["n_records"]
            exp_type = row.get("type of exp", "")

            rst_content += f"- :doc:`{dataset_id} <datasets/{dataset_id}>` - {n_subjects} subjects, {n_records} recordings"
            if exp_type and pd.notna(exp_type):
                rst_content += f" ({exp_type})"
            rst_content += "\n"

        rst_content += "\n"

    # Add alphabetical index
    rst_content += """
Complete Alphabetical Index
---------------------------

.. toctree::
    :maxdepth: 1
    :glob:

    datasets/*

All Datasets (Alphabetical)
---------------------------

"""

    # Add alphabetical list
    for _, row in df.sort_values("dataset").iterrows():
        dataset_id = row["dataset"]
        n_subjects = row["n_subjects"]
        n_records = row["n_records"]
        size = row["size"]

        rst_content += f"- :doc:`{dataset_id} <datasets/{dataset_id}>` - {n_subjects} subjects, {n_records} recordings, {size}\n"

    # Include key API module pages to satisfy toctree inclusion and avoid warnings
    rst_content += """

.. toctree::
    :hidden:
    :maxdepth: 1

    dataset/eegdash.dataset
    dataset/eegdash.downloader
"""

    return rst_content


def main():
    """Generate all dataset documentation pages."""
    # Load dataset metadata; comment="#" skips annotation lines in the CSV
    csv_path = (
        Path(__file__).parent.parent / "eegdash" / "dataset" / "dataset_summary.csv"
    )
    df = pd.read_csv(csv_path, comment="#", skip_blank_lines=True)

    print(f"Generating documentation for {len(df)} datasets...")

    # Create output directories
    output_dir = Path(__file__).parent / "source" / "api" / "datasets"
    output_dir.mkdir(exist_ok=True, parents=True)

    # Generate individual dataset pages
    for _, row in df.iterrows():
        dataset_id = row["dataset"]
        print(f"  Generating {dataset_id}...")

        # Create RST content
        rst_content = create_dataset_page_template(dataset_id, row)

        # Write to file (UTF-8, since templates and metadata may contain
        # non-ASCII characters)
        output_file = output_dir / f"{dataset_id}.rst"
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(rst_content)

    # Generate index page
    print("Generating dataset index page...")
    index_content = generate_dataset_index_page(df)
    index_file = Path(__file__).parent / "source" / "api" / "api_dataset.rst"
    with open(index_file, "w", encoding="utf-8") as f:
        f.write(index_content)

    print(f"✅ Generated {len(df)} dataset pages + index page")
    print(f"📁 Output directory: {output_dir}")
    print(f"📄 Index page: {index_file}")


if __name__ == "__main__":
    main()