
Commit a0c4f0e

Add MongoDB migration to Parquet dataset
1 parent 88c7a38 commit a0c4f0e

File tree

4 files changed: +279, -4 lines changed

README.md

Lines changed: 20 additions & 0 deletions
@@ -86,3 +86,23 @@ To run it every Monday at 5 AM, add this line to your crontab:
0 5 * * 1 /path/to/fmriprep_stats/scripts/update_plots.sh 2>> $HOME/var/log/update_plots.err >> $HOME/var/log/update_plots.log
```

## Migrating from MongoDB

`scripts/migrate_mongo_to_parquet.py` streams the MongoDB collections into the
partitioned Parquet layout consumed by the new tooling. Run it before switching
workflows so that `_manifest.parquet` already knows which events have been
ingested:

```bash
python scripts/migrate_mongo_to_parquet.py \
    --mongo-uri mongodb://localhost:27017 \
    --db-name fmriprep_stats \
    /path/to/dataset
```

The script buffers up to 1,000 events (one Parquet file) at a time by default.
Reduce `--batch-size` if you are memory-constrained, or increase it on beefier
machines to reduce the number of tiny Parquet files. Re-running the migration
is safe: the manifest tracks event IDs and skips duplicates; we tested a
double run and the second invocation reported no new rows.
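
As a quick sanity check (for example after a re-run), the manifest can be read back directly. A minimal sketch, assuming `pandas` plus `pyarrow` from `requirements.txt` and the same `/path/to/dataset` as above:

```python
import pandas as pd

# _manifest.parquet holds one row per ingested event, with the
# columns "event", "id", "date", and "path".
manifest = pd.read_parquet("/path/to/dataset/_manifest.parquet")

# Number of events ingested so far, per collection.
print(manifest.groupby("event").size())
```

The per-event counts should match the summary printed by the migration script and stay unchanged after a second run.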

requirements.txt

Lines changed: 3 additions & 1 deletion
@@ -1,7 +1,9 @@
```diff
 pymongo
 pandas
+pyarrow
 matplotlib
 seaborn
 notebook
 nbconvert
-requests
+click
+requests
```
scripts/migrate_mongo_to_parquet.py

Lines changed: 148 additions & 0 deletions
@@ -0,0 +1,148 @@
```python
#!/usr/bin/env python3
"""Convert MongoDB event collections into a partitioned Parquet dataset."""

from __future__ import annotations

from collections import defaultdict
from pathlib import Path
from typing import Dict, Iterable, List, Tuple

import click
import pandas as pd
from pymongo import MongoClient
from uuid import uuid4

from src import api

DEFAULT_BATCH_SIZE = 1000


def _normalize_records(records: Iterable[Dict]) -> pd.DataFrame:
    """Return a flattened dataframe for *records*."""

    return pd.json_normalize(list(records), sep=".")


@click.command()
@click.option(
    "--mongo-uri",
    default="mongodb://localhost:27017",
    show_default=True,
    help="MongoDB connection URI.",
)
@click.option(
    "--db-name",
    default="fmriprep_stats",
    show_default=True,
    help="MongoDB database name.",
)
@click.option(
    "--batch-size",
    type=click.IntRange(min=1),
    default=DEFAULT_BATCH_SIZE,
    show_default=True,
    help="Number of events to buffer before writing Parquet partitions.",
)
@click.argument("dataset_root", type=click.Path(path_type=Path))
def main(mongo_uri: str, db_name: str, batch_size: int, dataset_root: Path) -> None:
    """Stream MongoDB events into a partitioned Parquet dataset."""

    dataset_root = dataset_root.resolve()
    dataset_root.mkdir(parents=True, exist_ok=True)

    manifest_path = api._manifest_path(dataset_root)
    manifest = api._load_manifest(manifest_path)
    manifest_cache = api._load_manifest_cache(manifest)

    client = MongoClient(mongo_uri)
    db = client[db_name]

    buffers: Dict[Tuple[str, Path], List[Dict]] = defaultdict(list)
    pending_records = 0
    totals = {event: 0 for event in api.ISSUES}

    def flush_buffers() -> None:
        nonlocal pending_records, manifest

        if pending_records == 0:
            return

        new_manifest_rows: List[Dict] = []

        for (event_name, partition_dir), entries in list(buffers.items()):
            if not entries:
                continue

            partition_dir.mkdir(parents=True, exist_ok=True)
            records = [entry["record"] for entry in entries]
            df = _normalize_records(records)
            if df.empty:
                continue

            part_path = partition_dir / f"part-{uuid4().hex}.parquet"
            df.to_parquet(part_path, index=False)
            relative = str(part_path.relative_to(dataset_root))

            totals[event_name] += len(entries)
            for entry in entries:
                new_manifest_rows.append(
                    {
                        "event": event_name,
                        "id": entry["record"]["id"],
                        "date": entry["date"].isoformat(),
                        "path": relative,
                    }
                )

        if new_manifest_rows:
            manifest = pd.concat(
                [manifest, pd.DataFrame(new_manifest_rows)], ignore_index=True
            )
            api._write_manifest(manifest_path, manifest)
            api._update_manifest_cache(manifest_cache, new_manifest_rows)

        buffers.clear()
        pending_records = 0

    try:
        for event_name in api.ISSUES:
            click.echo(f"Migrating '{event_name}' events…")

            collection = db[event_name]
            cursor = collection.find({}, batch_size=batch_size)

            for document in cursor:
                record = api._normalize_event(document)
                event_id = record.get("id")
                if not event_id:
                    continue

                cache = manifest_cache.setdefault(event_name, set())
                if event_id in cache:
                    continue

                event_date = api._event_date(record)
                if event_date is None:
                    continue

                partition_dir = api._partition_path(dataset_root, event_name, event_date)
                buffers[(event_name, partition_dir)].append(
                    {"record": record, "date": event_date}
                )
                cache.add(event_id)
                pending_records += 1

                if pending_records >= batch_size:
                    flush_buffers()

    finally:
        flush_buffers()
        client.close()

    click.echo("Migration summary:")
    for event_name, count in totals.items():
        click.echo(f" {event_name}: {count} new event(s) written")


if __name__ == "__main__":
    main()
```
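
The script lays files out as `<dataset_root>/<event>/date=YYYY-MM-DD/part-<hex>.parquet`, one directory per event type and day. As an illustrative sketch (not part of this commit) of reading one event type back with pandas, using the `success` collection name from `api.ISSUES`:

```python
from pathlib import Path

import pandas as pd

root = Path("/path/to/dataset")

# Gather every partition written for the "success" events
# (layout: success/date=YYYY-MM-DD/part-<hex>.parquet).
parts = sorted(root.glob("success/date=*/part-*.parquet"))
df = pd.concat((pd.read_parquet(p) for p in parts), ignore_index=True)
print(f"{len(df)} success events across {len(parts)} Parquet files")
```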

src/api.py

Lines changed: 108 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,20 @@
```diff
 
 import os
 import sys
+from pathlib import Path
 from time import sleep
-import requests
-from requests.utils import parse_header_links
+from typing import Dict, Iterable, Optional, Set
+
 import datetime
-from pymongo import MongoClient
+import pandas as pd
+import requests
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from pymongo import MongoClient
+from requests.utils import parse_header_links
 
 DEFAULT_MAX_ERRORS = 5
+_MANIFEST_FILENAME = "_manifest.parquet"
+_MANIFEST_COLUMNS = ["event", "id", "date", "path"]
 ISSUES = {
     "success": "758615130",
     "started": "540334560",
```
@@ -41,6 +47,105 @@
```diff
 }
 
 
+def _normalize_event(event: Dict) -> Dict:
+    """Flatten a raw Sentry event for storage."""
+
+    normalized = dict(event)
+    normalized.pop("_id", None)
+
+    tags = normalized.pop("tags", []) or []
+    for tag in tags:
+        key = tag.get("key")
+        if not key:
+            continue
+        normalized[key.replace(".", "_")] = tag.get("value")
+
+    normalized.pop("environment", None)
+
+    if "id" in normalized and normalized["id"] is not None:
+        normalized["id"] = str(normalized["id"])
+
+    return normalized
+
+
+def _event_date(event: Dict) -> Optional[datetime.date]:
+    """Return the calendar date to use for partitioning."""
+
+    for key in ("dateCreated", "date_received", "dateReceived", "date"):
+        value = event.get(key)
+        if value is None:
+            continue
+
+        if isinstance(value, datetime.datetime):
+            dt_value = value
+        elif isinstance(value, datetime.date) and not isinstance(value, datetime.datetime):
+            dt_value = datetime.datetime.combine(
+                value, datetime.time.min, tzinfo=datetime.timezone.utc
+            )
+        elif isinstance(value, str):
+            try:
+                dt_value = datetime.datetime.fromisoformat(value.replace("Z", "+00:00"))
+            except ValueError:
+                continue
+        else:
+            continue
+
+        if dt_value.tzinfo is None:
+            dt_value = dt_value.replace(tzinfo=datetime.timezone.utc)
+        else:
+            dt_value = dt_value.astimezone(datetime.timezone.utc)
+
+        return dt_value.date()
+
+    return None
+
+
+def _partition_path(dataset_root: Path, event_name: str, date: datetime.date) -> Path:
+    """Return the Parquet partition directory for *event_name* on *date*."""
+
+    return Path(dataset_root) / event_name / f"date={date:%Y-%m-%d}"
+
+
+def _manifest_path(dataset_root: Path) -> Path:
+    return Path(dataset_root) / _MANIFEST_FILENAME
+
+
+def _load_manifest(manifest_path: Path) -> pd.DataFrame:
+    if Path(manifest_path).exists():
+        return pd.read_parquet(manifest_path)
+    return pd.DataFrame(columns=_MANIFEST_COLUMNS)
+
+
+def _write_manifest(manifest_path: Path, manifest: pd.DataFrame) -> None:
+    manifest_path = Path(manifest_path)
+    manifest_path.parent.mkdir(parents=True, exist_ok=True)
+    tmp_path = manifest_path.with_suffix(manifest_path.suffix + ".tmp")
+    manifest.reset_index(drop=True).to_parquet(tmp_path, index=False)
+    tmp_path.replace(manifest_path)
+
+
+def _load_manifest_cache(manifest: pd.DataFrame) -> Dict[str, Set[str]]:
+    cache: Dict[str, Set[str]] = {name: set() for name in ISSUES}
+
+    if manifest is None or manifest.empty:
+        return cache
+
+    grouped = manifest.groupby("event")
+    for event, group in grouped:
+        cache[event] = set(group["id"].astype(str))
+
+    return cache
+
+
+def _update_manifest_cache(cache: Dict[str, Set[str]], rows: Iterable[Dict]) -> None:
+    for row in rows:
+        event = row.get("event")
+        identifier = row.get("id")
+        if not event or identifier is None:
+            continue
+        cache.setdefault(event, set()).add(str(identifier))
+
+
 def filter_new(events, collection):
     """Return the subset of *events* not already cached in *collection*."""
```

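To make the new helpers in `src/api.py` concrete, here is a small illustrative sketch of how one raw event document flows through them; the sample document, tag, and paths are made up, but the field names follow what `_normalize_event` and `_event_date` look for:

```python
from pathlib import Path

from src import api

# Hypothetical raw document shaped like the stored Sentry events.
doc = {
    "_id": "mongo-object-id",
    "id": 123456789,
    "dateCreated": "2024-05-01T12:34:56Z",
    "tags": [{"key": "fmriprep.version", "value": "24.0.0"}],
}

record = api._normalize_event(doc)  # drops "_id", flattens tags, casts "id" to str
day = api._event_date(record)       # parsed as UTC -> datetime.date(2024, 5, 1)
part_dir = api._partition_path(Path("/data/events"), "success", day)
print(part_dir)                     # /data/events/success/date=2024-05-01
```

Note also that `_write_manifest` writes to a temporary `.tmp` file and then replaces `_manifest.parquet`, so an interrupted flush cannot leave a half-written manifest behind.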