INTPYTHON-850 Add script to migrate checkpoint collections created before v0.2.2 (langchain-ai#291)

caseyclements · aclark4life · dependabot[bot] · web-flow · commit 43c3e99cc532 · 2025-12-18T16:45:21.000-05:00
[INTPYTHON-850](https://jira.mongodb.org/browse/INTPYTHON-850) ## Summary  Addresses issue langchain-ai#287 by providing a migration script: `migrate_checkpoints_to_typed_metadata.py` Additional typing information was added in v0.2.2 to address [CVE - LangGraph Checkpoint affected by RCE in "json" mode of JsonPlusSerializer](https://osv.dev/vulnerability/GHSA-wwqv-p2pp-99h5). ## Changes in this PR ``` ~/src/langchain-mongodb/libs/langgraph-checkpoint-mongodb (INTPYTHON-850-SerializerMigration) $ uv run python scripts/migrate_checkpoints_to_typed_metadata.py -h usage: migrate_checkpoints_to_typed_metadata.py [-h] [--mongodb-uri MONGODB_URI] --db DB --collections COLLECTIONS [COLLECTIONS ...] [--batch-size BATCH_SIZE] [--suffix SUFFIX] [--workers WORKERS] [--dry-run] [--clear-destination] [--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}] Migrate langgraph checkpoint metadata to typed format (>= v0.2.2). options: -h, --help show this help message and exit --mongodb-uri MONGODB_URI MongoDB connection URI --db DB Database name containing checkpoint collections --collections COLLECTIONS [COLLECTIONS ...] One or more checkpoint collection names to migrate --batch-size BATCH_SIZE Number of documents per insert batch --suffix SUFFIX Suffix for migrated collections (default: -new) --workers WORKERS Number of worker processes (default: 1) --dry-run Run migration without writing any data --clear-destination Delete destination collection before migrating --log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL} Logging verbosity ```  ## Test Plan  No new unit tests were created for this. Instead, work was done manually to create a test checkpoint collection in v0.2.2, migrate this collection with the script, and then run a modified version of tests/test_sync.py against the latest MongoDBSaver referencing the migrated collection. ## Checklist  ### Checklist for Author - [x] Did you update the changelog (if necessary)? - [ ] Is the intention of the code captured in relevant tests? 
- [ ] If there are new TODOs, has a related JIRA ticket been created? - [ ] Has a MongoDB Employee run [the patch build of this PR](https://github.com/mongodb-labs/ai-ml-pipeline-testing?tab=readme-ov-file#running-a-patch-build-of-a-given-pr)? ### Checklist for Reviewer - [ ] Does the title of the PR reference a JIRA Ticket? - [ ] Do you fully understand the implementation? (Would you be comfortable explaining how this code works to someone else?) - [ ] Is all relevant documentation (README or docstring) updated? --------- Co-authored-by: Jeffrey A. Clark <aclark@aclark.net> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Steven Silvester <steve.silvester@mongodb.com> Co-authored-by: Noah Stapp <noah.stapp@mongodb.com>
diff --git a/libs/langgraph-checkpoint-mongodb/CHANGELOG.md b/libs/langgraph-checkpoint-mongodb/CHANGELOG.md
@@ -2,6 +2,9 @@
 
 ---
 
+## Changes in version 0.3.1 (DATE ???)
+- Fixes issue #287 by adding a migration script, [migrate_checkpoints_to_typed_metadata.py](./scripts/migrate_checkpoints_to_typed_metadata.py), that converts checkpoint data created with versions before 0.2.2 to the typed metadata format.
+
 ## Changes in version 0.3.0 (2025/11/19)
 - Allow custom serde objects to be passed to MongoDBSaver for serialization/deserialization.
 - Remove the deprecated AsyncMongoDBSaver class, which has been replaced by MongoDBSaver's async methods.
diff --git a/libs/langgraph-checkpoint-mongodb/scripts/__init__.py b/libs/langgraph-checkpoint-mongodb/scripts/__init__.py
diff --git a/libs/langgraph-checkpoint-mongodb/scripts/migrate_checkpoints_to_typed_metadata.py b/libs/langgraph-checkpoint-mongodb/scripts/migrate_checkpoints_to_typed_metadata.py
@@ -0,0 +1,286 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#   "pymongo>=4.6,<5",
+#   "langgraph-checkpoint-mongodb>=0.2.2",
+# ]
+# ///
+
+"""Script to migrate metadata of checkpoint collections
+- from <=v0.2.1 which is json
+- to >=v0.2.2 which is typed (defaulting to msgpack)
+
+Checkpoint data written by versions before v0.2.2 cannot be read by newer releases of langgraph-checkpoint-mongodb until it has been migrated.
+
+Invoke using PEP 723 (Inline Script Metadata):
+`$ uv run scripts/migrate_checkpoints_to_typed_metadata.py -h`
+
+Notes:
+    - writes_collections is not in scope as it has always used serde.dumps_typed / serde.loads_typed
+
+"""
+
+import argparse
+import logging
+import multiprocessing as mp
+import time
+from typing import Any, Union
+
+from bson.raw_bson import RawBSONDocument
+from langgraph.checkpoint.base import CheckpointMetadata
+from langgraph.checkpoint.serde.jsonplus import JsonPlusSerializer
+from pymongo import MongoClient
+from pymongo.collection import Collection
+from pymongo.errors import BulkWriteError
+from pymongo.typings import _DocumentType
+
+from langgraph.checkpoint.mongodb import MongoDBSaver
+
+# Shared serializer: decodes legacy "json"-tagged metadata values and re-encodes them with dumps_typed.
+serde = JsonPlusSerializer()
+
+
+def _positive_int(value: str) -> int:
+    """Argparse ``type`` callback that accepts only integers >= 1."""
+    try:
+        number = int(value)
+    except ValueError:
+        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
+    if number < 1:
+        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
+    return number
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse and validate the command-line options of the migration script.
+
+    Returns:
+        The parsed namespace with ``mongodb_uri``, ``db``, ``collections``,
+        ``batch_size``, ``suffix``, ``workers``, ``dry_run``,
+        ``clear_destination`` and ``log_level``.
+
+    Exits with an argparse usage error when ``--batch-size`` or ``--workers``
+    is not a positive integer, or when ``--suffix`` is empty.
+    """
+    parser = argparse.ArgumentParser(
+        description="Migrate langgraph checkpoint metadata to typed format (>= v0.2.2)."
+    )
+
+    parser.add_argument(
+        "--mongodb-uri",
+        default="mongodb://localhost:27017/?directConnection=true",
+        help="MongoDB connection URI",
+    )
+
+    parser.add_argument(
+        "--db",
+        required=True,
+        help="Database name containing checkpoint collections",
+    )
+
+    parser.add_argument(
+        "--collections",
+        nargs="+",
+        required=True,
+        help="One or more checkpoint collection names to migrate",
+    )
+
+    parser.add_argument(
+        "--batch-size",
+        # A batch size below 1 is meaningless for both the cursor and the inserts.
+        type=_positive_int,
+        default=1000,
+        help="Number of documents per insert batch",
+    )
+
+    parser.add_argument(
+        "--suffix",
+        default="-new",
+        help="Suffix for migrated collections (default: -new)",
+    )
+
+    parser.add_argument(
+        "--workers",
+        # Zero or negative workers would break the modulo partition and mp.Pool.
+        type=_positive_int,
+        default=1,
+        help="Number of worker processes (default: 1)",
+    )
+
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Run migration without writing any data",
+    )
+
+    parser.add_argument(
+        "--clear-destination",
+        action="store_true",
+        help="Delete destination collection before migrating",
+    )
+
+    parser.add_argument(
+        "--log-level",
+        default="INFO",
+        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
+        help="Logging verbosity",
+    )
+
+    args = parser.parse_args()
+
+    # With an empty suffix the destination name equals the source name, so
+    # --clear-destination would delete the very data being migrated.
+    if not args.suffix:
+        parser.error("--suffix must not be empty")
+
+    return args
+
+
+def loads_metadata_orig(metadata: dict[str, Any]) -> CheckpointMetadata:
+    """Decode metadata stored in the legacy JSON layout.
+
+    Dictionaries are walked recursively with their keys kept unchanged; every
+    non-dict value is passed to ``serde.loads_typed`` tagged as ``"json"``.
+    """
+    if not isinstance(metadata, dict):
+        # Leaf value: decode it as a "json"-typed payload.
+        return serde.loads_typed(("json", metadata))
+
+    decoded = {}
+    for key, value in metadata.items():
+        decoded[key] = loads_metadata_orig(value)
+    return decoded
+
+
+def dumps_metadata_new(
+    metadata: Union[CheckpointMetadata, Any],
+) -> Union[bytes, dict[str, Any]]:
+    """Encode metadata in the typed layout.
+
+    Dictionaries are walked recursively with their keys kept unchanged; every
+    non-dict value is replaced by the result of ``serde.dumps_typed``.
+    """
+    if not isinstance(metadata, dict):
+        # Leaf value: let the serializer pick the type tag and payload.
+        return serde.dumps_typed(metadata)
+
+    encoded = {}
+    for key, value in metadata.items():
+        encoded[key] = dumps_metadata_new(value)
+    return encoded
+
+
+def insert_non_duplicates(
+    clxn: Collection, buffer: list[Union[_DocumentType, RawBSONDocument]]
+) -> None:
+    """Insert ``buffer`` into ``clxn``, tolerating duplicate-key errors only.
+
+    The insert is unordered, so one failing document does not stop the rest
+    of the batch from being attempted. ``buffer`` is always emptied before
+    returning or raising, so the caller can reuse the same list.
+
+    Args:
+        clxn: Destination collection.
+        buffer: Documents to insert; cleared in place.
+
+    Raises:
+        BulkWriteError: If any write error is not a duplicate-key error
+            (code 11000), or if the server reported write concern errors.
+    """
+    duplicate_key_code = 11000
+
+    if not buffer:
+        # insert_many rejects an empty document list; nothing to do.
+        return
+
+    try:
+        clxn.insert_many(buffer, ordered=False)
+    except BulkWriteError as e:
+        details = e.details or {}
+        write_errors = details.get("writeErrors", [])
+        only_duplicates = all(
+            err.get("code") == duplicate_key_code for err in write_errors
+        )
+        # Write concern errors mean the batch may not be durably written, so
+        # they must not be swallowed together with harmless duplicates.
+        if not only_duplicates or details.get("writeConcernErrors"):
+            raise
+    finally:
+        buffer.clear()
+
+
+def worker_migrate(
+    worker_id: int,
+    args: argparse.Namespace,
+    collection_name: str,
+) -> dict[str, int]:
+    """
+    Migrate the shard of ``collection_name`` that belongs to ``worker_id``.
+
+    Every worker scans the whole source collection and keeps only documents
+    whose ``_id`` checksum modulo ``args.workers`` equals ``worker_id``. The
+    ``metadata`` field of each kept document is decoded from the legacy JSON
+    layout and re-encoded in the typed layout, then the document is inserted
+    into ``<collection_name><args.suffix>`` in batches of ``args.batch_size``.
+    With ``args.dry_run`` set, documents are converted but nothing is written.
+
+    Args:
+        worker_id: Index of this worker, in ``range(args.workers)``.
+        args: Parsed command-line options.
+        collection_name: Name of the source checkpoint collection.
+
+    Returns:
+        ``{"scanned": n, "migrated": n}`` where both counters cover only the
+        documents in this worker's shard (in dry-run mode ``migrated`` counts
+        documents that would have been written).
+
+    Raises:
+        BulkWriteError: Propagated from ``insert_non_duplicates``.
+    """
+    # Function-scope import: zlib is only needed for shard assignment.
+    import zlib
+
+    scanned = 0
+    migrated = 0
+    buffer: list[Any] = []
+
+    # Context managers close the cursor and the client even when a batch fails.
+    with MongoClient(args.mongodb_uri) as client:
+        db = client[args.db]
+        clxn_orig = db[collection_name]
+        clxn_new = db[f"{collection_name}{args.suffix}"]
+
+        def flush() -> None:
+            """Write out (or, in dry-run mode, discard) the buffered batch."""
+            if args.dry_run:
+                buffer.clear()
+            else:
+                insert_non_duplicates(clxn_new, buffer)
+
+        with clxn_orig.find({}, batch_size=args.batch_size) as cursor:
+            for doc in cursor:
+                # Stable partition. The builtin hash() is salted per process
+                # for str/bytes-backed ids, so workers launched with the
+                # "spawn" start method would disagree on shard ownership and
+                # silently skip documents. CRC32 is identical in every process.
+                id_bytes = repr(doc["_id"]).encode("utf-8")
+                if zlib.crc32(id_bytes) % args.workers != worker_id:
+                    continue
+
+                scanned += 1
+
+                if "metadata" in doc:
+                    doc["metadata"] = dumps_metadata_new(
+                        loads_metadata_orig(doc["metadata"])
+                    )
+
+                buffer.append(doc)
+                migrated += 1
+
+                if len(buffer) >= args.batch_size:
+                    flush()
+
+        # Final partial batch.
+        if buffer:
+            flush()
+
+    return {
+        "scanned": scanned,
+        "migrated": migrated,
+    }
+
+
+def _redact_uri(uri: str) -> str:
+    """Return ``uri`` with any ``user:password@`` part masked, for logging."""
+    scheme, sep, rest = uri.partition("://")
+    if not sep:
+        return uri
+    authority, slash, tail = rest.partition("/")
+    if "@" in authority:
+        authority = "***@" + authority.partition("@")[2]
+    return f"{scheme}{sep}{authority}{slash}{tail}"
+
+
+def main() -> None:
+    """Run the migration for every collection named on the command line.
+
+    For each source collection this optionally clears the destination, runs
+    ``worker_migrate`` in-process or in a process pool, and, unless in dry-run
+    mode, checks the destination document count and reads one checkpoint back
+    through ``MongoDBSaver``. A summary is logged at the end.
+
+    Raises:
+        RuntimeError: If the destination ends up with more documents than the
+            source and the count does not match the number migrated.
+    """
+    args = parse_args()
+
+    logging.basicConfig(
+        level=getattr(logging, args.log_level),
+        format="%(asctime)s [%(levelname)s] %(message)s",
+    )
+
+    start_time = time.time()
+
+    logging.info("Beginning checkpoint data migration")
+    # The URI may embed credentials; never write them to the log.
+    logging.info(f"mongodb_uri={_redact_uri(args.mongodb_uri)}")
+    logging.info(f"db={args.db}")
+    logging.info(f"collections={args.collections}")
+    logging.info(f"batch_size={args.batch_size}")
+    logging.info(f"suffix={args.suffix}")
+    logging.info(f"workers={args.workers}")
+    logging.info(f"dry_run={args.dry_run}")
+
+    total_scanned = 0
+    total_migrated = 0
+
+    for collection_name in args.collections:
+        logging.info(f"--- Migrating collection: {collection_name} ---")
+
+        dest_name = f"{collection_name}{args.suffix}"
+
+        # One client per collection, closed when the collection is done.
+        with MongoClient(args.mongodb_uri) as client:
+            db = client[args.db]
+            clxn_orig = db[collection_name]
+            clxn_new = db[dest_name]
+
+            if args.clear_destination and not args.dry_run:
+                logging.warning(f"Clearing destination collection {dest_name}")
+                clxn_new.delete_many({})
+
+            n_orig = clxn_orig.count_documents({})
+            logging.info(f"Source collection contains {n_orig} documents")
+
+            if n_orig == 0:
+                logging.warning(
+                    f"Skipping empty or missing collection: {collection_name}"
+                )
+                continue
+
+            if args.workers == 1:
+                # Single-process path: no pool needed.
+                results = [worker_migrate(0, args, collection_name)]
+            else:
+                logging.info(f"Starting {args.workers} workers")
+
+                with mp.Pool(processes=args.workers) as pool:
+                    results = pool.starmap(
+                        worker_migrate,
+                        [
+                            (worker_id, args, collection_name)
+                            for worker_id in range(args.workers)
+                        ],
+                    )
+
+            # Per-collection counters: verification below must not be skewed
+            # by collections migrated earlier in this run.
+            scanned = sum(r["scanned"] for r in results)
+            migrated = sum(r["migrated"] for r in results)
+            total_scanned += scanned
+            total_migrated += migrated
+
+            if args.dry_run:
+                continue
+
+            n_new = clxn_new.count_documents({})
+            # Explicit raise rather than assert, which is stripped under -O.
+            if n_new != migrated and n_new > n_orig:
+                raise RuntimeError(
+                    f"Destination {dest_name} holds {n_new} documents, but "
+                    f"{migrated} were migrated from {n_orig} source documents"
+                )
+            if n_new < n_orig:
+                logging.warning(
+                    f"Destination {dest_name} holds {n_new} documents, "
+                    f"fewer than the {n_orig} in {collection_name}"
+                )
+
+            saver_new = MongoDBSaver(
+                client=client,
+                db_name=args.db,
+                checkpoint_collection_name=dest_name,
+            )
+
+            # Read one checkpoint back to confirm the new format is loadable.
+            first = next(iter(saver_new.list(config=None, limit=1)), None)
+            if first is None:
+                logging.warning(
+                    f"[{collection_name}] No checkpoint could be listed from {dest_name}"
+                )
+                continue
+
+            sample_thread = first.config["configurable"]["thread_id"]
+            sample_checkpoint = saver_new.get_tuple(
+                config={"configurable": {"thread_id": sample_thread}}
+            )
+            if sample_checkpoint is not None:
+                logging.debug(
+                    f"[{collection_name}] Sample metadata: {sample_checkpoint.metadata}"
+                )
+
+    elapsed = time.time() - start_time
+    rate = total_migrated / elapsed if elapsed > 0 else 0
+
+    logging.info("=== Migration Summary ===")
+    logging.info(f"Collections processed: {len(args.collections)}")
+    logging.info(f"Documents scanned:    {total_scanned}")
+    logging.info(f"Documents migrated:   {total_migrated}")
+    logging.info(f"Elapsed time:         {elapsed:.2f}s")
+    logging.info(f"Throughput:           {rate:.2f} docs/sec")
+
+    if args.dry_run:
+        logging.info("Dry-run mode enabled: no data was written")
+
+
+if __name__ == "__main__":
+    main()