Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .devcontainer/post-create.sh
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,8 @@ $SANITY_STATUS
Now build the project:
cargo build --locked --profile dev --features dynamo-llm/block-manager
cd lib/bindings/python && maturin develop --uv
DYNAMO_BIN_PATH=$CARGO_TARGET_DIR/debug uv pip install -e .
uv pip install -e lib/gpu_memory_service # GPU memory manager with C++ extension
DYNAMO_BIN_PATH=\$CARGO_TARGET_DIR/debug uv pip install -e .

Optional: cd lib/bindings/kvbm && maturin develop --uv # For KVBM support

Expand Down
7 changes: 6 additions & 1 deletion .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ container/Dockerfile*
.venv
.venv-docs

# GPU Memory Service build artifacts
lib/gpu_memory_service/build/
lib/gpu_memory_service/*.egg-info/
lib/gpu_memory_service/**/*.so

# Python
__pycache__/
*.pyc
Expand Down Expand Up @@ -75,4 +80,4 @@ docs/_build/

# AI
.cursor/
.claude/
.claude/
1 change: 1 addition & 0 deletions .github/filters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ core:
- 'components/src/dynamo/mocker/**'
- 'components/src/dynamo/frontend/**'
- 'components/src/dynamo/common/**'
- 'components/src/dynamo/gpu_memory_service/**'
- '*.toml'
- '*.lock'
- '*.py'
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ tensorrtllm_checkpoints/
tensorrtllm_engines/
api_server_models/
server/
!lib/gpu_memory_service/server/
# Replay/Snapshot test artifacts
*.new
lib/llm/tests/data/sample-models/models--meta-llama--Llama-3.1-70B-Instruct/
Expand Down
11 changes: 10 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,16 @@ cd lib/bindings/python
maturin develop --uv
```

## 6. Install the Wheel
## 6. Install GPU Memory Service

The GPU Memory Service is a Python package with a C++ extension. It requires only Python development headers and a C++ compiler (g++).

```bash
cd $PROJECT_ROOT
uv pip install -e lib/gpu_memory_service
```

## 7. Install the Wheel

```
cd $PROJECT_ROOT
Expand Down
41 changes: 41 additions & 0 deletions components/src/dynamo/gpu_memory_service/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""GPU Memory Service component for Dynamo.

This module provides the Dynamo component wrapper around the gpu_memory_service
package. The core functionality is in the gpu_memory_service package; this
module provides:
- CLI entry point (python -m dynamo.gpu_memory_service)
- Re-exports for backwards compatibility
"""

# Re-export core functionality from gpu_memory_service package
from gpu_memory_service import (
    GMSClientMemoryManager,
    StaleMemoryLayoutError,
    get_gms_client_memory_manager,
    get_or_create_gms_client_memory_manager,
)

# Re-export extensions (built separately)
# NOTE(review): _allocator_ext is underscore-prefixed but deliberately exported
# here (it is listed in __all__) so existing callers keep working.
from gpu_memory_service.client.torch.extensions import _allocator_ext

# Re-export module utilities
from gpu_memory_service.client.torch.module import (
    materialize_module_from_gms,
    register_module_tensors,
)

__all__ = [
    # Core
    "GMSClientMemoryManager",
    "StaleMemoryLayoutError",
    # GMS client memory manager
    "get_or_create_gms_client_memory_manager",
    "get_gms_client_memory_manager",
    # Tensor utilities
    "register_module_tensors",
    "materialize_module_from_gms",
    # Extensions
    "_allocator_ext",
]
7 changes: 7 additions & 0 deletions components/src/dynamo/gpu_memory_service/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Package entry point: allows `python -m dynamo.gpu_memory_service`."""

from dynamo.gpu_memory_service.server import main

if __name__ == "__main__":
    main()
66 changes: 66 additions & 0 deletions components/src/dynamo/gpu_memory_service/args.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Argument parsing for GPU Memory Service server component."""

import argparse
import logging
from dataclasses import dataclass

logger = logging.getLogger(__name__)


@dataclass
class Config:
    """Resolved configuration for the GPU Memory Service server.

    Produced by parse_args(); socket_path has already had any ``{device}``
    placeholder expanded.
    """

    # GPU Memory Service specific
    device: int  # CUDA device ID whose memory this server manages
    socket_path: str  # Unix domain socket path (placeholder already expanded)
    verbose: bool  # request DEBUG-level logging


def parse_args(argv: "list[str] | None" = None) -> Config:
    """Parse command line arguments for GPU Memory Service server.

    Args:
        argv: Argument list to parse. ``None`` (the default) preserves the
            original behavior of parsing ``sys.argv[1:]``; passing an
            explicit list makes the parser usable from tests and embedders.

    Returns:
        A fully-resolved :class:`Config` — the default socket path is
        generated from the device ID, and any ``{device}`` placeholder in a
        user-supplied path is expanded.
    """
    parser = argparse.ArgumentParser(
        description="GPU Memory Service allocation server for Dynamo."
    )

    # GPU Memory Service specific arguments
    parser.add_argument(
        "--device",
        type=int,
        required=True,
        help="CUDA device ID to manage memory for.",
    )
    parser.add_argument(
        "--socket-path",
        type=str,
        default=None,
        help="Path for Unix domain socket. Default: /tmp/gpu_memory_service_{device}.sock. "
        "Supports {device} placeholder for multi-GPU setups.",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging.",
    )

    args = parser.parse_args(argv)

    # Resolve the socket path: fall back to a per-device default, or expand
    # the {device} placeholder so multi-GPU launch scripts can share one
    # --socket-path template.
    if args.socket_path is None:
        socket_path = f"/tmp/gpu_memory_service_{args.device}.sock"
    else:
        socket_path = args.socket_path.format(device=args.device)

    return Config(
        device=args.device,
        socket_path=socket_path,
        verbose=args.verbose,
    )
169 changes: 169 additions & 0 deletions components/src/dynamo/gpu_memory_service/server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""GPU Memory Service allocation server component for Dynamo.

This component wraps the GMSRPCServer from gpu_memory_service to manage
GPU memory allocations with connection-based RW/RO locking.

Workers connect via the socket path, which should be passed to vLLM/SGLang via:
--load-format gpu_memory_service
--model-loader-extra-config '{"gpu_memory_service_socket_path": "/tmp/gpu_memory_service_{device}.sock"}'

Usage:
python -m dynamo.gpu_memory_service --device 0
python -m dynamo.gpu_memory_service --device 0 --socket-path /tmp/gpu_memory_service_{device}.sock
"""

import asyncio
import logging
import os
import signal
import threading
from typing import Optional

import uvloop
from gpu_memory_service.server import GMSRPCServer

from .args import parse_args

logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


class GMSRPCServerThread:
    """Wrapper to run GMSRPCServer in a background thread.

    GMSRPCServer is asyncio-based; this wrapper gives it a dedicated daemon
    thread with its own event loop so the caller's loop stays free.
    """

    def __init__(self, socket_path: str, device: int):
        self.socket_path = socket_path
        self.device = device
        # Written by the background thread; _started/_error hand start-up
        # results back to the thread calling start().
        self._server: Optional[GMSRPCServer] = None
        self._thread: Optional[threading.Thread] = None
        self._started = threading.Event()  # set once start-up finished (or failed)
        self._error: Optional[Exception] = None  # start-up exception, if any
        self._loop: Optional[asyncio.AbstractEventLoop] = None

    def start(self) -> None:
        """Start the allocation server in a background thread.

        Blocks up to 10s until the server reports ready. Re-raises any
        start-up exception recorded by the thread, or raises RuntimeError
        if the wait timed out with no error.
        """
        self._thread = threading.Thread(
            target=self._run_server,
            name=f"GMSRPCServer-GPU{self.device}",
            daemon=True,  # don't block interpreter exit if stop() is skipped
        )
        self._thread.start()
        # Wait for server to be ready (socket file created)
        self._started.wait(timeout=10.0)
        if self._error is not None:
            raise self._error
        # Event still unset and no error recorded => the 10s wait timed out.
        if not self._started.is_set():
            raise RuntimeError("GMSRPCServer failed to start within timeout")

    def _run_server(self) -> None:
        """Run the server (called in background thread).

        The GMSRPCServer is async-based, so we create a new event loop for this thread.
        """
        try:
            # Create a new event loop for this thread
            self._loop = asyncio.new_event_loop()
            asyncio.set_event_loop(self._loop)

            self._server = GMSRPCServer(self.socket_path, device=self.device)

            # Start the server (creates the socket)
            self._loop.run_until_complete(self._server.start())
            logger.info(
                f"GMSRPCServer started on device {self.device} at {self.socket_path}"
            )
            self._started.set()

            # Run the main loop
            # NOTE(review): polls the server's private _running flag once per
            # second; the loop is only "running" inside each
            # run_until_complete slice, which matters for stop() below.
            while self._server._running:
                self._loop.run_until_complete(asyncio.sleep(1))

        except Exception as e:
            logger.error(f"GMSRPCServer error: {e}")
            self._error = e
            self._started.set()  # Unblock waiter even on error
        finally:
            if self._loop is not None:
                self._loop.close()

    def stop(self) -> None:
        """Stop the allocation server.

        Best-effort shutdown: flips the server's flags so the polling loop
        in _run_server() exits within ~1s, then joins the thread (5s cap).
        """
        if self._server is not None:
            logger.info(f"Stopping GMSRPCServer on device {self.device}")
            # Signal the server to stop - the loop in _run_server will exit
            # NOTE(review): reaches into GMSRPCServer private state
            # (_running/_shutdown/_condition); confirm these stay in sync
            # with the gpu_memory_service package internals.
            self._server._running = False
            self._server._shutdown = True
            # Wake any blocked waiters from the server's event loop
            # NOTE(review): is_running() is False between the 1s
            # run_until_complete slices above, so this wakeup may be skipped;
            # the 1s poll still bounds shutdown latency.
            if self._loop is not None and self._loop.is_running():

                async def _notify():
                    async with self._server._condition:
                        self._server._condition.notify_all()

                # Fire-and-forget: the returned future is intentionally not
                # awaited.
                asyncio.run_coroutine_threadsafe(_notify(), self._loop)
        if self._thread is not None and self._thread.is_alive():
            self._thread.join(timeout=5.0)


async def worker() -> None:
    """Run the GPU Memory Service server until a shutdown signal arrives.

    Parses the CLI config, launches the GMSRPCServer background thread,
    installs SIGTERM/SIGINT handlers, then blocks; on signal it stops the
    server thread before returning.
    """
    config = parse_args()

    # Verbose mode bumps both the root logger and this component's logger
    # to DEBUG.
    if config.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
        logging.getLogger("dynamo.gpu_memory_service").setLevel(logging.DEBUG)

    logger.info("Starting GPU Memory Service Server for device %s", config.device)
    logger.info("Socket path: %s", config.socket_path)

    loop = asyncio.get_running_loop()

    # A stale socket file left over from a previous run would prevent the
    # new server from binding.
    if config.socket_path and os.path.exists(config.socket_path):
        os.unlink(config.socket_path)
        logger.debug("Removed existing socket file: %s", config.socket_path)

    # Launch the allocation server in its own thread/event loop.
    server = GMSRPCServerThread(config.socket_path, config.device)
    server.start()

    # Block on this event until a signal handler sets it.
    stop_event = asyncio.Event()

    def _on_signal():
        logger.info("Received shutdown signal")
        stop_event.set()

    loop.add_signal_handler(signal.SIGTERM, _on_signal)
    loop.add_signal_handler(signal.SIGINT, _on_signal)

    logger.info("GPU Memory Service Server ready, waiting for connections...")
    logger.info(
        f"To connect vLLM workers, use: --load-format gpu_memory_service "
        f'--model-loader-extra-config \'{{"gpu_memory_service_socket_path": "{config.socket_path}"}}\''
    )

    # Wait for shutdown signal; always tear the server down on the way out.
    try:
        await stop_event.wait()
    finally:
        logger.info("Shutting down GPU Memory Service Server...")
        server.stop()
        logger.info("GPU Memory Service Server shutdown complete")


def main() -> None:
    """Entry point for GPU Memory Service server.

    Installs uvloop as the asyncio event loop implementation, then runs the
    async worker to completion.
    """
    uvloop.install()
    asyncio.run(worker())


if __name__ == "__main__":
    main()
10 changes: 8 additions & 2 deletions container/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,10 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
fi && \
/tmp/use-sccache.sh show-stats "Dynamo"

# Build gpu_memory_service wheel (C++ extension only needs Python headers, no CUDA/torch)
RUN source ${VIRTUAL_ENV}/bin/activate && \
uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service

##############################################
########## Runtime image ##############
##############################################
Expand Down Expand Up @@ -505,7 +509,8 @@ ARG ENABLE_KVBM
RUN uv pip install \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl && \
/opt/dynamo/wheelhouse/nixl/nixl*.whl \
/opt/dynamo/wheelhouse/gpu_memory_service*.whl && \
if [ "$ENABLE_KVBM" = "true" ]; then \
KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
if [ -z "$KVBM_WHEEL" ]; then \
Expand Down Expand Up @@ -596,7 +601,8 @@ ARG ENABLE_KVBM
RUN uv pip install \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl && \
/opt/dynamo/wheelhouse/nixl/nixl*.whl \
/opt/dynamo/wheelhouse/gpu_memory_service*.whl && \
if [ "$ENABLE_KVBM" = "true" ]; then \
KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
if [ -z "$KVBM_WHEEL" ]; then \
Expand Down
5 changes: 5 additions & 0 deletions container/Dockerfile.sglang
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,10 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
fi && \
/tmp/use-sccache.sh show-stats "Dynamo"

# Build gpu_memory_service wheel (C++ extension only needs Python headers, no CUDA/torch)
RUN source ${VIRTUAL_ENV}/bin/activate && \
uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service

##################################
########## Runtime Image #########
##################################
Expand Down Expand Up @@ -505,6 +509,7 @@ RUN --mount=type=bind,source=.,target=/mnt/local_src \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl \
/opt/dynamo/wheelhouse/gpu_memory_service*.whl \
sglang==${SGLANG_VERSION}

# Install common and test dependencies as root
Expand Down
Loading
Loading