
Commit 499bc45

galletas1712 and claude committed

Add GPU Memory Service core library and component wrapper

Add the GPU Memory Service (GMS) library for distributed GPU memory management. This provides:

- lib/gpu_memory_service/: Core GMS library
  - Server component for managing GPU memory lifecycle
  - Client component for RPC communication
  - CUDA VMM utilities for virtual memory management
  - PyTorch extensions for custom allocators
- components/src/dynamo/gpu_memory_service/: Dynamo component wrapper
  - CLI entry point (python -m dynamo.gpu_memory_service)
  - Standalone server for managing GPU memory
- Dockerfile changes to build and install the GMS wheel
- Dev container and ignore file updates for GMS build artifacts

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

1 parent 07721d1 · commit 499bc45

31 files changed: +5987 −2 lines

.devcontainer/post-create.sh

Lines changed: 11 additions & 0 deletions

@@ -66,6 +66,16 @@ mkdir -p $CARGO_TARGET_DIR
 
 # Note: Build steps moved to after sanity check - see instructions at the end
 
+# Install gpu_memory_service package and build CUDA extensions
+# First uninstall any existing install from the Docker image, then do editable install
+# Use --no-build-isolation so uv uses the current environment (with PyTorch) instead of an isolated one
+echo "Installing gpu_memory_service package..."
+uv pip uninstall gpu-memory-service 2>/dev/null || true
+# Use dynamic path detection instead of hardcoded python3.12
+PYTHON_SITE_PACKAGES=$(python3 -c "import sysconfig; print(sysconfig.get_path('purelib'))")
+rm -rf "${PYTHON_SITE_PACKAGES}/gpu_memory_service/" 2>/dev/null || true
+uv pip install --no-build-isolation -e $WORKSPACE_DIR/lib/gpu_memory_service
+
 { set +x; } 2>/dev/null
 
 echo -e "\n" >> ~/.bashrc
@@ -107,6 +117,7 @@ cat <<EOF
 ========================================
 $SANITY_STATUS
 ✅ Pre-commit hooks configured
+✅ gpu_memory_service package installed with CUDA extensions
 
 Now build the project:
   cargo build --locked --profile dev --features dynamo-llm/block-manager
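The dynamic site-packages lookup above is the portable way to find where a wheel was installed without hardcoding a Python version in the path. A minimal sketch of what that one-liner resolves to (the printed path is illustrative; it depends on the interpreter and environment):

import sysconfig

# "purelib" is the install directory for pure-Python packages, e.g.
# /usr/lib/python3.12/site-packages or <venv>/lib/python3.X/site-packages.
site_packages = sysconfig.get_path("purelib")
print(site_packages)

The post-create script deletes any gpu_memory_service tree found there (left over from the Docker-image install) so the subsequent `uv pip install -e` can link the editable checkout without conflict.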

.dockerignore

Lines changed: 6 additions & 1 deletion

@@ -41,4 +41,9 @@
 **/*safetensors
 container/Dockerfile*
 .venv
-.venv-docs
+.venv-docs
+
+# GPU Memory Service build artifacts
+lib/gpu_memory_service/build/
+lib/gpu_memory_service/*.egg-info/
+lib/gpu_memory_service/**/*.so

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -57,6 +57,7 @@ tensorrtllm_checkpoints/
 tensorrtllm_engines/
 api_server_models/
 server/
+!lib/gpu_memory_service/server/
 # Replay/Snapshot test artifacts
 *.new
 lib/llm/tests/data/sample-models/models--meta-llama--Llama-3.1-70B-Instruct/
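For context on the new pattern: a trailing-slash entry like server/ in .gitignore matches any directory named server at any depth, and the ! prefix re-includes the one specific directory lib/gpu_memory_service/server/. The negation works here because none of that directory's parents are themselves excluded; git cannot re-include a path whose parent directory is ignored.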
components/src/dynamo/gpu_memory_service/__init__.py

Lines changed: 45 additions & 0 deletions

@@ -0,0 +1,45 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""GPU Memory Service component for Dynamo.

This module provides the Dynamo component wrapper around the gpu_memory_service package.
The core functionality is in the gpu_memory_service package; this module provides:
- CLI entry point (python -m dynamo.gpu_memory_service)
- Re-exports for backwards compatibility
"""

# Re-export core functionality from gpu_memory_service package
from gpu_memory_service import (
    GMSClientMemoryManager,
    StaleWeightsError,
    get_allocator,
    get_or_create_allocator,
)

# Re-export extensions (built separately)
from gpu_memory_service.client.torch.extensions import (
    _allocator_ext,
    _tensor_from_pointer,
)

# Re-export tensor utilities
from gpu_memory_service.client.torch.tensor import (
    materialize_module_from_gms,
    register_module_tensors,
)

__all__ = [
    # Core allocator
    "GMSClientMemoryManager",
    "StaleWeightsError",
    # Lifecycle management
    "get_or_create_allocator",
    "get_allocator",
    # Tensor utilities
    "register_module_tensors",
    "materialize_module_from_gms",
    # Extensions
    "_allocator_ext",
    "_tensor_from_pointer",
]
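Because these are plain re-exports, both import paths resolve to the same objects. A quick illustrative check (assuming both the core package and the Dynamo wrapper are installed):

import gpu_memory_service
import dynamo.gpu_memory_service as wrapper

# The wrapper re-exports, rather than redefines, the core symbols,
# so identity (not just equality) holds:
assert wrapper.GMSClientMemoryManager is gpu_memory_service.GMSClientMemoryManager
assert wrapper.get_allocator is gpu_memory_service.get_allocator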
components/src/dynamo/gpu_memory_service/__main__.py

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from dynamo.gpu_memory_service.server import main

if __name__ == "__main__":
    main()
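This file is what backs the CLI entry point named in the package docstring: `python -m dynamo.gpu_memory_service` executes the package's __main__ module, which simply delegates to the server's main(); all flag handling lives in args.py below.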
components/src/dynamo/gpu_memory_service/args.py

Lines changed: 66 additions & 0 deletions

@@ -0,0 +1,66 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Argument parsing for GPU Memory Service server component."""

import argparse
import logging
from dataclasses import dataclass

logger = logging.getLogger(__name__)


@dataclass
class Config:
    """Configuration for GPU Memory Service server."""

    # GPU Memory Service specific
    device: int
    socket_path: str
    verbose: bool


def parse_args() -> Config:
    """Parse command line arguments for GPU Memory Service server."""
    parser = argparse.ArgumentParser(
        description="GPU Memory Service allocation server for Dynamo."
    )

    # GPU Memory Service specific arguments
    parser.add_argument(
        "--device",
        type=int,
        required=True,
        help="CUDA device ID to manage memory for.",
    )
    parser.add_argument(
        "--socket-path",
        type=str,
        default=None,
        help="Path for Unix domain socket. Default: /tmp/gpu_memory_service_{device}.sock. "
        "Supports {device} placeholder for multi-GPU setups.",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging.",
    )

    args = parser.parse_args()

    # Generate default socket path if not provided
    socket_path = args.socket_path
    if socket_path is None:
        socket_path = f"/tmp/gpu_memory_service_{args.device}.sock"
    else:
        # Expand {device} placeholder
        socket_path = socket_path.format(device=args.device)

    config = Config(
        device=args.device,
        socket_path=socket_path,
        verbose=args.verbose,
    )

    return config
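The {device} placeholder is expanded with str.format, so a single templated --socket-path value yields a distinct Unix socket per GPU. A minimal sketch of the expansion (socket paths illustrative, matching the default template):

# One server process per GPU, each with its own Unix domain socket.
template = "/tmp/gpu_memory_service_{device}.sock"
for device in (0, 1):
    print(template.format(device=device))
# /tmp/gpu_memory_service_0.sock
# /tmp/gpu_memory_service_1.sock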
components/src/dynamo/gpu_memory_service/server.py

Lines changed: 169 additions & 0 deletions

@@ -0,0 +1,169 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""GPU Memory Service allocation server component for Dynamo.

This component wraps the GMSRPCServer from gpu_memory_service to manage
GPU memory allocations with connection-based RW/RO locking.

Workers connect via the socket path, which should be passed to vLLM/SGLang via:
    --load-format gpu_memory_service
    --model-loader-extra-config '{"gpu_memory_service_socket_path": "/tmp/gpu_memory_service_{device}.sock"}'

Usage:
    python -m dynamo.gpu_memory_service --device 0
    python -m dynamo.gpu_memory_service --device 0 --socket-path /tmp/gpu_memory_service_{device}.sock
"""

import asyncio
import logging
import os
import signal
import threading
from typing import Optional

import uvloop
from gpu_memory_service.server import GMSRPCServer

from .args import parse_args

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


class GMSRPCServerThread:
    """Wrapper to run GMSRPCServer in a background thread."""

    def __init__(self, socket_path: str, device: int):
        self.socket_path = socket_path
        self.device = device
        self._server: Optional[GMSRPCServer] = None
        self._thread: Optional[threading.Thread] = None
        self._started = threading.Event()
        self._error: Optional[Exception] = None
        self._loop: Optional[asyncio.AbstractEventLoop] = None

    def start(self) -> None:
        """Start the allocation server in a background thread."""
        self._thread = threading.Thread(
            target=self._run_server,
            name=f"GMSRPCServer-GPU{self.device}",
            daemon=True,
        )
        self._thread.start()
        # Wait for server to be ready (socket file created)
        self._started.wait(timeout=10.0)
        if self._error is not None:
            raise self._error
        if not self._started.is_set():
            raise RuntimeError("GMSRPCServer failed to start within timeout")

    def _run_server(self) -> None:
        """Run the server (called in background thread).

        The GMSRPCServer is async-based, so we create a new event loop for this thread.
        """
        try:
            # Create a new event loop for this thread
            self._loop = asyncio.new_event_loop()
            asyncio.set_event_loop(self._loop)

            self._server = GMSRPCServer(self.socket_path, device=self.device)

            # Start the server (creates the socket)
            self._loop.run_until_complete(self._server.start())
            logger.info(
                f"GMSRPCServer started on device {self.device} at {self.socket_path}"
            )
            self._started.set()

            # Run the main loop
            while self._server._running:
                self._loop.run_until_complete(asyncio.sleep(1))

        except Exception as e:
            logger.error(f"GMSRPCServer error: {e}")
            self._error = e
            self._started.set()  # Unblock waiter even on error
        finally:
            if self._loop is not None:
                self._loop.close()

    def stop(self) -> None:
        """Stop the allocation server."""
        if self._server is not None:
            logger.info(f"Stopping GMSRPCServer on device {self.device}")
            # Signal the server to stop - the loop in _run_server will exit
            self._server._running = False
            self._server._shutdown = True
            # Wake any blocked waiters from the server's event loop
            if self._loop is not None and self._loop.is_running():

                async def _notify():
                    async with self._server._condition:
                        self._server._condition.notify_all()

                asyncio.run_coroutine_threadsafe(_notify(), self._loop)
        if self._thread is not None and self._thread.is_alive():
            self._thread.join(timeout=5.0)


async def worker() -> None:
    """Main async worker function."""
    config = parse_args()

    # Configure logging level
    if config.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
        logging.getLogger("dynamo.gpu_memory_service").setLevel(logging.DEBUG)

    logger.info(f"Starting GPU Memory Service Server for device {config.device}")
    logger.info(f"Socket path: {config.socket_path}")

    loop = asyncio.get_running_loop()

    # Clean up any existing socket file
    if config.socket_path and os.path.exists(config.socket_path):
        os.unlink(config.socket_path)
        logger.debug(f"Removed existing socket file: {config.socket_path}")

    # Start GMSRPCServer in a background thread
    server = GMSRPCServerThread(config.socket_path, config.device)
    server.start()

    # Set up shutdown event
    shutdown_event = asyncio.Event()

    def signal_handler():
        logger.info("Received shutdown signal")
        shutdown_event.set()

    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, signal_handler)

    logger.info("GPU Memory Service Server ready, waiting for connections...")
    logger.info(
        f"To connect vLLM workers, use: --load-format gpu_memory_service "
        f'--model-loader-extra-config \'{{"gpu_memory_service_socket_path": "{config.socket_path}"}}\''
    )

    # Wait for shutdown signal
    try:
        await shutdown_event.wait()
    finally:
        logger.info("Shutting down GPU Memory Service Server...")
        server.stop()
        logger.info("GPU Memory Service Server shutdown complete")


def main() -> None:
    """Entry point for GPU Memory Service server."""
    uvloop.install()
    asyncio.run(worker())


if __name__ == "__main__":
    main()
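GMSRPCServerThread follows a common pattern for hosting an asyncio server behind a synchronous interface: a dedicated thread owns its own event loop, readiness is signaled through a threading.Event, and cross-thread handoff uses asyncio.run_coroutine_threadsafe. A stripped-down sketch of just that pattern, independent of GMS (class and method names here are illustrative):

import asyncio
import threading
from typing import Optional


class BackgroundLoop:
    """Own an asyncio event loop in a dedicated daemon thread."""

    def __init__(self) -> None:
        self._ready = threading.Event()
        self._loop: Optional[asyncio.AbstractEventLoop] = None
        self._thread = threading.Thread(target=self._run, daemon=True)

    def _run(self) -> None:
        # Each thread needs its own loop; the caller's loop is untouched.
        self._loop = asyncio.new_event_loop()
        asyncio.set_event_loop(self._loop)
        self._ready.set()  # Unblock start() once the loop exists
        self._loop.run_forever()

    def start(self) -> None:
        self._thread.start()
        if not self._ready.wait(timeout=10.0):
            raise RuntimeError("background loop failed to start")

    def submit(self, coro):
        # Thread-safe handoff into the background loop; returns a
        # concurrent.futures.Future whose .result() blocks until done.
        return asyncio.run_coroutine_threadsafe(coro, self._loop)

    def stop(self) -> None:
        self._loop.call_soon_threadsafe(self._loop.stop)
        self._thread.join(timeout=5.0)

Usage mirrors the component above: bg = BackgroundLoop(); bg.start(); bg.submit(asyncio.sleep(1)).result(); bg.stop(). The real GMSRPCServerThread additionally propagates startup errors (re-raising them from start()) and shuts down cooperatively via the server's _running flag rather than stopping the loop directly.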
