Skip to content
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .devcontainer/post-create.sh
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,8 @@ $SANITY_STATUS
Now build the project:
cargo build --locked --profile dev --features dynamo-llm/block-manager
cd lib/bindings/python && maturin develop --uv
DYNAMO_BIN_PATH=$CARGO_TARGET_DIR/debug uv pip install -e .
uv pip install -e lib/gpu_memory_service # GPU memory manager with C++ extension
DYNAMO_BIN_PATH=\$CARGO_TARGET_DIR/debug uv pip install -e .

Optional: cd lib/bindings/kvbm && maturin develop --uv # For KVBM support

Expand Down
4 changes: 4 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ container/Dockerfile*
.venv
.venv-docs

# GPU Memory Service build artifacts
lib/gpu_memory_service/build/
lib/gpu_memory_service/*.egg-info/
lib/gpu_memory_service/**/*.so

# Python
__pycache__/
Expand Down
1 change: 1 addition & 0 deletions .github/filters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ core:
- 'components/src/dynamo/mocker/**'
- 'components/src/dynamo/frontend/**'
- 'components/src/dynamo/common/**'
- 'components/src/dynamo/gpu_memory_service/**'
- '*.toml'
- '*.lock'
- '*.py'
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ tensorrtllm_checkpoints/
tensorrtllm_engines/
api_server_models/
server/
!lib/gpu_memory_service/server/
# Replay/Snapshot test artifacts
*.new
lib/llm/tests/data/sample-models/models--meta-llama--Llama-3.1-70B-Instruct/
Expand Down
11 changes: 10 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,16 @@ cd lib/bindings/python
maturin develop --uv
```

## 6. Install the Wheel
## 6. Install GPU Memory Service

The GPU Memory Service is a Python package with a C++ extension. It requires only Python development headers and a C++ compiler (g++).

```bash
cd $PROJECT_ROOT
uv pip install -e lib/gpu_memory_service
```

## 7. Install the Wheel

```
cd $PROJECT_ROOT
Expand Down
41 changes: 41 additions & 0 deletions components/src/dynamo/gpu_memory_service/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""GPU Memory Service component for Dynamo.

This module provides the Dynamo component wrapper around the gpu_memory_service package.
The core functionality is in the gpu_memory package; this module provides:
- CLI entry point (python -m dynamo.gpu_memory_service)
- Re-exports for backwards compatibility
"""

# Re-export core functionality from gpu_memory_service package
from gpu_memory_service import (
GMSClientMemoryManager,
StaleMemoryLayoutError,
get_gms_client_memory_manager,
get_or_create_gms_client_memory_manager,
)

# Re-export extensions (built separately)
from gpu_memory_service.client.torch.extensions import _allocator_ext

# Re-export module utilities
from gpu_memory_service.client.torch.module import (
materialize_module_from_gms,
register_module_tensors,
)

__all__ = [
# Core
"GMSClientMemoryManager",
"StaleMemoryLayoutError",
# GMS client memory manager
"get_or_create_gms_client_memory_manager",
"get_gms_client_memory_manager",
# Tensor utilities
"register_module_tensors",
"materialize_module_from_gms",
# Extensions
"_allocator_ext",
]
7 changes: 7 additions & 0 deletions components/src/dynamo/gpu_memory_service/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Module entry point: enables `python -m dynamo.gpu_memory_service`.
from dynamo.gpu_memory_service.server import main

if __name__ == "__main__":
    main()
66 changes: 66 additions & 0 deletions components/src/dynamo/gpu_memory_service/args.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Argument parsing for GPU Memory Service server component."""

import argparse
import logging
from dataclasses import dataclass

logger = logging.getLogger(__name__)


@dataclass
class Config:
    """Configuration for GPU Memory Service server."""

    # CUDA device ID whose memory this server instance manages.
    device: int
    # Resolved Unix domain socket path that workers connect to.
    socket_path: str
    # Whether DEBUG-level logging was requested via --verbose/-v.
    verbose: bool


def parse_args(argv: "list[str] | None" = None) -> Config:
    """Parse command line arguments for GPU Memory Service server.

    Args:
        argv: Argument list to parse. When ``None`` (the default),
            ``sys.argv[1:]`` is used, preserving the original CLI behavior.
            Accepting an explicit list makes this function testable.

    Returns:
        A fully-resolved :class:`Config`; the socket path is defaulted when
        omitted and any ``{device}`` placeholder is expanded.
    """
    parser = argparse.ArgumentParser(
        description="GPU Memory Service allocation server for Dynamo."
    )

    # GPU Memory Service specific arguments
    parser.add_argument(
        "--device",
        type=int,
        required=True,
        help="CUDA device ID to manage memory for.",
    )
    parser.add_argument(
        "--socket-path",
        type=str,
        default=None,
        help="Path for Unix domain socket. Default: /tmp/gpu_memory_service_{device}.sock. "
        "Supports {device} placeholder for multi-GPU setups.",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging.",
    )

    args = parser.parse_args(argv)

    # Resolve the socket path: default when omitted, otherwise expand the
    # {device} placeholder. str.replace is used instead of str.format so a
    # path containing unrelated braces (e.g. "{}" or "{port}") does not
    # raise KeyError/IndexError.
    if args.socket_path is None:
        socket_path = f"/tmp/gpu_memory_service_{args.device}.sock"
    else:
        socket_path = args.socket_path.replace("{device}", str(args.device))

    return Config(
        device=args.device,
        socket_path=socket_path,
        verbose=args.verbose,
    )
83 changes: 83 additions & 0 deletions components/src/dynamo/gpu_memory_service/server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""GPU Memory Service allocation server component for Dynamo.

This component wraps the GMSRPCServer from gpu_memory_service to manage
GPU memory allocations with connection-based RW/RO locking.

Workers connect via the socket path, which should be passed to vLLM/SGLang via:
--load-format gpu_memory_service
--model-loader-extra-config '{"gpu_memory_service_socket_path": "/tmp/gpu_memory_service_{device}.sock"}'

Usage:
python -m dynamo.gpu_memory_service --device 0
python -m dynamo.gpu_memory_service --device 0 --socket-path /tmp/gpu_memory_service_{device}.sock
"""

import asyncio
import logging
import signal

import uvloop
from gpu_memory_service.server import GMSRPCServer

from .args import parse_args

logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


async def worker() -> None:
    """Run the GPU Memory Service RPC server until SIGTERM/SIGINT arrives."""
    cfg = parse_args()

    # Verbose mode raises both the root logger and this component's logger
    # to DEBUG.
    if cfg.verbose:
        for logger_name in (None, "dynamo.gpu_memory_service"):
            logging.getLogger(logger_name).setLevel(logging.DEBUG)

    logger.info(f"Starting GPU Memory Service Server for device {cfg.device}")
    logger.info(f"Socket path: {cfg.socket_path}")

    server = GMSRPCServer(cfg.socket_path, device=cfg.device)

    # Shutdown is driven by an event flipped from the signal handlers.
    stop_requested = asyncio.Event()

    def _on_signal() -> None:
        logger.info("Received shutdown signal")
        stop_requested.set()

    running_loop = asyncio.get_running_loop()
    running_loop.add_signal_handler(signal.SIGTERM, _on_signal)
    running_loop.add_signal_handler(signal.SIGINT, _on_signal)

    await server.start()

    logger.info("GPU Memory Service Server ready, waiting for connections...")
    logger.info(
        f"To connect vLLM workers, use: --load-format gpu_memory_service "
        f'--model-loader-extra-config \'{{"gpu_memory_service_socket_path": "{cfg.socket_path}"}}\''
    )

    try:
        # Block here until a termination signal sets the event.
        await stop_requested.wait()
    finally:
        logger.info("Shutting down GPU Memory Service Server...")
        await server.stop()
        logger.info("GPU Memory Service Server shutdown complete")


def main() -> None:
    """Entry point for GPU Memory Service server.

    Installs uvloop as the asyncio event-loop policy, then runs the async
    worker to completion (i.e. until a shutdown signal stops the server).
    """
    uvloop.install()
    asyncio.run(worker())


if __name__ == "__main__":
    main()
26 changes: 26 additions & 0 deletions container/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ ARG EPP_IMAGE="us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inferen

ARG PYTHON_VERSION
ARG ENABLE_KVBM
ARG ENABLE_GPU_MEMORY_SERVICE
ARG ENABLE_MEDIA_NIXL
ARG ENABLE_MEDIA_FFMPEG
ARG CARGO_BUILD_JOBS
Expand Down Expand Up @@ -431,6 +432,13 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
fi && \
/tmp/use-sccache.sh show-stats "Dynamo"

# Build gpu_memory_service wheel (C++ extension only needs Python headers, no CUDA/torch)
ARG ENABLE_GPU_MEMORY_SERVICE
RUN if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
source ${VIRTUAL_ENV}/bin/activate && \
uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service; \
fi

##############################################
########## Runtime image ##############
##############################################
Expand Down Expand Up @@ -502,10 +510,19 @@ ENV VIRTUAL_ENV=/opt/dynamo/venv \

# Install dynamo wheels (runtime packages only, no test dependencies)
ARG ENABLE_KVBM
ARG ENABLE_GPU_MEMORY_SERVICE
RUN uv pip install \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl && \
if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
if [ -z "$GMS_WHEEL" ]; then \
echo "ERROR: ENABLE_GPU_MEMORY_SERVICE is true but no gpu_memory_service wheel found in wheelhouse" >&2; \
exit 1; \
fi; \
uv pip install "$GMS_WHEEL"; \
fi && \
if [ "$ENABLE_KVBM" = "true" ]; then \
KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
if [ -z "$KVBM_WHEEL" ]; then \
Expand Down Expand Up @@ -593,10 +610,19 @@ RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requi
--requirement /tmp/requirements.test.txt

ARG ENABLE_KVBM
ARG ENABLE_GPU_MEMORY_SERVICE
RUN uv pip install \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl && \
if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
if [ -z "$GMS_WHEEL" ]; then \
echo "ERROR: ENABLE_GPU_MEMORY_SERVICE is true but no gpu_memory_service wheel found in wheelhouse" >&2; \
exit 1; \
fi; \
uv pip install "$GMS_WHEEL"; \
fi && \
if [ "$ENABLE_KVBM" = "true" ]; then \
KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
if [ -z "$KVBM_WHEEL" ]; then \
Expand Down
19 changes: 18 additions & 1 deletion container/Dockerfile.sglang
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ ARG BASE_IMAGE_TAG

ARG PYTHON_VERSION
ARG ENABLE_KVBM
ARG ENABLE_GPU_MEMORY_SERVICE
ARG ENABLE_MEDIA_NIXL
ARG ENABLE_MEDIA_FFMPEG
ARG CARGO_BUILD_JOBS
Expand Down Expand Up @@ -442,6 +443,13 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
fi && \
/tmp/use-sccache.sh show-stats "Dynamo"

# Build gpu_memory_service wheel (C++ extension only needs Python headers, no CUDA/torch)
ARG ENABLE_GPU_MEMORY_SERVICE
RUN if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
source ${VIRTUAL_ENV}/bin/activate && \
uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service; \
fi

##################################
########## Runtime Image #########
##################################
Expand Down Expand Up @@ -500,12 +508,21 @@ COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /workspace/nixl/build/src

ENV SGLANG_VERSION="${RUNTIME_IMAGE_TAG%%-*}"
# Install packages as root to ensure they go to system location (/usr/local/lib/python3.12/dist-packages)
ARG ENABLE_GPU_MEMORY_SERVICE
RUN --mount=type=bind,source=.,target=/mnt/local_src \
pip install --no-cache-dir --break-system-packages \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl \
sglang==${SGLANG_VERSION}
sglang==${SGLANG_VERSION} && \
if [ "${ENABLE_GPU_MEMORY_SERVICE}" = "true" ]; then \
GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
if [ -z "$GMS_WHEEL" ]; then \
echo "ERROR: ENABLE_GPU_MEMORY_SERVICE is true but no gpu_memory_service wheel found in wheelhouse" >&2; \
exit 1; \
fi; \
pip install --no-cache-dir --break-system-packages "$GMS_WHEEL"; \
fi

# Install common and test dependencies as root
RUN --mount=type=bind,source=.,target=/mnt/local_src \
Expand Down
Loading
Loading