Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .devcontainer/post-create.sh
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,8 @@ $SANITY_STATUS
Now build the project:
cargo build --locked --profile dev --features dynamo-llm/block-manager
cd lib/bindings/python && maturin develop --uv
DYNAMO_BIN_PATH=$CARGO_TARGET_DIR/debug uv pip install -e .
uv pip install -e lib/gpu_memory_service # GPU memory manager with C++ extension
DYNAMO_BIN_PATH=\$CARGO_TARGET_DIR/debug uv pip install -e .

Optional: cd lib/bindings/kvbm && maturin develop --uv # For KVBM support

Expand Down
7 changes: 6 additions & 1 deletion .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ container/Dockerfile*
.venv
.venv-docs

# GPU Memory Service build artifacts
lib/gpu_memory_service/build/
lib/gpu_memory_service/*.egg-info/
lib/gpu_memory_service/**/*.so

# Python
__pycache__/
*.pyc
Expand Down Expand Up @@ -75,4 +80,4 @@ docs/_build/

# AI
.cursor/
.claude/
.claude/
1 change: 1 addition & 0 deletions .github/filters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ core:
- 'components/src/dynamo/mocker/**'
- 'components/src/dynamo/frontend/**'
- 'components/src/dynamo/common/**'
- 'components/src/dynamo/gpu_memory_service/**'
- '*.toml'
- '*.lock'
- '*.py'
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ tensorrtllm_checkpoints/
tensorrtllm_engines/
api_server_models/
server/
!lib/gpu_memory_service/server/
# Replay/Snapshot test artifacts
*.new
lib/llm/tests/data/sample-models/models--meta-llama--Llama-3.1-70B-Instruct/
Expand Down
11 changes: 10 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,16 @@ cd lib/bindings/python
maturin develop --uv
```

## 6. Install the Wheel
## 6. Install GPU Memory Service

The GPU Memory Service is a Python package with a C++ extension. It requires only Python development headers and a C++ compiler (g++).

```bash
cd $PROJECT_ROOT
uv pip install -e lib/gpu_memory_service
```

## 7. Install the Wheel

```
cd $PROJECT_ROOT
Expand Down
41 changes: 41 additions & 0 deletions components/src/dynamo/gpu_memory_service/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""GPU Memory Service component for Dynamo.

This module provides the Dynamo component wrapper around the gpu_memory_service
package. The core functionality is in the gpu_memory_service package; this
module provides:
- CLI entry point (python -m dynamo.gpu_memory_service)
- Re-exports for backwards compatibility
"""

# Re-export core functionality from gpu_memory_service package
from gpu_memory_service import (
    GMSClientMemoryManager,
    StaleMemoryLayoutError,
    get_gms_client_memory_manager,
    get_or_create_gms_client_memory_manager,
)

# Re-export extensions (built separately)
# NOTE(review): _allocator_ext is underscore-prefixed but deliberately exported
# here (it is listed in __all__) so existing callers keep working.
from gpu_memory_service.client.torch.extensions import _allocator_ext

# Re-export module utilities
from gpu_memory_service.client.torch.module import (
    materialize_module_from_gms,
    register_module_tensors,
)

__all__ = [
    # Core
    "GMSClientMemoryManager",
    "StaleMemoryLayoutError",
    # GMS client memory manager
    "get_or_create_gms_client_memory_manager",
    "get_gms_client_memory_manager",
    # Tensor utilities
    "register_module_tensors",
    "materialize_module_from_gms",
    # Extensions
    "_allocator_ext",
]
7 changes: 7 additions & 0 deletions components/src/dynamo/gpu_memory_service/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Package entry point: allows `python -m dynamo.gpu_memory_service`."""

from dynamo.gpu_memory_service.server import main

if __name__ == "__main__":
    main()
66 changes: 66 additions & 0 deletions components/src/dynamo/gpu_memory_service/args.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Argument parsing for GPU Memory Service server component."""

import argparse
import logging
from dataclasses import dataclass

logger = logging.getLogger(__name__)


@dataclass
class Config:
    """Resolved configuration for the GPU Memory Service server.

    Produced by parse_args(); socket_path has already had any ``{device}``
    placeholder expanded.
    """

    # GPU Memory Service specific
    device: int  # CUDA device ID whose memory this server manages
    socket_path: str  # Unix domain socket path (placeholder already expanded)
    verbose: bool  # request DEBUG-level logging


def parse_args(argv: "list[str] | None" = None) -> Config:
    """Parse command line arguments for GPU Memory Service server.

    Args:
        argv: Argument list to parse. ``None`` (the default) preserves the
            original behavior of parsing ``sys.argv[1:]``; passing an
            explicit list makes the parser usable from tests and embedders.

    Returns:
        A fully-resolved :class:`Config` — the default socket path is
        generated from the device ID, and any ``{device}`` placeholder in a
        user-supplied path is expanded.
    """
    parser = argparse.ArgumentParser(
        description="GPU Memory Service allocation server for Dynamo."
    )

    # GPU Memory Service specific arguments
    parser.add_argument(
        "--device",
        type=int,
        required=True,
        help="CUDA device ID to manage memory for.",
    )
    parser.add_argument(
        "--socket-path",
        type=str,
        default=None,
        help="Path for Unix domain socket. Default: /tmp/gpu_memory_service_{device}.sock. "
        "Supports {device} placeholder for multi-GPU setups.",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging.",
    )

    args = parser.parse_args(argv)

    # Resolve the socket path: fall back to a per-device default, or expand
    # the {device} placeholder so multi-GPU launch scripts can share one
    # --socket-path template.
    if args.socket_path is None:
        socket_path = f"/tmp/gpu_memory_service_{args.device}.sock"
    else:
        socket_path = args.socket_path.format(device=args.device)

    return Config(
        device=args.device,
        socket_path=socket_path,
        verbose=args.verbose,
    )
169 changes: 169 additions & 0 deletions components/src/dynamo/gpu_memory_service/server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""GPU Memory Service allocation server component for Dynamo.

This component wraps the GMSRPCServer from gpu_memory_service to manage
GPU memory allocations with connection-based RW/RO locking.

Workers connect via the socket path, which should be passed to vLLM/SGLang via:
--load-format gpu_memory_service
--model-loader-extra-config '{"gpu_memory_service_socket_path": "/tmp/gpu_memory_service_{device}.sock"}'

Usage:
python -m dynamo.gpu_memory_service --device 0
python -m dynamo.gpu_memory_service --device 0 --socket-path /tmp/gpu_memory_service_{device}.sock
"""

import asyncio
import logging
import os
import signal
import threading
from typing import Optional

import uvloop
from gpu_memory_service.server import GMSRPCServer

from .args import parse_args

logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


class GMSRPCServerThread:
    """Wrapper to run GMSRPCServer in a background thread.

    GMSRPCServer is asyncio-based; this wrapper gives it a dedicated daemon
    thread with its own event loop so the caller's loop stays free.
    """

    def __init__(self, socket_path: str, device: int):
        self.socket_path = socket_path
        self.device = device
        # Written by the background thread; _started/_error hand start-up
        # results back to the thread calling start().
        self._server: Optional[GMSRPCServer] = None
        self._thread: Optional[threading.Thread] = None
        self._started = threading.Event()  # set once start-up finished (or failed)
        self._error: Optional[Exception] = None  # start-up exception, if any
        self._loop: Optional[asyncio.AbstractEventLoop] = None

    def start(self) -> None:
        """Start the allocation server in a background thread.

        Blocks up to 10s until the server reports ready. Re-raises any
        start-up exception recorded by the thread, or raises RuntimeError
        if the wait timed out with no error.
        """
        self._thread = threading.Thread(
            target=self._run_server,
            name=f"GMSRPCServer-GPU{self.device}",
            daemon=True,  # don't block interpreter exit if stop() is skipped
        )
        self._thread.start()
        # Wait for server to be ready (socket file created)
        self._started.wait(timeout=10.0)
        if self._error is not None:
            raise self._error
        # Event still unset and no error recorded => the 10s wait timed out.
        if not self._started.is_set():
            raise RuntimeError("GMSRPCServer failed to start within timeout")

    def _run_server(self) -> None:
        """Run the server (called in background thread).

        The GMSRPCServer is async-based, so we create a new event loop for this thread.
        """
        try:
            # Create a new event loop for this thread
            self._loop = asyncio.new_event_loop()
            asyncio.set_event_loop(self._loop)

            self._server = GMSRPCServer(self.socket_path, device=self.device)

            # Start the server (creates the socket)
            self._loop.run_until_complete(self._server.start())
            logger.info(
                f"GMSRPCServer started on device {self.device} at {self.socket_path}"
            )
            self._started.set()

            # Run the main loop
            # NOTE(review): polls the server's private _running flag once per
            # second; the loop is only "running" inside each
            # run_until_complete slice, which matters for stop() below.
            while self._server._running:
                self._loop.run_until_complete(asyncio.sleep(1))

        except Exception as e:
            logger.error(f"GMSRPCServer error: {e}")
            self._error = e
            self._started.set()  # Unblock waiter even on error
        finally:
            if self._loop is not None:
                self._loop.close()

    def stop(self) -> None:
        """Stop the allocation server.

        Best-effort shutdown: flips the server's flags so the polling loop
        in _run_server() exits within ~1s, then joins the thread (5s cap).
        """
        if self._server is not None:
            logger.info(f"Stopping GMSRPCServer on device {self.device}")
            # Signal the server to stop - the loop in _run_server will exit
            # NOTE(review): reaches into GMSRPCServer private state
            # (_running/_shutdown/_condition); confirm these stay in sync
            # with the gpu_memory_service package internals.
            self._server._running = False
            self._server._shutdown = True
            # Wake any blocked waiters from the server's event loop
            # NOTE(review): is_running() is False between the 1s
            # run_until_complete slices above, so this wakeup may be skipped;
            # the 1s poll still bounds shutdown latency.
            if self._loop is not None and self._loop.is_running():

                async def _notify():
                    async with self._server._condition:
                        self._server._condition.notify_all()

                # Fire-and-forget: the returned future is intentionally not
                # awaited.
                asyncio.run_coroutine_threadsafe(_notify(), self._loop)
        if self._thread is not None and self._thread.is_alive():
            self._thread.join(timeout=5.0)


async def worker() -> None:
    """Run the GPU Memory Service server until a shutdown signal arrives.

    Parses the CLI config, launches the GMSRPCServer background thread,
    installs SIGTERM/SIGINT handlers, then blocks; on signal it stops the
    server thread before returning.
    """
    config = parse_args()

    # Verbose mode bumps both the root logger and this component's logger
    # to DEBUG.
    if config.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
        logging.getLogger("dynamo.gpu_memory_service").setLevel(logging.DEBUG)

    logger.info("Starting GPU Memory Service Server for device %s", config.device)
    logger.info("Socket path: %s", config.socket_path)

    loop = asyncio.get_running_loop()

    # A stale socket file left over from a previous run would prevent the
    # new server from binding.
    if config.socket_path and os.path.exists(config.socket_path):
        os.unlink(config.socket_path)
        logger.debug("Removed existing socket file: %s", config.socket_path)

    # Launch the allocation server in its own thread/event loop.
    server = GMSRPCServerThread(config.socket_path, config.device)
    server.start()

    # Block on this event until a signal handler sets it.
    stop_event = asyncio.Event()

    def _on_signal():
        logger.info("Received shutdown signal")
        stop_event.set()

    loop.add_signal_handler(signal.SIGTERM, _on_signal)
    loop.add_signal_handler(signal.SIGINT, _on_signal)

    logger.info("GPU Memory Service Server ready, waiting for connections...")
    logger.info(
        f"To connect vLLM workers, use: --load-format gpu_memory_service "
        f'--model-loader-extra-config \'{{"gpu_memory_service_socket_path": "{config.socket_path}"}}\''
    )

    # Wait for shutdown signal; always tear the server down on the way out.
    try:
        await stop_event.wait()
    finally:
        logger.info("Shutting down GPU Memory Service Server...")
        server.stop()
        logger.info("GPU Memory Service Server shutdown complete")


def main() -> None:
    """Entry point for GPU Memory Service server.

    Installs uvloop as the asyncio event loop implementation, then runs the
    async worker to completion.
    """
    uvloop.install()
    asyncio.run(worker())


if __name__ == "__main__":
    main()
10 changes: 8 additions & 2 deletions container/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,10 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
fi && \
/tmp/use-sccache.sh show-stats "Dynamo"

# Build gpu_memory_service wheel (C++ extension only needs Python headers, no CUDA/torch)
RUN source ${VIRTUAL_ENV}/bin/activate && \
uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service

##############################################
########## Runtime image ##############
##############################################
Expand Down Expand Up @@ -505,7 +509,8 @@ ARG ENABLE_KVBM
RUN uv pip install \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl && \
/opt/dynamo/wheelhouse/nixl/nixl*.whl \
/opt/dynamo/wheelhouse/gpu_memory_service*.whl && \
if [ "$ENABLE_KVBM" = "true" ]; then \
KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
if [ -z "$KVBM_WHEEL" ]; then \
Expand Down Expand Up @@ -596,7 +601,8 @@ ARG ENABLE_KVBM
RUN uv pip install \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl && \
/opt/dynamo/wheelhouse/nixl/nixl*.whl \
/opt/dynamo/wheelhouse/gpu_memory_service*.whl && \
if [ "$ENABLE_KVBM" = "true" ]; then \
KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
if [ -z "$KVBM_WHEEL" ]; then \
Expand Down
5 changes: 5 additions & 0 deletions container/Dockerfile.sglang
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,10 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
fi && \
/tmp/use-sccache.sh show-stats "Dynamo"

# Build gpu_memory_service wheel (C++ extension only needs Python headers, no CUDA/torch)
RUN source ${VIRTUAL_ENV}/bin/activate && \
uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service

##################################
########## Runtime Image #########
##################################
Expand Down Expand Up @@ -505,6 +509,7 @@ RUN --mount=type=bind,source=.,target=/mnt/local_src \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl \
/opt/dynamo/wheelhouse/gpu_memory_service*.whl \
sglang==${SGLANG_VERSION}

# Install common and test dependencies as root
Expand Down
Loading
Loading