Skip to content
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .devcontainer/post-create.sh
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,8 @@ $SANITY_STATUS
Now build the project:
cargo build --locked --profile dev --features dynamo-llm/block-manager
cd lib/bindings/python && maturin develop --uv
DYNAMO_BIN_PATH=$CARGO_TARGET_DIR/debug uv pip install -e .
uv pip install -e lib/gpu_memory_service # GPU memory manager with C++ extension
DYNAMO_BIN_PATH=\$CARGO_TARGET_DIR/debug uv pip install -e .

Optional: cd lib/bindings/kvbm && maturin develop --uv # For KVBM support

Expand Down
4 changes: 4 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ container/Dockerfile*
.venv
.venv-docs

# GPU Memory Service build artifacts
lib/gpu_memory_service/build/
lib/gpu_memory_service/*.egg-info/
lib/gpu_memory_service/**/*.so

# Python
__pycache__/
Expand Down
1 change: 1 addition & 0 deletions .github/filters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ core:
- 'components/src/dynamo/mocker/**'
- 'components/src/dynamo/frontend/**'
- 'components/src/dynamo/common/**'
- 'components/src/dynamo/gpu_memory_service/**'
- '*.toml'
- '*.lock'
- '*.py'
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ tensorrtllm_checkpoints/
tensorrtllm_engines/
api_server_models/
server/
!lib/gpu_memory_service/server/
# Replay/Snapshot test artifacts
*.new
lib/llm/tests/data/sample-models/models--meta-llama--Llama-3.1-70B-Instruct/
Expand Down
11 changes: 10 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,16 @@ cd lib/bindings/python
maturin develop --uv
```

## 6. Install the Wheel
## 6. Install GPU Memory Service

The GPU Memory Service is a Python package with a C++ extension. It requires only Python development headers and a C++ compiler (g++).

```bash
cd $PROJECT_ROOT
uv pip install -e lib/gpu_memory_service
```

## 7. Install the Wheel

```
cd $PROJECT_ROOT
Expand Down
41 changes: 41 additions & 0 deletions components/src/dynamo/gpu_memory_service/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""GPU Memory Service component for Dynamo.

This module provides the Dynamo component wrapper around the gpu_memory_service package.
The core functionality is in the gpu_memory package; this module provides:
- CLI entry point (python -m dynamo.gpu_memory_service)
- Re-exports for backwards compatibility
"""

# Re-export core functionality from gpu_memory_service package
from gpu_memory_service import (
GMSClientMemoryManager,
StaleMemoryLayoutError,
get_gms_client_memory_manager,
get_or_create_gms_client_memory_manager,
)

# Re-export extensions (built separately)
from gpu_memory_service.client.torch.extensions import _allocator_ext

# Re-export module utilities
from gpu_memory_service.client.torch.module import (
materialize_module_from_gms,
register_module_tensors,
)

__all__ = [
# Core
"GMSClientMemoryManager",
"StaleMemoryLayoutError",
# GMS client memory manager
"get_or_create_gms_client_memory_manager",
"get_gms_client_memory_manager",
# Tensor utilities
"register_module_tensors",
"materialize_module_from_gms",
# Extensions
"_allocator_ext",
]
7 changes: 7 additions & 0 deletions components/src/dynamo/gpu_memory_service/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Module entry point: enables `python -m dynamo.gpu_memory_service`.
from dynamo.gpu_memory_service.server import main

if __name__ == "__main__":
    main()
66 changes: 66 additions & 0 deletions components/src/dynamo/gpu_memory_service/args.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Argument parsing for GPU Memory Service server component."""

import argparse
import logging
from dataclasses import dataclass

logger = logging.getLogger(__name__)


@dataclass
class Config:
    """Configuration for GPU Memory Service server."""

    # CUDA device ID whose memory this server instance manages.
    device: int
    # Resolved Unix domain socket path that workers connect to.
    socket_path: str
    # Whether DEBUG-level logging was requested via --verbose/-v.
    verbose: bool


def parse_args(argv: "list[str] | None" = None) -> Config:
    """Parse command line arguments for GPU Memory Service server.

    Args:
        argv: Argument list to parse. When ``None`` (the default),
            ``sys.argv[1:]`` is used, preserving the original CLI behavior.
            Accepting an explicit list makes this function testable.

    Returns:
        A fully-resolved :class:`Config`; the socket path is defaulted when
        omitted and any ``{device}`` placeholder is expanded.
    """
    parser = argparse.ArgumentParser(
        description="GPU Memory Service allocation server for Dynamo."
    )

    # GPU Memory Service specific arguments
    parser.add_argument(
        "--device",
        type=int,
        required=True,
        help="CUDA device ID to manage memory for.",
    )
    parser.add_argument(
        "--socket-path",
        type=str,
        default=None,
        help="Path for Unix domain socket. Default: /tmp/gpu_memory_service_{device}.sock. "
        "Supports {device} placeholder for multi-GPU setups.",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging.",
    )

    args = parser.parse_args(argv)

    # Resolve the socket path: default when omitted, otherwise expand the
    # {device} placeholder. str.replace is used instead of str.format so a
    # path containing unrelated braces (e.g. "{}" or "{port}") does not
    # raise KeyError/IndexError.
    if args.socket_path is None:
        socket_path = f"/tmp/gpu_memory_service_{args.device}.sock"
    else:
        socket_path = args.socket_path.replace("{device}", str(args.device))

    return Config(
        device=args.device,
        socket_path=socket_path,
        verbose=args.verbose,
    )
83 changes: 83 additions & 0 deletions components/src/dynamo/gpu_memory_service/server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""GPU Memory Service allocation server component for Dynamo.

This component wraps the GMSRPCServer from gpu_memory_service to manage
GPU memory allocations with connection-based RW/RO locking.

Workers connect via the socket path, which should be passed to vLLM/SGLang via:
--load-format gpu_memory_service
--model-loader-extra-config '{"gpu_memory_service_socket_path": "/tmp/gpu_memory_service_{device}.sock"}'

Usage:
python -m dynamo.gpu_memory_service --device 0
python -m dynamo.gpu_memory_service --device 0 --socket-path /tmp/gpu_memory_service_{device}.sock
"""

import asyncio
import logging
import signal

import uvloop
from gpu_memory_service.server import GMSRPCServer

from .args import parse_args

logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


async def worker() -> None:
    """Run the GPU Memory Service RPC server until SIGTERM/SIGINT arrives."""
    cfg = parse_args()

    # Verbose mode raises both the root logger and this component's logger
    # to DEBUG.
    if cfg.verbose:
        for logger_name in (None, "dynamo.gpu_memory_service"):
            logging.getLogger(logger_name).setLevel(logging.DEBUG)

    logger.info(f"Starting GPU Memory Service Server for device {cfg.device}")
    logger.info(f"Socket path: {cfg.socket_path}")

    server = GMSRPCServer(cfg.socket_path, device=cfg.device)

    # Shutdown is driven by an event flipped from the signal handlers.
    stop_requested = asyncio.Event()

    def _on_signal() -> None:
        logger.info("Received shutdown signal")
        stop_requested.set()

    running_loop = asyncio.get_running_loop()
    running_loop.add_signal_handler(signal.SIGTERM, _on_signal)
    running_loop.add_signal_handler(signal.SIGINT, _on_signal)

    await server.start()

    logger.info("GPU Memory Service Server ready, waiting for connections...")
    logger.info(
        f"To connect vLLM workers, use: --load-format gpu_memory_service "
        f'--model-loader-extra-config \'{{"gpu_memory_service_socket_path": "{cfg.socket_path}"}}\''
    )

    try:
        # Block here until a termination signal sets the event.
        await stop_requested.wait()
    finally:
        logger.info("Shutting down GPU Memory Service Server...")
        await server.stop()
        logger.info("GPU Memory Service Server shutdown complete")


def main() -> None:
    """Entry point for GPU Memory Service server.

    Installs uvloop as the asyncio event-loop policy, then runs the async
    worker to completion (i.e. until a shutdown signal stops the server).
    """
    uvloop.install()
    asyncio.run(worker())


if __name__ == "__main__":
    main()
26 changes: 26 additions & 0 deletions container/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ ARG EPP_IMAGE="us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inferen

ARG PYTHON_VERSION
ARG ENABLE_KVBM
ARG ENABLE_GPU_MEMORY_SERVICE
ARG ENABLE_MEDIA_NIXL
ARG ENABLE_MEDIA_FFMPEG
ARG CARGO_BUILD_JOBS
Expand Down Expand Up @@ -431,6 +432,13 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
fi && \
/tmp/use-sccache.sh show-stats "Dynamo"

# Build gpu_memory_service wheel (C++ extension only needs Python headers, no CUDA/torch)
ARG ENABLE_GPU_MEMORY_SERVICE
RUN if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
source ${VIRTUAL_ENV}/bin/activate && \
uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service; \
fi

##############################################
########## Runtime image ##############
##############################################
Expand Down Expand Up @@ -502,10 +510,19 @@ ENV VIRTUAL_ENV=/opt/dynamo/venv \

# Install dynamo wheels (runtime packages only, no test dependencies)
ARG ENABLE_KVBM
ARG ENABLE_GPU_MEMORY_SERVICE
RUN uv pip install \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl && \
if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
if [ -z "$GMS_WHEEL" ]; then \
echo "ERROR: ENABLE_GPU_MEMORY_SERVICE is true but no gpu_memory_service wheel found in wheelhouse" >&2; \
exit 1; \
fi; \
uv pip install "$GMS_WHEEL"; \
fi && \
if [ "$ENABLE_KVBM" = "true" ]; then \
KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
if [ -z "$KVBM_WHEEL" ]; then \
Expand Down Expand Up @@ -593,10 +610,19 @@ RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requi
--requirement /tmp/requirements.test.txt

ARG ENABLE_KVBM
ARG ENABLE_GPU_MEMORY_SERVICE
RUN uv pip install \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl && \
if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
if [ -z "$GMS_WHEEL" ]; then \
echo "ERROR: ENABLE_GPU_MEMORY_SERVICE is true but no gpu_memory_service wheel found in wheelhouse" >&2; \
exit 1; \
fi; \
uv pip install "$GMS_WHEEL"; \
fi && \
if [ "$ENABLE_KVBM" = "true" ]; then \
KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
if [ -z "$KVBM_WHEEL" ]; then \
Expand Down
19 changes: 18 additions & 1 deletion container/Dockerfile.sglang
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ ARG BASE_IMAGE_TAG

ARG PYTHON_VERSION
ARG ENABLE_KVBM
ARG ENABLE_GPU_MEMORY_SERVICE
ARG ENABLE_MEDIA_NIXL
ARG ENABLE_MEDIA_FFMPEG
ARG CARGO_BUILD_JOBS
Expand Down Expand Up @@ -442,6 +443,13 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
fi && \
/tmp/use-sccache.sh show-stats "Dynamo"

# Build gpu_memory_service wheel (C++ extension only needs Python headers, no CUDA/torch)
ARG ENABLE_GPU_MEMORY_SERVICE
RUN if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
source ${VIRTUAL_ENV}/bin/activate && \
uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service; \
fi

##################################
########## Runtime Image #########
##################################
Expand Down Expand Up @@ -500,12 +508,21 @@ COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /workspace/nixl/build/src

ENV SGLANG_VERSION="${RUNTIME_IMAGE_TAG%%-*}"
# Install packages as root to ensure they go to system location (/usr/local/lib/python3.12/dist-packages)
ARG ENABLE_GPU_MEMORY_SERVICE
RUN --mount=type=bind,source=.,target=/mnt/local_src \
pip install --no-cache-dir --break-system-packages \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl \
sglang==${SGLANG_VERSION}
sglang==${SGLANG_VERSION} && \
if [ "${ENABLE_GPU_MEMORY_SERVICE}" = "true" ]; then \
GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
if [ -z "$GMS_WHEEL" ]; then \
echo "ERROR: ENABLE_GPU_MEMORY_SERVICE is true but no gpu_memory_service wheel found in wheelhouse" >&2; \
exit 1; \
fi; \
pip install --no-cache-dir --break-system-packages "$GMS_WHEEL"; \
fi

# Install common and test dependencies as root
RUN --mount=type=bind,source=.,target=/mnt/local_src \
Expand Down
Loading
Loading