
Commit 499bc45

galletas1712 and claude committed

Add GPU Memory Service core library and component wrapper

Add the GPU Memory Service (GMS) library for distributed GPU memory management. This provides:

- lib/gpu_memory_service/: Core GMS library
  - Server component for managing GPU memory lifecycle
  - Client component for RPC communication
  - CUDA VMM utilities for virtual memory management
  - PyTorch extensions for custom allocators
- components/src/dynamo/gpu_memory_service/: Dynamo component wrapper
  - CLI entry point (python -m dynamo.gpu_memory_service)
  - Standalone server for managing GPU memory
- Dockerfile changes to build and install the GMS wheel
- Dev container and ignore file updates for GMS build artifacts

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

1 parent 07721d1 · commit 499bc45

31 files changed: +5987 −2 lines

.devcontainer/post-create.sh

Lines changed: 11 additions & 0 deletions

@@ -66,6 +66,16 @@ mkdir -p $CARGO_TARGET_DIR
 
 # Note: Build steps moved to after sanity check - see instructions at the end
 
+# Install gpu_memory_service package and build CUDA extensions
+# First uninstall any existing install from the Docker image, then do editable install
+# Use --no-build-isolation so uv uses the current environment (with PyTorch) instead of an isolated one
+echo "Installing gpu_memory_service package..."
+uv pip uninstall gpu-memory-service 2>/dev/null || true
+# Use dynamic path detection instead of hardcoded python3.12
+PYTHON_SITE_PACKAGES=$(python3 -c "import sysconfig; print(sysconfig.get_path('purelib'))")
+rm -rf "${PYTHON_SITE_PACKAGES}/gpu_memory_service/" 2>/dev/null || true
+uv pip install --no-build-isolation -e $WORKSPACE_DIR/lib/gpu_memory_service
+
 { set +x; } 2>/dev/null
 
 echo -e "\n" >> ~/.bashrc
@@ -107,6 +117,7 @@ cat <<EOF
 ========================================
 $SANITY_STATUS
 ✅ Pre-commit hooks configured
+✅ gpu_memory_service package installed with CUDA extensions
 
 Now build the project:
   cargo build --locked --profile dev --features dynamo-llm/block-manager
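The dynamic site-packages lookup above is the portable way to find where a wheel was installed without hardcoding a Python version in the path. A minimal sketch of what that one-liner resolves to (the printed path is illustrative; it depends on the interpreter and environment):

import sysconfig

# "purelib" is the install directory for pure-Python packages, e.g.
# /usr/lib/python3.12/site-packages or <venv>/lib/python3.X/site-packages.
site_packages = sysconfig.get_path("purelib")
print(site_packages)

The post-create script deletes any gpu_memory_service tree found there (left over from the Docker-image install) so the subsequent `uv pip install -e` can link the editable checkout without conflict.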

.dockerignore

Lines changed: 6 additions & 1 deletion

@@ -41,4 +41,9 @@
 **/*safetensors
 container/Dockerfile*
 .venv
-.venv-docs
+.venv-docs
+
+# GPU Memory Service build artifacts
+lib/gpu_memory_service/build/
+lib/gpu_memory_service/*.egg-info/
+lib/gpu_memory_service/**/*.so

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -57,6 +57,7 @@ tensorrtllm_checkpoints/
 tensorrtllm_engines/
 api_server_models/
 server/
+!lib/gpu_memory_service/server/
 # Replay/Snapshot test artifacts
 *.new
 lib/llm/tests/data/sample-models/models--meta-llama--Llama-3.1-70B-Instruct/
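For context on the new pattern: a trailing-slash entry like server/ in .gitignore matches any directory named server at any depth, and the ! prefix re-includes the one specific directory lib/gpu_memory_service/server/. The negation works here because none of that directory's parents are themselves excluded; git cannot re-include a path whose parent directory is ignored.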
components/src/dynamo/gpu_memory_service/__init__.py

Lines changed: 45 additions & 0 deletions

@@ -0,0 +1,45 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""GPU Memory Service component for Dynamo.

This module provides the Dynamo component wrapper around the gpu_memory_service package.
The core functionality is in the gpu_memory_service package; this module provides:
- CLI entry point (python -m dynamo.gpu_memory_service)
- Re-exports for backwards compatibility
"""

# Re-export core functionality from gpu_memory_service package
from gpu_memory_service import (
    GMSClientMemoryManager,
    StaleWeightsError,
    get_allocator,
    get_or_create_allocator,
)

# Re-export extensions (built separately)
from gpu_memory_service.client.torch.extensions import (
    _allocator_ext,
    _tensor_from_pointer,
)

# Re-export tensor utilities
from gpu_memory_service.client.torch.tensor import (
    materialize_module_from_gms,
    register_module_tensors,
)

__all__ = [
    # Core allocator
    "GMSClientMemoryManager",
    "StaleWeightsError",
    # Lifecycle management
    "get_or_create_allocator",
    "get_allocator",
    # Tensor utilities
    "register_module_tensors",
    "materialize_module_from_gms",
    # Extensions
    "_allocator_ext",
    "_tensor_from_pointer",
]
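Because these are plain re-exports, both import paths resolve to the same objects. A quick illustrative check (assuming both the core package and the Dynamo wrapper are installed):

import gpu_memory_service
import dynamo.gpu_memory_service as wrapper

# The wrapper re-exports, rather than redefines, the core symbols,
# so identity (not just equality) holds:
assert wrapper.GMSClientMemoryManager is gpu_memory_service.GMSClientMemoryManager
assert wrapper.get_allocator is gpu_memory_service.get_allocator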
components/src/dynamo/gpu_memory_service/__main__.py

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from dynamo.gpu_memory_service.server import main

if __name__ == "__main__":
    main()
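This file is what backs the CLI entry point named in the package docstring: `python -m dynamo.gpu_memory_service` executes the package's __main__ module, which simply delegates to the server's main(); all flag handling lives in args.py below.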
components/src/dynamo/gpu_memory_service/args.py

Lines changed: 66 additions & 0 deletions

@@ -0,0 +1,66 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Argument parsing for GPU Memory Service server component."""

import argparse
import logging
from dataclasses import dataclass

logger = logging.getLogger(__name__)


@dataclass
class Config:
    """Configuration for GPU Memory Service server."""

    # GPU Memory Service specific
    device: int
    socket_path: str
    verbose: bool


def parse_args() -> Config:
    """Parse command line arguments for GPU Memory Service server."""
    parser = argparse.ArgumentParser(
        description="GPU Memory Service allocation server for Dynamo."
    )

    # GPU Memory Service specific arguments
    parser.add_argument(
        "--device",
        type=int,
        required=True,
        help="CUDA device ID to manage memory for.",
    )
    parser.add_argument(
        "--socket-path",
        type=str,
        default=None,
        help="Path for Unix domain socket. Default: /tmp/gpu_memory_service_{device}.sock. "
        "Supports {device} placeholder for multi-GPU setups.",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging.",
    )

    args = parser.parse_args()

    # Generate default socket path if not provided
    socket_path = args.socket_path
    if socket_path is None:
        socket_path = f"/tmp/gpu_memory_service_{args.device}.sock"
    else:
        # Expand {device} placeholder
        socket_path = socket_path.format(device=args.device)

    config = Config(
        device=args.device,
        socket_path=socket_path,
        verbose=args.verbose,
    )

    return config
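The {device} placeholder is expanded with str.format, so a single templated --socket-path value yields a distinct Unix socket per GPU. A minimal sketch of the expansion (socket paths illustrative, matching the default template):

# One server process per GPU, each with its own Unix domain socket.
template = "/tmp/gpu_memory_service_{device}.sock"
for device in (0, 1):
    print(template.format(device=device))
# /tmp/gpu_memory_service_0.sock
# /tmp/gpu_memory_service_1.sock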
components/src/dynamo/gpu_memory_service/server.py

Lines changed: 169 additions & 0 deletions

@@ -0,0 +1,169 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""GPU Memory Service allocation server component for Dynamo.

This component wraps the GMSRPCServer from gpu_memory_service to manage
GPU memory allocations with connection-based RW/RO locking.

Workers connect via the socket path, which should be passed to vLLM/SGLang via:
    --load-format gpu_memory_service
    --model-loader-extra-config '{"gpu_memory_service_socket_path": "/tmp/gpu_memory_service_{device}.sock"}'

Usage:
    python -m dynamo.gpu_memory_service --device 0
    python -m dynamo.gpu_memory_service --device 0 --socket-path /tmp/gpu_memory_service_{device}.sock
"""

import asyncio
import logging
import os
import signal
import threading
from typing import Optional

import uvloop
from gpu_memory_service.server import GMSRPCServer

from .args import parse_args

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


class GMSRPCServerThread:
    """Wrapper to run GMSRPCServer in a background thread."""

    def __init__(self, socket_path: str, device: int):
        self.socket_path = socket_path
        self.device = device
        self._server: Optional[GMSRPCServer] = None
        self._thread: Optional[threading.Thread] = None
        self._started = threading.Event()
        self._error: Optional[Exception] = None
        self._loop: Optional[asyncio.AbstractEventLoop] = None

    def start(self) -> None:
        """Start the allocation server in a background thread."""
        self._thread = threading.Thread(
            target=self._run_server,
            name=f"GMSRPCServer-GPU{self.device}",
            daemon=True,
        )
        self._thread.start()
        # Wait for server to be ready (socket file created)
        self._started.wait(timeout=10.0)
        if self._error is not None:
            raise self._error
        if not self._started.is_set():
            raise RuntimeError("GMSRPCServer failed to start within timeout")

    def _run_server(self) -> None:
        """Run the server (called in background thread).

        The GMSRPCServer is async-based, so we create a new event loop for this thread.
        """
        try:
            # Create a new event loop for this thread
            self._loop = asyncio.new_event_loop()
            asyncio.set_event_loop(self._loop)

            self._server = GMSRPCServer(self.socket_path, device=self.device)

            # Start the server (creates the socket)
            self._loop.run_until_complete(self._server.start())
            logger.info(
                f"GMSRPCServer started on device {self.device} at {self.socket_path}"
            )
            self._started.set()

            # Run the main loop
            while self._server._running:
                self._loop.run_until_complete(asyncio.sleep(1))

        except Exception as e:
            logger.error(f"GMSRPCServer error: {e}")
            self._error = e
            self._started.set()  # Unblock waiter even on error
        finally:
            if self._loop is not None:
                self._loop.close()

    def stop(self) -> None:
        """Stop the allocation server."""
        if self._server is not None:
            logger.info(f"Stopping GMSRPCServer on device {self.device}")
            # Signal the server to stop - the loop in _run_server will exit
            self._server._running = False
            self._server._shutdown = True
            # Wake any blocked waiters from the server's event loop
            if self._loop is not None and self._loop.is_running():

                async def _notify():
                    async with self._server._condition:
                        self._server._condition.notify_all()

                asyncio.run_coroutine_threadsafe(_notify(), self._loop)
        if self._thread is not None and self._thread.is_alive():
            self._thread.join(timeout=5.0)


async def worker() -> None:
    """Main async worker function."""
    config = parse_args()

    # Configure logging level
    if config.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
        logging.getLogger("dynamo.gpu_memory_service").setLevel(logging.DEBUG)

    logger.info(f"Starting GPU Memory Service Server for device {config.device}")
    logger.info(f"Socket path: {config.socket_path}")

    loop = asyncio.get_running_loop()

    # Clean up any existing socket file
    if config.socket_path and os.path.exists(config.socket_path):
        os.unlink(config.socket_path)
        logger.debug(f"Removed existing socket file: {config.socket_path}")

    # Start GMSRPCServer in a background thread
    server = GMSRPCServerThread(config.socket_path, config.device)
    server.start()

    # Set up shutdown event
    shutdown_event = asyncio.Event()

    def signal_handler():
        logger.info("Received shutdown signal")
        shutdown_event.set()

    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, signal_handler)

    logger.info("GPU Memory Service Server ready, waiting for connections...")
    logger.info(
        f"To connect vLLM workers, use: --load-format gpu_memory_service "
        f'--model-loader-extra-config \'{{"gpu_memory_service_socket_path": "{config.socket_path}"}}\''
    )

    # Wait for shutdown signal
    try:
        await shutdown_event.wait()
    finally:
        logger.info("Shutting down GPU Memory Service Server...")
        server.stop()
        logger.info("GPU Memory Service Server shutdown complete")


def main() -> None:
    """Entry point for GPU Memory Service server."""
    uvloop.install()
    asyncio.run(worker())


if __name__ == "__main__":
    main()
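GMSRPCServerThread follows a common pattern for hosting an asyncio server behind a synchronous interface: a dedicated thread owns its own event loop, readiness is signaled through a threading.Event, and cross-thread handoff uses asyncio.run_coroutine_threadsafe. A stripped-down sketch of just that pattern, independent of GMS (class and method names here are illustrative):

import asyncio
import threading
from typing import Optional


class BackgroundLoop:
    """Own an asyncio event loop in a dedicated daemon thread."""

    def __init__(self) -> None:
        self._ready = threading.Event()
        self._loop: Optional[asyncio.AbstractEventLoop] = None
        self._thread = threading.Thread(target=self._run, daemon=True)

    def _run(self) -> None:
        # Each thread needs its own loop; the caller's loop is untouched.
        self._loop = asyncio.new_event_loop()
        asyncio.set_event_loop(self._loop)
        self._ready.set()  # Unblock start() once the loop exists
        self._loop.run_forever()

    def start(self) -> None:
        self._thread.start()
        if not self._ready.wait(timeout=10.0):
            raise RuntimeError("background loop failed to start")

    def submit(self, coro):
        # Thread-safe handoff into the background loop; returns a
        # concurrent.futures.Future whose .result() blocks until done.
        return asyncio.run_coroutine_threadsafe(coro, self._loop)

    def stop(self) -> None:
        self._loop.call_soon_threadsafe(self._loop.stop)
        self._thread.join(timeout=5.0)

Usage mirrors the component above: bg = BackgroundLoop(); bg.start(); bg.submit(asyncio.sleep(1)).result(); bg.stop(). The real GMSRPCServerThread additionally propagates startup errors (re-raising them from start()) and shuts down cooperatively via the server's _running flag rather than stopping the loop directly.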
