Skip to content

Commit 5012c21

Browse files
galletas1712 authored and claude committed
Add GPU Memory Service core library
GPU Memory Service provides distributed GPU memory management for model weights:

- Server: Manages GPU memory lifecycle, handles sleep/wake requests
- Client: Provides RPC interface for memory operations
- PyTorch integration: Custom allocator and tensor management for VA-stable operations

Components:

- lib/gpu_memory_service/ - Core library (server, client, common utilities)
- components/src/dynamo/gpu_memory_service/ - Dynamo service wrapper

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent c94d097 commit 5012c21

27 files changed

+6511
-0
lines changed
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""GPU Memory Service component for Dynamo.

This module provides the Dynamo component wrapper around the gpu_memory_service package.
The core functionality is in the gpu_memory package; this module provides:
- CLI entry point (python -m dynamo.gpu_memory_service)
- Re-exports for backwards compatibility
"""

# Re-export core functionality from gpu_memory_service package
# (allocator manager, staleness error, and allocator lifecycle helpers).
from gpu_memory_service import (
    GMSClientMemoryManager,
    StaleWeightsError,
    get_allocator,
    get_or_create_allocator,
)

# Re-export extensions (built separately)
# NOTE(review): these are underscore-prefixed (private-by-convention) names,
# but they are deliberately part of __all__ below for backwards compatibility.
from gpu_memory_service.client.torch.extensions import (
    _allocator_ext,
    _tensor_from_pointer,
)

# Re-export tensor utilities for registering / materializing torch module
# tensors backed by GMS-managed memory.
from gpu_memory_service.client.torch.tensor import (
    materialize_module_from_gms,
    register_module_tensors,
)

# Public API surface of this wrapper module, grouped by concern.
__all__ = [
    # Core allocator
    "GMSClientMemoryManager",
    "StaleWeightsError",
    # Lifecycle management
    "get_or_create_allocator",
    "get_allocator",
    # Tensor utilities
    "register_module_tensors",
    "materialize_module_from_gms",
    # Extensions
    "_allocator_ext",
    "_tensor_from_pointer",
]
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Module entry point: ``python -m dynamo.gpu_memory_service`` runs the server."""

from dynamo.gpu_memory_service.server import main

if __name__ == "__main__":
    main()
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Argument parsing for GPU Memory Service server component."""
5+
6+
import argparse
7+
import logging
8+
from dataclasses import dataclass
9+
10+
logger = logging.getLogger(__name__)
11+
12+
13+
@dataclass
class Config:
    """Configuration for GPU Memory Service server.

    Attributes:
        device: CUDA device ID whose memory the server manages.
        socket_path: Filesystem path of the Unix domain socket workers connect to.
        verbose: Whether debug-level logging should be enabled.
    """

    # GPU Memory Service specific
    device: int
    socket_path: str
    verbose: bool


def parse_args(argv: "list[str] | None" = None) -> Config:
    """Parse command line arguments for GPU Memory Service server.

    Args:
        argv: Optional argument list to parse instead of ``sys.argv[1:]``.
            Defaults to ``None`` (original CLI behavior); passing an explicit
            list makes this function directly testable.

    Returns:
        A populated :class:`Config`. When ``--socket-path`` is omitted, the
        path defaults to ``/tmp/gpu_memory_service_{device}.sock``; an explicit
        path may contain a ``{device}`` placeholder which is expanded via
        ``str.format`` for multi-GPU setups.
    """
    parser = argparse.ArgumentParser(
        description="GPU Memory Service allocation server for Dynamo."
    )

    # GPU Memory Service specific arguments
    parser.add_argument(
        "--device",
        type=int,
        required=True,
        help="CUDA device ID to manage memory for.",
    )
    parser.add_argument(
        "--socket-path",
        type=str,
        default=None,
        help="Path for Unix domain socket. Default: /tmp/gpu_memory_service_{device}.sock. "
        "Supports {device} placeholder for multi-GPU setups.",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging.",
    )

    args = parser.parse_args(argv)

    # Generate default socket path if not provided; otherwise expand the
    # {device} placeholder in the user-supplied template.
    socket_path = args.socket_path
    if socket_path is None:
        socket_path = f"/tmp/gpu_memory_service_{args.device}.sock"
    else:
        socket_path = socket_path.format(device=args.device)

    return Config(
        device=args.device,
        socket_path=socket_path,
        verbose=args.verbose,
    )
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""GPU Memory Service allocation server component for Dynamo.
5+
6+
This component wraps the GMSRPCServer from gpu_memory_service to manage
7+
GPU memory allocations with connection-based RW/RO locking.
8+
9+
Workers connect via the socket path, which should be passed to vLLM/SGLang via:
10+
--load-format gpu_memory_service
11+
--model-loader-extra-config '{"gpu_memory_service_socket_path": "/tmp/gpu_memory_service_{device}.sock"}'
12+
13+
Usage:
14+
python -m dynamo.gpu_memory_service --device 0
15+
python -m dynamo.gpu_memory_service --device 0 --socket-path /tmp/gpu_memory_service_{device}.sock
16+
"""
17+
18+
import asyncio
19+
import logging
20+
import os
21+
import signal
22+
import threading
23+
from typing import Optional
24+
25+
import uvloop
26+
from gpu_memory_service.server import GMSRPCServer
27+
28+
from .args import parse_args
29+
30+
# Configure process-wide logging at import time; acceptable here because this
# module is a dedicated server entry point, not a reusable library module.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
35+
36+
37+
class GMSRPCServerThread:
    """Wrapper to run GMSRPCServer in a background thread.

    The async GMSRPCServer gets its own event loop on a daemon thread;
    ``start()`` blocks the caller until the server socket is up (or an error
    occurred), and ``stop()`` signals shutdown from the caller's thread.
    """

    def __init__(self, socket_path: str, device: int):
        # Unix-domain-socket path the server will listen on.
        self.socket_path = socket_path
        # CUDA device ID this server instance manages.
        self.device = device
        self._server: Optional[GMSRPCServer] = None
        self._thread: Optional[threading.Thread] = None
        # Set once the server is listening, or on startup failure (see _error).
        self._started = threading.Event()
        # Startup exception captured in the background thread, re-raised in start().
        self._error: Optional[Exception] = None
        # Event loop owned by the background thread; None until _run_server runs.
        self._loop: Optional[asyncio.AbstractEventLoop] = None

    def start(self) -> None:
        """Start the allocation server in a background thread."""
        self._thread = threading.Thread(
            target=self._run_server,
            name=f"GMSRPCServer-GPU{self.device}",
            daemon=True,
        )
        self._thread.start()
        # Wait for server to be ready (socket file created)
        self._started.wait(timeout=10.0)
        # _started is also set on failure, so check the captured error first.
        if self._error is not None:
            raise self._error
        if not self._started.is_set():
            raise RuntimeError("GMSRPCServer failed to start within timeout")

    def _run_server(self) -> None:
        """Run the server (called in background thread).

        The GMSRPCServer is async-based, so we create a new event loop for this thread.
        """
        try:
            # Create a new event loop for this thread
            self._loop = asyncio.new_event_loop()
            asyncio.set_event_loop(self._loop)

            self._server = GMSRPCServer(self.socket_path, device=self.device)

            # Start the server (creates the socket)
            self._loop.run_until_complete(self._server.start())
            logger.info(
                f"GMSRPCServer started on device {self.device} at {self.socket_path}"
            )
            self._started.set()

            # Run the main loop: poll the server's private _running flag once a
            # second so stop() can end the loop by flipping it from another thread.
            # NOTE(review): relies on GMSRPCServer's private attribute — confirm
            # the server exposes (or could expose) a public shutdown API instead.
            while self._server._running:
                self._loop.run_until_complete(asyncio.sleep(1))

        except Exception as e:
            logger.error(f"GMSRPCServer error: {e}")
            self._error = e
            self._started.set()  # Unblock waiter even on error
        finally:
            if self._loop is not None:
                self._loop.close()

    def stop(self) -> None:
        """Stop the allocation server."""
        if self._server is not None:
            logger.info(f"Stopping GMSRPCServer on device {self.device}")
            # Signal the server to stop - the loop in _run_server will exit
            # (again via private attributes; see NOTE(review) in _run_server).
            self._server._running = False
            self._server._shutdown = True
            # Wake any blocked waiters from the server's event loop
            # NOTE(review): is_running() is only True while _run_server is inside
            # a run_until_complete call; between the 1-second polling iterations
            # it is False, so this notification is best-effort and may be skipped.
            if self._loop is not None and self._loop.is_running():

                async def _notify():
                    async with self._server._condition:
                        self._server._condition.notify_all()

                asyncio.run_coroutine_threadsafe(_notify(), self._loop)
        # Bounded join: the thread is a daemon, so a stuck server cannot block
        # process exit for more than 5 seconds here.
        if self._thread is not None and self._thread.is_alive():
            self._thread.join(timeout=5.0)
113+
114+
async def worker() -> None:
    """Main async worker function.

    Parses CLI arguments, starts the GMSRPCServer on a background thread, then
    blocks until SIGTERM/SIGINT, at which point the server is stopped cleanly.
    """
    config = parse_args()

    # Configure logging level
    if config.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
        logging.getLogger("dynamo.gpu_memory_service").setLevel(logging.DEBUG)

    logger.info(f"Starting GPU Memory Service Server for device {config.device}")
    logger.info(f"Socket path: {config.socket_path}")

    loop = asyncio.get_running_loop()

    # Clean up any existing socket file so bind() doesn't fail on a stale path
    # left over from a previous (crashed) run.
    if config.socket_path and os.path.exists(config.socket_path):
        os.unlink(config.socket_path)
        logger.debug(f"Removed existing socket file: {config.socket_path}")

    # Start GMSRPCServer in a background thread; start() blocks until the
    # socket is up or raises on startup failure.
    server = GMSRPCServerThread(config.socket_path, config.device)
    server.start()

    # Set up shutdown event
    shutdown_event = asyncio.Event()

    def signal_handler():
        logger.info("Received shutdown signal")
        shutdown_event.set()

    # add_signal_handler runs the callback on this event loop, so setting the
    # asyncio.Event from it is safe (no cross-thread call needed).
    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, signal_handler)

    logger.info("GPU Memory Service Server ready, waiting for connections...")
    logger.info(
        f"To connect vLLM workers, use: --load-format gpu_memory_service "
        f'--model-loader-extra-config \'{{"gpu_memory_service_socket_path": "{config.socket_path}"}}\''
    )

    # Wait for shutdown signal; finally ensures the server thread is stopped
    # even if this task is cancelled.
    try:
        await shutdown_event.wait()
    finally:
        logger.info("Shutting down GPU Memory Service Server...")
        server.stop()
        logger.info("GPU Memory Service Server shutdown complete")
160+
161+
162+
def main() -> None:
    """Entry point for GPU Memory Service server."""
    # Install uvloop's event-loop policy before asyncio.run creates the loop,
    # so worker() executes on a uvloop event loop.
    uvloop.install()
    asyncio.run(worker())
166+
167+
168+
if __name__ == "__main__":
169+
main()

0 commit comments

Comments
 (0)