
Commit 49f6bad

refactor: Address remaining PR review feedback from nv-hwoo
- Move CLAUDE.md to project root (#14) - Fix transfer time claims and sanitize hardcoded paths (#15, #17) - Update env var references to MODEL_EXPRESS_URL (#17) - Add Lua script explanation for atomic worker merge (#7) - Mark unused Rust functions with #[allow(dead_code)] (#6) - Move MAX_MESSAGE_SIZE to module-level constant (#13) - Remove pyzmq from dependencies (provided by vLLM) (#18) - Update K8s README architecture diagram (#23) - Add __pycache__ and local docs to .gitignore Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent 5768661 commit 49f6bad

File tree: 6 files changed (+49, -43 lines)


.gitignore

Lines changed: 2 additions & 0 deletions
````diff
@@ -49,3 +49,5 @@ logs/
 
 # Models database
 models.db
+**/__pycache__/
+docs/FEEDBACK.md
````

docs/CLAUDE.md renamed to CLAUDE.md

Lines changed: 13 additions & 14 deletions
````diff
@@ -16,8 +16,8 @@ This file provides context for AI assistants (Claude, Cursor, Copilot) working o
 
 | Model | Status | Transfer Time | Notes |
 |-------|--------|---------------|-------|
-| DeepSeek-V3 (671B, FP8) | Working | 40-80s | 681GB across 8 GPUs |
-| Llama 3.3 70B | Working | ~5s | 140GB across 4-8 GPUs |
+| DeepSeek-V3 (671B, FP8) | Working | ~40s | 681GB across 8 GPUs @ ~112 Gbps |
+| Llama 3.3 70B | Working | ~5s | 140GB across 8 GPUs @ ~112 Gbps |
 
 ---
 
````
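A quick back-of-envelope check on the updated DeepSeek-V3 row (treating ~112 Gbps as the aggregate effective transfer rate, which is our reading of the table rather than a measured figure):

$$
t \approx \frac{681\,\mathrm{GB} \times 8\,\mathrm{bit/B}}{112\,\mathrm{Gbit/s}} \approx 49\,\mathrm{s}
$$

which lands in the same ballpark as the ~40s figure.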
````diff
@@ -125,7 +125,7 @@ UCX_LOG_LEVEL: "WARN" # DEBUG for troubleshooting
 
 ```
 modelexpress/
-├── CLAUDE.md                 # THIS FILE - AI assistant context
+├── CLAUDE.md                 # THIS FILE (project root) - AI assistant context
 ├── modelexpress_server/      # Rust gRPC server
 │   └── src/
 │       ├── main.rs
@@ -162,7 +162,7 @@ Contains custom vLLM model loaders:
 
 - **`MxSourceModelLoader`**: Loads weights from disk, registers with NIXL, publishes metadata
 - **`MxTargetModelLoader`**: Creates dummy weights, receives via RDMA, applies FP8 processing
-- **`SourceReadyCoordinator`**: Redis-based coordination for source-target synchronization
+- **`SourceReadyCoordinator`**: gRPC-based coordination for source-target synchronization (via MxClient)
 
 ```python
 class MxSourceModelLoader(DefaultModelLoader):
@@ -214,20 +214,20 @@ Rust gRPC service implementation:
 ### Building Docker Image
 
 ```bash
-cd /home/kavink/work/gitlab/modelexpress
+cd path/to/modelexpress
 
 # Build client image
 docker build -f examples/p2p_transfer_k8s/Dockerfile.client \
-  -t nvcr.io/nvidian/dynamo-dev/modelexpress-p2p-client:YOUR_TAG .
+  -t nvcr.io/nvidian/dynamo-dev/IMAGE_NAME:YOUR_TAG .
 
-docker push nvcr.io/nvidian/dynamo-dev/modelexpress-p2p-client:YOUR_TAG
+docker push nvcr.io/nvidian/dynamo-dev/IMAGE_NAME:YOUR_TAG
 ```
 
 ### Deploying to Kubernetes
 
 ```bash
 # Namespace
-NAMESPACE=kavin
+NAMESPACE=<your-namespace>
 
 # 1. Flush Redis (clear stale metadata)
 microk8s kubectl -n $NAMESPACE exec deploy/modelexpress-server -c redis -- redis-cli FLUSHALL
@@ -247,14 +247,14 @@ watch microk8s kubectl -n $NAMESPACE get pods -l 'app in (mx-source, mx-target)'
 
 ```bash
 # Stream logs
-microk8s kubectl -n kavin logs -f deploy/mx-source
-microk8s kubectl -n kavin logs -f deploy/mx-target
+kubectl -n $NAMESPACE logs -f deploy/mx-source
+kubectl -n $NAMESPACE logs -f deploy/mx-target
 
 # Check Redis state
-microk8s kubectl -n kavin exec deploy/modelexpress-server -c redis -- redis-cli KEYS '*'
+kubectl -n $NAMESPACE exec deploy/modelexpress-server -c redis -- redis-cli KEYS '*'
 
 # Test inference
-microk8s kubectl -n kavin exec deploy/mx-target -- curl -s http://localhost:8000/v1/completions \
+kubectl -n $NAMESPACE exec deploy/mx-target -- curl -s http://localhost:8000/v1/completions \
   -H "Content-Type: application/json" \
   -d '{"model": "deepseek-ai/DeepSeek-V3", "prompt": "Hello", "max_tokens": 10}'
 ```
````
````diff
@@ -268,8 +268,7 @@ microk8s kubectl -n kavin exec deploy/mx-target -- curl -s http://localhost:8000
 | Variable | Default | Description |
 |----------|---------|-------------|
 | `MX_REGISTER_LOADERS` | `1` | Auto-register mx-source/mx-target loaders with vLLM |
-| `MX_SERVER_ADDRESS` | `modelexpress-server:8001` | gRPC server address |
-| `MX_REDIS_HOST` | `modelexpress-server` | Redis host for coordination |
+| `MODEL_EXPRESS_URL` | `localhost:8001` | gRPC server address (also reads `MX_SERVER_ADDRESS` for compat) |
 | `MX_CONTIGUOUS_REG` | `0` | Enable contiguous region registration (experimental) |
 | `MX_EXPECTED_WORKERS` | `8` | Number of GPU workers to wait for |
 | `MX_SYNC_PUBLISH` | `1` | Source: wait for all workers before publishing |
````
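For reference, a minimal sketch of the lookup order this row documents — `MODEL_EXPRESS_URL` first, then the legacy `MX_SERVER_ADDRESS`, then the documented default. The function name and placement are illustrative, not the actual client code:

```python
import os

def resolve_server_address() -> str:
    """Resolve the ModelExpress gRPC address from the environment.

    Prefers MODEL_EXPRESS_URL; falls back to the legacy MX_SERVER_ADDRESS,
    then to the documented default.
    """
    return (
        os.environ.get("MODEL_EXPRESS_URL")
        or os.environ.get("MX_SERVER_ADDRESS")  # legacy name, kept for compat
        or "localhost:8001"
    )
```

Either variable can be set in the pod spec; new deployments should prefer `MODEL_EXPRESS_URL`.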

examples/p2p_transfer_k8s/README.md

Lines changed: 14 additions & 22 deletions
````diff
@@ -5,38 +5,30 @@ This example demonstrates how to set up ModelExpress for P2P GPU weight transfer
 ## Architecture
 
 ```
-Node A (Source - first to start)              Node B (Target - starts later)
+Node A (Source)                               Node B (Target)
 +----------------------------------+          +----------------------------------+
-| vLLM Container                   |          | vLLM Container                   |
-| - Loads real model weights       |          | - Starts with dummy weights      |
-| - Exposes weights via ZMQ        |          | - Exposes buffers via ZMQ        |
-| - MX_ZMQ_ADDRESS=ipc:///tmp/mx/  |          | - MX_ZMQ_ADDRESS=ipc:///tmp/mx/  |
-+----------------------------------+          +----------------------------------+
-        | ZMQ (IPC sockets)                           | ZMQ (IPC sockets)
-        v                                             v
-+----------------------------------+          +----------------------------------+
-| Client Container                 |          | Client Container                 |
-| - Creates NIXL agents (1 per GPU)|          | - Creates NIXL agents (1 per GPU)|
-| - Queries server: no source found|   RDMA   | - Queries server: finds source A |
-| - Becomes source, publishes meta |<========>| - Receives weights via NIXL      |
-+----------------------------------+   NIXL   | - Also publishes metadata        |
+| vLLM + MxSourceModelLoader       |          | vLLM + MxTargetModelLoader       |
+| - Loads weights from disk        |          | - Starts with dummy weights      |
+| - Registers tensors with NIXL    |          | - Waits for source ready flag    |
+| - Publishes metadata via MxClient|   RDMA   | - Receives weights via NIXL      |
+| - Publishes ready flag           |=========>| - Runs FP8 processing            |
++----------------------------------+   NIXL   | - Serves inference               |
        |                                      +----------------------------------+
        |                                             |
        v                                             v
 +--------------------------------------------------------------------+
-| ModelExpress Server (CPU)                                          |
-| - Stores model metadata (NIXL metadata + tensor descriptors)       |
-| - Keyed by model name                                              |
-| - Redis backend for persistence                                    |
+| ModelExpress Server (gRPC + Redis)                                 |
+| - PublishMetadata / GetMetadata: tensor metadata coordination      |
+| - PublishReady / GetReady: source readiness coordination           |
 +--------------------------------------------------------------------+
 ```
 
 ### Key Design Points
 
-1. **Client Container**: NIXL transfer logic runs in a separate client container, not in vLLM
-2. **Symmetric Clients**: Both source and target run identical client code; role is determined dynamically
-3. **ZMQ Communication**: vLLM exposes weights via ZMQ IPC sockets (one per TP rank)
-4. **Tensor Parallelism**: Full TP > 1 support with rank-matched transfers
+1. **Custom vLLM Loaders**: NIXL transfer logic runs inside vLLM via `--load-format mx-source` / `--load-format mx-target`
+2. **MxClient**: All gRPC communication goes through `MxClient` (workers never access Redis directly)
+3. **FP8 Support**: Raw tensors (including `weight_scale_inv`) transfer BEFORE FP8 processing
+4. **Tensor Parallelism**: Full TP support with rank-matched transfers (one NIXL agent per GPU)
 
 ## Prerequisites
 
````
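To make the new design points concrete, here is a sketch of the target-side coordination loop. The `MxClient` method names are hypothetical snake_case wrappers for the GetReady/GetMetadata RPCs shown in the diagram, and the import path is assumed:

```python
import time

# Hypothetical sketch of the target-side coordination described above.
# get_ready/get_metadata are assumed wrappers for the GetReady/GetMetadata
# RPCs in the diagram, not a documented API.
from modelexpress_client import MxClient  # assumed import path

def wait_for_source(client: MxClient, model: str, rank: int,
                    timeout_s: float = 600.0):
    """Poll the server until the source publishes its ready flag, then
    fetch the NIXL metadata for this TP rank."""
    deadline = time.monotonic() + timeout_s
    while not client.get_ready(model):
        if time.monotonic() > deadline:
            raise TimeoutError(f"source for {model} never became ready")
        time.sleep(1.0)
    # Rank-matched transfer: each GPU worker fetches the descriptor set
    # published by the source worker with the same rank.
    return client.get_metadata(model, rank=rank)
```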
modelexpress_client/python/pyproject.toml

Lines changed: 0 additions & 1 deletion
````diff
@@ -32,7 +32,6 @@ dependencies = [
     "numpy>=1.24.0",
     "protobuf>=4.25.0",
     "pydantic>=2.0.0",
-    "pyzmq>=25.0.0",
     "torch>=2.6.0",
 ]
 
````
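Since `pyzmq` now arrives transitively through vLLM rather than as a direct dependency, a quick sanity check in the runtime image (illustrative; both helpers are standard pyzmq API):

```python
# pyzmq is no longer a direct dependency of modelexpress_client;
# it is expected to come from the vLLM environment instead.
import zmq

print("libzmq:", zmq.zmq_version())
print("pyzmq:", zmq.pyzmq_version())
```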
modelexpress_server/src/main.rs

Lines changed: 4 additions & 2 deletions
````diff
@@ -19,6 +19,10 @@ use tonic::transport::Server;
 use tracing::{error, info, warn};
 use tracing_subscriber::{EnvFilter, FmtSubscriber};
 
+/// Maximum gRPC message size (100MB) for large models like DeepSeek-V3.
+/// Each worker can have thousands of tensor descriptors with NIXL metadata.
+const MAX_MESSAGE_SIZE: usize = 100 * 1024 * 1024;
+
 #[tokio::main]
 async fn main() -> Result<(), Box<dyn std::error::Error>> {
     // Parse command line arguments
@@ -125,8 +129,6 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
 
     // Start the gRPC server
     info!("Starting gRPC server on: {addr}");
-    // Set max message size to 100MB for large models like DeepSeek-V3
-    const MAX_MESSAGE_SIZE: usize = 100 * 1024 * 1024;
     let server_result = Server::builder()
         .add_service(HealthServiceServer::new(health_service))
         .add_service(ApiServiceServer::new(api_service))
````
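The 100MB cap only helps if the peer raises its limits too. A minimal client-side sketch, assuming the Python client talks to the server over `grpcio` (the channel option names are standard gRPC core options; the address is the documented default):

```python
import grpc

# Must match the server's MAX_MESSAGE_SIZE (100MB); gRPC defaults to a
# 4MB receive limit, far too small for DeepSeek-V3 tensor metadata.
MAX_MESSAGE_SIZE = 100 * 1024 * 1024

channel = grpc.insecure_channel(
    "localhost:8001",  # MODEL_EXPRESS_URL default from the table above
    options=[
        ("grpc.max_send_message_length", MAX_MESSAGE_SIZE),
        ("grpc.max_receive_message_length", MAX_MESSAGE_SIZE),
    ],
)
```

Without matching limits, the client fails with `RESOURCE_EXHAUSTED` once a metadata response exceeds the 4MB default.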

modelexpress_server/src/state.rs

Lines changed: 16 additions & 4 deletions
````diff
@@ -236,8 +236,16 @@ impl P2pStateManager {
         let new_workers_json = serde_json::to_string(&new_workers)?;
         let timestamp = chrono::Utc::now().timestamp();
 
-        // Lua script for atomic read-modify-write merge
-        // This runs atomically in Redis, preventing race conditions
+        // Lua script for atomic read-modify-write merge of worker metadata.
+        //
+        // WHY LUA? In a TP=8 setup, 8 GPU workers publish metadata concurrently.
+        // Without atomicity, two workers could read the same state, each add their
+        // own entry, and one overwrites the other (lost update). The Lua script
+        // runs as a single atomic operation in Redis, so the read-merge-write
+        // sequence is never interleaved with another worker's publish.
+        //
+        // The script: 1) reads existing workers, 2) merges new workers by rank
+        // (update if rank exists, append if new), 3) sorts by rank, 4) writes back.
         let script = redis::Script::new(
             r#"
             local key = KEYS[1]
````
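For readers who want to see the lost-update fix end to end, here is an illustrative re-creation of the merge in Python with redis-py; the key name, JSON layout, and field names are assumptions for the sketch, not the actual script in `state.rs`:

```python
import json
import redis

# Illustrative re-creation of the atomic merge. Assumed data shape:
# the value at the key is a JSON array of {"rank": int, ...} objects.
MERGE_WORKERS = """
local existing = redis.call('GET', KEYS[1])
local workers = existing and cjson.decode(existing) or {}
local new_workers = cjson.decode(ARGV[1])
for _, nw in ipairs(new_workers) do
    local replaced = false
    for i, w in ipairs(workers) do
        if w.rank == nw.rank then workers[i] = nw; replaced = true; break end
    end
    if not replaced then table.insert(workers, nw) end
end
table.sort(workers, function(a, b) return a.rank < b.rank end)
redis.call('SET', KEYS[1], cjson.encode(workers))
return #workers
"""

r = redis.Redis()
merge = r.register_script(MERGE_WORKERS)
# Two concurrent publishers cannot interleave: each EVAL runs atomically.
count = merge(keys=["p2p:deepseek-ai/DeepSeek-V3:workers"],  # assumed key format
              args=[json.dumps([{"rank": 3, "agent": "gpu3"}])])
```

An alternative would be optimistic WATCH/MULTI transactions, but a server-side script avoids client retries under contention.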
````diff
@@ -333,7 +341,9 @@
         }
     }
 
-    /// Remove metadata for a model (cleanup)
+    /// Remove metadata for a model (cleanup).
+    /// Currently unused - reserved for future admin/cleanup endpoints.
+    #[allow(dead_code)]
     pub async fn remove_metadata(
         &self,
         model_name: &str,
@@ -348,7 +358,9 @@
         Ok(())
     }
 
-    /// List all registered model names
+    /// List all registered model names.
+    /// Currently unused - reserved for future admin/list endpoints.
+    #[allow(dead_code)]
     pub async fn list_models(
         &self,
     ) -> Result<Vec<String>, Box<dyn std::error::Error + Send + Sync>> {
````
