Skip to content

Commit 5aeebb6

Browse files
committed
fix: align config, JSON-RPC, and GPU process handling
Ensure overrides, IDs, and GPU usage reporting follow documented behavior and security expectations.
1 parent 5abf471 commit 5aeebb6

File tree

12 files changed

+349
-103
lines changed

12 files changed

+349
-103
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ members = [
66

77
[package]
88
name = "gpukill"
9-
version = "0.1.10"
9+
version = "0.1.11"
1010
edition = "2021"
1111
authors = ["Kage <info@treadie.com>"]
1212
description = "A CLI tool for GPU management and monitoring supporting NVIDIA, AMD, Intel, and Apple Silicon GPUs"

mcp/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "gpukill-mcp"
3-
version = "0.1.1"
3+
version = "0.1.2"
44
edition = "2021"
55
authors = ["GPU Kill Team"]
66
description = "MCP server for GPU Kill - AI-accessible GPU management"

mcp/src/server.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,13 +153,14 @@ impl GpuKillMCPServer {
153153
move |request: axum::extract::Json<JsonRpcRequest>| {
154154
let server = server.clone();
155155
async move {
156+
let request_id = request.0.id.clone();
156157
match server.handle_request(request.0).await {
157158
Ok(response) => axum::response::Json(response),
158159
Err(e) => {
159160
error!("Failed to handle HTTP request: {}", e);
160161
axum::response::Json(JsonRpcResponse {
161162
jsonrpc: "2.0".to_string(),
162-
id: crate::types::RequestId::Null, // Per JSON-RPC 2.0: use null when id cannot be determined
163+
id: request_id,
163164
result: None,
164165
error: Some(JsonRpcError {
165166
code: -32603,

mcp/src/types.rs

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,10 +61,25 @@ impl From<i32> for RequestId {
6161
}
6262
}
6363

64+
fn validate_jsonrpc_version<'de, D>(deserializer: D) -> Result<String, D::Error>
65+
where
66+
D: serde::Deserializer<'de>,
67+
{
68+
let version = String::deserialize(deserializer)?;
69+
if version != "2.0" {
70+
return Err(serde::de::Error::custom(format!(
71+
"jsonrpc must be '2.0', got '{}'",
72+
version
73+
)));
74+
}
75+
Ok(version)
76+
}
77+
6478
/// MCP Request/Response types
6579
#[derive(Debug, Serialize, Deserialize)]
66-
#[serde(tag = "jsonrpc", rename = "2.0")]
6780
pub struct JsonRpcRequest {
81+
#[serde(deserialize_with = "validate_jsonrpc_version")]
82+
pub jsonrpc: String,
6883
/// Request identifier - can be String, Number, or Null per JSON-RPC 2.0
6984
pub id: RequestId,
7085
pub method: String,
@@ -73,6 +88,7 @@ pub struct JsonRpcRequest {
7388

7489
#[derive(Debug, Serialize, Deserialize)]
7590
pub struct JsonRpcResponse {
91+
#[serde(deserialize_with = "validate_jsonrpc_version")]
7692
pub jsonrpc: String,
7793
/// Response identifier - must match the request id per JSON-RPC 2.0
7894
pub id: RequestId,
@@ -298,6 +314,34 @@ mod tests {
298314
assert_eq!(parsed.method, "initialize");
299315
}
300316

317+
#[test]
318+
fn test_jsonrpc_request_rejects_wrong_version() {
319+
let request = json!({
320+
"jsonrpc": "1.0",
321+
"method": "initialize",
322+
"params": {},
323+
"id": 1
324+
});
325+
326+
let parsed: Result<JsonRpcRequest, _> = from_value(request);
327+
assert!(parsed.is_err(), "expected jsonrpc version to be rejected");
328+
}
329+
330+
#[test]
331+
fn test_jsonrpc_request_rejects_missing_version() {
332+
let request = json!({
333+
"method": "initialize",
334+
"params": {},
335+
"id": 1
336+
});
337+
338+
let parsed: Result<JsonRpcRequest, _> = from_value(request);
339+
assert!(
340+
parsed.is_err(),
341+
"expected missing jsonrpc field to be rejected"
342+
);
343+
}
344+
301345
#[test]
302346
fn test_jsonrpc_request_with_null_id() {
303347
// Null IDs are valid in JSON-RPC 2.0 (but not MCP)

src/args.rs

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,8 @@ pub struct Cli {
8686
#[arg(long)]
8787
pub filter: Option<String>,
8888

89-
/// Kill multiple processes matching the filter
90-
#[arg(long, requires = "filter")]
89+
/// Kill multiple processes matching the filter or GPU
90+
#[arg(long)]
9191
pub batch: bool,
9292

9393
/// Show container information for processes
@@ -628,6 +628,14 @@ mod tests {
628628
assert!(cli.force);
629629
}
630630

631+
#[test]
632+
fn test_kill_batch_with_gpu() {
633+
let cli = Cli::try_parse_from(["gpukill", "--kill", "--batch", "--gpu", "0"]).unwrap();
634+
assert!(cli.kill);
635+
assert!(cli.batch);
636+
assert_eq!(cli.gpu, Some(0));
637+
}
638+
631639
#[test]
632640
fn test_reset_single_gpu() {
633641
let cli = Cli::try_parse_from(["gpukill", "--reset", "--gpu", "0"]).unwrap();

src/config.rs

Lines changed: 67 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -88,42 +88,7 @@ impl ConfigManager {
8888
/// Load configuration from environment variables
8989
pub fn load_from_env() -> Self {
9090
let mut config = Config::default();
91-
92-
// Override with environment variables if present
93-
if let Ok(log_level) = std::env::var("GPUKILL_LOG_LEVEL") {
94-
config.log_level = log_level;
95-
}
96-
97-
if let Ok(output_format) = std::env::var("GPUKILL_OUTPUT_FORMAT") {
98-
config.output_format = output_format;
99-
}
100-
101-
if let Ok(timeout) = std::env::var("GPUKILL_DEFAULT_TIMEOUT") {
102-
if let Ok(timeout_secs) = timeout.parse::<u16>() {
103-
config.default_timeout_secs = timeout_secs;
104-
}
105-
}
106-
107-
if let Ok(show_details) = std::env::var("GPUKILL_SHOW_DETAILS") {
108-
config.show_details = show_details.parse().unwrap_or(false);
109-
}
110-
111-
if let Ok(watch_interval) = std::env::var("GPUKILL_WATCH_INTERVAL") {
112-
if let Ok(interval_secs) = watch_interval.parse::<u64>() {
113-
config.watch_interval_secs = interval_secs;
114-
}
115-
}
116-
117-
if let Ok(table_width) = std::env::var("GPUKILL_TABLE_WIDTH") {
118-
if let Ok(width) = table_width.parse::<usize>() {
119-
config.table_width = width;
120-
}
121-
}
122-
123-
if let Ok(use_colors) = std::env::var("GPUKILL_USE_COLORS") {
124-
config.use_colors = use_colors.parse().unwrap_or(true);
125-
}
126-
91+
apply_env_overrides(&mut config);
12792
Self { config }
12893
}
12994

@@ -181,20 +146,55 @@ impl ConfigManager {
181146
}
182147
}
183148

184-
/// Get configuration with fallback chain
185-
pub fn get_config(config_path: Option<String>) -> Result<ConfigManager> {
186-
// 1. Try to load from specified path
187-
if let Some(path) = config_path {
188-
return ConfigManager::load_from_file(path);
149+
fn apply_env_overrides(config: &mut Config) {
150+
// Override with environment variables if present
151+
if let Ok(log_level) = std::env::var("GPUKILL_LOG_LEVEL") {
152+
config.log_level = log_level;
153+
}
154+
155+
if let Ok(output_format) = std::env::var("GPUKILL_OUTPUT_FORMAT") {
156+
config.output_format = output_format;
157+
}
158+
159+
if let Ok(timeout) = std::env::var("GPUKILL_DEFAULT_TIMEOUT") {
160+
if let Ok(timeout_secs) = timeout.parse::<u16>() {
161+
config.default_timeout_secs = timeout_secs;
162+
}
189163
}
190164

191-
// 2. Try to load from default location
192-
if let Ok(config_manager) = ConfigManager::load_default() {
193-
return Ok(config_manager);
165+
if let Ok(show_details) = std::env::var("GPUKILL_SHOW_DETAILS") {
166+
config.show_details = show_details.parse().unwrap_or(false);
194167
}
195168

196-
// 3. Load from environment variables
197-
Ok(ConfigManager::load_from_env())
169+
if let Ok(watch_interval) = std::env::var("GPUKILL_WATCH_INTERVAL") {
170+
if let Ok(interval_secs) = watch_interval.parse::<u64>() {
171+
config.watch_interval_secs = interval_secs;
172+
}
173+
}
174+
175+
if let Ok(table_width) = std::env::var("GPUKILL_TABLE_WIDTH") {
176+
if let Ok(width) = table_width.parse::<usize>() {
177+
config.table_width = width;
178+
}
179+
}
180+
181+
if let Ok(use_colors) = std::env::var("GPUKILL_USE_COLORS") {
182+
config.use_colors = use_colors.parse().unwrap_or(true);
183+
}
184+
}
185+
186+
/// Get configuration with fallback chain
187+
pub fn get_config(config_path: Option<String>) -> Result<ConfigManager> {
188+
let mut config = if let Some(path) = config_path {
189+
ConfigManager::load_from_file(path)?.config
190+
} else if let Ok(config_manager) = ConfigManager::load_default() {
191+
config_manager.config
192+
} else {
193+
Config::default()
194+
};
195+
196+
apply_env_overrides(&mut config);
197+
Ok(ConfigManager { config })
198198
}
199199

200200
#[cfg(test)]
@@ -239,4 +239,25 @@ mod tests {
239239
let manager = ConfigManager::new();
240240
assert_eq!(manager.config().log_level, "info");
241241
}
242+
243+
#[test]
244+
fn test_env_overrides_config_file() {
245+
let mut config = Config::default();
246+
config.log_level = "warn".to_string();
247+
config.watch_interval_secs = 2;
248+
let toml_str = toml::to_string_pretty(&config).unwrap();
249+
250+
let temp_file = NamedTempFile::new().unwrap();
251+
std::fs::write(temp_file.path(), toml_str).unwrap();
252+
253+
std::env::set_var("GPUKILL_LOG_LEVEL", "debug");
254+
std::env::set_var("GPUKILL_WATCH_INTERVAL", "10");
255+
256+
let loaded = get_config(Some(temp_file.path().to_string_lossy().to_string())).unwrap();
257+
assert_eq!(loaded.config().log_level, "debug");
258+
assert_eq!(loaded.config().watch_interval_secs, 10);
259+
260+
std::env::remove_var("GPUKILL_LOG_LEVEL");
261+
std::env::remove_var("GPUKILL_WATCH_INTERVAL");
262+
}
242263
}

src/coordinator.rs

Lines changed: 101 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,7 @@ impl CoordinatorState {
239239
let snapshots = self.snapshots.read().await;
240240
let mut blocked_gpus = Vec::new();
241241
// Track unique (node_id, gpu_index) pairs per user to correctly count GPUs
242-
// Tuple: (unique_gpus, memory, utilization, process_count)
242+
// Tuple: (unique_gpus, memory, utilization_sum, process_count)
243243
#[allow(clippy::type_complexity)]
244244
let mut user_stats: HashMap<String, (HashSet<(String, u16)>, u32, f32, u32)> =
245245
HashMap::new();
@@ -279,9 +279,10 @@ impl CoordinatorState {
279279
0,
280280
));
281281
// Track unique (node_id, gpu_index) pairs to correctly count GPUs
282-
entry.0.insert((node_id.clone(), gpu.gpu_index));
282+
if entry.0.insert((node_id.clone(), gpu.gpu_index)) {
283+
entry.2 += gpu.util_pct; // utilization (sum per unique GPU)
284+
}
283285
entry.1 += process.used_mem_mb; // memory
284-
entry.2 += gpu.util_pct; // utilization (will average later)
285286
entry.3 += 1; // process_count
286287
}
287288
}
@@ -295,7 +296,11 @@ impl CoordinatorState {
295296
user,
296297
gpu_count: gpu_set.len() as u32, // Count unique GPUs
297298
total_memory_mb,
298-
avg_utilization: total_util / process_count as f32,
299+
avg_utilization: if gpu_set.is_empty() {
300+
0.0
301+
} else {
302+
total_util / gpu_set.len() as f32
303+
},
299304
process_count,
300305
},
301306
)
@@ -961,4 +966,96 @@ mod tests {
961966
"Bob uses 4 unique GPUs across 2 nodes"
962967
);
963968
}
969+
970+
#[tokio::test]
971+
async fn test_contention_analysis_avg_utilization_unique_gpus() {
972+
let state = CoordinatorState::new();
973+
974+
let snapshot = NodeSnapshot {
975+
node_id: "test-node".to_string(),
976+
hostname: "test-host".to_string(),
977+
timestamp: Utc::now(),
978+
gpus: vec![
979+
GpuSnapshot {
980+
gpu_index: 0,
981+
name: "GPU 0".to_string(),
982+
vendor: GpuVendor::Nvidia,
983+
mem_used_mb: 8000,
984+
mem_total_mb: 10000,
985+
util_pct: 90.0,
986+
temp_c: 75,
987+
power_w: 200.0,
988+
ecc_volatile: None,
989+
pids: 2,
990+
top_proc: None,
991+
},
992+
GpuSnapshot {
993+
gpu_index: 1,
994+
name: "GPU 1".to_string(),
995+
vendor: GpuVendor::Nvidia,
996+
mem_used_mb: 3000,
997+
mem_total_mb: 10000,
998+
util_pct: 30.0,
999+
temp_c: 65,
1000+
power_w: 100.0,
1001+
ecc_volatile: None,
1002+
pids: 1,
1003+
top_proc: None,
1004+
},
1005+
],
1006+
processes: vec![
1007+
GpuProc {
1008+
gpu_index: 0,
1009+
pid: 1001,
1010+
user: "charlie".to_string(),
1011+
proc_name: "train1".to_string(),
1012+
used_mem_mb: 4000,
1013+
start_time: "2025-09-20T01:00:00Z".to_string(),
1014+
container: None,
1015+
},
1016+
GpuProc {
1017+
gpu_index: 0,
1018+
pid: 1002,
1019+
user: "charlie".to_string(),
1020+
proc_name: "train2".to_string(),
1021+
used_mem_mb: 4000,
1022+
start_time: "2025-09-20T01:00:00Z".to_string(),
1023+
container: None,
1024+
},
1025+
GpuProc {
1026+
gpu_index: 1,
1027+
pid: 1003,
1028+
user: "charlie".to_string(),
1029+
proc_name: "train3".to_string(),
1030+
used_mem_mb: 3000,
1031+
start_time: "2025-09-20T01:00:00Z".to_string(),
1032+
container: None,
1033+
},
1034+
],
1035+
status: NodeStatus::Online,
1036+
};
1037+
1038+
state
1039+
.update_snapshot("test-node".to_string(), snapshot)
1040+
.await
1041+
.unwrap();
1042+
1043+
let analysis = state.get_contention_analysis().await.unwrap();
1044+
1045+
let charlie_stats = analysis
1046+
.top_users
1047+
.iter()
1048+
.find(|u| u.user == "charlie")
1049+
.expect("Charlie should be in top users");
1050+
1051+
assert_eq!(charlie_stats.gpu_count, 2, "Charlie uses 2 unique GPUs");
1052+
assert_eq!(charlie_stats.process_count, 3, "Charlie has 3 processes");
1053+
1054+
let expected_avg = 60.0;
1055+
let diff = (charlie_stats.avg_utilization - expected_avg).abs();
1056+
assert!(
1057+
diff < 0.01,
1058+
"Average utilization should be calculated per unique GPU"
1059+
);
1060+
}
9641061
}

0 commit comments

Comments (0)