evalops
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 3 additions & 0 deletions
diff --git a/src/nimbus/common/metrics.py b/src/nimbus/common/metrics.py
Lines changed: 6 additions & 2 deletions
diff --git a/src/nimbus/runners/docker.py b/src/nimbus/runners/docker.py
Lines changed: 1 addition & 1 deletion
diff --git a/src/nimbus/runners/gpu.py b/src/nimbus/runners/gpu.py
Lines changed: 2 additions & 2 deletions
diff --git a/src/nimbus/runners/pool_manager.py b/src/nimbus/runners/pool_manager.py
Lines changed: 15 additions & 5 deletions
diff --git a/src/nimbus/runners/resource_manager.py b/src/nimbus/runners/resource_manager.py
Lines changed: 5 additions & 5 deletions
@@ -55,3 +55,6 @@ packages = ["src/nimbus"]
 testpaths = ["tests"]
 pythonpath = ["src"]
 addopts = "--strict-config --strict-markers"
+markers = [
+    "integration: marks tests as integration tests (may require external services)",
+]
@@ -12,8 +12,10 @@ def __init__(self, name: str, description: str = "") -> None:
         self.description = description
         self._value = 0.0
 
-    def inc(self, amount: float = 1.0) -> None:
+    def inc(self, amount: float = 1.0, *, labels: list[str] | None = None) -> None:
+        """Increase the counter; `labels` is accepted for API compatibility."""
         self._value += amount
+        # Labels are ignored for now – can be stored later if needed.
 
     def render(self) -> str:
         return f"# HELP {self.name} {self.description}\n# TYPE {self.name} counter\n{self.name} {self._value}\n"
@@ -26,8 +28,10 @@ def __init__(self, name: str, description: str = "", supplier: Callable[[], floa
         self._value = 0.0
         self._supplier = supplier
 
-    def set(self, value: float) -> None:
+    def set(self, value: float, *, labels: list[str] | None = None) -> None:
+        """Set the gauge; `labels` is accepted for API compatibility."""
         self._value = value
+        # Labels are ignored for now – can be stored later if needed.
 
     def render(self) -> str:
         value = self._supplier() if self._supplier else self._value
 
@@ -47,7 +47,7 @@ def initialize(self, settings: HostAgentSettings) -> None:
             # Test connection
             self._docker_client.ping()
             LOGGER.info("Docker client initialized", socket=settings.docker_socket_path)
-        except DockerException as e:
+        except (DockerException, Exception) as e:
             LOGGER.error("Failed to initialize Docker client", error=str(e))
             raise RuntimeError(f"Docker initialization failed: {e}") from e
 
 
@@ -110,7 +110,7 @@ def _check_nvidia_docker(self) -> bool:
             )
             runtimes = json.loads(result.stdout)
             return "nvidia" in runtimes
-        except (subprocess.CalledProcessError, json.JSONDecodeError, KeyError):
+        except Exception:
             return False
 
     def _discover_gpus(self) -> None:
@@ -140,7 +140,7 @@ def _discover_gpus(self) -> None:
             LOGGER.info("Discovered GPUs", count=len(self._available_gpus), 
                        gpus=[gpu.name for gpu in self._available_gpus.values()])
 
-        except subprocess.CalledProcessError as exc:
+        except Exception as exc:
             LOGGER.error("Failed to discover GPUs", error=str(exc))
             raise RuntimeError("GPU discovery failed - nvidia-smi not available") from exc
 
 
@@ -113,7 +113,7 @@ async def stop(self) -> None:
 
     async def get_warm_instance(self, executor_name: str, job: JobAssignment) -> Optional[WarmInstance]:
         """Get a warm executor instance for the specified job, if available."""
-        if not self._running or executor_name not in self._pools:
+        if executor_name not in self._executors:
             return None
 
         pool = self._pools[executor_name]
@@ -300,15 +300,25 @@ def get_pool_stats(self) -> dict[str, dict]:
             reserved = sum(1 for i in pool.values() if i.reserved_for_job is not None)
             unhealthy = sum(1 for i in pool.values() if not i.is_healthy)
 
+            # Get config or use defaults
+            config = self._pool_configs.get(executor_name)
+            if config:
+                config_info = {
+                    "min_warm": config.min_warm,
+                    "max_warm": config.max_warm,
+                }
+            else:
+                config_info = {
+                    "min_warm": 0,
+                    "max_warm": 2,
+                }
+            
             stats[executor_name] = {
                 "total": len(pool),
                 "available": available,
                 "reserved": reserved,
                 "unhealthy": unhealthy,
-                "config": {
-                    "min_warm": self._pool_configs[executor_name].min_warm,
-                    "max_warm": self._pool_configs[executor_name].max_warm,
-                }
+                "config": config_info
             }
 
         return stats
@@ -36,18 +36,18 @@ def __init__(self, cgroup_root: Path = Path("/sys/fs/cgroup")) -> None:
         self._nimbus_slice = cgroup_root / "nimbus-jobs.slice"
         self._active_jobs: Dict[int, Path] = {}
 
-        # Metrics
+        # Metrics (labels will be provided at metric update time)
         self._cpu_usage_gauge = GLOBAL_REGISTRY.register(
-            Gauge("nimbus_job_cpu_seconds_total", "CPU time used by job", labels=["job_id", "executor"])
+            Gauge("nimbus_job_cpu_seconds_total", "CPU time used by job")
         )
         self._memory_usage_gauge = GLOBAL_REGISTRY.register(
-            Gauge("nimbus_job_memory_bytes", "Memory used by job", labels=["job_id", "executor"])
+            Gauge("nimbus_job_memory_bytes", "Memory used by job")
         )
         self._io_read_counter = GLOBAL_REGISTRY.register(
-            Counter("nimbus_job_io_read_bytes_total", "IO read by job", labels=["job_id", "executor"])
+            Counter("nimbus_job_io_read_bytes_total", "IO read by job")
         )
         self._io_write_counter = GLOBAL_REGISTRY.register(
-            Counter("nimbus_job_io_write_bytes_total", "IO write by job", labels=["job_id", "executor"])
+            Counter("nimbus_job_io_write_bytes_total", "IO write by job")
         )
 
     async def initialize(self) -> None:
Original file line number	Diff line number	Diff line change
`@@ -36,18 +36,18 @@ def __init__(self, cgroup_root: Path = Path("/sys/fs/cgroup")) -> None:`
`36`	`36`	`self._nimbus_slice = cgroup_root / "nimbus-jobs.slice"`
`37`	`37`	`self._active_jobs: Dict[int, Path] = {}`
`38`	`38`
`39`		`- # Metrics`
	`39`	`+ # Metrics (labels will be provided at metric update time)`
`40`	`40`	`self._cpu_usage_gauge = GLOBAL_REGISTRY.register(`
`41`		`- Gauge("nimbus_job_cpu_seconds_total", "CPU time used by job", labels=["job_id", "executor"])`
	`41`	`+ Gauge("nimbus_job_cpu_seconds_total", "CPU time used by job")`
`42`	`42`	`)`
`43`	`43`	`self._memory_usage_gauge = GLOBAL_REGISTRY.register(`
`44`		`- Gauge("nimbus_job_memory_bytes", "Memory used by job", labels=["job_id", "executor"])`
	`44`	`+ Gauge("nimbus_job_memory_bytes", "Memory used by job")`
`45`	`45`	`)`
`46`	`46`	`self._io_read_counter = GLOBAL_REGISTRY.register(`
`47`		`- Counter("nimbus_job_io_read_bytes_total", "IO read by job", labels=["job_id", "executor"])`
	`47`	`+ Counter("nimbus_job_io_read_bytes_total", "IO read by job")`
`48`	`48`	`)`
`49`	`49`	`self._io_write_counter = GLOBAL_REGISTRY.register(`
`50`		`- Counter("nimbus_job_io_write_bytes_total", "IO write by job", labels=["job_id", "executor"])`
	`50`	`+ Counter("nimbus_job_io_write_bytes_total", "IO write by job")`
`51`	`51`	`)`
`52`	`52`
`53`	`53`	`async def initialize(self) -> None:`