ml-energy
diff --git a/tests/optimizer/test_power_limit_optimizer.py b/tests/optimizer/test_power_limit_optimizer.py
Lines changed: 4 additions & 0 deletions
diff --git a/zeus/device/cpu/rapl.py b/zeus/device/cpu/rapl.py
Lines changed: 29 additions & 15 deletions
diff --git a/zeus/device/gpu/nvidia.py b/zeus/device/gpu/nvidia.py
Lines changed: 41 additions & 11 deletions
diff --git a/zeus/monitor/power_streaming.py b/zeus/monitor/power_streaming.py
Lines changed: 9 additions & 9 deletions
@@ -127,6 +127,10 @@ def test_power_limit_optimizer(
     # Mock away the atexit hook, which raises an NVML error when testing finishes.
     mocker.patch("zeus.optimizer.power_limit.atexit", autospec=True)
 
+    # Disable missing `SYS_ADMIN` capability warning during testing.
+    for gpu in get_gpus().gpus:
+        gpu._disable_sys_admin_warning = True
+
     monitor = ReplayZeusMonitor(
         log_file=replay_log.log_file,
         ignore_sync_execution=True,
 
@@ -286,40 +286,54 @@ def __init__(
         self.zeusd_sock_path = zeusd_sock_path
 
         self._client = httpx.Client(transport=httpx.HTTPTransport(uds=zeusd_sock_path))
-        self._url_prefix = f"http://zeusd/cpu/{cpu_index}"
 
         self.dram_available = self._supports_get_dram_energy_consumption()
 
     def _supports_get_dram_energy_consumption(self) -> bool:
-        """Calls zeusd to return if the specified CPU supports DRAM energy monitoring."""
-        resp = self._client.get(
-            self._url_prefix + "/supports_dram_energy",
-        )
+        """Query the /discover endpoint to check DRAM energy support for this CPU."""
+        resp = self._client.get("http://zeusd/discover")
         if resp.status_code != 200:
-            raise ZeusdError(f"Failed to query Zeusd whether DRAM energy is supported: {resp.text}")
+            raise ZeusdError(f"Failed to query Zeusd discovery endpoint: {resp.text}")
         data = resp.json()
         dram_available = data.get("dram_available")
         if dram_available is None:
-            raise ZeusdError("Failed to get whether DRAM energy is supported.")
-        return dram_available
+            raise ZeusdError("Discovery response missing 'dram_available' field.")
+        cpu_ids = data.get("cpu_ids", [])
+        try:
+            idx = cpu_ids.index(self.cpu_index)
+        except ValueError as e:
+            raise ZeusdError(f"CPU {self.cpu_index} not found in discovery response (available: {cpu_ids})") from e
+        if len(cpu_ids) != len(dram_available):
+            raise ZeusdError(
+                f"Discovery response has mismatched lengths: "
+                f"{len(cpu_ids)} cpu_ids vs {len(dram_available)} dram_available entries"
+            )
+        return dram_available[idx]
 
     def get_total_energy_consumption(self) -> CpuDramMeasurement:
         """Returns the total energy consumption of the specified powerzone. Units: mJ."""
-        resp = self._client.post(
-            self._url_prefix + "/get_index_energy",
-            json={
-                "cpu": True,
-                "dram": True,
+        resp = self._client.get(
+            "http://zeusd/cpu/get_cumulative_energy",
+            params={
+                "cpu_ids": str(self.cpu_index),
+                "cpu": "true",
+                "dram": "true",
             },
         )
         if resp.status_code != 200:
             raise ZeusdError(f"Failed to get total energy consumption: {resp.text}")
 
         data = resp.json()
-        cpu_mj = data["cpu_energy_uj"] / 1000
+        cpu_data = data.get(str(self.cpu_index))
+        if cpu_data is None:
+            raise ZeusdError(f"CPU {self.cpu_index} not found in response")
+        cpu_uj = cpu_data.get("cpu_energy_uj")
+        if cpu_uj is None:
+            raise ZeusdError(f"CPU {self.cpu_index}: cpu_energy_uj is null in response")
+        cpu_mj = cpu_uj / 1000
 
         dram_mj = None
-        dram_uj = data.get("dram_energy_uj")
+        dram_uj = cpu_data.get("dram_energy_uj")
         if dram_uj is None:
             if self.dram_available:
                 raise ZeusdError("DRAM energy should be available but no measurement was found")
 
@@ -304,7 +304,7 @@ def __init__(
         self.zeusd_sock_path = zeusd_sock_path
 
         self._client = httpx.Client(transport=httpx.HTTPTransport(uds=zeusd_sock_path))
-        self._url_prefix = f"http://zeusd/gpu/{gpu_index}"
+        self._gpu_index = gpu_index
 
     @property
     def supports_nonblocking_setters(self) -> bool:
@@ -318,8 +318,12 @@ def set_power_management_limit(self, power_limit_mw: int, block: bool = True) ->
             return
 
         resp = self._client.post(
-            self._url_prefix + "/set_power_limit",
-            json=dict(power_limit_mw=power_limit_mw, block=block),
+            "http://zeusd/gpu/set_power_limit",
+            params={
+                "gpu_ids": str(self._gpu_index),
+                "power_limit_mw": str(power_limit_mw),
+                "block": "true" if block else "false",
+            },
         )
         if resp.status_code != 200:
             raise ZeusdError(f"Failed to set power management limit: {resp.text}")
@@ -336,8 +340,12 @@ def reset_power_management_limit(self, block: bool = True) -> None:
     def set_persistence_mode(self, enabled: bool, block: bool = True) -> None:
         """Set persistence mode."""
         resp = self._client.post(
-            self._url_prefix + "/set_persistence_mode",
-            json=dict(enabled=enabled, block=block),
+            "http://zeusd/gpu/set_persistence_mode",
+            params={
+                "gpu_ids": str(self._gpu_index),
+                "enabled": "true" if enabled else "false",
+                "block": "true" if block else "false",
+            },
         )
         if resp.status_code != 200:
             raise ZeusdError(f"Failed to set persistence mode: {resp.text}")
@@ -346,31 +354,53 @@ def set_persistence_mode(self, enabled: bool, block: bool = True) -> None:
     def set_memory_locked_clocks(self, min_clock_mhz: int, max_clock_mhz: int, block: bool = True) -> None:
         """Lock the memory clock to a specified range. Units: MHz."""
         resp = self._client.post(
-            self._url_prefix + "/set_mem_locked_clocks",
-            json=dict(min_clock_mhz=min_clock_mhz, max_clock_mhz=max_clock_mhz, block=block),
+            "http://zeusd/gpu/set_mem_locked_clocks",
+            params={
+                "gpu_ids": str(self._gpu_index),
+                "min_clock_mhz": str(min_clock_mhz),
+                "max_clock_mhz": str(max_clock_mhz),
+                "block": "true" if block else "false",
+            },
         )
         if resp.status_code != 200:
             raise ZeusdError(f"Failed to set memory locked clocks: {resp.text}")
-        logger.debug("Took %s ms to set memory locked clocks", resp.elapsed.microseconds / 1000)
+        logger.debug("Took %s ms to set memory locked clocks", resp.elapsed.total_seconds() * 1000)
 
     def reset_memory_locked_clocks(self, block: bool = True) -> None:
         """Reset the locked memory clocks to the default."""
-        resp = self._client.post(self._url_prefix + "/reset_mem_locked_clocks", json=dict(block=block))
+        resp = self._client.post(
+            "http://zeusd/gpu/reset_mem_locked_clocks",
+            params={
+                "gpu_ids": str(self._gpu_index),
+                "block": "true" if block else "false",
+            },
+        )
         if resp.status_code != 200:
             raise ZeusdError(f"Failed to reset memory locked clocks: {resp.text}")
 
     def set_gpu_locked_clocks(self, min_clock_mhz: int, max_clock_mhz: int, block: bool = True) -> None:
         """Lock the GPU clock to a specified range. Units: MHz."""
         resp = self._client.post(
-            self._url_prefix + "/set_gpu_locked_clocks",
-            json=dict(min_clock_mhz=min_clock_mhz, max_clock_mhz=max_clock_mhz, block=block),
+            "http://zeusd/gpu/set_gpu_locked_clocks",
+            params={
+                "gpu_ids": str(self._gpu_index),
+                "min_clock_mhz": str(min_clock_mhz),
+                "max_clock_mhz": str(max_clock_mhz),
+                "block": "true" if block else "false",
+            },
         )
         if resp.status_code != 200:
             raise ZeusdError(f"Failed to set GPU locked clocks: {resp.text}")
 
     def reset_gpu_locked_clocks(self, block: bool = True) -> None:
         """Reset the locked GPU clocks to the default."""
-        resp = self._client.post(self._url_prefix + "/reset_gpu_locked_clocks", json=dict(block=block))
+        resp = self._client.post(
+            "http://zeusd/gpu/reset_gpu_locked_clocks",
+            params={
+                "gpu_ids": str(self._gpu_index),
+                "block": "true" if block else "false",
+            },
+        )
         if resp.status_code != 200:
             raise ZeusdError(f"Failed to reset GPU locked clocks: {resp.text}")
 
 
@@ -315,7 +315,7 @@ def _check_server_reachable(self, server: ZeusdTcpConfig | ZeusdUdsConfig) -> No
             ConnectionError: If the server is not reachable.
             ValueError: If requested GPU or CPU indices are not available.
         """
-        url = self._url(server, "/gpu/power")
+        url = self._url(server, "/discover")
         try:
             with self._make_http_client(server, timeout=5.0) as client:
                 response = client.get(url)
@@ -329,7 +329,7 @@ def _check_server_reachable(self, server: ZeusdTcpConfig | ZeusdUdsConfig) -> No
             raise ConnectionError(f"zeusd at {server.key} returned HTTP {e.response.status_code}") from e
 
         if server.gpu_indices is not None:
-            available = {int(k) for k in data.get("power_mw", {})}
+            available = set(data.get("gpu_ids", []))
             requested = set(server.gpu_indices)
             missing = requested - available
             if missing:
@@ -344,16 +344,16 @@ def _check_cpu_available(self, server: ZeusdTcpConfig | ZeusdUdsConfig) -> bool:
         Raises:
             ValueError: If requested CPU indices are not available on the server.
         """
-        url = self._url(server, "/cpu/power")
+        url = self._url(server, "/discover")
         try:
             with self._make_http_client(server, timeout=5.0) as client:
                 response = client.get(url)
                 response.raise_for_status()
                 data = response.json()
-                power_mw = data.get("power_mw", {})
-                if power_mw:
+                cpu_ids = data.get("cpu_ids", [])
+                if cpu_ids:
                     if server.cpu_indices is not None:
-                        available = {int(k) for k in power_mw}
+                        available = set(cpu_ids)
                         requested = set(server.cpu_indices)
                         missing = requested - available
                         if missing:
@@ -365,12 +365,12 @@ def _check_cpu_available(self, server: ZeusdTcpConfig | ZeusdUdsConfig) -> bool:
                     return True
                 return False
         except (httpx.RequestError, httpx.HTTPStatusError):
-            logger.warning("Failed to probe CPU power endpoint on %s", server.key, exc_info=True)
+            logger.warning("Failed to probe discovery endpoint on %s", server.key, exc_info=True)
             return False
 
     def _gpu_stream_loop(self, server: ZeusdTcpConfig | ZeusdUdsConfig) -> None:
         """Background thread: stream GPU power from a single server."""
-        base_url = self._url(server, "/gpu/power/stream")
+        base_url = self._url(server, "/gpu/stream_power")
         # User specified specific indices to stream
         if server.gpu_indices is not None:
             ids_param = ",".join(str(i) for i in server.gpu_indices)
@@ -382,7 +382,7 @@ def _gpu_stream_loop(self, server: ZeusdTcpConfig | ZeusdUdsConfig) -> None:
 
     def _cpu_stream_loop(self, server: ZeusdTcpConfig | ZeusdUdsConfig) -> None:
         """Background thread: stream CPU power from a single server."""
-        base_url = self._url(server, "/cpu/power/stream")
+        base_url = self._url(server, "/cpu/stream_power")
         # User specified specific indices to stream
         if server.cpu_indices is not None:
             ids_param = ",".join(str(i) for i in server.cpu_indices)