[UX] Minor improvements of dstack metrics (#2667)

peterschmidt85 · jvstme · web-flow · commit f0ddae88bec4 · 2025-05-21T18:45:12.000+02:00
Co-authored-by: jvstme <36324149+jvstme@users.noreply.github.com>
diff --git a/src/dstack/_internal/cli/commands/metrics.py b/src/dstack/_internal/cli/commands/metrics.py
@@ -39,8 +39,6 @@ def _command(self, args: argparse.Namespace):
         run = self.api.runs.get(run_name=args.run_name)
         if run is None:
             raise CLIError(f"Run {args.run_name} not found")
-        if run.status.is_finished():
-            raise CLIError(f"Run {args.run_name} is finished")
         metrics = _get_run_jobs_metrics(api=self.api, run=run)
 
         if not args.watch:
@@ -55,8 +53,6 @@ def _command(self, args: argparse.Namespace):
                     run = self.api.runs.get(run_name=args.run_name)
                     if run is None:
                         raise CLIError(f"Run {args.run_name} not found")
-                    if run.status.is_finished():
-                        raise CLIError(f"Run {args.run_name} is finished")
                     metrics = _get_run_jobs_metrics(api=self.api, run=run)
         except KeyboardInterrupt:
             pass
@@ -78,11 +74,12 @@ def _get_run_jobs_metrics(api: Client, run: Run) -> List[JobMetrics]:
 def _get_metrics_table(run: Run, metrics: List[JobMetrics]) -> Table:
     table = Table(box=None)
     table.add_column("NAME", style="bold", no_wrap=True)
+    table.add_column("STATUS")
     table.add_column("CPU")
     table.add_column("MEMORY")
     table.add_column("GPU")
 
-    run_row: Dict[Union[str, int], Any] = {"NAME": run.name}
+    run_row: Dict[Union[str, int], Any] = {"NAME": run.name, "STATUS": run.status.value}
     if len(run._run.jobs) != 1:
         add_row_from_dict(table, run_row)
 
@@ -101,9 +98,9 @@ def _get_metrics_table(run: Run, metrics: List[JobMetrics]) -> Table:
             cpu_usage = f"{cpu_usage:.0f}%"
         memory_usage = _get_metric_value(job_metrics, "memory_working_set_bytes")
         if memory_usage is not None:
-            memory_usage = f"{round(memory_usage / 1024 / 1024)}MB"
+            memory_usage = _format_memory(memory_usage, 2)
             if resources is not None:
-                memory_usage += f"/{resources.memory_mib}MB"
+                memory_usage += f"/{_format_memory(resources.memory_mib * 1024 * 1024, 2)}"
         gpu_metrics = ""
         gpus_detected_num = _get_metric_value(job_metrics, "gpus_detected_num")
         if gpus_detected_num is not None:
@@ -113,13 +110,16 @@ def _get_metrics_table(run: Run, metrics: List[JobMetrics]) -> Table:
                 if gpu_memory_usage is not None:
                     if i != 0:
                         gpu_metrics += "\n"
-                    gpu_metrics += f"#{i} {round(gpu_memory_usage / 1024 / 1024)}MB"
+                    gpu_metrics += f"gpu={i} mem={_format_memory(gpu_memory_usage, 2)}"
                     if resources is not None:
-                        gpu_metrics += f"/{resources.gpus[i].memory_mib}MB"
-                    gpu_metrics += f" {gpu_util_percent}% Util"
+                        gpu_metrics += (
+                            f"/{_format_memory(resources.gpus[i].memory_mib * 1024 * 1024, 2)}"
+                        )
+                    gpu_metrics += f" util={gpu_util_percent}%"
 
         job_row: Dict[Union[str, int], Any] = {
             "NAME": f"  replica={job.job_spec.replica_num} job={job.job_spec.job_num}",
+            "STATUS": job.job_submissions[-1].status.value,
             "CPU": cpu_usage or "-",
             "MEMORY": memory_usage or "-",
             "GPU": gpu_metrics or "-",
@@ -136,3 +136,18 @@ def _get_metric_value(job_metrics: JobMetrics, name: str) -> Optional[Any]:
         if metric.name == name:
             return metric.values[-1]
     return None
+
+
+def _format_memory(memory_bytes: int, decimal_places: int) -> str:
+    """Format bytes as MB or GB; see tests/_internal/cli/commands/test_metrics.py for examples."""
+    memory_mb = memory_bytes / 1024 / 1024  # bytes -> MB, in 1024-based units
+    if memory_mb >= 1024:  # the unit is chosen before any rounding is applied
+        value = memory_mb / 1024
+        unit = "GB"
+    else:
+        value = memory_mb
+        unit = "MB"
+
+    if decimal_places == 0:  # whole number requested: no decimal point in the output
+        return f"{round(value)}{unit}"
+    return f"{value:.{decimal_places}f}".rstrip("0").rstrip(".") + unit  # drop trailing zeros
diff --git a/src/tests/_internal/cli/commands/test_metrics.py b/src/tests/_internal/cli/commands/test_metrics.py
@@ -0,0 +1,34 @@
+import pytest
+
+from dstack._internal.cli.commands.metrics import _format_memory
+
+
+@pytest.mark.parametrize(
+    "bytes_value,decimal_places,expected",
+    [
+        # Test MB values with different decimal places
+        (512 * 1024 * 1024, 0, "512MB"),  # exact MB, no decimals
+        (512 * 1024 * 1024, 2, "512MB"),  # exact MB, trailing zeros are stripped
+        (512.5 * 1024 * 1024, 0, "512MB"),  # round() sends the .5 tie to the even 512
+        (512.5 * 1024 * 1024, 2, "512.5MB"),  # decimal MB, 2 decimals
+        (512.5 * 1024 * 1024, 3, "512.5MB"),  # decimal MB, 3 decimals
+        (999 * 1024 * 1024, 0, "999MB"),  # below the 1024MB switch to GB, no decimals
+        (999 * 1024 * 1024, 2, "999MB"),  # below the 1024MB switch to GB, with decimals
+        # Test GB values with different decimal places
+        (1.5 * 1024 * 1024 * 1024, 0, "2GB"),  # round() sends the .5 tie to the even 2
+        (1.5 * 1024 * 1024 * 1024, 2, "1.5GB"),  # decimal GB, 2 decimals
+        (1.5 * 1024 * 1024 * 1024, 3, "1.5GB"),  # decimal GB, 3 decimals
+        (2 * 1024 * 1024 * 1024, 0, "2GB"),  # exact GB, no decimals
+        (2 * 1024 * 1024 * 1024, 2, "2GB"),  # exact GB, trailing zeros are stripped
+        # Test edge cases
+        (0, 0, "0MB"),  # zero bytes, no decimals
+        (0, 2, "0MB"),  # zero bytes, "0.00" is stripped to "0"
+        (1023 * 1024, 0, "1MB"),  # just under 1MB, rounds up to 1
+        (1023 * 1024, 2, "1MB"),  # just under 1MB, "1.00" is stripped to "1"
+        (1024 * 1024 * 1024 - 1, 0, "1024MB"),  # 1 byte under 1GB: stays MB, rounds to 1024
+        (1024 * 1024 * 1024 - 1, 2, "1024MB"),  # 1 byte under 1GB: stays MB, "1024.00" stripped
+    ],
+)
+def test_format_memory(bytes_value: int, decimal_places: int, expected: str):
+    result = _format_memory(bytes_value, decimal_places)
+    assert result == expected