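Add back br wait; trim comments, drop unused subprocess import, and port prom_metrics tests to PUSHGATEWAY_ADDR / pushadd_to_gateway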

memalhot · memalhot · commit 72c55419d528 · 2025-12-07T16:17:41.000-05:00
diff --git a/batchtools/br.py b/batchtools/br.py
@@ -21,7 +21,6 @@
 from .prom_metrics import (
     PROMETHEUS_INSTANCE,
     IN_PROGRESS,
-    # record helpers (emit both histogram + counter)
     record_batch_observation,
     record_queue_observation,
     record_wall_observation,
@@ -191,7 +190,6 @@ def run(args: argparse.Namespace):
                     job_name=job_name, wait=True, timeout=args.timeout
                 )
 
-            # Emit metrics if we captured any timing
             if (
                 run_elapsed is not None
                 or queue_wait is not None
@@ -222,8 +220,8 @@ def run(args: argparse.Namespace):
                 IN_PROGRESS.labels(**labels, result=result_phase).dec()
 
                 group = {
-                    "instance": PROMETHEUS_INSTANCE,  # e.g. "ope-test"
-                    "job_name": job_name            # e.g. "job-none-b799d46a6f995a9d8d58b6aff76abefc"
+                    "instance": PROMETHEUS_INSTANCE,  # oc project
+                    "job_name": job_name,  # unique name
                 }
 
                 push_registry_text(grouping_key=group)
diff --git a/batchtools/prom_metrics.py b/batchtools/prom_metrics.py
@@ -1,4 +1,3 @@
-import subprocess
 from datetime import datetime, timezone
 from .helpers import is_on_project
 
@@ -18,9 +17,8 @@
 
 LONG_JOB_BUCKETS = (1, 2, 5, 10, 20, 30, 60, 120, 180, 300, 600, 900, float("inf"))
 
-PUSHGATEWAY_ADDR = os.getenv(
-    "PUSHGATEWAY_ADDR", "pushgateway.ope-test.svc:9091"
-)
+PUSHGATEWAY_ADDR = os.getenv("PUSHGATEWAY_ADDR", "pushgateway.ope-test.svc:9091")
+
 
 def detect_instance() -> str:
     if shutil.which("oc") is None:
@@ -94,6 +92,7 @@ def detect_instance() -> str:
     registry=registry,
 )
 
+
 def now_rfc3339() -> str:
     return datetime.now(timezone.utc).isoformat()
 
@@ -141,6 +140,7 @@ def generate_metrics_text() -> tuple[str, str]:
     payload = generate_latest(registry)
     return payload.decode("utf-8"), CONTENT_TYPE_LATEST
 
+
 def push_registry_text(grouping_key: dict[str, str] | None = None) -> None:
     if not PUSHGATEWAY_ADDR:
         body, _ = generate_metrics_text()
diff --git a/pushgateway/prom.yaml b/pushgateway/prom.yaml
@@ -48,4 +48,4 @@ spec:
     - name: http
       port: 9091
       targetPort: 9091
-  type: ClusterIP
+  type: ClusterIP
diff --git a/tests/test_prom_metrics.py b/tests/test_prom_metrics.py
@@ -1,138 +1,83 @@
-import batchtools.prom_metrics as pm
-import types
-from datetime import datetime
+import re
 from unittest import mock
+import batchtools.prom_metrics as pm
 
 
-def test_now_rfc3339_parses_with_timezone():
-    s = pm.now_rfc3339()
-    # ISO 8601 parseable and includes timezone info (+00:00)
-    dt = datetime.fromisoformat(s)
-    assert dt.tzinfo is not None
-    assert s.endswith("+00:00")
-
-
-def _labels(job="job-x", gpu="none", queue="dummy-localqueue", instance="ns"):
-    return {"job": job, "gpu": gpu, "queue": queue, "instance": instance}
-
-
-def _registry_text() -> str:
-    body, ctype = pm.generate_metrics_text()
-    assert isinstance(body, str)
-    assert ctype.startswith("text/plain")
-    return body
+def _base_labels():
+    return {
+        "job_name": "job-none-xyz",
+        "gpu": "none",
+        "queue": "dummy-localqueue",
+        "instance": pm.PROMETHEUS_INSTANCE or "test",
+    }
 
 
 def test_record_batch_observation_updates_hist_and_counters():
-    labels = _labels()
-    pm.record_batch_observation(labels=labels, elapsed_sec=7.5, result="succeeded")
-    text = _registry_text()
-    assert "batch_duration_seconds_bucket" in text
-    assert "batch_duration_total_seconds" in text  # counter that sums durations
-    assert "batch_runs_total" in text
-    # spot-check that our label set appears on at least one line
-    for k, v in {**labels, "result": "succeeded"}.items():
-        assert f'{k}="{v}"' in text
+    labels = _base_labels()
+    pm.record_batch_observation(labels=labels, elapsed_sec=1.23, result="succeeded")
+    body, _ = pm.generate_metrics_text()
+    # Verify histogram/counter series were created with the expected labels
+    assert "batch_duration_seconds_bucket" in body
+    assert "batch_duration_total_seconds" in body
+    assert "batch_runs_total" in body
+    # spot-check that our labelset is present
+    assert 'job_name="job-none-xyz"' in body
+    assert 'queue="dummy-localqueue"' in body
+    assert 'result="succeeded"' in body
 
 
 def test_record_queue_and_wall_observation_update_metrics():
-    labels = _labels(job="job-y")
-    pm.record_queue_observation(labels=labels, elapsed_sec=3.0, result="succeeded")
-    pm.record_wall_observation(labels=labels, elapsed_sec=10.0, result="succeeded")
-    text = _registry_text()
-    assert "batch_queue_wait_seconds_bucket" in text
-    assert "batch_queue_wait_total_seconds" in text
-    assert "batch_total_wall_seconds_bucket" in text
-    assert "batch_total_wall_total_seconds" in text
-
-
-def _metric_samples(text: str, name: str):
-    """Yield lines for a metric family (not HELP/TYPE)."""
-    for line in text.splitlines():
-        if line.startswith(name) and not line.startswith("#"):
-            yield line
-
-
-def _any_sample_with_value(text: str, name: str, value: float) -> bool:
-    target = f" {value:.1f}"
-    return any(line.endswith(target) for line in _metric_samples(text, name))
+    labels = _base_labels()
+    pm.record_queue_observation(labels=labels, elapsed_sec=2.5, result="succeeded")
+    pm.record_wall_observation(labels=labels, elapsed_sec=3.5, result="succeeded")
+    body, _ = pm.generate_metrics_text()
+    assert "batch_queue_wait_seconds_bucket" in body
+    assert "batch_queue_wait_total_seconds" in body
+    assert "batch_total_wall_seconds_bucket" in body
+    assert "batch_total_wall_total_seconds" in body
 
 
 def test_set_in_progress_inc_and_dec_affect_gauge():
-    labels = _labels(job="job-z")
+    labels = _base_labels()
+    # inc
     pm.set_in_progress(labels=labels, result="running", inc=True)
-    text1 = _registry_text()
-    assert "batch_in_progress" in text1
-    assert _any_sample_with_value(text1, "batch_in_progress", 1.0)
-
+    body_inc, _ = pm.generate_metrics_text()
+    assert "batch_in_progress" in body_inc
+    assert 'result="running"' in body_inc
+    # dec (should not throw; series may remain visible in exposition)
     pm.set_in_progress(labels=labels, result="running", inc=False)
-    text2 = _registry_text()
-    assert _any_sample_with_value(text2, "batch_in_progress", 0.0)
+    body_dec, _ = pm.generate_metrics_text()
+    assert "batch_in_progress" in body_dec
 
 
-def test_generate_metrics_text_returns_valid_payload_and_ctype():
-    body, ctype = pm.generate_metrics_text()
-    assert "# HELP" in body
-    assert "# TYPE" in body
-    assert ctype.startswith("text/plain")
-
-
-def test_push_registry_text_no_url_prints_payload(capsys):
-    # Temporarily clear the push URL so it prints instead of POSTing
-    with mock.patch.object(pm, "PROMETHEUS_PUSH_URL", "", create=True):
-        pm.push_registry_text()
+def test_push_registry_text_no_url_prints_payload(capsys, monkeypatch):
+    # Simulate "no address configured" by blanking the module variable.
+    monkeypatch.setattr(pm, "PUSHGATEWAY_ADDR", "", raising=False)
+    pm.push_registry_text()
     out = capsys.readouterr().out
-    assert "PROMETHEUS_PUSH_URL not set" in out
-    assert "# HELP" in out  # the metrics text is printed
-
-
-def test_push_registry_text_posts_success(capsys):
-    with (
-        mock.patch.object(
-            pm, "PROMETHEUS_PUSH_URL", "http://example/metrics", create=True
-        ),
-        mock.patch.object(pm.subprocess, "run") as mock_run,
-    ):
-        mock_run.return_value = types.SimpleNamespace(returncode=0)
-
-        pm.push_registry_text()
-        out = capsys.readouterr().out
-        assert "metrics successfully pushed" in out
+    assert "PUSHGATEWAY_ADDR not set; below is the metrics payload:" in out
+    assert "# HELP" in out and "# TYPE" in out  # payload printed
 
-        # Verify we invoked curl with POST and sent our payload
-        assert mock_run.called
-        args, kwargs = mock_run.call_args
-        argv = args[0]
-        assert "curl" in argv[0]
-        assert "-X" in argv and "POST" in argv
-        assert "http://example/metrics" in argv
 
-        payload = kwargs.get("input", b"").decode("utf-8")
-        assert "# HELP" in payload
-        assert "# TYPE" in payload
-
-
-def test_push_registry_text_posts_failure(capsys):
-    with (
-        mock.patch.object(
-            pm, "PROMETHEUS_PUSH_URL", "http://example/metrics", create=True
-        ),
-        mock.patch.object(pm.subprocess, "run") as mock_run,
-    ):
-        mock_run.return_value = types.SimpleNamespace(returncode=7)
-
-        pm.push_registry_text()
-        out = capsys.readouterr().out
-        assert "curl returned nonzero exit 7" in out
+def test_push_registry_text_posts_success(capsys, monkeypatch):
+    monkeypatch.setattr(
+        pm, "PUSHGATEWAY_ADDR", "pushgateway.example:9091", raising=False
+    )
+    with mock.patch.object(pm, "pushadd_to_gateway", autospec=True) as m:
+        pm.push_registry_text(grouping_key={"instance": "test", "job_name": "job-1"})
+        m.assert_called_once()
+    out = capsys.readouterr().out
+    assert "PROM: metrics pushed to pushgateway=pushgateway.example:9091" in out
 
 
-def test_push_registry_text_handles_exception(capsys):
-    with (
-        mock.patch.object(
-            pm, "PROMETHEUS_PUSH_URL", "http://example/metrics", create=True
-        ),
-        mock.patch.object(pm.subprocess, "run", side_effect=RuntimeError("boom")),
+def test_push_registry_text_posts_failure(capsys, monkeypatch):
+    monkeypatch.setattr(
+        pm, "PUSHGATEWAY_ADDR", "pushgateway.example:9091", raising=False
+    )
+    with mock.patch.object(
+        pm, "pushadd_to_gateway", side_effect=Exception("boom"), autospec=True
     ):
-        pm.push_registry_text()
+        pm.push_registry_text(grouping_key={"instance": "test", "job_name": "job-2"})
     out = capsys.readouterr().out
-    assert "failed to push metrics via curl" in out
+    # Example: "PROM: failed to push metrics to pushgateway pushgateway.example:9091: boom"
+    assert re.search(r"PROM: failed to push metrics to pushgateway .*: boom", out)