Skip to content

Commit 72c5541

Browse files
committed
adding back br wait
1 parent 1373862 commit 72c5541

File tree

4 files changed

+67
-124
lines changed

4 files changed

+67
-124
lines changed

batchtools/br.py

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,6 @@
2121
from .prom_metrics import (
2222
PROMETHEUS_INSTANCE,
2323
IN_PROGRESS,
24-
# record helpers (emit both histogram + counter)
2524
record_batch_observation,
2625
record_queue_observation,
2726
record_wall_observation,
@@ -191,7 +190,6 @@ def run(args: argparse.Namespace):
191190
job_name=job_name, wait=True, timeout=args.timeout
192191
)
193192

194-
# Emit metrics if we captured any timing
195193
if (
196194
run_elapsed is not None
197195
or queue_wait is not None
@@ -222,8 +220,8 @@ def run(args: argparse.Namespace):
222220
IN_PROGRESS.labels(**labels, result=result_phase).dec()
223221

224222
group = {
225-
"instance": PROMETHEUS_INSTANCE, # e.g. "ope-test"
226-
"job_name": job_name # e.g. "job-none-b799d46a6f995a9d8d58b6aff76abefc"
223+
"instance": PROMETHEUS_INSTANCE, # oc project
224+
"job_name": job_name, # unique name
227225
}
228226

229227
push_registry_text(grouping_key=group)

batchtools/prom_metrics.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,3 @@
1-
import subprocess
21
from datetime import datetime, timezone
32
from .helpers import is_on_project
43

@@ -18,9 +17,8 @@
1817

1918
LONG_JOB_BUCKETS = (1, 2, 5, 10, 20, 30, 60, 120, 180, 300, 600, 900, float("inf"))
2019

21-
PUSHGATEWAY_ADDR = os.getenv(
22-
"PUSHGATEWAY_ADDR", "pushgateway.ope-test.svc:9091"
23-
)
20+
PUSHGATEWAY_ADDR = os.getenv("PUSHGATEWAY_ADDR", "pushgateway.ope-test.svc:9091")
21+
2422

2523
def detect_instance() -> str:
2624
if shutil.which("oc") is None:
@@ -94,6 +92,7 @@ def detect_instance() -> str:
9492
registry=registry,
9593
)
9694

95+
9796
def now_rfc3339() -> str:
9897
return datetime.now(timezone.utc).isoformat()
9998

@@ -141,6 +140,7 @@ def generate_metrics_text() -> tuple[str, str]:
141140
payload = generate_latest(registry)
142141
return payload.decode("utf-8"), CONTENT_TYPE_LATEST
143142

143+
144144
def push_registry_text(grouping_key: dict[str, str] | None = None) -> None:
145145
if not PUSHGATEWAY_ADDR:
146146
body, _ = generate_metrics_text()

pushgateway/prom.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -48,4 +48,4 @@ spec:
4848
- name: http
4949
port: 9091
5050
targetPort: 9091
51-
type: ClusterIP
51+
type: ClusterIP

tests/test_prom_metrics.py

Lines changed: 60 additions & 115 deletions
Original file line number | Diff line number | Diff line change
@@ -1,138 +1,83 @@
1-
import batchtools.prom_metrics as pm
2-
import types
3-
from datetime import datetime
1+
import re
42
from unittest import mock
3+
import batchtools.prom_metrics as pm
54

65

7-
def test_now_rfc3339_parses_with_timezone():
8-
s = pm.now_rfc3339()
9-
# ISO 8601 parseable and includes timezone info (+00:00)
10-
dt = datetime.fromisoformat(s)
11-
assert dt.tzinfo is not None
12-
assert s.endswith("+00:00")
13-
14-
15-
def _labels(job="job-x", gpu="none", queue="dummy-localqueue", instance="ns"):
16-
return {"job": job, "gpu": gpu, "queue": queue, "instance": instance}
17-
18-
19-
def _registry_text() -> str:
20-
body, ctype = pm.generate_metrics_text()
21-
assert isinstance(body, str)
22-
assert ctype.startswith("text/plain")
23-
return body
6+
def _base_labels():
7+
return {
8+
"job_name": "job-none-xyz",
9+
"gpu": "none",
10+
"queue": "dummy-localqueue",
11+
"instance": pm.PROMETHEUS_INSTANCE or "test",
12+
}
2413

2514

2615
def test_record_batch_observation_updates_hist_and_counters():
27-
labels = _labels()
28-
pm.record_batch_observation(labels=labels, elapsed_sec=7.5, result="succeeded")
29-
text = _registry_text()
30-
assert "batch_duration_seconds_bucket" in text
31-
assert "batch_duration_total_seconds" in text # counter that sums durations
32-
assert "batch_runs_total" in text
33-
# spot-check that our label set appears on at least one line
34-
for k, v in {**labels, "result": "succeeded"}.items():
35-
assert f'{k}="{v}"' in text
16+
labels = _base_labels()
17+
pm.record_batch_observation(labels=labels, elapsed_sec=1.23, result="succeeded")
18+
body, _ = pm.generate_metrics_text()
19+
# Verify histogram/counter series were created with the expected labels
20+
assert "batch_duration_seconds_bucket" in body
21+
assert "batch_duration_total_seconds" in body
22+
assert "batch_runs_total" in body
23+
# spot-check that our labelset is present
24+
assert 'job_name="job-none-xyz"' in body
25+
assert 'queue="dummy-localqueue"' in body
26+
assert 'result="succeeded"' in body
3627

3728

3829
def test_record_queue_and_wall_observation_update_metrics():
39-
labels = _labels(job="job-y")
40-
pm.record_queue_observation(labels=labels, elapsed_sec=3.0, result="succeeded")
41-
pm.record_wall_observation(labels=labels, elapsed_sec=10.0, result="succeeded")
42-
text = _registry_text()
43-
assert "batch_queue_wait_seconds_bucket" in text
44-
assert "batch_queue_wait_total_seconds" in text
45-
assert "batch_total_wall_seconds_bucket" in text
46-
assert "batch_total_wall_total_seconds" in text
47-
48-
49-
def _metric_samples(text: str, name: str):
50-
"""Yield lines for a metric family (not HELP/TYPE)."""
51-
for line in text.splitlines():
52-
if line.startswith(name) and not line.startswith("#"):
53-
yield line
54-
55-
56-
def _any_sample_with_value(text: str, name: str, value: float) -> bool:
57-
target = f" {value:.1f}"
58-
return any(line.endswith(target) for line in _metric_samples(text, name))
30+
labels = _base_labels()
31+
pm.record_queue_observation(labels=labels, elapsed_sec=2.5, result="succeeded")
32+
pm.record_wall_observation(labels=labels, elapsed_sec=3.5, result="succeeded")
33+
body, _ = pm.generate_metrics_text()
34+
assert "batch_queue_wait_seconds_bucket" in body
35+
assert "batch_queue_wait_total_seconds" in body
36+
assert "batch_total_wall_seconds_bucket" in body
37+
assert "batch_total_wall_total_seconds" in body
5938

6039

6140
def test_set_in_progress_inc_and_dec_affect_gauge():
62-
labels = _labels(job="job-z")
41+
labels = _base_labels()
42+
# inc
6343
pm.set_in_progress(labels=labels, result="running", inc=True)
64-
text1 = _registry_text()
65-
assert "batch_in_progress" in text1
66-
assert _any_sample_with_value(text1, "batch_in_progress", 1.0)
67-
44+
body_inc, _ = pm.generate_metrics_text()
45+
assert "batch_in_progress" in body_inc
46+
assert 'result="running"' in body_inc
47+
# dec (should not throw; series may remain visible in exposition)
6848
pm.set_in_progress(labels=labels, result="running", inc=False)
69-
text2 = _registry_text()
70-
assert _any_sample_with_value(text2, "batch_in_progress", 0.0)
49+
body_dec, _ = pm.generate_metrics_text()
50+
assert "batch_in_progress" in body_dec
7151

7252

73-
def test_generate_metrics_text_returns_valid_payload_and_ctype():
74-
body, ctype = pm.generate_metrics_text()
75-
assert "# HELP" in body
76-
assert "# TYPE" in body
77-
assert ctype.startswith("text/plain")
78-
79-
80-
def test_push_registry_text_no_url_prints_payload(capsys):
81-
# Temporarily clear the push URL so it prints instead of POSTing
82-
with mock.patch.object(pm, "PROMETHEUS_PUSH_URL", "", create=True):
83-
pm.push_registry_text()
53+
def test_push_registry_text_no_url_prints_payload(capsys, monkeypatch):
54+
# Simulate "no address configured" by blanking the module variable.
55+
monkeypatch.setattr(pm, "PUSHGATEWAY_ADDR", "", raising=False)
56+
pm.push_registry_text()
8457
out = capsys.readouterr().out
85-
assert "PROMETHEUS_PUSH_URL not set" in out
86-
assert "# HELP" in out # the metrics text is printed
87-
88-
89-
def test_push_registry_text_posts_success(capsys):
90-
with (
91-
mock.patch.object(
92-
pm, "PROMETHEUS_PUSH_URL", "http://example/metrics", create=True
93-
),
94-
mock.patch.object(pm.subprocess, "run") as mock_run,
95-
):
96-
mock_run.return_value = types.SimpleNamespace(returncode=0)
97-
98-
pm.push_registry_text()
99-
out = capsys.readouterr().out
100-
assert "metrics successfully pushed" in out
58+
assert "PUSHGATEWAY_ADDR not set; below is the metrics payload:" in out
59+
assert "# HELP" in out and "# TYPE" in out # payload printed
10160

102-
# Verify we invoked curl with POST and sent our payload
103-
assert mock_run.called
104-
args, kwargs = mock_run.call_args
105-
argv = args[0]
106-
assert "curl" in argv[0]
107-
assert "-X" in argv and "POST" in argv
108-
assert "http://example/metrics" in argv
10961

110-
payload = kwargs.get("input", b"").decode("utf-8")
111-
assert "# HELP" in payload
112-
assert "# TYPE" in payload
113-
114-
115-
def test_push_registry_text_posts_failure(capsys):
116-
with (
117-
mock.patch.object(
118-
pm, "PROMETHEUS_PUSH_URL", "http://example/metrics", create=True
119-
),
120-
mock.patch.object(pm.subprocess, "run") as mock_run,
121-
):
122-
mock_run.return_value = types.SimpleNamespace(returncode=7)
123-
124-
pm.push_registry_text()
125-
out = capsys.readouterr().out
126-
assert "curl returned nonzero exit 7" in out
62+
def test_push_registry_text_posts_success(capsys, monkeypatch):
63+
monkeypatch.setattr(
64+
pm, "PUSHGATEWAY_ADDR", "pushgateway.example:9091", raising=False
65+
)
66+
with mock.patch.object(pm, "pushadd_to_gateway", autospec=True) as m:
67+
pm.push_registry_text(grouping_key={"instance": "test", "job_name": "job-1"})
68+
m.assert_called_once()
69+
out = capsys.readouterr().out
70+
assert "PROM: metrics pushed to pushgateway=pushgateway.example:9091" in out
12771

12872

129-
def test_push_registry_text_handles_exception(capsys):
130-
with (
131-
mock.patch.object(
132-
pm, "PROMETHEUS_PUSH_URL", "http://example/metrics", create=True
133-
),
134-
mock.patch.object(pm.subprocess, "run", side_effect=RuntimeError("boom")),
73+
def test_push_registry_text_posts_failure(capsys, monkeypatch):
74+
monkeypatch.setattr(
75+
pm, "PUSHGATEWAY_ADDR", "pushgateway.example:9091", raising=False
76+
)
77+
with mock.patch.object(
78+
pm, "pushadd_to_gateway", side_effect=Exception("boom"), autospec=True
13579
):
136-
pm.push_registry_text()
80+
pm.push_registry_text(grouping_key={"instance": "test", "job_name": "job-2"})
13781
out = capsys.readouterr().out
138-
assert "failed to push metrics via curl" in out
82+
# Example: "PROM: failed to push metrics to pushgateway pushgateway.example:9091: boom"
83+
assert re.search(r"PROM: failed to push metrics to pushgateway .*: boom", out)

0 commit comments

Comments
 (0)