Skip to content

Commit 5eca856

Browse files
committed
Using Pushgateway, but metrics are not aggregated:
each push is only a single snapshot of the job output
1 parent ff00705 commit 5eca856

File tree

4 files changed

+94
-45
lines changed

4 files changed

+94
-45
lines changed

batchtools/br.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ def run(args: argparse.Namespace):
198198
or total_wall is not None
199199
):
200200
labels = {
201-
"job": job_name,
201+
"job_name": job_name,
202202
"gpu": args.gpu,
203203
"queue": queue_name,
204204
"instance": PROMETHEUS_INSTANCE,

batchtools/prom_metrics.py

Lines changed: 27 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,14 @@
1313
Gauge,
1414
generate_latest,
1515
CONTENT_TYPE_LATEST,
16+
pushadd_to_gateway, # <-- add this
1617
)
1718

1819
LONG_JOB_BUCKETS = (1, 2, 5, 10, 20, 30, 60, 120, 180, 300, 600, 900, float("inf"))
1920

20-
PROMETHEUS_PUSH_URL = "http://localhost:8080/metrics"
21-
21+
PUSHGATEWAY_ADDR = os.getenv(
22+
"PUSHGATEWAY_ADDR", "pushgateway.ope-test.svc:9091"
23+
)
2224

2325
def detect_instance() -> str:
2426
if shutil.which("oc") is None:
@@ -33,69 +35,65 @@ def detect_instance() -> str:
3335

3436
registry = CollectorRegistry()
3537

36-
# pod execution duration (gpu runtime)
3738
BATCH_DURATION = Histogram(
3839
"batch_duration_seconds",
3940
"Runtime of batch job (seconds)",
40-
["job", "gpu", "queue", "result", "instance"],
41+
["job_name", "gpu", "queue", "result", "instance"],
4142
registry=registry,
4243
buckets=LONG_JOB_BUCKETS,
4344
)
45+
4446
BATCH_DURATION_TOTAL = Counter(
4547
"batch_duration_total_seconds",
4648
"Total runtime accumulated across all jobs (sum of durations)",
47-
["job", "gpu", "queue", "result", "instance"],
49+
["job_name", "gpu", "queue", "result", "instance"],
4850
registry=registry,
4951
)
5052

51-
# counter for runs based on what its completion status is: succeeded, failed, timed_out
5253
BATCH_RUNS = Counter(
5354
"batch_runs_total",
5455
"Number of batch runs by result",
55-
["job", "gpu", "queue", "result", "instance"],
56+
["job_name", "gpu", "queue", "result", "instance"],
5657
registry=registry,
5758
)
5859

59-
# gauge for number of jobs currently running
6060
IN_PROGRESS = Gauge(
6161
"batch_in_progress",
6262
"Currently running batch jobs",
63-
["job", "gpu", "queue", "result", "instance"],
63+
["job_name", "gpu", "queue", "result", "instance"],
6464
registry=registry,
6565
)
6666

67-
# queue wait (submission -> running)
6867
QUEUE_WAIT = Histogram(
6968
"batch_queue_wait_seconds",
70-
"Time from job submission until pod enters Running (includes Kueue admission, scheduling, image pull)",
71-
["job", "gpu", "queue", "result", "instance"],
69+
"Time from job submission until pod enters Running ...",
70+
["job_name", "gpu", "queue", "result", "instance"],
7271
registry=registry,
7372
buckets=LONG_JOB_BUCKETS,
7473
)
74+
7575
QUEUE_WAIT_COUNT = Counter(
7676
"batch_queue_wait_total_seconds",
7777
"Total accumulated queue wait time across jobs (sum of durations)",
78-
["job", "gpu", "queue", "result", "instance"],
78+
["job_name", "gpu", "queue", "result", "instance"],
7979
registry=registry,
8080
)
8181

82-
# end-to-end wall time (submission -> final phase)
8382
TOTAL_WALL = Histogram(
8483
"batch_total_wall_seconds",
85-
"End-to-end time from job submission until terminal phase (Succeeded/Failed/timeout)",
86-
["job", "gpu", "queue", "result", "instance"],
84+
"End-to-end time from job submission until terminal phase ...",
85+
["job_name", "gpu", "queue", "result", "instance"],
8786
registry=registry,
8887
buckets=LONG_JOB_BUCKETS,
8988
)
9089

9190
TOTAL_WALL_COUNT = Counter(
9291
"batch_total_wall_total_seconds",
9392
"Total accumulated wall time across jobs (sum of durations)",
94-
["job", "gpu", "queue", "result", "instance"],
93+
["job_name", "gpu", "queue", "result", "instance"],
9594
registry=registry,
9695
)
9796

98-
9997
def now_rfc3339() -> str:
10098
return datetime.now(timezone.utc).isoformat()
10199

@@ -143,39 +141,24 @@ def generate_metrics_text() -> tuple[str, str]:
143141
payload = generate_latest(registry)
144142
return payload.decode("utf-8"), CONTENT_TYPE_LATEST
145143

146-
147144
def push_registry_text() -> None:
148145
"""
149-
Push the current registry by posting its text exposition to PROMETHEUS_PUSH_URL
146+
Push the current registry to a Prometheus Pushgateway.
147+
148+
Uses PUSHGATEWAY_ADDR (host:port) and the logical job name "batchtools".
150149
"""
151-
if not PROMETHEUS_PUSH_URL:
150+
if not PUSHGATEWAY_ADDR:
152151
body, _ = generate_metrics_text()
153-
print("PROM: PROMETHEUS_PUSH_URL not set; below is the metrics payload:\n")
152+
print("PROM: PUSHGATEWAY_ADDR not set; below is the metrics payload:\n")
154153
print(body)
155154
return
156155

157-
body, content_type = generate_metrics_text()
158156
try:
159-
proc = subprocess.run(
160-
[
161-
"curl",
162-
"-sS",
163-
"-X",
164-
"POST",
165-
PROMETHEUS_PUSH_URL,
166-
"--data-binary",
167-
"@-",
168-
"-H",
169-
f"Content-Type: {content_type}",
170-
],
171-
input=body.encode("utf-8"),
172-
check=False,
157+
pushadd_to_gateway(
158+
PUSHGATEWAY_ADDR,
159+
job="batchtools",
160+
registry=registry,
173161
)
174-
if proc.returncode != 0:
175-
print(
176-
f"PROM: curl returned nonzero exit {proc.returncode}; metrics not confirmed."
177-
)
178-
else:
179-
print("PROM: metrics successfully pushed.")
162+
print(f"PROM: metrics pushed to pushgateway={PUSHGATEWAY_ADDR}")
180163
except Exception as e:
181-
print(f"PROM: failed to push metrics via curl: {e}")
164+
print(f"PROM: failed to push metrics to pushgateway {PUSHGATEWAY_ADDR}: {e}")

prom.yaml

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
apiVersion: apps/v1
2+
kind: Deployment
3+
metadata:
4+
name: pushgateway
5+
namespace: ope-test
6+
labels:
7+
app: pushgateway
8+
spec:
9+
replicas: 1
10+
selector:
11+
matchLabels:
12+
app: pushgateway
13+
template:
14+
metadata:
15+
labels:
16+
app: pushgateway
17+
spec:
18+
containers:
19+
- name: pushgateway
20+
image: prom/pushgateway:latest
21+
args:
22+
- "--web.listen-address=:9091"
23+
ports:
24+
- containerPort: 9091
25+
name: http
26+
readinessProbe:
27+
httpGet:
28+
path: /-/ready
29+
port: 9091
30+
initialDelaySeconds: 5
31+
livenessProbe:
32+
httpGet:
33+
path: /-/healthy
34+
port: 9091
35+
initialDelaySeconds: 10
36+
---
37+
apiVersion: v1
38+
kind: Service
39+
metadata:
40+
name: pushgateway
41+
namespace: ope-test
42+
labels:
43+
app: pushgateway
44+
spec:
45+
selector:
46+
app: pushgateway
47+
ports:
48+
- name: http
49+
port: 9091
50+
targetPort: 9091
51+
type: ClusterIP

service.yaml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# pushgateway-servicemonitor.yaml
2+
apiVersion: monitoring.coreos.com/v1
3+
kind: ServiceMonitor
4+
metadata:
5+
name: pushgateway
6+
namespace: ope-test
7+
labels:
8+
app: pushgateway
9+
spec:
10+
selector:
11+
matchLabels:
12+
app: pushgateway
13+
endpoints:
14+
- port: http # <- must match the Service port name
15+
interval: 15s

0 commit comments

Comments
 (0)