1313 Gauge ,
1414 generate_latest ,
1515 CONTENT_TYPE_LATEST ,
16+ pushadd_to_gateway , # <-- add this
1617)
1718
# Histogram bucket upper bounds, in seconds: 1 s up to 15 min, then a
# catch-all +Inf bucket for anything longer.
LONG_JOB_BUCKETS = (1, 2, 5, 10, 20, 30, 60, 120, 180, 300, 600, 900, float("inf"))
1920
# Address (host:port) of the Prometheus Pushgateway that metrics are pushed to.
# Override with the PUSHGATEWAY_ADDR environment variable; setting it to an
# empty string leaves this value falsy.
# NOTE(review): relies on a module-level `import os` that is outside this
# excerpt - confirm it is present.
PUSHGATEWAY_ADDR = os.getenv(
    "PUSHGATEWAY_ADDR", "pushgateway.ope-test.svc:9091"
)
2224
2325def detect_instance () -> str :
2426 if shutil .which ("oc" ) is None :
@@ -33,69 +35,65 @@ def detect_instance() -> str:
3335
# Dedicated registry: every metric below registers here, and this is the
# registry that gets rendered to text and pushed to the Pushgateway.
registry = CollectorRegistry()

# Label names shared by every batch metric so all series carry the same
# dimensions.
# NOTE(review): the label is `job_name` rather than `job`; presumably this
# avoids clashing with the `job` grouping key used when pushing - confirm.
_BATCH_LABELS = ("job_name", "gpu", "queue", "result", "instance")

# Pod execution duration (GPU runtime).
BATCH_DURATION = Histogram(
    "batch_duration_seconds",
    "Runtime of batch job (seconds)",
    _BATCH_LABELS,
    registry=registry,
    buckets=LONG_JOB_BUCKETS,
)

# Running sum of job runtimes, kept as a separate counter.
BATCH_DURATION_TOTAL = Counter(
    "batch_duration_total_seconds",
    "Total runtime accumulated across all jobs (sum of durations)",
    _BATCH_LABELS,
    registry=registry,
)

# Counter for runs by completion status: succeeded, failed, timed_out.
BATCH_RUNS = Counter(
    "batch_runs_total",
    "Number of batch runs by result",
    _BATCH_LABELS,
    registry=registry,
)

# Gauge for the number of jobs currently running.
IN_PROGRESS = Gauge(
    "batch_in_progress",
    "Currently running batch jobs",
    _BATCH_LABELS,
    registry=registry,
)

# Queue wait (submission -> running).
QUEUE_WAIT = Histogram(
    "batch_queue_wait_seconds",
    "Time from job submission until pod enters Running (includes Kueue admission, scheduling, image pull)",
    _BATCH_LABELS,
    registry=registry,
    buckets=LONG_JOB_BUCKETS,
)

# Despite the `_COUNT` suffix, the help text describes this as an accumulated
# duration (a sum of wait times), not a number of observations.
QUEUE_WAIT_COUNT = Counter(
    "batch_queue_wait_total_seconds",
    "Total accumulated queue wait time across jobs (sum of durations)",
    _BATCH_LABELS,
    registry=registry,
)

# End-to-end wall time (submission -> final phase).
TOTAL_WALL = Histogram(
    "batch_total_wall_seconds",
    "End-to-end time from job submission until terminal phase (Succeeded/Failed/timeout)",
    _BATCH_LABELS,
    registry=registry,
    buckets=LONG_JOB_BUCKETS,
)

# Despite the `_COUNT` suffix, the help text describes this as an accumulated
# duration (a sum of wall times), not a number of observations.
TOTAL_WALL_COUNT = Counter(
    "batch_total_wall_total_seconds",
    "Total accumulated wall time across jobs (sum of durations)",
    _BATCH_LABELS,
    registry=registry,
)
9796
98-
def now_rfc3339() -> str:
    """Return the current UTC time as an ISO 8601 / RFC 3339 timestamp string.

    The datetime is timezone-aware, so the result carries an explicit
    ``+00:00`` offset rather than being a naive local time.
    """
    return datetime.now(timezone.utc).isoformat()
10199
@@ -143,39 +141,24 @@ def generate_metrics_text() -> tuple[str, str]:
143141 payload = generate_latest (registry )
144142 return payload .decode ("utf-8" ), CONTENT_TYPE_LATEST
145143
146-
def push_registry_text() -> None:
    """
    Push the current registry to a Prometheus Pushgateway.

    Uses PUSHGATEWAY_ADDR (host:port) and the logical job name "batchtools".

    If PUSHGATEWAY_ADDR is empty, nothing is pushed; the text exposition of
    the registry is printed to stdout instead. Delivery is best-effort: any
    error raised while pushing is reported on stdout and swallowed, so this
    function never raises because of a failed push.
    """
    if not PUSHGATEWAY_ADDR:
        # No gateway configured: dump the payload so it is still visible.
        body, _ = generate_metrics_text()
        print("PROM: PUSHGATEWAY_ADDR not set; below is the metrics payload:\n")
        print(body)
        return

    try:
        pushadd_to_gateway(
            PUSHGATEWAY_ADDR,
            job="batchtools",
            registry=registry,
        )
        print(f"PROM: metrics pushed to pushgateway={PUSHGATEWAY_ADDR}")
    except Exception as e:
        # Deliberately broad: a metrics outage must not fail the batch tooling.
        print(f"PROM: failed to push metrics to pushgateway {PUSHGATEWAY_ADDR}: {e}")
0 commit comments