using pushgateway, but metrics are not aggregate,

memalhot · memalhot · commit 5eca8560c2da · 2025-12-03T10:58:58.000-05:00
single snapshot of jobs output
diff --git a/batchtools/br.py b/batchtools/br.py
@@ -198,7 +198,7 @@ def run(args: argparse.Namespace):
                 or total_wall is not None
             ):
                 labels = {
-                    "job": job_name,
+                    "job_name": job_name,
                     "gpu": args.gpu,
                     "queue": queue_name,
                     "instance": PROMETHEUS_INSTANCE,
diff --git a/batchtools/prom_metrics.py b/batchtools/prom_metrics.py
@@ -13,12 +13,14 @@
     Gauge,
     generate_latest,
     CONTENT_TYPE_LATEST,
+    pushadd_to_gateway,   # <-- add this
 )
 
 LONG_JOB_BUCKETS = (1, 2, 5, 10, 20, 30, 60, 120, 180, 300, 600, 900, float("inf"))
 
-PROMETHEUS_PUSH_URL = "http://localhost:8080/metrics"
-
+PUSHGATEWAY_ADDR = os.getenv(
+    "PUSHGATEWAY_ADDR", "pushgateway.ope-test.svc:9091"
+)
 
 def detect_instance() -> str:
     if shutil.which("oc") is None:
@@ -33,69 +35,65 @@ def detect_instance() -> str:
 
 registry = CollectorRegistry()
 
-# pod execution duration (gpu runtime)
 BATCH_DURATION = Histogram(
     "batch_duration_seconds",
     "Runtime of batch job (seconds)",
-    ["job", "gpu", "queue", "result", "instance"],
+    ["job_name", "gpu", "queue", "result", "instance"],
     registry=registry,
     buckets=LONG_JOB_BUCKETS,
 )
+
 BATCH_DURATION_TOTAL = Counter(
     "batch_duration_total_seconds",
     "Total runtime accumulated across all jobs (sum of durations)",
-    ["job", "gpu", "queue", "result", "instance"],
+    ["job_name", "gpu", "queue", "result", "instance"],
     registry=registry,
 )
 
-# counter for runs based on what its completion status is: succeeded, failed, timed_out
 BATCH_RUNS = Counter(
     "batch_runs_total",
     "Number of batch runs by result",
-    ["job", "gpu", "queue", "result", "instance"],
+    ["job_name", "gpu", "queue", "result", "instance"],
     registry=registry,
 )
 
-# gauge for number of jobs currently running
 IN_PROGRESS = Gauge(
     "batch_in_progress",
     "Currently running batch jobs",
-    ["job", "gpu", "queue", "result", "instance"],
+    ["job_name", "gpu", "queue", "result", "instance"],
     registry=registry,
 )
 
-# queue wait (submission -> running)
 QUEUE_WAIT = Histogram(
     "batch_queue_wait_seconds",
-    "Time from job submission until pod enters Running (includes Kueue admission, scheduling, image pull)",
-    ["job", "gpu", "queue", "result", "instance"],
+    "Time from job submission until pod enters Running ...",
+    ["job_name", "gpu", "queue", "result", "instance"],
     registry=registry,
     buckets=LONG_JOB_BUCKETS,
 )
+
 QUEUE_WAIT_COUNT = Counter(
     "batch_queue_wait_total_seconds",
     "Total accumulated queue wait time across jobs (sum of durations)",
-    ["job", "gpu", "queue", "result", "instance"],
+    ["job_name", "gpu", "queue", "result", "instance"],
     registry=registry,
 )
 
-# end-to-end wall time (submission -> final phase)
 TOTAL_WALL = Histogram(
     "batch_total_wall_seconds",
-    "End-to-end time from job submission until terminal phase (Succeeded/Failed/timeout)",
-    ["job", "gpu", "queue", "result", "instance"],
+    "End-to-end time from job submission until terminal phase ...",
+    ["job_name", "gpu", "queue", "result", "instance"],
     registry=registry,
     buckets=LONG_JOB_BUCKETS,
 )
 
 TOTAL_WALL_COUNT = Counter(
     "batch_total_wall_total_seconds",
     "Total accumulated wall time across jobs (sum of durations)",
-    ["job", "gpu", "queue", "result", "instance"],
+    ["job_name", "gpu", "queue", "result", "instance"],
     registry=registry,
 )
 
-
 def now_rfc3339() -> str:
     return datetime.now(timezone.utc).isoformat()
 
@@ -143,39 +141,24 @@ def generate_metrics_text() -> tuple[str, str]:
     payload = generate_latest(registry)
     return payload.decode("utf-8"), CONTENT_TYPE_LATEST
 
-
 def push_registry_text() -> None:
     """
-    Push the current registry by posting its text exposition to PROMETHEUS_PUSH_URL
+    Push the current registry to a Prometheus Pushgateway.
+
+    Uses PUSHGATEWAY_ADDR (host:port) and the logical job name "batchtools".
     """
-    if not PROMETHEUS_PUSH_URL:
+    if not PUSHGATEWAY_ADDR:
         body, _ = generate_metrics_text()
-        print("PROM: PROMETHEUS_PUSH_URL not set; below is the metrics payload:\n")
+        print("PROM: PUSHGATEWAY_ADDR not set; below is the metrics payload:\n")
         print(body)
         return
 
-    body, content_type = generate_metrics_text()
     try:
-        proc = subprocess.run(
-            [
-                "curl",
-                "-sS",
-                "-X",
-                "POST",
-                PROMETHEUS_PUSH_URL,
-                "--data-binary",
-                "@-",
-                "-H",
-                f"Content-Type: {content_type}",
-            ],
-            input=body.encode("utf-8"),
-            check=False,
+        pushadd_to_gateway(
+            PUSHGATEWAY_ADDR,
+            job="batchtools",
+            registry=registry,
         )
-        if proc.returncode != 0:
-            print(
-                f"PROM: curl returned nonzero exit {proc.returncode}; metrics not confirmed."
-            )
-        else:
-            print("PROM: metrics successfully pushed.")
+        print(f"PROM: metrics pushed to pushgateway={PUSHGATEWAY_ADDR}")
     except Exception as e:
-        print(f"PROM: failed to push metrics via curl: {e}")
+        print(f"PROM: failed to push metrics to pushgateway {PUSHGATEWAY_ADDR}: {e}")
diff --git a/prom.yaml b/prom.yaml
@@ -0,0 +1,51 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: pushgateway
+  namespace: ope-test
+  labels:
+    app: pushgateway
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: pushgateway
+  template:
+    metadata:
+      labels:
+        app: pushgateway
+    spec:
+      containers:
+        - name: pushgateway
+          image: prom/pushgateway:latest
+          args:
+            - "--web.listen-address=:9091"
+          ports:
+            - containerPort: 9091
+              name: http
+          readinessProbe:
+            httpGet:
+              path: /-/ready
+              port: 9091
+            initialDelaySeconds: 5
+          livenessProbe:
+            httpGet:
+              path: /-/healthy
+              port: 9091
+            initialDelaySeconds: 10
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: pushgateway
+  namespace: ope-test
+  labels:
+    app: pushgateway
+spec:
+  selector:
+    app: pushgateway
+  ports:
+    - name: http
+      port: 9091
+      targetPort: 9091
+  type: ClusterIP
diff --git a/service.yaml b/service.yaml
@@ -0,0 +1,15 @@
+# pushgateway-servicemonitor.yaml
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: pushgateway
+  namespace: ope-test
+  labels:
+    app: pushgateway
+spec:
+  selector:
+    matchLabels:
+      app: pushgateway
+  endpoints:
+    - port: http        # <- must match the Service port name
+      interval: 15s