Skip to content

Commit b4ff138

Browse files
committed
prometheus metrics
1 parent a368798 commit b4ff138

File tree

2 files changed

+132
-15
lines changed

2 files changed

+132
-15
lines changed

batchtools/br.py

Lines changed: 52 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# pyright: reportUninitializedInstanceVariable=false
22
from typing import cast
3-
from typing_extensions import override
3+
from typing_extensions import override, Optional
44

55
import argparse
66
import os
@@ -17,6 +17,8 @@
1717
from .helpers import pretty_print
1818
from .helpers import oc_delete
1919
from .file_setup import prepare_context
20+
from .prom_metrics import render_histogram_samples, now_rfc3339, push_metrics_text
21+
from .prom_metrics import PROMETHEUS_INSTANCE
2022

2123

2224
class CreateJobCommandArgs(argparse.Namespace):
@@ -218,7 +220,27 @@ def run(args: argparse.Namespace):
218220
oc.create(job_body)
219221
print(f"Job: {job_name} created successfully. Now checking pod...")
220222
if args.wait:
221-
log_job_output(job_name=job_name, wait=True, timeout=args.timeout)
223+
# log_job_output(job_name=job_name, wait=True, timeout=args.timeout)
224+
result_phase, run_elapsed = log_job_output(
225+
job_name=job_name, wait=True, timeout=args.timeout
226+
)
227+
228+
if run_elapsed is not None:
229+
labels = {
230+
"job": job_name,
231+
"gpu": args.gpu,
232+
"queue": queue_name,
233+
"result": result_phase.lower(), # succeeded|failed (or unknown)
234+
"instance": PROMETHEUS_INSTANCE,
235+
}
236+
metrics = render_histogram_samples(
237+
metric_base="batch_duration_seconds",
238+
elapsed=run_elapsed,
239+
labels=labels,
240+
)
241+
# Include a helpful comment block and a timestamp line the scraper can ignore
242+
preface = [f"# Batch run observed at {now_rfc3339()}"]
243+
push_metrics_text("\n".join(preface) + metrics)
222244

223245
except oc.OpenShiftPythonException as e:
224246
sys.exit(f"Error occurred while creating job: {e}")
@@ -243,36 +265,51 @@ def get_pod_status(pod_name: str | None = None) -> str:
243265
return pod.model.status.phase or "Unknown"
244266

245267

246-
def log_job_output(job_name: str, *, wait: bool, timeout: int | None) -> None:
268+
def log_job_output(job_name: str, *, wait: bool, timeout: int | None) -> tuple[str, Optional[float]]:
    """
    Wait until the job's pod completes (Succeeded/Failed), then print its logs once.

    Returns a ``(result, run_seconds)`` pair: ``result`` is a lower-cased phase
    ("succeeded", "failed", "timeout", or "unknown") and ``run_seconds`` is the
    observed wall-clock time the pod spent Running (``None`` when it could not
    be measured, e.g. on timeout or when the pod was never seen Running).
    """
    matching = oc.selector("pod", labels={"job-name": job_name}).objects()
    if not matching:
        print(f"No pods found for job {job_name}")
        return ("unknown", None)

    pod = matching[0]
    pod_name = pod.model.metadata.name

    running_since = None  # monotonic timestamp of the first poll that saw Running
    final_phase = "unknown"

    if wait:
        poll_start = time.monotonic()
        while True:
            phase = get_pod_status(pod_name)

            # Remember when the pod was first observed Running. Granularity is
            # the 2-second poll interval, so the duration is an approximation.
            if running_since is None and phase == "Running":
                running_since = time.monotonic()

            if phase == "Succeeded" or phase == "Failed":
                final_phase = phase
                print(f"Pod, {pod_name} finished with phase={phase}")
                break

            if timeout:
                if time.monotonic() - poll_start > timeout:
                    print(f"Timeout waiting for pod {pod_name} to complete")
                    print(f"Deleting job {job_name}")
                    oc_delete("job", job_name)
                    # No run duration is available for a timed-out job.
                    return ("timeout", None)

            # Sleep between polls to avoid hammering the API server.
            time.sleep(2)

    print(pretty_print(pod))

    # Derive the Running-phase duration, if the pod was ever seen Running.
    if running_since is not None:
        run_elapsed = time.monotonic() - running_since
    elif final_phase.lower() == "failed":
        # Failed before ever Running (e.g. stuck in Pending): report zero.
        run_elapsed = 0.0
    else:
        run_elapsed = None

    return (final_phase.lower(), run_elapsed)

batchtools/prom_metrics.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
import os, socket, subprocess, datetime
2+
from typing import Optional
3+
from datetime import timezone
4+
from datetime import datetime
5+
6+
7+
PROMETHEUS_PUSH_URL = 'http://localhost:8080/metrics'
8+
PROMETHEUS_INSTANCE = os.environ.get("PROMETHEUS_INSTANCE", socket.gethostname())
9+
10+
DEFAULT_BUCKETS = [1, 5, 10, 30, 60, 120, 300, 600, 1200, 1800, 3600] # +Inf is implicit
11+
12+
13+
def now_rfc3339() -> str:
    """Return the current UTC time as an RFC 3339 / ISO-8601 timestamp string."""
    current = datetime.now(timezone.utc)
    return current.isoformat()
15+
16+
def render_histogram_samples(
    metric_base: str,
    elapsed: float,
    labels: dict[str, str],
    buckets: Optional[list[float]] = None,
) -> str:
    """
    Render Prometheus text exposition for a single observation into a histogram.

    Produces cumulative ``_bucket`` lines plus ``_sum`` and ``_count`` in the
    text exposition format (version 0.0.4).

    Args:
        metric_base: Base metric name, e.g. 'batch_duration_seconds'.
        elapsed: The single observed value (seconds).
        labels: Labels attached to every sample line.
        buckets: Upper bounds of the finite histogram buckets; defaults to
            DEFAULT_BUCKETS. The '+Inf' bucket is always appended implicitly.

    Returns:
        The rendered sample lines, newline-joined, ending with a newline.
    """
    if buckets is None:
        # Resolve the module default lazily instead of using a mutable
        # default argument shared across calls.
        buckets = DEFAULT_BUCKETS

    def lbl(extra: Optional[dict[str, str]] = None) -> str:
        # Render a stable, sorted, quoted label set like {a="1",b="2"}.
        merged = {**labels}
        if extra:
            merged.update(extra)
        inner = ",".join(f'{k}="{v}"' for k, v in sorted(merged.items()))
        return f"{{{inner}}}"

    # Header
    lines = [
        f"# HELP {metric_base} Runtime of batch job (seconds)",
        f"# TYPE {metric_base} histogram",
    ]

    # Cumulative bucket counts: a single observation contributes 1 to every
    # bucket whose upper bound is >= elapsed, else 0.
    for bound in buckets:
        hit = 1 if elapsed <= bound else 0
        # BUG FIX: the bucket line must carry the full merged label set via
        # lbl(); previously a raw dict literal was interpolated, which emitted
        # invalid exposition text (Python repr with single quotes) and dropped
        # the job/gpu/queue/result/instance labels from every finite bucket.
        bucket_labels = lbl({"le": str(bound)})
        lines.append(f"{metric_base}_bucket{bucket_labels} {hit}")
    # The +Inf bucket always contains the single observation.
    lines.append(f'{metric_base}_bucket{lbl({"le": "+Inf"})} 1')

    # Sum and count for the one observation.
    # Use full precision float; Prometheus parser handles standard float format
    lines.append(f"{metric_base}_sum{lbl()} {elapsed}")
    lines.append(f"{metric_base}_count{lbl()} 1")

    return "\n".join(lines) + "\n"
57+
58+
59+
def push_metrics_text(text: str) -> None:
    """
    POST text exposition to PROMETHEUS_PUSH_URL if set. Uses curl (as requested).

    When no push URL is configured the payload is printed instead of sent; a
    failed push is reported on stdout but never raised to the caller.
    """
    if not PROMETHEUS_PUSH_URL:
        print("PROM: PROMETHEUS_PUSH_URL not set; skipping push. Below is the metrics payload:\n")
        print(text)
        return

    cmd = [
        "curl", "-sS", "-X", "POST", PROMETHEUS_PUSH_URL,
        "--data-binary", "@-",
        "-H", "Content-Type: text/plain; version=0.0.4",
    ]
    try:
        # The payload is piped through stdin ('--data-binary @-');
        # check=False so a non-zero curl exit does not raise here.
        completed = subprocess.run(cmd, input=text.encode("utf-8"), check=False)
    except Exception as e:
        print(f"PROM: failed to push metrics via curl: {e}")
        return

    if completed.returncode == 0:
        print("PROM: metrics successfully pushed.")
    else:
        print(f"PROM: curl returned nonzero exit {completed.returncode}; metrics not confirmed.")

0 commit comments

Comments
 (0)