
Commit b6e6355

Add Plugin API v3.7.0 metrics support (#9)
Implement periodic plugin metrics reporting for the Launcher Plugin API v3.7.0. Plugins can now report uptime and cluster interaction latency histograms, which the Launcher exposes on its Prometheus /metrics endpoint.

- Add MetricsPlugin optional interface and PluginMetrics type
- Add thread-safe Histogram with swap-on-drain pattern for lock-free reads
- Add metricsLoop with non-blocking sends, panic recovery, and context cancellation
- Add MetricsResponse (message type 203) to the wire protocol
- Add RunMetrics conformance scenario for plugin validation
- Add --plugin-metrics-interval-seconds flag to DefaultOptions
- Bump API version from 3.6.0 to 3.7.0
- Update docs (GUIDE, API, ARCHITECTURE, CHANGELOG) and examples
1 parent 36469a3 commit b6e6355

File tree

16 files changed: +1066, -449 lines


CHANGELOG.md

Lines changed: 11 additions & 0 deletions
@@ -8,6 +8,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 ### Added
+- **Plugin API 3.7.0**: Plugin metrics framework. Plugins can now report periodic metrics to the Launcher for Prometheus exposition.
+  - `MetricsPlugin` optional interface in `launcher` package — plugins implement `Metrics(ctx context.Context) PluginMetrics` to report custom metrics
+  - `PluginMetrics` struct with `ClusterInteractionLatency` field for scheduler command latency histograms
+  - `Histogram` type for thread-safe metric accumulation with `Observe()` and `Drain()` methods
+  - `ClusterInteractionLatencyBuckets` variable with standard bucket boundaries matching the Launcher
+  - `MetricsInterval` field on `Runtime` and `DefaultOptions` for configuring the collection interval
+  - `--plugin-metrics-interval-seconds` CLI flag (default: 60, 0 to disable)
+  - Framework automatically reports `uptimeSeconds`; custom metrics are opt-in via `MetricsPlugin`
+  - `RunMetrics` conformance test scenario for validating `MetricsPlugin` implementations
+  - Protocol support for metrics response (message type 203) in `internal/protocol`
+  - New dependency: `github.com/prometheus/client_golang` for histogram accumulation
 - **Plugin API 3.6.0**: Config reload support. The Launcher can now request plugins to reload configuration at runtime without restarting.
   - `ConfigReloadablePlugin` optional interface in `launcher` package — plugins implement `ReloadConfig(ctx context.Context) error` to handle reload requests
   - `ConfigReloadError` type for classified reload failures (Load, Validate, Save)

api/types.go

Lines changed: 1 addition & 1 deletion
@@ -835,7 +835,7 @@ type Version struct {
 
 // APIVersion is the Launcher plugin API version supported by the types defined
 // in this package.
-var APIVersion = Version{Major: 3, Minor: 6, Patch: 0}
+var APIVersion = Version{Major: 3, Minor: 7, Patch: 0}
 
 // ConfigReloadErrorType classifies config reload errors.
 type ConfigReloadErrorType int

conformance/conformance_test.go

Lines changed: 21 additions & 4 deletions
@@ -19,9 +19,10 @@ import (
 // testPlugin is a minimal in-memory plugin for testing the conformance suite
 // itself. It uses the cache package to handle storage and streaming.
 type testPlugin struct {
-	cache  *cache.JobCache
-	nextID int32
-	wg     sync.WaitGroup
+	cache   *cache.JobCache
+	nextID  int32
+	wg      sync.WaitGroup
+	latency *launcher.Histogram
 }
 
 func newTestPlugin(t *testing.T) *testPlugin {
@@ -31,7 +32,10 @@ func newTestPlugin(t *testing.T) *testPlugin {
 	if err != nil {
 		t.Fatalf("failed to create job cache: %v", err)
 	}
-	tp := &testPlugin{cache: c}
+	tp := &testPlugin{
+		cache:   c,
+		latency: launcher.NewHistogram(launcher.ClusterInteractionLatencyBuckets),
+	}
 	t.Cleanup(func() {
 		tp.wg.Wait()
 		_ = c.Close()
@@ -183,6 +187,12 @@ func (p *testPlugin) GetJobNetwork(_ context.Context, w launcher.ResponseWriter,
 	}
 }
 
+func (p *testPlugin) Metrics(_ context.Context) launcher.PluginMetrics {
+	return launcher.PluginMetrics{
+		ClusterInteractionLatency: p.latency.Drain(),
+	}
+}
+
 func (p *testPlugin) ClusterInfo(_ context.Context, w launcher.ResponseWriter, _ string) {
 	w.WriteClusterInfo(launcher.ClusterOptions{
 		Queues: []string{"default"},
@@ -324,6 +334,13 @@ func TestRunFieldFiltering(t *testing.T) {
 	})
 }
 
+func TestRunMetrics(t *testing.T) {
+	p := newTestPlugin(t)
+	conformance.RunMetrics(t, p, conformance.MetricsOpts{
+		Timeout: 2 * time.Second,
+	})
+}
+
 func TestRunControlInvalidState(t *testing.T) {
 	p := newTestPlugin(t)
 	conformance.RunControlInvalidState(t, p, "testuser", conformance.InvalidStateOpts{

conformance/scenarios.go

Lines changed: 50 additions & 0 deletions
@@ -638,6 +638,56 @@ func RunControlInvalidState(t *testing.T, p launcher.Plugin, user string, opts I
 	}
 }
 
+// MetricsOpts configures the [RunMetrics] scenario.
+type MetricsOpts struct {
+	// Timeout for the Metrics call. Default: 1s.
+	Timeout time.Duration
+}
+
+// RunMetrics verifies that a [launcher.MetricsPlugin] implementation returns
+// promptly and produces valid metrics data.
+func RunMetrics(t *testing.T, p launcher.Plugin, opts MetricsOpts) {
+	t.Helper()
+	timeout := defaultTimeout(opts.Timeout, time.Second)
+
+	mp, ok := p.(launcher.MetricsPlugin)
+	if !ok {
+		t.Skip("plugin does not implement MetricsPlugin")
+	}
+
+	t.Run("ReturnsWithinTimeout", func(t *testing.T) {
+		ctx, cancel := context.WithTimeout(context.Background(), timeout)
+		defer cancel()
+
+		done := make(chan launcher.PluginMetrics, 1)
+		go func() {
+			done <- mp.Metrics(ctx)
+		}()
+
+		select {
+		case <-done:
+			// Good — Metrics returned promptly.
+		case <-ctx.Done():
+			t.Fatal("Metrics() did not return within timeout")
+		}
+	})
+
+	t.Run("LatencyBucketsNonNegative", func(t *testing.T) {
+		metrics := mp.Metrics(context.Background())
+		if metrics.ClusterInteractionLatency == nil {
+			t.Skip("no cluster interaction latency reported")
+		}
+		for i, v := range metrics.ClusterInteractionLatency.Buckets {
+			if v < 0 {
+				t.Errorf("bucket[%d] = %v, want >= 0", i, v)
+			}
+		}
+		if metrics.ClusterInteractionLatency.Sum < 0 {
+			t.Errorf("sum = %v, want >= 0", metrics.ClusterInteractionLatency.Sum)
+		}
+	})
+}
+
 // assertExitCode checks that the job's exit code is in the list of
 // acceptable values.
 func assertExitCode(t *testing.T, job *api.Job, acceptable []int) {

docs/API.md

Lines changed: 88 additions & 1 deletion
@@ -66,6 +66,34 @@ Every response from the plugin includes:
 | 7 | Get Job Resource Util | Stream resource utilization metrics for a job. |
 | 8 | Get Job Network | Get network information (hostname, IPs) for a job. |
 | 9 | Get Cluster Info | Get cluster capabilities and configuration. |
+| 17 | Multi Cluster Info | Get capabilities for multiple clusters. |
+| 201 | Set Load Balancer Nodes | Update load-balanced node list. |
+| 202 | Config Reload | Request configuration reload. |
+| 203 | Metrics Response | Periodic plugin metrics (plugin-initiated, no request). |
+
+### Metrics response (type 203)
+
+Unlike all other protocol messages, the plugin initiates the metrics response. The plugin sends it periodically on a timer (controlled by `--plugin-metrics-interval-seconds`) without any corresponding request from the Launcher. Both `requestId` and `responseId` are zero.
+
+```json
+{
+  "messageType": 203,
+  "requestId": 0,
+  "responseId": 0,
+  "uptimeSeconds": 3600,
+  "clusterInteractionLatencySample": {
+    "buckets": [0, 2, 3, 0, 0, 0, 0, 0, 0, 0],
+    "sum": 1.52
+  }
+}
+```
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `uptimeSeconds` | uint64 | Seconds since the plugin started. Always present. |
+| `clusterInteractionLatencySample` | object | Histogram snapshot of cluster interaction latency. Optional. |
+| `clusterInteractionLatencySample.buckets` | []float64 | Per-bucket observation counts (non-cumulative). |
+| `clusterInteractionLatencySample.sum` | float64 | Sum of all observed values. |
 
 ### Stream responses
 
@@ -239,6 +267,58 @@ Returns information about cluster capabilities (queues, resource limits, contain
 - `w` - ResponseWriter to send cluster info
 - `user` - Username requesting info
 
+### Type: MetricsPlugin (optional interface)
+
+```go
+type MetricsPlugin interface {
+	Plugin
+	Metrics(ctx context.Context) PluginMetrics
+}
+```
+
+Plugins that want to report custom metrics to the Launcher implement this interface. The `Metrics` method is called periodically (controlled by `--plugin-metrics-interval-seconds`). All plugins automatically report `uptimeSeconds`; implement this interface only for additional plugin-specific metrics like cluster interaction latency.
+
+Implementations should return quickly and avoid blocking I/O.
+
+### Type: PluginMetrics
+
+```go
+type PluginMetrics struct {
+	ClusterInteractionLatency *protocol.HistogramSample
+}
+```
+
+Contains metrics data collected by a plugin.
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `ClusterInteractionLatency` | `*protocol.HistogramSample` | Histogram snapshot of cluster interaction latency. Nil means no data. |
+
+### Type: Histogram
+
+```go
+type Histogram struct { /* unexported fields */ }
+
+func NewHistogram(buckets []float64) *Histogram
+func (h *Histogram) Observe(v float64)
+func (h *Histogram) Drain() *protocol.HistogramSample
+```
+
+A thread-safe histogram that accumulates observations locally and can be drained into a portable snapshot for sending to the Launcher. Use `NewHistogram(ClusterInteractionLatencyBuckets)` to create one with the correct bucket boundaries.
+
+- `Observe` records a single observation (e.g., a latency measurement in seconds). Safe for concurrent use.
+- `Drain` collects all accumulated observations since the last drain, resets the histogram, and returns a portable snapshot. Returns nil if no observations have been recorded.
+
+### Variable: ClusterInteractionLatencyBuckets
+
+```go
+var ClusterInteractionLatencyBuckets = []float64{
+	0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 5.0, 10.0, 30.0,
+}
+```
+
+The histogram bucket upper bounds (in seconds) for cluster interaction latency. These must match the Launcher's bucket boundaries so histogram data can be replayed correctly.
+
 ### Type: ResponseWriter
 
 ```go
@@ -373,12 +453,18 @@ Closes the stream. Must be called when streaming is complete.
 
 ```go
 type Runtime struct {
-	// contains filtered or unexported fields
+	MaxMessageSize  int
+	MetricsInterval time.Duration
 }
 ```
 
 The Runtime handles the request/response protocol and dispatches to plugin methods.
 
+| Field | Type | Description |
+|-------|------|-------------|
+| `MaxMessageSize` | `int` | Upper limit on message size for requests and responses. |
+| `MetricsInterval` | `time.Duration` | Interval between periodic metrics reports. Zero disables. Typically set from `DefaultOptions.MetricsInterval`. |
+
 #### Function: NewRuntime
 
 ```go
@@ -413,6 +499,7 @@ type DefaultOptions struct {
 	Debug             bool
 	JobExpiry         time.Duration
 	HeartbeatInterval time.Duration
+	MetricsInterval   time.Duration
 	LauncherConfig    string
 	PluginName        string
 	ScratchPath       string
docs/ARCHITECTURE.md

Lines changed: 22 additions & 0 deletions
@@ -353,6 +353,28 @@ type Error struct {
 - Better than string parsing
 - Follows Launcher API specification
 
+### Metrics collection
+
+The SDK supports periodic metrics reporting to the Launcher (API v3.7.0+). Unlike all other protocol messages, the plugin initiates metrics — it sends `MetricsResponse` messages on a timer without any corresponding request.
+
+```
+Bootstrap completes → Start metrics goroutine → Every N seconds:
+  Collect uptime + plugin metrics → Serialize → Send via response channel
+```
+
+**Data flow:**
+
+```
+Plugin (local accumulator) → MetricsResponse (JSON/IPC) → Launcher (Prometheus registry)
+  Observe()                    Drain + serialize           ObserveMultiple()
+```
+
+The plugin uses a local prometheus histogram as a cache, accumulating observations on the hot path (e.g., timing each Slurm command). On each metrics tick, the framework drains the histogram (collecting and resetting it) and sends the snapshot to the Launcher, which replays the data into its own Prometheus registry.
+
+**Why push-based?** The Launcher-plugin IPC channel has no QoS. Requesting metrics on demand could delay time-sensitive messages (job status updates, control operations). Push-based metrics use the existing response channel and are inherently non-blocking from the Launcher's perspective.
+
+**Why swap-on-drain?** The Go prometheus client does not expose a `Reset()` method on individual histograms. The SDK works around this by atomically swapping the current histogram for a fresh one on each drain, then collecting from the old instance.
+
 ## Job cache design
 
 ### Storage and startup

docs/GUIDE.md

Lines changed: 83 additions & 0 deletions
@@ -1045,6 +1045,89 @@ func (p *MyPlugin) ReloadConfig(ctx context.Context) error {
 }
 ```
 
+### Plugin metrics
+
+The Launcher collects periodic metrics from plugins and exposes them on its Prometheus `/metrics` endpoint. All plugins automatically report `uptimeSeconds`. Plugins that interact with external schedulers can report additional metrics by implementing the `MetricsPlugin` interface.
+
+The Launcher passes `--plugin-metrics-interval-seconds <N>` at startup (default: 60, 0 to disable). The SDK handles the timer and IPC automatically.
+
+#### Reporting cluster interaction latency
+
+If your plugin runs CLI commands or makes API calls to a scheduler, you can measure their latency and report it as a histogram. A cluster interaction is any individual call to the external scheduler — a CLI command invocation, an HTTP/gRPC API request, or an SDK method call. Measure the wall-clock duration of the external call itself, from invocation to response.
+
+**What to measure:**
+
+- Time every external scheduler call: job submission, control operations (stop/kill/cancel), status queries, output retrieval, resource usage queries, etc.
+- For batch operations (e.g., a single `squeue` call that returns status for many jobs), record one observation for the entire call, not one per job.
+- Measure only the external call duration. Don't include internal cache lookups, response serialization, or in-process logic.
+
+**Setup:**
+
+```go
+type MyPlugin struct {
+	latency *launcher.Histogram
+}
+
+func NewMyPlugin() *MyPlugin {
+	return &MyPlugin{
+		latency: launcher.NewHistogram(launcher.ClusterInteractionLatencyBuckets),
+	}
+}
+
+// Metrics implements launcher.MetricsPlugin.
+func (p *MyPlugin) Metrics(ctx context.Context) launcher.PluginMetrics {
+	return launcher.PluginMetrics{
+		ClusterInteractionLatency: p.latency.Drain(),
+	}
+}
+```
+
+Then record latency wherever your plugin calls the scheduler:
+
+```go
+func (p *MyPlugin) SubmitJob(ctx context.Context, w launcher.ResponseWriter, user string, job *api.Job) {
+	start := time.Now()
+	result, err := runSchedulerCommand(ctx, "submit", job)
+	p.latency.Observe(time.Since(start).Seconds())
+	// ... handle result ...
+}
+
+func (p *MyPlugin) ControlJob(ctx context.Context, w launcher.ResponseWriter, user string, id api.JobID, op api.JobOperation) {
+	start := time.Now()
+	err := runSchedulerCommand(ctx, string(op), id)
+	p.latency.Observe(time.Since(start).Seconds())
+	// ... handle result ...
+}
+
+func (p *MyPlugin) pollStatuses(ctx context.Context) {
+	// Batch status query — one observation for the entire call
+	start := time.Now()
+	statuses, err := runSchedulerCommand(ctx, "status", "--all")
+	p.latency.Observe(time.Since(start).Seconds())
+	// ... update cache ...
+}
+```
+
+The `Histogram` type is thread-safe. Call `Observe` from any goroutine. The framework calls `Drain` on each metrics tick, which collects all accumulated observations and resets the histogram.
+
+#### Wiring metrics into the runtime
+
+Pass the metrics interval from `DefaultOptions` to the `Runtime`:
+
+```go
+opts := &launcher.DefaultOptions{}
+launcher.MustLoadOptions(opts, "myplugin")
+
+rt := launcher.NewRuntime(lgr, plugin)
+rt.MetricsInterval = opts.MetricsInterval
+```
+
+#### How it works
+
+The plugin accumulates metrics locally (using a prometheus histogram as a cache). On each metrics tick, the framework drains the accumulated data and sends a `MetricsResponse` (message type 203) to the Launcher over the IPC channel. The Launcher replays the histogram data into its own Prometheus registry, which API clients can then query.
+
+This design avoids adding request/response overhead to the Launcher-plugin connection for metrics collection. The team rejected the on-demand alternative because there is no QoS on the IPC channel and metrics requests could delay more important messages.
+
 ### User profiles
 
 System administrators may want to set default or maximum values for certain features on a per-user or per-group basis. For example, different groups of users could have different memory limits or CPU counts.
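
To make the drain semantics in the GUIDE hunk above concrete, here is a minimal stand-alone illustration of the swap-on-drain pattern using a hand-rolled bucket counter instead of the prometheus client. The `histogram` and `sample` types are simplified stand-ins for `launcher.Histogram` and `protocol.HistogramSample`, not the real implementation:

```go
package main

import (
	"fmt"
	"sort"
	"sync"
)

// sample mirrors the shape of the snapshot the SDK sends (names illustrative).
type sample struct {
	Buckets []uint64
	Sum     float64
}

// histogram demonstrates swap-on-drain: Drain swaps in fresh counters under
// the lock and reads the old ones afterwards, so Observe never waits on
// serialization work.
type histogram struct {
	mu     sync.Mutex
	bounds []float64 // bucket upper bounds, ascending
	counts []uint64
	sum    float64
}

func newHistogram(bounds []float64) *histogram {
	return &histogram{bounds: bounds, counts: make([]uint64, len(bounds))}
}

func (h *histogram) Observe(v float64) {
	h.mu.Lock()
	defer h.mu.Unlock()
	// First bucket whose upper bound is >= v; larger values land in the
	// last bucket here (a real histogram may use a +Inf overflow bucket).
	i := sort.SearchFloat64s(h.bounds, v)
	if i == len(h.bounds) {
		i = len(h.bounds) - 1
	}
	h.counts[i]++
	h.sum += v
}

func (h *histogram) Drain() *sample {
	h.mu.Lock()
	old, oldSum := h.counts, h.sum
	h.counts = make([]uint64, len(h.bounds)) // the swap: fresh counters go live
	h.sum = 0
	h.mu.Unlock()

	var n uint64
	for _, c := range old {
		n += c
	}
	if n == 0 {
		return nil // nothing observed since the last drain
	}
	return &sample{Buckets: old, Sum: oldSum}
}

func main() {
	h := newHistogram([]float64{0.01, 0.1, 1.0})
	h.Observe(0.05)
	h.Observe(0.2)
	fmt.Println(h.Drain().Buckets) // [0 1 1]
	fmt.Println(h.Drain() == nil)  // true: the counters were reset
}
```

Each drain returns everything since the previous drain exactly once, which is why the Launcher can replay the non-cumulative bucket counts into its own registry without double-counting.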

examples/inmemory/main.go

Lines changed: 5 additions & 3 deletions
@@ -376,10 +376,12 @@ func main() {
 		nextID: 0,
 	}
 
-	// Create the runtime and start handling requests
-	// This blocks until the context is cancelled (e.g., Ctrl+C)
+	// Create the runtime and start handling requests.
+	// This blocks until the context is cancelled (e.g., Ctrl+C).
 	lgr.Info("Plugin ready to accept requests")
-	if err := launcher.NewRuntime(lgr, plugin).Run(ctx); err != nil {
+	rt := launcher.NewRuntime(lgr, plugin)
+	rt.MetricsInterval = options.MetricsInterval
+	if err := rt.Run(ctx); err != nil {
 		lgr.Error("Plugin runtime error", "error", err)
 		os.Exit(1)
 	}
