Commit 9905977

new feature: post run insights, query and index analysis
1 parent e15b073 commit 9905977

10 files changed (+2571, -58 lines)

INSIGHTS.md

Lines changed: 214 additions & 0 deletions
# Post-Run Insights: Slow Query and Index Analysis

This document explains PLGM's post-run insights layer in detail.

The feature provides a structured analysis after benchmark completion, including:

- slow operation groups
- affected collections
- normalized query-shape groupings
- cautious, evidence-based index guidance
- export-ready JSON data for downstream dashboards

## What It Is

The insights layer is a **foundational analytics pass** designed to run after all iterations are complete.

It is intentionally separated from the real-time charts to keep runtime overhead bounded and predictable.

## Where It Appears

After completion, insights are available in:

- the Web UI dashboard panel: `POST-RUN SLOW QUERY & INDEX ANALYSIS`
- the API endpoint: `GET /api/insights`
- the `Download Summary` JSON export, under the `insights` section

## When It Runs

Insights are finalized only after all workloads finish.

- While a run is active, `GET /api/insights` returns `metadata.status = pending`.
- Once complete, the endpoint returns the final analysis (`ready` / `empty` / `disabled`).

This behavior avoids presenting partial or misleading findings during execution.

## Data Collection Model

PLGM captures sampled operation events during workload execution, with bounded retention.

Each sampled event may include:

- operation type
- database and collection
- normalized shape key and shape summary
- extracted filter fields (when applicable)
- duration
- success/failure
- iteration index
- timestamp

Retention characteristics:

- sampled (configurable sampling rate)
- bounded ring buffer (`insights_max_events`)
- bounded aggregation cardinality (`insights_max_groups`)

This prevents unbounded memory growth while preserving useful signal.
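The sampling and bounded-retention model can be sketched as follows. This is an illustrative sketch, not PLGM's actual implementation; the `opEvent` and `eventBuffer` names are hypothetical. It shows the key property: memory use is fixed by the buffer capacity regardless of how many operations run.

```go
package main

import (
	"fmt"
	"math/rand"
)

// opEvent is a hypothetical sampled operation event.
type opEvent struct {
	Op         string
	Collection string
	DurationMs int
}

// eventBuffer is a bounded ring buffer: once capacity is reached,
// the oldest event is overwritten, so memory stays constant.
type eventBuffer struct {
	events []opEvent
	next   int
	full   bool
}

func newEventBuffer(capacity int) *eventBuffer {
	return &eventBuffer{events: make([]opEvent, capacity)}
}

// Offer records the event only if the sampling coin-flip passes,
// so retention is bounded starting at capture time.
func (b *eventBuffer) Offer(e opEvent, samplingRate float64) {
	if rand.Float64() >= samplingRate {
		return // not sampled
	}
	b.events[b.next] = e
	b.next = (b.next + 1) % len(b.events)
	if b.next == 0 {
		b.full = true
	}
}

// Len reports how many events are currently retained.
func (b *eventBuffer) Len() int {
	if b.full {
		return len(b.events)
	}
	return b.next
}

func main() {
	buf := newEventBuffer(5000) // mirrors insights_max_events
	for i := 0; i < 100000; i++ {
		buf.Offer(opEvent{Op: "find", Collection: "users", DurationMs: i % 400}, 0.10)
	}
	fmt.Println("retained:", buf.Len()) // never exceeds 5000
}
```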
## What Insights Contains

Top-level sections in the final report:

- `summary`
- `slow_queries`
- `affected_collections`
- `query_shapes`
- `potential_index_issues`
- `recommendations`
- `per_iteration`
- `time_slices`
- `metadata`

## Stable Shape IDs and Cross-Run Trends

Each shape group has a stable `shape_id` derived from:

- operation
- collection
- normalized shape key

This enables consistent identity across runs.
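One way such a stable ID could be derived is by hashing the identity triple after normalizing the filter to its field structure. This is a sketch under assumptions: `normalizeShapeKey` and the 12-character truncated SHA-256 are illustrative choices, not PLGM's documented scheme.

```go
package main

import (
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"sort"
	"strings"
)

// normalizeShapeKey reduces a filter to its field structure, discarding
// values, so {"user_id": 42} and {"user_id": 7} group together.
func normalizeShapeKey(filter map[string]interface{}) string {
	fields := make([]string, 0, len(filter))
	for k := range filter {
		fields = append(fields, k)
	}
	sort.Strings(fields) // field order must not change the identity
	return "{" + strings.Join(fields, ",") + "}"
}

// shapeID derives a stable identifier from the operation, collection,
// and normalized shape key, so the same shape matches across runs.
func shapeID(op, collection, shapeKey string) string {
	sum := sha256.Sum256([]byte(op + "|" + collection + "|" + shapeKey))
	return hex.EncodeToString(sum[:])[:12]
}

func main() {
	key := normalizeShapeKey(map[string]interface{}{"status": "active", "user_id": 42})
	fmt.Println(shapeID("find", "users", key)) // same value on every run for this shape
}
```

Because the ID depends only on the triple, two runs that hit the same shape produce the same `shape_id`, which is what makes baseline matching possible.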
PLGM also keeps a lightweight in-memory baseline to show trend hints for matching shapes, e.g.:

- improved
- worse
- flat
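A minimal sketch of how such a trend hint might be classified against the baseline; the 10% band is an assumed threshold for illustration, not PLGM's documented behavior:

```go
package main

import "fmt"

// trendHint compares a shape's current average latency against the
// baseline recorded for the same shape_id in a previous run.
func trendHint(baselineMs, currentMs float64) string {
	if baselineMs <= 0 {
		return "flat" // no usable baseline for this shape
	}
	delta := (currentMs - baselineMs) / baselineMs
	switch {
	case delta <= -0.10:
		return "improved"
	case delta >= 0.10:
		return "worse"
	default:
		return "flat"
	}
}

func main() {
	fmt.Println(trendHint(250, 180)) // improved
	fmt.Println(trendHint(250, 320)) // worse
	fmt.Println(trendHint(250, 255)) // flat
}
```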
## Optional Explain Sampling (Off by Default)

An optional post-run explain mode can enrich the evidence for the top slow shapes.

Important design choices:

- disabled by default
- runs only post-run
- limited to the top-N shapes
- bounded by a maximum explain execution time
- falls back to heuristic messaging if explain is unavailable

If explain sampling is enabled, index-issue messages may be upgraded when evidence is observed (for example, an explain plan indicating `COLLSCAN`).

## Index Advice Philosophy

PLGM uses confidence-aware wording and does not overstate certainty.

Possible evidence levels:

- heuristic
- heuristic with index-overlap/no-overlap signals
- explain-based evidence (when enabled and successful)

Typical language intentionally uses cautious terms like:

- "possible missing index"
- "collection scan is possible"
- "validate with explain"

## Web UI Configuration

Path: `Advanced -> Insights Analysis`

Available controls:

- Enable Post-Run Insights Analysis
- Enable Post-Run Explain Sampling (Optional)
- Insights Sampling Rate
- Slow Threshold (ms)
- Max Retained Events
- Max Group Entries
- Explain Top N Shapes
- Explain Max Time (ms)

All settings are applied per run and included in the exported summary config.

## API Contract

`GET /api/insights`

Typical states:

- `inactive`: no collector/run context
- `pending`: run still active
- `ready`: completed report available
- `empty`: no sampled events in buffer
- `disabled`: insights disabled via configuration

The payload is read-only and designed for the UI or future dashboard consumers.
## Export Contract

`Download Summary` includes:

- the final benchmark summary fields
- an `insights` object identical to the post-run API/UI model
- redacted password handling preserved

## Configuration Reference

Config file keys:

- `insights_enabled`
- `insights_sampling_rate`
- `insights_slow_threshold_ms`
- `insights_max_events`
- `insights_max_groups`
- `insights_explain_enabled`
- `insights_explain_top_n`
- `insights_explain_max_time_ms`

Environment overrides:

- `PLGM_INSIGHTS_ENABLED`
- `PLGM_INSIGHTS_SAMPLING_RATE`
- `PLGM_INSIGHTS_SLOW_THRESHOLD_MS`
- `PLGM_INSIGHTS_MAX_EVENTS`
- `PLGM_INSIGHTS_MAX_GROUPS`
- `PLGM_INSIGHTS_EXPLAIN_ENABLED`
- `PLGM_INSIGHTS_EXPLAIN_TOP_N`
- `PLGM_INSIGHTS_EXPLAIN_MAX_TIME_MS`
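Putting the file keys together, a `config.yaml` fragment using the documented defaults looks like this (each key can be overridden by its corresponding `PLGM_*` environment variable):

```yaml
insights_enabled: true
insights_sampling_rate: 0.10
insights_slow_threshold_ms: 200
insights_max_events: 5000
insights_max_groups: 300
insights_explain_enabled: false
insights_explain_top_n: 5
insights_explain_max_time_ms: 1000
```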
## Recommended Starting Values

For general usage:

- sampling rate: `0.10`
- slow threshold: `200ms`
- max events: `5000`
- max groups: `300`
- explain sampling: disabled

For deeper troubleshooting (short test windows):

- sampling rate: `0.25` to `1.0`
- explain sampling: enabled
- top N shapes: `3` to `5`
- explain max time: `1000` to `3000`

## Use Cases

1. Fast post-run triage
   - Identify the top slow groups immediately after completion.

2. Collection hotspot detection
   - Detect which collections account for most slow patterns.

3. Safe index investigation shortlist
   - Generate candidate fields/patterns to validate in DBA workflows.

4. Iteration and timeline context
   - Compare behavior across iterations and time slices.

5. CI / automated benchmarking exports
   - Consume the structured `insights` JSON in pipelines and reports.

## Known Limitations

- Sampling means results are representative, not exhaustive.
- Heuristic index advice does not guarantee that a missing index is the root cause.
- Explain enrichment depends on representative sample availability and database access.
- Trend persistence is in-memory; it does not survive process restarts.
- Explain sampling is intentionally post-run only, to protect active benchmark performance.

## Future Enhancements

Potential next steps for a full insights dashboard:

- persistent historical run storage for long-term trend analysis
- richer explain-plan capture and comparison views
- cross-run diff reports and regression alerts
- deeper per-shape drill-down and filter playback tools

README.md

Lines changed: 21 additions & 1 deletion
```diff
@@ -87,6 +87,15 @@ Download the [`config.yaml`](./config.yaml) and make the necessary adjustments.
 
 For unit and integration test instructions (including Docker-based MongoDB setup), see [`TESTING.md`](./TESTING.md).
 
+### 5. Post-Run Insights (Shortcut)
+
+PLGM includes a post-run **Slow Query and Index Analysis** layer that is configurable from the Web UI (Advanced tab).
+
+Quick access:
+* Full guide: [`INSIGHTS.md`](./INSIGHTS.md)
+* Web UI path: `Advanced -> Insights Analysis`
+* Output: Dashboard insights panel + `Download Summary` JSON (`insights` section)
+
 ## The Interactive UI
 
 `plgm` features a completely embedded Web UI. It allows you to configure your database connection, upload custom workload schemas, adjust operation ratios, and monitor real-time throughput and latency without ever touching a YAML file. It has the same functionality as the CLI version, but with an awesome UI.
@@ -133,6 +142,7 @@ When running `plgm` with the `--webui` flag, you get access to a rich, browser-b
 * **Live Telemetry & Dashboard:** Watch operations per second (Find, Insert, Update, Delete) and latencies update in real-time with sub-second precision.
 * **The "Time Machine" Scrubber:** Pause the live feed and scrub backward through the benchmark timeline to investigate specific latency spikes or throughput drops.
 * **Real-Time CSV Export:** Configure and stream metrics to a local CSV file directly from the Advanced tab. Use the "Append" feature to stitch multiple benchmark runs into a single dataset.
+* **Post-Run Insights Analysis:** Review slow-query groups, affected collections, potential index issues, and recommendations after all iterations complete.
 * **Graceful Shutdown:** Click the **EXIT** button in the header to safely terminate the application directly from the browser, ensuring all background workers are cleaned up properly.
 
 ### 3. Configuration
@@ -603,7 +613,7 @@ Once the csv is exported, you can script your own method to plot its data. We ha
 ## Post-Run JSON Summary Report
 If you forget to enable the real-time CSV export, or if you just want a clean summary of your final results, PLGM provides a Download Summary Report button in the Web UI that appears the moment a workload finishes.
 
-This generates a downloadable JSON summary report that captures both the final performance metrics (total ops, average latencies, and throughput per operation type) alongside the exact configuration parameters used to achieve those results. Passwords are automatically redacted from this file for safe sharing.
+This generates a downloadable JSON summary report that captures the final performance metrics (total ops, average latencies, and throughput per operation type), the post-run insights analysis, and the exact configuration parameters used to achieve those results. Passwords are automatically redacted from this file for safe sharing.
 
 Example Summary Snippet:
@@ -621,6 +631,7 @@ Example Summary Snippet:
   },
   "operations": { ... },
   "average_latencies_ms": { ... },
+  "insights": { ... },
   "configuration": {
     "concurrency": "4",
     "find_batch_size": "10",
@@ -690,6 +701,15 @@ You can override any setting in `config.yaml` using environment variables. This
 | `csv_export_enabled` ||Continuously stream workload throughput metrics to a CSV file| `false` |
 | `csv_export_append` ||If true, appends to the file. If false, overwrites it.| `false` |
 | `csv_export_path` ||Path and metrics file name| `/tmp/plgm_metrics_export.csv` |
+| **Post-Run Insights** | | | |
+| `insights_enabled` | `PLGM_INSIGHTS_ENABLED` | Enable post-run slow-query/index analysis | `true` |
+| `insights_sampling_rate` | `PLGM_INSIGHTS_SAMPLING_RATE` | Sample rate for captured operation events (`0.01`-`1.0`) | `0.10` |
+| `insights_slow_threshold_ms` | `PLGM_INSIGHTS_SLOW_THRESHOLD_MS` | Latency threshold used to classify operations as slow | `200` |
+| `insights_max_events` | `PLGM_INSIGHTS_MAX_EVENTS` | Max retained sampled events in memory | `5000` |
+| `insights_max_groups` | `PLGM_INSIGHTS_MAX_GROUPS` | Max aggregated slow-shape groups | `300` |
+| `insights_explain_enabled` | `PLGM_INSIGHTS_EXPLAIN_ENABLED` | Enable optional post-run explain sampling | `false` |
+| `insights_explain_top_n` | `PLGM_INSIGHTS_EXPLAIN_TOP_N` | Number of top slow shapes to attempt explain on | `5` |
+| `insights_explain_max_time_ms` | `PLGM_INSIGHTS_EXPLAIN_MAX_TIME_MS` | Max server time per explain command | `1000` |
 | **Workload Control** | | | |
 | `concurrency` | `PLGM_CONCURRENCY` | Number of active worker goroutines | `50` |
 | `duration` | `PLGM_DURATION` | Test duration (Go duration string) | `5m`, `60s` |
```

internal/benchmark/runner.go

Lines changed: 1 addition & 0 deletions
```diff
@@ -99,6 +99,7 @@ func RunRawInjector(ctx context.Context, db *mongo.Database, cfg *config.AppConf
 	} else {
 		collector = stats.NewCollector()
 	}
+	collector.ConfigureInsights(cfg)
 
 	monitorDone := make(chan struct{})
```
internal/config/config.go

Lines changed: 80 additions & 0 deletions
```diff
@@ -53,6 +53,15 @@ type AppConfig struct {
 	CSVExportEnabled bool   `yaml:"csv_export_enabled"`
 	CSVExportAppend  bool   `yaml:"csv_export_append"`
 	CSVExportPath    string `yaml:"csv_export_path"`
+
+	InsightsEnabled          bool    `yaml:"insights_enabled"`
+	InsightsSamplingRate     float64 `yaml:"insights_sampling_rate"`
+	InsightsSlowThresholdMs  int     `yaml:"insights_slow_threshold_ms"`
+	InsightsMaxEvents        int     `yaml:"insights_max_events"`
+	InsightsMaxGroups        int     `yaml:"insights_max_groups"`
+	InsightsExplainEnabled   bool    `yaml:"insights_explain_enabled"`
+	InsightsExplainTopN      int     `yaml:"insights_explain_top_n"`
+	InsightsExplainMaxTimeMS int     `yaml:"insights_explain_max_time_ms"`
 }
 
 type WebUIConfig struct {
@@ -150,6 +159,16 @@ func applyUIDefaults(cfg *AppConfig) {
 	cfg.CSVExportEnabled = false
 	cfg.CSVExportAppend = false
 	cfg.CSVExportPath = "plgm_metrics_export.csv"
+
+	// --- INSIGHTS DEFAULTS ---
+	cfg.InsightsEnabled = true
+	cfg.InsightsSamplingRate = 0.10
+	cfg.InsightsSlowThresholdMs = 200
+	cfg.InsightsMaxEvents = 5000
+	cfg.InsightsMaxGroups = 300
+	cfg.InsightsExplainEnabled = false
+	cfg.InsightsExplainTopN = 5
+	cfg.InsightsExplainMaxTimeMS = 1000
 }
 
 // applyBaseDefaults sets low-level engine safety limits & remaining UI limits
@@ -167,6 +186,25 @@ func applyBaseDefaults(cfg *AppConfig) {
 		cfg.CSVExportPath = "plgm_metrics_export.csv"
 	}
 
+	if cfg.InsightsSamplingRate <= 0 || cfg.InsightsSamplingRate > 1 {
+		cfg.InsightsSamplingRate = 0.10
+	}
+	if cfg.InsightsSlowThresholdMs <= 0 {
+		cfg.InsightsSlowThresholdMs = 200
+	}
+	if cfg.InsightsMaxEvents <= 0 {
+		cfg.InsightsMaxEvents = 5000
+	}
+	if cfg.InsightsMaxGroups <= 0 {
+		cfg.InsightsMaxGroups = 300
+	}
+	if cfg.InsightsExplainTopN <= 0 {
+		cfg.InsightsExplainTopN = 5
+	}
+	if cfg.InsightsExplainMaxTimeMS <= 0 {
+		cfg.InsightsExplainMaxTimeMS = 1000
+	}
+
 	// Web UI Port
 	if cfg.WebUI.Port <= 0 {
 		cfg.WebUI.Port = 9999 // default if not specified via flag
@@ -488,6 +526,48 @@ func applyEnvOverrides(cfg *AppConfig) map[string]bool {
 		}
 	}
 
+	// --- Insights Overrides ---
+	if v := os.Getenv("PLGM_INSIGHTS_ENABLED"); v != "" {
+		if b, err := strconv.ParseBool(v); err == nil {
+			cfg.InsightsEnabled = b
+		}
+	}
+	if v := os.Getenv("PLGM_INSIGHTS_SAMPLING_RATE"); v != "" {
+		if f, err := strconv.ParseFloat(v, 64); err == nil && f > 0 && f <= 1 {
+			cfg.InsightsSamplingRate = f
+		}
+	}
+	if v := os.Getenv("PLGM_INSIGHTS_SLOW_THRESHOLD_MS"); v != "" {
+		if n, err := strconv.Atoi(v); err == nil && n > 0 {
+			cfg.InsightsSlowThresholdMs = n
+		}
+	}
+	if v := os.Getenv("PLGM_INSIGHTS_MAX_EVENTS"); v != "" {
+		if n, err := strconv.Atoi(v); err == nil && n > 0 {
+			cfg.InsightsMaxEvents = n
+		}
+	}
+	if v := os.Getenv("PLGM_INSIGHTS_MAX_GROUPS"); v != "" {
+		if n, err := strconv.Atoi(v); err == nil && n > 0 {
+			cfg.InsightsMaxGroups = n
+		}
+	}
+	if v := os.Getenv("PLGM_INSIGHTS_EXPLAIN_ENABLED"); v != "" {
+		if b, err := strconv.ParseBool(v); err == nil {
+			cfg.InsightsExplainEnabled = b
+		}
+	}
+	if v := os.Getenv("PLGM_INSIGHTS_EXPLAIN_TOP_N"); v != "" {
+		if n, err := strconv.Atoi(v); err == nil && n > 0 {
+			cfg.InsightsExplainTopN = n
+		}
+	}
+	if v := os.Getenv("PLGM_INSIGHTS_EXPLAIN_MAX_TIME_MS"); v != "" {
+		if n, err := strconv.Atoi(v); err == nil && n > 0 {
+			cfg.InsightsExplainMaxTimeMS = n
+		}
+	}
+
 	return overrides
 }
```