diff --git a/README.md b/README.md index dd6c92a..b1ca797 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,32 @@ Comprehensive performance testing suite for the CoW Protocol Playground, enablin cow-perf run --config configs/scenarios/light-load.yml ``` +## Monitoring & Visualization + +Prometheus metrics export is **enabled by default** (port 9091). To use the full monitoring stack: + +1. **Start Prometheus & Grafana** + ```bash + docker compose --profile monitoring up -d + ``` + +2. **Run a test** (metrics export automatically on port 9091) + ```bash + cow-perf run --config configs/scenarios/light-load.yml + ``` + +3. **View dashboards** at http://localhost:3000 (default: admin/admin) + - Performance Overview + - API Performance + - Resources + - Comparison + - Trader Activity + +4. **Disable metrics export** (if needed) + ```bash + cow-perf run --config configs/scenarios/light-load.yml --prometheus-port 0 + ``` + For detailed setup and troubleshooting, see [Development Guide](docs/development.md). 
## Documentation diff --git a/configs/dashboards/api-performance.json b/configs/dashboards/api-performance.json new file mode 100644 index 0000000..ba04f6d --- /dev/null +++ b/configs/dashboards/api-performance.json @@ -0,0 +1,762 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [ + { + "asDropdown": false, + "icon": "dashboard", + "includeVars": true, + "keepTime": true, + "tags": [], + "targetBlank": false, + "title": "Overview", + "tooltip": "Go to Overview Dashboard", + "type": "link", + "url": "/d/cow-perf-overview" + }, + { + "asDropdown": false, + "icon": "dashboard", + "includeVars": true, + "keepTime": true, + "tags": [], + "targetBlank": false, + "title": "Resources", + "tooltip": "Go to Resources Dashboard", + "type": "link", + "url": "/d/cow-perf-resources" + }, + { + "asDropdown": false, + "icon": "dashboard", + "includeVars": true, + "keepTime": true, + "tags": [], + "targetBlank": false, + "title": "Comparison", + "tooltip": "Go to Comparison Dashboard", + "type": "link", + "url": "/d/cow-perf-comparison" + }, + { + "asDropdown": false, + "icon": "dashboard", + "includeVars": true, + "keepTime": true, + "tags": [], + "targetBlank": false, + "title": "Traders", + "tooltip": "Go to Trader Activity Dashboard", + "type": "link", + "url": "/d/cow-perf-traders" + } + ], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 1, + "panels": [], + "title": "API Response Times", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + 
"lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 }, + "id": 2, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.95, sum(rate(cow_perf_api_response_time_seconds_bucket{endpoint=~\"$endpoint\", method=~\"$method\"}[$__rate_interval])) by (le, endpoint))", + "legendFormat": "{{endpoint}}", + "refId": "A" + } + ], + "title": "API Response Time by Endpoint (P95)", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "scaleDistribution": { "type": "linear" } + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 12, "y": 1 }, + "id": 3, + "options": { + "calculate": false, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Greens", + "steps": 64 + }, + "exemplars": { "color": "rgba(255,0,255,0.7)" }, + "filterValues": { "le": 1e-9 }, + "legend": { "show": true }, + "rowsFrame": { "layout": "auto" }, + "tooltip": { "show": true, "yHistogram": false }, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": 
"prometheus" }, + "expr": "sum(rate(cow_perf_api_response_time_seconds_bucket{endpoint=~\"$endpoint\", method=~\"$method\"}[$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "title": "Response Time Distribution", + "type": "heatmap" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.5 }, + { "color": "red", "value": 1 } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.50, sum(rate(cow_perf_api_response_time_seconds_bucket{endpoint=~\"$endpoint\", method=~\"$method\"}[$__rate_interval])) by (le))", + "legendFormat": "P50", + "refId": "A" + } + ], + "title": "P50 Response Time", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 2 } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 5 }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + 
"pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.99, sum(rate(cow_perf_api_response_time_seconds_bucket{endpoint=~\"$endpoint\", method=~\"$method\"}[$__rate_interval])) by (le))", + "legendFormat": "P99", + "refId": "A" + } + ], + "title": "P99 Response Time", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 9 }, + "id": 6, + "panels": [], + "title": "API Throughput", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 }, + "id": 7, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(rate(cow_perf_api_requests_total{endpoint=~\"$endpoint\", method=~\"$method\"}[$__rate_interval])) by (endpoint)", + "legendFormat": "{{endpoint}}", + "refId": "A" + } + ], + "title": "Requests Per Second by Endpoint", + "type": "timeseries" + }, + { + 
"datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "blue", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 4, "x": 12, "y": 10 }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(cow_perf_api_requests_total{endpoint=~\"$endpoint\", method=~\"$method\"})", + "legendFormat": "Total", + "refId": "A" + } + ], + "title": "Total Requests", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "normal" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "GET" }, + "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byName", "options": "POST" }, + "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } 
}] + }, + { + "matcher": { "id": "byName", "options": "PUT" }, + "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byName", "options": "DELETE" }, + "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] + } + ] + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 10 }, + "id": 9, + "options": { + "legend": { + "calcs": ["sum"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(rate(cow_perf_api_requests_total{endpoint=~\"$endpoint\", method=~\"$method\"}[$__rate_interval])) by (method)", + "legendFormat": "{{method}}", + "refId": "A" + } + ], + "title": "Requests by HTTP Method", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 18 }, + "id": 10, + "panels": [], + "title": "API Errors", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "opacity", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 19 }, + "id": 11, + "options": { + "legend": { + 
"calcs": ["sum"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(rate(cow_perf_api_errors_total{endpoint=~\"$endpoint\"}[$__rate_interval])) by (error_type)", + "legendFormat": "{{error_type}}", + "refId": "A" + } + ], + "title": "Error Rate Over Time", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "hideFrom": { "legend": false, "tooltip": false, "viz": false } }, + "mappings": [] + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "client_error" }, + "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byName", "options": "server_error" }, + "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byName", "options": "timeout" }, + "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byName", "options": "connection_error" }, + "properties": [{ "id": "color", "value": { "fixedColor": "purple", "mode": "fixed" } }] + } + ] + }, + "gridPos": { "h": 8, "w": 6, "x": 12, "y": 19 }, + "id": 12, + "options": { + "displayLabels": ["name", "percent"], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": ["value"] + }, + "pieType": "pie", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(cow_perf_api_errors_total{endpoint=~\"$endpoint\"}) by (error_type)", + 
"legendFormat": "{{error_type}}", + "refId": "A" + } + ], + "title": "Errors by Type", + "type": "piechart" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 10 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 19 }, + "id": 13, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(cow_perf_api_errors_total{endpoint=~\"$endpoint\"})", + "legendFormat": "Total", + "refId": "A" + } + ], + "title": "Total Errors", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "custom": { + "align": "auto", + "cellOptions": { "type": "auto" }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Value" }, + "properties": [{ "id": "custom.width", "value": 80 }] + }, + { + "matcher": { "id": "byName", "options": "endpoint" }, + "properties": [{ "id": "custom.width", "value": 200 }] + } + ] + }, + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 23 }, + "id": 14, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": ["sum"], + "show": false + }, + "showHeader": true, + "sortBy": [{ "desc": true, "displayName": "Value" }] + }, + 
"pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(cow_perf_api_errors_total{endpoint=~\"$endpoint\"}) by (endpoint, error_type)", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "refId": "A" + } + ], + "title": "Error Breakdown", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { "Time": true }, + "indexByName": {}, + "renameByName": {} + } + } + ], + "type": "table" + } + ], + "refresh": "5s", + "schemaVersion": 38, + "style": "dark", + "tags": ["cow-protocol", "performance-testing", "api"], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "definition": "label_values(cow_perf_orders_created_total, scenario)", + "hide": 0, + "includeAll": true, + "label": "Scenario", + "multi": false, + "name": "scenario", + "options": [], + "query": { + "query": "label_values(cow_perf_orders_created_total, scenario)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "definition": "label_values(cow_perf_api_requests_total, endpoint)", + "hide": 0, + "includeAll": true, + "label": "Endpoint", + "multi": true, + "name": "endpoint", + "options": [], + "query": { + "query": "label_values(cow_perf_api_requests_total, endpoint)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "definition": "label_values(cow_perf_api_requests_total, method)", 
+ "hide": 0, + "includeAll": true, + "label": "Method", + "multi": true, + "name": "method", + "options": [], + "query": { + "query": "label_values(cow_perf_api_requests_total, method)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "CoW Performance Testing - API Performance", + "uid": "cow-perf-api", + "version": 1, + "weekStart": "" +} diff --git a/configs/dashboards/comparison.json b/configs/dashboards/comparison.json new file mode 100644 index 0000000..2b4ef7b --- /dev/null +++ b/configs/dashboards/comparison.json @@ -0,0 +1,887 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [ + { + "asDropdown": false, + "icon": "dashboard", + "includeVars": true, + "keepTime": true, + "tags": [], + "targetBlank": false, + "title": "Overview", + "tooltip": "Go to Overview Dashboard", + "type": "link", + "url": "/d/cow-perf-overview" + }, + { + "asDropdown": false, + "icon": "dashboard", + "includeVars": true, + "keepTime": true, + "tags": [], + "targetBlank": false, + "title": "API", + "tooltip": "Go to API Performance Dashboard", + "type": "link", + "url": "/d/cow-perf-api" + }, + { + "asDropdown": false, + "icon": "dashboard", + "includeVars": true, + "keepTime": true, + "tags": [], + "targetBlank": false, + "title": "Resources", + "tooltip": "Go to Resources Dashboard", + "type": "link", + "url": "/d/cow-perf-resources" + }, + { + "asDropdown": false, + "icon": "dashboard", + "includeVars": true, + "keepTime": true, + "tags": [], + "targetBlank": false, + "title": "Traders", + "tooltip": "Go to Trader Activity Dashboard", + "type": "link", + "url": "/d/cow-perf-traders" + } + ], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, 
"x": 0, "y": 0 }, + "id": 1, + "panels": [], + "title": "Overview", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "blue", "value": null }] + }, + "unit": "string" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "/^baseline_id$/", + "values": false + }, + "textMode": "value" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_baseline_comparison_percent{baseline_id=~\"$baseline_id\"}", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "refId": "A" + } + ], + "title": "Baseline ID", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [ + { "options": { "0": { "color": "green", "index": 0, "text": "No Regressions" } }, "type": "value" }, + { "options": { "from": 1, "to": 5, "result": { "color": "yellow", "index": 1, "text": "Minor Issues" } }, "type": "range" }, + { "options": { "from": 6, "to": 999, "result": { "color": "red", "index": 2, "text": "Regressions Detected" } }, "type": "range" } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 6 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 }, + "id": 3, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": 
"", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(cow_perf_regression_detected)", + "legendFormat": "Verdict", + "refId": "A" + } + ], + "title": "Overall Verdict", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 5 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(cow_perf_regressions_total)", + "legendFormat": "Total", + "refId": "A" + } + ], + "title": "Total Regressions", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 }, + "id": 5, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": 
"cow_perf_regression_detected{severity=\"critical\"}", + "legendFormat": "Critical", + "refId": "A" + } + ], + "title": "Critical Regressions", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "id": 6, + "panels": [], + "title": "Latency", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "Positive = slower (regression). Negative = faster (improvement).", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 5 }, + { "color": "red", "value": 15 } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 6 }, + "id": 7, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_baseline_comparison_percent{metric=\"submission_latency_p95\", baseline_id=~\"$baseline_id\"}", + "legendFormat": "P95", + "refId": "A" + } + ], + "title": "Submission Latency Delta", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "Positive = slower (regression). 
Negative = faster (improvement).", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 5 }, + { "color": "red", "value": 15 } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 6 }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_baseline_comparison_percent{metric=\"settlement_latency_p95\", baseline_id=~\"$baseline_id\"}", + "legendFormat": "P95", + "refId": "A" + } + ], + "title": "Settlement Latency Delta", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { + "mode": "line" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 5 }, + { "color": "red", "value": 15 } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, + "id": 9, + "options": { + "legend": { + "calcs": 
["lastNotNull"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_baseline_comparison_percent{metric=~\".*latency.*\", baseline_id=~\"$baseline_id\"}", + "legendFormat": "{{metric}}", + "refId": "A" + } + ], + "title": "Latency Comparison Over Time", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 10 }, + "id": 12, + "panels": [], + "title": "Throughput", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "Negative = lower throughput (regression). Positive = higher throughput (improvement).", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": -5 }, + { "color": "green", "value": 0 } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 11 }, + "id": 13, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_baseline_comparison_percent{metric=\"orders_per_second\", baseline_id=~\"$baseline_id\"}", + "legendFormat": "Rate", + "refId": "A" + } + ], + "title": "Orders/Second Delta", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "Negative = lower success rate (regression). 
Positive = higher success rate (improvement).", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": -5 }, + { "color": "green", "value": 0 } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 11 }, + "id": 14, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_baseline_comparison_percent{metric=\"success_rate\", baseline_id=~\"$baseline_id\"}", + "legendFormat": "Rate", + "refId": "A" + } + ], + "title": "Success Rate Delta", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 11 }, + "id": 15, + "options": { + "legend": { + "calcs": ["lastNotNull"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + 
"tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_baseline_comparison_percent{metric=~\"orders_per_second|success_rate\", baseline_id=~\"$baseline_id\"}", + "legendFormat": "{{metric}}", + "refId": "A" + } + ], + "title": "Throughput Comparison Over Time", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "Negative = lower fill rate (regression). Positive = higher fill rate (improvement).", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": -5 }, + { "color": "green", "value": 0 } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 15 }, + "id": 16, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_baseline_comparison_percent{metric=\"fill_rate\", baseline_id=~\"$baseline_id\"}", + "legendFormat": "Rate", + "refId": "A" + } + ], + "title": "Fill Rate Delta", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 19 }, + "id": 17, + "panels": [], + "title": "Regressions", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + 
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 20 }, + "id": 18, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_regression_detected{severity=\"critical\"}", + "legendFormat": "Critical", + "refId": "A" + } + ], + "title": "Critical", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "orange", "value": 1 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 20 }, + "id": 19, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_regression_detected{severity=\"major\"}", + "legendFormat": "Major", + "refId": "A" + } + ], + "title": "Major", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 20 }, + "id": 20, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + 
"reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_regression_detected{severity=\"minor\"}", + "legendFormat": "Minor", + "refId": "A" + } + ], + "title": "Minor", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "custom": { + "align": "auto", + "cellOptions": { "type": "auto" }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Metric" }, + "properties": [{ "id": "custom.width", "value": 200 }] + }, + { + "matcher": { "id": "byName", "options": "Delta %" }, + "properties": [ + { "id": "unit", "value": "percent" }, + { "id": "custom.width", "value": 100 } + ] + }, + { + "matcher": { "id": "byName", "options": "Baseline" }, + "properties": [{ "id": "custom.width", "value": 150 }] + } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 }, + "id": 21, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": ["sum"], + "show": false + }, + "showHeader": true, + "sortBy": [{ "desc": true, "displayName": "Delta %" }] + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_baseline_comparison_percent{baseline_id=~\"$baseline_id\"}", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "refId": "A" + } + ], + "title": "All Comparison Metrics", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { "Time": true }, + "indexByName": {}, + "renameByName": { + "baseline_id": "Baseline", + "metric": "Metric", + 
"Value": "Delta %" + } + } + } + ], + "type": "table" + } + ], + "refresh": "5s", + "schemaVersion": 38, + "style": "dark", + "tags": ["cow-protocol", "performance-testing", "comparison"], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "definition": "label_values(cow_perf_orders_created_total, scenario)", + "hide": 0, + "includeAll": true, + "label": "Scenario", + "multi": false, + "name": "scenario", + "options": [], + "query": { + "query": "label_values(cow_perf_orders_created_total, scenario)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "definition": "label_values(cow_perf_baseline_comparison_percent, baseline_id)", + "hide": 0, + "includeAll": true, + "label": "Baseline", + "multi": false, + "name": "baseline_id", + "options": [], + "query": { + "query": "label_values(cow_perf_baseline_comparison_percent, baseline_id)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "CoW Performance Testing - Comparison", + "uid": "cow-perf-comparison", + "version": 1, + "weekStart": "" +} diff --git a/configs/dashboards/performance.json b/configs/dashboards/performance.json new file mode 100644 index 0000000..c32d4cf --- /dev/null +++ b/configs/dashboards/performance.json @@ -0,0 +1,960 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + 
"name": "Annotations & Alerts", + "type": "dashboard" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "enable": true, + "expr": "ALERTS{alertstate=\"firing\", component=\"cow-performance-testing\"}", + "iconColor": "red", + "name": "Firing Alerts", + "tagKeys": "alertname,severity", + "titleFormat": "{{ alertname }}" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [ + { + "asDropdown": false, + "icon": "dashboard", + "includeVars": true, + "keepTime": true, + "tags": [], + "targetBlank": false, + "title": "API", + "tooltip": "Go to API Performance Dashboard", + "type": "link", + "url": "/d/cow-perf-api" + }, + { + "asDropdown": false, + "icon": "dashboard", + "includeVars": true, + "keepTime": true, + "tags": [], + "targetBlank": false, + "title": "Resources", + "tooltip": "Go to Resources Dashboard", + "type": "link", + "url": "/d/cow-perf-resources" + }, + { + "asDropdown": false, + "icon": "dashboard", + "includeVars": true, + "keepTime": true, + "tags": [], + "targetBlank": false, + "title": "Comparison", + "tooltip": "Go to Comparison Dashboard", + "type": "link", + "url": "/d/cow-perf-comparison" + }, + { + "asDropdown": false, + "icon": "dashboard", + "includeVars": true, + "keepTime": true, + "tags": [], + "targetBlank": false, + "title": "Traders", + "tooltip": "Go to Trader Activity Dashboard", + "type": "link", + "url": "/d/cow-perf-traders" + } + ], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 1, + "panels": [], + "title": "Test Overview", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "blue", "value": null }] + } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 }, + "id": 2, 
+ "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "/^scenario$/", + "values": false + }, + "textMode": "value" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_test_info", + "format": "table", + "instant": true, + "refId": "A" + } + ], + "title": "Scenario", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "time() - cow_perf_test_start_timestamp{scenario=~\"$scenario\"}", + "legendFormat": "Elapsed", + "refId": "A" + } + ], + "title": "Test Duration", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "purple", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + 
"datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_num_traders{scenario=~\"$scenario\"}", + "legendFormat": "Traders", + "refId": "A" + } + ], + "title": "Number of Traders", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 50 }, + { "color": "green", "value": 90 } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 }, + "id": 5, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_test_progress_percent{scenario=~\"$scenario\"}", + "legendFormat": "Progress", + "refId": "A" + } + ], + "title": "Test Progress", + "type": "gauge" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "id": 6, + "panels": [], + "title": "Order Submission Rate", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } 
+ }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "ops" + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Target Rate" }, + "properties": [ + { "id": "custom.lineStyle", "value": { "dash": [10, 10], "fill": "dash" } }, + { "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } } + ] + }, + { + "matcher": { "id": "byName", "options": "Actual Rate" }, + "properties": [ + { "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } } + ] + } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, + "id": 7, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_actual_rate{scenario=~\"$scenario\"}", + "legendFormat": "Actual Rate", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_target_rate{scenario=~\"$scenario\"}", + "legendFormat": "Target Rate", + "refId": "B" + } + ], + "title": "Orders Per Second (Actual vs Target)", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + 
"thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 12, "y": 6 }, + "id": 8, + "options": { + "legend": { + "calcs": ["lastNotNull"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_orders_created_total{scenario=~\"$scenario\"}", + "legendFormat": "Created", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_orders_submitted_total{scenario=~\"$scenario\"}", + "legendFormat": "Submitted", + "refId": "B" + } + ], + "title": "Cumulative Orders Over Time", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "max": 120, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 80 }, + { "color": "green", "value": 95 } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 4, "x": 20, "y": 6 }, + "id": 9, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "(cow_perf_actual_rate{scenario=~\"$scenario\"} / cow_perf_target_rate{scenario=~\"$scenario\"}) * 100", + "legendFormat": "Achievement", + "refId": "A" + } + ], + "title": "Submission Rate Achievement", + "type": "gauge" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 }, + "id": 10, + "panels": [], + 
"title": "Latency Distribution", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "scaleDistribution": { "type": "linear" } + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 15 }, + "id": 11, + "options": { + "calculate": false, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 64 + }, + "exemplars": { "color": "rgba(255,0,255,0.7)" }, + "filterValues": { "le": 1e-9 }, + "legend": { "show": true }, + "rowsFrame": { "layout": "auto" }, + "tooltip": { "show": true, "yHistogram": false }, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(rate(cow_perf_submission_latency_seconds_bucket{scenario=~\"$scenario\"}[$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "title": "Submission Latency Distribution", + "type": "heatmap" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "scaleDistribution": { "type": "linear" } + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 15 }, + "id": 12, + "options": { + "calculate": false, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Blues", + "steps": 64 + }, + "exemplars": { "color": "rgba(255,0,255,0.7)" }, + "filterValues": { "le": 1e-9 }, + "legend": { "show": true }, + "rowsFrame": { "layout": "auto" }, + "tooltip": { "show": true, "yHistogram": false }, + 
"yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(rate(cow_perf_settlement_latency_seconds_bucket{scenario=~\"$scenario\"}[$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "title": "Settlement Latency Distribution", + "type": "heatmap" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 15 }, + "id": 13, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.50, sum(rate(cow_perf_submission_latency_seconds_bucket{scenario=~\"$scenario\"}[$__rate_interval])) by (le))", + "legendFormat": "P50", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.90, 
sum(rate(cow_perf_submission_latency_seconds_bucket{scenario=~\"$scenario\"}[$__rate_interval])) by (le))", + "legendFormat": "P90", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.95, sum(rate(cow_perf_submission_latency_seconds_bucket{scenario=~\"$scenario\"}[$__rate_interval])) by (le))", + "legendFormat": "P95", + "refId": "C" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.99, sum(rate(cow_perf_submission_latency_seconds_bucket{scenario=~\"$scenario\"}[$__rate_interval])) by (le))", + "legendFormat": "P99", + "refId": "D" + } + ], + "title": "Submission Latency Percentiles", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 }, + "id": 14, + "panels": [], + "title": "Order Status", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "hideFrom": { "legend": false, "tooltip": false, "viz": false } }, + "mappings": [] + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Filled" }, + "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byName", "options": "Failed" }, + "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byName", "options": "Expired" }, + "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byName", "options": "Active" }, + "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] + } + ] + }, + "gridPos": { "h": 8, "w": 6, "x": 0, "y": 24 }, + "id": 15, + "options": { + "displayLabels": ["name", "percent"], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": ["value"] 
+ }, + "pieType": "pie", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_orders_filled_total{scenario=~\"$scenario\"}", + "legendFormat": "Filled", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_orders_failed_total{scenario=~\"$scenario\"}", + "legendFormat": "Failed", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_orders_expired_total{scenario=~\"$scenario\"}", + "legendFormat": "Expired", + "refId": "C" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_orders_active{scenario=~\"$scenario\"}", + "legendFormat": "Active", + "refId": "D" + } + ], + "title": "Order Status Distribution", + "type": "piechart" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 90 }, + { "color": "green", "value": 95 } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 6, "x": 6, "y": 24 }, + "id": 16, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "(cow_perf_orders_filled_total{scenario=~\"$scenario\"} / cow_perf_orders_submitted_total{scenario=~\"$scenario\"}) * 100", + "legendFormat": "Success Rate", + "refId": "A" + } + 
], + "title": "Success Rate", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "blue", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 24 }, + "id": 17, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_orders_submitted_total{scenario=~\"$scenario\"}", + "legendFormat": "Submitted", + "refId": "A" + } + ], + "title": "Total Orders Submitted", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 24 }, + "id": 18, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_orders_filled_total{scenario=~\"$scenario\"}", + "legendFormat": "Filled", + "refId": "A" + } + ], + "title": "Total Orders Filled", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": 
"absolute", + "steps": [{ "color": "red", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 28 }, + "id": 19, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_orders_failed_total{scenario=~\"$scenario\"}", + "legendFormat": "Failed", + "refId": "A" + } + ], + "title": "Total Orders Failed", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "blue", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 28 }, + "id": 20, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_orders_active{scenario=~\"$scenario\"}", + "legendFormat": "Active", + "refId": "A" + } + ], + "title": "Active Orders", + "type": "stat" + } + ], + "refresh": "5s", + "schemaVersion": 38, + "style": "dark", + "tags": ["cow-protocol", "performance-testing"], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "definition": "label_values(cow_perf_orders_created_total, scenario)", + "hide": 0, + "includeAll": true, + "label": "Scenario", + "multi": false, + "name": 
"scenario", + "options": [], + "query": { + "query": "label_values(cow_perf_orders_created_total, scenario)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "CoW Performance Testing - Overview", + "uid": "cow-perf-overview", + "version": 1, + "weekStart": "" +} diff --git a/configs/dashboards/resources.json b/configs/dashboards/resources.json new file mode 100644 index 0000000..a88f63d --- /dev/null +++ b/configs/dashboards/resources.json @@ -0,0 +1,844 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [ + { + "asDropdown": false, + "icon": "dashboard", + "includeVars": true, + "keepTime": true, + "tags": [], + "targetBlank": false, + "title": "Overview", + "tooltip": "Go to Overview Dashboard", + "type": "link", + "url": "/d/cow-perf-overview" + }, + { + "asDropdown": false, + "icon": "dashboard", + "includeVars": true, + "keepTime": true, + "tags": [], + "targetBlank": false, + "title": "API", + "tooltip": "Go to API Performance Dashboard", + "type": "link", + "url": "/d/cow-perf-api" + }, + { + "asDropdown": false, + "icon": "dashboard", + "includeVars": true, + "keepTime": true, + "tags": [], + "targetBlank": false, + "title": "Comparison", + "tooltip": "Go to Comparison Dashboard", + "type": "link", + "url": "/d/cow-perf-comparison" + }, + { + "asDropdown": false, + "icon": "dashboard", + "includeVars": true, + "keepTime": true, + "tags": [], + "targetBlank": false, + "title": "Traders", + "tooltip": "Go to Trader Activity Dashboard", + "type": "link", + "url": "/d/cow-perf-traders" + } + ], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 1, + "panels": [], + "title": "CPU Usage", + "type": 
"row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 }, + "id": 2, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_container_cpu_percent{container=~\"$container\"}", + "legendFormat": "{{container}}", + "refId": "A" + } + ], + "title": "CPU by Container", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 90 } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 6, "x": 12, "y": 1 }, + "id": 3, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + 
"values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_container_cpu_percent{container=~\"$container\"}", + "legendFormat": "{{container}}", + "refId": "A" + } + ], + "title": "Current CPU", + "type": "gauge" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 90 } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "max_over_time(cow_perf_container_cpu_percent{container=~\"$container\"}[1h])", + "legendFormat": "Peak", + "refId": "A" + } + ], + "title": "Peak CPU", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 90 } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 5 }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" 
+ }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "avg_over_time(cow_perf_container_cpu_percent{container=~\"$container\"}[$__range])", + "legendFormat": "Avg", + "refId": "A" + } + ], + "title": "Avg CPU", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 9 }, + "id": 6, + "panels": [], + "title": "Memory Usage", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 }, + "id": 7, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_container_memory_bytes{container=~\"$container\"}", + "legendFormat": "{{container}}", + "refId": "A" + } + ], + "title": "Memory by Container", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + 
"thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1073741824 }, + { "color": "red", "value": 2147483648 } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 6, "x": 12, "y": 10 }, + "id": 8, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_container_memory_bytes{container=~\"$container\"}", + "legendFormat": "{{container}}", + "refId": "A" + } + ], + "title": "Current Memory", + "type": "gauge" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "orange", "value": null }] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 10 }, + "id": 9, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "max_over_time(cow_perf_container_memory_bytes{container=~\"$container\"}[1h])", + "legendFormat": "Peak", + "refId": "A" + } + ], + "title": "Peak Memory", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "blue", "value": null }] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, 
"x": 18, "y": 14 }, + "id": 10, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "avg_over_time(cow_perf_container_memory_bytes{container=~\"$container\"}[$__range])", + "legendFormat": "Avg", + "refId": "A" + } + ], + "title": "Avg Memory", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 18 }, + "id": 11, + "panels": [], + "title": "Network I/O", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 19 }, + "id": 12, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "rate(cow_perf_container_network_rx_bytes{container=~\"$container\"}[1m])", + 
"legendFormat": "{{container}}", + "refId": "A" + } + ], + "title": "Network RX Rate", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 19 }, + "id": 13, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "rate(cow_perf_container_network_tx_bytes{container=~\"$container\"}[1m])", + "legendFormat": "{{container}}", + "refId": "A" + } + ], + "title": "Network TX Rate", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 27 }, + "id": 14, + "panels": [], + "title": "Summary", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 28 
}, + "id": 15, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(cow_perf_container_network_rx_bytes{container=~\"$container\"})", + "legendFormat": "Total RX", + "refId": "A" + } + ], + "title": "Total RX", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "blue", "value": null }] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 28 }, + "id": 16, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(cow_perf_container_network_tx_bytes{container=~\"$container\"})", + "legendFormat": "Total TX", + "refId": "A" + } + ], + "title": "Total TX", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "custom": { + "align": "auto", + "cellOptions": { "type": "auto" }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "container" }, + "properties": [{ "id": "custom.width", "value": 200 }] + }, + { + "matcher": { "id": "byName", "options": "CPU %" }, + 
"properties": [ + { "id": "unit", "value": "percent" }, + { "id": "custom.width", "value": 100 } + ] + }, + { + "matcher": { "id": "byName", "options": "Memory" }, + "properties": [ + { "id": "unit", "value": "bytes" }, + { "id": "custom.width", "value": 100 } + ] + }, + { + "matcher": { "id": "byName", "options": "RX" }, + "properties": [ + { "id": "unit", "value": "bytes" }, + { "id": "custom.width", "value": 100 } + ] + }, + { + "matcher": { "id": "byName", "options": "TX" }, + "properties": [ + { "id": "unit", "value": "bytes" }, + { "id": "custom.width", "value": 100 } + ] + } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 28 }, + "id": 17, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": ["sum"], + "show": false + }, + "showHeader": true, + "sortBy": [{ "desc": true, "displayName": "CPU %" }] + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_container_cpu_percent{container=~\"$container\"}", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_container_memory_bytes{container=~\"$container\"}", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_container_network_rx_bytes{container=~\"$container\"}", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "refId": "C" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_container_network_tx_bytes{container=~\"$container\"}", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "refId": "D" + } + ], + "title": "Resource Summary", + "transformations": [ + { + "id": "merge", + "options": {} + }, + { + "id": "organize", + "options": { + "excludeByName": { 
"Time": true }, + "indexByName": {}, + "renameByName": { + "Value #A": "CPU %", + "Value #B": "Memory", + "Value #C": "RX", + "Value #D": "TX" + } + } + } + ], + "type": "table" + } + ], + "refresh": "5s", + "schemaVersion": 38, + "style": "dark", + "tags": ["cow-protocol", "performance-testing", "resources"], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "definition": "label_values(cow_perf_orders_created_total, scenario)", + "hide": 0, + "includeAll": true, + "label": "Scenario", + "multi": false, + "name": "scenario", + "options": [], + "query": { + "query": "label_values(cow_perf_orders_created_total, scenario)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "definition": "label_values(cow_perf_container_cpu_percent, container)", + "hide": 0, + "includeAll": true, + "label": "Container", + "multi": true, + "name": "container", + "options": [], + "query": { + "query": "label_values(cow_perf_container_cpu_percent, container)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "CoW Performance Testing - Resources", + "uid": "cow-perf-resources", + "version": 1, + "weekStart": "" +} diff --git a/configs/dashboards/trader-activity.json b/configs/dashboards/trader-activity.json new file mode 100644 index 0000000..46969ee --- /dev/null +++ b/configs/dashboards/trader-activity.json @@ -0,0 +1,700 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 
0, + "graphTooltip": 0, + "id": null, + "links": [ + { + "asDropdown": false, + "icon": "dashboard", + "includeVars": true, + "keepTime": true, + "tags": [], + "targetBlank": false, + "title": "Overview", + "tooltip": "Go to Overview Dashboard", + "type": "link", + "url": "/d/cow-perf-overview" + }, + { + "asDropdown": false, + "icon": "dashboard", + "includeVars": true, + "keepTime": true, + "tags": [], + "targetBlank": false, + "title": "API", + "tooltip": "Go to API Performance Dashboard", + "type": "link", + "url": "/d/cow-perf-api" + }, + { + "asDropdown": false, + "icon": "dashboard", + "includeVars": true, + "keepTime": true, + "tags": [], + "targetBlank": false, + "title": "Resources", + "tooltip": "Go to Resources Dashboard", + "type": "link", + "url": "/d/cow-perf-resources" + }, + { + "asDropdown": false, + "icon": "dashboard", + "includeVars": true, + "keepTime": true, + "tags": [], + "targetBlank": false, + "title": "Comparison", + "tooltip": "Go to Comparison Dashboard", + "type": "link", + "url": "/d/cow-perf-comparison" + } + ], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 1, + "panels": [], + "title": "Overview", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_traders_active", + "legendFormat": 
"Active", + "refId": "A" + } + ], + "title": "Active Traders", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "purple", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_num_traders{scenario=~\"$scenario\"}", + "legendFormat": "Total", + "refId": "A" + } + ], + "title": "Total Traders", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "blue", "value": null }] + }, + "unit": "short", + "decimals": 1 + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(cow_perf_trader_orders_submitted) / cow_perf_num_traders{scenario=~\"$scenario\"}", + "legendFormat": "Avg", + "refId": "A" + } + ], + "title": "Avg Orders/Trader", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + 
"mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 50 }, + { "color": "green", "value": 90 } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(cow_perf_trader_orders_filled) / sum(cow_perf_trader_orders_submitted) * 100", + "legendFormat": "Fill Rate", + "refId": "A" + } + ], + "title": "Fill Rate", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "id": 6, + "panels": [], + "title": "Top Traders", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, + "id": 7, + "options": { + "legend": { + "calcs": ["lastNotNull"], + "displayMode": "table", + "placement": 
"bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "topk($top_n, cow_perf_trader_orders_submitted)", + "legendFormat": "Trader {{trader_index}}", + "refId": "A" + } + ], + "title": "Orders Submitted (Top N)", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, + "id": 8, + "options": { + "legend": { + "calcs": ["lastNotNull"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "topk($top_n, cow_perf_trader_orders_filled)", + "legendFormat": "Trader {{trader_index}}", + "refId": "A" + } + ], + "title": "Orders Filled (Top N)", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 }, + "id": 9, + "panels": [], + "title": "Activity", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + 
"fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 }, + "id": 10, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_traders_active", + "legendFormat": "Active Traders", + "refId": "A" + } + ], + "title": "Active Traders Over Time", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + 
"mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 15 }, + "id": 11, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "topk($top_n, rate(cow_perf_trader_orders_submitted[1m]))", + "legendFormat": "Trader {{trader_index}}", + "refId": "A" + } + ], + "title": "Submission Rate by Trader (Top N)", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 }, + "id": 12, + "panels": [], + "title": "Distribution", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "hideFrom": { "legend": false, "tooltip": false, "viz": false } }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 24 }, + "id": 13, + "options": { + "displayLabels": ["name", "percent"], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": ["value"] + }, + "pieType": "pie", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "topk($top_n, cow_perf_trader_orders_submitted)", + "legendFormat": "Trader {{trader_index}}", + "refId": "A" + } + ], + "title": "Order Distribution", + "type": "piechart" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "custom": { + "align": "auto", + "cellOptions": 
{ "type": "auto" }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Trader" }, + "properties": [{ "id": "custom.width", "value": 100 }] + }, + { + "matcher": { "id": "byName", "options": "Submitted" }, + "properties": [{ "id": "custom.width", "value": 100 }] + }, + { + "matcher": { "id": "byName", "options": "Filled" }, + "properties": [{ "id": "custom.width", "value": 100 }] + }, + { + "matcher": { "id": "byName", "options": "Success %" }, + "properties": [ + { "id": "unit", "value": "percent" }, + { "id": "custom.width", "value": 100 }, + { "id": "thresholds", "value": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 50 }, + { "color": "green", "value": 90 } + ] + }}, + { "id": "custom.cellOptions", "value": { "type": "color-background" } } + ] + } + ] + }, + "gridPos": { "h": 8, "w": 16, "x": 8, "y": 24 }, + "id": 14, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": ["sum"], + "show": false + }, + "showHeader": true, + "sortBy": [{ "desc": true, "displayName": "Submitted" }] + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_trader_orders_submitted", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_trader_orders_filled", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "cow_perf_trader_orders_filled / cow_perf_trader_orders_submitted * 100", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "refId": "C" + } + ], + 
"title": "Trader Success Rates", + "transformations": [ + { + "id": "merge", + "options": {} + }, + { + "id": "organize", + "options": { + "excludeByName": { "Time": true }, + "indexByName": {}, + "renameByName": { + "trader_index": "Trader", + "Value #A": "Submitted", + "Value #B": "Filled", + "Value #C": "Success %" + } + } + } + ], + "type": "table" + } + ], + "refresh": "5s", + "schemaVersion": 38, + "style": "dark", + "tags": ["cow-protocol", "performance-testing", "traders"], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "definition": "label_values(cow_perf_orders_created_total, scenario)", + "hide": 0, + "includeAll": true, + "label": "Scenario", + "multi": false, + "name": "scenario", + "options": [], + "query": { + "query": "label_values(cow_perf_orders_created_total, scenario)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "10", + "value": "10" + }, + "hide": 0, + "includeAll": false, + "label": "Top N", + "multi": false, + "name": "top_n", + "options": [ + { "selected": false, "text": "5", "value": "5" }, + { "selected": true, "text": "10", "value": "10" }, + { "selected": false, "text": "20", "value": "20" }, + { "selected": false, "text": "50", "value": "50" } + ], + "query": "5,10,20,50", + "skipUrlSync": false, + "type": "custom" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "CoW Performance Testing - Trader Activity", + "uid": "cow-perf-traders", + "version": 1, + "weekStart": "" +} diff --git a/configs/grafana-datasource.yml b/configs/grafana-datasource.yml index 3567a8d..d8beef5 100644 --- a/configs/grafana-datasource.yml +++ b/configs/grafana-datasource.yml @@ -5,6 +5,7 @@ apiVersion: 1 
datasources: - name: Prometheus + uid: prometheus type: prometheus access: proxy url: http://prometheus:9090 diff --git a/configs/prometheus.yml b/configs/prometheus.yml index d562f91..c37e3a4 100644 --- a/configs/prometheus.yml +++ b/configs/prometheus.yml @@ -70,11 +70,12 @@ scrape_configs: # Fail gracefully if exporter not running scrape_timeout: 5s -# Optional: Add alerting rules -# rule_files: -# - "/etc/prometheus/alerts/*.yml" +# Alert rule files +rule_files: + - "/etc/prometheus/alerts/*.yml" -# Optional: Configure Alertmanager +# Note: Alertmanager not configured - alerts visible in Prometheus UI and Grafana only +# To enable Alertmanager notifications, uncomment below and add alertmanager service: # alerting: # alertmanagers: # - static_configs: diff --git a/configs/prometheus/alerts/performance-testing.yml b/configs/prometheus/alerts/performance-testing.yml new file mode 100644 index 0000000..45014cf --- /dev/null +++ b/configs/prometheus/alerts/performance-testing.yml @@ -0,0 +1,184 @@ +# ============================================================================= +# CoW Performance Testing Suite - Prometheus Alert Rules +# ============================================================================= +# +# This file defines alerting rules for the CoW Performance Testing Suite. +# Alerts are evaluated by Prometheus and can be viewed in the Prometheus UI +# or visualized in Grafana dashboards. 
+# +# ============================================================================= +# ALERT PARAMETERS - Edit values here for easy customization +# ============================================================================= +# +# TODO(COW-617): Move these thresholds to configurable TOML/env variables +# +# LATENCY THRESHOLDS (seconds): +# submission_latency_warning_threshold: 5 # P95 > 5s triggers warning +# submission_latency_critical_threshold: 10 # P95 > 10s triggers critical +# +# ERROR RATE THRESHOLDS (decimal, where 0.05 = 5%): +# error_rate_critical_threshold: 0.05 # > 5% error rate +# +# THROUGHPUT THRESHOLDS (ratio, where 0.8 = 80%): +# throughput_low_threshold: 0.8 # < 80% of target rate +# +# RESOURCE THRESHOLDS (percentage): +# cpu_warning_threshold: 80 # CPU > 80% +# memory_critical_threshold: 95 # Memory > 95% +# +# ALERT DURATIONS (prevents flapping): +# latency_warning_for: 2m +# latency_critical_for: 1m +# error_rate_for: 1m +# throughput_for: 2m +# cpu_for: 5m +# memory_for: 2m +# test_stalled_for: 1m +# +# ============================================================================= + +groups: + - name: cow_performance_testing + # Evaluation interval inherited from global config (5s) + rules: + # ========================================================================= + # LATENCY ALERTS + # ========================================================================= + + # High Submission Latency (Warning) + # Triggers when P95 submission latency exceeds warning threshold + - alert: HighSubmissionLatency + expr: | + histogram_quantile(0.95, + sum(rate(cow_perf_submission_latency_seconds_bucket[1m])) by (le, scenario) + ) > 5 + for: 2m + labels: + severity: warning + component: cow-performance-testing + category: latency + annotations: + summary: "High submission latency detected" + description: "P95 submission latency is {{ $value | printf \"%.2f\" }}s (threshold: 5s) for scenario {{ $labels.scenario }}" + runbook: "Check API logs, verify network 
connectivity, review recent code changes" + + # Critical Submission Latency (Critical) + # Triggers when P95 submission latency exceeds critical threshold + - alert: CriticalSubmissionLatency + expr: | + histogram_quantile(0.95, + sum(rate(cow_perf_submission_latency_seconds_bucket[1m])) by (le, scenario) + ) > 10 + for: 1m + labels: + severity: critical + component: cow-performance-testing + category: latency + annotations: + summary: "Critical submission latency - immediate attention required" + description: "P95 submission latency is {{ $value | printf \"%.2f\" }}s (threshold: 10s) for scenario {{ $labels.scenario }}" + runbook: "Immediate action: Check API health, container resources, database connections" + + # ========================================================================= + # ERROR RATE ALERTS + # ========================================================================= + + # High Error Rate (Critical) + # Triggers when order failure rate exceeds threshold + - alert: HighErrorRate + expr: | + ( + sum(rate(cow_perf_orders_failed_total[5m])) by (scenario) + / + sum(rate(cow_perf_orders_submitted_total[5m])) by (scenario) + ) > 0.05 + for: 1m + labels: + severity: critical + component: cow-performance-testing + category: errors + annotations: + summary: "High error rate detected" + description: "Error rate is {{ $value | humanizePercentage }} (threshold: 5%) for scenario {{ $labels.scenario }}" + runbook: "Check order validation errors, API error responses, contract state" + + # ========================================================================= + # THROUGHPUT ALERTS + # ========================================================================= + + # Low Throughput (Warning) + # Triggers when actual throughput falls below target + - alert: LowThroughput + expr: | + ( + cow_perf_actual_rate + / + cow_perf_target_rate + ) < 0.8 + and cow_perf_target_rate > 0 + for: 2m + labels: + severity: warning + component: cow-performance-testing + category: 
throughput + annotations: + summary: "Low throughput - not meeting target rate" + description: "Actual throughput is {{ $value | humanizePercentage }} of target for scenario {{ $labels.scenario }}" + runbook: "Check for bottlenecks: API rate limits, network latency, resource constraints" + + # ========================================================================= + # TEST EXECUTION ALERTS + # ========================================================================= + + # Test Stalled (Critical) + # Triggers when no orders are being submitted during an active test + - alert: TestStalled + expr: | + rate(cow_perf_orders_submitted_total[1m]) == 0 + and + cow_perf_test_progress_percent > 0 + and + cow_perf_test_progress_percent < 100 + for: 1m + labels: + severity: critical + component: cow-performance-testing + category: test-execution + annotations: + summary: "Performance test appears to be stalled" + description: "No orders submitted in the last minute for scenario {{ $labels.scenario }} while the test is still in progress" + runbook: "Check test process, verify API connectivity, review error logs" + + # ========================================================================= + # RESOURCE ALERTS + # ========================================================================= + + # High CPU Usage (Warning) + # Triggers when container CPU usage is high + - alert: HighCPUUsage + expr: | + cow_perf_container_cpu_percent > 80 + for: 5m + labels: + severity: warning + component: cow-performance-testing + category: resources + annotations: + summary: "High CPU usage on {{ $labels.container }}" + description: "CPU usage is {{ $value | printf \"%.1f\" }}% (threshold: 80%) on container {{ $labels.container }}" + runbook: "Consider scaling resources, check for inefficient operations, review container limits" + + # Critical Memory Usage (Critical) + # Triggers when container memory usage approaches limit + - alert: CriticalMemoryUsage + expr: | + cow_perf_container_memory_percent > 95 
+ for: 2m + labels: + severity: critical + component: cow-performance-testing + category: resources + annotations: + summary: "Critical memory usage on {{ $labels.container }}" + description: "Memory usage is {{ $value | printf \"%.1f\" }}% (threshold: 95%) on container {{ $labels.container }}" + runbook: "Immediate action: Check for memory leaks, increase container memory limit, restart if necessary" diff --git a/docker-compose.yml b/docker-compose.yml index 23e4414..1a0a5a2 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -245,6 +245,7 @@ services: - "9090:9090" volumes: - ./configs/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ./configs/prometheus/alerts:/etc/prometheus/alerts:ro - prometheus_data:/prometheus profiles: - monitoring @@ -263,6 +264,7 @@ services: volumes: - ./configs/grafana-datasource.yml:/etc/grafana/provisioning/datasources/datasource.yml:ro - ./configs/grafana-dashboard.yml:/etc/grafana/provisioning/dashboards/dashboard.yml:ro + - ./configs/dashboards:/etc/grafana/dashboards:ro - grafana_data:/var/lib/grafana depends_on: - prometheus diff --git a/docs/cli.md b/docs/cli.md index 2012de8..16c1abf 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -98,6 +98,9 @@ default_trader_count: 10 default_duration: 60 default_startup_interval: 0.1 +# Prometheus metrics export (enabled by default) +prometheus_port: 9091 # Port for metrics exporter (null or 0 to disable) + # Order type distribution (must sum to 1.0) market_order_ratio: 0.4 limit_order_ratio: 0.4 @@ -337,6 +340,33 @@ cow-perf run --scenario medium-load --baseline v1.0 cow-perf run --config ./config.yml --scenario light-load ``` +### Real-Time Metrics Export + +Prometheus metrics export is **enabled by default** on port 9091. During test execution, metrics are exposed at `http://localhost:9091/metrics` for Prometheus scraping. 
+ +```bash +# Run with default Prometheus export (port 9091) +cow-perf run --config configs/scenarios/light-load.yml + +# Use a different port +cow-perf run --config configs/scenarios/light-load.yml --prometheus-port 9092 + +# Disable Prometheus export +cow-perf run --config configs/scenarios/light-load.yml --prometheus-port 0 +``` + +**Using with Docker monitoring stack:** + +```bash +# Start Prometheus and Grafana +docker compose --profile monitoring up -d + +# Run test (metrics automatically available to Prometheus) +cow-perf run --config configs/scenarios/light-load.yml + +# View dashboards at http://localhost:3000 +``` + --- ## Output Formats diff --git a/src/cow_performance/api/instrumented_client.py b/src/cow_performance/api/instrumented_client.py index 97189bd..906440c 100644 --- a/src/cow_performance/api/instrumented_client.py +++ b/src/cow_performance/api/instrumented_client.py @@ -5,10 +5,13 @@ for performance analysis. """ +import asyncio import json import time from typing import Any +import aiohttp + from cow_performance.api.orderbook_client import OrderbookClient from cow_performance.metrics import APIMetrics, MetricsStore @@ -247,6 +250,43 @@ async def upload_app_data( ) raise + async def upload_app_data_with_retry( + self, + app_data_hash: str, + app_data_doc: str | dict[str, Any], + max_retries: int = 3, + ) -> dict[str, Any]: + """Upload appData with automatic retry on failure (instrumented). 
+ + Args: + app_data_hash: 32-byte hash of appData document + app_data_doc: Full appData JSON document + max_retries: Maximum retry attempts + + Returns: + Response from orderbook + """ + last_error = None + for attempt in range(max_retries): + try: + return await self.upload_app_data(app_data_hash, app_data_doc) + except aiohttp.ClientResponseError as e: + last_error = e + if e.status == 409: + return {} + if e.status not in (500, 502, 503, 504): + raise + if attempt < max_retries - 1: + wait_time = 2**attempt + print( + f"AppData upload failed (attempt {attempt + 1}), " + f"retrying in {wait_time}s..." + ) + await asyncio.sleep(wait_time) + if last_error: + raise last_error + return {} + async def get_version(self) -> dict[str, Any]: """ Get API version with timing instrumentation. @@ -282,6 +322,10 @@ async def get_version(self) -> dict[str, Any]: ) raise + async def get_open_order_count(self, owner: str) -> int: + """Get count of open orders for an account (delegates to underlying client).""" + return await self._client.get_open_order_count(owner) + async def check_health(self) -> bool: """ Check if the orderbook API is healthy. diff --git a/src/cow_performance/cli/commands/run.py b/src/cow_performance/cli/commands/run.py index 1dcf424..ac06854 100644 --- a/src/cow_performance/cli/commands/run.py +++ b/src/cow_performance/cli/commands/run.py @@ -3,6 +3,7 @@ import asyncio import signal import sys +import time from datetime import datetime from pathlib import Path from typing import Any @@ -66,6 +67,43 @@ def handle_signal(self, signum: int, frame: Any) -> None: self.orchestrator._running = False +async def update_prometheus_metrics( + exporter: PrometheusExporter, + orchestrator: TraderOrchestrator, + test_duration: float, + target_rate: float, +) -> None: + """Periodically update Prometheus progress and throughput metrics. 
+ + Args: + exporter: The Prometheus exporter to update + orchestrator: The trader orchestrator (to check running state and get order counts) + test_duration: Total test duration in seconds + target_rate: Target orders per second + """ + start_time = time.time() + + while orchestrator._running: + elapsed = time.time() - start_time + + # Update progress (0-100%) + progress_percent = min(100.0, (elapsed / test_duration) * 100) + exporter.update_progress(progress_percent) + + # Calculate actual rate + total_orders = orchestrator.trader_pool.get_total_orders_submitted() + actual_rate = total_orders / elapsed if elapsed > 0 else 0.0 + + # Update throughput metrics + exporter.update_throughput( + orders_per_second=actual_rate, + target_rate=target_rate, + actual_rate=actual_rate, + ) + + await asyncio.sleep(1.0) # Update every second + + async def run_performance_test( config: PerformanceTestConfig, traders: int | None = None, @@ -449,7 +487,29 @@ async def run_performance_test( try: # Start test start_time = datetime.now() - await orchestrator.run() + + if prometheus_exporter: + # Calculate target rate from behavior config (orders per minute -> per second) + target_rate = behavior_config.base_rate / 60.0 + + # Run orchestrator and metrics update loop concurrently + metrics_task = asyncio.create_task( + update_prometheus_metrics( + prometheus_exporter, + orchestrator, + float(test_duration), + target_rate, + ) + ) + await orchestrator.run() + metrics_task.cancel() # Stop metrics loop when test completes + try: + await metrics_task + except asyncio.CancelledError: + pass + else: + await orchestrator.run() + end_time = datetime.now() progress.update(task, description="[bold green]Test completed!") diff --git a/src/cow_performance/cli/config.py b/src/cow_performance/cli/config.py index 3d85a27..84c4171 100644 --- a/src/cow_performance/cli/config.py +++ b/src/cow_performance/cli/config.py @@ -201,6 +201,12 @@ class PerformanceTestConfig(BaseSettings): description="Default 
interval between trader startups", ) + # Prometheus metrics export + prometheus_port: int | None = Field( + default=9091, + description="Port for Prometheus metrics exporter (None or 0 to disable)", + ) + # Trading pattern configuration trading_pattern: str = Field( default="constant_rate", diff --git a/src/cow_performance/cli/main.py b/src/cow_performance/cli/main.py index 4b829ff..6cd3dc7 100644 --- a/src/cow_performance/cli/main.py +++ b/src/cow_performance/cli/main.py @@ -80,7 +80,7 @@ def run( prometheus_port: Optional[int] = typer.Option( None, "--prometheus-port", - help="Port for Prometheus metrics exporter (enables exporter when set)", + help="Port for Prometheus metrics exporter (default: 9091 from config, use 0 to disable)", ), ) -> None: """Run a performance test. @@ -108,6 +108,14 @@ def run( # Load configuration cfg = load_config(Path(config_file) if config_file else None) + # Use CLI prometheus_port override, or config value (default 9091) + # A value of 0 disables the exporter + effective_prometheus_port = ( + prometheus_port if prometheus_port is not None else cfg.prometheus_port + ) + if effective_prometheus_port == 0: + effective_prometheus_port = None + # Run the test run_command( config=cfg, @@ -119,7 +127,7 @@ def run( output_file=output_file, verbose=verbose, dry_run=dry_run, - prometheus_port=prometheus_port, + prometheus_port=effective_prometheus_port, ) except FileNotFoundError as e: diff --git a/src/cow_performance/comparison/engine.py b/src/cow_performance/comparison/engine.py index c6f08e6..7b85993 100644 --- a/src/cow_performance/comparison/engine.py +++ b/src/cow_performance/comparison/engine.py @@ -4,7 +4,6 @@ import logging from datetime import datetime -from typing import TYPE_CHECKING from cow_performance.baselines.models import PerformanceBaseline from cow_performance.comparison.models import ( @@ -26,9 +25,6 @@ ResourceAggregateMetrics, ) -if TYPE_CHECKING: - pass - logger = logging.getLogger(__name__) diff --git 
a/src/cow_performance/load_generation/order_tracker.py b/src/cow_performance/load_generation/order_tracker.py index 3ce3f7b..2691a8c 100644 --- a/src/cow_performance/load_generation/order_tracker.py +++ b/src/cow_performance/load_generation/order_tracker.py @@ -254,7 +254,30 @@ async def monitor_order( # and will determine the final status metadata = self.get_order(order_uid) if metadata and not metadata.is_terminal_state(): - logger.warning(f"Order {order_uid[:10]}... timed out after {attempts} poll attempts") + # Calculate detailed timeout information + age_seconds = time.time() - metadata.creation_time + status = metadata.current_status.value + + # Build lifecycle progress string + lifecycle_stages = [] + if metadata.submission_time: + lifecycle_stages.append("submitted") + if metadata.acceptance_time: + lifecycle_stages.append("accepted") + if metadata.first_fill_time: + lifecycle_stages.append("partially_filled") + + lifecycle_str = " → ".join(lifecycle_stages) if lifecycle_stages else "created only" + + # Token pair info (truncate addresses for readability) + sell_token = metadata.sell_token[-8:] if metadata.sell_token else "unknown" + buy_token = metadata.buy_token[-8:] if metadata.buy_token else "unknown" + + logger.warning( + f"Order {order_uid[:10]}... 
timed out after {attempts} poll attempts " + f"(status={status}, age={age_seconds:.1f}s, " + f"pair={sell_token}→{buy_token}, lifecycle=[{lifecycle_str}])" + ) return metadata or OrderMetadata( order_uid=order_uid, diff --git a/src/cow_performance/load_generation/trader_orchestrator.py b/src/cow_performance/load_generation/trader_orchestrator.py index 4455232..849e9e7 100644 --- a/src/cow_performance/load_generation/trader_orchestrator.py +++ b/src/cow_performance/load_generation/trader_orchestrator.py @@ -401,7 +401,14 @@ async def _wait_for_settlements(self, wait_time: float) -> None: print("No pending orders to monitor") return - print(f"Monitoring {len(pending_orders)} pending orders...") + # Show initial status breakdown + status_counts: dict[str, int] = {} + for o in pending_orders: + status = o.current_status.value + status_counts[status] = status_counts.get(status, 0) + 1 + status_str = ", ".join(f"{v} {k}" for k, v in sorted(status_counts.items())) + + print(f"Monitoring {len(pending_orders)} pending orders [{status_str}]...") # Monitor orders with polling start_time = time.time() @@ -429,12 +436,25 @@ async def _wait_for_settlements(self, wait_time: float) -> None: all_orders = self.order_tracker.get_all_orders() pending_orders = [o for o in all_orders if not o.is_terminal_state()] filled_orders = [o for o in all_orders if o.current_status.value == "filled"] + expired_orders = [o for o in all_orders if o.current_status.value == "expired"] + failed_orders = [o for o in all_orders if o.current_status.value == "failed"] filled_count = len(filled_orders) if filled_count > last_filled_count: + # Build status breakdown for pending orders + loop_status_counts: dict[str, int] = {} + for o in pending_orders: + status = o.current_status.value + loop_status_counts[status] = loop_status_counts.get(status, 0) + 1 + + status_str = ", ".join(f"{v} {k}" for k, v in sorted(loop_status_counts.items())) + terminal_str = "" + if expired_orders or failed_orders: + 
terminal_str = f" | {len(expired_orders)} expired, {len(failed_orders)} failed" + print( f" Progress: {filled_count} filled, " - f"{len(pending_orders)} pending " + f"{len(pending_orders)} pending [{status_str}]{terminal_str} " f"({int(time.time() - start_time)}s elapsed)" ) last_filled_count = filled_count @@ -447,11 +467,27 @@ async def _wait_for_settlements(self, wait_time: float) -> None: # Wait before next poll await asyncio.sleep(poll_interval) - # Final summary + # Final summary with detailed breakdown final_orders = self.order_tracker.get_all_orders() filled = len([o for o in final_orders if o.current_status.value == "filled"]) - pending = len([o for o in final_orders if not o.is_terminal_state()]) - print(f"Settlement wait completed: {filled} filled, {pending} still pending") + expired = len([o for o in final_orders if o.current_status.value == "expired"]) + failed = len([o for o in final_orders if o.current_status.value == "failed"]) + cancelled = len([o for o in final_orders if o.current_status.value == "cancelled"]) + still_pending = [o for o in final_orders if not o.is_terminal_state()] + + # Build pending breakdown + pending_str = "" + if still_pending: + final_status_counts: dict[str, int] = {} + for o in still_pending: + status = o.current_status.value + final_status_counts[status] = final_status_counts.get(status, 0) + 1 + pending_str = f" (pending breakdown: {final_status_counts})" + + print( + f"Settlement wait completed: {filled} filled, {expired} expired, " + f"{failed} failed, {cancelled} cancelled, {len(still_pending)} still pending{pending_str}" + ) async def run(self) -> None: """ diff --git a/src/cow_performance/metrics/store.py b/src/cow_performance/metrics/store.py index eed2d33..4afca03 100644 --- a/src/cow_performance/metrics/store.py +++ b/src/cow_performance/metrics/store.py @@ -236,7 +236,8 @@ def add_resource_sample(self, container_name: str, sample: ResourceSample) -> No metrics.samples.pop(0) metrics.add_sample(sample) - 
self._notify_callbacks("resource", sample) + # Pass (container_name, sample) tuple to callbacks for Prometheus exporter + self._notify_callbacks("resource", (container_name, sample)) def get_resource_metrics(self, container_name: str | None = None) -> dict[str, ResourceMetrics]: """ diff --git a/src/cow_performance/prometheus/exporter.py b/src/cow_performance/prometheus/exporter.py index 0df3638..2e30c25 100644 --- a/src/cow_performance/prometheus/exporter.py +++ b/src/cow_performance/prometheus/exporter.py @@ -12,7 +12,8 @@ from prometheus_client import CollectorRegistry, start_http_server from cow_performance import __version__ -from cow_performance.metrics.models import OrderMetadata, OrderStatus +from cow_performance.comparison.models import ComparisonResult +from cow_performance.metrics.models import APIMetrics, OrderMetadata, OrderStatus, ResourceSample from cow_performance.prometheus.metrics import MetricsRegistry if TYPE_CHECKING: @@ -61,6 +62,11 @@ def __init__( self._store: MetricsStore | None = None self._active_orders: set[str] = set() + # Trader tracking (Phase 2) + self._trader_address_to_index: dict[str, str] = {} + self._active_traders: set[str] = set() # Set of trader indices with active orders + self._orders_by_trader: dict[str, set[str]] = {} # trader_index -> set of order_uids + @property def registry(self) -> CollectorRegistry: """Get the Prometheus CollectorRegistry.""" @@ -113,7 +119,10 @@ def _on_metric_update(self, metric_type: str, metric: object) -> None: try: if metric_type == "order" and isinstance(metric, OrderMetadata): self._update_order_metrics(metric) - # API and resource metrics will be handled in Phase 2 + elif metric_type == "api" and isinstance(metric, APIMetrics): + self._update_api_metrics(metric) + elif metric_type == "resource": + self._update_resource_metrics(metric) except Exception as e: logger.warning("Error updating Prometheus metric: %s", e) @@ -122,11 +131,22 @@ def _update_order_metrics(self, order: OrderMetadata) 
-> None: status = order.current_status scenario = self.scenario + # Get or assign trader index for per-trader tracking + trader_index = self._get_trader_index(order.owner) + # Track active orders if status == OrderStatus.CREATED: self._metrics.orders_created.labels(scenario=scenario).inc() self._active_orders.add(order.order_uid) + # Update per-trader tracking + self._metrics.trader_orders_submitted.labels(trader_index=trader_index).inc() + if trader_index not in self._orders_by_trader: + self._orders_by_trader[trader_index] = set() + self._orders_by_trader[trader_index].add(order.order_uid) + self._active_traders.add(trader_index) + self._metrics.traders_active.set(len(self._active_traders)) + elif status == OrderStatus.SUBMITTED: self._metrics.orders_submitted.labels(scenario=scenario).inc() @@ -145,6 +165,10 @@ def _update_order_metrics(self, order: OrderMetadata) -> None: self._metrics.orders_filled.labels(scenario=scenario).inc() self._active_orders.discard(order.order_uid) + # Update per-trader tracking + self._metrics.trader_orders_filled.labels(trader_index=trader_index).inc() + self._remove_order_from_trader(trader_index, order.order_uid) + # Record settlement latency (acceptance to fill) latency = order.get_time_to_fill() if latency is not None: @@ -158,18 +182,106 @@ def _update_order_metrics(self, order: OrderMetadata) -> None: elif status == OrderStatus.FAILED: self._metrics.orders_failed.labels(scenario=scenario).inc() self._active_orders.discard(order.order_uid) + self._remove_order_from_trader(trader_index, order.order_uid) elif status == OrderStatus.EXPIRED: self._metrics.orders_expired.labels(scenario=scenario).inc() self._active_orders.discard(order.order_uid) + self._remove_order_from_trader(trader_index, order.order_uid) elif status == OrderStatus.CANCELLED: # Cancelled orders are tracked but not counted as failed self._active_orders.discard(order.order_uid) + self._remove_order_from_trader(trader_index, order.order_uid) # Update active orders 
gauge self._metrics.orders_active.labels(scenario=scenario).set(len(self._active_orders)) + def _get_trader_index(self, owner_address: str) -> str: + """Get or assign a trader index for an address. + + Uses sequential indices (0, 1, 2, ...) to manage label cardinality. + """ + if owner_address not in self._trader_address_to_index: + index = len(self._trader_address_to_index) + self._trader_address_to_index[owner_address] = str(index) + return self._trader_address_to_index[owner_address] + + def _remove_order_from_trader(self, trader_index: str, order_uid: str) -> None: + """Remove an order from trader tracking and update active traders.""" + if trader_index in self._orders_by_trader: + self._orders_by_trader[trader_index].discard(order_uid) + # If trader has no more active orders, remove from active set + if not self._orders_by_trader[trader_index]: + self._active_traders.discard(trader_index) + self._metrics.traders_active.set(len(self._active_traders)) + + def _update_api_metrics(self, api_metric: APIMetrics) -> None: + """Update API-related Prometheus metrics from APIMetrics.""" + endpoint = api_metric.endpoint + method = api_metric.method + status = str(api_metric.status_code) + + # Increment request counter + self._metrics.api_requests_total.labels( + endpoint=endpoint, + method=method, + status=status, + ).inc() + + # Record response time + self._metrics.api_response_time.labels( + endpoint=endpoint, + method=method, + ).observe(api_metric.duration) + + # Track errors (non-2xx responses) + if not api_metric.is_success: + error_type = self._classify_api_error(api_metric) + self._metrics.api_errors_total.labels( + endpoint=endpoint, + error_type=error_type, + ).inc() + + def _classify_api_error(self, api_metric: APIMetrics) -> str: + """Classify API error by type.""" + status = api_metric.status_code + if 400 <= status < 500: + return "client_error" + elif 500 <= status < 600: + return "server_error" + elif api_metric.error_message: + if "timeout" in 
api_metric.error_message.lower(): + return "timeout" + elif "connection" in api_metric.error_message.lower(): + return "connection_error" + return "unknown" + + def _update_resource_metrics(self, metric: object) -> None: + """Update resource-related Prometheus metrics. + + Note: MetricsStore emits (container_name, sample) tuple for resource metrics. + """ + # Handle tuple format from MetricsStore.add_resource_sample callback + if isinstance(metric, tuple) and len(metric) == 2: + container_name, sample = metric + if isinstance(sample, ResourceSample): + self._metrics.container_cpu_percent.labels(container=container_name).set( + sample.cpu_percent + ) + self._metrics.container_memory_bytes.labels(container=container_name).set( + sample.memory_bytes + ) + self._metrics.container_memory_percent.labels(container=container_name).set( + sample.memory_percent + ) + self._metrics.container_network_rx_bytes.labels(container=container_name).set( + sample.network_rx_bytes + ) + self._metrics.container_network_tx_bytes.labels(container=container_name).set( + sample.network_tx_bytes + ) + # --- Manual Recording Methods (for direct updates) --- def record_order_created(self) -> None: @@ -260,3 +372,118 @@ def update_progress(self, percent: float) -> None: def is_running(self) -> bool: """Check if exporter is running.""" return self._running + + # --- API Recording Methods (Phase 2) --- + + def record_api_request( + self, + endpoint: str, + method: str, + status_code: int, + duration_seconds: float, + ) -> None: + """Record an API request.""" + self._metrics.api_requests_total.labels( + endpoint=endpoint, + method=method, + status=str(status_code), + ).inc() + self._metrics.api_response_time.labels( + endpoint=endpoint, + method=method, + ).observe(duration_seconds) + + def record_api_error(self, endpoint: str, error_type: str) -> None: + """Record an API error.""" + self._metrics.api_errors_total.labels( + endpoint=endpoint, + error_type=error_type, + ).inc() + + # --- Resource 
Recording Methods (Phase 2) --- + + def update_container_resources( + self, + container: str, + cpu_percent: float, + memory_bytes: int, + network_rx_bytes: int = 0, + network_tx_bytes: int = 0, + memory_percent: float | None = None, + ) -> None: + """Update resource metrics for a container.""" + self._metrics.container_cpu_percent.labels(container=container).set(cpu_percent) + self._metrics.container_memory_bytes.labels(container=container).set(memory_bytes) + if memory_percent is not None: + self._metrics.container_memory_percent.labels(container=container).set(memory_percent) + self._metrics.container_network_rx_bytes.labels(container=container).set(network_rx_bytes) + self._metrics.container_network_tx_bytes.labels(container=container).set(network_tx_bytes) + + # --- Trader Recording Methods (Phase 2) --- + + def record_trader_order_submitted(self, trader_index: int) -> None: + """Record an order submission for a trader.""" + self._metrics.trader_orders_submitted.labels(trader_index=str(trader_index)).inc() + + def record_trader_order_filled(self, trader_index: int) -> None: + """Record an order fill for a trader.""" + self._metrics.trader_orders_filled.labels(trader_index=str(trader_index)).inc() + + def set_active_traders(self, count: int) -> None: + """Set the count of active traders.""" + self._metrics.traders_active.set(count) + + # --- Baseline Comparison Methods (Phase 2) --- + + def record_comparison_result(self, result: ComparisonResult) -> None: + """Record metrics from a baseline comparison result. + + This populates comparison metrics from a ComparisonResult object, + typically called after running a baseline comparison. 
+ """ + baseline_id = result.baseline_id + + # Record percentage changes for each metric comparison + for metric_name, comparison in result.metric_comparisons.items(): + self._metrics.baseline_comparison_percent.labels( + metric=metric_name, + baseline_id=baseline_id, + ).set( + comparison.percent_change * 100 + ) # Convert to percentage + + # Record regression counts by severity + self._metrics.regression_detected.labels(severity="critical").set(result.critical_count) + self._metrics.regression_detected.labels(severity="major").set(result.major_count) + self._metrics.regression_detected.labels(severity="minor").set(result.minor_count) + + # Increment total regression counters + for _ in range(result.critical_count): + self._metrics.regressions_total.labels(severity="critical").inc() + for _ in range(result.major_count): + self._metrics.regressions_total.labels(severity="major").inc() + for _ in range(result.minor_count): + self._metrics.regressions_total.labels(severity="minor").inc() + + def set_baseline_comparison( + self, + metric_name: str, + baseline_id: str, + percent_change: float, + ) -> None: + """Set a single baseline comparison metric.""" + self._metrics.baseline_comparison_percent.labels( + metric=metric_name, + baseline_id=baseline_id, + ).set(percent_change) + + def set_regression_counts( + self, + critical: int = 0, + major: int = 0, + minor: int = 0, + ) -> None: + """Set regression detection counts.""" + self._metrics.regression_detected.labels(severity="critical").set(critical) + self._metrics.regression_detected.labels(severity="major").set(major) + self._metrics.regression_detected.labels(severity="minor").set(minor) diff --git a/src/cow_performance/prometheus/metrics.py b/src/cow_performance/prometheus/metrics.py index 60a4108..87be3ef 100644 --- a/src/cow_performance/prometheus/metrics.py +++ b/src/cow_performance/prometheus/metrics.py @@ -37,6 +37,10 @@ def __init__(self, registry: CollectorRegistry | None = None): 
self._init_latency_metrics() self._init_throughput_metrics() self._init_test_metadata() + self._init_api_metrics() + self._init_resource_metrics() + self._init_trader_metrics() + self._init_comparison_metrics() def _init_order_metrics(self) -> None: """Initialize order-related counters and gauges.""" @@ -170,3 +174,103 @@ def _init_test_metadata(self) -> None: ["scenario"], registry=self.registry, ) + + def _init_api_metrics(self) -> None: + """Initialize API performance metrics.""" + self.api_requests_total = Counter( + "cow_perf_api_requests_total", + "Total API requests", + ["endpoint", "method", "status"], + registry=self.registry, + ) + self.api_response_time = Histogram( + "cow_perf_api_response_time_seconds", + "API response time distribution", + ["endpoint", "method"], + buckets=[0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10], + registry=self.registry, + ) + self.api_errors_total = Counter( + "cow_perf_api_errors_total", + "Total API errors by type", + ["endpoint", "error_type"], + registry=self.registry, + ) + + def _init_resource_metrics(self) -> None: + """Initialize container resource metrics.""" + self.container_cpu_percent = Gauge( + "cow_perf_container_cpu_percent", + "Container CPU usage percentage", + ["container"], + registry=self.registry, + ) + self.container_memory_bytes = Gauge( + "cow_perf_container_memory_bytes", + "Container memory usage in bytes", + ["container"], + registry=self.registry, + ) + self.container_memory_percent = Gauge( + "cow_perf_container_memory_percent", + "Container memory usage as percentage of limit (0-100)", + ["container"], + registry=self.registry, + ) + self.container_network_rx_bytes = Gauge( + "cow_perf_container_network_rx_bytes", + "Container network bytes received", + ["container"], + registry=self.registry, + ) + self.container_network_tx_bytes = Gauge( + "cow_perf_container_network_tx_bytes", + "Container network bytes transmitted", + ["container"], + registry=self.registry, + ) + + def 
_init_trader_metrics(self) -> None: + """Initialize per-trader metrics. + + Note: Uses trader_index (0, 1, 2, ...) instead of full addresses + to manage label cardinality. Default tests have ~10 traders. + """ + self.trader_orders_submitted = Counter( + "cow_perf_trader_orders_submitted", + "Orders submitted per trader", + ["trader_index"], + registry=self.registry, + ) + self.trader_orders_filled = Counter( + "cow_perf_trader_orders_filled", + "Orders filled per trader", + ["trader_index"], + registry=self.registry, + ) + self.traders_active = Gauge( + "cow_perf_traders_active", + "Count of currently active traders", + registry=self.registry, + ) + + def _init_comparison_metrics(self) -> None: + """Initialize baseline comparison metrics.""" + self.baseline_comparison_percent = Gauge( + "cow_perf_baseline_comparison_percent", + "Percentage change from baseline (positive = increase)", + ["metric", "baseline_id"], + registry=self.registry, + ) + self.regression_detected = Gauge( + "cow_perf_regression_detected", + "Count of detected regressions by severity", + ["severity"], + registry=self.registry, + ) + self.regressions_total = Counter( + "cow_perf_regressions_total", + "Total regressions detected by severity", + ["severity"], + registry=self.registry, + ) diff --git a/tests/unit/prometheus/test_exporter.py b/tests/unit/prometheus/test_exporter.py index c593d6c..07a3f66 100644 --- a/tests/unit/prometheus/test_exporter.py +++ b/tests/unit/prometheus/test_exporter.py @@ -1,8 +1,10 @@ """Unit tests for Prometheus exporter.""" +import time + from prometheus_client import generate_latest -from cow_performance.metrics.models import OrderMetadata, OrderStatus +from cow_performance.metrics.models import APIMetrics, OrderMetadata, OrderStatus from cow_performance.prometheus.exporter import PrometheusExporter @@ -214,11 +216,289 @@ def test_callback_ignores_non_order_metrics(self) -> None: """Test callback ignores non-order metric types.""" exporter = 
PrometheusExporter(scenario="test") - # Should not raise - exporter._on_metric_update("api", {"some": "data"}) - exporter._on_metric_update("resource", {"some": "data"}) + # Should not raise - these will be handled by specific handlers now + exporter._on_metric_update("unknown_type", {"some": "data"}) # Counters should still be at default output = generate_latest(exporter.registry).decode() # No increments should have happened assert "cow_perf_orders_created_total" in output + + +class TestPrometheusExporterPhase2: + """Tests for Phase 2 exporter functionality.""" + + def test_record_api_request(self) -> None: + """Test API request recording.""" + exporter = PrometheusExporter(scenario="test") + exporter.record_api_request( + endpoint="/api/v1/orders", + method="POST", + status_code=200, + duration_seconds=0.15, + ) + + output = generate_latest(exporter.registry).decode() + assert ( + 'cow_perf_api_requests_total{endpoint="/api/v1/orders",method="POST",status="200"} 1.0' + in output + ) + + def test_record_api_error(self) -> None: + """Test API error recording.""" + exporter = PrometheusExporter(scenario="test") + exporter.record_api_error(endpoint="/api/v1/orders", error_type="server_error") + + output = generate_latest(exporter.registry).decode() + assert ( + 'cow_perf_api_errors_total{endpoint="/api/v1/orders",error_type="server_error"} 1.0' + in output + ) + + def test_update_container_resources(self) -> None: + """Test container resource updates.""" + exporter = PrometheusExporter(scenario="test") + exporter.update_container_resources( + container="orderbook", + cpu_percent=45.5, + memory_bytes=536870912, + network_rx_bytes=1024000, + network_tx_bytes=512000, + ) + + output = generate_latest(exporter.registry).decode() + assert 'cow_perf_container_cpu_percent{container="orderbook"} 45.5' in output + assert 'cow_perf_container_memory_bytes{container="orderbook"} 5.36870912e+08' in output + + def test_trader_index_assignment(self) -> None: + """Test that trader 
addresses get sequential indices.""" + exporter = PrometheusExporter(scenario="test") + + # Simulate orders from different traders + idx1 = exporter._get_trader_index("0xAAA") + idx2 = exporter._get_trader_index("0xBBB") + idx3 = exporter._get_trader_index("0xAAA") # Same as first + + assert idx1 == "0" + assert idx2 == "1" + assert idx3 == "0" # Same address gets same index + + def test_active_traders_tracking(self) -> None: + """Test active traders gauge updates.""" + exporter = PrometheusExporter(scenario="test") + + # Create orders from two traders + order1 = OrderMetadata( + order_uid="order-1", + owner="0xAAA", + creation_time=1000.0, + current_status=OrderStatus.CREATED, + ) + order2 = OrderMetadata( + order_uid="order-2", + owner="0xBBB", + creation_time=1000.0, + current_status=OrderStatus.CREATED, + ) + + exporter._on_metric_update("order", order1) + exporter._on_metric_update("order", order2) + + output = generate_latest(exporter.registry).decode() + assert "cow_perf_traders_active 2.0" in output + + # Fill one order + order1.current_status = OrderStatus.FILLED + order1.completion_time = 1030.0 + exporter._on_metric_update("order", order1) + + output = generate_latest(exporter.registry).decode() + assert "cow_perf_traders_active 1.0" in output + + def test_trader_orders_submitted_tracking(self) -> None: + """Test per-trader order submission tracking.""" + exporter = PrometheusExporter(scenario="test") + + # Create orders from same trader + order1 = OrderMetadata( + order_uid="order-1", + owner="0xAAA", + creation_time=1000.0, + current_status=OrderStatus.CREATED, + ) + order2 = OrderMetadata( + order_uid="order-2", + owner="0xAAA", + creation_time=1001.0, + current_status=OrderStatus.CREATED, + ) + + exporter._on_metric_update("order", order1) + exporter._on_metric_update("order", order2) + + output = generate_latest(exporter.registry).decode() + assert 'cow_perf_trader_orders_submitted_total{trader_index="0"} 2.0' in output + + def 
test_trader_orders_filled_tracking(self) -> None: + """Test per-trader order fill tracking.""" + exporter = PrometheusExporter(scenario="test") + + # Create and fill an order + order = OrderMetadata( + order_uid="order-1", + owner="0xAAA", + creation_time=1000.0, + current_status=OrderStatus.CREATED, + ) + exporter._on_metric_update("order", order) + + order.current_status = OrderStatus.FILLED + order.completion_time = 1030.0 + exporter._on_metric_update("order", order) + + output = generate_latest(exporter.registry).decode() + assert 'cow_perf_trader_orders_filled_total{trader_index="0"} 1.0' in output + + def test_set_regression_counts(self) -> None: + """Test regression count setting.""" + exporter = PrometheusExporter(scenario="test") + exporter.set_regression_counts(critical=1, major=2, minor=3) + + output = generate_latest(exporter.registry).decode() + assert 'cow_perf_regression_detected{severity="critical"} 1.0' in output + assert 'cow_perf_regression_detected{severity="major"} 2.0' in output + assert 'cow_perf_regression_detected{severity="minor"} 3.0' in output + + def test_set_baseline_comparison(self) -> None: + """Test baseline comparison metric setting.""" + exporter = PrometheusExporter(scenario="test") + exporter.set_baseline_comparison( + metric_name="avg_latency", + baseline_id="baseline-123", + percent_change=15.5, + ) + + output = generate_latest(exporter.registry).decode() + assert ( + 'cow_perf_baseline_comparison_percent{baseline_id="baseline-123",metric="avg_latency"} 15.5' + in output + ) + + def test_record_trader_order_submitted(self) -> None: + """Test manual trader order submission recording.""" + exporter = PrometheusExporter(scenario="test") + exporter.record_trader_order_submitted(trader_index=0) + exporter.record_trader_order_submitted(trader_index=0) + exporter.record_trader_order_submitted(trader_index=1) + + output = generate_latest(exporter.registry).decode() + assert 'cow_perf_trader_orders_submitted_total{trader_index="0"} 
2.0' in output + assert 'cow_perf_trader_orders_submitted_total{trader_index="1"} 1.0' in output + + def test_set_active_traders(self) -> None: + """Test active traders gauge setting.""" + exporter = PrometheusExporter(scenario="test") + exporter.set_active_traders(5) + + output = generate_latest(exporter.registry).decode() + assert "cow_perf_traders_active 5.0" in output + + +class TestPrometheusExporterAPICallback: + """Tests for API callback handling.""" + + def test_callback_handles_api_metrics(self) -> None: + """Test callback processes APIMetrics correctly.""" + exporter = PrometheusExporter(scenario="test") + + api_metric = APIMetrics( + endpoint="/api/v1/orders", + method="POST", + timestamp=time.time(), + duration=0.25, + status_code=200, + ) + exporter._on_metric_update("api", api_metric) + + output = generate_latest(exporter.registry).decode() + assert ( + 'cow_perf_api_requests_total{endpoint="/api/v1/orders",method="POST",status="200"} 1.0' + in output + ) + + def test_callback_classifies_client_errors(self) -> None: + """Test that 4xx responses are classified as client errors.""" + exporter = PrometheusExporter(scenario="test") + + api_metric = APIMetrics( + endpoint="/api/v1/orders", + method="POST", + timestamp=time.time(), + duration=0.1, + status_code=400, + error_message="Bad request", + ) + exporter._on_metric_update("api", api_metric) + + output = generate_latest(exporter.registry).decode() + assert ( + 'cow_perf_api_errors_total{endpoint="/api/v1/orders",error_type="client_error"} 1.0' + in output + ) + + def test_callback_classifies_server_errors(self) -> None: + """Test that 5xx responses are classified as server errors.""" + exporter = PrometheusExporter(scenario="test") + + api_metric = APIMetrics( + endpoint="/api/v1/orders", + method="POST", + timestamp=time.time(), + duration=0.5, + status_code=500, + error_message="Internal server error", + ) + exporter._on_metric_update("api", api_metric) + + output = 
generate_latest(exporter.registry).decode() + assert ( + 'cow_perf_api_errors_total{endpoint="/api/v1/orders",error_type="server_error"} 1.0' + in output + ) + + def test_callback_classifies_timeout_errors(self) -> None: + """Test that timeout errors are classified correctly.""" + exporter = PrometheusExporter(scenario="test") + + api_metric = APIMetrics( + endpoint="/api/v1/orders", + method="POST", + timestamp=time.time(), + duration=30.0, + status_code=0, + error_message="Connection timeout", + ) + exporter._on_metric_update("api", api_metric) + + output = generate_latest(exporter.registry).decode() + assert ( + 'cow_perf_api_errors_total{endpoint="/api/v1/orders",error_type="timeout"} 1.0' + in output + ) + + def test_callback_records_response_time(self) -> None: + """Test that API response times are recorded.""" + exporter = PrometheusExporter(scenario="test") + + api_metric = APIMetrics( + endpoint="/api/v1/orders", + method="POST", + timestamp=time.time(), + duration=0.15, + status_code=200, + ) + exporter._on_metric_update("api", api_metric) + + output = generate_latest(exporter.registry).decode() + assert "cow_perf_api_response_time_seconds_sum" in output + assert "cow_perf_api_response_time_seconds_count" in output diff --git a/tests/unit/prometheus/test_metrics.py b/tests/unit/prometheus/test_metrics.py index c2c9605..2dfaec7 100644 --- a/tests/unit/prometheus/test_metrics.py +++ b/tests/unit/prometheus/test_metrics.py @@ -102,3 +102,104 @@ def test_info_metric(self) -> None: output = generate_latest(metrics.registry).decode() assert "cow_perf_test_info" in output assert 'test_id="abc123"' in output + + +class TestMetricsRegistryPhase2: + """Tests for Phase 2 metrics in MetricsRegistry.""" + + def test_api_metrics_exist(self) -> None: + """Test that all API metrics are registered.""" + metrics = MetricsRegistry() + output = generate_latest(metrics.registry).decode() + + assert "cow_perf_api_requests_total" in output + assert 
"cow_perf_api_response_time_seconds" in output + assert "cow_perf_api_errors_total" in output + + def test_resource_metrics_exist(self) -> None: + """Test that all resource metrics are registered.""" + metrics = MetricsRegistry() + output = generate_latest(metrics.registry).decode() + + assert "cow_perf_container_cpu_percent" in output + assert "cow_perf_container_memory_bytes" in output + assert "cow_perf_container_network_rx_bytes" in output + assert "cow_perf_container_network_tx_bytes" in output + + def test_trader_metrics_exist(self) -> None: + """Test that all per-trader metrics are registered.""" + metrics = MetricsRegistry() + output = generate_latest(metrics.registry).decode() + + assert "cow_perf_trader_orders_submitted" in output + assert "cow_perf_trader_orders_filled" in output + assert "cow_perf_traders_active" in output + + def test_comparison_metrics_exist(self) -> None: + """Test that all comparison metrics are registered.""" + metrics = MetricsRegistry() + output = generate_latest(metrics.registry).decode() + + assert "cow_perf_baseline_comparison_percent" in output + assert "cow_perf_regression_detected" in output + assert "cow_perf_regressions_total" in output + + def test_api_request_counter(self) -> None: + """Test API request counter with labels.""" + metrics = MetricsRegistry() + metrics.api_requests_total.labels( + endpoint="/api/v1/orders", + method="POST", + status="200", + ).inc() + + output = generate_latest(metrics.registry).decode() + assert ( + 'cow_perf_api_requests_total{endpoint="/api/v1/orders",method="POST",status="200"} 1.0' + in output + ) + + def test_api_response_time_histogram(self) -> None: + """Test API response time histogram.""" + metrics = MetricsRegistry() + metrics.api_response_time.labels( + endpoint="/api/v1/orders", + method="POST", + ).observe(0.15) + + output = generate_latest(metrics.registry).decode() + assert "cow_perf_api_response_time_seconds_bucket" in output + assert 
"cow_perf_api_response_time_seconds_sum" in output + + def test_container_resource_gauges(self) -> None: + """Test container resource gauges.""" + metrics = MetricsRegistry() + metrics.container_cpu_percent.labels(container="orderbook").set(45.5) + metrics.container_memory_bytes.labels(container="orderbook").set(1024 * 1024 * 512) + + output = generate_latest(metrics.registry).decode() + assert 'cow_perf_container_cpu_percent{container="orderbook"} 45.5' in output + assert 'cow_perf_container_memory_bytes{container="orderbook"}' in output + + def test_trader_counter_with_index(self) -> None: + """Test per-trader counter using index.""" + metrics = MetricsRegistry() + metrics.trader_orders_submitted.labels(trader_index="0").inc() + metrics.trader_orders_submitted.labels(trader_index="0").inc() + metrics.trader_orders_submitted.labels(trader_index="1").inc() + + output = generate_latest(metrics.registry).decode() + assert 'cow_perf_trader_orders_submitted_total{trader_index="0"} 2.0' in output + assert 'cow_perf_trader_orders_submitted_total{trader_index="1"} 1.0' in output + + def test_regression_detection_gauge(self) -> None: + """Test regression detection gauge with severity labels.""" + metrics = MetricsRegistry() + metrics.regression_detected.labels(severity="critical").set(1) + metrics.regression_detected.labels(severity="major").set(2) + metrics.regression_detected.labels(severity="minor").set(3) + + output = generate_latest(metrics.registry).decode() + assert 'cow_perf_regression_detected{severity="critical"} 1.0' in output + assert 'cow_perf_regression_detected{severity="major"} 2.0' in output + assert 'cow_perf_regression_detected{severity="minor"} 3.0' in output diff --git a/thoughts/INDEX.md b/thoughts/INDEX.md index edb95c4..f886911 100644 --- a/thoughts/INDEX.md +++ b/thoughts/INDEX.md @@ -122,6 +122,7 @@ Detailed implementation approaches for tickets. 
Read these before implementing t | [2026-02-02-cow-588-baseline-snapshot-system.md](plans/2026-02-02-cow-588-baseline-snapshot-system.md) | COW-588 | ✅ Complete | BaselineManager, git-info, UUID-index, serialization | | [2026-02-03-cow-589-comparison-engine.md](plans/2026-02-03-cow-589-comparison-engine.md) | COW-589 | ✅ Complete | ComparisonEngine, regression, statistics, p-value, Cohen's-d | | [2026-02-03-cow-590-automated-reporting.md](plans/2026-02-03-cow-590-automated-reporting.md) | COW-590 | ✅ Complete | ReportGenerator, formatters, CSV, recommendations, CLI | +| [2026-02-13-cow-598-alerting-rules.md](plans/2026-02-13-cow-598-alerting-rules.md) | COW-598 | 🔲 Ready | Prometheus alerts, alerting rules, thresholds, Grafana annotations | --- @@ -227,6 +228,7 @@ tickets/COW-593-grafana-dashboards.md ### Alerting Rules (COW-598) — M3 ``` tickets/COW-598-alerting-rules.md +└── plans/2026-02-13-cow-598-alerting-rules.md (execution plan) ``` --- diff --git a/thoughts/plans/2026-02-05-cow-591-phase-1-prometheus-exporter.md b/thoughts/plans/2026-02-05-cow-591-phase-1-prometheus-exporter.md new file mode 100644 index 0000000..c10be13 --- /dev/null +++ b/thoughts/plans/2026-02-05-cow-591-phase-1-prometheus-exporter.md @@ -0,0 +1,1330 @@ +# COW-591 Phase 1: Prometheus Exporter Implementation Plan + +## Overview + +Implement a real-time Prometheus HTTP exporter for the CoW Protocol performance testing suite. This exporter will expose metrics at a `/metrics` endpoint that Prometheus can scrape during test execution, enabling live monitoring and visualization in Grafana. + +**Ticket**: [COW-591-prometheus-exporters.md](../tickets/COW-591-prometheus-exporters.md) +**Phase Reference**: [COW-591-implementation-phases.md](../tasks/COW-591-implementation-phases.md) +**PoC Evaluation**: [poc-evaluation.md](../research/poc-evaluation.md) + +--- + +## Current State Analysis + +### What Already Exists + +1. 
**MetricsStore Callback System** (`src/cow_performance/metrics/store.py:269-298`): + - `register_callback(callback)` - Registers a callback for metric updates + - `unregister_callback(callback)` - Removes a callback + - `_notify_callbacks(metric_type, metric)` - Invokes callbacks on each metric update + - Callbacks receive `(metric_type: str, metric: object)` where `metric_type` is `"order"`, `"api"`, or `"resource"` + +2. **OrderMetadata Model** (`src/cow_performance/metrics/models.py:27-126`): + - Timestamps: `creation_time`, `submission_time`, `acceptance_time`, `first_fill_time`, `completion_time` + - Status: `current_status` (OrderStatus enum) + - Helper methods: `get_time_to_submit()`, `get_time_to_accept()`, `get_time_to_fill()`, `get_total_lifecycle_time()` + - **Note**: No `order_type` field exists - order type must be inferred or tracked separately + +3. **OrderStatus Enum** (`src/cow_performance/metrics/models.py:13-24`): + - `CREATED`, `SUBMITTED`, `ACCEPTED`, `OPEN`, `FILLED`, `PARTIALLY_FILLED`, `EXPIRED`, `CANCELLED`, `FAILED` + +4. **prometheus-client Dependency** (`pyproject.toml:26`): + - Already installed: `prometheus-client = "^0.19.0"` + +5. **Static Prometheus Output** (`src/cow_performance/cli/output.py:72-91`): + - `format_metrics_prometheus_text()` generates one-shot text format at test end + - Not suitable for real-time scraping + +6. **Prometheus Scrape Config** (`configs/prometheus.yml`): + - Scrapes CoW services (orderbook, autopilot, driver, baseline) + - Does not include performance test exporter + +### Key Discoveries + +- `MetricsStore` is instantiated at `src/cow_performance/cli/commands/run.py:294` +- The callback system was designed for COW-611 streaming but works perfectly for Prometheus +- `__version__` is available from `src/cow_performance/__init__.py:6` +- OrderMetadata does NOT have an `order_type` field - we'll use a default label value + +--- + +## Desired End State + +After this plan is complete: + +1. 
A new `src/cow_performance/prometheus/` module exists with: + - `MetricsRegistry` class defining all Phase 1 Prometheus metrics + - `PrometheusExporter` class with HTTP server and MetricsStore integration + +2. Running `cow-perf run --prometheus-port 9091` starts an HTTP server exposing metrics at `http://localhost:9091/metrics` + +3. Prometheus can scrape the endpoint and receive real-time metrics updates during test execution + +4. All Phase 1 metrics are exposed: + - Order counters (created, submitted, filled, failed, expired, active) + - Latency histograms (submission, orderbook, settlement, lifecycle) + - Throughput gauges (orders_per_second, target_rate, actual_rate) + - Test metadata (test_info, start_timestamp, duration, num_traders, progress) + +### Verification + +```bash +# Start a test with Prometheus exporter +cow-perf run --prometheus-port 9091 --duration 60 + +# In another terminal, verify metrics +curl http://localhost:9091/metrics | grep cow_perf_ + +# Expected output includes: +# cow_perf_orders_created_total +# cow_perf_orders_submitted_total +# cow_perf_submission_latency_seconds_bucket +# cow_perf_test_info +``` + +--- + +## What We're NOT Doing + +Phase 2 items (deferred to separate implementation): +- Per-trader metrics (`cow_perf_trader_orders_submitted`, `cow_perf_trader_orders_filled`) +- API performance metrics (`cow_perf_api_requests_total`, `cow_perf_api_response_time_seconds`) +- Resource metrics (`cow_perf_container_cpu_percent`, `cow_perf_container_memory_bytes`) +- Baseline comparison metrics (`cow_perf_baseline_comparison_percent`, `cow_perf_regression_detected`) + +Not in scope: +- Grafana dashboard creation (COW-593) +- Alerting rules (COW-598) +- Docker Compose changes for the exporter service + +--- + +## Implementation Approach + +1. **Separate metrics definitions from exporter logic** for testability +2. **Use a custom CollectorRegistry** to avoid conflicts with the default registry during testing +3. 
**Hook into MetricsStore callbacks** for real-time updates (no polling) +4. **Run HTTP server in daemon thread** to avoid blocking test execution +5. **Use port 9091** (not 9090) to avoid conflict with Prometheus server itself + +--- + +## Phase 1: Create Module Structure + +### Overview + +Create the `src/cow_performance/prometheus/` module with proper exports. + +### Changes Required + +#### 1. Create Module Directory + +**File**: `src/cow_performance/prometheus/__init__.py` + +```python +"""Prometheus metrics exporter for CoW Protocol performance testing.""" + +from cow_performance.prometheus.exporter import PrometheusExporter +from cow_performance.prometheus.metrics import MetricsRegistry + +__all__ = ["PrometheusExporter", "MetricsRegistry"] +``` + +#### 2. Create Empty Test Directories + +**Files**: +- `tests/unit/prometheus/__init__.py` +- `tests/integration/__init__.py` (if doesn't exist) + +### Success Criteria + +- [x] Directory structure exists as specified + +--- + +## Phase 2: Implement MetricsRegistry + +### Overview + +Define all Phase 1 Prometheus metrics in a dedicated class. Using a class allows dependency injection of the registry for testing. + +### Changes Required + +#### 1. Create Metrics Definitions + +**File**: `src/cow_performance/prometheus/metrics.py` + +```python +"""Prometheus metric definitions for CoW Protocol performance testing. + +All metrics are prefixed with `cow_perf_` to distinguish from production metrics. +Uses a custom CollectorRegistry to avoid conflicts during testing. +""" + +from prometheus_client import ( + CollectorRegistry, + Counter, + Gauge, + Histogram, + Info, +) + + +class MetricsRegistry: + """ + Registry of Prometheus metrics for performance testing. + + Uses a custom CollectorRegistry to avoid conflicts with the default registry. + All metrics are prefixed with `cow_perf_` as per naming convention. 
+ + Example: + registry = MetricsRegistry() + registry.orders_created.labels(scenario="stress").inc() + """ + + def __init__(self, registry: CollectorRegistry | None = None): + """ + Initialize the metrics registry. + + Args: + registry: Optional custom registry. Creates new one if not provided. + """ + self.registry = registry or CollectorRegistry() + self._init_order_metrics() + self._init_latency_metrics() + self._init_throughput_metrics() + self._init_test_metadata() + + def _init_order_metrics(self) -> None: + """Initialize order-related counters and gauges.""" + # Counters for order lifecycle events + self.orders_created = Counter( + "cow_perf_orders_created_total", + "Total number of orders created", + ["scenario"], + registry=self.registry, + ) + self.orders_submitted = Counter( + "cow_perf_orders_submitted_total", + "Total number of orders submitted to API", + ["scenario"], + registry=self.registry, + ) + self.orders_filled = Counter( + "cow_perf_orders_filled_total", + "Total number of orders successfully filled", + ["scenario"], + registry=self.registry, + ) + self.orders_failed = Counter( + "cow_perf_orders_failed_total", + "Total number of orders that failed", + ["scenario"], + registry=self.registry, + ) + self.orders_expired = Counter( + "cow_perf_orders_expired_total", + "Total number of orders that expired", + ["scenario"], + registry=self.registry, + ) + + # Gauge for active orders + self.orders_active = Gauge( + "cow_perf_orders_active", + "Currently active (non-terminal) orders", + ["scenario"], + registry=self.registry, + ) + + def _init_latency_metrics(self) -> None: + """Initialize latency histograms with appropriate buckets.""" + # Submission latency (fast operation: creation to submission) + self.submission_latency = Histogram( + "cow_perf_submission_latency_seconds", + "Time from order creation to API submission", + ["scenario"], + buckets=[0.1, 0.5, 1, 2, 5, 10, 30], + registry=self.registry, + ) + + # Orderbook acceptance latency 
(submission to acceptance) + self.orderbook_latency = Histogram( + "cow_perf_orderbook_latency_seconds", + "Time from submission to orderbook acceptance", + ["scenario"], + buckets=[0.1, 0.5, 1, 2, 5, 10, 30], + registry=self.registry, + ) + + # Settlement latency (slow operation: acceptance to fill) + self.settlement_latency = Histogram( + "cow_perf_settlement_latency_seconds", + "Time from acceptance to order fill", + ["scenario"], + buckets=[10, 30, 60, 120, 300, 600], + registry=self.registry, + ) + + # Full lifecycle (creation to completion) + self.order_lifecycle = Histogram( + "cow_perf_order_lifecycle_seconds", + "Total order lifecycle duration (creation to completion)", + ["scenario"], + buckets=[10, 30, 60, 120, 300, 600, 900], + registry=self.registry, + ) + + def _init_throughput_metrics(self) -> None: + """Initialize throughput gauges.""" + self.orders_per_second = Gauge( + "cow_perf_orders_per_second", + "Current order submission rate", + ["scenario"], + registry=self.registry, + ) + self.target_rate = Gauge( + "cow_perf_target_rate", + "Configured target submission rate", + ["scenario"], + registry=self.registry, + ) + self.actual_rate = Gauge( + "cow_perf_actual_rate", + "Measured actual submission rate", + ["scenario"], + registry=self.registry, + ) + + def _init_test_metadata(self) -> None: + """Initialize test metadata metrics.""" + self.test_info = Info( + "cow_perf_test", + "Performance test information", + registry=self.registry, + ) + self.test_start_timestamp = Gauge( + "cow_perf_test_start_timestamp", + "Test start Unix timestamp", + ["scenario"], + registry=self.registry, + ) + self.test_duration_seconds = Gauge( + "cow_perf_test_duration_seconds", + "Configured test duration in seconds", + ["scenario"], + registry=self.registry, + ) + self.num_traders = Gauge( + "cow_perf_num_traders", + "Number of simulated traders", + ["scenario"], + registry=self.registry, + ) + self.test_progress_percent = Gauge( + "cow_perf_test_progress_percent", + 
"Test completion percentage (0-100)", + ["scenario"], + registry=self.registry, + ) +``` + +### Success Criteria + +- [x] `poetry run mypy src/cow_performance/prometheus/metrics.py` passes + +--- + +## Phase 3: Implement PrometheusExporter + +### Overview + +Implement the main exporter class with HTTP server and MetricsStore callback integration. + +### Changes Required + +#### 1. Create Exporter Class + +**File**: `src/cow_performance/prometheus/exporter.py` + +```python +"""Prometheus HTTP exporter for CoW Protocol performance testing metrics. + +Exposes metrics at /metrics endpoint for Prometheus scraping. +Integrates with MetricsStore via callbacks for real-time updates. +""" + +import logging +import platform +import time +from typing import TYPE_CHECKING + +from prometheus_client import start_http_server + +from cow_performance import __version__ +from cow_performance.metrics.models import OrderMetadata, OrderStatus +from cow_performance.prometheus.metrics import MetricsRegistry + +if TYPE_CHECKING: + from cow_performance.metrics.store import MetricsStore + +logger = logging.getLogger(__name__) + + +class PrometheusExporter: + """ + Prometheus HTTP exporter for performance testing. + + Exposes metrics at /metrics endpoint for Prometheus scraping. + Integrates with MetricsStore via callbacks for real-time updates. + + Example: + exporter = PrometheusExporter(port=9091, scenario="stress-test") + exporter.start() + + # Register with MetricsStore for real-time updates + exporter.register_with_store(metrics_store) + + # ... run tests ... + + exporter.stop() + """ + + DEFAULT_PORT = 9091 + + def __init__( + self, + port: int = DEFAULT_PORT, + scenario: str = "default", + ): + """ + Initialize the Prometheus exporter. 
+ + Args: + port: Port for HTTP server (default: 9091) + scenario: Scenario name for metric labels + """ + self.port = port + self.scenario = scenario + self._metrics = MetricsRegistry() + self._running = False + self._store: "MetricsStore | None" = None + self._active_orders: set[str] = set() + + @property + def registry(self) -> "CollectorRegistry": + """Get the Prometheus CollectorRegistry.""" + from prometheus_client import CollectorRegistry + + return self._metrics.registry + + def start(self) -> None: + """Start the HTTP server for metrics exposition.""" + if self._running: + logger.warning("Prometheus exporter already running on port %d", self.port) + return + + try: + start_http_server(self.port, registry=self._metrics.registry) + self._running = True + logger.info("Prometheus exporter started on port %d", self.port) + except OSError as e: + logger.error("Failed to start Prometheus exporter on port %d: %s", self.port, e) + raise + + def stop(self) -> None: + """Stop the exporter and unregister callbacks.""" + if not self._running: + return + + # Unregister from MetricsStore if registered + if self._store is not None: + self._store.unregister_callback(self._on_metric_update) + self._store = None + + self._running = False + logger.info("Prometheus exporter stopped") + + def register_with_store(self, store: "MetricsStore") -> None: + """ + Register with MetricsStore for real-time metric updates. + + Args: + store: The MetricsStore to receive updates from + """ + self._store = store + store.register_callback(self._on_metric_update) + logger.debug("Prometheus exporter registered with MetricsStore") + + def _on_metric_update(self, metric_type: str, metric: object) -> None: + """ + Callback for MetricsStore updates. + + Maps incoming metrics to Prometheus metrics based on type. 
+ """ + try: + if metric_type == "order" and isinstance(metric, OrderMetadata): + self._update_order_metrics(metric) + # API and resource metrics will be handled in Phase 2 + except Exception as e: + logger.warning("Error updating Prometheus metric: %s", e) + + def _update_order_metrics(self, order: OrderMetadata) -> None: + """Update order-related Prometheus metrics from OrderMetadata.""" + status = order.current_status + scenario = self.scenario + + # Track active orders + if status == OrderStatus.CREATED: + self._metrics.orders_created.labels(scenario=scenario).inc() + self._active_orders.add(order.order_uid) + + elif status == OrderStatus.SUBMITTED: + self._metrics.orders_submitted.labels(scenario=scenario).inc() + + # Record submission latency if available + latency = order.get_time_to_submit() + if latency is not None: + self._metrics.submission_latency.labels(scenario=scenario).observe(latency) + + elif status in (OrderStatus.ACCEPTED, OrderStatus.OPEN): + # Record orderbook acceptance latency + latency = order.get_time_to_accept() + if latency is not None: + self._metrics.orderbook_latency.labels(scenario=scenario).observe(latency) + + elif status == OrderStatus.FILLED: + self._metrics.orders_filled.labels(scenario=scenario).inc() + self._active_orders.discard(order.order_uid) + + # Record settlement latency (acceptance to fill) + latency = order.get_time_to_fill() + if latency is not None: + self._metrics.settlement_latency.labels(scenario=scenario).observe(latency) + + # Record full lifecycle + lifecycle = order.get_total_lifecycle_time() + if lifecycle is not None: + self._metrics.order_lifecycle.labels(scenario=scenario).observe(lifecycle) + + elif status == OrderStatus.FAILED: + self._metrics.orders_failed.labels(scenario=scenario).inc() + self._active_orders.discard(order.order_uid) + + elif status == OrderStatus.EXPIRED: + self._metrics.orders_expired.labels(scenario=scenario).inc() + self._active_orders.discard(order.order_uid) + + elif status == 
OrderStatus.CANCELLED: + # Cancelled orders are tracked but not counted as failed + self._active_orders.discard(order.order_uid) + + # Update active orders gauge + self._metrics.orders_active.labels(scenario=scenario).set(len(self._active_orders)) + + # --- Manual Recording Methods (for direct updates) --- + + def record_order_created(self) -> None: + """Record an order creation event.""" + self._metrics.orders_created.labels(scenario=self.scenario).inc() + + def record_order_submitted(self, latency_seconds: float | None = None) -> None: + """Record an order submission with optional latency.""" + self._metrics.orders_submitted.labels(scenario=self.scenario).inc() + if latency_seconds is not None: + self._metrics.submission_latency.labels(scenario=self.scenario).observe(latency_seconds) + + def record_order_filled( + self, + settlement_latency: float | None = None, + lifecycle_latency: float | None = None, + ) -> None: + """Record an order fill with optional latencies.""" + self._metrics.orders_filled.labels(scenario=self.scenario).inc() + if settlement_latency is not None: + self._metrics.settlement_latency.labels(scenario=self.scenario).observe( + settlement_latency + ) + if lifecycle_latency is not None: + self._metrics.order_lifecycle.labels(scenario=self.scenario).observe(lifecycle_latency) + + def record_order_failed(self) -> None: + """Record an order failure.""" + self._metrics.orders_failed.labels(scenario=self.scenario).inc() + + def record_order_expired(self) -> None: + """Record an order expiration.""" + self._metrics.orders_expired.labels(scenario=self.scenario).inc() + + def update_active_orders(self, count: int) -> None: + """Update the active orders gauge.""" + self._metrics.orders_active.labels(scenario=self.scenario).set(count) + + def update_throughput( + self, + orders_per_second: float, + target_rate: float | None = None, + actual_rate: float | None = None, + ) -> None: + """Update throughput gauges.""" + 
self._metrics.orders_per_second.labels(scenario=self.scenario).set(orders_per_second) + if target_rate is not None: + self._metrics.target_rate.labels(scenario=self.scenario).set(target_rate) + if actual_rate is not None: + self._metrics.actual_rate.labels(scenario=self.scenario).set(actual_rate) + + def set_test_info( + self, + test_id: str, + git_commit: str = "", + duration: int = 0, + ) -> None: + """Set test metadata info metric.""" + self._metrics.test_info.info( + { + "test_id": test_id, + "scenario": self.scenario, + "git_commit": git_commit, + "duration": str(duration), + "python_version": platform.python_version(), + "platform": platform.system(), + "cow_perf_version": __version__, + } + ) + + def set_test_start(self, timestamp: float | None = None) -> None: + """Set test start timestamp.""" + ts = timestamp or time.time() + self._metrics.test_start_timestamp.labels(scenario=self.scenario).set(ts) + + def set_test_duration(self, duration_seconds: int) -> None: + """Set configured test duration.""" + self._metrics.test_duration_seconds.labels(scenario=self.scenario).set(duration_seconds) + + def set_num_traders(self, count: int) -> None: + """Set number of simulated traders.""" + self._metrics.num_traders.labels(scenario=self.scenario).set(count) + + def update_progress(self, percent: float) -> None: + """Update test progress percentage (0-100).""" + self._metrics.test_progress_percent.labels(scenario=self.scenario).set(percent) + + def is_running(self) -> bool: + """Check if exporter is running.""" + return self._running +``` + +### Success Criteria + +- [x] `poetry run mypy src/cow_performance/prometheus/exporter.py` passes + +--- + +## Phase 4: Add CLI Integration + +### Overview + +Add `--prometheus-port` flag to the run command and integrate the exporter into the test execution flow. + +### Changes Required + +#### 1. 
Update Run Command + +**File**: `src/cow_performance/cli/commands/run.py` + +**Change 1**: Add import at top of file (after line 29): + +```python +from cow_performance.prometheus import PrometheusExporter +``` + +**Change 2**: Update `run_performance_test` function signature (line 68-75) to add parameter: + +```python +async def run_performance_test( + config: PerformanceTestConfig, + traders: int | None = None, + duration: int | None = None, + settlement_wait: int | None = None, + verbose: bool = False, + dry_run: bool = False, + prometheus_port: int | None = None, # Add this parameter +) -> dict[str, Any]: +``` + +**Change 3**: After MetricsStore creation (after line 294), add exporter setup: + +```python + # Create shared metrics store for all components + metrics_store = MetricsStore() + + # Start Prometheus exporter if port specified + prometheus_exporter: PrometheusExporter | None = None + if prometheus_port is not None: + prometheus_exporter = PrometheusExporter( + port=prometheus_port, + scenario=config.trading_pattern, # Use trading pattern as scenario name + ) + prometheus_exporter.start() + prometheus_exporter.register_with_store(metrics_store) + + # Set initial test metadata + prometheus_exporter.set_test_duration(test_duration) + prometheus_exporter.set_num_traders(num_traders) + prometheus_exporter.set_test_start() + + if verbose: + console.print(f"[cyan]Prometheus Exporter:[/cyan] http://localhost:{prometheus_port}/metrics") + console.print() +``` + +**Change 4**: Update the finally block (around line 438) to stop the exporter: + +```python + finally: + # Stop resource monitoring + if resource_monitor: + await resource_monitor.stop() + + # Stop Prometheus exporter + if prometheus_exporter: + prometheus_exporter.stop() +``` + +**Change 5**: Update `run_command` function signature (line 502) to add parameter: + +```python +def run_command( + config: PerformanceTestConfig, + traders: int | None = None, + duration: int | None = None, + settlement_wait: 
int | None = None, + output_format: str | None = None, + save_results: bool = False, + output_file: str | None = None, + verbose: bool = False, + dry_run: bool = False, + prometheus_port: int | None = None, # Add this parameter +) -> None: +``` + +**Change 6**: Pass prometheus_port to run_performance_test (around line 536): + +```python + metrics = asyncio.run( + run_performance_test( + config=config, + traders=traders, + duration=duration, + settlement_wait=settlement_wait, + verbose=use_verbose, + dry_run=dry_run, + prometheus_port=prometheus_port, # Add this + ) + ) +``` + +#### 2. Update CLI Main + +**File**: `src/cow_performance/cli/main.py` + +Add `--prometheus-port` option to the run command. Find the `run` function and add the option: + +```python + prometheus_port: Optional[int] = typer.Option( + None, + "--prometheus-port", + help="Port for Prometheus metrics exporter (enables exporter when set)", + ), +``` + +Pass it to `run_command`: + +```python + run_command( + config=config, + traders=traders, + duration=duration, + settlement_wait=settlement_wait, + output_format=output_format, + save_results=save, + output_file=output, + verbose=verbose, + dry_run=dry_run, + prometheus_port=prometheus_port, # Add this + ) +``` + +### Success Criteria + +- [x] `poetry run mypy src/cow_performance/cli/` passes + +--- + +## Phase 5: Update Prometheus Configuration + +### Overview + +Add scrape target for the performance test exporter to the Prometheus configuration. + +### Changes Required + +#### 1. 
Update Prometheus Config + +**File**: `configs/prometheus.yml` + +Add new scrape job after the baseline job (around line 60): + +```yaml + # CoW Performance Test Suite metrics + # Note: Only active during test runs with --prometheus-port flag + - job_name: "cow-performance-test" + scrape_interval: 5s + static_configs: + - targets: ["host.docker.internal:9091"] + labels: + service: "performance-test" + component: "cow-perf" + # Fail gracefully if exporter not running + scrape_timeout: 5s +``` + +**Note**: Use `host.docker.internal` for Docker-to-host communication on macOS/Windows. For Linux, use the host's IP or `172.17.0.1` (docker0 bridge). + +### Success Criteria + +- [x] YAML syntax is valid: `python -c "import yaml; yaml.safe_load(open('configs/prometheus.yml'))"` + +--- + +## Phase 6: Write Tests + +### Overview + +Write unit tests for MetricsRegistry and PrometheusExporter, plus an integration test for the HTTP endpoint. + +### Changes Required + +#### 1. Unit Tests for MetricsRegistry + +**File**: `tests/unit/prometheus/__init__.py` + +```python +"""Unit tests for Prometheus metrics module.""" +``` + +**File**: `tests/unit/prometheus/test_metrics.py` + +```python +"""Unit tests for Prometheus metrics registry.""" + +import pytest +from prometheus_client import CollectorRegistry, generate_latest + +from cow_performance.prometheus.metrics import MetricsRegistry + + +class TestMetricsRegistry: + """Tests for MetricsRegistry class.""" + + def test_creates_custom_registry(self) -> None: + """Test that MetricsRegistry creates a custom registry.""" + metrics = MetricsRegistry() + assert metrics.registry is not None + assert isinstance(metrics.registry, CollectorRegistry) + + def test_uses_provided_registry(self) -> None: + """Test that MetricsRegistry uses provided registry.""" + custom_registry = CollectorRegistry() + metrics = MetricsRegistry(registry=custom_registry) + assert metrics.registry is custom_registry + + def test_order_counters_exist(self) -> None: + 
"""Test that all order counters are registered.""" + metrics = MetricsRegistry() + output = generate_latest(metrics.registry).decode() + + assert "cow_perf_orders_created_total" in output + assert "cow_perf_orders_submitted_total" in output + assert "cow_perf_orders_filled_total" in output + assert "cow_perf_orders_failed_total" in output + assert "cow_perf_orders_expired_total" in output + + def test_order_active_gauge_exists(self) -> None: + """Test that active orders gauge is registered.""" + metrics = MetricsRegistry() + output = generate_latest(metrics.registry).decode() + assert "cow_perf_orders_active" in output + + def test_latency_histograms_exist(self) -> None: + """Test that all latency histograms are registered.""" + metrics = MetricsRegistry() + output = generate_latest(metrics.registry).decode() + + assert "cow_perf_submission_latency_seconds" in output + assert "cow_perf_orderbook_latency_seconds" in output + assert "cow_perf_settlement_latency_seconds" in output + assert "cow_perf_order_lifecycle_seconds" in output + + def test_throughput_gauges_exist(self) -> None: + """Test that all throughput gauges are registered.""" + metrics = MetricsRegistry() + output = generate_latest(metrics.registry).decode() + + assert "cow_perf_orders_per_second" in output + assert "cow_perf_target_rate" in output + assert "cow_perf_actual_rate" in output + + def test_test_metadata_exists(self) -> None: + """Test that test metadata metrics are registered.""" + metrics = MetricsRegistry() + output = generate_latest(metrics.registry).decode() + + assert "cow_perf_test_info" in output + assert "cow_perf_test_start_timestamp" in output + assert "cow_perf_test_duration_seconds" in output + assert "cow_perf_num_traders" in output + assert "cow_perf_test_progress_percent" in output + + def test_counter_increments(self) -> None: + """Test that counters can be incremented.""" + metrics = MetricsRegistry() + metrics.orders_created.labels(scenario="test").inc() + 
metrics.orders_created.labels(scenario="test").inc() + + output = generate_latest(metrics.registry).decode() + assert 'cow_perf_orders_created_total{scenario="test"} 2.0' in output + + def test_histogram_observation(self) -> None: + """Test that histograms record observations.""" + metrics = MetricsRegistry() + metrics.submission_latency.labels(scenario="test").observe(0.5) + + output = generate_latest(metrics.registry).decode() + assert "cow_perf_submission_latency_seconds_bucket" in output + assert "cow_perf_submission_latency_seconds_sum" in output + assert "cow_perf_submission_latency_seconds_count" in output + + def test_gauge_set(self) -> None: + """Test that gauges can be set.""" + metrics = MetricsRegistry() + metrics.orders_active.labels(scenario="test").set(42) + + output = generate_latest(metrics.registry).decode() + assert 'cow_perf_orders_active{scenario="test"} 42.0' in output + + def test_info_metric(self) -> None: + """Test that info metric can be set.""" + metrics = MetricsRegistry() + metrics.test_info.info({"test_id": "abc123", "scenario": "stress"}) + + output = generate_latest(metrics.registry).decode() + assert "cow_perf_test_info" in output + assert 'test_id="abc123"' in output +``` + +#### 2. 
Unit Tests for PrometheusExporter + +**File**: `tests/unit/prometheus/test_exporter.py` + +```python +"""Unit tests for Prometheus exporter.""" + +import pytest +from prometheus_client import generate_latest + +from cow_performance.metrics.models import OrderMetadata, OrderStatus +from cow_performance.prometheus.exporter import PrometheusExporter + + +class TestPrometheusExporter: + """Tests for PrometheusExporter class.""" + + def test_default_port(self) -> None: + """Test that default port is 9091.""" + exporter = PrometheusExporter() + assert exporter.port == 9091 + + def test_custom_port(self) -> None: + """Test that custom port is used.""" + exporter = PrometheusExporter(port=9092) + assert exporter.port == 9092 + + def test_custom_scenario(self) -> None: + """Test that custom scenario is used.""" + exporter = PrometheusExporter(scenario="stress-test") + assert exporter.scenario == "stress-test" + + def test_is_running_initially_false(self) -> None: + """Test that exporter is not running initially.""" + exporter = PrometheusExporter() + assert exporter.is_running() is False + + def test_record_order_created(self) -> None: + """Test manual order creation recording.""" + exporter = PrometheusExporter(scenario="test") + exporter.record_order_created() + + output = generate_latest(exporter.registry).decode() + assert 'cow_perf_orders_created_total{scenario="test"} 1.0' in output + + def test_record_order_submitted_with_latency(self) -> None: + """Test order submission recording with latency.""" + exporter = PrometheusExporter(scenario="test") + exporter.record_order_submitted(latency_seconds=0.25) + + output = generate_latest(exporter.registry).decode() + assert 'cow_perf_orders_submitted_total{scenario="test"} 1.0' in output + assert "cow_perf_submission_latency_seconds_sum" in output + + def test_record_order_filled_with_latencies(self) -> None: + """Test order fill recording with latencies.""" + exporter = PrometheusExporter(scenario="test") + 
exporter.record_order_filled(settlement_latency=30.0, lifecycle_latency=60.0) + + output = generate_latest(exporter.registry).decode() + assert 'cow_perf_orders_filled_total{scenario="test"} 1.0' in output + assert "cow_perf_settlement_latency_seconds_sum" in output + assert "cow_perf_order_lifecycle_seconds_sum" in output + + def test_record_order_failed(self) -> None: + """Test order failure recording.""" + exporter = PrometheusExporter(scenario="test") + exporter.record_order_failed() + + output = generate_latest(exporter.registry).decode() + assert 'cow_perf_orders_failed_total{scenario="test"} 1.0' in output + + def test_record_order_expired(self) -> None: + """Test order expiration recording.""" + exporter = PrometheusExporter(scenario="test") + exporter.record_order_expired() + + output = generate_latest(exporter.registry).decode() + assert 'cow_perf_orders_expired_total{scenario="test"} 1.0' in output + + def test_update_active_orders(self) -> None: + """Test active orders gauge update.""" + exporter = PrometheusExporter(scenario="test") + exporter.update_active_orders(5) + + output = generate_latest(exporter.registry).decode() + assert 'cow_perf_orders_active{scenario="test"} 5.0' in output + + def test_update_throughput(self) -> None: + """Test throughput gauges update.""" + exporter = PrometheusExporter(scenario="test") + exporter.update_throughput( + orders_per_second=10.5, + target_rate=15.0, + actual_rate=10.5, + ) + + output = generate_latest(exporter.registry).decode() + assert 'cow_perf_orders_per_second{scenario="test"} 10.5' in output + assert 'cow_perf_target_rate{scenario="test"} 15.0' in output + assert 'cow_perf_actual_rate{scenario="test"} 10.5' in output + + def test_set_test_info(self) -> None: + """Test test info metric.""" + exporter = PrometheusExporter(scenario="test") + exporter.set_test_info(test_id="abc123", git_commit="deadbeef", duration=300) + + output = generate_latest(exporter.registry).decode() + assert "cow_perf_test_info" in 
output + assert 'test_id="abc123"' in output + assert 'scenario="test"' in output + + def test_set_test_duration(self) -> None: + """Test test duration gauge.""" + exporter = PrometheusExporter(scenario="test") + exporter.set_test_duration(300) + + output = generate_latest(exporter.registry).decode() + assert 'cow_perf_test_duration_seconds{scenario="test"} 300.0' in output + + def test_set_num_traders(self) -> None: + """Test num traders gauge.""" + exporter = PrometheusExporter(scenario="test") + exporter.set_num_traders(10) + + output = generate_latest(exporter.registry).decode() + assert 'cow_perf_num_traders{scenario="test"} 10.0' in output + + def test_update_progress(self) -> None: + """Test progress percentage gauge.""" + exporter = PrometheusExporter(scenario="test") + exporter.update_progress(75.0) + + output = generate_latest(exporter.registry).decode() + assert 'cow_perf_test_progress_percent{scenario="test"} 75.0' in output + + +class TestPrometheusExporterOrderCallback: + """Tests for PrometheusExporter order callback handling.""" + + def test_callback_handles_created_status(self) -> None: + """Test callback increments counter for CREATED status.""" + exporter = PrometheusExporter(scenario="test") + + order = OrderMetadata( + order_uid="order-1", + owner="0x123", + creation_time=1000.0, + current_status=OrderStatus.CREATED, + ) + exporter._on_metric_update("order", order) + + output = generate_latest(exporter.registry).decode() + assert 'cow_perf_orders_created_total{scenario="test"} 1.0' in output + assert 'cow_perf_orders_active{scenario="test"} 1.0' in output + + def test_callback_handles_submitted_status(self) -> None: + """Test callback increments counter and records latency for SUBMITTED.""" + exporter = PrometheusExporter(scenario="test") + + order = OrderMetadata( + order_uid="order-1", + owner="0x123", + creation_time=1000.0, + submission_time=1000.5, + current_status=OrderStatus.SUBMITTED, + ) + exporter._on_metric_update("order", order) + + 
output = generate_latest(exporter.registry).decode() + assert 'cow_perf_orders_submitted_total{scenario="test"} 1.0' in output + + def test_callback_handles_filled_status(self) -> None: + """Test callback increments counter and records latencies for FILLED.""" + exporter = PrometheusExporter(scenario="test") + + # First add as created to track in active orders + order = OrderMetadata( + order_uid="order-1", + owner="0x123", + creation_time=1000.0, + current_status=OrderStatus.CREATED, + ) + exporter._on_metric_update("order", order) + + # Then update to filled + order.submission_time = 1000.5 + order.acceptance_time = 1001.0 + order.first_fill_time = 1030.0 + order.completion_time = 1030.0 + order.current_status = OrderStatus.FILLED + exporter._on_metric_update("order", order) + + output = generate_latest(exporter.registry).decode() + assert 'cow_perf_orders_filled_total{scenario="test"} 1.0' in output + assert 'cow_perf_orders_active{scenario="test"} 0.0' in output + + def test_callback_handles_failed_status(self) -> None: + """Test callback increments counter for FAILED status.""" + exporter = PrometheusExporter(scenario="test") + + # First add as created + order = OrderMetadata( + order_uid="order-1", + owner="0x123", + creation_time=1000.0, + current_status=OrderStatus.CREATED, + ) + exporter._on_metric_update("order", order) + + # Then update to failed + order.current_status = OrderStatus.FAILED + exporter._on_metric_update("order", order) + + output = generate_latest(exporter.registry).decode() + assert 'cow_perf_orders_failed_total{scenario="test"} 1.0' in output + assert 'cow_perf_orders_active{scenario="test"} 0.0' in output + + def test_callback_ignores_non_order_metrics(self) -> None: + """Test callback ignores non-order metric types.""" + exporter = PrometheusExporter(scenario="test") + + # Should not raise + exporter._on_metric_update("api", {"some": "data"}) + exporter._on_metric_update("resource", {"some": "data"}) + + # Counters should still be at 
default + output = generate_latest(exporter.registry).decode() + # No increments should have happened + assert "cow_perf_orders_created_total" in output +``` + +#### 3. Integration Test + +**File**: `tests/integration/test_prometheus_integration.py` + +```python +"""Integration tests for Prometheus exporter HTTP endpoint.""" + +import socket +import time +from collections.abc import Iterator + +import pytest +import requests + +from cow_performance.prometheus.exporter import PrometheusExporter + + +def find_free_port() -> int: + """Find a free port for testing.""" + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + return s.getsockname()[1] + + +@pytest.fixture +def exporter() -> Iterator[PrometheusExporter]: + """Create and start an exporter for testing (yield fixture; teardown stops it).""" + port = find_free_port() + exp = PrometheusExporter(port=port, scenario="integration-test") + exp.start() + # Give server time to start + time.sleep(0.1) + yield exp + exp.stop() + + +class TestPrometheusIntegration: + """Integration tests for Prometheus HTTP endpoint.""" + + def test_metrics_endpoint_accessible(self, exporter: PrometheusExporter) -> None: + """Test that /metrics endpoint is accessible.""" + response = requests.get(f"http://localhost:{exporter.port}/metrics", timeout=5) + assert response.status_code == 200 + assert "text/plain" in response.headers["Content-Type"] + + def test_metrics_output_valid_prometheus_format( + self, exporter: PrometheusExporter + ) -> None: + """Test that output is valid Prometheus format.""" + response = requests.get(f"http://localhost:{exporter.port}/metrics", timeout=5) + content = response.text + + # Check for HELP and TYPE comments + assert "# HELP cow_perf_" in content + assert "# TYPE cow_perf_" in content + + # Check for expected metric families + assert "cow_perf_orders_created_total" in content + assert "cow_perf_submission_latency_seconds" in content + + def test_metrics_update_reflected(self, exporter: PrometheusExporter) -> None: + """Test that metric updates are reflected
in output.""" + # Record some metrics + exporter.record_order_created() + exporter.record_order_submitted(latency_seconds=0.1) + exporter.update_throughput(orders_per_second=5.0) + + # Fetch metrics + response = requests.get(f"http://localhost:{exporter.port}/metrics", timeout=5) + content = response.text + + # Verify updates + assert 'cow_perf_orders_created_total{scenario="integration-test"} 1.0' in content + assert 'cow_perf_orders_submitted_total{scenario="integration-test"} 1.0' in content + assert 'cow_perf_orders_per_second{scenario="integration-test"} 5.0' in content + + def test_multiple_exporters_on_different_ports(self) -> None: + """Test that multiple exporters can run on different ports.""" + port1 = find_free_port() + port2 = find_free_port() + + exp1 = PrometheusExporter(port=port1, scenario="test1") + exp2 = PrometheusExporter(port=port2, scenario="test2") + + try: + exp1.start() + exp2.start() + time.sleep(0.1) + + # Both should be accessible + resp1 = requests.get(f"http://localhost:{port1}/metrics", timeout=5) + resp2 = requests.get(f"http://localhost:{port2}/metrics", timeout=5) + + assert resp1.status_code == 200 + assert resp2.status_code == 200 + assert 'scenario="test1"' in resp1.text + assert 'scenario="test2"' in resp2.text + finally: + exp1.stop() + exp2.stop() +``` + +### Success Criteria + +#### Automated Verification +- [x] `poetry run black src/ tests/` passes +- [x] `poetry run ruff check src/ tests/` passes +- [x] `poetry run mypy src/` passes +- [x] `poetry run pytest` passes (all tests including new prometheus tests) + +#### Manual Verification +- [x] `cow-perf run --help` shows `--prometheus-port` option +- [x] `cow-perf run --prometheus-port 9091 --dry-run` starts exporter and shows URL +- [x] `curl http://localhost:9091/metrics` returns valid Prometheus format with `cow_perf_*` metrics +- [ ] Prometheus UI shows `cow-performance-test` target (when docker-compose is running) + +--- + +## Testing Strategy + +### Unit Tests +- 
**MetricsRegistry**: Verify all metrics are registered with correct names and types +- **PrometheusExporter**: Test manual recording methods, callback handling, and state management + +### Integration Tests +- **HTTP Server**: Verify `/metrics` endpoint accessibility and response format +- **Real-time Updates**: Confirm metric changes are reflected in scraped output + +### Manual Testing Steps + +1. **Start exporter with dry-run test**: + ```bash + cow-perf run --prometheus-port 9091 --dry-run --duration 10 + ``` + +2. **Verify metrics endpoint**: + ```bash + curl http://localhost:9091/metrics | grep cow_perf_ + ``` + +3. **Verify Prometheus can scrape** (requires docker-compose): + ```bash + docker compose up prometheus -d + # Check targets: http://localhost:9090/targets + ``` + +4. **Run actual test and observe metrics**: + ```bash + # Terminal 1: Start test with exporter + cow-perf run --prometheus-port 9091 --duration 60 + + # Terminal 2: Watch metrics update + watch -n 2 'curl -s http://localhost:9091/metrics | grep -E "cow_perf_orders_(created|filled)_total"' + ``` + +--- + +## Performance Considerations + +- **Callback overhead**: MetricsStore callbacks are synchronous and brief; Prometheus metric updates are thread-safe and fast +- **Label cardinality**: Phase 1 uses only `scenario` label with bounded values +- **HTTP server**: Runs in daemon thread, non-blocking to test execution +- **Memory**: Prometheus client library handles metric storage efficiently + +--- + +## References + +- Original ticket: [COW-591-prometheus-exporters.md](../tickets/COW-591-prometheus-exporters.md) +- Implementation phases: [COW-591-implementation-phases.md](../tasks/COW-591-implementation-phases.md) +- PoC evaluation: [poc-evaluation.md](../research/poc-evaluation.md) +- prometheus-client docs: https://prometheus.github.io/client_python/ +- Prometheus naming conventions: https://prometheus.io/docs/practices/naming/ diff --git 
a/thoughts/plans/2026-02-06-cow-591-phase-2-prometheus-exporter.md b/thoughts/plans/2026-02-06-cow-591-phase-2-prometheus-exporter.md new file mode 100644 index 0000000..43807bc --- /dev/null +++ b/thoughts/plans/2026-02-06-cow-591-phase-2-prometheus-exporter.md @@ -0,0 +1,1124 @@ +# COW-591 Phase 2: Extended Prometheus Metrics Implementation Plan + +## Overview + +Implement the remaining Prometheus metrics for COW-591: per-trader metrics, API performance metrics, resource metrics, and baseline comparison metrics. This completes the full grant deliverable for Prometheus exporters. + +**Ticket**: [COW-591-prometheus-exporters.md](../tickets/COW-591-prometheus-exporters.md) +**Phase Reference**: [COW-591-implementation-phases.md](../tasks/COW-591-implementation-phases.md) +**Phase 1 Plan**: [2026-02-05-cow-591-phase-1-prometheus-exporter.md](./2026-02-05-cow-591-phase-1-prometheus-exporter.md) +**Enables**: COW-593 (Grafana Dashboards) depends on these metrics + +--- + +## Current State Analysis + +### What Phase 1 Implemented + +1. **MetricsRegistry** (`src/cow_performance/prometheus/metrics.py`): + - Order counters: `orders_created`, `orders_submitted`, `orders_filled`, `orders_failed`, `orders_expired`, `orders_active` + - Latency histograms: `submission_latency`, `orderbook_latency`, `settlement_latency`, `order_lifecycle` + - Throughput gauges: `orders_per_second`, `target_rate`, `actual_rate` + - Test metadata: `test_info`, `test_start_timestamp`, `test_duration_seconds`, `num_traders`, `test_progress_percent` + +2. **PrometheusExporter** (`src/cow_performance/prometheus/exporter.py`): + - HTTP server on configurable port (default 9091) + - MetricsStore callback integration for `metric_type == "order"` + - Comment at line 116: `# API and resource metrics will be handled in Phase 2` + +3. **CLI Integration** (`src/cow_performance/cli/commands/run.py`): + - `--prometheus-port` flag enables exporter during test runs + +### Existing Infrastructure for Phase 2 + +1. 
**API Metrics** (`src/cow_performance/metrics/models.py:154-179`): + - `APIMetrics` dataclass with: `endpoint`, `method`, `timestamp`, `duration`, `status_code`, `error_message` + - MetricsStore callback emits `("api", metric)` on each API call + +2. **Resource Metrics** (`src/cow_performance/metrics/models.py:183-248`): + - `ResourceSample` dataclass with: `cpu_percent`, `memory_bytes`, `network_rx_bytes`, `network_tx_bytes` + - MetricsStore callback emits `("resource", sample)` on each sample + +3. **Baseline Comparison** (`src/cow_performance/comparison/`): + - `ComparisonResult` with `metric_comparisons`, `regressions`, severity counts + - `MetricComparison` with `percent_change`, `regression_severity` + +4. **Trader Tracking**: + - `OrderMetadata.owner` contains trader Ethereum address + - Default 10 traders per test (configurable) + +--- + +## Desired End State + +After this plan is complete: + +1. **MetricsRegistry** has all Phase 2 metrics: + - API metrics: `api_requests_total`, `api_response_time_seconds`, `api_errors_total` + - Resource metrics: `container_cpu_percent`, `container_memory_bytes`, `container_network_rx_bytes`, `container_network_tx_bytes` + - Per-trader metrics: `trader_orders_submitted`, `trader_orders_filled`, `traders_active` + - Baseline comparison metrics: `baseline_comparison_percent`, `regression_detected`, `regressions_total` + +2. **PrometheusExporter** handles all callback types: + - `metric_type == "api"` → updates API metrics + - `metric_type == "resource"` → updates resource metrics + - Order callbacks also update per-trader metrics + +3. 
**Baseline comparison metrics** can be populated after a comparison is run
+
+### Verification
+
+```bash
+# Start a test with Prometheus exporter
+cow-perf run --prometheus-port 9091 --duration 60
+
+# Verify Phase 2 metrics
+curl http://localhost:9091/metrics | grep -E "cow_perf_(api|container|trader|baseline|regression)"
+
+# Expected output includes:
+# cow_perf_api_requests_total{endpoint="/api/v1/orders",method="POST",status="200"}
+# cow_perf_container_cpu_percent{container="orderbook"}
+# cow_perf_trader_orders_submitted{trader_index="0"}
+# cow_perf_traders_active
+# (baseline/regression metrics are push-style and appear only after a comparison result is recorded)
+```
+
+---
+
+## What We're NOT Doing
+
+- Grafana dashboard creation (COW-593 - separate ticket)
+- Alerting rules (COW-598 - separate ticket)
+- Docker Compose changes for exporter service
+- Changes to the MetricsStore callback system, with one small exception: the resource callback gains the container name (see Phase 2, step 3)
+
+---
+
+## Implementation Approach
+
+1. **Add metrics incrementally** - API, then resource, then per-trader, then baseline
+2. **Extend existing callback handler** - `_on_metric_update()` already has structure for multiple types
+3. **Use trader index for cardinality management** - Not full addresses (bounded to num_traders)
+4. **Baseline metrics are "push" style** - Populated explicitly after comparison, not via callback
+
+---
+
+## Phase 1: Add API Metrics
+
+### Overview
+
+Add Prometheus metrics for API request tracking. The infrastructure already exists - `InstrumentedOrderbookClient` records `APIMetrics` to `MetricsStore`, which emits `("api", metric)` callbacks.
+
+### Changes Required
+
+#### 1. 
Extend MetricsRegistry + +**File**: `src/cow_performance/prometheus/metrics.py` + +Add new initialization method after `_init_test_metadata()`: + +```python +def _init_api_metrics(self) -> None: + """Initialize API performance metrics.""" + self.api_requests_total = Counter( + "cow_perf_api_requests_total", + "Total API requests", + ["endpoint", "method", "status"], + registry=self.registry, + ) + self.api_response_time = Histogram( + "cow_perf_api_response_time_seconds", + "API response time distribution", + ["endpoint", "method"], + buckets=[0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10], + registry=self.registry, + ) + self.api_errors_total = Counter( + "cow_perf_api_errors_total", + "Total API errors by type", + ["endpoint", "error_type"], + registry=self.registry, + ) +``` + +Call this method in `__init__()`: + +```python +def __init__(self, registry: CollectorRegistry | None = None): + self.registry = registry or CollectorRegistry() + self._init_order_metrics() + self._init_latency_metrics() + self._init_throughput_metrics() + self._init_test_metadata() + self._init_api_metrics() # Add this line +``` + +#### 2. Extend PrometheusExporter Callback + +**File**: `src/cow_performance/prometheus/exporter.py` + +Add import at top: + +```python +from cow_performance.metrics.models import APIMetrics, OrderMetadata, OrderStatus +``` + +Update `_on_metric_update()` method to handle API metrics: + +```python +def _on_metric_update(self, metric_type: str, metric: object) -> None: + """ + Callback for MetricsStore updates. + + Maps incoming metrics to Prometheus metrics based on type. 
+ """ + try: + if metric_type == "order" and isinstance(metric, OrderMetadata): + self._update_order_metrics(metric) + elif metric_type == "api" and isinstance(metric, APIMetrics): + self._update_api_metrics(metric) + # Resource metrics will be handled next + except Exception as e: + logger.warning("Error updating Prometheus metric: %s", e) +``` + +Add new method for API metrics: + +```python +def _update_api_metrics(self, api_metric: APIMetrics) -> None: + """Update API-related Prometheus metrics from APIMetrics.""" + endpoint = api_metric.endpoint + method = api_metric.method + status = str(api_metric.status_code) + + # Increment request counter + self._metrics.api_requests_total.labels( + endpoint=endpoint, + method=method, + status=status, + ).inc() + + # Record response time + self._metrics.api_response_time.labels( + endpoint=endpoint, + method=method, + ).observe(api_metric.duration) + + # Track errors (non-2xx responses) + if not api_metric.is_success: + error_type = self._classify_api_error(api_metric) + self._metrics.api_errors_total.labels( + endpoint=endpoint, + error_type=error_type, + ).inc() + +def _classify_api_error(self, api_metric: APIMetrics) -> str: + """Classify API error by type.""" + status = api_metric.status_code + if 400 <= status < 500: + return "client_error" + elif 500 <= status < 600: + return "server_error" + elif api_metric.error_message: + if "timeout" in api_metric.error_message.lower(): + return "timeout" + elif "connection" in api_metric.error_message.lower(): + return "connection_error" + return "unknown" +``` + +#### 3. 
Add Manual Recording Methods + +**File**: `src/cow_performance/prometheus/exporter.py` + +Add after existing manual methods: + +```python +# --- API Recording Methods --- + +def record_api_request( + self, + endpoint: str, + method: str, + status_code: int, + duration_seconds: float, +) -> None: + """Record an API request.""" + self._metrics.api_requests_total.labels( + endpoint=endpoint, + method=method, + status=str(status_code), + ).inc() + self._metrics.api_response_time.labels( + endpoint=endpoint, + method=method, + ).observe(duration_seconds) + +def record_api_error(self, endpoint: str, error_type: str) -> None: + """Record an API error.""" + self._metrics.api_errors_total.labels( + endpoint=endpoint, + error_type=error_type, + ).inc() +``` + +### Success Criteria + +- [x] `poetry run mypy src/cow_performance/prometheus/` passes +- [x] `poetry run ruff check src/cow_performance/prometheus/` passes + +--- + +## Phase 2: Add Resource Metrics + +### Overview + +Add Prometheus metrics for container resource monitoring. The `ResourceMonitor` already collects samples and emits `("resource", sample)` callbacks via MetricsStore. + +### Changes Required + +#### 1. 
Extend MetricsRegistry + +**File**: `src/cow_performance/prometheus/metrics.py` + +Add new initialization method: + +```python +def _init_resource_metrics(self) -> None: + """Initialize container resource metrics.""" + self.container_cpu_percent = Gauge( + "cow_perf_container_cpu_percent", + "Container CPU usage percentage", + ["container"], + registry=self.registry, + ) + self.container_memory_bytes = Gauge( + "cow_perf_container_memory_bytes", + "Container memory usage in bytes", + ["container"], + registry=self.registry, + ) + self.container_network_rx_bytes = Gauge( + "cow_perf_container_network_rx_bytes", + "Container network bytes received", + ["container"], + registry=self.registry, + ) + self.container_network_tx_bytes = Gauge( + "cow_perf_container_network_tx_bytes", + "Container network bytes transmitted", + ["container"], + registry=self.registry, + ) +``` + +Call in `__init__()`: + +```python +def __init__(self, registry: CollectorRegistry | None = None): + self.registry = registry or CollectorRegistry() + self._init_order_metrics() + self._init_latency_metrics() + self._init_throughput_metrics() + self._init_test_metadata() + self._init_api_metrics() + self._init_resource_metrics() # Add this line +``` + +#### 2. Extend PrometheusExporter Callback + +**File**: `src/cow_performance/prometheus/exporter.py` + +Add import: + +```python +from cow_performance.metrics.models import APIMetrics, OrderMetadata, OrderStatus, ResourceSample +``` + +Update `_on_metric_update()`: + +```python +def _on_metric_update(self, metric_type: str, metric: object) -> None: + """ + Callback for MetricsStore updates. + + Maps incoming metrics to Prometheus metrics based on type. 
+    """
+    try:
+        if metric_type == "order" and isinstance(metric, OrderMetadata):
+            self._update_order_metrics(metric)
+        elif metric_type == "api" and isinstance(metric, APIMetrics):
+            self._update_api_metrics(metric)
+        elif metric_type == "resource":
+            self._update_resource_metrics(metric)
+    except Exception as e:
+        logger.warning("Error updating Prometheus metric: %s", e)
+```
+
+Add new method:
+
+```python
+def _update_resource_metrics(self, metric: object) -> None:
+    """Update resource-related Prometheus metrics.
+
+    Note: MetricsStore emits (container_name, sample) tuple for resource metrics.
+    """
+    # Handle tuple format from MetricsStore.add_resource_sample callback
+    if isinstance(metric, tuple) and len(metric) == 2:
+        container_name, sample = metric
+        if isinstance(sample, ResourceSample):
+            self._metrics.container_cpu_percent.labels(
+                container=container_name
+            ).set(sample.cpu_percent)
+            self._metrics.container_memory_bytes.labels(
+                container=container_name
+            ).set(sample.memory_bytes)
+            self._metrics.container_network_rx_bytes.labels(
+                container=container_name
+            ).set(sample.network_rx_bytes)
+            self._metrics.container_network_tx_bytes.labels(
+                container=container_name
+            ).set(sample.network_tx_bytes)
+```
+
+**Note**: As currently written, `store.py:239` calls `_notify_callbacks("resource", sample)` — it passes only the `ResourceSample`, with no container name. The handler above expects a `(container_name, sample)` tuple, so MetricsStore must be updated first (step 3 below).
+
+#### 3. Verify MetricsStore Callback Format
+
+**File**: `src/cow_performance/metrics/store.py`
+
+Check line 239 - the callback receives `("resource", sample)` but `sample` is just `ResourceSample`, not including container_name.
+
+**Fix needed**: Update the callback to pass container name.
This requires a small change to MetricsStore: + +**File**: `src/cow_performance/metrics/store.py` + +Find `add_resource_sample()` method and update the callback notification: + +```python +def add_resource_sample(self, container_name: str, sample: ResourceSample) -> None: + """Add a resource sample for a container.""" + # ... existing code ... + + # Change this line: + # self._notify_callbacks("resource", sample) + # To include container_name: + self._notify_callbacks("resource", (container_name, sample)) +``` + +This is a minor interface change but necessary for Prometheus to know which container the sample belongs to. + +#### 4. Add Manual Recording Methods + +**File**: `src/cow_performance/prometheus/exporter.py` + +```python +# --- Resource Recording Methods --- + +def update_container_resources( + self, + container: str, + cpu_percent: float, + memory_bytes: int, + network_rx_bytes: int = 0, + network_tx_bytes: int = 0, +) -> None: + """Update resource metrics for a container.""" + self._metrics.container_cpu_percent.labels(container=container).set(cpu_percent) + self._metrics.container_memory_bytes.labels(container=container).set(memory_bytes) + self._metrics.container_network_rx_bytes.labels(container=container).set(network_rx_bytes) + self._metrics.container_network_tx_bytes.labels(container=container).set(network_tx_bytes) +``` + +### Success Criteria + +- [x] `poetry run mypy src/cow_performance/` passes +- [x] `poetry run ruff check src/cow_performance/` passes + +--- + +## Phase 3: Add Per-Trader Metrics + +### Overview + +Add Prometheus metrics for per-trader order tracking. To manage label cardinality, use trader index (0, 1, 2, ...) instead of full Ethereum addresses. + +### Cardinality Management Strategy + +**Approach**: Use trader index as label value instead of full address. 
+ +- Default tests have 10 traders → 10 label values +- Max reasonable tests might have 100 traders → 100 label values +- This keeps cardinality bounded and predictable + +**Trade-off**: Loses direct address visibility, but: +- Address can be looked up from test logs if needed +- Index provides sufficient granularity for analysis +- Prometheus scraping remains efficient + +### Changes Required + +#### 1. Extend MetricsRegistry + +**File**: `src/cow_performance/prometheus/metrics.py` + +Add new initialization method: + +```python +def _init_trader_metrics(self) -> None: + """Initialize per-trader metrics. + + Note: Uses trader_index (0, 1, 2, ...) instead of full addresses + to manage label cardinality. Default tests have ~10 traders. + """ + self.trader_orders_submitted = Counter( + "cow_perf_trader_orders_submitted", + "Orders submitted per trader", + ["trader_index"], + registry=self.registry, + ) + self.trader_orders_filled = Counter( + "cow_perf_trader_orders_filled", + "Orders filled per trader", + ["trader_index"], + registry=self.registry, + ) + self.traders_active = Gauge( + "cow_perf_traders_active", + "Count of currently active traders", + registry=self.registry, + ) +``` + +Call in `__init__()`: + +```python +def __init__(self, registry: CollectorRegistry | None = None): + self.registry = registry or CollectorRegistry() + self._init_order_metrics() + self._init_latency_metrics() + self._init_throughput_metrics() + self._init_test_metadata() + self._init_api_metrics() + self._init_resource_metrics() + self._init_trader_metrics() # Add this line +``` + +#### 2. 
Extend PrometheusExporter for Trader Tracking + +**File**: `src/cow_performance/prometheus/exporter.py` + +Add trader tracking state in `__init__()`: + +```python +def __init__( + self, + port: int = DEFAULT_PORT, + scenario: str = "default", +): + self.port = port + self.scenario = scenario + self._metrics = MetricsRegistry() + self._running = False + self._store: MetricsStore | None = None + self._active_orders: set[str] = set() + + # Trader tracking (Phase 2) + self._trader_address_to_index: dict[str, str] = {} + self._active_traders: set[str] = set() # Set of trader indices with active orders + self._orders_by_trader: dict[str, set[str]] = {} # trader_index -> set of order_uids +``` + +Update `_update_order_metrics()` to also update trader metrics: + +```python +def _update_order_metrics(self, order: OrderMetadata) -> None: + """Update order-related Prometheus metrics from OrderMetadata.""" + status = order.current_status + scenario = self.scenario + + # Get or assign trader index + trader_index = self._get_trader_index(order.owner) + + # Track active orders + if status == OrderStatus.CREATED: + self._metrics.orders_created.labels(scenario=scenario).inc() + self._active_orders.add(order.order_uid) + + # Update per-trader tracking + self._metrics.trader_orders_submitted.labels(trader_index=trader_index).inc() + if trader_index not in self._orders_by_trader: + self._orders_by_trader[trader_index] = set() + self._orders_by_trader[trader_index].add(order.order_uid) + self._active_traders.add(trader_index) + self._metrics.traders_active.set(len(self._active_traders)) + + elif status == OrderStatus.SUBMITTED: + # ... existing code unchanged ... + + elif status in (OrderStatus.ACCEPTED, OrderStatus.OPEN): + # ... existing code unchanged ... 
+ + elif status == OrderStatus.FILLED: + self._metrics.orders_filled.labels(scenario=scenario).inc() + self._active_orders.discard(order.order_uid) + + # Update per-trader tracking + self._metrics.trader_orders_filled.labels(trader_index=trader_index).inc() + self._remove_order_from_trader(trader_index, order.order_uid) + + # ... rest of existing code for latencies ... + + elif status == OrderStatus.FAILED: + self._metrics.orders_failed.labels(scenario=scenario).inc() + self._active_orders.discard(order.order_uid) + self._remove_order_from_trader(trader_index, order.order_uid) + + elif status == OrderStatus.EXPIRED: + self._metrics.orders_expired.labels(scenario=scenario).inc() + self._active_orders.discard(order.order_uid) + self._remove_order_from_trader(trader_index, order.order_uid) + + elif status == OrderStatus.CANCELLED: + self._active_orders.discard(order.order_uid) + self._remove_order_from_trader(trader_index, order.order_uid) + + # Update active orders gauge + self._metrics.orders_active.labels(scenario=scenario).set(len(self._active_orders)) +``` + +Add helper methods: + +```python +def _get_trader_index(self, owner_address: str) -> str: + """Get or assign a trader index for an address. + + Uses sequential indices (0, 1, 2, ...) to manage label cardinality. 
+ """ + if owner_address not in self._trader_address_to_index: + index = len(self._trader_address_to_index) + self._trader_address_to_index[owner_address] = str(index) + return self._trader_address_to_index[owner_address] + +def _remove_order_from_trader(self, trader_index: str, order_uid: str) -> None: + """Remove an order from trader tracking and update active traders.""" + if trader_index in self._orders_by_trader: + self._orders_by_trader[trader_index].discard(order_uid) + # If trader has no more active orders, remove from active set + if not self._orders_by_trader[trader_index]: + self._active_traders.discard(trader_index) + self._metrics.traders_active.set(len(self._active_traders)) +``` + +#### 3. Add Manual Recording Methods + +```python +# --- Trader Recording Methods --- + +def record_trader_order_submitted(self, trader_index: int) -> None: + """Record an order submission for a trader.""" + self._metrics.trader_orders_submitted.labels(trader_index=str(trader_index)).inc() + +def record_trader_order_filled(self, trader_index: int) -> None: + """Record an order fill for a trader.""" + self._metrics.trader_orders_filled.labels(trader_index=str(trader_index)).inc() + +def set_active_traders(self, count: int) -> None: + """Set the count of active traders.""" + self._metrics.traders_active.set(count) +``` + +### Success Criteria + +- [x] `poetry run mypy src/cow_performance/prometheus/` passes +- [x] `poetry run ruff check src/cow_performance/prometheus/` passes + +--- + +## Phase 4: Add Baseline Comparison Metrics + +### Overview + +Add Prometheus metrics for baseline comparison results. These are "push" metrics - populated explicitly after a comparison is run, not via MetricsStore callbacks. + +### Changes Required + +#### 1. 
Extend MetricsRegistry + +**File**: `src/cow_performance/prometheus/metrics.py` + +Add new initialization method: + +```python +def _init_comparison_metrics(self) -> None: + """Initialize baseline comparison metrics.""" + self.baseline_comparison_percent = Gauge( + "cow_perf_baseline_comparison_percent", + "Percentage change from baseline (positive = increase)", + ["metric", "baseline_id"], + registry=self.registry, + ) + self.regression_detected = Gauge( + "cow_perf_regression_detected", + "Count of detected regressions by severity", + ["severity"], + registry=self.registry, + ) + self.regressions_total = Counter( + "cow_perf_regressions_total", + "Total regressions detected by severity", + ["severity"], + registry=self.registry, + ) +``` + +Call in `__init__()`: + +```python +def __init__(self, registry: CollectorRegistry | None = None): + self.registry = registry or CollectorRegistry() + self._init_order_metrics() + self._init_latency_metrics() + self._init_throughput_metrics() + self._init_test_metadata() + self._init_api_metrics() + self._init_resource_metrics() + self._init_trader_metrics() + self._init_comparison_metrics() # Add this line +``` + +#### 2. Add Comparison Recording Methods to PrometheusExporter + +**File**: `src/cow_performance/prometheus/exporter.py` + +Add import: + +```python +from cow_performance.comparison.models import ComparisonResult, RegressionSeverity +``` + +Add methods: + +```python +# --- Baseline Comparison Methods --- + +def record_comparison_result(self, result: ComparisonResult) -> None: + """Record metrics from a baseline comparison result. + + This populates comparison metrics from a ComparisonResult object, + typically called after running a baseline comparison. 
+ """ + baseline_id = result.baseline_id + + # Record percentage changes for each metric comparison + for metric_name, comparison in result.metric_comparisons.items(): + self._metrics.baseline_comparison_percent.labels( + metric=metric_name, + baseline_id=baseline_id, + ).set(comparison.percent_change * 100) # Convert to percentage + + # Record regression counts by severity + self._metrics.regression_detected.labels(severity="critical").set(result.critical_count) + self._metrics.regression_detected.labels(severity="major").set(result.major_count) + self._metrics.regression_detected.labels(severity="minor").set(result.minor_count) + + # Increment total regression counters + for _ in range(result.critical_count): + self._metrics.regressions_total.labels(severity="critical").inc() + for _ in range(result.major_count): + self._metrics.regressions_total.labels(severity="major").inc() + for _ in range(result.minor_count): + self._metrics.regressions_total.labels(severity="minor").inc() + +def set_baseline_comparison( + self, + metric_name: str, + baseline_id: str, + percent_change: float, +) -> None: + """Set a single baseline comparison metric.""" + self._metrics.baseline_comparison_percent.labels( + metric=metric_name, + baseline_id=baseline_id, + ).set(percent_change) + +def set_regression_counts( + self, + critical: int = 0, + major: int = 0, + minor: int = 0, +) -> None: + """Set regression detection counts.""" + self._metrics.regression_detected.labels(severity="critical").set(critical) + self._metrics.regression_detected.labels(severity="major").set(major) + self._metrics.regression_detected.labels(severity="minor").set(minor) +``` + +### Success Criteria + +- [x] `poetry run mypy src/cow_performance/prometheus/` passes +- [x] `poetry run ruff check src/cow_performance/prometheus/` passes + +--- + +## Phase 5: Update Module Exports + +### Overview + +Update `__init__.py` to export any new types needed by consumers. 
+ +### Changes Required + +**File**: `src/cow_performance/prometheus/__init__.py` + +```python +"""Prometheus metrics exporter for CoW Protocol performance testing.""" + +from cow_performance.prometheus.exporter import PrometheusExporter +from cow_performance.prometheus.metrics import MetricsRegistry + +__all__ = ["PrometheusExporter", "MetricsRegistry"] +``` + +No changes needed - exports remain the same. + +--- + +## Phase 6: Write Tests + +### Overview + +Add unit tests for all new Phase 2 metrics and update integration tests. + +### Changes Required + +#### 1. Update Unit Tests for MetricsRegistry + +**File**: `tests/unit/prometheus/test_metrics.py` + +Add tests for new metrics: + +```python +class TestMetricsRegistryPhase2: + """Tests for Phase 2 metrics in MetricsRegistry.""" + + def test_api_metrics_exist(self) -> None: + """Test that all API metrics are registered.""" + metrics = MetricsRegistry() + output = generate_latest(metrics.registry).decode() + + assert "cow_perf_api_requests_total" in output + assert "cow_perf_api_response_time_seconds" in output + assert "cow_perf_api_errors_total" in output + + def test_resource_metrics_exist(self) -> None: + """Test that all resource metrics are registered.""" + metrics = MetricsRegistry() + output = generate_latest(metrics.registry).decode() + + assert "cow_perf_container_cpu_percent" in output + assert "cow_perf_container_memory_bytes" in output + assert "cow_perf_container_network_rx_bytes" in output + assert "cow_perf_container_network_tx_bytes" in output + + def test_trader_metrics_exist(self) -> None: + """Test that all per-trader metrics are registered.""" + metrics = MetricsRegistry() + output = generate_latest(metrics.registry).decode() + + assert "cow_perf_trader_orders_submitted" in output + assert "cow_perf_trader_orders_filled" in output + assert "cow_perf_traders_active" in output + + def test_comparison_metrics_exist(self) -> None: + """Test that all comparison metrics are registered.""" + metrics 
= MetricsRegistry() + output = generate_latest(metrics.registry).decode() + + assert "cow_perf_baseline_comparison_percent" in output + assert "cow_perf_regression_detected" in output + assert "cow_perf_regressions_total" in output + + def test_api_request_counter(self) -> None: + """Test API request counter with labels.""" + metrics = MetricsRegistry() + metrics.api_requests_total.labels( + endpoint="/api/v1/orders", + method="POST", + status="200", + ).inc() + + output = generate_latest(metrics.registry).decode() + assert 'cow_perf_api_requests_total{endpoint="/api/v1/orders",method="POST",status="200"} 1.0' in output + + def test_api_response_time_histogram(self) -> None: + """Test API response time histogram.""" + metrics = MetricsRegistry() + metrics.api_response_time.labels( + endpoint="/api/v1/orders", + method="POST", + ).observe(0.15) + + output = generate_latest(metrics.registry).decode() + assert "cow_perf_api_response_time_seconds_bucket" in output + assert "cow_perf_api_response_time_seconds_sum" in output + + def test_container_resource_gauges(self) -> None: + """Test container resource gauges.""" + metrics = MetricsRegistry() + metrics.container_cpu_percent.labels(container="orderbook").set(45.5) + metrics.container_memory_bytes.labels(container="orderbook").set(1024 * 1024 * 512) + + output = generate_latest(metrics.registry).decode() + assert 'cow_perf_container_cpu_percent{container="orderbook"} 45.5' in output + assert 'cow_perf_container_memory_bytes{container="orderbook"}' in output + + def test_trader_counter_with_index(self) -> None: + """Test per-trader counter using index.""" + metrics = MetricsRegistry() + metrics.trader_orders_submitted.labels(trader_index="0").inc() + metrics.trader_orders_submitted.labels(trader_index="0").inc() + metrics.trader_orders_submitted.labels(trader_index="1").inc() + + output = generate_latest(metrics.registry).decode() + assert 'cow_perf_trader_orders_submitted{trader_index="0"} 2.0' in output + assert 
'cow_perf_trader_orders_submitted{trader_index="1"} 1.0' in output +``` + +#### 2. Update Unit Tests for PrometheusExporter + +**File**: `tests/unit/prometheus/test_exporter.py` + +Add tests for Phase 2 functionality: + +```python +class TestPrometheusExporterPhase2: + """Tests for Phase 2 exporter functionality.""" + + def test_record_api_request(self) -> None: + """Test API request recording.""" + exporter = PrometheusExporter(scenario="test") + exporter.record_api_request( + endpoint="/api/v1/orders", + method="POST", + status_code=200, + duration_seconds=0.15, + ) + + output = generate_latest(exporter.registry).decode() + assert 'cow_perf_api_requests_total{endpoint="/api/v1/orders",method="POST",status="200"} 1.0' in output + + def test_record_api_error(self) -> None: + """Test API error recording.""" + exporter = PrometheusExporter(scenario="test") + exporter.record_api_error(endpoint="/api/v1/orders", error_type="server_error") + + output = generate_latest(exporter.registry).decode() + assert 'cow_perf_api_errors_total{endpoint="/api/v1/orders",error_type="server_error"} 1.0' in output + + def test_update_container_resources(self) -> None: + """Test container resource updates.""" + exporter = PrometheusExporter(scenario="test") + exporter.update_container_resources( + container="orderbook", + cpu_percent=45.5, + memory_bytes=536870912, + network_rx_bytes=1024000, + network_tx_bytes=512000, + ) + + output = generate_latest(exporter.registry).decode() + assert 'cow_perf_container_cpu_percent{container="orderbook"} 45.5' in output + assert 'cow_perf_container_memory_bytes{container="orderbook"} 536870912' in output + + def test_trader_index_assignment(self) -> None: + """Test that trader addresses get sequential indices.""" + exporter = PrometheusExporter(scenario="test") + + # Simulate orders from different traders + idx1 = exporter._get_trader_index("0xAAA") + idx2 = exporter._get_trader_index("0xBBB") + idx3 = exporter._get_trader_index("0xAAA") # Same as 
first + + assert idx1 == "0" + assert idx2 == "1" + assert idx3 == "0" # Same address gets same index + + def test_active_traders_tracking(self) -> None: + """Test active traders gauge updates.""" + exporter = PrometheusExporter(scenario="test") + + # Create orders from two traders + order1 = OrderMetadata( + order_uid="order-1", + owner="0xAAA", + creation_time=1000.0, + current_status=OrderStatus.CREATED, + ) + order2 = OrderMetadata( + order_uid="order-2", + owner="0xBBB", + creation_time=1000.0, + current_status=OrderStatus.CREATED, + ) + + exporter._on_metric_update("order", order1) + exporter._on_metric_update("order", order2) + + output = generate_latest(exporter.registry).decode() + assert "cow_perf_traders_active 2.0" in output + + # Fill one order + order1.current_status = OrderStatus.FILLED + order1.completion_time = 1030.0 + exporter._on_metric_update("order", order1) + + output = generate_latest(exporter.registry).decode() + assert "cow_perf_traders_active 1.0" in output + + def test_set_regression_counts(self) -> None: + """Test regression count setting.""" + exporter = PrometheusExporter(scenario="test") + exporter.set_regression_counts(critical=1, major=2, minor=3) + + output = generate_latest(exporter.registry).decode() + assert 'cow_perf_regression_detected{severity="critical"} 1.0' in output + assert 'cow_perf_regression_detected{severity="major"} 2.0' in output + assert 'cow_perf_regression_detected{severity="minor"} 3.0' in output + + +class TestPrometheusExporterAPICallback: + """Tests for API callback handling.""" + + def test_callback_handles_api_metrics(self) -> None: + """Test callback processes APIMetrics correctly.""" + exporter = PrometheusExporter(scenario="test") + + api_metric = APIMetrics( + endpoint="/api/v1/orders", + method="POST", + timestamp=time.time(), + duration=0.25, + status_code=200, + ) + exporter._on_metric_update("api", api_metric) + + output = generate_latest(exporter.registry).decode() + assert 
'cow_perf_api_requests_total{endpoint="/api/v1/orders",method="POST",status="200"} 1.0' in output + + def test_callback_classifies_errors(self) -> None: + """Test that non-2xx responses are classified as errors.""" + exporter = PrometheusExporter(scenario="test") + + api_metric = APIMetrics( + endpoint="/api/v1/orders", + method="POST", + timestamp=time.time(), + duration=0.5, + status_code=500, + error_message="Internal server error", + ) + exporter._on_metric_update("api", api_metric) + + output = generate_latest(exporter.registry).decode() + assert 'cow_perf_api_errors_total{endpoint="/api/v1/orders",error_type="server_error"} 1.0' in output +``` + +#### 3. Add import for time module + +**File**: `tests/unit/prometheus/test_exporter.py` + +Add at top: + +```python +import time + +from cow_performance.metrics.models import APIMetrics, OrderMetadata, OrderStatus +``` + +### Success Criteria + +#### Automated Verification + +- [x] `poetry run black src/ tests/` passes +- [x] `poetry run ruff check src/ tests/` passes +- [x] `poetry run mypy src/` passes +- [x] `poetry run pytest tests/unit/prometheus/` passes (all new tests) +- [x] `poetry run pytest` passes (full test suite) + +#### Manual Verification + +- [x] `cow-perf run --prometheus-port 9091 --dry-run` starts exporter +- [x] `curl http://localhost:9091/metrics | grep cow_perf_api_` shows API metrics +- [x] `curl http://localhost:9091/metrics | grep cow_perf_container_` shows resource metrics +- [x] `curl http://localhost:9091/metrics | grep cow_perf_trader_` shows trader metrics +- [x] `curl http://localhost:9091/metrics | grep cow_perf_baseline_` shows comparison metrics + +--- + +## Testing Strategy + +### Unit Tests + +- **MetricsRegistry**: Verify all Phase 2 metrics are registered with correct names, types, and labels +- **PrometheusExporter**: Test manual recording methods, callback handling for API/resource types, trader index assignment + +### Integration Tests + +- **HTTP Server**: Verify new 
metrics appear in `/metrics` output +- **End-to-end**: Run a short test and verify API metrics are populated from actual API calls + +### Manual Testing Steps + +1. **Start test with Prometheus exporter**: + ```bash + cow-perf run --prometheus-port 9091 --duration 30 + ``` + +2. **In another terminal, verify metrics**: + ```bash + # API metrics + curl -s http://localhost:9091/metrics | grep "cow_perf_api_" + + # Resource metrics (requires docker services running) + curl -s http://localhost:9091/metrics | grep "cow_perf_container_" + + # Per-trader metrics + curl -s http://localhost:9091/metrics | grep "cow_perf_trader_" + ``` + +3. **Verify Prometheus can scrape** (requires docker-compose with monitoring profile): + ```bash + docker compose --profile monitoring up -d + # Check http://localhost:9090/targets for cow-performance-test target + ``` + +--- + +## Performance Considerations + +- **API metrics**: Each API call triggers callback; Prometheus increments are O(1) +- **Resource metrics**: Sampled every 5s by ResourceMonitor; bounded by number of containers (~5) +- **Per-trader metrics**: Bounded by num_traders (default 10, max ~100); uses index not address +- **Baseline comparison**: Push-based, only called after explicit comparison; not high-frequency + +--- + +## Summary of Files Modified + +| File | Changes | +|------|---------| +| `src/cow_performance/prometheus/metrics.py` | Add 4 new `_init_*` methods for API, resource, trader, comparison metrics | +| `src/cow_performance/prometheus/exporter.py` | Add callback handlers, helper methods, manual recording methods | +| `src/cow_performance/metrics/store.py` | Update resource callback to include container_name (minor) | +| `tests/unit/prometheus/test_metrics.py` | Add `TestMetricsRegistryPhase2` class | +| `tests/unit/prometheus/test_exporter.py` | Add `TestPrometheusExporterPhase2`, `TestPrometheusExporterAPICallback` classes | + +--- + +## References + +- Original ticket: 
[COW-591-prometheus-exporters.md](../tickets/COW-591-prometheus-exporters.md) +- Implementation phases: [COW-591-implementation-phases.md](../tasks/COW-591-implementation-phases.md) +- Phase 1 plan: [2026-02-05-cow-591-phase-1-prometheus-exporter.md](./2026-02-05-cow-591-phase-1-prometheus-exporter.md) +- Enables: COW-593 (Grafana Dashboards) +- prometheus-client docs: https://prometheus.github.io/client_python/ diff --git a/thoughts/plans/2026-02-06-cow-593-grafana-dashboards-task1.md b/thoughts/plans/2026-02-06-cow-593-grafana-dashboards-task1.md new file mode 100644 index 0000000..528283a --- /dev/null +++ b/thoughts/plans/2026-02-06-cow-593-grafana-dashboards-task1.md @@ -0,0 +1,444 @@ +# COW-593 Task 1: Grafana Dashboards Implementation Plan + +## Overview + +Create the essential Grafana dashboards for performance testing: **Performance Overview** and **API Performance**. These dashboards visualize metrics exposed by COW-591 Prometheus exporter, providing real-time visibility into test execution. + +**Scope**: Task 1 only (2 points). Task 2 (Resources, Comparison, Trader Activity dashboards) is tracked separately in `thoughts/tasks/COW-593-remaining-dashboards.md`. 
+ +## Current State Analysis + +**What exists:** +- Grafana service configured in `docker-compose.yml` (port 3000, monitoring profile) +- Datasource provisioning: `configs/grafana-datasource.yml` (Prometheus at http://prometheus:9090) +- Dashboard provisioning: `configs/grafana-dashboard.yml` (expects dashboards at `/etc/grafana/dashboards/`) +- **No dashboard JSON files exist** — the `configs/` directory has provisioning configs but no actual dashboards + +**What COW-591 exposes (available metrics):** +- Order counters: `cow_perf_orders_created_total`, `cow_perf_orders_submitted_total`, `cow_perf_orders_filled_total`, `cow_perf_orders_failed_total`, `cow_perf_orders_expired_total` +- Order gauge: `cow_perf_orders_active` +- Latency histograms: `cow_perf_submission_latency_seconds`, `cow_perf_orderbook_latency_seconds`, `cow_perf_settlement_latency_seconds`, `cow_perf_order_lifecycle_seconds` +- Throughput gauges: `cow_perf_orders_per_second`, `cow_perf_target_rate`, `cow_perf_actual_rate` +- Test metadata: `cow_perf_test_info`, `cow_perf_test_start_timestamp`, `cow_perf_test_duration_seconds`, `cow_perf_num_traders`, `cow_perf_test_progress_percent` +- API metrics: `cow_perf_api_requests_total`, `cow_perf_api_response_time_seconds`, `cow_perf_api_errors_total` + +**Key discovery:** +- Docker-compose sets `GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH=/etc/grafana/dashboards/performance.json` +- The main dashboard file MUST be named `performance.json` to be the home dashboard + +### Key Discoveries: + +- `configs/grafana-dashboard.yml:14` - Provisioning expects dashboards at `/etc/grafana/dashboards` +- `docker-compose.yml:260` - Home dashboard path is `/etc/grafana/dashboards/performance.json` +- `docker-compose.yml:264-265` - Volume mounts for provisioning configs exist, but no dashboard volume mount +- `src/cow_performance/prometheus/metrics.py` - All metric definitions with labels + +## Desired End State + +After this plan is complete: +1. 
`configs/dashboards/` directory exists with dashboard JSON files +2. `performance.json` serves as the Overview dashboard and Grafana home +3. `api-performance.json` provides detailed API metrics visualization +4. Both dashboards auto-load when Grafana starts +5. Dashboard variables (`scenario`) allow filtering +6. All panels display data when a performance test is running with `--prometheus-port` + +### Verification: + +```bash +# 1. Start monitoring stack +docker compose --profile monitoring up -d + +# 2. Run a test with Prometheus exporter +poetry run cow-perf run --scenario light-load --prometheus-port 9091 + +# 3. Open Grafana and verify dashboards display data +open http://localhost:3000 +``` + +## What We're NOT Doing + +- Task 2 dashboards (Resources, Comparison, Trader Activity) — separate task +- Alerting integration — COW-598 is deferred +- Dashboard links/navigation between dashboards — Task 2 +- Documentation screenshots — human action after implementation + +--- + +## Phase 1: Dashboard Infrastructure Setup + +### Overview + +Create the directory structure and update provisioning configuration so Grafana can load dashboard JSON files. + +### Changes Required: + +#### 1. Create Dashboard Directory + +**Action**: Create `configs/dashboards/` directory + +```bash +mkdir -p configs/dashboards +``` + +#### 2. Update Docker Compose Volume Mount + +**File**: `docker-compose.yml` +**Changes**: Add volume mount for dashboards directory + +The Grafana service needs a volume mount to access the dashboard JSON files. Currently only provisioning configs are mounted. + +Add this volume mount to the grafana service volumes section: +```yaml +- ./configs/dashboards:/etc/grafana/dashboards:ro +``` + +#### 3. Verify Provisioning Config + +**File**: `configs/grafana-dashboard.yml` +**Status**: Already correct — points to `/etc/grafana/dashboards` + +No changes needed. The existing config will work once dashboards are in place. 
+ +### Success Criteria: + +#### Automated Verification: + +- [x] Directory exists: `ls configs/dashboards/` +- [x] Docker compose validates: `docker compose config --quiet` + +#### Manual Verification: + +- [ ] Grafana starts without errors: `docker compose --profile monitoring up -d grafana` +- [ ] No provisioning errors in logs: `docker compose logs grafana | grep -i error` + +--- + +## Phase 2: Performance Overview Dashboard + +### Overview + +Create the main performance testing dashboard (`performance.json`) that serves as Grafana's home dashboard. This provides at-a-glance visibility into test execution. + +### Changes Required: + +#### 1. Create Performance Overview Dashboard + +**File**: `configs/dashboards/performance.json` +**Changes**: Create new Grafana dashboard JSON + +The dashboard should include these rows and panels: + +**Row 1: Test Overview** +- Stat panel: Current scenario name (from `cow_perf_test_info`) +- Stat panel: Test duration / elapsed time +- Stat panel: Number of traders (`cow_perf_num_traders`) +- Gauge panel: Test progress percentage (`cow_perf_test_progress_percent`) + +**Row 2: Order Submission Rate** +- Time series: Orders submitted per second (actual vs target rate) + - Use `cow_perf_orders_per_second`, `cow_perf_target_rate`, `cow_perf_actual_rate` +- Time series: Cumulative orders over time + - Use `cow_perf_orders_created_total`, `cow_perf_orders_submitted_total` +- Gauge: Submission rate achievement (actual/target × 100%) + +**Row 3: Latency Distribution** +- Heatmap: Submission latency distribution + - Use `cow_perf_submission_latency_seconds_bucket` +- Heatmap: Settlement latency distribution + - Use `cow_perf_settlement_latency_seconds_bucket` +- Time series: P50, P90, P95, P99 latencies over time + - Use histogram_quantile() on latency histograms + +**Row 4: Order Status** +- Pie chart: Order status distribution (filled, failed, expired, active) + - Use order counters and gauge +- Stat panel: Success rate percentage 
(filled / submitted × 100%) + - Color thresholds: green >95%, yellow >90%, red <90% +- Stat panel: Total orders submitted (with sparkline) +- Stat panel: Total orders filled (with sparkline) + +**Dashboard Variables:** +- `scenario`: Query variable from `label_values(cow_perf_orders_created_total, scenario)` + +**Dashboard Settings:** +- Title: "CoW Performance Testing - Overview" +- UID: `cow-perf-overview` +- Tags: `["cow-protocol", "performance-testing"]` +- Refresh: 5s auto-refresh +- Time range: Last 15 minutes default + +### Success Criteria: + +#### Automated Verification: + +- [x] File exists: `ls configs/dashboards/performance.json` +- [x] Valid JSON: `python -m json.tool configs/dashboards/performance.json > /dev/null` +- [x] Linting passes (no code changes to src/) + +#### Manual Verification: + +- [ ] Dashboard loads in Grafana without errors +- [ ] Dashboard is set as home dashboard +- [ ] All 4 rows visible with panels +- [ ] Scenario variable dropdown populated (when test running) +- [ ] Panels show data during active test + +--- + +## Phase 3: API Performance Dashboard + +### Overview + +Create a detailed API performance dashboard (`api-performance.json`) for monitoring orderbook API interactions during performance tests. + +### Changes Required: + +#### 1. Create API Performance Dashboard + +**File**: `configs/dashboards/api-performance.json` +**Changes**: Create new Grafana dashboard JSON + +The dashboard should include these rows and panels: + +**Row 1: API Response Times** +- Time series: API response time by endpoint + - Use `cow_perf_api_response_time_seconds` +- Heatmap: Response time distribution + - Use `cow_perf_api_response_time_seconds_bucket` +- Stat panels: P50, P95, P99 response times + - Use histogram_quantile() + +**Row 2: API Throughput** +- Time series: Requests per second by endpoint + - Use rate() on `cow_perf_api_requests_total` +- Stat panel: Total requests +- Time series: Requests by HTTP method (GET, POST, etc.) 
+ +**Row 3: API Errors** +- Time series: Error rate over time + - Use rate() on `cow_perf_api_errors_total` +- Pie chart: Errors by type (client_error, server_error, timeout, connection_error) +- Stat panel: Total errors +- Table: Error breakdown by endpoint and type + +**Dashboard Variables:** +- `scenario`: Query variable (same as overview dashboard) +- `endpoint`: Query variable from `label_values(cow_perf_api_requests_total, endpoint)` +- `method`: Query variable from `label_values(cow_perf_api_requests_total, method)` + +**Dashboard Settings:** +- Title: "CoW Performance Testing - API Performance" +- UID: `cow-perf-api` +- Tags: `["cow-protocol", "performance-testing", "api"]` +- Refresh: 5s auto-refresh +- Time range: Last 15 minutes default + +### Success Criteria: + +#### Automated Verification: + +- [x] File exists: `ls configs/dashboards/api-performance.json` +- [x] Valid JSON: `python -m json.tool configs/dashboards/api-performance.json > /dev/null` + +#### Manual Verification: + +- [ ] Dashboard loads in Grafana without errors +- [ ] All 3 rows visible with panels +- [ ] Variables (scenario, endpoint, method) populated when test running +- [ ] Filtering by endpoint works correctly + +--- + +## Testing Strategy + +### Unit Tests: + +No unit tests required — dashboard JSON files are configuration, not code. + +### Integration Tests: + +The dashboards will be validated through manual testing (see Human Testing Section below). + +### Automated Validation: + +```bash +# Validate JSON syntax +python -m json.tool configs/dashboards/performance.json > /dev/null +python -m json.tool configs/dashboards/api-performance.json > /dev/null + +# Validate docker-compose +docker compose config --quiet +``` + +--- + +## Human Testing Section + +After implementation is complete, follow these steps to verify the dashboards work correctly. + +### Prerequisites + +1. COW-591 Prometheus exporter must be working +2. Docker and Docker Compose installed +3. 
Poetry environment set up (`poetry install`) + +### Step 1: Start the Monitoring Stack + +```bash +# Start Grafana and Prometheus +docker compose --profile monitoring up -d + +# Verify services are running +docker compose ps + +# Expected output should show grafana and prometheus as "running" or "healthy" +``` + +### Step 2: Verify Grafana Loads + +```bash +# Open Grafana in browser +open http://localhost:3000 + +# OR use curl to check +curl -s http://localhost:3000/api/health +# Expected: {"commit":"...","database":"ok","version":"..."} +``` + +**Check:** +- [ ] Grafana loads without login (anonymous access enabled) +- [ ] Home dashboard shows "CoW Performance Testing - Overview" +- [ ] No error banners or provisioning errors + +### Step 3: Check Dashboard List + +1. In Grafana, click the hamburger menu (☰) → Dashboards +2. Look for dashboards in the list + +**Check:** +- [ ] "CoW Performance Testing - Overview" appears +- [ ] "CoW Performance Testing - API Performance" appears + +### Step 4: Run a Performance Test with Prometheus + +```bash +# In a new terminal, run a test with Prometheus exporter enabled +poetry run cow-perf run --scenario light-load --prometheus-port 9091 + +# The test should start and you should see output indicating Prometheus exporter started +``` + +**Check:** +- [ ] Test starts without Prometheus-related errors +- [ ] Log shows "Prometheus exporter started on port 9091" (or similar) + +### Step 5: Verify Prometheus Scraping + +```bash +# Check Prometheus is scraping the exporter +open http://localhost:9090/targets + +# OR use curl +curl -s http://localhost:9090/api/v1/targets | grep cow-performance +``` + +**Check:** +- [ ] `cow-performance-test` job shows State: `UP` +- [ ] Last scrape is recent (within last 10 seconds) + +### Step 6: Verify Overview Dashboard Data + +1. Go back to Grafana (http://localhost:3000) +2. Open the Overview dashboard (should be home) +3. 
Set time range to "Last 5 minutes" + +**Check:** +- [ ] Test Overview row shows scenario name, duration, traders +- [ ] Order Submission Rate row shows time series with data points +- [ ] Latency Distribution row shows heatmaps with color (if orders submitted) +- [ ] Order Status row shows pie chart and stats with values > 0 + +### Step 7: Verify API Performance Dashboard Data + +1. In Grafana, go to Dashboards → "CoW Performance Testing - API Performance" +2. Set time range to "Last 5 minutes" + +**Check:** +- [ ] API Response Times row shows response time data +- [ ] API Throughput row shows requests per second +- [ ] API Errors row shows error counts (may be 0 if no errors) +- [ ] Variables (endpoint, method) have options in dropdowns + +### Step 8: Test Dashboard Variables + +1. On either dashboard, look for variable dropdowns at the top +2. Click the `scenario` dropdown + +**Check:** +- [ ] Dropdown shows the current scenario name (e.g., "light-load") +- [ ] Selecting different values (if multiple tests ran) filters data + +### Step 9: Verify Dashboards Persist After Restart + +```bash +# Restart Grafana +docker compose restart grafana + +# Wait for it to come back up +sleep 10 + +# Open Grafana again +open http://localhost:3000 +``` + +**Check:** +- [ ] Dashboards still exist after restart +- [ ] Home dashboard is still the Overview dashboard +- [ ] Historical data from before restart is still visible + +### Step 10: Stop Test and Cleanup + +```bash +# Stop the performance test (Ctrl+C in the test terminal) + +# Optionally stop monitoring stack +docker compose --profile monitoring down +``` + +### Troubleshooting + +**Dashboard shows "No data":** +- Ensure test is running with `--prometheus-port 9091` +- Check Prometheus targets: http://localhost:9090/targets +- Verify metrics exist: `curl http://localhost:9091/metrics | grep cow_perf` + +**Dashboard has red error panels:** +- Check Grafana logs: `docker compose logs grafana` +- Verify Prometheus datasource: 
Grafana → Connections → Data sources → Prometheus → Test + +**Variables empty:** +- Metrics may not have been scraped yet — wait 10-15 seconds +- Check that the test has actually submitted orders + +--- + +## Files to Create/Modify + +| File | Action | Description | +|------|--------|-------------| +| `configs/dashboards/` | Create | New directory for dashboard JSON files | +| `configs/dashboards/performance.json` | Create | Performance Overview dashboard (home) | +| `configs/dashboards/api-performance.json` | Create | API Performance dashboard | +| `docker-compose.yml` | Modify | Add volume mount for dashboards directory | + +--- + +## References + +- Original ticket: `thoughts/tickets/COW-593-grafana-dashboards.md` +- Task 2 (remaining dashboards): `thoughts/tasks/COW-593-remaining-dashboards.md` +- PoC patterns: `thoughts/research/poc-evaluation.md` +- Prometheus metrics: `src/cow_performance/prometheus/metrics.py` +- M3 validation: `thoughts/validations/m3-validation.md` +- Grafana provisioning docs: https://grafana.com/docs/grafana/latest/administration/provisioning/ diff --git a/thoughts/plans/2026-02-10-cow-593-task2-remaining-dashboards.md b/thoughts/plans/2026-02-10-cow-593-task2-remaining-dashboards.md new file mode 100644 index 0000000..7aea6a7 --- /dev/null +++ b/thoughts/plans/2026-02-10-cow-593-task2-remaining-dashboards.md @@ -0,0 +1,691 @@ +# COW-593 Task 2: Remaining Dashboards Implementation Plan + +## Overview + +Implement the three remaining dashboards for COW-593 that complete the Grafana visualization suite: + +1. **Resource Utilization Dashboard** (`resources.json`) — Container CPU, memory, network monitoring +2. **Comparison Dashboard** (`comparison.json`) — Baseline vs current test comparison with regression indicators +3. **Trader Activity Dashboard** (`trader-activity.json`) — Per-trader statistics and activity patterns + +All three dashboards are grant deliverables required for COW-593 completion. 
+ +## Current State Analysis + +### What Exists + +1. **Two dashboards already implemented** (Task 1): + - `configs/dashboards/performance.json` — Overview dashboard with 20 panels + - `configs/dashboards/api-performance.json` — API Performance dashboard with 14 panels + +2. **Dashboard provisioning configured**: + - `configs/grafana-dashboard.yml` — Points to `configs/dashboards/` + - `configs/grafana-datasource.yml` — Prometheus datasource configured + +3. **All required metrics available** (COW-591 Phase 2 complete): + - Resource metrics: `cow_perf_container_cpu_percent`, `cow_perf_container_memory_bytes`, `cow_perf_container_network_rx_bytes`, `cow_perf_container_network_tx_bytes` + - Comparison metrics: `cow_perf_baseline_comparison_percent`, `cow_perf_regression_detected`, `cow_perf_regressions_total` + - Trader metrics: `cow_perf_trader_orders_submitted`, `cow_perf_trader_orders_filled`, `cow_perf_traders_active` + +### Key Discoveries + +- **Trader metrics use `trader_index`** (0, 1, 2...) not `trader_address` for cardinality management (`metrics.py:227-249`) +- **Comparison metrics are "push" metrics** — populated explicitly via `record_comparison_result()` after a comparison runs +- **Existing dashboards use consistent patterns**: `schemaVersion: 38`, `pluginVersion: "10.0.0"`, `refresh: "5s"`, tags `["cow-protocol", "performance-testing"]` +- **Variable pattern**: `{scenario=~\"$scenario\"}` regex match for filtering +- **Panel IDs**: Start at 1 per dashboard (not global), use sequential numbering +- **Navigation links**: API dashboard has link back to Overview using URL `/d/cow-perf-overview` + +## Desired End State + +After this plan is complete: + +1. Three new dashboard JSON files exist in `configs/dashboards/`: + - `resources.json` — Resource Utilization Dashboard + - `comparison.json` — Comparison Dashboard + - `trader-activity.json` — Trader Activity Dashboard + +2. 
All dashboards: + - Follow existing panel patterns and conventions + - Have working variables for filtering + - Include navigation links to other dashboards + - Auto-load when Grafana starts with the monitoring profile + +3. Dashboard navigation is complete: + - All 5 dashboards link to each other + - Existing dashboards (performance.json, api-performance.json) updated with full navigation + +### Verification + +- All dashboards load without errors in Grafana +- Panels display data when metrics are available +- Variables filter data correctly +- Navigation links work between all dashboards + +## What We're NOT Doing + +- **Not implementing alerting** — That's COW-598 +- **Not modifying Prometheus exporter** — Metrics already exist +- **Not adding new metrics** — Using existing COW-591 Phase 2 metrics +- **Not changing provisioning configuration** — Already set up for `configs/dashboards/` +- **Not creating documentation** — Will be done after all dashboards are tested + +--- + +## Implementation Approach + +**Strategy**: Create each dashboard following the exact patterns from existing dashboards (`performance.json`, `api-performance.json`), ensuring: +- Consistent JSON structure +- Proper gridPos layout (24-unit grid width) +- Standard panel configurations +- Working variable queries + +**Order**: +1. Resources Dashboard (simplest, clearest metric mapping) +2. Trader Activity Dashboard (moderate complexity, per-trader breakdown) +3. Comparison Dashboard (most complex, requires baseline selection) +4. Navigation Links Update (cross-dashboard navigation) + +--- + +## Phase 1: Resource Utilization Dashboard + +### Overview + +Create `configs/dashboards/resources.json` with CPU, memory, and network monitoring for all containers. 
+ +### Dashboard Metadata + +```json +{ + "title": "CoW Performance Testing - Resources", + "uid": "cow-perf-resources", + "tags": ["cow-protocol", "performance-testing", "resources"], + "refresh": "5s", + "schemaVersion": 38 +} +``` + +### Variables + +| Name | Label | Query | Multi | +|------|-------|-------|-------| +| `scenario` | Scenario | `label_values(cow_perf_orders_created_total, scenario)` | false | +| `container` | Container | `label_values(cow_perf_container_cpu_percent, container)` | true | + +### Panel Layout + +**Row 1: CPU Usage (y: 0)** + +| Panel | Type | GridPos | Query | +|-------|------|---------|-------| +| Row: CPU Usage | row | h:1, w:24, x:0, y:0 | - | +| CPU by Container | timeseries | h:8, w:12, x:0, y:1 | `cow_perf_container_cpu_percent{container=~"$container"}` | +| Current CPU | gauge | h:8, w:6, x:12, y:1 | `cow_perf_container_cpu_percent{container=~"$container"}` (per container) | +| Peak CPU | stat | h:4, w:6, x:18, y:1 | `max_over_time(cow_perf_container_cpu_percent{container=~"$container"}[1h])` | +| Avg CPU | stat | h:4, w:6, x:18, y:5 | `avg_over_time(cow_perf_container_cpu_percent{container=~"$container"}[$__range])` | + +**Row 2: Memory Usage (y: 9)** + +| Panel | Type | GridPos | Query | +|-------|------|---------|-------| +| Row: Memory Usage | row | h:1, w:24, x:0, y:9 | - | +| Memory by Container | timeseries | h:8, w:12, x:0, y:10 | `cow_perf_container_memory_bytes{container=~"$container"}` | +| Current Memory | gauge | h:8, w:6, x:12, y:10 | `cow_perf_container_memory_bytes{container=~"$container"}` (per container) | +| Peak Memory | stat | h:4, w:6, x:18, y:10 | `max_over_time(cow_perf_container_memory_bytes{container=~"$container"}[1h])` | +| Avg Memory | stat | h:4, w:6, x:18, y:14 | `avg_over_time(cow_perf_container_memory_bytes{container=~"$container"}[$__range])` | + +**Row 3: Network I/O (y: 18)** + +| Panel | Type | GridPos | Query | +|-------|------|---------|-------| +| Row: Network I/O | row | h:1, w:24, x:0, 
y:18 | - | +| Network RX | timeseries | h:8, w:12, x:0, y:19 | `rate(cow_perf_container_network_rx_bytes{container=~"$container"}[1m])` | +| Network TX | timeseries | h:8, w:12, x:12, y:19 | `rate(cow_perf_container_network_tx_bytes{container=~"$container"}[1m])` | + +**Row 4: Resource Summary (y: 27)** + +| Panel | Type | GridPos | Query | +|-------|------|---------|-------| +| Row: Summary | row | h:1, w:24, x:0, y:27 | - | +| Total RX | stat | h:4, w:6, x:0, y:28 | `sum(cow_perf_container_network_rx_bytes{container=~"$container"})` | +| Total TX | stat | h:4, w:6, x:6, y:28 | `sum(cow_perf_container_network_tx_bytes{container=~"$container"})` | +| Resource Table | table | h:8, w:12, x:12, y:28 | All metrics by container | + +### Key Panel Configurations + +**CPU Timeseries**: +```json +{ + "fieldConfig": { + "defaults": { + "unit": "percent", + "max": 100, + "min": 0 + } + } +} +``` + +**Memory Timeseries**: +```json +{ + "fieldConfig": { + "defaults": { + "unit": "bytes" + } + } +} +``` + +**Network Rate**: +```json +{ + "fieldConfig": { + "defaults": { + "unit": "Bps" + } + } +} +``` + +**CPU/Memory Thresholds** (gauge panels): +```json +{ + "thresholds": { + "mode": "percentage", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 90 } + ] + } +} +``` + +### Navigation Links + +```json +{ + "links": [ + { + "title": "Overview", + "url": "/d/cow-perf-overview", + "includeVars": true, + "keepTime": true + }, + { + "title": "API Performance", + "url": "/d/cow-perf-api", + "includeVars": true, + "keepTime": true + } + ] +} +``` + +### Success Criteria + +#### Automated Verification + +- [x] JSON is valid: `python -m json.tool configs/dashboards/resources.json` +- [x] Linting passes: `poetry run ruff check .` +- [x] File exists at correct path + +#### Manual Verification + +- [ ] Dashboard loads in Grafana at `/d/cow-perf-resources` +- [ ] Container variable populates with container names +- [ ] 
CPU, memory, network panels display data during a test run +- [ ] Gauge thresholds show correct colors +- [ ] Navigation links work + +--- + +## Phase 2: Trader Activity Dashboard + +### Overview + +Create `configs/dashboards/trader-activity.json` with per-trader statistics and activity patterns. + +### Dashboard Metadata + +```json +{ + "title": "CoW Performance Testing - Trader Activity", + "uid": "cow-perf-traders", + "tags": ["cow-protocol", "performance-testing", "traders"], + "refresh": "5s", + "schemaVersion": 38 +} +``` + +### Variables + +| Name | Label | Query | Multi | +|------|-------|-------|-------| +| `scenario` | Scenario | `label_values(cow_perf_orders_created_total, scenario)` | false | +| `top_n` | Top N | Custom: 5, 10, 20, 50 | false | + +### Panel Layout + +**Row 1: Trader Overview (y: 0)** + +| Panel | Type | GridPos | Query | +|-------|------|---------|-------| +| Row: Overview | row | h:1, w:24, x:0, y:0 | - | +| Active Traders | stat | h:4, w:6, x:0, y:1 | `cow_perf_traders_active` | +| Total Traders | stat | h:4, w:6, x:6, y:1 | `cow_perf_num_traders{scenario=~"$scenario"}` | +| Avg Orders/Trader | stat | h:4, w:6, x:12, y:1 | `sum(cow_perf_trader_orders_submitted) / cow_perf_num_traders{scenario=~"$scenario"}` | +| Fill Rate | stat | h:4, w:6, x:18, y:1 | `sum(cow_perf_trader_orders_filled) / sum(cow_perf_trader_orders_submitted) * 100` | + +**Row 2: Top Traders (y: 5)** + +| Panel | Type | GridPos | Query | +|-------|------|---------|-------| +| Row: Top Traders | row | h:1, w:24, x:0, y:5 | - | +| Orders Submitted (Bar) | timeseries (bars) | h:8, w:12, x:0, y:6 | `topk($top_n, cow_perf_trader_orders_submitted)` | +| Orders Filled (Bar) | timeseries (bars) | h:8, w:12, x:12, y:6 | `topk($top_n, cow_perf_trader_orders_filled)` | + +**Row 3: Trader Activity Over Time (y: 14)** + +| Panel | Type | GridPos | Query | +|-------|------|---------|-------| +| Row: Activity | row | h:1, w:24, x:0, y:14 | - | +| Active Traders Over Time | 
timeseries | h:8, w:12, x:0, y:15 | `cow_perf_traders_active` | +| Submission Rate by Trader | timeseries | h:8, w:12, x:12, y:15 | `topk($top_n, rate(cow_perf_trader_orders_submitted[1m]))` | + +**Row 4: Trader Distribution (y: 23)** + +| Panel | Type | GridPos | Query | +|-------|------|---------|-------| +| Row: Distribution | row | h:1, w:24, x:0, y:23 | - | +| Order Distribution | piechart | h:8, w:8, x:0, y:24 | `topk($top_n, cow_perf_trader_orders_submitted)` | +| Success Rate by Trader | table | h:8, w:16, x:8, y:24 | Submitted, Filled, Rate % by trader_index | + +### Key Panel Configurations + +**Bar Chart for Top Traders**: +```json +{ + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "bars", + "fillOpacity": 80, + "stacking": { "mode": "none" } + } + } + } +} +``` + +**Top N Variable**: +```json +{ + "name": "top_n", + "type": "custom", + "query": "5,10,20,50", + "current": { "text": "10", "value": "10" } +} +``` + +**Trader Table Transformations**: +```json +{ + "transformations": [ + { + "id": "organize", + "options": { + "renameByName": { + "trader_index": "Trader", + "Value #A": "Submitted", + "Value #B": "Filled", + "Value #C": "Success %" + } + } + } + ] +} +``` + +### Navigation Links + +Same pattern as Resources dashboard, linking to Overview, API, and Resources. + +### Success Criteria + +#### Automated Verification + +- [x] JSON is valid: `python -m json.tool configs/dashboards/trader-activity.json` +- [x] File exists at correct path + +#### Manual Verification + +- [ ] Dashboard loads in Grafana at `/d/cow-perf-traders` +- [ ] Top N variable allows selection +- [ ] Bar charts show top traders correctly +- [ ] Trader table shows all columns +- [ ] Pie chart shows distribution + +--- + +## Phase 3: Comparison Dashboard + +### Overview + +Create `configs/dashboards/comparison.json` for baseline vs current test comparison with regression indicators. 
+ +### Dashboard Metadata + +```json +{ + "title": "CoW Performance Testing - Comparison", + "uid": "cow-perf-comparison", + "tags": ["cow-protocol", "performance-testing", "comparison"], + "refresh": "5s", + "schemaVersion": 38 +} +``` + +### Variables + +| Name | Label | Query | Multi | +|------|-------|-------|-------| +| `scenario` | Scenario | `label_values(cow_perf_orders_created_total, scenario)` | false | +| `baseline_id` | Baseline | `label_values(cow_perf_baseline_comparison_percent, baseline_id)` | false | + +### Panel Layout + +**Row 1: Comparison Overview (y: 0)** + +| Panel | Type | GridPos | Query | +|-------|------|---------|-------| +| Row: Overview | row | h:1, w:24, x:0, y:0 | - | +| Baseline ID | stat | h:4, w:6, x:0, y:1 | `cow_perf_test_info{baseline_id=~".+"}` (extract baseline label) | +| Overall Verdict | stat | h:4, w:6, x:6, y:1 | Based on regression count (color-coded) | +| Total Regressions | stat | h:4, w:6, x:12, y:1 | `sum(cow_perf_regressions_total)` | +| Critical Count | stat | h:4, w:6, x:18, y:1 | `cow_perf_regression_detected{severity="critical"}` | + +**Row 2: Latency Comparison (y: 5)** + +| Panel | Type | GridPos | Query | +|-------|------|---------|-------| +| Row: Latency | row | h:1, w:24, x:0, y:5 | - | +| Submission Latency Delta | stat | h:4, w:6, x:0, y:6 | `cow_perf_baseline_comparison_percent{metric="submission_latency_p95"}` | +| Settlement Latency Delta | stat | h:4, w:6, x:6, y:6 | `cow_perf_baseline_comparison_percent{metric="settlement_latency_p95"}` | +| Comparison Over Time | timeseries | h:8, w:12, x:12, y:6 | Multiple `cow_perf_baseline_comparison_percent` by metric | + +**Row 3: Throughput Comparison (y: 14)** + +| Panel | Type | GridPos | Query | +|-------|------|---------|-------| +| Row: Throughput | row | h:1, w:24, x:0, y:14 | - | +| Orders/Second Delta | stat | h:4, w:6, x:0, y:15 | `cow_perf_baseline_comparison_percent{metric="orders_per_second"}` | +| Success Rate Delta | stat | h:4, w:6, x:6, y:15 
| `cow_perf_baseline_comparison_percent{metric="success_rate"}` | +| Throughput Trend | timeseries | h:8, w:12, x:12, y:15 | Actual rate comparison | + +**Row 4: Regression Details (y: 23)** + +| Panel | Type | GridPos | Query | +|-------|------|---------|-------| +| Row: Regressions | row | h:1, w:24, x:0, y:23 | - | +| Critical Regressions | stat | h:4, w:4, x:0, y:24 | `cow_perf_regression_detected{severity="critical"}` | +| Major Regressions | stat | h:4, w:4, x:4, y:24 | `cow_perf_regression_detected{severity="major"}` | +| Minor Regressions | stat | h:4, w:4, x:8, y:24 | `cow_perf_regression_detected{severity="minor"}` | +| Regression Table | table | h:8, w:12, x:12, y:24 | All comparison metrics with deltas | + +### Key Panel Configurations + +**Delta Stat Panel** (positive = worse for latency): +```json +{ + "fieldConfig": { + "defaults": { + "unit": "percent", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 5 }, + { "color": "red", "value": 15 } + ] + } + } + }, + "options": { + "graphMode": "none" + } +} +``` + +**Delta Stat Panel** (positive = better for throughput): +```json +{ + "fieldConfig": { + "defaults": { + "unit": "percent", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": -5 }, + { "color": "green", "value": 0 } + ] + } + } + } +} +``` + +**Regression Severity Colors**: +- Critical: `red` +- Major: `orange` +- Minor: `yellow` + +**Value Mappings for Verdict**: +```json +{ + "mappings": [ + { "type": "value", "options": { "0": { "text": "No Regressions", "color": "green" } } }, + { "type": "range", "options": { "from": 1, "to": 5, "result": { "text": "Minor Issues", "color": "yellow" } } }, + { "type": "range", "options": { "from": 6, "to": 999, "result": { "text": "Regressions Detected", "color": "red" } } } + ] +} +``` + +### Important Note + +The comparison dashboard will only show data when a 
baseline comparison has been run via the CLI (`cow-perf compare`). When no comparison data exists, panels will show "No data" which is expected behavior. + +### Success Criteria + +#### Automated Verification + +- [x] JSON is valid: `python -m json.tool configs/dashboards/comparison.json` +- [x] File exists at correct path + +#### Manual Verification + +- [ ] Dashboard loads in Grafana at `/d/cow-perf-comparison` +- [ ] Baseline variable populates after a comparison is run +- [ ] Delta panels show correct % change +- [ ] Color coding reflects regression severity +- [ ] Table shows all metrics with comparison data + +--- + +## Phase 4: Dashboard Navigation Links + +### Overview + +Update all 5 dashboards to include consistent navigation links to each other. + +### Link Configuration + +Each dashboard should have links to all other dashboards: + +```json +{ + "links": [ + { + "asDropdown": false, + "icon": "dashboard", + "includeVars": true, + "keepTime": true, + "tags": [], + "targetBlank": false, + "title": "Overview", + "tooltip": "Go to Overview Dashboard", + "type": "link", + "url": "/d/cow-perf-overview" + }, + { + "asDropdown": false, + "icon": "dashboard", + "includeVars": true, + "keepTime": true, + "tags": [], + "targetBlank": false, + "title": "API", + "tooltip": "Go to API Performance Dashboard", + "type": "link", + "url": "/d/cow-perf-api" + }, + { + "asDropdown": false, + "icon": "dashboard", + "includeVars": true, + "keepTime": true, + "tags": [], + "targetBlank": false, + "title": "Resources", + "tooltip": "Go to Resources Dashboard", + "type": "link", + "url": "/d/cow-perf-resources" + }, + { + "asDropdown": false, + "icon": "dashboard", + "includeVars": true, + "keepTime": true, + "tags": [], + "targetBlank": false, + "title": "Comparison", + "tooltip": "Go to Comparison Dashboard", + "type": "link", + "url": "/d/cow-perf-comparison" + }, + { + "asDropdown": false, + "icon": "dashboard", + "includeVars": true, + "keepTime": true, + "tags": [], + 
"targetBlank": false, + "title": "Traders", + "tooltip": "Go to Trader Activity Dashboard", + "type": "link", + "url": "/d/cow-perf-traders" + } + ] +} +``` + +### Files to Update + +1. `configs/dashboards/performance.json` — Add full links array (currently empty) +2. `configs/dashboards/api-performance.json` — Update links array (currently only has Overview link) +3. `configs/dashboards/resources.json` — Include links on creation +4. `configs/dashboards/comparison.json` — Include links on creation +5. `configs/dashboards/trader-activity.json` — Include links on creation + +### Success Criteria + +#### Automated Verification + +- [x] All 5 dashboard JSON files are valid +- [x] Each dashboard has 4 links (excluding self to avoid redundancy) + +#### Manual Verification + +- [ ] Navigation links appear in dashboard header +- [ ] Clicking links navigates to correct dashboard +- [ ] Variables are preserved across navigation +- [ ] Time range is preserved across navigation + +--- + +## Testing Strategy + +### Unit Tests + +No unit tests needed — dashboards are JSON configuration files. + +### Integration Tests + +Dashboard JSON validation is covered by Grafana's provisioning system. If JSON is invalid, Grafana will fail to load. + +### Manual Testing Steps + +1. **Start the monitoring stack**: + ```bash + docker compose --profile monitoring up -d + ``` + +2. **Run a performance test** (to generate metrics): + ```bash + poetry run cow-perf test --scenario medium_load --prometheus-port 9091 + ``` + +3. **Verify dashboards in Grafana** (http://localhost:3000): + - Navigate to each dashboard + - Verify panels show data + - Test variable selection + - Test navigation links + +4. **Run a comparison** (for Comparison dashboard): + ```bash + poetry run cow-perf baseline save --name test-baseline + poetry run cow-perf test --scenario medium_load + poetry run cow-perf compare --baseline test-baseline + ``` + - Verify Comparison dashboard shows delta values + +5. 
**Take screenshots** for documentation (after manual verification passes) + +--- + +## Files to Create/Modify + +### New Files + +``` +configs/dashboards/ +├── resources.json # NEW (Phase 1) +├── comparison.json # NEW (Phase 3) +└── trader-activity.json # NEW (Phase 2) +``` + +### Modified Files + +``` +configs/dashboards/ +├── performance.json # UPDATE: Add navigation links (Phase 4) +└── api-performance.json # UPDATE: Expand navigation links (Phase 4) + +thoughts/ +├── INDEX.md # UPDATE: Add this plan +└── tasks/COW-593-remaining-dashboards.md # UPDATE: Mark items complete as implemented +``` + +--- + +## References + +- Parent ticket: `thoughts/tickets/COW-593-grafana-dashboards.md` +- Task 2 specification: `thoughts/tasks/COW-593-remaining-dashboards.md` +- PoC patterns: `thoughts/research/poc-evaluation.md` +- Task 1 plan: `thoughts/plans/2026-02-06-cow-593-grafana-dashboards-task1.md` +- Existing dashboards: `configs/dashboards/performance.json`, `configs/dashboards/api-performance.json` +- Metrics implementation: `src/cow_performance/prometheus/metrics.py` diff --git a/thoughts/plans/2026-02-13-cow-598-alerting-rules.md b/thoughts/plans/2026-02-13-cow-598-alerting-rules.md new file mode 100644 index 0000000..a61a801 --- /dev/null +++ b/thoughts/plans/2026-02-13-cow-598-alerting-rules.md @@ -0,0 +1,661 @@ +# COW-598: Alerting Rules Implementation Plan + +## Overview + +Implement Prometheus alerting rules for the CoW Performance Testing Suite. This plan delivers 7 core alerts that notify developers when performance degrades, error rates spike, or resource utilization exceeds thresholds during performance testing. Alert parameters are organized for easy modification with TODO references to COW-617 for future configurability. + +**Ticket**: [COW-598-alerting-rules.md](../tickets/COW-598-alerting-rules.md) +**Grant Requirement**: M3 - Metrics & Visualization includes "Alerting rules" + +--- + +## Current State Analysis + +### What Already Exists + +1. 
**Prometheus Configuration** (`configs/prometheus.yml:73-81`): + - Alert rules section is commented out (`rule_files:` and `alerting:`) + - Evaluation interval already set to 5s (suitable for alerting) + - External labels configured (`monitor`, `environment`) + +2. **All Required Metrics Exist** (`src/cow_performance/prometheus/metrics.py`): + | Alert Category | Metrics Available | + |----------------|-------------------| + | Latency | `cow_perf_submission_latency_seconds_bucket` | + | Error Rate | `cow_perf_orders_failed_total`, `cow_perf_orders_submitted_total` | + | Throughput | `cow_perf_actual_rate`, `cow_perf_target_rate` | + | Resources | `cow_perf_container_cpu_percent`, `cow_perf_container_memory_bytes` | + | Test State | `cow_perf_test_progress_percent`, `cow_perf_orders_submitted_total` | + +3. **Docker Infrastructure** (`docker-compose.yml:235-248`): + - Prometheus service exists with volume mount at `/etc/prometheus/` + - Currently only mounts `prometheus.yml`, not an alerts directory + +4. **Grafana Dashboards** (`configs/dashboards/`): + - 5 dashboards exist: performance, api-performance, comparison, resources, trader-activity + - No alert annotations configured + +### Key Discoveries + +- Prometheus in Docker expects config at `/etc/prometheus/prometheus.yml` +- Volume mount pattern: `./configs/prometheus.yml:/etc/prometheus/prometheus.yml:ro` +- Need to mount alerts directory separately or extend the mount +- `cow_perf_container_memory_limit_bytes` metric does NOT exist - will use absolute threshold instead + +--- + +## Desired End State + +After this plan is complete: + +1. A new `configs/prometheus/alerts/performance-testing.yml` file exists with: + - Clear parameter documentation section at the top + - TODO(COW-617) references for future configurability + - 7 core alerting rules + +2. Prometheus loads and evaluates the alert rules every 5 seconds + +3. 
Alerts are visible in: + - Prometheus UI at `/alerts` + - Grafana dashboards via annotations + +4. All alert parameters are easy to find and modify in one place + +### Verification + +```bash +# Start monitoring stack +docker compose --profile monitoring up -d + +# Run a test to generate metrics +cow-perf run --prometheus-port 9091 --duration 120 + +# Check alerts in Prometheus UI +open http://localhost:9090/alerts + +# Verify alert rules loaded +curl -s http://localhost:9090/api/v1/rules | jq '.data.groups[].name' +# Expected: "cow_performance_testing" +``` + +--- + +## What We're NOT Doing + +1. **Alertmanager** - No notification channels (Slack, email, webhook) +2. **Full alert scope** - Only 7 core alerts, not the 15+ in the ticket +3. **Configurable thresholds** - Hardcoded with TODO(COW-617) for future work +4. **Alert testing framework** - Manual testing only +5. **Regression alerts** - `cow_perf_regression_detected` metric exists but regression alerts are lower priority + +--- + +## Implementation Approach + +1. **Create alerts directory structure** - New `configs/prometheus/alerts/` directory +2. **Parameter-first design** - All thresholds documented at top of file +3. **Mount alerts in Docker** - Update docker-compose volume mounts +4. **Enable rule loading in Prometheus** - Uncomment and configure `rule_files:` +5. **Add Grafana annotations** - Show firing alerts on dashboards + +--- + +## Phase 1: Create Alert Rules Directory Structure + +### Overview + +Create the directory structure for Prometheus alert rules and the main alert rules file with parameter documentation. + +### Changes Required + +#### 1. Create Alerts Directory + +```bash +mkdir -p configs/prometheus/alerts +``` + +#### 2. 
Create Alert Rules File + +**File**: `configs/prometheus/alerts/performance-testing.yml` + +```yaml +# ============================================================================= +# CoW Performance Testing Suite - Prometheus Alert Rules +# ============================================================================= +# +# This file defines alerting rules for the CoW Performance Testing Suite. +# Alerts are evaluated by Prometheus and can be viewed in the Prometheus UI +# or visualized in Grafana dashboards. +# +# ============================================================================= +# ALERT PARAMETERS - Edit values here for easy customization +# ============================================================================= +# +# TODO(COW-617): Move these thresholds to configurable TOML/env variables +# +# LATENCY THRESHOLDS (seconds): +# submission_latency_warning_threshold: 5 # P95 > 5s triggers warning +# submission_latency_critical_threshold: 10 # P95 > 10s triggers critical +# +# ERROR RATE THRESHOLDS (decimal, where 0.05 = 5%): +# error_rate_critical_threshold: 0.05 # > 5% error rate +# +# THROUGHPUT THRESHOLDS (ratio, where 0.8 = 80%): +# throughput_low_threshold: 0.8 # < 80% of target rate +# +# RESOURCE THRESHOLDS (percentage): +# cpu_warning_threshold: 80 # CPU > 80% +# memory_critical_threshold: 95 # Memory > 95% +# +# ALERT DURATIONS (prevents flapping): +# latency_warning_for: 2m +# latency_critical_for: 1m +# error_rate_for: 1m +# throughput_for: 2m +# cpu_for: 5m +# memory_for: 2m +# test_stalled_for: 1m +# +# ============================================================================= + +groups: + - name: cow_performance_testing + # Evaluation interval inherited from global config (5s) + rules: + # ========================================================================= + # LATENCY ALERTS + # ========================================================================= + + # High Submission Latency (Warning) + # Triggers when P95 submission 
latency exceeds warning threshold + - alert: HighSubmissionLatency + expr: | + histogram_quantile(0.95, + sum(rate(cow_perf_submission_latency_seconds_bucket[1m])) by (le, scenario) + ) > 5 + for: 2m + labels: + severity: warning + component: cow-performance-testing + category: latency + annotations: + summary: "High submission latency detected" + description: "P95 submission latency is {{ $value | printf \"%.2f\" }}s (threshold: 5s) for scenario {{ $labels.scenario }}" + runbook: "Check API logs, verify network connectivity, review recent code changes" + + # Critical Submission Latency (Critical) + # Triggers when P95 submission latency exceeds critical threshold + - alert: CriticalSubmissionLatency + expr: | + histogram_quantile(0.95, + sum(rate(cow_perf_submission_latency_seconds_bucket[1m])) by (le, scenario) + ) > 10 + for: 1m + labels: + severity: critical + component: cow-performance-testing + category: latency + annotations: + summary: "Critical submission latency - immediate attention required" + description: "P95 submission latency is {{ $value | printf \"%.2f\" }}s (threshold: 10s) for scenario {{ $labels.scenario }}" + runbook: "Immediate action: Check API health, container resources, database connections" + + # ========================================================================= + # ERROR RATE ALERTS + # ========================================================================= + + # High Error Rate (Critical) + # Triggers when order failure rate exceeds threshold + - alert: HighErrorRate + expr: | + ( + sum(rate(cow_perf_orders_failed_total[5m])) by (scenario) + / + sum(rate(cow_perf_orders_submitted_total[5m])) by (scenario) + ) > 0.05 + for: 1m + labels: + severity: critical + component: cow-performance-testing + category: errors + annotations: + summary: "High error rate detected" + description: "Error rate is {{ $value | humanizePercentage }} (threshold: 5%) for scenario {{ $labels.scenario }}" + runbook: "Check order validation errors, API 
error responses, contract state"
+
+      # =========================================================================
+      # THROUGHPUT ALERTS
+      # =========================================================================
+
+      # Low Throughput (Warning)
+      # Triggers when actual throughput falls below target
+      - alert: LowThroughput
+        expr: |
+          (
+            cow_perf_actual_rate
+            /
+            cow_perf_target_rate
+          ) < 0.8
+          and cow_perf_target_rate > 0
+        for: 2m
+        labels:
+          severity: warning
+          component: cow-performance-testing
+          category: throughput
+        annotations:
+          summary: "Low throughput - not meeting target rate"
+          description: "Actual throughput is {{ $value | humanizePercentage }} of target for scenario {{ $labels.scenario }}"
+          runbook: "Check for bottlenecks: API rate limits, network latency, resource constraints"
+
+      # =========================================================================
+      # TEST EXECUTION ALERTS
+      # =========================================================================
+
+      # Test Stalled (Critical)
+      # Triggers when no orders are being submitted during an active test
+      # Note: the rate is aggregated by scenario and matched with on(scenario),
+      # because a plain `and` requires identical label sets and the counter
+      # carries an extra order_type label - the alert would otherwise never fire
+      - alert: TestStalled
+        expr: |
+          sum by (scenario) (rate(cow_perf_orders_submitted_total[1m])) == 0
+          and on(scenario)
+          (cow_perf_test_progress_percent > 0 and cow_perf_test_progress_percent < 100)
+        for: 1m
+        labels:
+          severity: critical
+          component: cow-performance-testing
+          category: test-execution
+        annotations:
+          summary: "Performance test appears to be stalled"
+          description: "No orders submitted in the last minute for scenario {{ $labels.scenario }} while the test is still in progress"
+          runbook: "Check test process, verify API connectivity, review error logs"
+
+      # =========================================================================
+      # RESOURCE ALERTS
+      # =========================================================================
+
+      # High CPU Usage (Warning)
+      # Triggers when container CPU usage is high
+      - alert: HighCPUUsage
+        expr: |
+          cow_perf_container_cpu_percent > 80
+        for: 5m
+        labels:
+          severity: warning
+          
component: cow-performance-testing + category: resources + annotations: + summary: "High CPU usage on {{ $labels.container }}" + description: "CPU usage is {{ $value | printf \"%.1f\" }}% (threshold: 80%) on container {{ $labels.container }}" + runbook: "Consider scaling resources, check for inefficient operations, review container limits" + + # Critical Memory Usage (Critical) + # Triggers when container memory usage approaches limit + # Note: Using absolute percentage since cow_perf_container_memory_limit_bytes not available + - alert: CriticalMemoryUsage + expr: | + cow_perf_container_memory_percent > 95 + for: 2m + labels: + severity: critical + component: cow-performance-testing + category: resources + annotations: + summary: "Critical memory usage on {{ $labels.container }}" + description: "Memory usage is {{ $value | printf \"%.1f\" }}% (threshold: 95%) on container {{ $labels.container }}" + runbook: "Immediate action: Check for memory leaks, increase container memory limit, restart if necessary" +``` + +### Success Criteria + +- [x] Directory `configs/prometheus/alerts/` exists +- [x] File `configs/prometheus/alerts/performance-testing.yml` exists with valid YAML syntax + +--- + +## Phase 2: Update Prometheus Configuration + +### Overview + +Enable alert rule loading in Prometheus by uncommenting and configuring the `rule_files:` section. + +### Changes Required + +#### 1. 
Update Prometheus Config + +**File**: `configs/prometheus.yml` + +**Change**: Replace lines 73-81 (the commented alert section) with: + +```yaml +# Alert rule files +rule_files: + - "/etc/prometheus/alerts/*.yml" + +# Note: Alertmanager not configured - alerts visible in Prometheus UI and Grafana only +# To enable Alertmanager notifications, uncomment below and add alertmanager service: +# alerting: +# alertmanagers: +# - static_configs: +# - targets: ["alertmanager:9093"] +``` + +### Success Criteria + +- [x] YAML syntax is valid: `python -c "import yaml; yaml.safe_load(open('configs/prometheus.yml'))"` + +--- + +## Phase 3: Update Docker Compose Volume Mounts + +### Overview + +Update the Prometheus service in docker-compose.yml to mount the alerts directory. + +### Changes Required + +#### 1. Update Docker Compose + +**File**: `docker-compose.yml` + +**Change**: Update the prometheus service volumes section (around line 246-248): + +From: +```yaml + volumes: + - ./configs/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus_data:/prometheus +``` + +To: +```yaml + volumes: + - ./configs/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ./configs/prometheus/alerts:/etc/prometheus/alerts:ro + - prometheus_data:/prometheus +``` + +### Success Criteria + +- [x] Docker Compose syntax is valid: `docker compose config --quiet` + +--- + +## Phase 4: Add Grafana Alert Annotations + +### Overview + +Update the Performance Overview dashboard to display alert annotations and an alerts status panel. + +### Changes Required + +#### 1. Update Performance Dashboard + +**File**: `configs/dashboards/performance.json` + +Add alert annotations to the dashboard. This requires adding an `annotations` section to the dashboard JSON. + +**Change**: Add annotations configuration to show when alerts fire. 
Find the `"annotations"` section (or add it after `"templating"`) and update it: + +```json +"annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "enable": true, + "expr": "ALERTS{alertstate=\"firing\", component=\"cow-performance-testing\"}", + "iconColor": "red", + "name": "Firing Alerts", + "tagKeys": "alertname,severity", + "titleFormat": "{{ alertname }}" + } + ] +} +``` + +### Success Criteria + +- [x] Dashboard JSON is valid: `python -c "import json; json.load(open('configs/dashboards/performance.json'))"` + +--- + +## Phase 5: Fix Memory Metric for Alert + +### Overview + +The alert rule uses `cow_perf_container_memory_percent` but this metric doesn't exist. We need to either: +1. Add the metric to the exporter, OR +2. Update the alert to use existing metrics + +Based on review of `src/cow_performance/prometheus/metrics.py:200-225`, the available memory metric is `cow_perf_container_memory_bytes` (absolute bytes, not percentage). + +### Changes Required + +#### 1. Add Memory Percentage Metric + +**File**: `src/cow_performance/prometheus/metrics.py` + +**Change**: Add new gauge in `_init_resource_metrics()` method (around line 220): + +```python + self.container_memory_percent = Gauge( + "cow_perf_container_memory_percent", + "Container memory usage as percentage (0-100)", + ["container"], + registry=self.registry, + ) +``` + +#### 2. Update Exporter to Calculate Percentage + +**File**: `src/cow_performance/prometheus/exporter.py` + +**Change**: In the resource metrics update method, calculate and set the percentage. 
Find the `_update_resource_metrics` method and add: + +```python + # Calculate memory percentage if limit is known + # Note: This requires container memory limit to be available + # For now, we'll set this from Docker stats which provides percentage directly + if hasattr(sample, 'memory_percent') and sample.memory_percent is not None: + self._metrics.container_memory_percent.labels( + container=sample.container_name + ).set(sample.memory_percent) +``` + +**Alternative**: If ResourceSample doesn't have memory_percent, update the alert to use a fixed threshold in bytes: + +```yaml + - alert: CriticalMemoryUsage + expr: | + cow_perf_container_memory_bytes > 3.8e9 + # ... (3.8GB threshold, adjust based on container limits) +``` + +### Success Criteria + +- [x] `poetry run mypy src/cow_performance/prometheus/` passes +- [x] Memory percentage metric is exposed at `/metrics` endpoint + +--- + +## Phase 6: Documentation and Testing + +### Overview + +Document the alerting rules and provide manual testing instructions. + +### Changes Required + +#### 1. 
Update Ticket with Implementation Notes + +**File**: `thoughts/tickets/COW-598-alerting-rules.md` + +**Change**: Add implementation notes section at the end: + +```markdown +--- + +## Implementation Notes (2026-02-13) + +### Implemented Scope + +**Approach**: Option A (Prometheus alerting rules + Grafana visualization) +**Alert Count**: 7 core alerts (reduced from 15+ in original scope) + +### Alerts Implemented + +| Alert | Severity | Condition | Duration | +|-------|----------|-----------|----------| +| HighSubmissionLatency | Warning | P95 > 5s | 2m | +| CriticalSubmissionLatency | Critical | P95 > 10s | 1m | +| HighErrorRate | Critical | Error rate > 5% | 1m | +| LowThroughput | Warning | Actual < 80% target | 2m | +| TestStalled | Critical | No orders for 1m during active test | 1m | +| HighCPUUsage | Warning | CPU > 80% | 5m | +| CriticalMemoryUsage | Critical | Memory > 95% | 2m | + +### What Was NOT Implemented + +- Alertmanager (no Slack/email/webhook notifications) +- Settlement latency alerts +- API error spike alerts +- Regression alerts +- Alert testing framework +- Configurable thresholds (see COW-617) + +### Threshold Configuration + +All thresholds are hardcoded in `configs/prometheus/alerts/performance-testing.yml`. +Parameters are documented at the top of the file for easy modification. + +**TODO(COW-617)**: Move thresholds to configurable TOML/env variables. + +### Files Created/Modified + +- `configs/prometheus/alerts/performance-testing.yml` (NEW) +- `configs/prometheus.yml` (modified: enabled rule_files) +- `docker-compose.yml` (modified: added alerts volume mount) +- `configs/dashboards/performance.json` (modified: added alert annotations) +- `src/cow_performance/prometheus/metrics.py` (modified: added memory_percent gauge) +``` + +### Success Criteria + +- [x] Ticket file updated with implementation notes + +--- + +## Testing Strategy + +### Manual Testing Steps + +1. 
**Verify alert rules syntax**: + ```bash + # Use promtool if available, or start Prometheus and check logs + docker run --rm -v $(pwd)/configs/prometheus:/etc/prometheus \ + prom/prometheus promtool check rules /etc/prometheus/alerts/performance-testing.yml + ``` + +2. **Start monitoring stack**: + ```bash + docker compose --profile monitoring up -d + ``` + +3. **Verify rules loaded in Prometheus**: + ```bash + # Check rules API + curl -s http://localhost:9090/api/v1/rules | jq '.data.groups[].name' + # Expected output: "cow_performance_testing" + + # Check alerts page + open http://localhost:9090/alerts + ``` + +4. **Run a test to generate metrics**: + ```bash + cow-perf run --prometheus-port 9091 --duration 120 + ``` + +5. **Trigger alerts manually (optional)**: + ```bash + # To test HighErrorRate, you could submit invalid orders + # To test TestStalled, pause the test mid-execution + # Alerts should appear in Prometheus UI within evaluation interval + for duration + ``` + +6. **Verify Grafana annotations**: + ```bash + open http://localhost:3000 + # Navigate to Performance Overview dashboard + # Firing alerts should appear as red annotations on graphs + ``` + +### Automated Verification + +```bash +# Format and lint +poetry run black src/ tests/ +poetry run ruff check --fix src/ tests/ + +# Type check +poetry run mypy src/ + +# Run tests +poetry run pytest + +# Validate YAML files +python -c "import yaml; yaml.safe_load(open('configs/prometheus.yml'))" +python -c "import yaml; yaml.safe_load(open('configs/prometheus/alerts/performance-testing.yml'))" + +# Validate JSON +python -c "import json; json.load(open('configs/dashboards/performance.json'))" + +# Validate Docker Compose +docker compose config --quiet +``` + +--- + +## Success Criteria Summary + +### Automated Verification + +- [x] `poetry run black src/ tests/` passes +- [x] `poetry run ruff check src/ tests/` passes +- [x] `poetry run mypy src/cow_performance/prometheus/` passes (pre-existing errors in 
other modules) +- [x] `poetry run pytest tests/unit/` passes (e2e tests require Docker services) +- [x] YAML syntax valid for all config files +- [x] JSON syntax valid for dashboard files +- [x] Docker Compose config valid + +### Manual Verification + +- [ ] Prometheus loads alert rules (visible at `/alerts`) +- [ ] Alert rules API returns `cow_performance_testing` group +- [ ] Running a test generates metrics that alerts can evaluate +- [ ] Grafana shows alert annotations on Performance dashboard +- [x] Alert parameters are clearly documented at top of rules file + +--- + +## References + +- Original ticket: [COW-598-alerting-rules.md](../tickets/COW-598-alerting-rules.md) +- Prometheus alerting docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/ +- Grafana annotations: https://grafana.com/docs/grafana/latest/dashboards/build-dashboards/annotate-visualizations/ +- Related ticket for configurability: COW-617 diff --git a/thoughts/tasks/COW-591-implementation-phases.md b/thoughts/tasks/COW-591-implementation-phases.md new file mode 100644 index 0000000..760fc2f --- /dev/null +++ b/thoughts/tasks/COW-591-implementation-phases.md @@ -0,0 +1,152 @@ +# COW-591 Implementation Phases + +> **Purpose**: Document the implementation order for COW-591 metrics. All metrics are grant deliverables; this file tracks the recommended implementation sequence. 
+> +> **Created**: 2026-02-05 (M3 Planning Revision) +> **Parent Ticket**: [COW-591-prometheus-exporters.md](../tickets/COW-591-prometheus-exporters.md) +> **PoC Analysis**: [poc-evaluation.md](../research/poc-evaluation.md) — Detailed analysis of PoC patterns for metrics and architecture + +--- + +## Phase 1: Core Metrics (Implement First) + +These metrics provide the foundational visibility needed for performance testing: + +### Order Counters + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `cow_perf_orders_created_total` | Counter | `scenario`, `order_type` | Total orders created | +| `cow_perf_orders_submitted_total` | Counter | `scenario`, `order_type` | Total orders submitted to API | +| `cow_perf_orders_filled_total` | Counter | `scenario`, `order_type` | Total orders successfully filled | +| `cow_perf_orders_failed_total` | Counter | `scenario`, `order_type` | Total orders that failed | +| `cow_perf_orders_expired_total` | Counter | `scenario`, `order_type` | Total orders that expired | +| `cow_perf_orders_active` | Gauge | `scenario` | Currently active orders | + +### Latency Histograms + +| Metric | Type | Buckets | Description | +|--------|------|---------|-------------| +| `cow_perf_submission_latency_seconds` | Histogram | 0.1, 0.5, 1, 2, 5, 10, 30 | Time to submit order to API | +| `cow_perf_orderbook_latency_seconds` | Histogram | 0.1, 0.5, 1, 2, 5, 10, 30 | Time for orderbook acceptance | +| `cow_perf_settlement_latency_seconds` | Histogram | 10, 30, 60, 120, 300, 600 | Time from acceptance to settlement | +| `cow_perf_order_lifecycle_seconds` | Histogram | 10, 30, 60, 120, 300, 600, 900 | Total order lifecycle duration | + +### Throughput Gauges + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `cow_perf_orders_per_second` | Gauge | `scenario` | Current order submission rate | +| `cow_perf_target_rate` | Gauge | `scenario` | Configured target submission rate | +| 
`cow_perf_actual_rate` | Gauge | `scenario` | Measured actual submission rate | + +### Test Metadata + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `cow_perf_test_info` | Info | `test_id`, `scenario`, `git_commit`, `duration`, `python_version`, `platform`, `cow_perf_version` | Test run metadata | +| `cow_perf_test_start_timestamp` | Gauge | `scenario` | Test start Unix timestamp | +| `cow_perf_test_duration_seconds` | Gauge | `scenario` | Configured test duration | +| `cow_perf_num_traders` | Gauge | `scenario` | Number of simulated traders | +| `cow_perf_test_progress_percent` | Gauge | `scenario` | Test completion percentage | + +**Note**: Platform metadata (`python_version`, `platform`, `cow_perf_version`) should be sourced from the existing baseline capture logic in `src/cow_performance/baselines/manager.py` to ensure consistency. + +--- + +## Phase 2: Extended Metrics (Implement After Phase 1) + +These metrics complete the full grant deliverable with additional visibility: + +### Per-Trader Metrics + +**Cardinality management strategy**: To avoid label explosion: +- Option A: Only expose top-N traders by volume (configurable, default 10) +- Option B: Use trader index instead of full address (trader_0, trader_1, ...) 
+- Option C: Aggregate per-trader metrics and expose distribution stats only + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `cow_perf_trader_orders_submitted` | Counter | `trader_address` | Orders submitted per trader | +| `cow_perf_trader_orders_filled` | Counter | `trader_address` | Orders filled per trader | +| `cow_perf_traders_active` | Gauge | - | Count of currently active traders | + +### API Performance Metrics + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `cow_perf_api_requests_total` | Counter | `endpoint`, `method`, `status` | Total API requests | +| `cow_perf_api_response_time_seconds` | Histogram | 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5 | API response time distribution | +| `cow_perf_api_errors_total` | Counter | `endpoint`, `error_type` | API error count by type | + +### Resource Metrics + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `cow_perf_container_cpu_percent` | Gauge | `container` | Container CPU usage % | +| `cow_perf_container_memory_bytes` | Gauge | `container` | Container memory usage | +| `cow_perf_container_network_rx_bytes` | Gauge | `container` | Container network received | +| `cow_perf_container_network_tx_bytes` | Gauge | `container` | Container network transmitted | + +### Baseline Comparison Metrics + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `cow_perf_baseline_comparison_percent` | Gauge | `metric`, `baseline_id` | Percentage change from baseline | +| `cow_perf_regression_detected` | Gauge | `severity` | Count of detected regressions | +| `cow_perf_regressions_total` | Counter | `severity` | Total regressions detected | + +### Scenario-Specific Metrics + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `cow_perf_scenario_progress` | Gauge | `scenario` | Scenario-specific progress | + +--- + +## PoC Reference + +> **See also**: 
[poc-evaluation.md](../research/poc-evaluation.md) for complete PoC analysis including metrics patterns, architecture, and adoption recommendations. + +The PoC (PR #17 on bleu/cowprotocol-services) uses K6 for load testing. Its Prometheus config (`playground/performance-test-suite/prometheus/prometheus.yml`) shows: + +- Scrape interval: 15s (we use 5s for real-time updates) +- K6 metrics endpoint on port 6565 +- CoW Protocol service endpoints (orderbook:8080, autopilot:9589, driver:9590) + +Our exporter differs: +- Python-based using `prometheus-client` library (not K6) +- Exposes custom `cow_perf_*` metrics from our metrics framework +- Integrates with MetricsEventStream for real-time updates + +--- + +## Implementation Checklist + +### Phase 1 Checklist +- [ ] Add `prometheus-client` to pyproject.toml +- [ ] Create `src/cow_performance/prometheus/` module +- [ ] Implement `PrometheusExporter` class with HTTP server +- [ ] Define Phase 1 metrics (counters, histograms, gauges, info) +- [ ] Hook into `MetricsEventStream` callbacks +- [ ] Add `--prometheus-port` CLI flag +- [ ] Update `configs/prometheus.yml` with scrape target +- [ ] Test /metrics endpoint +- [ ] Verify Prometheus scrapes successfully + +### Phase 2 Checklist +- [ ] Implement cardinality management for per-trader metrics +- [ ] Add API performance metrics +- [ ] Add resource metrics (integrate with ResourceMonitor) +- [ ] Add baseline comparison metrics (integrate with ComparisonEngine) +- [ ] Add scenario-specific metrics +- [ ] Document all metrics in `docs/` + +--- + +## Notes + +- All metrics are part of the grant deliverable +- Implementation phasing is for development efficiency, not scope reduction +- Phase 1 should be completed before moving to Phase 2 +- Both phases must be complete for COW-591 to be considered done diff --git a/thoughts/tasks/COW-593-poc-reference.md b/thoughts/tasks/COW-593-poc-reference.md new file mode 100644 index 0000000..78dd518 --- /dev/null +++ 
b/thoughts/tasks/COW-593-poc-reference.md @@ -0,0 +1,235 @@ +# COW-593: PoC Reference Guide + +> **Purpose**: Document how to extract dashboard patterns and metrics from the PoC for COW-593 implementation. +> +> **Created**: 2026-02-05 (M3 Planning Revision) +> **Parent Ticket**: [COW-593-grafana-dashboards.md](../tickets/COW-593-grafana-dashboards.md) +> **PoC Analysis**: [poc-evaluation.md](../research/poc-evaluation.md) — Complete evaluation of PoC including metrics, dashboards, and architecture + +--- + +## Quick Reference + +For detailed PoC analysis (already completed), see **[poc-evaluation.md](../research/poc-evaluation.md)**, which covers: +- All Prometheus metrics exposed by the PoC +- Grafana provisioning structure +- Architecture and Docker integration +- Patterns to adopt in our implementation +- Key differences between K6 and our Python approach + +--- + +## PoC Location + +**PR #17** on `bleu/cowprotocol-services`: [https://github.com/bleu/cowprotocol-services/pull/17](https://github.com/bleu/cowprotocol-services/pull/17) + +- Title: "Luizhatem/poc performance testing suite" +- Additions: ~4,834 lines +- Files: 83 + +**Key paths in the PR**: +``` +playground/performance-test-suite/ +├── grafana/provisioning/ +│ ├── dashboards/dashboard.yml +│ └── datasources/datasource.yml +├── prometheus/prometheus.yml +├── src/ +│ ├── load-test.ts +│ ├── order-generator.ts +│ └── scenarios.ts +└── README.md +``` + +--- + +## How to Access PoC Content + +### Option A: Clone the Branch Locally + +```bash +# Clone the repo with the PoC branch +git clone --single-branch --branch Luizhatem/poc-performance-testing-suite \ + https://github.com/bleu/cowprotocol-services.git /tmp/cow-poc + +# Navigate to performance test suite +cd /tmp/cow-poc/playground/performance-test-suite +``` + +### Option B: Use GitHub API for Targeted Searches + +```bash +# Get specific file content +gh api 
repos/bleu/cowprotocol-services/contents/playground/performance-test-suite/prometheus/prometheus.yml \ + --jq '.content' | base64 -d + +# Search for patterns in PR files +gh api repos/bleu/cowprotocol-services/pulls/17/files \ + --jq '.[] | select(.filename | test("grafana")) | .patch' +``` + +### Option C: Run a Search Agent + +Use Claude Code's Task tool with the `Explore` subagent to search the PoC repo for specific patterns. + +--- + +## Metrics Referenced in Ticket + +The ticket mentions these dashboards from CoW Protocol monitoring: + +### `latency_dashboard.json` Metrics +These metrics are from CoW Protocol's autopilot/driver/solver services: + +| Metric Pattern | Description | Use in Our Dashboard | +|----------------|-------------|----------------------| +| `*auction_overhead_time` | Auction processing overhead | Adapt for order processing latency | +| `*auction_overhead_count` | Auction overhead counter | Reference for counter patterns | +| `gp_v2_autopilot_runloop_*` | Autopilot runloop timing | Panel layout inspiration | +| `driver_auction_preprocessing_*` | Driver preprocessing | Heatmap pattern reference | +| `driver_remaining_solve_time_*` | Solver time remaining | Gauge pattern reference | + +### `main_dashboard.json` Metrics +From CoW Protocol's API/orderbook services: + +| Metric Pattern | Description | Use in Our Dashboard | +|----------------|-------------|----------------------| +| API throughput | Requests per second | `cow_perf_api_requests_total` rate | +| API response times | Latency distribution | `cow_perf_api_response_time_seconds` histogram | +| API status codes | Response status breakdown | `cow_perf_api_requests_total{status}` | +| Orders in auction | Active orders | `cow_perf_orders_active` | +| Database queries | DB performance | Reference only (not in our scope) | +| RPC metrics | External calls | Reference only (not in our scope) | + +--- + +## PoC Prometheus Configuration + +From the PR's `prometheus/prometheus.yml`: + +```yaml 
+global: + scrape_interval: 15s + evaluation_interval: 15s + external_labels: + cluster: 'cow-protocol-perf-test' + environment: 'local' + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + - job_name: 'k6' + static_configs: + - targets: ['k6:6565'] + scrape_interval: 5s + + # CoW Protocol Services (commented out, available if needed) + # - job_name: 'orderbook' -> targets: ['orderbook:8080'] + # - job_name: 'autopilot' -> targets: ['autopilot:9589'] + # - job_name: 'driver' -> targets: ['driver:9590'] +``` + +**Key takeaways**: +- PoC uses K6 for load testing (we use Python) +- 5s scrape interval for K6 metrics (good for real-time) +- 15s default interval (we can use 5-10s for performance testing) + +--- + +## Dashboard Design Patterns to Adopt + +Based on CoW Protocol's existing dashboards: + +### Heatmap Configuration +```json +{ + "type": "heatmap", + "options": { + "calculate": false, + "color": { + "scheme": "Oranges", + "exponent": 0.5 + }, + "yAxis": { + "unit": "s" + } + } +} +``` + +### Time Series with Target Line +```json +{ + "type": "timeseries", + "targets": [ + { + "expr": "rate(cow_perf_orders_submitted_total[1m])", + "legendFormat": "Actual Rate" + }, + { + "expr": "cow_perf_target_rate", + "legendFormat": "Target Rate" + } + ], + "fieldConfig": { + "defaults": { + "custom": { + "lineStyle": { + "fill": "solid" + } + } + } + } +} +``` + +### Stat Panel with Color Thresholds +```json +{ + "type": "stat", + "options": { + "colorMode": "value", + "graphMode": "area" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 90}, + {"color": "red", "value": 95} + ] + } + } + } +} +``` + +--- + +## Recommended Search Queries + +When exploring the PoC for specific patterns: + +```bash +# Find all metric names used +gh api repos/bleu/cowprotocol-services/pulls/17/files --jq '.[] | .patch' | grep -oE 
'[a-z_]+_total|[a-z_]+_seconds|[a-z_]+_bytes' + +# Find Grafana panel types +gh api repos/bleu/cowprotocol-services/pulls/17/files --jq '.[] | select(.filename | test("grafana")) | .patch' | grep -oE '"type":\s*"[^"]+"' + +# Find histogram bucket configurations +gh api repos/bleu/cowprotocol-services/pulls/17/files --jq '.[] | .patch' | grep -i bucket +``` + +--- + +## Notes + +- The PoC is TypeScript/K6-based; our implementation is Python-based +- Dashboard JSON patterns are portable regardless of the test framework +- Focus on panel layouts, color schemes, and query patterns from the PoC +- Our `cow_perf_*` metrics will have similar semantics but different names diff --git a/thoughts/tasks/COW-593-remaining-dashboards.md b/thoughts/tasks/COW-593-remaining-dashboards.md new file mode 100644 index 0000000..19292fd --- /dev/null +++ b/thoughts/tasks/COW-593-remaining-dashboards.md @@ -0,0 +1,204 @@ +# COW-593 Task 2: Remaining Dashboards + +> **Purpose**: Track the remaining dashboards for COW-593 that will be implemented after the essential dashboards (Task 1). +> +> **Created**: 2026-02-05 (M3 Planning Revision) +> **Parent Ticket**: [COW-593-grafana-dashboards.md](../tickets/COW-593-grafana-dashboards.md) +> **Depends on**: COW-593 Task 1 (Overview + API Performance dashboards) +> **Estimate**: ~3 points +> **PoC Analysis**: [poc-evaluation.md](../research/poc-evaluation.md) — Dashboard panel types and Grafana provisioning patterns + +--- + +## Summary + +This task covers the remaining dashboards from COW-593 that are not included in Task 1: + +1. **Resource Utilization Dashboard** — Container CPU, memory, network monitoring +2. **Comparison Dashboard** — Baseline vs current test comparison +3. **Trader Activity Dashboard** — Per-trader statistics and activity patterns + +All three dashboards are grant deliverables and must be completed for COW-593 to be considered done. + +--- + +## 1. 
Resource Utilization Dashboard + +**File**: `configs/dashboards/resources.json` + +### Required Panels + +#### CPU Usage Row +- [ ] Time series: CPU usage per container (orderbook, autopilot, driver, solver, anvil) + - Query: `cow_perf_container_cpu_percent{container=~"$container"}` +- [ ] Gauge: Current CPU usage per service +- [ ] Stat panel: Peak CPU usage during test + - Query: `max_over_time(cow_perf_container_cpu_percent[1h])` + +#### Memory Usage Row +- [ ] Time series: Memory usage per container + - Query: `cow_perf_container_memory_bytes{container=~"$container"}` +- [ ] Gauge: Current memory percentage per service +- [ ] Stat panel: Peak memory usage +- [ ] Alert annotation: Memory approaching limits (>80%) + +#### Network I/O Row +- [ ] Time series: Network bytes sent/received per container + - Query: `rate(cow_perf_container_network_rx_bytes[1m])`, `rate(cow_perf_container_network_tx_bytes[1m])` + - NOTE(review): `rate()` only works on counters, but COW-591 defines the network metrics as Gauges — either redefine them as Counters in COW-591 Phase 2 or use `delta()`/`deriv()` here; confirm during implementation +- [ ] Stat panel: Total network I/O +- [ ] Time series: Network throughput rate + +#### Container Health Row +- [ ] Stat panels: Container status (running/stopped/error) +- [ ] Time series: Container restarts (if available) +- [ ] Table: Container resource limits + +### Variables +- `container` - Multi-select for container filtering + +--- + +## 2.
Comparison Dashboard + +**File**: `configs/dashboards/comparison.json` + +### Required Panels + +#### Comparison Overview Row +- [ ] Stat panel: Baseline name + - Query: `cow_perf_test_info{baseline_id=~".+"}` + - NOTE(review): `baseline_id` is not in the `cow_perf_test_info` label set defined in COW-591 Phase 1 — add it to the info metric (or source the baseline name from a dedicated metric) before building this panel +- [ ] Stat panel: Overall verdict (improvement/regression/neutral) + - Color-coded: green=improvement, yellow=neutral, red=regression +- [ ] Stat panel: Number of regressions detected + - Query: `sum(cow_perf_regression_detected)` +- [ ] Stat panel: Regression severity (critical/major/minor) + +#### Latency Comparison Row +- [ ] Bar gauge: Submission latency (baseline vs current, with delta %) + - Query comparison between baseline and current test_run_id +- [ ] Bar gauge: Settlement latency (baseline vs current, with delta %) +- [ ] Time series: Latency comparison over time +- [ ] Stat panels: Percentage changes per metric + +#### Throughput Comparison Row +- [ ] Bar gauge: Orders per second (baseline vs current) + - Query: `cow_perf_orders_per_second` for both runs +- [ ] Stat panel: Throughput delta (absolute and percentage) + - Query: `cow_perf_baseline_comparison_percent{metric="throughput"}` +- [ ] Time series: Throughput trend comparison + +#### Regression Details Row +- [ ] Table: List of detected regressions with severity and metrics + - Columns: Metric, Baseline Value, Current Value, Change %, Severity +- [ ] Stat panels: Count by severity (critical, major, minor) + - Query: `sum(cow_perf_regression_detected{severity="critical"})` + +### Variables +- `test_run_id` - Current test run +- `baseline_id` - Baseline to compare against + +### Implementation Note +This dashboard requires baseline comparison metrics from COW-591 Phase 2: +- `cow_perf_baseline_comparison_percent` +- `cow_perf_regression_detected` +- `cow_perf_regressions_total` + +If those metrics aren't available, the comparison dashboard can show "No baseline data" placeholder. + +--- + +## 3.
Trader Activity Dashboard + +**File**: `configs/dashboards/trader-activity.json` + +### Required Panels + +#### Trader Overview Row +- [ ] Stat panel: Total active traders + - Query: `cow_perf_traders_active` +- [ ] Stat panel: Average orders per trader + - Query: `sum(cow_perf_orders_submitted_total) / scalar(cow_perf_num_traders)` +- [ ] Time series: Active traders over time + - Query: `cow_perf_traders_active` + +#### Top Traders Row +- [ ] Bar chart: Top 10 traders by orders submitted + - Query: `topk(10, cow_perf_trader_orders_submitted)` +- [ ] Bar chart: Top 10 traders by orders filled + - Query: `topk(10, cow_perf_trader_orders_filled)` +- [ ] Table: Trader success rates + - Columns: Trader Address, Submitted, Filled, Success Rate % + +#### Trader Activity Patterns Row +- [ ] Heatmap: Trader activity over time + - Query: `sum(rate(cow_perf_trader_orders_submitted[1m])) by (trader_address)` + - May need to limit to top N traders for readability +- [ ] Time series: Orders by trader over time (top 5) + +#### Trader Distribution Row +- [ ] Pie chart: Distribution of orders across traders +- [ ] Histogram: Orders per trader distribution + +### Variables +- `trader_address` - Optional filter for specific trader +- `top_n` - Number of top traders to show (default: 10) + +### Implementation Note +This dashboard requires per-trader metrics from COW-591 Phase 2: +- `cow_perf_trader_orders_submitted{trader_address}` +- `cow_perf_trader_orders_filled{trader_address}` +- `cow_perf_traders_active` + +Cardinality management is important - see COW-591 implementation phases for strategies. + +--- + +## 4.
Dashboard Navigation + +After all dashboards are created, add navigation links: + +- [ ] Add dashboard links in each dashboard header +- [ ] Create consistent navigation pattern: + - Overview → API → Resources → Comparison → Traders +- [ ] Add "Back to Overview" link in all secondary dashboards + +--- + +## Acceptance Criteria + +- [ ] Resources dashboard shows CPU, memory, network for all containers +- [ ] Comparison dashboard shows baseline vs current with clear indicators +- [ ] Trader Activity dashboard shows per-trader statistics +- [ ] All dashboards have working variables +- [ ] Dashboard navigation links work +- [ ] Documentation includes screenshots of each dashboard +- [ ] All panels render without errors when metrics are available + +--- + +## Implementation Order + +1. **Resources Dashboard** — Depends only on COW-591 Phase 2 resource metrics +2. **Trader Activity Dashboard** — Depends only on COW-591 Phase 2 per-trader metrics +3. **Comparison Dashboard** — Depends on COW-591 Phase 2 baseline metrics +4. 
**Dashboard Navigation** — After all dashboards exist + +--- + +## Files to Create + +``` +configs/dashboards/ +├── resources.json # NEW (this task) +├── comparison.json # NEW (this task) +└── trader-activity.json # NEW (this task) +``` + +--- + +## Notes + +- This task should be started immediately after COW-593 Task 1 is complete +- All three dashboards are grant deliverables — not optional +- The split into Task 1 and Task 2 is for manageable delivery, not scope reduction +- If COW-591 Phase 2 metrics aren't available, dashboards should show appropriate placeholders diff --git a/thoughts/tickets/COW-591-prometheus-exporters.md b/thoughts/tickets/COW-591-prometheus-exporters.md index 2fed7f5..e7a6895 100644 --- a/thoughts/tickets/COW-591-prometheus-exporters.md +++ b/thoughts/tickets/COW-591-prometheus-exporters.md @@ -463,3 +463,78 @@ services: * Depends on: m2-issue-06-metrics-collection-framework * Blocks: m3-issue-11-grafana-dashboards * Related: m5-issue-16-fork-mode-integration (Prometheus in docker-compose) + +--- + +## Planning Notes (M3 Planning — 2026-02-05) + +### Current State Analysis + +**What already exists:** + +1. **Basic Prometheus text output** (`cli/output.py`): + - `format_metrics_prometheus_text()` generates static text exposition format + - Only ~10 basic metrics (all gauges): `cow_perf_orders_total`, `cow_perf_orders_per_second`, `cow_perf_avg_order_latency_ms`, etc. + - **Limitation**: One-shot export at test end, not a real-time scraping endpoint + +2. **Rich metrics infrastructure** (from M2): + - `MetricsStore` - Thread-safe storage with callbacks (`metrics/store.py`) + - `MetricsEventStream` - Real-time event streaming (`metrics/streaming.py`) + - `PercentileStats`, `OrderAggregateMetrics`, `APIAggregateMetrics`, `ResourceAggregateMetrics` (`metrics/aggregator.py`) + - Detailed `OrderMetadata` with 6+ timestamps for lifecycle tracking (`metrics/models.py`) + +3. 
**Docker infrastructure** (`docker-compose.yml`): + - Prometheus service on port 9090 with `profile: monitoring` + - `configs/prometheus.yml` already scrapes CoW services (orderbook:9586, autopilot:9589, driver, solver) + - Grafana service on port 3000 with provisioned datasource + +### Adjustments & Clarifications + +1. **Port conflict**: The ticket proposes port 9090, but Prometheus itself uses 9090. **Use port 9091** for the performance test exporter to avoid conflicts. + +2. **Integration approach**: Hook into `MetricsEventStream` (already has callback infrastructure) rather than polling `MetricsStore`. This provides real-time metric updates. + +3. **Metric registration timing**: Metrics should be created at exporter initialization, updated via callbacks from `MetricsEventStream`, and served via HTTP. + +4. **PoC dashboards reference**: The ticket references `latency_dashboard.json` and `main_dashboard.json` from a PoC. The PoC is available via PR #17 on bleu/cowprotocol-services. See [thoughts/research/poc-evaluation.md](../research/poc-evaluation.md) for complete PoC analysis and [thoughts/tasks/COW-591-implementation-phases.md](../tasks/COW-591-implementation-phases.md) for how the PoC metrics inform our design. The compatibility section means we should: + - Use consistent naming conventions (`cow_perf_` prefix) + - Use similar histogram bucket ranges where applicable + - Support `scenario` and `test_run_id` labels for filtering + +5. **Implementation phases** (full scope, ordered by complexity): + - **Phase 1** (implement first): Order counters, latency histograms, throughput gauges, test info — these provide core visibility + - **Phase 2** (implement second): Per-trader metrics, API metrics, resource metrics, baseline comparison metrics — these complete the deliverable + + **Note**: All metrics listed in this ticket are grant deliverables. The phasing is for implementation order only, not scope reduction. 
See `thoughts/tasks/COW-591-implementation-phases.md` for the detailed breakdown. + +### Dependencies + +- **Add to pyproject.toml**: `prometheus-client = "^0.20.0"` + +### Recommended Implementation Order + +1. `src/cow_performance/prometheus/__init__.py` - Module setup +2. `src/cow_performance/prometheus/exporter.py` - `PrometheusExporter` class with HTTP server +3. `src/cow_performance/prometheus/metrics.py` - Metric definitions (Counter, Histogram, Gauge, Info) +4. Integration with `MetricsEventStream` via callbacks +5. CLI flag `--prometheus-port` to enable exporter during test runs +6. Update `configs/prometheus.yml` to scrape the new exporter + +### Acceptance Criteria (Full Scope) + +All metrics listed in this ticket are grant deliverables. Implementation order: + +**Phase 1** (implement first): +- [ ] `/metrics` endpoint accessible during test runs +- [ ] Core order metrics (counters for created/submitted/filled/failed/expired) +- [ ] Latency histograms with appropriate buckets (submission, orderbook, settlement, lifecycle) +- [ ] Throughput gauges (orders_per_second, target_rate, actual_rate) +- [ ] Test metadata info metric + +**Phase 2** (implement after Phase 1): +- [ ] Per-trader metrics (with cardinality management - see notes in `COW-591-implementation-phases.md`) +- [ ] API performance metrics (requests_total, response_time, errors by endpoint) +- [ ] Resource metrics (CPU, memory, network per container) +- [ ] Baseline comparison metrics (comparison_percent, regression_detected) + +**Note on cardinality**: Per-trader metrics should use bounded label values or sampling to avoid cardinality explosion. Document the approach in implementation. 
diff --git a/thoughts/tickets/COW-593-grafana-dashboards.md b/thoughts/tickets/COW-593-grafana-dashboards.md index f70f64b..97f58cd 100644 --- a/thoughts/tickets/COW-593-grafana-dashboards.md +++ b/thoughts/tickets/COW-593-grafana-dashboards.md @@ -406,3 +406,120 @@ grafana/ * Related: m1-issue-02-fork-mode-environment-setup (Grafana configured in docker-compose) * Related: m5-issue-19-comprehensive-documentation (dashboard usage guide) * Related: Existing PoC dashboards in `/playground/performance-test-suite/` + +--- + +## Planning Notes (M3 Planning — 2026-02-05) + +### Current State Analysis + +**What already exists:** + +1. **Grafana service configured** (`docker-compose.yml`): + - Grafana on port 3000 with `profile: monitoring` + - Datasource provisioning: `configs/grafana-datasource.yml` (points to Prometheus) + - Dashboard provisioning: `configs/grafana-dashboard.yml` (configured but **no dashboards exist**) + +2. **No dashboard JSON files** - The `configs/` directory has provisioning config but no actual dashboard files. + +3. **PoC dashboards don't exist locally** - The ticket references `latency_dashboard.json` and `main_dashboard.json` from CoW Protocol's monitoring. These are **external references** for design inspiration, not files to copy. + +### Adjustments & Clarifications + +1. **PoC Reference Available**: The PoC dashboards ARE available as a reference via **PR #17 on bleu/cowprotocol-services**. The PR adds ~4k lines, so direct file reads aren't practical. Use targeted searches: + - Search for metric names (e.g., `cow_perf_`, `gp_v2_autopilot_runloop`, `driver_auction_preprocessing`) + - Search for panel types (heatmap, timeseries, stat) + - Reference dashboard patterns (heatmap color schemes, bucket configurations) + + **Access strategy**: See [thoughts/research/poc-evaluation.md](../research/poc-evaluation.md) for complete PoC analysis (metrics, dashboards, architecture). 
For additional reference patterns, see [thoughts/tasks/COW-593-poc-reference.md](../tasks/COW-593-poc-reference.md). + +2. **Full dashboard scope maintained**: All dashboards listed in this ticket are grant deliverables: + - Performance Testing Overview + - API Performance + - Resource Utilization + - Comparison Dashboard + - Trader Activity + + **Implementation split** (for manageable delivery): + - **Task 1 (COW-593)**: Essential dashboards (~2 points) — Overview, API Performance + - **Task 2 (local)**: Remaining dashboards (~3 points) — Resources, Comparison, Trader Activity + + See `thoughts/tasks/COW-593-remaining-dashboards.md` for Task 2 details. + +3. **Directory structure** (maintains original plan): + ``` + configs/ + ├── grafana-datasource.yml # exists + ├── grafana-dashboard.yml # exists, update path + └── dashboards/ + ├── performance-overview.json # Task 1 + ├── api-performance.json # Task 1 + ├── resources.json # Task 2 + ├── comparison.json # Task 2 + └── trader-activity.json # Task 2 + ``` + +4. **Variable strategy**: + - `test_run_id` - Essential for filtering + - `scenario` - Essential for filtering + - `baseline_id` - For comparison dashboard + - Keep others as needed per dashboard + +5. 
**Dashboard panel structure** (per original ticket specification): + + **Overview Dashboard** (Task 1): + - Row 1: Test overview stats (scenario, duration, traders, verdict) + - Row 2: Order submission rate (time series + gauge) + - Row 3: Latency heatmaps (submission, settlement) + - Row 4: Order status (pie chart, success rate) + + **API Performance Dashboard** (Task 1): + - Adapt patterns from PoC's API monitoring panels + - Response times, throughput, error rates by endpoint + + **Resources, Comparison, Trader Activity** (Task 2): + - See `thoughts/tasks/COW-593-remaining-dashboards.md` + +### Dependencies + +- **Requires COW-591 complete**: Dashboard queries depend on Prometheus metrics being exposed +- **Grafana provisioning**: Update `configs/grafana-dashboard.yml` to point to `configs/dashboards/` + +### Recommended Implementation Order + +1. Create `configs/dashboards/` directory +2. Update `configs/grafana-dashboard.yml` provisioning path +3. Create `configs/dashboards/performance-overview.json` and `configs/dashboards/api-performance.json` with core panels +4. Test with `docker compose --profile monitoring up -d` +5. Iterate on panel queries and layout +6.
Add documentation screenshots to `docs/` + +### Acceptance Criteria (Full Scope, Split Delivery) + +**Task 1 — COW-593 (this ticket, ~2 points)**: +- [ ] Performance Overview dashboard functional +- [ ] API Performance dashboard functional +- [ ] Order submission rate visualization +- [ ] Latency distribution visualization (heatmap) +- [ ] Test metadata display (scenario, duration, traders) +- [ ] Dashboard variables working (test_run_id, scenario) +- [ ] Dashboards auto-load on Grafana startup +- [ ] Dashboard loads correctly with Prometheus datasource + +**Task 2 — `thoughts/tasks/COW-593-remaining-dashboards.md` (~3 points)**: +- [ ] Resources dashboard (CPU, memory, network per container) +- [ ] Comparison dashboard (baseline vs current, regression indicators) +- [ ] Trader Activity dashboard (per-trader stats, activity heatmap) +- [ ] Dashboard links and navigation between dashboards +- [ ] Complete documentation with screenshots + +**Note**: Both tasks must be completed for full COW-593 delivery. Task 2 is tracked locally and should be completed immediately after Task 1. + +### Human Actions Required + +- After COW-591 and COW-593 are implemented, manual testing is required: + 1. Start monitoring stack: `docker compose --profile monitoring up -d` + 2. Run a performance test with Prometheus exporter enabled + 3. Open Grafana at http://localhost:3000 + 4. Verify dashboard displays data correctly + 5. Take screenshots for documentation diff --git a/thoughts/tickets/COW-598-alerting-rules.md b/thoughts/tickets/COW-598-alerting-rules.md index 6c05f5e..56441f4 100644 --- a/thoughts/tickets/COW-598-alerting-rules.md +++ b/thoughts/tickets/COW-598-alerting-rules.md @@ -448,3 +448,109 @@ Orders are taking longer than expected to be accepted by the API. 
* Depends on: m3-issue-11-prometheus-exporters, m3-issue-12-grafana-dashboards * Related: m5-issue-19-comprehensive-documentation (alert documentation) + +--- + +## Planning Notes (M3 Planning — 2026-02-05) + +> **STATUS: DEFERRED** — This ticket is out of scope for the current M3 planning cycle. +> COW-598 will be refined and implemented after COW-591 (Prometheus Exporters) and COW-593 (Grafana Dashboards) are complete. + +### Deferral Rationale + +COW-598 (Alerting Rules) feels out of context compared to COW-591 and COW-593 and the work done so far. The manager/user has requested that this ticket be set aside for now and refined in a later planning step. + +**What this means**: +- No implementation work on COW-598 during the current M3 phase +- COW-591 and COW-593 take priority +- After COW-591 and COW-593 are complete, return to COW-598 for detailed planning + +### Preserved Analysis (For Future Reference) + +The following analysis was conducted during initial M3 planning and is preserved for when COW-598 is revisited: + +#### Current State + +1. **Prometheus config** (`configs/prometheus.yml`): + - Rule file loading is NOT configured (no `rule_files:` section) + - Alertmanager is NOT configured (no `alerting:` section) + - Only scrape configs for CoW services + +2. **No alerting infrastructure** - Will be built from scratch. + +3. **Docker Compose** - No Alertmanager service defined. + +#### Future Implementation Considerations + +1. **Alertmanager is optional**: For a local performance testing tool: + - **Option A**: Prometheus alerting rules + Grafana alert visualization (simpler) + - **Option B**: Full Alertmanager with notification channels (more complex) + - **Recommendation**: Start with Option A. + +2. 
**Core alerts to prioritize** (when implementing): + +| Alert | Severity | Condition | +|-------|----------|-----------| +| HighSubmissionLatency | Warning | P95 > 5s for 2m | +| CriticalSubmissionLatency | Critical | P95 > 10s for 1m | +| HighErrorRate | Critical | Error rate > 5% for 1m | +| LowThroughput | Warning | Actual < 80% target for 2m | +| TestStalled | Critical | No orders for 1m during active test | + +3. **Dependencies** (will be satisfied before COW-598 starts): + - Requires COW-591 complete: Alerts depend on Prometheus metrics + - Requires COW-593 dashboards: Alerts can be visualized in dashboards + +### Next Steps + +When COW-591 and COW-593 are complete: +1. Revisit this ticket's Planning Notes +2. Refine alert thresholds based on actual metric behavior observed during COW-591/COW-593 testing +3. Determine if full scope (15+ alerts) or reduced scope (5-7 alerts) is appropriate +4. Update validation and grant-alignment documents accordingly + +--- + +## Implementation Notes (2026-02-13) + +### Implemented Scope + +**Approach**: Option A (Prometheus alerting rules + Grafana visualization) +**Alert Count**: 7 core alerts (reduced from 15+ in original scope) + +### Alerts Implemented + +| Alert | Severity | Condition | Duration | +|-------|----------|-----------|----------| +| HighSubmissionLatency | Warning | P95 > 5s | 2m | +| CriticalSubmissionLatency | Critical | P95 > 10s | 1m | +| HighErrorRate | Critical | Error rate > 5% | 1m | +| LowThroughput | Warning | Actual < 80% target | 2m | +| TestStalled | Critical | No orders for 1m during active test | 1m | +| HighCPUUsage | Warning | CPU > 80% | 5m | +| CriticalMemoryUsage | Critical | Memory > 95% | 2m | + +### What Was NOT Implemented + +- Alertmanager (no Slack/email/webhook notifications) +- Settlement latency alerts +- API error spike alerts +- Regression alerts +- Alert testing framework +- Configurable thresholds (see COW-617) + +### Threshold Configuration + +All thresholds are hardcoded 
in `configs/prometheus/alerts/performance-testing.yml`. +Parameters are documented at the top of the file for easy modification. + +**TODO(COW-617)**: Move thresholds to configurable TOML/env variables. + +### Files Created/Modified + +- `configs/prometheus/alerts/performance-testing.yml` (NEW) +- `configs/prometheus.yml` (modified: enabled rule_files) +- `docker-compose.yml` (modified: added alerts volume mount) +- `configs/dashboards/performance.json` (modified: added alert annotations) +- `src/cow_performance/prometheus/metrics.py` (modified: added memory_percent gauge) +- `src/cow_performance/prometheus/exporter.py` (modified: export memory_percent metric)