
Commit 953c177

Initial PR for performance tests on the integration tests that run on CI

Signed-off-by: Senan Zedan <[email protected]>
1 parent ae6c726

File tree

13 files changed: +164 −36 lines changed


.github/workflows/performance-test.yml

Lines changed: 1 addition & 0 deletions
```diff
@@ -88,6 +88,7 @@ jobs:
 
       - name: Run component benchmarks
         run: |
+          mkdir -p reports
           export LD_LIBRARY_PATH=${PWD}/candle-binding/target/release
           make perf-bench-quick 2>&1 | tee reports/bench-output.txt
```
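The step above tees the benchmark output into `reports/bench-output.txt`. If the benchmarks behind `make perf-bench-quick` are standard `go test -bench` runs (an assumption; the target's script is not shown in this diff), that file can be post-processed with a few lines of Go. A minimal sketch with a hypothetical `parseBench` helper:

```go
package main

import (
	"bufio"
	"fmt"
	"regexp"
	"strconv"
	"strings"
)

// benchLine matches lines like: BenchmarkClassify-8    5000    231452 ns/op
var benchLine = regexp.MustCompile(`^(Benchmark\S+)\s+(\d+)\s+([\d.]+) ns/op`)

// parseBench extracts a benchmark-name -> ns/op map from captured output,
// skipping any non-benchmark lines (PASS, ok, build output, etc.).
func parseBench(output string) map[string]float64 {
	results := make(map[string]float64)
	sc := bufio.NewScanner(strings.NewReader(output))
	for sc.Scan() {
		if m := benchLine.FindStringSubmatch(sc.Text()); m != nil {
			if ns, err := strconv.ParseFloat(m[3], 64); err == nil {
				results[m[1]] = ns
			}
		}
	}
	return results
}

func main() {
	out := "BenchmarkClassify-8    5000    231452 ns/op\nPASS\n"
	fmt.Println(parseBench(out)["BenchmarkClassify-8"]) // prints 231452
}
```

A map like this is enough to diff a PR's numbers against a stored baseline file in a later CI step.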

Makefile

Lines changed: 1 addition & 0 deletions
```diff
@@ -21,6 +21,7 @@ _run:
 		-f tools/make/observability.mk \
 		-f tools/make/openshift.mk \
 		-f tools/make/e2e.mk \
+		-f tools/make/performance.mk \
 		$(MAKECMDGOALS)
 
 .PHONY: _run
```

e2e/pkg/performance/load_generator.go

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -28,20 +28,20 @@ func NewLoadGenerator(concurrency, rateLimit int, duration time.Duration) *LoadG
2828

2929
// LoadResult contains the results of a load test
3030
type LoadResult struct {
31-
TotalRequests int
32-
SuccessfulReqs int
33-
FailedReqs int
34-
Duration time.Duration
35-
AvgLatencyMs float64
36-
P50LatencyMs float64
37-
P90LatencyMs float64
38-
P95LatencyMs float64
39-
P99LatencyMs float64
40-
MaxLatencyMs float64
41-
MinLatencyMs float64
42-
ThroughputQPS float64
43-
Latencies []time.Duration
44-
Errors []error
31+
TotalRequests int
32+
SuccessfulReqs int
33+
FailedReqs int
34+
Duration time.Duration
35+
AvgLatencyMs float64
36+
P50LatencyMs float64
37+
P90LatencyMs float64
38+
P95LatencyMs float64
39+
P99LatencyMs float64
40+
MaxLatencyMs float64
41+
MinLatencyMs float64
42+
ThroughputQPS float64
43+
Latencies []time.Duration
44+
Errors []error
4545
}
4646

4747
// RequestFunc is a function that executes a single request

e2e/pkg/performance/metrics_collector.go

Lines changed: 7 additions & 7 deletions
```diff
@@ -110,13 +110,13 @@ func (mc *MetricsCollector) MonitorPodMetrics(ctx context.Context, podName strin
 
 // ResourceStats holds aggregated resource statistics
 type ResourceStats struct {
-	AvgCPUCores  float64
-	MaxCPUCores  float64
-	MinCPUCores  float64
-	AvgMemoryMB  float64
-	MaxMemoryMB  float64
-	MinMemoryMB  float64
-	SampleCount  int
+	AvgCPUCores float64
+	MaxCPUCores float64
+	MinCPUCores float64
+	AvgMemoryMB float64
+	MaxMemoryMB float64
+	MinMemoryMB float64
+	SampleCount int
 }
 
 // AggregateMetrics aggregates multiple pod metrics samples
```
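`ResourceStats` is a plain min/avg/max fold over the collected samples. A hedged sketch of what `AggregateMetrics` plausibly does, with a hypothetical `sample` type standing in for the real pod-metrics struct:

```go
package main

import "fmt"

// sample stands in for one CPU/memory reading from a pod (hypothetical shape).
type sample struct {
	cpuCores float64
	memoryMB float64
}

// ResourceStats mirrors the field names in metrics_collector.go.
type ResourceStats struct {
	AvgCPUCores, MaxCPUCores, MinCPUCores float64
	AvgMemoryMB, MaxMemoryMB, MinMemoryMB float64
	SampleCount                           int
}

// aggregate folds samples into min/avg/max stats; a sketch of the idea,
// not the repo's exact code.
func aggregate(samples []sample) ResourceStats {
	s := ResourceStats{}
	for i, sm := range samples {
		// Seed mins from the first sample so zero values don't win.
		if i == 0 || sm.cpuCores < s.MinCPUCores {
			s.MinCPUCores = sm.cpuCores
		}
		if sm.cpuCores > s.MaxCPUCores {
			s.MaxCPUCores = sm.cpuCores
		}
		if i == 0 || sm.memoryMB < s.MinMemoryMB {
			s.MinMemoryMB = sm.memoryMB
		}
		if sm.memoryMB > s.MaxMemoryMB {
			s.MaxMemoryMB = sm.memoryMB
		}
		s.AvgCPUCores += sm.cpuCores
		s.AvgMemoryMB += sm.memoryMB
	}
	s.SampleCount = len(samples)
	if s.SampleCount > 0 {
		s.AvgCPUCores /= float64(s.SampleCount)
		s.AvgMemoryMB /= float64(s.SampleCount)
	}
	return s
}

func main() {
	st := aggregate([]sample{{0.25, 100}, {0.75, 300}, {0.5, 200}})
	fmt.Println(st.MinCPUCores, st.AvgCPUCores, st.MaxCPUCores, st.SampleCount) // prints 0.25 0.5 0.75 3
}
```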

perf/CI-STRATEGY.md

Lines changed: 32 additions & 0 deletions
````diff
@@ -44,15 +44,18 @@ Here are different approaches teams use, from most to least restrictive:
 ### Strategy 1: Label-Based (CURRENT - RECOMMENDED) 🏷️
 
 **When it runs:**
+
 - Only when PR has `performance` label
 - Manual trigger via GitHub UI
 
 **Pros:**
+
 - ✅ Saves tons of CI time
 - ✅ Developers control when tests run
 - ✅ No noise on small PRs
 
 **Cons:**
+
 - ❌ Developers might forget to add label
 - ❌ Regressions could slip through
 
@@ -63,6 +66,7 @@ Here are different approaches teams use, from most to least restrictive:
 ### Strategy 2: Path-Based (Original Design) 📁
 
 **When it runs:**
+
 ```yaml
 on:
   pull_request:
@@ -73,10 +77,12 @@ on:
 ```
 
 **Pros:**
+
 - ✅ Automatic - no manual intervention
 - ✅ Catches regressions early
 
 **Cons:**
+
 - ❌ Runs too often (most PRs touch these paths)
 - ❌ High CI cost
 - ❌ Slows down development
@@ -88,6 +94,7 @@ on:
 ### Strategy 3: Scheduled + Manual Only ⏰
 
 **When it runs:**
+
 ```yaml
 on:
   schedule:
@@ -96,11 +103,13 @@ on:
 ```
 
 **Pros:**
+
 - ✅ Minimal CI cost
 - ✅ No PR delays
 - ✅ Nightly baseline still updates
 
 **Cons:**
+
 - ❌ Regressions found after merge (too late!)
 - ❌ Developers must manually trigger
 
@@ -111,6 +120,7 @@ on:
 ### Strategy 4: Hybrid - Critical Paths Only 🎯
 
 **When it runs:**
+
 ```yaml
 on:
   pull_request:
@@ -122,11 +132,13 @@ on:
 ```
 
 **Pros:**
+
 - ✅ Automatic for critical code
 - ✅ Reduced CI usage vs path-based
 - ✅ Catches most important regressions
 
 **Cons:**
+
 - ❌ Still runs frequently
 - ❌ Can miss indirect performance impacts
 
@@ -137,16 +149,19 @@ on:
 ### Strategy 5: PR Size Based 📏
 
 **When it runs:**
+
 ```yaml
 # Run only on large PRs (>500 lines changed)
 if: github.event.pull_request.additions + github.event.pull_request.deletions > 500
 ```
 
 **Pros:**
+
 - ✅ Small PRs skip expensive tests
 - ✅ Large risky changes get tested
 
 **Cons:**
+
 - ❌ Single-line change can cause regression
 - ❌ Complex logic to maintain
 
@@ -157,6 +172,7 @@ if: github.event.pull_request.additions + github.event.pull_request.deletions >
 ### Strategy 6: Pre-merge Only (Protected Branch) 🔒
 
 **When it runs:**
+
 ```yaml
 on:
   pull_request:
@@ -167,10 +183,12 @@ on:
 ```
 
 **Pros:**
+
 - ✅ Tests final code before/after merge
 - ✅ Doesn't slow down draft PRs
 
 **Cons:**
+
 - ❌ Late feedback for developers
 - ❌ Might catch issues post-merge
 
@@ -181,27 +199,31 @@ on:
 ## Recommended Setup by Project Stage
 
 ### 🌱 Early Stage Project
+
 ```yaml
 Strategy: Scheduled + Manual
 Performance Tests: Nightly only
 Reason: Save CI budget, iterate fast
 ```
 
 ### 🌿 Growing Project
+
 ```yaml
 Strategy: Label-Based (CURRENT)
 Performance Tests: On 'performance' label
 Reason: Balance cost vs safety
 ```
 
 ### 🌳 Mature Project
+
 ```yaml
 Strategy: Hybrid Critical Paths
 Performance Tests: Auto on critical code
 Reason: High confidence, catch regressions
 ```
 
 ### 🏢 Enterprise Project
+
 ```yaml
 Strategy: Every PR (Path-Based)
 Performance Tests: Always
@@ -251,6 +273,7 @@ No changes needed! Current setup is optimized.
 ## Cost Analysis
 
 Assuming:
+
 - 10 PRs per day
 - 20 minutes per performance test
 - $0.008 per minute (GitHub Actions pricing)
@@ -271,6 +294,7 @@ Assuming:
 ### For Developers
 
 **When to add `performance` label:**
+
 - ✅ Changing classification, cache, or decision engine
 - ✅ Modifying CGO bindings
 - ✅ Optimizing algorithms
@@ -282,6 +306,7 @@ Assuming:
 ### For Reviewers
 
 **Check for performance label:**
+
 ```markdown
 ## Performance Checklist
 - [ ] Does this PR touch classification/cache/decision code?
@@ -292,6 +317,7 @@ Assuming:
 ### For CI
 
 **Monitor false negatives:**
+
 - Track regressions found in nightly but missed in PRs
 - If >5% slip through, consider tightening strategy
 
@@ -302,26 +328,30 @@ Assuming:
 ### Q: What if a regression slips through?
 
 **A:** Nightly workflow will catch it and create an issue. You can:
+
 1. Revert the problematic PR
 2. Fix forward with a new PR
 3. Update baseline if intentional
 
 ### Q: Can I force performance tests on a PR without label?
 
 **A:** Yes! Two ways:
+
 1. Add `performance` label to PR
 2. Go to Actions tab → Performance Tests → Run workflow → Select your branch
 
 ### Q: What about main branch protection?
 
 **A:** Performance tests are NOT required checks. They're:
+
 - Advisory (warn but don't block)
 - Opt-in (run when needed)
 - Nightly will catch issues anyway
 
 ### Q: Should I run tests locally before PR?
 
 **A:** Recommended for performance-critical changes:
+
 ```bash
 make perf-bench-quick # Takes 3-5 min
 make perf-compare # Compare vs baseline
@@ -339,11 +369,13 @@ make perf-compare # Compare vs baseline
 - Nightly workflow ensures baselines stay current
 
 **To run performance tests on your PR:**
+
 1. Add label: `performance`
 2. Wait for tests to complete (~15 min)
 3. Review results in PR comment
 
 **Why nightly is still needed:**
+
 - Updates baselines automatically
 - Catches anything that slipped through
 - Runs comprehensive 30s benchmarks
````
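The Cost Analysis section of the document fixes its assumptions (10 PRs per day, 20 minutes per run, $0.008 per minute), so the per-strategy costs reduce to simple arithmetic. A quick check under exactly those stated assumptions:

```go
package main

import "fmt"

func main() {
	const (
		prsPerDay     = 10.0
		minutesPerRun = 20.0
		costPerMin    = 0.008 // GitHub Actions pricing assumed in the doc
	)
	// Path-based (every PR): all 10 PRs run the 20-minute suite daily.
	perDay := prsPerDay * minutesPerRun * costPerMin
	fmt.Printf("every PR: $%.2f/day, $%.2f/30-day month\n", perDay, perDay*30)
	// → every PR: $1.60/day, $48.00/30-day month

	// Nightly only: one run per day regardless of PR volume.
	nightly := minutesPerRun * costPerMin
	fmt.Printf("nightly:  $%.2f/day, $%.2f/30-day month\n", nightly, nightly*30)
	// → nightly:  $0.16/day, $4.80/30-day month
}
```

The label-based strategy lands between these two bounds, scaling with however many PRs actually carry the `performance` label.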
