Skip to content

Commit 3c44bde

Browse files
committed
Enhance performance regression test with detailed metrics and clear messaging
- Add comprehensive test information (date, backend, model, mode)
- Include detailed performance metrics (elapsed time, memory usage, ratios)
- Add performance grading system (EXCELLENT, GOOD, ACCEPTABLE, FAILED)
- Provide clear status indicators for time and memory separately
- Show percentage usage of thresholds for easy comparison
- Include peak memory usage for better analysis
- Add structured JSON output for CI artifacts and comparison
- Improve console output with emojis and clear formatting
- Add detailed error messages for performance regressions

This makes it much easier to compare performance across different runs and quickly identify any performance regressions or improvements.
1 parent 428efd2 commit 3c44bde

File tree

1 file changed

+107
-14
lines changed

1 file changed

+107
-14
lines changed

scripts/test_performance_regression.py

Lines changed: 107 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -105,9 +105,21 @@ def main():
105105
sys.exit(1)
106106
evaluator = OpenAIEvaluator(quick_mode=True, model=OPENAI_MODEL)
107107

108+
print(f"\n🚀 Starting Performance Regression Test")
109+
print(f"📅 Test Date: {time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime())}")
110+
print(f"🔧 Backend: {BACKEND}")
111+
print(f"⚡ Quick Mode: Enabled")
112+
print(f"🎯 Time Threshold: {THRESHOLD_SECONDS}s")
113+
print(f"💾 Memory Threshold: {THRESHOLD_MB}MB")
114+
print(f"🤖 Model: {OPENAI_MODEL if BACKEND == 'OPENAI' else 'Local Model'}")
115+
print("-" * 60)
116+
108117
start_time = time.time()
109118
start_mem = get_memory_mb()
110119

120+
print(f"📊 Initial Memory Usage: {start_mem:.2f}MB")
121+
print(f"⏱️ Starting evaluation at: {time.strftime('%H:%M:%S')}")
122+
111123
# Run the evaluation (do not print results to avoid CI log noise)
112124
evaluator.run_evaluation()
113125

@@ -116,31 +128,112 @@ def main():
116128

117129
elapsed = end_time - start_time
118130
mem_used = max(0.0, end_mem - start_mem)
131+
mem_peak = end_mem
132+
133+
print(f"⏱️ Evaluation completed at: {time.strftime('%H:%M:%S')}")
134+
print(f"📊 Final Memory Usage: {end_mem:.2f}MB")
135+
136+
# Calculate performance ratios
137+
time_ratio = (elapsed / THRESHOLD_SECONDS) * 100
138+
memory_ratio = (mem_used / THRESHOLD_MB) * 100
139+
140+
# Determine performance grade
141+
if elapsed <= THRESHOLD_SECONDS * 0.5 and mem_used <= THRESHOLD_MB * 0.5:
142+
grade = "🟢 EXCELLENT"
143+
elif elapsed <= THRESHOLD_SECONDS * 0.8 and mem_used <= THRESHOLD_MB * 0.8:
144+
grade = "🟡 GOOD"
145+
elif elapsed <= THRESHOLD_SECONDS and mem_used <= THRESHOLD_MB:
146+
grade = "🟠 ACCEPTABLE"
147+
else:
148+
grade = "🔴 FAILED"
119149

120150
metrics = {
121-
"backend": BACKEND,
122-
"elapsed_seconds": round(elapsed, 2),
123-
"memory_mb": round(mem_used, 2),
124-
"threshold_seconds": THRESHOLD_SECONDS,
125-
"threshold_mb": THRESHOLD_MB,
126-
"status": "PASS" if elapsed <= THRESHOLD_SECONDS and mem_used <= THRESHOLD_MB else "FAIL"
151+
"test_info": {
152+
"date": time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime()),
153+
"backend": BACKEND,
154+
"model": OPENAI_MODEL if BACKEND == 'OPENAI' else 'Local Model',
155+
"quick_mode": True,
156+
"test_type": "LLM Judge Evaluation Performance"
157+
},
158+
"performance": {
159+
"elapsed_seconds": round(elapsed, 3),
160+
"memory_mb": round(mem_used, 3),
161+
"memory_peak_mb": round(mem_peak, 3),
162+
"time_ratio_percent": round(time_ratio, 1),
163+
"memory_ratio_percent": round(memory_ratio, 1)
164+
},
165+
"thresholds": {
166+
"time_seconds": THRESHOLD_SECONDS,
167+
"memory_mb": THRESHOLD_MB
168+
},
169+
"status": {
170+
"overall": "PASS" if elapsed <= THRESHOLD_SECONDS and mem_used <= THRESHOLD_MB else "FAIL",
171+
"time_status": "PASS" if elapsed <= THRESHOLD_SECONDS else "FAIL",
172+
"memory_status": "PASS" if mem_used <= THRESHOLD_MB else "FAIL",
173+
"grade": grade
174+
}
127175
}
128176

129177
# Output results for CI artifact
130178
with open("performance_metrics.json", "w") as f:
131179
json.dump(metrics, f, indent=2)
132180

133-
print("\n===== Performance Regression Metrics =====")
134-
print(json.dumps(metrics, indent=2))
135-
print("========================================\n")
181+
# Print detailed results
182+
print(f"\n{'='*60}")
183+
print(f"📈 PERFORMANCE REGRESSION TEST RESULTS")
184+
print(f"{'='*60}")
185+
print(f"📅 Test Date: {metrics['test_info']['date']}")
186+
print(f"🔧 Backend: {metrics['test_info']['backend']}")
187+
print(f"🤖 Model: {metrics['test_info']['model']}")
188+
print(f"⚡ Mode: Quick Evaluation")
189+
print(f"")
190+
print(f"⏱️ EXECUTION TIME:")
191+
print(f" • Elapsed: {metrics['performance']['elapsed_seconds']}s")
192+
print(f" • Threshold: {metrics['thresholds']['time_seconds']}s")
193+
print(f" • Usage: {metrics['performance']['time_ratio_percent']}% of threshold")
194+
print(f" • Status: {metrics['status']['time_status']}")
195+
print(f"")
196+
print(f"💾 MEMORY USAGE:")
197+
print(f" • Used: {metrics['performance']['memory_mb']}MB")
198+
print(f" • Peak: {metrics['performance']['memory_peak_mb']}MB")
199+
print(f" • Threshold: {metrics['thresholds']['memory_mb']}MB")
200+
print(f" • Usage: {metrics['performance']['memory_ratio_percent']}% of threshold")
201+
print(f" • Status: {metrics['status']['memory_status']}")
202+
print(f"")
203+
print(f"🎯 OVERALL RESULT:")
204+
print(f" • Grade: {metrics['status']['grade']}")
205+
print(f" • Status: {metrics['status']['overall']}")
206+
print(f"")
207+
208+
if metrics['status']['overall'] == "PASS":
209+
print(f"✅ PERFORMANCE TEST PASSED")
210+
if grade == "🟢 EXCELLENT":
211+
print(f" 🎉 Excellent performance! Well under thresholds.")
212+
elif grade == "🟡 GOOD":
213+
print(f" 👍 Good performance within safe margins.")
214+
else:
215+
print(f" ⚠️ Acceptable performance, but close to thresholds.")
216+
else:
217+
print(f"❌ PERFORMANCE TEST FAILED")
218+
print(f" 🚨 Performance regression detected!")
219+
if metrics['status']['time_status'] == "FAIL":
220+
print(f" ⏱️ Time exceeded threshold by {elapsed - THRESHOLD_SECONDS:.2f}s")
221+
if metrics['status']['memory_status'] == "FAIL":
222+
print(f" 💾 Memory exceeded threshold by {mem_used - THRESHOLD_MB:.2f}MB")
223+
224+
print(f"{'='*60}")
225+
print(f"📄 Results saved to: performance_metrics.json")
226+
print(f"📊 CI Artifact: performance-metrics.zip")
227+
print(f"{'='*60}\n")
136228

137229
# Robust CI failure: assertion + sys.exit(1)
138-
assert metrics["status"] == "PASS", (
139-
f"Performance regression: time={elapsed:.2f}s, mem={mem_used:.2f}MB"
140-
)
141-
if metrics["status"] != "PASS":
142-
print(f"Performance regression: time={elapsed:.2f}s, mem={mem_used:.2f}MB", file=sys.stderr)
230+
if metrics['status']['overall'] != "PASS":
231+
print(f"❌ PERFORMANCE REGRESSION DETECTED", file=sys.stderr)
232+
print(f" Time: {elapsed:.3f}s (threshold: {THRESHOLD_SECONDS}s)", file=sys.stderr)
233+
print(f" Memory: {mem_used:.3f}MB (threshold: {THRESHOLD_MB}MB)", file=sys.stderr)
143234
sys.exit(1)
235+
236+
print(f"✅ Performance test completed successfully!")
144237

145238
if __name__ == "__main__":
146239
main()

0 commit comments

Comments (0)