@@ -105,9 +105,21 @@ def main():
105105 sys .exit (1 )
106106 evaluator = OpenAIEvaluator (quick_mode = True , model = OPENAI_MODEL )
107107
108+ print (f"\n 🚀 Starting Performance Regression Test" )
109+ print (f"📅 Test Date: { time .strftime ('%Y-%m-%d %H:%M:%S UTC' , time .gmtime ())} " )
110+ print (f"🔧 Backend: { BACKEND } " )
111+ print (f"⚡ Quick Mode: Enabled" )
112+ print (f"🎯 Time Threshold: { THRESHOLD_SECONDS } s" )
113+ print (f"💾 Memory Threshold: { THRESHOLD_MB } MB" )
114+ print (f"🤖 Model: { OPENAI_MODEL if BACKEND == 'OPENAI' else 'Local Model' } " )
115+ print ("-" * 60 )
116+
108117 start_time = time .time ()
109118 start_mem = get_memory_mb ()
110119
120+ print (f"📊 Initial Memory Usage: { start_mem :.2f} MB" )
121+ print (f"⏱️ Starting evaluation at: { time .strftime ('%H:%M:%S' )} " )
122+
111123 # Run the evaluation (do not print results to avoid CI log noise)
112124 evaluator .run_evaluation ()
113125
@@ -116,31 +128,112 @@ def main():
116128
117129 elapsed = end_time - start_time
118130 mem_used = max (0.0 , end_mem - start_mem )
131+ mem_peak = end_mem
132+
133+ print (f"⏱️ Evaluation completed at: { time .strftime ('%H:%M:%S' )} " )
134+ print (f"📊 Final Memory Usage: { end_mem :.2f} MB" )
135+
136+ # Calculate performance ratios
137+ time_ratio = (elapsed / THRESHOLD_SECONDS ) * 100
138+ memory_ratio = (mem_used / THRESHOLD_MB ) * 100
139+
140+ # Determine performance grade
141+ if elapsed <= THRESHOLD_SECONDS * 0.5 and mem_used <= THRESHOLD_MB * 0.5 :
142+ grade = "🟢 EXCELLENT"
143+ elif elapsed <= THRESHOLD_SECONDS * 0.8 and mem_used <= THRESHOLD_MB * 0.8 :
144+ grade = "🟡 GOOD"
145+ elif elapsed <= THRESHOLD_SECONDS and mem_used <= THRESHOLD_MB :
146+ grade = "🟠 ACCEPTABLE"
147+ else :
148+ grade = "🔴 FAILED"
119149
120150 metrics = {
121- "backend" : BACKEND ,
122- "elapsed_seconds" : round (elapsed , 2 ),
123- "memory_mb" : round (mem_used , 2 ),
124- "threshold_seconds" : THRESHOLD_SECONDS ,
125- "threshold_mb" : THRESHOLD_MB ,
126- "status" : "PASS" if elapsed <= THRESHOLD_SECONDS and mem_used <= THRESHOLD_MB else "FAIL"
151+ "test_info" : {
152+ "date" : time .strftime ('%Y-%m-%d %H:%M:%S UTC' , time .gmtime ()),
153+ "backend" : BACKEND ,
154+ "model" : OPENAI_MODEL if BACKEND == 'OPENAI' else 'Local Model' ,
155+ "quick_mode" : True ,
156+ "test_type" : "LLM Judge Evaluation Performance"
157+ },
158+ "performance" : {
159+ "elapsed_seconds" : round (elapsed , 3 ),
160+ "memory_mb" : round (mem_used , 3 ),
161+ "memory_peak_mb" : round (mem_peak , 3 ),
162+ "time_ratio_percent" : round (time_ratio , 1 ),
163+ "memory_ratio_percent" : round (memory_ratio , 1 )
164+ },
165+ "thresholds" : {
166+ "time_seconds" : THRESHOLD_SECONDS ,
167+ "memory_mb" : THRESHOLD_MB
168+ },
169+ "status" : {
170+ "overall" : "PASS" if elapsed <= THRESHOLD_SECONDS and mem_used <= THRESHOLD_MB else "FAIL" ,
171+ "time_status" : "PASS" if elapsed <= THRESHOLD_SECONDS else "FAIL" ,
172+ "memory_status" : "PASS" if mem_used <= THRESHOLD_MB else "FAIL" ,
173+ "grade" : grade
174+ }
127175 }
128176
129177 # Output results for CI artifact
130178 with open ("performance_metrics.json" , "w" ) as f :
131179 json .dump (metrics , f , indent = 2 )
132180
133- print ("\n ===== Performance Regression Metrics =====" )
134- print (json .dumps (metrics , indent = 2 ))
135- print ("========================================\n " )
181+ # Print detailed results
182+ print (f"\n { '=' * 60 } " )
183+ print (f"📈 PERFORMANCE REGRESSION TEST RESULTS" )
184+ print (f"{ '=' * 60 } " )
185+ print (f"📅 Test Date: { metrics ['test_info' ]['date' ]} " )
186+ print (f"🔧 Backend: { metrics ['test_info' ]['backend' ]} " )
187+ print (f"🤖 Model: { metrics ['test_info' ]['model' ]} " )
188+ print (f"⚡ Mode: Quick Evaluation" )
189+ print (f"" )
190+ print (f"⏱️ EXECUTION TIME:" )
191+ print (f" • Elapsed: { metrics ['performance' ]['elapsed_seconds' ]} s" )
192+ print (f" • Threshold: { metrics ['thresholds' ]['time_seconds' ]} s" )
193+ print (f" • Usage: { metrics ['performance' ]['time_ratio_percent' ]} % of threshold" )
194+ print (f" • Status: { metrics ['status' ]['time_status' ]} " )
195+ print (f"" )
196+ print (f"💾 MEMORY USAGE:" )
197+ print (f" • Used: { metrics ['performance' ]['memory_mb' ]} MB" )
198+ print (f" • Peak: { metrics ['performance' ]['memory_peak_mb' ]} MB" )
199+ print (f" • Threshold: { metrics ['thresholds' ]['memory_mb' ]} MB" )
200+ print (f" • Usage: { metrics ['performance' ]['memory_ratio_percent' ]} % of threshold" )
201+ print (f" • Status: { metrics ['status' ]['memory_status' ]} " )
202+ print (f"" )
203+ print (f"🎯 OVERALL RESULT:" )
204+ print (f" • Grade: { metrics ['status' ]['grade' ]} " )
205+ print (f" • Status: { metrics ['status' ]['overall' ]} " )
206+ print (f"" )
207+
208+ if metrics ['status' ]['overall' ] == "PASS" :
209+ print (f"✅ PERFORMANCE TEST PASSED" )
210+ if grade == "🟢 EXCELLENT" :
211+ print (f" 🎉 Excellent performance! Well under thresholds." )
212+ elif grade == "🟡 GOOD" :
213+ print (f" 👍 Good performance within safe margins." )
214+ else :
215+ print (f" ⚠️ Acceptable performance, but close to thresholds." )
216+ else :
217+ print (f"❌ PERFORMANCE TEST FAILED" )
218+ print (f" 🚨 Performance regression detected!" )
219+ if metrics ['status' ]['time_status' ] == "FAIL" :
220+ print (f" ⏱️ Time exceeded threshold by { elapsed - THRESHOLD_SECONDS :.2f} s" )
221+ if metrics ['status' ]['memory_status' ] == "FAIL" :
222+ print (f" 💾 Memory exceeded threshold by { mem_used - THRESHOLD_MB :.2f} MB" )
223+
224+ print (f"{ '=' * 60 } " )
225+ print (f"📄 Results saved to: performance_metrics.json" )
226+ print (f"📊 CI Artifact: performance-metrics.zip" )
227+ print (f"{ '=' * 60 } \n " )
136228
137229 # Robust CI failure: assertion + sys.exit(1)
138- assert metrics ["status" ] == "PASS" , (
139- f"Performance regression: time={ elapsed :.2f} s, mem={ mem_used :.2f} MB"
140- )
141- if metrics ["status" ] != "PASS" :
142- print (f"Performance regression: time={ elapsed :.2f} s, mem={ mem_used :.2f} MB" , file = sys .stderr )
230+ if metrics ['status' ]['overall' ] != "PASS" :
231+ print (f"❌ PERFORMANCE REGRESSION DETECTED" , file = sys .stderr )
232+ print (f" Time: { elapsed :.3f} s (threshold: { THRESHOLD_SECONDS } s)" , file = sys .stderr )
233+ print (f" Memory: { mem_used :.3f} MB (threshold: { THRESHOLD_MB } MB)" , file = sys .stderr )
143234 sys .exit (1 )
235+
236+ print (f"✅ Performance test completed successfully!" )
144237
145238if __name__ == "__main__" :  # entry-point guard: true only when the file is executed as a script, not when imported
146239 main ()  # run the performance regression test
0 commit comments