@@ -67,10 +67,10 @@ NC='\033[0m' # No Color
 # Auto-detect vLLM model if not specified
 if [[ -z "$VLLM_MODEL" ]]; then
     echo -e "${BLUE}🔍 Auto-detecting vLLM model from endpoint...${NC}"
-    
+
     # Try to fetch models from the vLLM endpoint
     VLLM_MODELS_JSON=$(curl -s "$VLLM_ENDPOINT/models" 2>/dev/null || echo "")
-    
+
     if [[ -n "$VLLM_MODELS_JSON" ]]; then
         # Extract the first model ID from the JSON response
         VLLM_MODEL=$(echo "$VLLM_MODELS_JSON" | python3 -c "
@@ -85,7 +85,7 @@
 except:
     print('')
 " 2>/dev/null)
-        
+
         if [[ -n "$VLLM_MODEL" ]]; then
             echo -e "${GREEN}✅ Auto-detected vLLM model: $VLLM_MODEL${NC}"
         else
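The hidden context (lines 77-84) holds the inline Python that picks the first model ID out of the JSON. As a standalone sketch of the same detection step — assuming the endpoint serves vLLM's OpenAI-compatible schema ({"object": "list", "data": [{"id": ...}]}), with an illustrative localhost URL standing in for $VLLM_ENDPOINT, not the script's exact elided code:

    curl -s "http://localhost:8000/v1/models" | python3 -c "
    import json, sys
    # Take the first served model's ID, or print nothing if the list is empty
    models = json.load(sys.stdin).get('data', [])
    print(models[0]['id'] if models else '')
    "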
@@ -110,20 +110,20 @@ declare -A DATASET_CONFIGS=(
     ["gpqa"]=20        # 1 category × 20 = 20 samples - OUTSTANDING reasoning differentiation
     ["mmlu"]=10        # 57 subjects × 10 = 570 samples - EXCELLENT reasoning differentiation
     ["truthfulqa"]=15  # Truthfulness evaluation - some reasoning differentiation (60% → 73.3%)
-    
+
     # Mathematical reasoning datasets
     # ["math"]=15      # Competition mathematics - DISABLED: Dataset not available on HF Hub
     ["gsm8k"]=25       # Elementary math word problems - EXPECTED good reasoning differentiation
     ["aqua-rat"]=20    # Algebraic word problems with rationales - EXPECTED good differentiation
-    
+
     # Multi-step reasoning datasets
     ["drop"]=20        # Reading comprehension with discrete reasoning - EXPECTED excellent differentiation
     ["strategyqa"]=20  # Multi-step implicit reasoning - EXPECTED good differentiation
-    
+
     # Scientific reasoning datasets
     ["sciq"]=25        # Science questions requiring reasoning - EXPECTED moderate differentiation
     ["openbookqa"]=20  # Elementary science with fact reasoning - EXPECTED moderate differentiation
-    
+
     # Disabled datasets with poor reasoning differentiation:
     # ["arc-challenge"]=15   # 100% accuracy across all modes, minimal benefit
     # ["commonsenseqa"]=20   # Same accuracy across modes, small token difference
@@ -174,42 +174,42 @@ extract_and_save_metrics() {
     local mode=$2       # "router" or "vllm"
     local results_dir=$3
     local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
-    
+
     # Find the results files (handle nested directory structure)
     local summary_file=""
     local detailed_file=""
-    
+
     # Look for files in nested directories
     if [[ -d "$results_dir" ]]; then
         summary_file=$(find "$results_dir" -name "results_summary.csv" -type f | head -1)
         if [[ -z "$summary_file" ]]; then
             detailed_file=$(find "$results_dir" -name "detailed_results.csv" -type f | head -1)
         fi
     fi
-    
+
     # Use whichever file we found
     local target_file=""
     if [[ -f "$summary_file" ]]; then
         target_file="$summary_file"
     elif [[ -f "$detailed_file" ]]; then
         target_file="$detailed_file"
     fi
-    
+
     if [[ -n "$target_file" && -f "$target_file" ]]; then
         echo -e "${YELLOW}📊 Extracting metrics from $target_file...${NC}"
-        
+
         # Extract overall metrics from the CSV file
         # Skip header and get the last line (overall summary) or calculate averages
-        local temp_file="/tmp/metrics_$dataset_$mode.txt"
-        
+        local temp_file="/tmp/metrics_${dataset}_${mode}.txt"
+
         # Use Python to calculate averages from the CSV
         python3 -c "
 import pandas as pd
 import sys
 
 try:
     df = pd.read_csv('$target_file')
-    
+
     # Calculate overall metrics (handle different CSV formats)
     if len(df) > 0:
         # Handle accuracy column (is_correct vs accuracy)
@@ -219,63 +219,63 @@ try:
             avg_accuracy = df['accuracy'].mean()
         else:
             avg_accuracy = 0.0
-        
+
         # Handle latency column (response_time vs avg_latency_ms)
         if 'response_time' in df.columns:
             avg_latency = df['response_time'].mean() * 1000  # Convert to ms
         elif 'avg_latency_ms' in df.columns:
             avg_latency = df['avg_latency_ms'].mean()
         else:
             avg_latency = 0.0
-        
+
         # Handle token column (total_tokens vs avg_total_tokens)
         if 'total_tokens' in df.columns:
             avg_tokens = df['total_tokens'].mean()
         elif 'avg_total_tokens' in df.columns:
             avg_tokens = df['avg_total_tokens'].mean()
         else:
             avg_tokens = 0.0
-        
+
         sample_count = len(df)
-        
+
         # Determine model name
         if '$mode' == 'router':
             model_name = '$ROUTER_MODEL'
         else:
             model_name = '$VLLM_MODEL'
-        
+
         # For vLLM, we might have multiple modes (NR, NR_REASONING)
         # Check both 'mode' and 'mode_label' columns for mode information
         if '$mode' == 'vllm' and ('mode' in df.columns or 'mode_label' in df.columns):
             # Use mode_label if available (more descriptive), otherwise use mode
             mode_column = 'mode_label' if 'mode_label' in df.columns else 'mode'
             for mode_type in df[mode_column].unique():
                 mode_df = df[df[mode_column] == mode_type]
-                
+
                 # Recalculate metrics for this specific mode using correct column names
                 if 'is_correct' in mode_df.columns:
                     mode_accuracy = mode_df['is_correct'].mean()
                 elif 'accuracy' in mode_df.columns:
                     mode_accuracy = mode_df['accuracy'].mean()
                 else:
                     mode_accuracy = 0.0
-                
+
                 if 'response_time' in mode_df.columns:
                     mode_latency = mode_df['response_time'].mean() * 1000
                 elif 'avg_latency_ms' in mode_df.columns:
                     mode_latency = mode_df['avg_latency_ms'].mean()
                 else:
                     mode_latency = 0.0
-                
+
                 if 'total_tokens' in mode_df.columns:
                     mode_tokens = mode_df['total_tokens'].mean()
                 elif 'avg_total_tokens' in mode_df.columns:
                     mode_tokens = mode_df['avg_total_tokens'].mean()
                 else:
                     mode_tokens = 0.0
-                
+
                 mode_samples = len(mode_df)
-                
+
                 # Map technical mode names to descriptive names
                 if mode_type == 'VLLM_NR':
                     display_mode = 'vLLM_No_Reasoning'
@@ -285,7 +285,7 @@
                     display_mode = 'vLLM_CoT'
                 else:
                     display_mode = mode_type  # Use the mode_label as-is if not recognized
-                
+
                 csv_line = f'$dataset,{display_mode},{model_name},{mode_accuracy:.3f},{mode_latency:.1f},{mode_tokens:.1f},{mode_samples},$timestamp'
                 print(f'📝 Writing to CSV: {csv_line}', file=sys.stderr)
                 print(csv_line)
@@ -295,12 +295,12 @@ try:
         print(csv_line)
     else:
         print(f'$dataset,$mode,unknown,0.000,0.0,0.0,0,$timestamp', file=sys.stderr)
-    
+
 except Exception as e:
     print(f'Error processing $target_file: {e}', file=sys.stderr)
     print(f'$dataset,$mode,unknown,0.000,0.0,0.0,0,$timestamp', file=sys.stderr)
 " | tee -a "$RESEARCH_CSV" >> "$PERSISTENT_RESEARCH_CSV"
-        
+
         echo -e "${GREEN}✅ Metrics saved to both timestamped and master research CSV${NC}"
     else
         echo -e "${RED}❌ Warning: No results files found in $results_dir${NC}"
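One subtlety in the pipeline tail above: `tee -a "$RESEARCH_CSV"` appends each CSV row the Python prints to the per-run file while passing the stream through on stdout, and the trailing `>> "$PERSISTENT_RESEARCH_CSV"` appends that same stream to the master file, so a single print lands in both. A tiny demonstration with hypothetical filenames:

    # One stream, two append targets: the row ends up in both CSVs
    printf 'gsm8k,Router,m,0.880,5421.3,412.7,25,ts\n' | tee -a run.csv >> master.csv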
@@ -313,9 +313,9 @@ except Exception as e:
 run_dataset_benchmark() {
     local dataset=$1
     local samples=${DATASET_CONFIGS[$dataset]}
-    
+
     echo -e "${GREEN}📊 Benchmarking $dataset dataset ($samples samples per category)...${NC}"
-    
+
     # Router benchmark (pass vLLM info for consistent token calculation)
     echo -e "${YELLOW}🤖 Running router evaluation...${NC}"
     python3 -m vllm_semantic_router_bench.router_reason_bench_multi_dataset \
@@ -333,7 +333,7 @@ run_dataset_benchmark() {
     # Extract and save router metrics immediately
     extract_and_save_metrics "$dataset" "Router" "$OUTPUT_BASE/router_$dataset"
 
-    # vLLM benchmark 
+    # vLLM benchmark
     echo -e "${YELLOW}⚡ Running vLLM evaluation...${NC}"
     python3 -m vllm_semantic_router_bench.router_reason_bench_multi_dataset \
         --dataset "$dataset" \
@@ -345,14 +345,14 @@ run_dataset_benchmark() {
         --output-dir "$OUTPUT_BASE/vllm_$dataset" \
         --concurrent-requests "$CONCURRENT_REQUESTS" \
         --seed 42
-    
+
     # Extract and save vLLM metrics immediately
     extract_and_save_metrics "$dataset" "vllm" "$OUTPUT_BASE/vllm_$dataset"
-    
+
     # Generate updated comprehensive plots for current dataset
     echo -e "${BLUE}📈 Updating comprehensive plots with $dataset results...${NC}"
     generate_comprehensive_plot "$dataset"
-    
+
     echo -e "${GREEN}✅ Completed $dataset benchmark and comprehensive plots updated${NC}"
     echo -e "${GREEN}📈 CSV data updated in: $PERSISTENT_RESEARCH_CSV${NC}"
     echo ""
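For orientation, every row appended here follows the f-string in extract_and_save_metrics: dataset, mode, model, accuracy, mean latency (ms), mean tokens, sample count, timestamp. A way to spot-check as rows accumulate — the printed row is illustrative, with a made-up model name and numbers:

    tail -n 1 "$PERSISTENT_RESEARCH_CSV"
    # gsm8k,vLLM_CoT,example-org/example-7b,0.880,5421.3,412.7,25,2025-01-01 12:00:00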
@@ -361,13 +361,13 @@ run_dataset_benchmark() {
 # Function to generate comprehensive plot with all completed datasets (called after each dataset completes)
 generate_comprehensive_plot() {
     local current_dataset=$1
-    
+
     if [[ -n "$current_dataset" ]]; then
         echo -e "${YELLOW}📊 Generating plot for current dataset: $current_dataset...${NC}"
     else
         echo -e "${YELLOW}📊 Generating comprehensive plot with all completed datasets...${NC}"
     fi
-    
+
     # Use the plot_comprehensive_results.py script to generate updated charts
     if [[ -f "plot_comprehensive_results.py" ]]; then
         echo -e "${BLUE}Running comprehensive plotting script...${NC}"
@@ -376,16 +376,16 @@ generate_comprehensive_plot() {
             --csv \"$RESEARCH_CSV\" \
             --output-dir \"$OUTPUT_BASE\" \
             --model-filter \"$VLLM_MODEL\""
-        
+
         # Add dataset filter if specified
         if [[ -n "$current_dataset" ]]; then
             PLOT_CMD="$PLOT_CMD --dataset-filter \"$current_dataset\""
         fi
-        
-        eval $PLOT_CMD
-        
+
+        eval "$PLOT_CMD"
+
         echo -e "${GREEN}✅ Comprehensive plots updated in $OUTPUT_BASE${NC}"
-        
+
         # Print actual paths of generated charts
         if [[ -f "$OUTPUT_BASE/accuracy_comparison.png" ]]; then
             echo -e "${GREEN}📊 Accuracy Chart: $OUTPUT_BASE/accuracy_comparison.png${NC}"
@@ -404,9 +404,9 @@ generate_comprehensive_plot() {
 # Function to generate plot for a single dataset (kept for compatibility)
 generate_dataset_plot() {
     local dataset=$1
-    
+
     echo -e "${YELLOW}📊 Plotting $dataset results...${NC}"
-    
+
     # Find the summary.json files
     ROUTER_SUMMARY=$(find "$OUTPUT_BASE/router_$dataset" -name "summary.json" -type f | head -1)
     VLLM_SUMMARY=$(find "$OUTPUT_BASE/vllm_$dataset" -name "summary.json" -type f | head -1)
@@ -419,7 +419,7 @@ generate_dataset_plot() {
         fi
 
         echo -e "${BLUE}Running: $PLOT_CMD${NC}"
-        eval $PLOT_CMD
+        eval "$PLOT_CMD"
         echo -e "${GREEN}✅ $dataset plots generated in $OUTPUT_BASE/plots_$dataset${NC}"
     else
         echo -e "${RED}⚠️ No vLLM summary.json found for $dataset, skipping plots${NC}"
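The `eval "$PLOT_CMD"` change here (and in the hunk above at line 385) is a real fix, not style: unquoted, the shell word-splits the stored command before eval re-parses it, so embedded newlines collapse into spaces and separate commands can merge into one. A self-contained demonstration:

    cmd=$'echo first\necho second'
    eval "$cmd"   # two commands: prints "first", then "second"
    eval $cmd     # word-split first: runs `echo first echo second`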
@@ -429,7 +429,7 @@ generate_dataset_plot() {
 # Function to generate comparison plots (now just calls individual dataset plots)
 generate_plots() {
     echo -e "${BLUE}📈 Generating any remaining comparison plots...${NC}"
-    
+
     for dataset in "${!DATASET_CONFIGS[@]}"; do
         # Check if plots already exist
         if [[ ! -d "$OUTPUT_BASE/plots_$dataset" ]]; then
@@ -447,15 +447,15 @@ generate_plots() {
 # Function to generate summary report
 generate_summary() {
     echo -e "${BLUE}📋 Generating research summary...${NC}"
-    
+
     local summary_file="$OUTPUT_BASE/RESEARCH_SUMMARY.md"
-    
+
     cat > "$summary_file" << EOF
 # Multi-Dataset Benchmark Research Report
 
 **Generated:** $(date)
 **Configuration:** Router vs vLLM Direct Comparison
-**Router Model:** $ROUTER_MODEL 
+**Router Model:** $ROUTER_MODEL
 **vLLM Model:** $VLLM_MODEL
 
 ## Dataset Overview
@@ -539,7 +539,7 @@
 ## Usage Instructions
 
 1. **Review CSV files** for detailed numerical results
-2. **Examine plots** for visual comparison trends 
+2. **Examine plots** for visual comparison trends
 3. **Analyze token usage** for efficiency insights
 4. **Compare across datasets** for model capability assessment
 
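One behavior worth noting in generate_summary: the heredoc delimiter EOF is unquoted, so $(date), $ROUTER_MODEL, and $VLLM_MODEL expand when the report is written, not when it is read. A compressed illustration with a hypothetical output path:

    # Unquoted delimiter: expansions happen at write time
    cat > /tmp/report.md << EOF
    Generated: $(date)
    EOF
    # A quoted delimiter (<< 'EOF') would copy the text literally instead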
@@ -572,7 +572,7 @@ for dataset in "${DATASET_ORDER[@]}"; do
         echo -e "${YELLOW}⚠️ Dataset $dataset not configured, skipping...${NC}"
         continue
     fi
-    
+
     dataset_count=$((dataset_count + 1))
     echo -e "${BLUE}🚀 Progress: Dataset $dataset_count/$total_datasets - Starting $dataset${NC}"
     run_dataset_benchmark "$dataset"
@@ -601,7 +601,7 @@ echo -e "${YELLOW}⏱️ Total Runtime:${NC} ${minutes}m ${seconds}s"
 echo ""
 echo -e "${BLUE}📋 Next Steps:${NC}"
 echo "1. 📊 **Master research data**: $PERSISTENT_RESEARCH_CSV"
-echo "2. 📊 **This run's data**: $OUTPUT_BASE/research_results.csv" 
+echo "2. 📊 **This run's data**: $OUTPUT_BASE/research_results.csv"
 echo "3. 📋 Review research summary: $OUTPUT_BASE/RESEARCH_SUMMARY.md"
 echo "4. 📈 **View comprehensive charts**:"
 if [[ -f "$OUTPUT_BASE/accuracy_comparison.png" ]]; then