
Commit acd89ab

feat: add shellcheck precommit hook (#488)

Signed-off-by: yuluo-yx <[email protected]>
1 parent 9956e0b commit acd89ab
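
With the hook registered in .pre-commit-config.yaml, contributors can run the new check locally through the standard pre-commit CLI. A typical invocation (usage sketch, not part of this diff):

pre-commit install                      # set up the git hooks once per clone
pre-commit run shellcheck --all-files   # run just the new hook across the repo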

File tree

18 files changed: +284 -256 lines changed


.pre-commit-config.yaml

Lines changed: 9 additions & 1 deletion
@@ -22,6 +22,14 @@ repos:
         language: system
         files: \.go$
 
+  - repo: local
+    hooks:
+      - id: shellcheck
+        name: shellcheck
+        entry: make shellcheck
+        language: system
+        files: \.sh$
+
   - repo: local
     hooks:
       - id: golang-lint
@@ -87,7 +95,7 @@ repos:
         language_version: python3
         files: \.py$
         exclude: ^(\.venv/|venv/|env/|__pycache__/|\.git/|site-packages/)
-
+
 # Commented out flake8 - only reports issues, doesn't auto-fix
 # - repo: https://github.com/PyCQA/flake8
 #   rev: 7.3.0
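
The hook delegates to `make shellcheck`, but the Makefile target itself is among the 18 changed files not shown in this excerpt. Assuming shellcheck is installed and on PATH, the target is presumably roughly equivalent to:

# Hypothetical sketch of what `make shellcheck` runs; the real target
# (and any excluded paths) lives in the repository Makefile.
find . -name '*.sh' -type f -print0 | xargs -0 shellcheck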

bench/build_and_test.sh

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ echo "=============================================="
 
 # Clean previous builds
 echo "🧹 Cleaning previous builds..."
-rm -rf build/ dist/ *.egg-info/
+rm -rf build/ dist/ ./*.egg-info/
 find vllm_semantic_router_bench/ -name "__pycache__" -type d -exec rm -rf {} + 2>/dev/null || true
 find vllm_semantic_router_bench/ -name "*.pyc" -delete 2>/dev/null || true
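
The `./` prefix is the fix ShellCheck suggests here (SC2035): an unanchored glob can expand to a name that begins with `-`, which `rm` would try to parse as options. A contrived illustration (hypothetical directory name):

mkdir -- '-rf.egg-info'     # a directory whose name looks like an option
rm -rf *.egg-info/          # expands to '-rf.egg-info/', which rm parses as flags
rm -rf ./*.egg-info/        # expands to './-rf.egg-info/', always a path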

bench/comprehensive_bench.sh

Lines changed: 51 additions & 51 deletions
@@ -67,10 +67,10 @@ NC='\033[0m' # No Color
 # Auto-detect vLLM model if not specified
 if [[ -z "$VLLM_MODEL" ]]; then
     echo -e "${BLUE}🔍 Auto-detecting vLLM model from endpoint...${NC}"
-
+
     # Try to fetch models from the vLLM endpoint
     VLLM_MODELS_JSON=$(curl -s "$VLLM_ENDPOINT/models" 2>/dev/null || echo "")
-
+
     if [[ -n "$VLLM_MODELS_JSON" ]]; then
         # Extract the first model ID from the JSON response
         VLLM_MODEL=$(echo "$VLLM_MODELS_JSON" | python3 -c "
@@ -85,7 +85,7 @@ try:
 except:
     print('')
 " 2>/dev/null)
-
+
     if [[ -n "$VLLM_MODEL" ]]; then
         echo -e "${GREEN}✅ Auto-detected vLLM model: $VLLM_MODEL${NC}"
     else
@@ -110,20 +110,20 @@ declare -A DATASET_CONFIGS=(
     ["gpqa"]=20        # 1 category × 20 = 20 samples - OUTSTANDING reasoning differentiation
     ["mmlu"]=10        # 57 subjects × 10 = 570 samples - EXCELLENT reasoning differentiation
     ["truthfulqa"]=15  # Truthfulness evaluation - some reasoning differentiation (60% → 73.3%)
-
+
     # Mathematical reasoning datasets
     # ["math"]=15      # Competition mathematics - DISABLED: Dataset not available on HF Hub
     ["gsm8k"]=25       # Elementary math word problems - EXPECTED good reasoning differentiation
     ["aqua-rat"]=20    # Algebraic word problems with rationales - EXPECTED good differentiation
-
+
     # Multi-step reasoning datasets
     ["drop"]=20        # Reading comprehension with discrete reasoning - EXPECTED excellent differentiation
     ["strategyqa"]=20  # Multi-step implicit reasoning - EXPECTED good differentiation
-
+
     # Scientific reasoning datasets
     ["sciq"]=25        # Science questions requiring reasoning - EXPECTED moderate differentiation
     ["openbookqa"]=20  # Elementary science with fact reasoning - EXPECTED moderate differentiation
-
+
     # Disabled datasets with poor reasoning differentiation:
     # ["arc-challenge"]=15   # 100% accuracy across all modes, minimal benefit
     # ["commonsenseqa"]=20   # Same accuracy across modes, small token difference
@@ -174,42 +174,42 @@ extract_and_save_metrics() {
     local mode=$2  # "router" or "vllm"
     local results_dir=$3
     local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
-
+
     # Find the results files (handle nested directory structure)
     local summary_file=""
     local detailed_file=""
-
+
     # Look for files in nested directories
     if [[ -d "$results_dir" ]]; then
         summary_file=$(find "$results_dir" -name "results_summary.csv" -type f | head -1)
         if [[ -z "$summary_file" ]]; then
             detailed_file=$(find "$results_dir" -name "detailed_results.csv" -type f | head -1)
         fi
     fi
-
+
     # Use whichever file we found
     local target_file=""
     if [[ -f "$summary_file" ]]; then
         target_file="$summary_file"
     elif [[ -f "$detailed_file" ]]; then
         target_file="$detailed_file"
     fi
-
+
     if [[ -n "$target_file" && -f "$target_file" ]]; then
         echo -e "${YELLOW} 📊 Extracting metrics from $target_file...${NC}"
-
+
         # Extract overall metrics from the CSV file
         # Skip header and get the last line (overall summary) or calculate averages
-        local temp_file="/tmp/metrics_$dataset_$mode.txt"
-
+        local temp_file="/tmp/metrics_${dataset}_${mode}.txt"
+
         # Use Python to calculate averages from the CSV
         python3 -c "
 import pandas as pd
 import sys
 
 try:
     df = pd.read_csv('$target_file')
-
+
     # Calculate overall metrics (handle different CSV formats)
     if len(df) > 0:
         # Handle accuracy column (is_correct vs accuracy)
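
Among the whitespace cleanups in this file, the `temp_file` change above is a genuine bug fix: in `metrics_$dataset_$mode.txt` the shell parses `$dataset_` as a single variable name (underscores are valid identifier characters), which is unset here, so the dataset name silently vanished from the path. Braces delimit the names explicitly:

dataset=gsm8k; mode=router
echo "/tmp/metrics_$dataset_$mode.txt"       # /tmp/metrics_router.txt  ($dataset_ is unset)
echo "/tmp/metrics_${dataset}_${mode}.txt"   # /tmp/metrics_gsm8k_router.txt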
@@ -219,63 +219,63 @@ try:
             avg_accuracy = df['accuracy'].mean()
         else:
             avg_accuracy = 0.0
-
+
         # Handle latency column (response_time vs avg_latency_ms)
         if 'response_time' in df.columns:
             avg_latency = df['response_time'].mean() * 1000  # Convert to ms
         elif 'avg_latency_ms' in df.columns:
             avg_latency = df['avg_latency_ms'].mean()
         else:
             avg_latency = 0.0
-
+
         # Handle token column (total_tokens vs avg_total_tokens)
         if 'total_tokens' in df.columns:
             avg_tokens = df['total_tokens'].mean()
         elif 'avg_total_tokens' in df.columns:
             avg_tokens = df['avg_total_tokens'].mean()
         else:
             avg_tokens = 0.0
-
+
         sample_count = len(df)
-
+
         # Determine model name
         if '$mode' == 'router':
             model_name = '$ROUTER_MODEL'
         else:
             model_name = '$VLLM_MODEL'
-
+
         # For vLLM, we might have multiple modes (NR, NR_REASONING)
         # Check both 'mode' and 'mode_label' columns for mode information
        if '$mode' == 'vllm' and ('mode' in df.columns or 'mode_label' in df.columns):
             # Use mode_label if available (more descriptive), otherwise use mode
             mode_column = 'mode_label' if 'mode_label' in df.columns else 'mode'
             for mode_type in df[mode_column].unique():
                 mode_df = df[df[mode_column] == mode_type]
-
+
                 # Recalculate metrics for this specific mode using correct column names
                 if 'is_correct' in mode_df.columns:
                     mode_accuracy = mode_df['is_correct'].mean()
                 elif 'accuracy' in mode_df.columns:
                     mode_accuracy = mode_df['accuracy'].mean()
                 else:
                     mode_accuracy = 0.0
-
+
                 if 'response_time' in mode_df.columns:
                     mode_latency = mode_df['response_time'].mean() * 1000
                 elif 'avg_latency_ms' in mode_df.columns:
                     mode_latency = mode_df['avg_latency_ms'].mean()
                 else:
                     mode_latency = 0.0
-
+
                 if 'total_tokens' in mode_df.columns:
                     mode_tokens = mode_df['total_tokens'].mean()
                 elif 'avg_total_tokens' in mode_df.columns:
                     mode_tokens = mode_df['avg_total_tokens'].mean()
                 else:
                     mode_tokens = 0.0
-
+
                 mode_samples = len(mode_df)
-
+
                 # Map technical mode names to descriptive names
                 if mode_type == 'VLLM_NR':
                     display_mode = 'vLLM_No_Reasoning'
@@ -285,7 +285,7 @@ try:
                     display_mode = 'vLLM_CoT'
                 else:
                     display_mode = mode_type  # Use the mode_label as-is if not recognized
-
+
                 csv_line = f'$dataset,{display_mode},{model_name},{mode_accuracy:.3f},{mode_latency:.1f},{mode_tokens:.1f},{mode_samples},$timestamp'
                 print(f' 📝 Writing to CSV: {csv_line}', file=sys.stderr)
                 print(csv_line)
@@ -295,12 +295,12 @@ try:
             print(csv_line)
     else:
         print(f'$dataset,$mode,unknown,0.000,0.0,0.0,0,$timestamp', file=sys.stderr)
-
+
 except Exception as e:
     print(f'Error processing $target_file: {e}', file=sys.stderr)
     print(f'$dataset,$mode,unknown,0.000,0.0,0.0,0,$timestamp', file=sys.stderr)
 " | tee -a "$RESEARCH_CSV" >> "$PERSISTENT_RESEARCH_CSV"
-
+
         echo -e "${GREEN} ✅ Metrics saved to both timestamped and master research CSV${NC}"
     else
         echo -e "${RED} ❌ Warning: No results files found in $results_dir${NC}"
@@ -313,9 +313,9 @@ except Exception as e:
 run_dataset_benchmark() {
     local dataset=$1
     local samples=${DATASET_CONFIGS[$dataset]}
-
+
     echo -e "${GREEN}📊 Benchmarking $dataset dataset ($samples samples per category)...${NC}"
-
+
     # Router benchmark (pass vLLM info for consistent token calculation)
     echo -e "${YELLOW} 🤖 Running router evaluation...${NC}"
     python3 -m vllm_semantic_router_bench.router_reason_bench_multi_dataset \
@@ -333,7 +333,7 @@ run_dataset_benchmark() {
     # Extract and save router metrics immediately
     extract_and_save_metrics "$dataset" "Router" "$OUTPUT_BASE/router_$dataset"
 
-    # vLLM benchmark
+    # vLLM benchmark
     echo -e "${YELLOW} ⚡ Running vLLM evaluation...${NC}"
     python3 -m vllm_semantic_router_bench.router_reason_bench_multi_dataset \
         --dataset "$dataset" \
@@ -345,14 +345,14 @@ run_dataset_benchmark() {
         --output-dir "$OUTPUT_BASE/vllm_$dataset" \
         --concurrent-requests "$CONCURRENT_REQUESTS" \
         --seed 42
-
+
     # Extract and save vLLM metrics immediately
     extract_and_save_metrics "$dataset" "vllm" "$OUTPUT_BASE/vllm_$dataset"
-
+
     # Generate updated comprehensive plots for current dataset
     echo -e "${BLUE} 📈 Updating comprehensive plots with $dataset results...${NC}"
     generate_comprehensive_plot "$dataset"
-
+
     echo -e "${GREEN} ✅ Completed $dataset benchmark and comprehensive plots updated${NC}"
     echo -e "${GREEN} 📈 CSV data updated in: $PERSISTENT_RESEARCH_CSV${NC}"
     echo ""
@@ -361,13 +361,13 @@ run_dataset_benchmark() {
 # Function to generate comprehensive plot with all completed datasets (called after each dataset completes)
 generate_comprehensive_plot() {
     local current_dataset=$1
-
+
     if [[ -n "$current_dataset" ]]; then
         echo -e "${YELLOW} 📊 Generating plot for current dataset: $current_dataset...${NC}"
     else
         echo -e "${YELLOW} 📊 Generating comprehensive plot with all completed datasets...${NC}"
     fi
-
+
     # Use the plot_comprehensive_results.py script to generate updated charts
     if [[ -f "plot_comprehensive_results.py" ]]; then
         echo -e "${BLUE} Running comprehensive plotting script...${NC}"
@@ -376,16 +376,16 @@ generate_comprehensive_plot() {
             --csv \"$RESEARCH_CSV\" \
             --output-dir \"$OUTPUT_BASE\" \
             --model-filter \"$VLLM_MODEL\""
-
+
         # Add dataset filter if specified
         if [[ -n "$current_dataset" ]]; then
             PLOT_CMD="$PLOT_CMD --dataset-filter \"$current_dataset\""
         fi
-
-        eval $PLOT_CMD
-
+
+        eval "$PLOT_CMD"
+
         echo -e "${GREEN} ✅ Comprehensive plots updated in $OUTPUT_BASE${NC}"
-
+
         # Print actual paths of generated charts
         if [[ -f "$OUTPUT_BASE/accuracy_comparison.png" ]]; then
             echo -e "${GREEN} 📊 Accuracy Chart: $OUTPUT_BASE/accuracy_comparison.png${NC}"
@@ -404,9 +404,9 @@ generate_comprehensive_plot() {
 # Function to generate plot for a single dataset (kept for compatibility)
 generate_dataset_plot() {
     local dataset=$1
-
+
     echo -e "${YELLOW} 📊 Plotting $dataset results...${NC}"
-
+
     # Find the summary.json files
     ROUTER_SUMMARY=$(find "$OUTPUT_BASE/router_$dataset" -name "summary.json" -type f | head -1)
     VLLM_SUMMARY=$(find "$OUTPUT_BASE/vllm_$dataset" -name "summary.json" -type f | head -1)
@@ -419,7 +419,7 @@ generate_dataset_plot() {
         fi
 
         echo -e "${BLUE} Running: $PLOT_CMD${NC}"
-        eval $PLOT_CMD
+        eval "$PLOT_CMD"
         echo -e "${GREEN}$dataset plots generated in $OUTPUT_BASE/plots_$dataset${NC}"
     else
         echo -e "${RED} ⚠️ No vLLM summary.json found for $dataset, skipping plots${NC}"
@@ -429,7 +429,7 @@ generate_dataset_plot() {
 # Function to generate comparison plots (now just calls individual dataset plots)
 generate_plots() {
     echo -e "${BLUE}📈 Generating any remaining comparison plots...${NC}"
-
+
     for dataset in "${!DATASET_CONFIGS[@]}"; do
         # Check if plots already exist
         if [[ ! -d "$OUTPUT_BASE/plots_$dataset" ]]; then
@@ -447,15 +447,15 @@ generate_plots() {
 # Function to generate summary report
 generate_summary() {
     echo -e "${BLUE}📋 Generating research summary...${NC}"
-
+
     local summary_file="$OUTPUT_BASE/RESEARCH_SUMMARY.md"
-
+
     cat > "$summary_file" << EOF
 # Multi-Dataset Benchmark Research Report
 
 **Generated:** $(date)
 **Configuration:** Router vs vLLM Direct Comparison
-**Router Model:** $ROUTER_MODEL
+**Router Model:** $ROUTER_MODEL
 **vLLM Model:** $VLLM_MODEL
 
 ## Dataset Overview
@@ -539,7 +539,7 @@ EOF
 ## Usage Instructions
 
 1. **Review CSV files** for detailed numerical results
-2. **Examine plots** for visual comparison trends
+2. **Examine plots** for visual comparison trends
 3. **Analyze token usage** for efficiency insights
 4. **Compare across datasets** for model capability assessment
@@ -572,7 +572,7 @@ for dataset in "${DATASET_ORDER[@]}"; do
         echo -e "${YELLOW}⚠️ Dataset $dataset not configured, skipping...${NC}"
         continue
     fi
-
+
     dataset_count=$((dataset_count + 1))
     echo -e "${BLUE}🚀 Progress: Dataset $dataset_count/$total_datasets - Starting $dataset${NC}"
     run_dataset_benchmark "$dataset"
@@ -601,7 +601,7 @@ echo -e "${YELLOW}⏱️ Total Runtime:${NC} ${minutes}m ${seconds}s"
 echo ""
 echo -e "${BLUE}📋 Next Steps:${NC}"
 echo "1. 📊 **Master research data**: $PERSISTENT_RESEARCH_CSV"
-echo "2. 📊 **This run's data**: $OUTPUT_BASE/research_results.csv"
+echo "2. 📊 **This run's data**: $OUTPUT_BASE/research_results.csv"
 echo "3. 📋 Review research summary: $OUTPUT_BASE/RESEARCH_SUMMARY.md"
 echo "4. 📈 **View comprehensive charts**:"
 if [[ -f "$OUTPUT_BASE/accuracy_comparison.png" ]]; then
