Merged
Changes from all commits
12 changes: 11 additions & 1 deletion src/training/model_eval/mmlu_pro_vllm_eval.py
```diff
@@ -96,8 +96,18 @@ def get_available_models(endpoint: str, api_key: str = "") -> List[str]:
     try:
         models = client.models.list()
         return [model.id for model in models.data]
-    except:
+    except Exception as e:
+        print(f"Error communicating with vLLM endpoint: {e}")
+        # Try direct HTTP request as fallback
+        try:
+            response = requests.get(f"{endpoint}/models")
+            if response.status_code == 200:
+                models_data = response.json()
+                return [model["id"] for model in models_data.get("data", [])]
+            else:
+                print(f"Failed to get models list. Status code: {response.status_code}")
+        except Exception as e:
+            print(f"Failed to get models list via HTTP: {e}")
         return []
```


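A minimal usage sketch of the patched helper (the import path and endpoint URL are assumptions for illustration):

```python
# Sketch: list models served by a local vLLM endpoint. With this PR, a
# failed OpenAI-client call falls back to a plain HTTP GET on /models
# before giving up and returning [].
from mmlu_pro_vllm_eval import get_available_models  # hypothetical import path

models = get_available_models("http://localhost:8000/v1")
if models:
    print("Discovered models:", ", ".join(models))
else:
    print("No models discovered; is the vLLM server running?")
```
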
135 changes: 129 additions & 6 deletions src/training/model_eval/result_to_config.py
```diff
@@ -87,13 +87,45 @@ def generate_config_yaml(category_accuracies, similarity_threshold):
             "max_entries": 1000,
             "ttl_seconds": 3600,
         },
-        "classifier": {
-            "model_id": "models/category_classifier_modernbert-base_model",
-            "threshold": 0.1,
+        "tools": {
+            "enabled": True,
+            "top_k": 3,
+            "similarity_threshold": 0.2,
+            "tools_db_path": "config/tools_db.json",
+            "fallback_to_empty": True,
+        },
+        "prompt_guard": {
+            "enabled": True,
+            "use_modernbert": True,
+            "model_id": "models/jailbreak_classifier_modernbert-base_model",
+            "threshold": 0.7,
+            "use_cpu": True,
+            "category_mapping_path": "models/category_classifier_modernbert-base_model/category_mapping.json",
+            "jailbreak_mapping_path": "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json",
+        },
+        "gpu_config": {
+            "flops": 312000000000000,  # 312e12 fp16
+            "hbm": 2000000000000,  # 2e12 (2 TB/s)
+            "description": "A100-80G",
+        },
+        "classifier": {
+            "category_model": {
+                "model_id": "models/category_classifier_modernbert-base_model",
+                "use_modernbert": True,
+                "threshold": 0.6,
+                "use_cpu": True,
+                "category_mapping_path": "models/category_classifier_modernbert-base_model/category_mapping.json",
+            },
+            "pii_model": {
+                "model_id": "models/pii_classifier_modernbert-base_presidio_token_model",
+                "use_modernbert": True,
+                "threshold": 0.7,
+                "use_cpu": True,
+                "pii_mapping_path": "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json",
+            },
+            "load_aware": False,
+        },
         "categories": [],
+        "default_reasoning_effort": "medium",  # Default reasoning effort level (low, medium, high)
     }
 
     # Get the best model overall to use as default (excluding 'auto')
@@ -114,6 +146,80 @@ def generate_config_yaml(category_accuracies, similarity_threshold):
         default_model = max(model_avg_accuracies, key=model_avg_accuracies.get)
         config["default_model"] = default_model.split(":")[0]  # Remove the approach suffix
 
+    # Define category-specific reasoning settings
+    category_reasoning = {
+        "math": {
+            "use_reasoning": True,
+            "reasoning_description": "Mathematical problems require step-by-step reasoning",
+            "reasoning_effort": "high",
+        },
+        "physics": {
+            "use_reasoning": True,
+            "reasoning_description": "Physics concepts need logical analysis",
+            "reasoning_effort": "high",
+        },
+        "chemistry": {
+            "use_reasoning": True,
+            "reasoning_description": "Chemical reactions and formulas require systematic thinking",
+            "reasoning_effort": "high",
+        },
+        "computer science": {
+            "use_reasoning": True,
+            "reasoning_description": "Programming and algorithms need logical reasoning",
+            "reasoning_effort": "high",
+        },
+        "engineering": {
+            "use_reasoning": True,
+            "reasoning_description": "Engineering problems require systematic problem-solving",
+            "reasoning_effort": "high",
+        },
+        "biology": {
+            "use_reasoning": True,
+            "reasoning_description": "Biological processes benefit from structured analysis",
+            "reasoning_effort": "medium",
+        },
+        "business": {
+            "use_reasoning": False,
+            "reasoning_description": "Business content is typically conversational",
+            "reasoning_effort": "low",
+        },
+        "law": {
+            "use_reasoning": False,
+            "reasoning_description": "Legal content is typically explanatory",
+            "reasoning_effort": "medium",
+        },
+        "psychology": {
+            "use_reasoning": False,
+            "reasoning_description": "Psychology content is usually explanatory",
+            "reasoning_effort": "medium",
+        },
+        "history": {
+            "use_reasoning": False,
+            "reasoning_description": "Historical content is narrative-based",
+            "reasoning_effort": "low",
+        },
+        "economics": {
+            "use_reasoning": False,
+            "reasoning_description": "Economic discussions are usually explanatory",
+            "reasoning_effort": "medium",
+        },
+        "philosophy": {
+            "use_reasoning": False,
+            "reasoning_description": "Philosophical discussions are conversational",
+            "reasoning_effort": "medium",
+        },
+        "health": {
+            "use_reasoning": False,
+            "reasoning_description": "Health information is typically informational",
+            "reasoning_effort": "medium",
+        },
+        "other": {
+            "use_reasoning": False,
+            "reasoning_description": "General content doesn't require reasoning",
+            "reasoning_effort": "low",
+        },
+    }
+
     # Create category entries with ranked model-score pairs (excluding 'auto')
     for category, models in category_accuracies.items():
         # Sort models by accuracy (descending), exclude 'auto'
@@ -126,8 +232,25 @@ def generate_config_yaml(category_accuracies, similarity_threshold):
         model_scores = [
             {"model": model, "score": float(acc)} for model, acc in ranked_models
         ]
-        # Add category to config
-        config["categories"].append({"name": category, "model_scores": model_scores})
+        # Get reasoning settings for the category
+        reasoning_settings = category_reasoning.get(
+            category.lower(),
+            {
+                "use_reasoning": False,
+                "reasoning_description": "General content doesn't require reasoning",
+                "reasoning_effort": "low",
+            },
+        )
+        # Add category to config with reasoning settings
+        config["categories"].append(
+            {
+                "name": category,
+                "use_reasoning": reasoning_settings["use_reasoning"],
+                "reasoning_description": reasoning_settings["reasoning_description"],
+                "reasoning_effort": reasoning_settings["reasoning_effort"],
+                "model_scores": model_scores,
+            }
+        )
 
     return config
```
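
For context, a sketch of driving `generate_config_yaml` end to end; the shape of `category_accuracies` (category → `model:approach` → accuracy) is inferred from the code above, and all names and scores are placeholders:

```python
# Hypothetical driver for generate_config_yaml (placeholder accuracies).
# The "model:approach" key format matches the split(":") in the function.
import yaml  # PyYAML

from result_to_config import generate_config_yaml  # assumed import path

category_accuracies = {
    "math": {"phi4:cot": 0.82, "mistral-small3.1:cot": 0.71},
    "history": {"phi4:cot": 0.64, "mistral-small3.1:cot": 0.69},
}

config = generate_config_yaml(category_accuracies, similarity_threshold=0.80)

# Persist the result the same way the CLI would write config/config.yaml.
with open("config/config.yaml", "w") as f:
    yaml.safe_dump(config, f, sort_keys=False)
```
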
97 changes: 96 additions & 1 deletion website/docs/getting-started/configuration.md
@@ -439,10 +439,105 @@ make test-pii # PII detection
make test-prompt-guard # Jailbreak protection

## Configuration Generation

The Semantic Router supports automated configuration generation based on model performance benchmarks. This workflow uses MMLU-Pro evaluation results to determine optimal model routing for different categories.

### Benchmarking Workflow

1. **Run MMLU-Pro Evaluation:**
```bash
# Evaluate models using MMLU-Pro benchmark
python src/training/model_eval/mmlu_pro_vllm_eval.py \
--endpoint http://localhost:8000/v1 \
--models phi4,gemma3:27b,mistral-small3.1 \
--samples-per-category 5 \
--use-cot \
--concurrent-requests 4 \
--output-dir results
```

2. **Generate Configuration:**
```bash
# Generate config.yaml from benchmark results
python src/training/model_eval/result_to_config.py \
--results-dir results \
--output-file config/config.yaml \
--similarity-threshold 0.80
```

### Generated Configuration Features

The generated configuration includes the following (an illustrative category entry appears after this list):

- **Model Performance Rankings:** Models are ranked by performance for each category
- **Reasoning Settings:** Automatically configures reasoning requirements per category:
- `use_reasoning`: Whether to use step-by-step reasoning
- `reasoning_description`: Description of reasoning approach
- `reasoning_effort`: Required effort level (low/medium/high)
- **Default Model Selection:** Best overall performing model is set as default
- **Security and Performance Settings:** Pre-configured optimal values for:
- PII detection thresholds
- Semantic cache settings
- Tool selection parameters
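
For illustration, each generated category entry has this shape (keys as emitted by `result_to_config.py`; the model names and scores here are placeholders):

```python
# Illustrative category entry as appended to config["categories"]
# (placeholder model names and scores):
{
    "name": "math",
    "use_reasoning": True,
    "reasoning_description": "Mathematical problems require step-by-step reasoning",
    "reasoning_effort": "high",
    "model_scores": [
        {"model": "phi4", "score": 0.82},
        {"model": "mistral-small3.1", "score": 0.71},
    ],
}
```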

### Customizing Generated Config

The generated config.yaml can be customized by:

1. Editing category-specific settings in `result_to_config.py` (see the sketch below this list)
2. Adjusting thresholds and parameters via command line arguments
3. Manually modifying the generated config.yaml
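
For example, option 1 amounts to editing the `category_reasoning` table in `result_to_config.py` before regenerating; a sketch (the `law` override shown here is purely illustrative, not a recommendation):

```python
# Sketch: flip one category's reasoning defaults before re-running the
# generator. The values below are illustrative.
category_reasoning["law"] = {
    "use_reasoning": True,
    "reasoning_description": "Legal analysis benefits from structured reasoning",
    "reasoning_effort": "high",
}
```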

### Example Workflow

Here's a complete example workflow for generating and testing a configuration:

```bash
# Run MMLU-Pro evaluation
# Option 1: Specify models manually
python src/training/model_eval/mmlu_pro_vllm_eval.py \
--endpoint http://localhost:8000/v1 \
--models phi4,gemma3:27b,mistral-small3.1 \
--samples-per-category 5 \
--use-cot \
--concurrent-requests 4 \
--output-dir results \
--max-tokens 2048 \
--temperature 0.0 \
--seed 42

# Option 2: Auto-discover models from endpoint
python src/training/model_eval/mmlu_pro_vllm_eval.py \
--endpoint http://localhost:8000/v1 \
--samples-per-category 5 \
--use-cot \
--concurrent-requests 4 \
--output-dir results \
--max-tokens 2048 \
--temperature 0.0 \
--seed 42

# Generate initial config
python src/training/model_eval/result_to_config.py \
--results-dir results \
--output-file config/config.yaml \
--similarity-threshold 0.80

# Test the generated config
make test
```
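
As an additional check before deployment, a quick structural validation of the generated file can catch obvious problems early (a sketch; the key names follow `result_to_config.py` in this PR):

```python
# Sanity-check config/config.yaml produced by the generator.
import yaml  # PyYAML

with open("config/config.yaml") as f:
    config = yaml.safe_load(f)

assert "default_model" in config, "generator should always set a default model"
for category in config["categories"]:
    assert category["reasoning_effort"] in {"low", "medium", "high"}
print(f"{len(config['categories'])} categories, default = {config['default_model']}")
```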

This workflow ensures your configuration is:
- Based on actual model performance
- Properly tested before deployment
- Version controlled for tracking changes
- Optimized for your specific use case

## Next Steps

- **[Installation Guide](installation.md)** - Setup instructions
- **[Quick Start Guide](installation.md)** - Basic usage examples
- **[API Documentation](../api/router.md)** - Complete API reference

The configuration system is designed to be simple yet powerful. Start with the basic configuration and gradually enable advanced features as needed.