2 changes: 1 addition & 1 deletion optillm/__init__.py
@@ -1,5 +1,5 @@
# Version information
__version__ = "0.2.3"
__version__ = "0.2.4"

# Import from server module
from .server import (
100 changes: 58 additions & 42 deletions optillm/plugins/proxy/README.md
@@ -51,7 +51,7 @@ routing:
# Option A: Use proxy as default for ALL requests (recommended)
optillm --approach proxy

# Option B: Start server normally (requires model prefix or extra_body)
# Option B: Start server normally (use model prefix or extra_body per request)
optillm

# With custom port
@@ -60,33 +60,34 @@ optillm --approach proxy --port 8000

### 3. Usage Examples

#### When using `--approach proxy` (Recommended)
#### Method 1: Using --approach proxy (Recommended)
```bash
# No need for "proxy-" prefix! The proxy handles all requests automatically
# Start server with proxy as default approach
optillm --approach proxy

# Then make normal requests - proxy handles all routing automatically!
curl -X POST http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-4",
"messages": [{"role": "user", "content": "Hello"}]
}'

# The proxy will:
# 1. Route to one of your configured providers
# 2. Apply model mapping if configured
# 3. Handle failover automatically
```

#### Without `--approach proxy` flag
#### Method 2: Using Model Prefix (when server started without --approach proxy)
```bash
# Method 1: Use model prefix
# Use "proxy-" prefix to activate the proxy plugin
curl -X POST http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "proxy-gpt-4",
"messages": [{"role": "user", "content": "Hello"}]
}'
```

# Method 2: Use extra_body
#### Method 3: Using extra_body (when server started without --approach proxy)
```bash
# Use extra_body parameter
curl -X POST http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
@@ -98,44 +99,37 @@ curl -X POST http://localhost:8000/v1/chat/completions \
}'
```

#### Proxy with Approach/Plugin
Both the model-prefix and `extra_body` methods will:
- Route to one of your configured providers
- Apply model mapping if configured
- Handle failover automatically

#### Combined Approaches
```bash
# Use MOA approach with proxy load balancing
# Apply BON sampling, then route through proxy
curl -X POST http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-4",
"messages": [{"role": "user", "content": "Solve this problem"}],
"extra_body": {
"optillm_approach": "proxy",
"proxy_wrap": "moa"
}
"model": "bon&proxy-gpt-4",
"messages": [{"role": "user", "content": "Generate ideas"}]
}'
```

# Use memory plugin with proxy
#### Proxy Wrapping Other Approaches
```bash
# Use proxy to wrap MOA approach
curl -X POST http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-4",
"messages": [{"role": "user", "content": "Remember this"}],
"messages": [{"role": "user", "content": "Solve this problem"}],
"extra_body": {
"optillm_approach": "proxy",
"proxy_wrap": "memory"
"proxy_wrap": "moa"
}
}'
```

#### Combined Approaches
```bash
# Apply BON sampling, then route through proxy
curl -X POST http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "bon&proxy-gpt-4",
"messages": [{"role": "user", "content": "Generate ideas"}]
}'
```

## Configuration Reference

### Provider Configuration
@@ -203,7 +197,7 @@ providers:

### Model-Specific Routing

When using `--approach proxy`, the proxy automatically maps model names to provider-specific deployments:
The proxy automatically maps model names to provider-specific deployments:

```yaml
providers:
@@ -222,9 +216,9 @@ providers:
# No model_map needed - uses model names as-is
```

With this configuration and `optillm --approach proxy`:
- Request for "gpt-4" → Azure uses "gpt-4-deployment-001", OpenAI uses "gpt-4"
- Request for "gpt-3.5-turbo" → Azure uses "gpt-35-turbo-deployment", OpenAI uses "gpt-3.5-turbo"
With this configuration and `proxy-gpt-4` model requests:
- Request for "proxy-gpt-4" → Azure uses "gpt-4-deployment-001", OpenAI uses "gpt-4"
- Request for "proxy-gpt-3.5-turbo" → Azure uses "gpt-35-turbo-deployment", OpenAI uses "gpt-3.5-turbo"

### Failover Configuration

@@ -358,19 +352,35 @@ client = OpenAI(
api_key="dummy" # Can be any string when using proxy
)

# If server started with --approach proxy:
# Method 1: Server started with --approach proxy (recommended)
# Just make normal requests - proxy handles everything!
response = client.chat.completions.create(
model="gpt-4",
messages=[{"role": "user", "content": "Hello"}]
)

# Method 2: Use proxy with model prefix
response = client.chat.completions.create(
model="gpt-4", # No "proxy-" prefix needed!
model="proxy-gpt-4", # Use "proxy-" prefix
messages=[{"role": "user", "content": "Hello"}]
)

# Or explicitly use proxy with another approach:
# Method 3: Use extra_body
response = client.chat.completions.create(
model="gpt-4",
messages=[{"role": "user", "content": "Hello"}],
extra_body={
"optillm_approach": "proxy"
}
)

# Method 4: Proxy wrapping another approach
response = client.chat.completions.create(
model="gpt-4",
messages=[{"role": "user", "content": "Hello"}],
extra_body={
"optillm_approach": "proxy",
"proxy_wrap": "moa" # Proxy will route MOA's requests
"proxy_wrap": "moa"
}
)
```
@@ -379,12 +389,18 @@ response = client.chat.completions.create(
```python
from langchain.llms import OpenAI

# If server started with --approach proxy:
# If server started with --approach proxy (recommended)
llm = OpenAI(
openai_api_base="http://localhost:8000/v1",
model_name="gpt-4" # Proxy handles routing automatically
)

# Or use proxy with model prefix
llm = OpenAI(
openai_api_base="http://localhost:8000/v1",
model_name="proxy-gpt-4" # Use "proxy-" prefix
)

response = llm("What is the meaning of life?")
```

15 changes: 12 additions & 3 deletions optillm/plugins/proxy/client.py
@@ -114,6 +114,15 @@ class _Completions:
def __init__(self, proxy_client):
self.proxy_client = proxy_client

def _filter_kwargs(self, kwargs: dict) -> dict:
"""Filter out OptiLLM-specific parameters that shouldn't be sent to providers"""
optillm_params = {
'optillm_approach', 'proxy_wrap', 'wrapped_approach', 'wrap',
'mcts_simulations', 'mcts_exploration', 'mcts_depth',
'best_of_n', 'rstar_max_depth', 'rstar_num_rollouts', 'rstar_c'
}
return {k: v for k, v in kwargs.items() if k not in optillm_params}

def create(self, **kwargs):
"""Create completion with load balancing and failover"""
model = kwargs.get('model', 'unknown')
@@ -145,8 +154,8 @@ def create(self, **kwargs):
attempted_providers.add(provider)

try:
# Map model name if needed
request_kwargs = kwargs.copy()
# Map model name if needed and filter out OptiLLM-specific parameters
request_kwargs = self._filter_kwargs(kwargs.copy())
request_kwargs['model'] = provider.map_model(model)

# Track timing
@@ -177,7 +186,7 @@ def create(self, **kwargs):
if self.proxy_client.fallback_client:
logger.warning("All proxy providers failed, using fallback client")
try:
return self.proxy_client.fallback_client.chat.completions.create(**kwargs)
return self.proxy_client.fallback_client.chat.completions.create(**self._filter_kwargs(kwargs))
except Exception as e:
errors.append(("fallback_client", str(e)))

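To make the new `_filter_kwargs` behaviour concrete, here is a standalone sketch of the same idea (a simplified illustration, not the plugin module itself):

```python
# Standalone sketch of the parameter filtering introduced in _filter_kwargs.
OPTILLM_PARAMS = {"optillm_approach", "proxy_wrap", "wrapped_approach", "wrap"}

def filter_kwargs(kwargs: dict) -> dict:
    """Drop OptiLLM-specific keys so only provider-supported parameters are forwarded."""
    return {k: v for k, v in kwargs.items() if k not in OPTILLM_PARAMS}

request = {
    "model": "gpt-4",
    "messages": [{"role": "user", "content": "Hello"}],
    "optillm_approach": "proxy",  # consumed by OptiLLM, never sent upstream
    "proxy_wrap": "moa",
}
print(filter_kwargs(request))  # only "model" and "messages" remain
```
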
6 changes: 4 additions & 2 deletions optillm/server.py
@@ -992,12 +992,14 @@ def main():
global request_batcher
global conversation_logger
# Call this function at the start of main()

# Load plugins first so they're available in argument parser
load_plugins()

args = parse_args()
# Update server_config with all argument values
server_config.update(vars(args))

load_plugins()

port = server_config['port']

# Initialize request batcher if batch mode is enabled
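
The server.py reordering above ensures plugins are loaded before the argument parser is built. The toy example below shows why the order matters when plugin-discovered approach names feed into the parser (a hypothetical sketch, not OptiLLM's real parser):

```python
# Hypothetical sketch: approach names discovered from plugins must exist
# before the CLI parser that offers them as choices is constructed.
import argparse

def load_plugins():
    # pretend these approaches were discovered from plugin modules
    return ["proxy", "memory", "moa"]

def parse_args(plugin_approaches, argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("--approach", choices=["none", *plugin_approaches], default="none")
    return parser.parse_args(argv)

args = parse_args(load_plugins(), ["--approach", "proxy"])  # plugins loaded first
print(args.approach)  # proxy
```
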
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "optillm"
version = "0.2.3"
version = "0.2.4"
description = "An optimizing inference proxy for LLMs."
readme = "README.md"
license = "Apache-2.0"