Commit d1baf93

Merge pull request #242 from codelion/fix-readme
Fix readme
2 parents 37e9f33 + e01d502 commit d1baf93

File tree: 5 files changed, +76 / -49 lines

optillm/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 # Version information
-__version__ = "0.2.3"
+__version__ = "0.2.4"
 
 # Import from server module
 from .server import (

optillm/plugins/proxy/README.md

Lines changed: 58 additions & 42 deletions
@@ -51,7 +51,7 @@ routing:
 # Option A: Use proxy as default for ALL requests (recommended)
 optillm --approach proxy
 
-# Option B: Start server normally (requires model prefix or extra_body)
+# Option B: Start server normally (use model prefix or extra_body per request)
 optillm
 
 # With custom port
@@ -60,33 +60,34 @@ optillm --approach proxy --port 8000
 
 ### 3. Usage Examples
 
-#### When using `--approach proxy` (Recommended)
+#### Method 1: Using --approach proxy (Recommended)
 ```bash
-# No need for "proxy-" prefix! The proxy handles all requests automatically
+# Start server with proxy as default approach
+optillm --approach proxy
+
+# Then make normal requests - proxy handles all routing automatically!
 curl -X POST http://localhost:8000/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
     "model": "gpt-4",
     "messages": [{"role": "user", "content": "Hello"}]
   }'
-
-# The proxy will:
-# 1. Route to one of your configured providers
-# 2. Apply model mapping if configured
-# 3. Handle failover automatically
 ```
 
-#### Without `--approach proxy` flag
+#### Method 2: Using Model Prefix (when server started without --approach proxy)
 ```bash
-# Method 1: Use model prefix
+# Use "proxy-" prefix to activate the proxy plugin
 curl -X POST http://localhost:8000/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
     "model": "proxy-gpt-4",
     "messages": [{"role": "user", "content": "Hello"}]
   }'
+```
 
-# Method 2: Use extra_body
+#### Method 3: Using extra_body (when server started without --approach proxy)
+```bash
+# Use extra_body parameter
 curl -X POST http://localhost:8000/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
@@ -98,44 +99,37 @@ curl -X POST http://localhost:8000/v1/chat/completions \
   }'
 ```
 
-#### Proxy with Approach/Plugin
+Both methods will:
+- Route to one of your configured providers
+- Apply model mapping if configured
+- Handle failover automatically
+
+#### Combined Approaches
 ```bash
-# Use MOA approach with proxy load balancing
+# Apply BON sampling, then route through proxy
 curl -X POST http://localhost:8000/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "gpt-4",
-    "messages": [{"role": "user", "content": "Solve this problem"}],
-    "extra_body": {
-      "optillm_approach": "proxy",
-      "proxy_wrap": "moa"
-    }
+    "model": "bon&proxy-gpt-4",
+    "messages": [{"role": "user", "content": "Generate ideas"}]
   }'
+```
 
-# Use memory plugin with proxy
+#### Proxy Wrapping Other Approaches
+```bash
+# Use proxy to wrap MOA approach
 curl -X POST http://localhost:8000/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
     "model": "gpt-4",
-    "messages": [{"role": "user", "content": "Remember this"}],
+    "messages": [{"role": "user", "content": "Solve this problem"}],
     "extra_body": {
       "optillm_approach": "proxy",
-      "proxy_wrap": "memory"
+      "proxy_wrap": "moa"
     }
   }'
 ```
 
-#### Combined Approaches
-```bash
-# Apply BON sampling, then route through proxy
-curl -X POST http://localhost:8000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "bon&proxy-gpt-4",
-    "messages": [{"role": "user", "content": "Generate ideas"}]
-  }'
-```
-
 ## Configuration Reference
 
 ### Provider Configuration
@@ -203,7 +197,7 @@ providers:
 
 ### Model-Specific Routing
 
-When using `--approach proxy`, the proxy automatically maps model names to provider-specific deployments:
+The proxy automatically maps model names to provider-specific deployments:
 
 ```yaml
 providers:
@@ -222,9 +216,9 @@ providers:
     # No model_map needed - uses model names as-is
 ```
 
-With this configuration and `optillm --approach proxy`:
-- Request for "gpt-4" → Azure uses "gpt-4-deployment-001", OpenAI uses "gpt-4"
-- Request for "gpt-3.5-turbo" → Azure uses "gpt-35-turbo-deployment", OpenAI uses "gpt-3.5-turbo"
+With this configuration and `proxy-gpt-4` model requests:
+- Request for "proxy-gpt-4" → Azure uses "gpt-4-deployment-001", OpenAI uses "gpt-4"
+- Request for "proxy-gpt-3.5-turbo" → Azure uses "gpt-35-turbo-deployment", OpenAI uses "gpt-3.5-turbo"
 
 ### Failover Configuration
 
@@ -358,19 +352,35 @@ client = OpenAI(
     api_key="dummy"  # Can be any string when using proxy
 )
 
-# If server started with --approach proxy:
+# Method 1: Server started with --approach proxy (recommended)
+# Just make normal requests - proxy handles everything!
+response = client.chat.completions.create(
+    model="gpt-4",
+    messages=[{"role": "user", "content": "Hello"}]
+)
+
+# Method 2: Use proxy with model prefix
 response = client.chat.completions.create(
-    model="gpt-4",  # No "proxy-" prefix needed!
+    model="proxy-gpt-4",  # Use "proxy-" prefix
     messages=[{"role": "user", "content": "Hello"}]
 )
 
-# Or explicitly use proxy with another approach:
+# Method 3: Use extra_body
+response = client.chat.completions.create(
+    model="gpt-4",
+    messages=[{"role": "user", "content": "Hello"}],
+    extra_body={
+        "optillm_approach": "proxy"
+    }
+)
+
+# Method 4: Proxy wrapping another approach
 response = client.chat.completions.create(
     model="gpt-4",
     messages=[{"role": "user", "content": "Hello"}],
     extra_body={
         "optillm_approach": "proxy",
-        "proxy_wrap": "moa"  # Proxy will route MOA's requests
+        "proxy_wrap": "moa"
     }
 )
 ```
@@ -379,12 +389,18 @@ response = client.chat.completions.create(
 ```python
 from langchain.llms import OpenAI
 
-# If server started with --approach proxy:
+# If server started with --approach proxy (recommended)
 llm = OpenAI(
     openai_api_base="http://localhost:8000/v1",
     model_name="gpt-4"  # Proxy handles routing automatically
 )
 
+# Or use proxy with model prefix
+llm = OpenAI(
+    openai_api_base="http://localhost:8000/v1",
+    model_name="proxy-gpt-4"  # Use "proxy-" prefix
+)
+
 response = llm("What is the meaning of life?")
 ```
 

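The updated README's combined-approach example above is curl-only; the sketch below is a minimal Python equivalent, assuming an optillm server running on localhost:8000 and the standard `openai` SDK. The `bon&proxy-` model prefix follows the syntax documented in the README diff above.

```python
from openai import OpenAI

# Point the OpenAI SDK at the local optillm server; the API key can be any string.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")

# Combined approach from the README: apply BON sampling, then route the request
# through the proxy plugin via the "bon&proxy-" model name prefix.
response = client.chat.completions.create(
    model="bon&proxy-gpt-4",
    messages=[{"role": "user", "content": "Generate ideas"}],
)
print(response.choices[0].message.content)
```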
optillm/plugins/proxy/client.py

Lines changed: 12 additions & 3 deletions
@@ -114,6 +114,15 @@ class _Completions:
     def __init__(self, proxy_client):
         self.proxy_client = proxy_client
 
+    def _filter_kwargs(self, kwargs: dict) -> dict:
+        """Filter out OptiLLM-specific parameters that shouldn't be sent to providers"""
+        optillm_params = {
+            'optillm_approach', 'proxy_wrap', 'wrapped_approach', 'wrap',
+            'mcts_simulations', 'mcts_exploration', 'mcts_depth',
+            'best_of_n', 'rstar_max_depth', 'rstar_num_rollouts', 'rstar_c'
+        }
+        return {k: v for k, v in kwargs.items() if k not in optillm_params}
+
     def create(self, **kwargs):
         """Create completion with load balancing and failover"""
         model = kwargs.get('model', 'unknown')
@@ -145,8 +154,8 @@ def create(self, **kwargs):
                 attempted_providers.add(provider)
 
                 try:
-                    # Map model name if needed
-                    request_kwargs = kwargs.copy()
+                    # Map model name if needed and filter out OptiLLM-specific parameters
+                    request_kwargs = self._filter_kwargs(kwargs.copy())
                     request_kwargs['model'] = provider.map_model(model)
 
                     # Track timing
@@ -177,7 +186,7 @@ def create(self, **kwargs):
         if self.proxy_client.fallback_client:
             logger.warning("All proxy providers failed, using fallback client")
             try:
-                return self.proxy_client.fallback_client.chat.completions.create(**kwargs)
+                return self.proxy_client.fallback_client.chat.completions.create(**self._filter_kwargs(kwargs))
             except Exception as e:
                 errors.append(("fallback_client", str(e)))

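The effect of the new filtering can be seen in a small standalone sketch (mirroring the `_filter_kwargs` logic from the diff above, not importing the actual module): optillm control parameters are dropped before a request is forwarded to a provider or the fallback client.

```python
# Standalone sketch of the _filter_kwargs behaviour introduced in this commit.
OPTILLM_PARAMS = {
    'optillm_approach', 'proxy_wrap', 'wrapped_approach', 'wrap',
    'mcts_simulations', 'mcts_exploration', 'mcts_depth',
    'best_of_n', 'rstar_max_depth', 'rstar_num_rollouts', 'rstar_c',
}

def filter_kwargs(kwargs: dict) -> dict:
    """Drop optillm-specific keys so only provider-compatible arguments remain."""
    return {k: v for k, v in kwargs.items() if k not in OPTILLM_PARAMS}

request = {
    "model": "gpt-4",
    "messages": [{"role": "user", "content": "Hello"}],
    "optillm_approach": "proxy",  # control parameter, stripped before forwarding
    "proxy_wrap": "moa",          # control parameter, stripped before forwarding
}
print(filter_kwargs(request))
# {'model': 'gpt-4', 'messages': [{'role': 'user', 'content': 'Hello'}]}
```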
optillm/server.py

Lines changed: 4 additions & 2 deletions
@@ -992,12 +992,14 @@ def main():
     global request_batcher
     global conversation_logger
     # Call this function at the start of main()
+
+    # Load plugins first so they're available in argument parser
+    load_plugins()
+
     args = parse_args()
     # Update server_config with all argument values
     server_config.update(vars(args))
 
-    load_plugins()
-
     port = server_config['port']
 
     # Initialize request batcher if batch mode is enabled

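The reordering matters because plugins can contribute approach names (such as `proxy`) that the `--approach` argument needs to accept. A minimal, hypothetical sketch of that pattern (illustrative only, not optillm's actual parser code):

```python
import argparse

def load_plugins():
    # Hypothetical stand-in: in optillm this discovers plugin modules and
    # registers the approach names they provide (e.g. "proxy").
    return ["none", "moa", "bon", "proxy"]

def parse_args(known_approaches):
    parser = argparse.ArgumentParser()
    # --approach can only offer plugin-provided choices if the plugins were
    # loaded before the parser was built, hence load_plugins() runs first.
    parser.add_argument("--approach", choices=known_approaches, default="none")
    parser.add_argument("--port", type=int, default=8000)
    return parser.parse_args()

approaches = load_plugins()    # load plugins first...
args = parse_args(approaches)  # ...so the parser can see their approaches
```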
pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "optillm"
-version = "0.2.3"
+version = "0.2.4"
 description = "An optimizing inference proxy for LLMs."
 readme = "README.md"
 license = "Apache-2.0"
