Skip to content

Commit a2f908b

Browse files
committed
removing HF token
1 parent 276ad1f commit a2f908b

File tree

8 files changed

+219
-95
lines changed

8 files changed

+219
-95
lines changed
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# End Goal:
2+
3+
## To be compatible with `1-infrastructure/`
4+
You must deploy your inference setup to a Kubernetes cluster.
5+
6+
## To allow `3-workloads/` to run workloads
7+
Run a server that answers queries at localhost:30080/v1/completions/
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# SGLang inference server: single-replica Deployment plus a LoadBalancer
# Service exposing the OpenAI-compatible HTTP API on port 30000.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: sglang-deployment
spec:
  replicas: 1
  strategy:
    # Recreate (not RollingUpdate): the single GPU cannot host two pods at
    # once, so the old pod must be torn down before the new one starts.
    type: Recreate
  selector:
    matchLabels:
      app: sglang-server
  template:
    metadata:
      labels:
        app: sglang-server
        engine: sglang
    spec:
      restartPolicy: Always
      containers:
        - name: sglang-container
          image: docker.io/lmsysorg/sglang:latest
          imagePullPolicy: Always # IfNotPresent or Never
          ports:
            - containerPort: 30000
          command: ["python3", "-m", "sglang.launch_server"]
          args: ["--model-path", "MODEL_URL_PLACEHOLDER", "--host", "0.0.0.0", "--port", "30000"]
          env:
            # Pull the Hugging Face token from a Secret rather than
            # hard-coding it in this manifest. Create it with:
            #   kubectl create secret generic hf-token-secret \
            #     --from-literal=token=<YOUR_HF_TOKEN>
            # `optional: true` keeps the pod schedulable without the Secret,
            # which is sufficient for public (non-gated) models.
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token-secret
                  key: token
                  optional: true
            # NOTE(review): TRANSFORMERS_CACHE is deprecated in recent
            # transformers releases in favor of HF_HOME — kept as-is for
            # compatibility with the transformers version baked into the
            # image; confirm before switching.
            - name: TRANSFORMERS_CACHE
              value: /huggingface-cache
          resources:
            requests:
              memory: "16Gi"
              nvidia.com/gpu: 1
            limits:
              nvidia.com/gpu: 1
          volumeMounts:
            # SGLang uses shared memory for inter-process tensor transfer;
            # the default 64Mi /dev/shm is far too small.
            - name: shm
              mountPath: /dev/shm
            - name: hf-cache
              mountPath: /huggingface-cache
            - name: localtime
              mountPath: /etc/localtime
              readOnly: true
          livenessProbe:
            httpGet:
              path: /health
              port: 30000
            # Generous initial delay: model download + weight loading can
            # take a while on first start.
            initialDelaySeconds: 30
            periodSeconds: 10
      volumes:
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 10Gi
        # Model weights cache; emptyDir, so weights are re-downloaded when
        # the pod is rescheduled.
        - name: hf-cache
          emptyDir:
            sizeLimit: 20Gi
        # Sync container clock display with the host timezone.
        - name: localtime
          hostPath:
            path: /etc/localtime
            type: File
---
apiVersion: v1
kind: Service
metadata:
  name: sglang-service
spec:
  selector:
    app: sglang-server
  ports:
    - protocol: TCP
      port: 30000       # port exposed by the Service
      targetPort: 30000 # containerPort inside the pod
  type: LoadBalancer

2-serving-engines/sglang/run-sglang.sh

Whitespace-only changes.

3-workloads/agentic/agentic-qa.py

Lines changed: 81 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -144,68 +144,75 @@ async def _async_launch_request(self, messages: List[Dict[str, str]], max_token
144144
start_time = time.time()
145145
first_token_time = None
146146

147-
# Make the request
148-
response = await self.client.chat.completions.create(
149-
model=model,
150-
messages=messages,
151-
stream=True,
152-
max_tokens=max_tokens,
153-
temperature=0.0,
154-
stream_options={"include_usage": True},
155-
extra_headers=extra_headers,
156-
)
147+
try:
148+
# Make the request
149+
response = await self.client.chat.completions.create(
150+
model=model,
151+
messages=messages,
152+
stream=True,
153+
max_tokens=max_tokens,
154+
temperature=0.0,
155+
stream_options={"include_usage": True},
156+
extra_headers=extra_headers,
157+
)
158+
159+
# Process the streaming response
160+
async for chunk in response:
161+
if not chunk.choices:
162+
continue
157163

158-
# Process the streaming response
159-
async for chunk in response:
160-
if not chunk.choices:
161-
continue
162-
163-
# Handle content
164-
if chunk.choices[0].delta.content is not None:
165-
if first_token_time is None and chunk.choices[0].delta.content != "":
166-
first_token_time = time.time()
167-
words += chunk.choices[0].delta.content
168-
169-
# Handle token counts if available
170-
if hasattr(chunk, 'usage') and chunk.usage is not None:
171-
tokens_out = chunk.usage.completion_tokens
172-
tokens_prefill = chunk.usage.prompt_tokens
173-
174-
# If we didn't get token counts from streaming, try to get them from the final response
175-
if tokens_out == 0 or tokens_prefill == 0:
176-
print("No token counts from streaming, getting final response")
177-
print(f"{tokens_out}, {tokens_prefill}")
178-
try:
179-
final_response = await self.client.chat.completions.create(
180-
model=model,
181-
messages=messages,
182-
stream=False,
183-
)
184-
if hasattr(final_response, 'usage') and final_response.usage is not None:
185-
tokens_out = final_response.usage.completion_tokens
186-
tokens_prefill = final_response.usage.prompt_tokens
187-
except Exception as e:
188-
logging.warning(f"Failed to get token counts from final response: {e}")
189-
190-
# # Calculate timing metrics
191-
ttft = first_token_time - start_time if first_token_time else 0
192-
generation_time = time.time() - first_token_time if first_token_time else 0
193-
194-
return Response(
195-
body=words,
196-
ttft=ttft,
197-
generation_time=generation_time,
198-
prompt_tokens=tokens_prefill,
199-
generation_tokens=tokens_out,
200-
launch_time=start_time,
201-
finish_time=time.time(),
202-
agentID=agentID,
203-
)
164+
# Handle content
165+
if chunk.choices[0].delta.content is not None:
166+
if first_token_time is None and chunk.choices[0].delta.content != "":
167+
first_token_time = time.time()
168+
words += chunk.choices[0].delta.content
169+
170+
# Handle token counts if available
171+
if hasattr(chunk, 'usage') and chunk.usage is not None:
172+
tokens_out = chunk.usage.completion_tokens
173+
tokens_prefill = chunk.usage.prompt_tokens
174+
175+
# If we didn't get token counts from streaming, try to get them from the final response
176+
if tokens_out == 0 or tokens_prefill == 0:
177+
print("No token counts from streaming, getting final response")
178+
print(f"{tokens_out}, {tokens_prefill}")
179+
try:
180+
final_response = await self.client.chat.completions.create(
181+
model=model,
182+
messages=messages,
183+
stream=False,
184+
)
185+
if hasattr(final_response, 'usage') and final_response.usage is not None:
186+
tokens_out = final_response.usage.completion_tokens
187+
tokens_prefill = final_response.usage.prompt_tokens
188+
except Exception as e:
189+
logging.warning(f"Failed to get token counts from final response: {e}")
190+
191+
# # Calculate timing metrics
192+
ttft = first_token_time - start_time if first_token_time else 0
193+
generation_time = time.time() - first_token_time if first_token_time else 0
194+
195+
return Response(
196+
body=words,
197+
ttft=ttft,
198+
generation_time=generation_time,
199+
prompt_tokens=tokens_prefill,
200+
generation_tokens=tokens_out,
201+
launch_time=start_time,
202+
finish_time=time.time(),
203+
agentID=agentID,
204+
)
205+
except openai.BadRequestError as e:
206+
logging.warning(f"BadRequestError with model {model}: {e}")
207+
return None
208+
except Exception as e:
209+
logging.error(f"Error during request to model {model}: {e}")
210+
return None
204211

205212
except Exception as e:
206213
logging.error(f"Error in _async_launch_request: {str(e)}")
207214
logging.error(f"Request details - model: {model}, messages: {messages}")
208-
raise
215+
return None
209216

210217
def launch_request(
211218
self,
@@ -218,11 +225,18 @@ def launch_request(
218225
"""
219226
finish_callback: Callable[[Response, int], None]
220227
"""
221-
real_callback = lambda x: finish_callback(x.result(), agentID)
228+
def safe_callback(future):
229+
try:
230+
result = future.result()
231+
# The callback will handle the None case
232+
finish_callback(result, agentID)
233+
except Exception as e:
234+
logger.error(f"Error in callback: {e}")
235+
222236
future = asyncio.run_coroutine_threadsafe(
223237
self._async_launch_request(messages, max_tokens, agentID, extra_headers), self.loop
224238
)
225-
future.add_done_callback(real_callback)
239+
future.add_done_callback(safe_callback)
226240

227241

228242
class UserSession:
@@ -309,7 +323,13 @@ def _launch_new_request(self, timestamp: float, request_executor: RequestExecuto
309323
self.has_unfinished_request = True
310324
self.last_request_time = timestamp
311325

312-
def _on_request_finished(self, response: Response, agentID: int):
326+
def _on_request_finished(self, response: Optional[Response], agentID: int):
327+
if response is None:
328+
logger.warning(f"User {self.user_config.user_id} request failed (likely context length exceeded)")
329+
self.has_unfinished_request = False
330+
self.finished = True # Mark session as finished when request fails
331+
return
332+
313333
if self.user_config.whole_history:
314334
self.chat_history.on_system_response_whole(response.body, agentID)
315335
else:
@@ -666,7 +686,7 @@ def main():
666686
f"When --trace-file is omitted, you MUST supply: {', '.join(missing)}"
667687
)
668688

669-
# From here on you know youre in exactly one mode:
689+
# From here on you know you're in exactly one mode:
670690
if args.trace_file:
671691
print("Running in trace‑mode, loading:", args.trace_file)
672692
else:

3-workloads/agentic/run_agentic.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ for interval in "${NEW_USER_INTERVALS[@]}"; do
8686
# Change to project root before running summarize.py
8787
cd "$PROJECT_ROOT"
8888
python3 "4-latest-results/post-processing/summarize.py" \
89-
"4-latest-results/${output_file#../../}" \
89+
"${output_file#../../}" \
9090
KEY="$KEY" \
9191
WORKLOAD="agentic" \
9292
NUM_USERS_WARMUP="$NUM_USERS_WARMUP" \

3-workloads/mooncake/mooncake-qa.py

Lines changed: 52 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -137,34 +137,41 @@ async def _async_launch_request(self, messages, max_tokens, extra_headers=None):
137137
start_time = time.time()
138138
first_token_time = None
139139
words = ""
140-
response = await self.client.chat.completions.create(
141-
messages=messages,
142-
model=self.model,
143-
temperature=0,
144-
stream=True,
145-
max_tokens=max_tokens,
146-
stream_options={"include_usage": True},
147-
extra_headers=extra_headers,
148-
)
149-
async for tok in response:
150-
if not tok.choices:
151-
continue
152-
chunk_message = tok.choices[0].delta.content
153-
if chunk_message is not None:
154-
if first_token_time is None and chunk_message != "":
155-
first_token_time = time.time()
156-
words += chunk_message
157-
tokens_out = tok.usage.completion_tokens
158-
tokens_prefill = tok.usage.prompt_tokens
159-
return Response(
160-
body=words,
161-
ttft=first_token_time - start_time,
162-
generation_time=time.time() - first_token_time,
163-
prompt_tokens=tokens_prefill,
164-
generation_tokens=tokens_out,
165-
launch_time=start_time,
166-
finish_time=time.time(),
167-
)
140+
try:
141+
response = await self.client.chat.completions.create(
142+
messages=messages,
143+
model=self.model,
144+
temperature=0,
145+
stream=True,
146+
max_tokens=max_tokens,
147+
stream_options={"include_usage": True},
148+
extra_headers=extra_headers,
149+
)
150+
async for tok in response:
151+
if not tok.choices:
152+
continue
153+
chunk_message = tok.choices[0].delta.content
154+
if chunk_message is not None:
155+
if first_token_time is None and chunk_message != "":
156+
first_token_time = time.time()
157+
words += chunk_message
158+
tokens_out = tok.usage.completion_tokens
159+
tokens_prefill = tok.usage.prompt_tokens
160+
return Response(
161+
body=words,
162+
ttft=first_token_time - start_time if first_token_time is not None else 0,
163+
generation_time=time.time() - first_token_time if first_token_time is not None else 0,
164+
prompt_tokens=tokens_prefill,
165+
generation_tokens=tokens_out,
166+
launch_time=start_time,
167+
finish_time=time.time(),
168+
)
169+
except openai.BadRequestError as e:
170+
logger.warning(f"BadRequestError: {e}")
171+
return None
172+
except Exception as e:
173+
logger.error(f"Unexpected error in _async_launch_request: {e}")
174+
return None
168175

169176
def launch_request(
170177
self,
@@ -177,11 +184,19 @@ def launch_request(
177184
finish_callback: Callable[[Response], None]
178185
"""
179186
messages = chat_history.get_messages_for_openai()
180-
real_callback = lambda x: finish_callback(x.result())
187+
def safe_callback(future):
188+
try:
189+
result = future.result()
190+
# Pass the result to the callback even if it's None
191+
# The callback will handle the None case
192+
finish_callback(result)
193+
except Exception as e:
194+
logger.error(f"Error in callback: {e}")
195+
181196
future = asyncio.run_coroutine_threadsafe(
182197
self._async_launch_request(messages, max_tokens, extra_headers), self.loop
183198
)
184-
future.add_done_callback(real_callback)
199+
future.add_done_callback(safe_callback)
185200

186201

187202
class UserSession:
@@ -256,7 +271,13 @@ def _launch_new_request(self, timestamp: float, request_executor: RequestExecuto
256271
self.has_unfinished_request = True
257272
self.last_request_time = timestamp
258273

259-
def _on_request_finished(self, response: Response):
274+
def _on_request_finished(self, response: Optional[Response]):
275+
if response is None:
276+
logger.warning(f"User {self.user_config.user_id} request failed (likely context length exceeded)")
277+
self.has_unfinished_request = False
278+
self.finished = True # Mark session as finished when request fails
279+
return
280+
260281
self.chat_history.on_system_response(response.body)
261282
self.has_unfinished_request = False
262283
logger.debug(

3-workloads/mooncake/run_mooncake.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ for qps in "${QPS_VALUES[@]}"; do
5656
# Change to project root before running summarize.py
5757
cd "$PROJECT_ROOT"
5858
python3 "4-latest-results/post-processing/summarize.py" \
59-
"4-latest-results/${output_file#../../}" \
59+
"${output_file#../../}" \
6060
KEY="$KEY" \
6161
WORKLOAD="mooncake" \
6262
NUM_ROUNDS="$NUM_ROUNDS" \

3-workloads/sharegpt/workload_execution/run-sharegpt.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ for qps in "${QPS_VALUES[@]}"; do
6767
# Change to project root before running summarize.py
6868
cd "$PROJECT_ROOT"
6969
python3 "4-latest-results/post-processing/summarize.py" \
70-
"4-latest-results/${output_file#../../../}" \
70+
"${output_file#../../../}" \
7171
KEY="$KEY" \
7272
WORKLOAD="sharegpt" \
7373
LIMIT="$LIMIT" \

0 commit comments

Comments
 (0)