Skip to content

Commit a2f908b

Browse files
committed
removing HF token
1 parent 276ad1f commit a2f908b

File tree

8 files changed

+219
-95
lines changed

8 files changed

+219
-95
lines changed
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# End Goal:
2+
3+
## To be compatible with `1-infrastructure/`
4+
You must deploy your inference setup to a Kubernetes cluster.
5+
6+
## To allow `3-workloads/` to run workloads
7+
Run a server that answers queries at localhost:30080/v1/completions/
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# SGLang inference server: single-replica Deployment plus a LoadBalancer
# Service exposing the OpenAI-compatible HTTP API on port 30000.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: sglang-deployment
spec:
  replicas: 1
  strategy:
    # Recreate (not RollingUpdate): the single GPU cannot host two pods at
    # once, so the old pod must be torn down before the new one starts.
    type: Recreate
  selector:
    matchLabels:
      app: sglang-server
  template:
    metadata:
      labels:
        app: sglang-server
        engine: sglang
    spec:
      restartPolicy: Always
      containers:
        - name: sglang-container
          image: docker.io/lmsysorg/sglang:latest
          imagePullPolicy: Always # IfNotPresent or Never
          ports:
            - containerPort: 30000
          command: ["python3", "-m", "sglang.launch_server"]
          args: ["--model-path", "MODEL_URL_PLACEHOLDER", "--host", "0.0.0.0", "--port", "30000"]
          env:
            # Pull the Hugging Face token from a Secret rather than
            # hard-coding it in this manifest. Create it with:
            #   kubectl create secret generic hf-token-secret \
            #     --from-literal=token=<YOUR_HF_TOKEN>
            # `optional: true` keeps the pod schedulable without the Secret,
            # which is sufficient for public (non-gated) models.
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token-secret
                  key: token
                  optional: true
            # NOTE(review): TRANSFORMERS_CACHE is deprecated in recent
            # transformers releases in favor of HF_HOME — kept as-is for
            # compatibility with the transformers version baked into the
            # image; confirm before switching.
            - name: TRANSFORMERS_CACHE
              value: /huggingface-cache
          resources:
            requests:
              memory: "16Gi"
              nvidia.com/gpu: 1
            limits:
              nvidia.com/gpu: 1
          volumeMounts:
            # SGLang uses shared memory for inter-process tensor transfer;
            # the default 64Mi /dev/shm is far too small.
            - name: shm
              mountPath: /dev/shm
            - name: hf-cache
              mountPath: /huggingface-cache
            - name: localtime
              mountPath: /etc/localtime
              readOnly: true
          livenessProbe:
            httpGet:
              path: /health
              port: 30000
            # Generous initial delay: model download + weight loading can
            # take a while on first start.
            initialDelaySeconds: 30
            periodSeconds: 10
      volumes:
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 10Gi
        # Model weights cache; emptyDir, so weights are re-downloaded when
        # the pod is rescheduled.
        - name: hf-cache
          emptyDir:
            sizeLimit: 20Gi
        # Sync container clock display with the host timezone.
        - name: localtime
          hostPath:
            path: /etc/localtime
            type: File
---
apiVersion: v1
kind: Service
metadata:
  name: sglang-service
spec:
  selector:
    app: sglang-server
  ports:
    - protocol: TCP
      port: 30000       # port exposed by the Service
      targetPort: 30000 # containerPort inside the pod
  type: LoadBalancer

2-serving-engines/sglang/run-sglang.sh

Whitespace-only changes.

3-workloads/agentic/agentic-qa.py

Lines changed: 81 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -144,68 +144,75 @@ async def _async_launch_request(self, messages: List[Dict[str, str]], max_token
144144
start_time = time.time()
145145
first_token_time = None
146146

147-
# Make the request
148-
response = await self.client.chat.completions.create(
149-
model=model,
150-
messages=messages,
151-
stream=True,
152-
max_tokens=max_tokens,
153-
temperature=0.0,
154-
stream_options={"include_usage": True},
155-
extra_headers=extra_headers,
156-
)
147+
try:
148+
# Make the request
149+
response = await self.client.chat.completions.create(
150+
model=model,
151+
messages=messages,
152+
stream=True,
153+
max_tokens=max_tokens,
154+
temperature=0.0,
155+
stream_options={"include_usage": True},
156+
extra_headers=extra_headers,
157+
)
158+
159+
# Process the streaming response
160+
async for chunk in response:
161+
if not chunk.choices:
162+
continue
157163

158-
# Process the streaming response
159-
async for chunk in response:
160-
if not chunk.choices:
161-
continue
162-
163-
# Handle content
164-
if chunk.choices[0].delta.content is not None:
165-
if first_token_time is None and chunk.choices[0].delta.content != "":
166-
first_token_time = time.time()
167-
words += chunk.choices[0].delta.content
168-
169-
# Handle token counts if available
170-
if hasattr(chunk, 'usage') and chunk.usage is not None:
171-
tokens_out = chunk.usage.completion_tokens
172-
tokens_prefill = chunk.usage.prompt_tokens
173-
174-
# If we didn't get token counts from streaming, try to get them from the final response
175-
if tokens_out == 0 or tokens_prefill == 0:
176-
print("No token counts from streaming, getting final response")
177-
print(f"{tokens_out}, {tokens_prefill}")
178-
try:
179-
final_response = await self.client.chat.completions.create(
180-
model=model,
181-
messages=messages,
182-
stream=False,
183-
)
184-
if hasattr(final_response, 'usage') and final_response.usage is not None:
185-
tokens_out = final_response.usage.completion_tokens
186-
tokens_prefill = final_response.usage.prompt_tokens
187-
except Exception as e:
188-
logging.warning(f"Failed to get token counts from final response: {e}")
189-
190-
# # Calculate timing metrics
191-
ttft = first_token_time - start_time if first_token_time else 0
192-
generation_time = time.time() - first_token_time if first_token_time else 0
193-
194-
return Response(
195-
body=words,
196-
ttft=ttft,
197-
generation_time=generation_time,
198-
prompt_tokens=tokens_prefill,
199-
generation_tokens=tokens_out,
200-
launch_time=start_time,
201-
finish_time=time.time(),
202-
agentID=agentID,
203-
)
164+
# Handle content
165+
if chunk.choices[0].delta.content is not None:
166+
if first_token_time is None and chunk.choices[0].delta.content != "":
167+
first_token_time = time.time()
168+
words += chunk.choices[0].delta.content
169+
170+
# Handle token counts if available
171+
if hasattr(chunk, 'usage') and chunk.usage is not None:
172+
tokens_out = chunk.usage.completion_tokens
173+
tokens_prefill = chunk.usage.prompt_tokens
174+
175+
# If we didn't get token counts from streaming, try to get them from the final response
176+
if tokens_out == 0 or tokens_prefill == 0:
177+
print("No token counts from streaming, getting final response")
178+
print(f"{tokens_out}, {tokens_prefill}")
179+
try:
180+
final_response = await self.client.chat.completions.create(
181+
model=model,
182+
messages=messages,
183+
stream=False,
184+
)
185+
if hasattr(final_response, 'usage') and final_response.usage is not None:
186+
tokens_out = final_response.usage.completion_tokens
187+
tokens_prefill = final_response.usage.prompt_tokens
188+
except Exception as e:
189+
logging.warning(f"Failed to get token counts from final response: {e}")
190+
191+
# # Calculate timing metrics
192+
ttft = first_token_time - start_time if first_token_time else 0
193+
generation_time = time.time() - first_token_time if first_token_time else 0
194+
195+
return Response(
196+
body=words,
197+
ttft=ttft,
198+
generation_time=generation_time,
199+
prompt_tokens=tokens_prefill,
200+
generation_tokens=tokens_out,
201+
launch_time=start_time,
202+
finish_time=time.time(),
203+
agentID=agentID,
204+
)
205+
except openai.BadRequestError as e:
206+
logging.warning(f"BadRequestError with model {model}: {e}")
207+
return None
208+
except Exception as e:
209+
logging.error(f"Error during request to model {model}: {e}")
210+
return None
204211

205212
except Exception as e:
206213
logging.error(f"Error in _async_launch_request: {str(e)}")
207214
logging.error(f"Request details - model: {model}, messages: {messages}")
208-
raise
215+
return None
209216

210217
def launch_request(
211218
self,
@@ -218,11 +225,18 @@ def launch_request(
218225
"""
219226
finish_callback: Callable[[Response, int], None]
220227
"""
221-
real_callback = lambda x: finish_callback(x.result(), agentID)
228+
def safe_callback(future):
229+
try:
230+
result = future.result()
231+
# The callback will handle the None case
232+
finish_callback(result, agentID)
233+
except Exception as e:
234+
logger.error(f"Error in callback: {e}")
235+
222236
future = asyncio.run_coroutine_threadsafe(
223237
self._async_launch_request(messages, max_tokens, agentID, extra_headers), self.loop
224238
)
225-
future.add_done_callback(real_callback)
239+
future.add_done_callback(safe_callback)
226240

227241

228242
class UserSession:
@@ -309,7 +323,13 @@ def _launch_new_request(self, timestamp: float, request_executor: RequestExecuto
309323
self.has_unfinished_request = True
310324
self.last_request_time = timestamp
311325

312-
def _on_request_finished(self, response: Response, agentID: int):
326+
def _on_request_finished(self, response: Optional[Response], agentID: int):
327+
if response is None:
328+
logger.warning(f"User {self.user_config.user_id} request failed (likely context length exceeded)")
329+
self.has_unfinished_request = False
330+
self.finished = True # Mark session as finished when request fails
331+
return
332+
313333
if self.user_config.whole_history:
314334
self.chat_history.on_system_response_whole(response.body, agentID)
315335
else:
@@ -666,7 +686,7 @@ def main():
666686
f"When --trace-file is omitted, you MUST supply: {', '.join(missing)}"
667687
)
668688

669-
# From here on you know youre in exactly one mode:
689+
# From here on you know you're in exactly one mode:
670690
if args.trace_file:
671691
print("Running in trace‑mode, loading:", args.trace_file)
672692
else:

3-workloads/agentic/run_agentic.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ for interval in "${NEW_USER_INTERVALS[@]}"; do
8686
# Change to project root before running summarize.py
8787
cd "$PROJECT_ROOT"
8888
python3 "4-latest-results/post-processing/summarize.py" \
89-
"4-latest-results/${output_file#../../}" \
89+
"${output_file#../../}" \
9090
KEY="$KEY" \
9191
WORKLOAD="agentic" \
9292
NUM_USERS_WARMUP="$NUM_USERS_WARMUP" \

3-workloads/mooncake/mooncake-qa.py

Lines changed: 52 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -137,34 +137,41 @@ async def _async_launch_request(self, messages, max_tokens, extra_headers=None):
137137
start_time = time.time()
138138
first_token_time = None
139139
words = ""
140-
response = await self.client.chat.completions.create(
141-
messages=messages,
142-
model=self.model,
143-
temperature=0,
144-
stream=True,
145-
max_tokens=max_tokens,
146-
stream_options={"include_usage": True},
147-
extra_headers=extra_headers,
148-
)
149-
async for tok in response:
150-
if not tok.choices:
151-
continue
152-
chunk_message = tok.choices[0].delta.content
153-
if chunk_message is not None:
154-
if first_token_time is None and chunk_message != "":
155-
first_token_time = time.time()
156-
words += chunk_message
157-
tokens_out = tok.usage.completion_tokens
158-
tokens_prefill = tok.usage.prompt_tokens
159-
return Response(
160-
body=words,
161-
ttft=first_token_time - start_time,
162-
generation_time=time.time() - first_token_time,
163-
prompt_tokens=tokens_prefill,
164-
generation_tokens=tokens_out,
165-
launch_time=start_time,
166-
finish_time=time.time(),
167-
)
140+
try:
141+
response = await self.client.chat.completions.create(
142+
messages=messages,
143+
model=self.model,
144+
temperature=0,
145+
stream=True,
146+
max_tokens=max_tokens,
147+
stream_options={"include_usage": True},
148+
extra_headers=extra_headers,
149+
)
150+
async for tok in response:
151+
if not tok.choices:
152+
continue
153+
chunk_message = tok.choices[0].delta.content
154+
if chunk_message is not None:
155+
if first_token_time is None and chunk_message != "":
156+
first_token_time = time.time()
157+
words += chunk_message
158+
tokens_out = tok.usage.completion_tokens
159+
tokens_prefill = tok.usage.prompt_tokens
160+
return Response(
161+
body=words,
162+
ttft=first_token_time - start_time if first_token_time is not None else 0,
163+
generation_time=time.time() - first_token_time if first_token_time is not None else 0,
164+
prompt_tokens=tokens_prefill,
165+
generation_tokens=tokens_out,
166+
launch_time=start_time,
167+
finish_time=time.time(),
168+
)
169+
except openai.BadRequestError as e:
170+
logger.warning(f"BadRequestError: {e}")
171+
return None
172+
except Exception as e:
173+
logger.error(f"Unexpected error in _async_launch_request: {e}")
174+
return None
168175

169176
def launch_request(
170177
self,
@@ -177,11 +184,19 @@ def launch_request(
177184
finish_callback: Callable[[Response], None]
178185
"""
179186
messages = chat_history.get_messages_for_openai()
180-
real_callback = lambda x: finish_callback(x.result())
187+
def safe_callback(future):
188+
try:
189+
result = future.result()
190+
# Pass the result to the callback even if it's None
191+
# The callback will handle the None case
192+
finish_callback(result)
193+
except Exception as e:
194+
logger.error(f"Error in callback: {e}")
195+
181196
future = asyncio.run_coroutine_threadsafe(
182197
self._async_launch_request(messages, max_tokens, extra_headers), self.loop
183198
)
184-
future.add_done_callback(real_callback)
199+
future.add_done_callback(safe_callback)
185200

186201

187202
class UserSession:
@@ -256,7 +271,13 @@ def _launch_new_request(self, timestamp: float, request_executor: RequestExecuto
256271
self.has_unfinished_request = True
257272
self.last_request_time = timestamp
258273

259-
def _on_request_finished(self, response: Response):
274+
def _on_request_finished(self, response: Optional[Response]):
275+
if response is None:
276+
logger.warning(f"User {self.user_config.user_id} request failed (likely context length exceeded)")
277+
self.has_unfinished_request = False
278+
self.finished = True # Mark session as finished when request fails
279+
return
280+
260281
self.chat_history.on_system_response(response.body)
261282
self.has_unfinished_request = False
262283
logger.debug(

3-workloads/mooncake/run_mooncake.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ for qps in "${QPS_VALUES[@]}"; do
5656
# Change to project root before running summarize.py
5757
cd "$PROJECT_ROOT"
5858
python3 "4-latest-results/post-processing/summarize.py" \
59-
"4-latest-results/${output_file#../../}" \
59+
"${output_file#../../}" \
6060
KEY="$KEY" \
6161
WORKLOAD="mooncake" \
6262
NUM_ROUNDS="$NUM_ROUNDS" \

3-workloads/sharegpt/workload_execution/run-sharegpt.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ for qps in "${QPS_VALUES[@]}"; do
6767
# Change to project root before running summarize.py
6868
cd "$PROJECT_ROOT"
6969
python3 "4-latest-results/post-processing/summarize.py" \
70-
"4-latest-results/${output_file#../../../}" \
70+
"${output_file#../../../}" \
7171
KEY="$KEY" \
7272
WORKLOAD="sharegpt" \
7373
LIMIT="$LIMIT" \

0 commit comments

Comments
 (0)