Skip to content

Commit 672374f

Browse files
committed
Fix SGLang adapter usage, run tests manually on local SGLang server
1 parent ad77838 commit 672374f

File tree

3 files changed

+29
-24
lines changed

src/inference_endpoint/endpoint_client/worker.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ def add_chunk(self, delta: SGLangSSEDelta) -> StreamChunk | None:
187187
if not isinstance(delta, SGLangSSEDelta):
188188
return None
189189

190-
if delta.total_tokens == self.total_tokens:
190+
if delta.total_completion_tokens == self.total_tokens:
191191
return None
192192

193193
# In SGLang /generate, the .text field is the total accumulated text, not
@@ -196,8 +196,8 @@ def add_chunk(self, delta: SGLangSSEDelta) -> StreamChunk | None:
196196
if (start_idx := len(delta.text)) > len(self.text):
197197
content_diff = delta.text[start_idx:]
198198
self.text = delta.text
199-
self.token_ids.extend(delta.token_ids)
200-
self.total_tokens = delta.total_tokens
199+
self.token_ids.extend(delta.token_delta)
200+
self.total_tokens = delta.total_completion_tokens
201201
if delta.has_retractions:
202202
# For now, we won't be handling retractions if they occur, but we will
203203
# report it as part of the metadata if it does happen.
@@ -228,7 +228,7 @@ def get_final_output(self) -> QueryResult:
228228
"final_chunk": True,
229229
"retraction_occurred": self.retraction_occurred,
230230
"n_tokens": self.total_tokens,
231-
"output_tokens": self.token_ids,
231+
"token_ids": self.token_ids,
232232
},
233233
)
234234

src/inference_endpoint/sglang/types.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ class SamplingParams(msgspec.Struct, kw_only=True, omit_defaults=True):
4343
class SGLangGenerateRequest(msgspec.Struct, kw_only=True, omit_defaults=True):
4444
input_ids: list[int]
4545
sampling_params: SamplingParams
46-
stream: bool = True
46+
stream: bool
4747

4848

4949
class MetaInfo(msgspec.Struct, kw_only=True, omit_defaults=True):
@@ -65,6 +65,6 @@ class SGLangGenerateResponse(msgspec.Struct, kw_only=True, omit_defaults=True):
6565

6666
class SGLangSSEDelta(msgspec.Struct):
6767
text: str = ""
68-
token_delta: int = 0
68+
token_delta: list[int] = msgspec.field(default_factory=list)
6969
total_completion_tokens: int = 0
7070
has_retractions: bool = False

tests/integration/endpoint_client/test_sglang_adapter.py

Lines changed: 23 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@
1616
"""Integration tests for SGLang adapter with real GPT-OSS server.
1717
1818
This test assumes a server running GPT-OSS is available at localhost:30000.
19-
To start a server, use:
20-
python3 -m sglang.launch_server --model-path <model> --host 0.0.0.0 --port 30000
2119
"""
2220

2321
import asyncio
@@ -98,6 +96,7 @@ class TestSGLangAdapterIntegration:
9896
"""Integration tests for SGLang adapter with real GPT-OSS server."""
9997

10098
@pytest.mark.asyncio
99+
@pytest.mark.run_explicitly
101100
@pytest.mark.integration
102101
async def test_sglang_non_streaming_request(self, sglang_futures_client):
103102
"""Test non-streaming request through SGLang adapter.
@@ -114,6 +113,10 @@ async def test_sglang_non_streaming_request(self, sglang_futures_client):
114113
"input_tokens": input_tokens,
115114
"stream": False,
116115
},
116+
headers={
117+
"Content-Type": "application/json",
118+
"Accept": "application/json",
119+
},
117120
)
118121

119122
future = sglang_futures_client.issue_query(query)
@@ -126,19 +129,14 @@ async def test_sglang_non_streaming_request(self, sglang_futures_client):
126129
assert len(result.response_output) > 0
127130

128131
# Verify metadata
129-
assert "metadata" in dir(result)
130132
assert result.metadata is not None
131133
assert "token_ids" in result.metadata
132134
assert "n_tokens" in result.metadata
133135
assert isinstance(result.metadata["token_ids"], list)
134136
assert isinstance(result.metadata["n_tokens"], int)
135137

136-
print(
137-
f"\nNon-streaming response: {result.response_output[:100]}..."
138-
) # Print first 100 chars
139-
print(f"Token count: {result.metadata['n_tokens']}")
140-
141138
@pytest.mark.asyncio
139+
@pytest.mark.run_explicitly
142140
@pytest.mark.integration
143141
async def test_sglang_streaming_request(self, sglang_futures_client):
144142
"""Test streaming request through SGLang adapter.
@@ -157,6 +155,10 @@ async def test_sglang_streaming_request(self, sglang_futures_client):
157155
"temperature": 0.8,
158156
"stream": True,
159157
},
158+
headers={
159+
"Content-Type": "application/json",
160+
"Accept": "text/event-stream",
161+
},
160162
)
161163

162164
future = sglang_futures_client.issue_query(query)
@@ -167,15 +169,18 @@ async def test_sglang_streaming_request(self, sglang_futures_client):
167169
assert "response_output" in dir(result)
168170
assert result.response_output is not None
169171

170-
# In streaming mode, response_output should contain accumulated output
171-
assert "output" in result.response_output
172-
output_chunks = result.response_output["output"]
173-
assert isinstance(output_chunks, list)
174-
assert len(output_chunks) > 0
172+
assert result.metadata is not None
173+
assert "token_ids" in result.metadata
174+
assert "n_tokens" in result.metadata
175+
assert isinstance(result.metadata["token_ids"], list)
176+
assert isinstance(result.metadata["n_tokens"], int)
175177

176-
# Reconstruct full text
177-
full_text = "".join(output_chunks)
178-
assert len(full_text) > 0
178+
# Check that something was generated, but no more than max_new_tokens
179+
assert 0 < result.metadata["n_tokens"] and result.metadata["n_tokens"] <= 100
179180

180-
print(f"\nStreaming response: {full_text[:100]}...") # Print first 100 chars
181-
print(f"Number of chunks: {len(output_chunks)}")
181+
# The token IDs in the result should be at most n_tokens because of retractions
182+
if result.metadata["retraction_occurred"]:
183+
assert len(result.metadata["token_ids"]) <= result.metadata["n_tokens"]
184+
else:
185+
# STOP token is not included in the response, but counts towards generated
186+
assert len(result.metadata["token_ids"]) + 1 == result.metadata["n_tokens"]

0 commit comments

Comments (0)