Commit a7a754b

add stream_options for openai api (#1046)
1 parent 1df7044 commit a7a754b

File tree

3 files changed, +49 -2 lines changed

lightllm/server/api_models.py

Lines changed: 8 additions & 0 deletions
@@ -48,6 +48,10 @@ class ToolChoice(BaseModel):
     type: Literal["function"] = Field(default="function", examples=["function"])


+class StreamOptions(BaseModel):
+    include_usage: Optional[bool] = False
+
+
 class CompletionRequest(BaseModel):
     model: str
     # prompt: string or tokens
@@ -58,6 +62,7 @@ class CompletionRequest(BaseModel):
     top_p: Optional[float] = 1.0
     n: Optional[int] = 1
     stream: Optional[bool] = False
+    stream_options: Optional[StreamOptions] = None
     logprobs: Optional[int] = None
     echo: Optional[bool] = False
     stop: Optional[Union[str, List[str]]] = None
@@ -82,6 +87,7 @@ class ChatCompletionRequest(BaseModel):
     top_p: Optional[float] = 1.0
     n: Optional[int] = 1
     stream: Optional[bool] = False
+    stream_options: Optional[StreamOptions] = None
     stop: Optional[Union[str, List[str]]] = None
     max_tokens: Optional[int] = 16
     presence_penalty: Optional[float] = 0.0
@@ -170,6 +176,7 @@ class ChatCompletionStreamResponse(BaseModel):
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
     choices: List[ChatCompletionStreamResponseChoice]
+    usage: Optional[UsageInfo] = None

     @field_validator("id", mode="before")
     def ensure_id_is_str(cls, v):
@@ -216,6 +223,7 @@ class CompletionStreamResponse(BaseModel):
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
     choices: List[CompletionStreamChoice]
+    usage: Optional[UsageInfo] = None

     @field_validator("id", mode="before")
     def ensure_id_is_str(cls, v):
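
Taken together, these model changes mirror the OpenAI streaming API: a request may set stream_options.include_usage alongside stream=true, and the stream-response models gain an optional usage field so a final chunk can report token counts. A minimal client-side sketch of how the new field is exercised, assuming an OpenAI-compatible lightllm server on http://localhost:8000 and the official openai Python package (the base URL, API key, and model name below are placeholders, not part of this commit):

from openai import OpenAI

# Placeholder endpoint and model; the API key is a dummy value.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

stream = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Hello"}],
    stream=True,
    stream_options={"include_usage": True},  # parsed into the new StreamOptions model
)

for chunk in stream:
    if chunk.choices:
        print(chunk.choices[0].delta.content or "", end="")
    elif chunk.usage is not None:  # final usage-only chunk enabled by this commit
        print(f"\nprompt={chunk.usage.prompt_tokens} completion={chunk.usage.completion_tokens}")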

lightllm/server/api_openai.py

Lines changed: 40 additions & 1 deletion
@@ -222,7 +222,11 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
         finish_reason = None
         from .req_id_generator import convert_sub_id_to_group_id

+        prompt_tokens = 0
+        completion_tokens = 0
         async for sub_req_id, request_output, metadata, finish_status in results_generator:
+            prompt_tokens = metadata["prompt_tokens"]
+            completion_tokens += 1
             if request.tool_choice != "none" and request.tools:
                 delta = request_output
             group_request_id = convert_sub_id_to_group_id(sub_req_id)
@@ -309,6 +313,22 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
                 choices=[stream_choice],
             )
             yield ("data: " + json.dumps(stream_resp.dict(), ensure_ascii=False) + "\n\n").encode("utf-8")
+        # Additional usage chunk
+
+        if request.stream_options and request.stream_options.include_usage:
+            usage = UsageInfo(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=prompt_tokens + completion_tokens,
+            )
+            usage_chunk = ChatCompletionStreamResponse(
+                id=group_request_id,
+                created=created_time,
+                choices=[],  # Empty choices array as per OpenAI spec
+                model=request.model,
+                usage=usage,
+            )
+            yield f"data: {usage_chunk.model_dump_json()}\n\n"

     background_tasks = BackgroundTasks()
     return StreamingResponse(stream_results(), media_type="text/event-stream", background=background_tasks)
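
The bookkeeping above is straightforward: prompt_tokens is taken from the per-token metadata (the same value on every event), completion_tokens is incremented once per streamed token, and a single usage-only chunk is appended after the generation loop when include_usage is set. A stripped-down sketch of that pattern, which the completion handler below repeats, using stand-in types rather than the server's real request/response objects:

import json
from typing import AsyncGenerator, Dict, Tuple

async def stream_with_usage(
    results: AsyncGenerator[Tuple[str, Dict], None],
    include_usage: bool,
) -> AsyncGenerator[str, None]:
    # Stand-in for stream_results(): `results` yields (token_text, metadata) pairs.
    prompt_tokens = 0
    completion_tokens = 0
    async for text, metadata in results:
        prompt_tokens = metadata["prompt_tokens"]  # constant for a given request
        completion_tokens += 1                     # one generated token per event
        yield f"data: {json.dumps({'delta': text})}\n\n"
    if include_usage:
        usage = {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens,
        }
        # Final chunk carries usage only, with an empty choices list.
        yield f"data: {json.dumps({'choices': [], 'usage': usage})}\n\n"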
@@ -453,9 +473,13 @@ async def _handle_streaming_completion(
     async def stream_results() -> AsyncGenerator[bytes, None]:
         from .req_id_generator import convert_sub_id_to_group_id

+        prompt_tokens = 0
+        completion_tokens = 0
+
         async for sub_req_id, request_output, metadata, finish_status in results_generator:
             group_request_id = convert_sub_id_to_group_id(sub_req_id)
-
+            prompt_tokens = metadata["prompt_tokens"]
+            completion_tokens += 1
             current_finish_reason = None
             if finish_status.is_finished():
                 current_finish_reason = finish_status.get_finish_reason()
@@ -483,6 +507,21 @@ async def stream_results() -> AsyncGenerator[bytes, None]:

         yield "data: [DONE]\n\n".encode("utf-8")

+        if request.stream_options and request.stream_options.include_usage:
+            usage = UsageInfo(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=prompt_tokens + completion_tokens,
+            )
+            usage_chunk = CompletionStreamResponse(
+                id=group_request_id,
+                created=created_time,
+                choices=[],  # Empty choices array as per OpenAI spec
+                model=request.model,
+                usage=usage,
+            )
+            yield f"data: {usage_chunk.model_dump_json()}\n\n"
+
     background_tasks = BackgroundTasks()
     return StreamingResponse(stream_results(), media_type="text/event-stream", background=background_tasks)
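
For consumers that read the SSE stream directly rather than through an SDK, the usage chunk is just another data: event with an empty choices list. A sketch of such a reader against the completions route, assuming the server exposes an OpenAI-style /v1/completions endpoint on localhost:8000 (path, port, and model name are assumptions, not taken from this commit):

import json
import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "default",
        "prompt": "Hello",
        "max_tokens": 32,
        "stream": True,
        "stream_options": {"include_usage": True},
    },
    stream=True,
)

for raw in resp.iter_lines():
    if not raw:
        continue
    line = raw.decode("utf-8")
    if not line.startswith("data: "):
        continue
    payload = line[len("data: "):]
    if payload == "[DONE]":
        continue  # keep reading: in this diff the usage chunk is emitted after [DONE]
    chunk = json.loads(payload)
    if chunk.get("usage"):
        print("\nusage:", chunk["usage"])
    elif chunk.get("choices"):
        print(chunk["choices"][0].get("text", ""), end="")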

lightllm/server/core/objs/out_token_circlequeue.py

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 import ctypes
 from typing import Tuple

-LIGHTLLM_TOKEN_MAX_BYTES = int(os.getenv("LIGHTLLM_TOKEN_MAX_BYTES", 696))
+LIGHTLLM_TOKEN_MAX_BYTES = int(os.getenv("LIGHTLLM_TOKEN_MAX_BYTES", 1280))
 LIGHTLLM_OUT_TOKEN_QUEUE_SIZE = int(os.getenv("LIGHTLLM_OUT_TOKEN_QUEUE_SIZE", 8))

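
LIGHTLLM_TOKEN_MAX_BYTES bounds the byte size of a single entry in the shared-memory output-token circular queue, and its default grows here from 696 to 1280. Both constants remain overridable through the environment; because they are read with os.getenv() at import time, an override has to be in place before the module is imported. An illustrative snippet (the override values are arbitrary examples):

import os

# Illustrative values only; set these before lightllm modules are imported,
# since the constants are evaluated at import time via os.getenv().
os.environ["LIGHTLLM_TOKEN_MAX_BYTES"] = "2048"
os.environ["LIGHTLLM_OUT_TOKEN_QUEUE_SIZE"] = "16"

from lightllm.server.core.objs.out_token_circlequeue import (
    LIGHTLLM_TOKEN_MAX_BYTES,
    LIGHTLLM_OUT_TOKEN_QUEUE_SIZE,
)

print(LIGHTLLM_TOKEN_MAX_BYTES, LIGHTLLM_OUT_TOKEN_QUEUE_SIZE)  # 2048 16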