2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
100644 → 100755
@@ -66,7 +66,7 @@ repos:
additional_dependencies:
- tomli
# add ignore words list
args: ["-L", "Mor,ans,thirdparty"]
args: ["-L", "Mor,ans,thirdparty", "--skip", "security_scanning/*"]
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.9.4
hooks:
7 changes: 6 additions & 1 deletion tensorrt_llm/executor/result.py
@@ -481,7 +481,12 @@ def _handle_response(self, response: "GenerationExecutor.Response"):
else:
beam_output.text = self.tokenizer.decode(
beam_output.token_ids, **kwargs)

# Update _last_token_ids_len after detokenization to prevent
# re-decoding the same tokens in subsequent responses when n > 1.
# Without this, outputs that are not updated in _handle_sequence would keep
# a stale _last_token_ids_len, causing token_ids_diff to return tokens that
# were already emitted.
beam_output._last_token_ids_len = len(beam_output.token_ids)

is_generating = not self._done
is_finished_with_stop_or_length = (
beam_output.finish_reason == 'stop'
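A minimal, hypothetical sketch of the bookkeeping this hunk fixes (simplified names; decode stands in for the tokenizer, and none of this is the actual TensorRT-LLM implementation):

class BeamOutput:
    def __init__(self):
        self.token_ids = []           # all tokens produced so far for this beam
        self._last_token_ids_len = 0  # how many tokens were already surfaced
        self.text = ""

    @property
    def token_ids_diff(self):
        # Only the tokens that arrived since the last report.
        return self.token_ids[self._last_token_ids_len:]

def handle_response(beam_outputs, new_tokens_per_beam, decode):
    # Append new tokens, detokenize, then advance the cursor for every beam.
    for beam, new_tokens in zip(beam_outputs, new_tokens_per_beam):
        beam.token_ids.extend(new_tokens)
        beam.text = decode(beam.token_ids)
        # Without this line, a beam that was not touched by _handle_sequence keeps
        # a stale cursor, and token_ids_diff re-reports tokens it already emitted.
        beam._last_token_ids_len = len(beam.token_ids)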
6 changes: 0 additions & 6 deletions tensorrt_llm/serve/chat_utils.py
@@ -197,9 +197,3 @@ def parse_chat_messages_coroutines(

return conversation, mm_data_tracker.retrieve_all_async(
), mm_placeholder_counts


def check_multiple_response(n: int, backend: Optional[str]):
if n > 1 and backend == "pytorch":
raise ValueError(
"Multiple response is not supported in PyTorch workflow")
6 changes: 1 addition & 5 deletions tensorrt_llm/serve/openai_server.py
@@ -31,8 +31,7 @@
from tensorrt_llm.llmapi.llm import RequestOutput
from tensorrt_llm.logger import logger
from tensorrt_llm.metrics.collector import MetricsCollector
from tensorrt_llm.serve.chat_utils import (check_multiple_response,
parse_chat_messages_coroutines)
from tensorrt_llm.serve.chat_utils import parse_chat_messages_coroutines
from tensorrt_llm.serve.metadata_server import create_metadata_server
from tensorrt_llm.serve.openai_protocol import (ChatCompletionRequest,
ChatCompletionResponse,
@@ -417,7 +416,6 @@ async def create_chat_response(
return chat_response

try:
check_multiple_response(request.n, self.llm.args.backend)
conversation: List[ConversationMessage] = []
tool_dicts = None if request.tools is None else [
tool.model_dump() for tool in request.tools
@@ -524,7 +522,6 @@ async def create_mm_embedding_response(promise: RequestOutput):
)

try:
check_multiple_response(request.n, self.llm.args.backend)
conversation: List[ConversationMessage] = []
tool_dicts = None if request.tools is None else [
tool.model_dump() for tool in request.tools
@@ -651,7 +648,6 @@ async def generator_wrapper(generator: AsyncIterator[Any]):
yield "data: [DONE]\n\n"

try:
check_multiple_response(request.n, self.llm.args.backend)
if isinstance(request.prompt, str) or \
(isinstance(request.prompt, list) and isinstance(request.prompt[0], int)):
prompts = [request.prompt]
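With check_multiple_response removed, a chat request with n > 1 is now forwarded to the backend instead of being rejected up front. A hedged usage sketch against a locally served model (the URL, API key, and model name are placeholders, not values from this PR):

import openai

client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
chat = client.chat.completions.create(
    model="<served-model-name>",
    messages=[{"role": "user", "content": "what is 1+1?"}],
    max_completion_tokens=10,
    n=2,              # would previously have been rejected by the removed guard
    temperature=0.8,  # sampling, so the two choices can differ
)
assert len(chat.choices) == 2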
78 changes: 77 additions & 1 deletion tests/unittest/llmapi/apps/_test_openai_chat.py
@@ -68,6 +68,8 @@ def server(model_name: str, backend: str, extra_llm_api_options: bool,
temp_extra_llm_api_options_file: str, num_postprocess_workers: int):
model_path = get_model_path(model_name)
args = ["--backend", f"{backend}"]
args.extend(["--kv_cache_free_gpu_memory_fraction",
"0.2"]) # for co-existence with other servers
if backend == "trt":
args.extend(["--max_beam_width", "4"])
if extra_llm_api_options:
@@ -78,11 +80,34 @@
yield remote_server


@pytest.fixture(scope="module")
def server_with_beam_search(model_name: str, backend: str,
extra_llm_api_options: bool,
temp_extra_llm_api_options_file: str,
num_postprocess_workers: int):
model_path = get_model_path(model_name)
args = ["--backend", f"{backend}"]
args.extend(["--kv_cache_free_gpu_memory_fraction",
"0.2"]) # for co-existence with other servers
args.extend(["--max_beam_width", "2"])
if extra_llm_api_options:
args.extend(
["--extra_llm_api_options", temp_extra_llm_api_options_file])
args.extend(["--num_postprocess_workers", f"{num_postprocess_workers}"])
with RemoteOpenAIServer(model_path, args) as remote_server:
yield remote_server


@pytest.fixture(scope="module")
def client(server: RemoteOpenAIServer):
return server.get_client()


@pytest.fixture(scope="module")
def client_with_beam_search(server_with_beam_search: RemoteOpenAIServer):
return server_with_beam_search.get_client()


@pytest.fixture(scope="module")
def async_client(server: RemoteOpenAIServer):
return server.get_async_client()
@@ -180,7 +205,33 @@ def test_multiple_responses(client: openai.OpenAI, model_name: str,
backend: str):
if backend == "pytorch":
pytest.skip(
"Multiple responses are not supported in PyTorch backend yet")
"'n' not allowed with temperature=0 unless TLLM_ALLOW_N_GREEDY_DECODING=1"
)
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role": "user",
"content": "what is 1+1?"
}]
# test n and best_of
chat_completion = client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
n=2,
temperature=0.0,
extra_body=dict(best_of=4),
)
assert len(chat_completion.choices) == 2


def test_multiple_responses_and_beam_search(client: openai.OpenAI,
model_name: str, backend: str):
if backend == "pytorch":
pytest.skip(
"Mixing beam search and regular requests is not supported in PyTorch backend"
)

messages = [{
"role": "system",
@@ -202,6 +253,7 @@ def test_multiple_responses(client: openai.OpenAI, model_name: str,
assert chat_completion.choices[
0].message.content != chat_completion.choices[
1].message.content, "beam search should be different"

# test n and best_of
chat_completion = client.chat.completions.create(
model=model_name,
@@ -214,6 +266,30 @@ def test_multiple_responses(client: openai.OpenAI, model_name: str,
assert len(chat_completion.choices) == 2


def test_multiple_responses_with_beam_search(
client_with_beam_search: openai.OpenAI, model_name: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role": "user",
"content": "what is 1+1?"
}]
# test beam search
chat_completion = client_with_beam_search.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
n=2,
temperature=0.0,
extra_body=dict(use_beam_search=True),
)
assert len(chat_completion.choices) == 2
assert chat_completion.choices[
0].message.content != chat_completion.choices[
1].message.content, "beam search should be different"


@pytest.mark.asyncio(loop_scope="module")
async def test_chat_streaming(async_client: openai.AsyncOpenAI,
model_name: str):
77 changes: 70 additions & 7 deletions tests/unittest/llmapi/apps/_test_openai_completions.py
@@ -1,10 +1,12 @@
# Adapted from
# https://github.com/vllm-project/vllm/blob/aae6927be06dedbda39c6b0c30f6aa3242b84388/tests/entrypoints/openai/test_completion.py

import itertools
from typing import List

import openai
import pytest
from utils.util import similar

from ..test_llm import get_model_path
from .openai_server import RemoteOpenAIServer
@@ -33,8 +35,21 @@ def num_postprocess_workers(request):
def server(model_name: str, backend: str, num_postprocess_workers: int):
model_path = get_model_path(model_name)
args = ["--backend", f"{backend}"]
if backend == "trt":
args.extend(["--max_beam_width", "4"])
args.extend(["--kv_cache_free_gpu_memory_fraction",
"0.2"]) # for co-existence with other servers
args.extend(["--num_postprocess_workers", f"{num_postprocess_workers}"])
with RemoteOpenAIServer(model_path, args) as remote_server:
yield remote_server


@pytest.fixture(scope="module")
def server_with_beam_search(model_name: str, backend: str,
num_postprocess_workers: int):
model_path = get_model_path(model_name)
args = ["--backend", f"{backend}"]
args.extend(["--kv_cache_free_gpu_memory_fraction",
"0.2"]) # for co-existence with other servers
args.extend(["--max_beam_width", "2"])
args.extend(["--num_postprocess_workers", f"{num_postprocess_workers}"])
with RemoteOpenAIServer(model_path, args) as remote_server:
yield remote_server
@@ -50,6 +65,11 @@ def async_client(server: RemoteOpenAIServer):
return server.get_async_client()


@pytest.fixture(scope="module")
def async_client_with_beam_search(server_with_beam_search: RemoteOpenAIServer):
return server_with_beam_search.get_async_client()


def test_single_completion(client: openai.OpenAI, model_name):
completion = client.completions.create(
model=model_name,
@@ -146,12 +166,10 @@ async def test_batch_completions(async_client: openai.AsyncOpenAI, model_name,
@pytest.mark.asyncio(loop_scope="module")
@pytest.mark.parametrize("prompts",
[["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2])
async def test_batch_completions_beam_search(async_client: openai.AsyncOpenAI,
model_name, prompts, backend):
async def test_batch_completions_beam_search(
async_client_with_beam_search: openai.AsyncOpenAI, model_name, prompts):
# test beam search
if backend == 'pytorch':
pytest.skip("Beam search is not supported in PyTorch backend yet")
batch = await async_client.completions.create(
batch = await async_client_with_beam_search.completions.create(
model=model_name,
prompt=prompts,
n=2,
@@ -189,6 +207,51 @@ async def test_batch_completions_streaming(async_client: openai.AsyncOpenAI,
assert texts[0] == texts[1]


@pytest.mark.asyncio(loop_scope="module")
@pytest.mark.parametrize("prompts", [["Hello, my name is"] * 2])
async def test_batch_completions_with_option_n_streaming(
async_client: openai.AsyncOpenAI, model_name, prompts):
# Use non-stream single generation as reference
completion_ref = await async_client.completions.create(
model=model_name,
prompt=prompts[0],
max_tokens=5,
temperature=0.0001,
)

text_ref = completion_ref.choices[0].text

# test n > 1 completions with streaming
batch = await async_client.completions.create(
model=model_name,
prompt=prompts,
n=3, # number of completions to generate for each prompt.
max_tokens=5,
temperature=0.0001,
stream=True,
)
texts = [""] * 6 # 2 prompts × 3 generations per prompt = 6 choices
async for chunk in batch:
assert len(chunk.choices) == 1
choice = chunk.choices[0]
texts[choice.index] += choice.text

assert "" not in texts # Assert all the generations are not empty

# Check all pairs within first request are consistent
for i, j in itertools.combinations(texts[:3], 2):
assert similar(i, j, threshold=0.8)

# Check all pairs within second request are consistent
for i, j in itertools.combinations(texts[3:], 2):
assert similar(i, j, threshold=0.8)

# Check all generations are consistent with the reference
for text in texts:
assert similar(text, text_ref, threshold=0.8)


@pytest.mark.asyncio(loop_scope="module")
async def test_completion_stream_options(async_client: openai.AsyncOpenAI,
model_name: str):