3 changes: 2 additions & 1 deletion evalscope/config.py
@@ -195,9 +195,10 @@ def __init_default_generation_config(self):
            'temperature': 1.0,
            'n': 1,
        }
-   elif self.eval_type == EvalType.SERVICE:
+   elif self.eval_type in (EvalType.SERVICE, EvalType.BASE_MODEL):
        self.generation_config = {
            'temperature': 0.0,
+           'max_tokens': 512,
        }
Comment on lines +198 to 202
Contributor

Severity: high

This change introduces a default max_tokens: 512 for EvalType.SERVICE in addition to the new EvalType.BASE_MODEL. Previously, EvalType.SERVICE did not have a default max_tokens, so it would use the model's default. This is a behavioral change that could unexpectedly truncate outputs for existing users of EvalType.SERVICE. The pull request description states "Zero breaking changes to existing server/openai_api behavior", but this change seems to contradict that. To avoid breaking changes, you might want to apply this default only to EvalType.BASE_MODEL.

Suggested change
-   elif self.eval_type in (EvalType.SERVICE, EvalType.BASE_MODEL):
-       self.generation_config = {
-           'temperature': 0.0,
-           'max_tokens': 512,
-       }
+   elif self.eval_type == EvalType.SERVICE:
+       self.generation_config = {
+           'temperature': 0.0,
+       }
+   elif self.eval_type == EvalType.BASE_MODEL:
+       self.generation_config = {
+           'temperature': 0.0,
+           'max_tokens': 512,
+       }

    if isinstance(self.generation_config, dict):
        self.generation_config = GenerateConfig.model_validate(self.generation_config)
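
Regardless of how the reviewer's suggestion above lands, an existing openai_api user can pin the prior behavior by passing an explicit generation_config. A minimal sketch, assuming the usual TaskConfig/run_task entry points; the model name and endpoint below are placeholders:

from evalscope import TaskConfig, run_task

task = TaskConfig(
    model='my-model',                       # placeholder model name
    api_url='http://127.0.0.1:8000/v1',     # placeholder endpoint
    eval_type='openai_api',                 # EvalType.SERVICE
    datasets=['gsm8k'],
    generation_config={
        'temperature': 0.0,
        'max_tokens': 4096,                 # override the new 512 default explicitly
    },
)
run_task(task)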
1 change: 1 addition & 0 deletions evalscope/constants.py
@@ -71,6 +71,7 @@ class EvalType:
    MOCK_LLM = 'mock_llm'
    CHECKPOINT = 'llm_ckpt'  # native model checkpoint
    SERVICE = 'openai_api'  # model service
+   BASE_MODEL = 'base_model'  # base model service using completions endpoint
    TEXT2IMAGE = 'text2image'  # image generation service
    IMAGE_EDITING = 'image_editing'  # image editing service

7 changes: 7 additions & 0 deletions evalscope/models/model_apis.py
@@ -26,6 +26,13 @@ def server() -> type[ModelAPI]:
    return OpenAICompatibleAPI


@register_model_api(name='base_model')
def base_model() -> type[ModelAPI]:
    from .openai_compatible import OpenAIBaseModelAPI

    return OpenAIBaseModelAPI


@register_model_api(name='llm_ckpt')
def llm_ckpt() -> type[ModelAPI]:
    check_import('torch', package='torch', raise_error=True, feature_name='llm_ckpt')
57 changes: 56 additions & 1 deletion evalscope/models/openai_compatible.py
@@ -1,6 +1,7 @@
import os
from openai import APIStatusError, BadRequestError, OpenAI, PermissionDeniedError, UnprocessableEntityError
from openai._types import NOT_GIVEN
from openai.types import Completion
from openai.types.chat import ChatCompletion
from typing import Any, Dict, List, Optional, Tuple, Union

@@ -11,13 +12,17 @@
from evalscope.utils.argument_utils import get_supported_params
from .utils.openai import (
    chat_choices_from_openai,
    collect_completion_stream_response,
    collect_stream_response,
    completion_choices_from_openai,
    model_output_from_openai,
    model_output_from_openai_completion,
    openai_chat_messages,
    openai_chat_tool_choice,
    openai_chat_tools,
    openai_completion_params,
    openai_handle_bad_request,
    openai_prompt_from_messages,
)

logger = get_logger()
@@ -50,7 +55,7 @@ def __init__(
        assert self.base_url, f'Base URL for {model_name} not found'

        # remove trailing slash from base_url
-       self.base_url = self.base_url.rstrip('/').removesuffix('/chat/completions')
+       self.base_url = self.base_url.rstrip('/').removesuffix('/chat/completions').removesuffix('/completions')

        # create http client
        self.client = OpenAI(
@@ -142,3 +147,53 @@ def chat_choices_from_completion(self, completion: ChatCompletion,
    def handle_bad_request(self, ex: APIStatusError) -> Union[ModelOutput, Exception]:
        """Hook for subclasses to do bad request handling"""
        return openai_handle_bad_request(self.model_name, ex)
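
As a quick check of the widened suffix stripping in __init__ above, the normalization maps a trailing slash, a chat completions URL, and a plain completions URL to the same base; the snippet below is a standalone illustration with made-up URLs:

for url in (
    'http://localhost:8000/v1/',
    'http://localhost:8000/v1/chat/completions',
    'http://localhost:8000/v1/completions',
):
    # same expression as the diff above; each line prints http://localhost:8000/v1
    print(url.rstrip('/').removesuffix('/chat/completions').removesuffix('/completions'))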


class OpenAIBaseModelAPI(OpenAICompatibleAPI):
    """OpenAI compatible API that targets the completions endpoint."""

    def generate(
        self,
        input: List[ChatMessage],
        tools: List[ToolInfo],
        tool_choice: ToolChoice,
        config: GenerateConfig,
    ) -> ModelOutput:
        completion_params = self.completion_params(
            config=config,
            tools=False,
        )

        request = dict(
            prompt=openai_prompt_from_messages(input),
            **completion_params,
        )

        self.validate_completion_request_params(request)

        try:
            completion = self.client.completions.create(**request)
            # handle streaming response
            if not isinstance(completion, Completion):
                completion = collect_completion_stream_response(completion)
            response = completion.model_dump()
            self.on_response(response)

            choices = completion_choices_from_openai(completion)
            return model_output_from_openai_completion(completion, choices)

        except (BadRequestError, UnprocessableEntityError, PermissionDeniedError) as ex:
            return self.handle_bad_request(ex)

    def validate_completion_request_params(self, params: Dict[str, Any]):
        """Validate request params for completions endpoint."""
        if not hasattr(self, '_valid_completion_params'):
            self._valid_completion_params = get_supported_params(self.client.completions.create)
Comment on lines +190 to +191
Contributor

Severity: medium

It's good practice to cache the result of get_supported_params to avoid repeated and potentially expensive reflection calls. However, this caching is implemented on the instance (self). If multiple instances of OpenAIBaseModelAPI are created, this check will be performed for each one. Consider caching this at the class level to optimize further, as the supported parameters for self.client.completions.create will be the same across all instances of this class.

Suggested change
-       if not hasattr(self, '_valid_completion_params'):
-           self._valid_completion_params = get_supported_params(self.client.completions.create)
+       if not hasattr(OpenAIBaseModelAPI, '_valid_completion_params'):
+           OpenAIBaseModelAPI._valid_completion_params = get_supported_params(self.client.completions.create)


        extra_body = params.get('extra_body', {})
        for key in list(params.keys()):
            if key not in self._valid_completion_params:
                extra_body[key] = params.pop(key)

        if extra_body:
            params['extra_body'] = extra_body
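
For intuition, here is a tiny standalone mimic of the filtering step above: any key that the SDK's completions.create signature does not accept is relocated into extra_body. The helper name and the repetition_penalty parameter are hypothetical, used only for illustration:

def move_unknown_params_to_extra_body(params: dict, valid_params: set) -> dict:
    # Mimic (not the PR's code) of the relocation logic above.
    extra_body = params.get('extra_body', {})
    for key in list(params.keys()):
        if key != 'extra_body' and key not in valid_params:
            extra_body[key] = params.pop(key)
    if extra_body:
        params['extra_body'] = extra_body
    return params


request = {'model': 'demo', 'prompt': 'user: hi\nassistant:', 'max_tokens': 16, 'repetition_penalty': 1.05}
print(move_unknown_params_to_extra_body(request, {'model', 'prompt', 'max_tokens', 'extra_body'}))
# -> {'model': 'demo', 'prompt': 'user: hi\nassistant:', 'max_tokens': 16,
#     'extra_body': {'repetition_penalty': 1.05}}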
88 changes: 88 additions & 0 deletions evalscope/models/utils/openai.py
@@ -4,6 +4,14 @@
from collections import defaultdict
from copy import copy
from openai import APIStatusError, OpenAIError
from openai.types import Completion

# Compatibility shim: CompletionChoice location differs across openai versions
try:
    from openai.types.completion import Choice as CompletionChoice  # type: ignore
except ImportError:  # pragma: no cover
    from openai.types.completion_choice import CompletionChoice  # type: ignore

from openai.types.chat import (
    ChatCompletion,
    ChatCompletionAssistantMessageParam,
@@ -157,6 +165,16 @@ def openai_chat_messages(
    return [openai_chat_message(message, system_role) for message in messages]


def openai_prompt_from_messages(messages: List[ChatMessage]) -> str:
    """Flatten chat messages into a simple text prompt for completions API."""
    parts: List[str] = []
    for message in messages:
        role = getattr(message, 'role', 'user')
Contributor

Severity: medium

Using getattr(message, 'role', 'user') is defensive, but ChatMessage is a Union of types that all define a role attribute. Relying on getattr with a default might mask potential issues where a message object is missing its role, which would cause it to silently default to 'user' and lead to incorrect prompt formatting. It would be more robust and clearer to directly access message.role. If there's a scenario where role can be missing, it might be better to handle that case explicitly or adjust the type hints.

Suggested change
-       role = getattr(message, 'role', 'user')
+       role = message.role

        parts.append(f'{role}: {message.text}')
    parts.append('assistant:')
    return '\n'.join(parts)
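
To make the flattening concrete, a minimal sketch of the same logic using stand-in message objects (the real ChatMessage types come from evalscope; SimpleNamespace stands in only for this illustration):

from types import SimpleNamespace

# Stand-ins for ChatMessage objects; only .role and .text are used here.
messages = [
    SimpleNamespace(role='system', text='You are a helpful assistant.'),
    SimpleNamespace(role='user', text='What is 2 + 2?'),
]

parts = [f'{m.role}: {m.text}' for m in messages] + ['assistant:']
print('\n'.join(parts))
# system: You are a helpful assistant.
# user: What is 2 + 2?
# assistant: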


def openai_completion_params(model: str, config: GenerateConfig, tools: bool) -> Dict[str, Any]:
    params: Dict[str, Any] = dict(model=model)
    # handle stream option
@@ -505,6 +523,40 @@ def chat_message_assistant_from_openai(
)


def completion_choices_from_openai(response: Completion) -> List[ChatCompletionChoice]:
    choices = list(response.choices)
    choices.sort(key=lambda c: c.index)
    return [
        ChatCompletionChoice(
            message=ChatMessageAssistant(content=(choice.text or ''), model=response.model, source='generate'),
            stop_reason=as_stop_reason(choice.finish_reason),
            logprobs=None,
Contributor

Severity: medium

The logprobs field is hardcoded to None. The OpenAI completions API can return log probabilities if they are requested in generation_config. To fully support the features of the completions endpoint, it would be beneficial to parse the logprobs from the choice object and populate the logprobs field in ChatCompletionChoice. This would be consistent with how chat_choices_from_openai handles logprobs for chat completions.

        ) for choice in choices
    ]
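
Building on the reviewer's note above, one possible mapping is sketched below: it flattens the legacy completions logprobs lists (tokens, token_logprobs, top_logprobs) into per-token records. The record shape is assumed to mirror the chat-completions logprob layout and is illustrative, not part of this PR:

from typing import Any, Dict, List, Optional
from openai.types.completion_choice import CompletionChoice


def logprobs_from_completion_choice(choice: CompletionChoice) -> Optional[List[Dict[str, Any]]]:
    # Hypothetical helper (not in this PR): flatten legacy completions
    # logprobs into chat-style per-token records.
    lp = choice.logprobs
    if lp is None or not lp.tokens:
        return None
    records: List[Dict[str, Any]] = []
    for i, token in enumerate(lp.tokens):
        top = (lp.top_logprobs[i] or {}) if lp.top_logprobs and i < len(lp.top_logprobs) else {}
        records.append({
            'token': token,
            'logprob': lp.token_logprobs[i] if lp.token_logprobs else None,
            'top_logprobs': [{'token': t, 'logprob': v} for t, v in top.items()],
        })
    return records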


def model_output_from_openai_completion(
    completion: Completion,
    choices: List[ChatCompletionChoice],
) -> ModelOutput:
    return ModelOutput(
        model=completion.model,
        choices=choices,
        usage=(
            ModelUsage(
                input_tokens=completion.usage.prompt_tokens,
                output_tokens=completion.usage.completion_tokens,
                input_tokens_cache_read=(
                    completion.usage.prompt_tokens_details.cached_tokens
                    if completion.usage.prompt_tokens_details is not None else None
                ),
                reasoning_tokens=None,
                total_tokens=completion.usage.total_tokens,
            ) if completion.usage else None
        ),
    )


def model_output_from_openai(
    completion: ChatCompletion,
    choices: list[ChatCompletionChoice],
@@ -616,6 +668,42 @@ def _parse_content_with_internal(content: str, ) -> Tuple[str, Optional[JsonValu
) if internal_match else (content, None))


def collect_completion_stream_response(response_stream: List[Completion]) -> Completion:
    collected_choices = defaultdict(str)
    finish_reasons = {}
    last_chunk: Optional[Completion] = None

    for chunk in response_stream:
        last_chunk = chunk
        for choice in chunk.choices:
            collected_choices[choice.index] += getattr(choice, 'text', '') or ''
            if choice.finish_reason:
                finish_reasons[choice.index] = choice.finish_reason

    if last_chunk is None:
        raise ValueError('Empty completion stream')

    choices: List[CompletionChoice] = []
    for index, text in collected_choices.items():
        choices.append(
            CompletionChoice(
                finish_reason=finish_reasons.get(index, 'stop'),
                index=index,
                logprobs=None,
Contributor

Severity: medium

Similar to the non-streaming case, logprobs are hardcoded to None when reconstructing choices from a stream. The streaming chunks can contain logprob information which is currently being discarded. To provide full feature support, please consider collecting and reconstructing the logprobs from the stream as well.

                text=text,
            )
        )

    return Completion(
        id=last_chunk.id,
        choices=choices,
        created=last_chunk.created,
        model=last_chunk.model,
        object=getattr(last_chunk, 'object', 'completion'),
Contributor

Severity: medium

The object attribute of the reconstructed Completion object is set using getattr(last_chunk, 'object', 'completion'). According to the OpenAI API documentation, the object type for a completion object is 'text_completion'. Using 'completion' might be incorrect. For consistency with the OpenAI spec and with how collect_stream_response handles chat completions (it sets object='chat.completion'), this should probably be set to 'text_completion'. The object for a streaming chunk is also 'text_completion', so you could just use last_chunk.object.

Suggested change
-       object=getattr(last_chunk, 'object', 'completion'),
+       object='text_completion',

        usage=getattr(last_chunk, 'usage', None),
    )


def collect_stream_response(response_stream: List[ChatCompletionChunk]) -> ChatCompletion:
    collected_chunks: List[ChatCompletionChunk] = []
    collected_messages = defaultdict(list)