From e3d54d3698fbb08e9655a5f39a2f8dabcec1b936 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=C3=B8rstad?=
Date: Mon, 22 Sep 2025 16:51:50 -0500
Subject: [PATCH 1/3] Support empty response for Completions and
 ChatCompletions API

---
 lm_eval/models/openai_completions.py | 11 ++++++++---
 tests/models/test_api.py             | 24 ++++++++++++++++++++++++
 2 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/lm_eval/models/openai_completions.py b/lm_eval/models/openai_completions.py
index d89f63d31e3..32a088e8d75 100644
--- a/lm_eval/models/openai_completions.py
+++ b/lm_eval/models/openai_completions.py
@@ -8,7 +8,6 @@
 from lm_eval.models.api_models import TemplateAPI
 from lm_eval.models.utils import handle_stop_sequences

-
 eval_logger = logging.getLogger(__name__)


@@ -95,7 +94,10 @@ def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]:
         for out in outputs:
             tmp = [None] * len(out["choices"])
             for choices in out["choices"]:
-                tmp[choices["index"]] = choices["text"]
+                x = ""
+                if choices["text"] is not None:
+                    x = choices["text"]
+                tmp[choices["index"]] = x
             res = res + tmp
         return res

@@ -167,7 +169,10 @@ def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]:
         for out in outputs:
             tmp = [None] * len(out["choices"])
             for choices in out["choices"]:
-                tmp[choices["index"]] = choices["message"]["content"]
+                x = ""
+                if choices["message"]["content"] is not None:
+                    x = choices["message"]["content"]
+                tmp[choices["index"]] = x
             res = res + tmp
         return res

diff --git a/tests/models/test_api.py b/tests/models/test_api.py
index 2db22377470..4ae368d83c8 100644
--- a/tests/models/test_api.py
+++ b/tests/models/test_api.py
@@ -3,6 +3,7 @@

 import pytest

+from lm_eval.api.instance import Instance
 from lm_eval.models.openai_completions import LocalCompletionsAPI


@@ -161,6 +162,29 @@ def test_model_tokenized_call_usage(
     assert result == {"result": "success"}


+def test_generate_until_with_null_message_content(api):
+    with patch("requests.post") as mock_post:
+        mock_response = MagicMock()
+        mock_response.json.return_value = {
+            "choices": [
+                {
+                    "index": 0,
+                    "text": None,
+                }
+            ]
+        }
+        mock_response.ok = True
+        mock_post.return_value = mock_response
+        request = Instance(
+            request_type="generate_until",
+            doc={},
+            arguments=("Test prompt", {"max_gen_toks": 10}),
+            idx=0,
+        )
+
+        _ = api.generate_until([request])
+
+
 class DummyAsyncContextManager:
     def __init__(self, result):
         self.result = result

From 05ee49186920c48ccb44ec3c20300446be861a01 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=C3=B8rstad?=
Date: Fri, 3 Oct 2025 11:39:07 +0200
Subject: [PATCH 2/3] Add warnings for empty/None API responses

- Add warning logs when the API returns None/empty responses in parse_generations
- Helps users identify when reasoning models consume the entire token budget
- Applied pre-commit formatting

Addresses review feedback from @baberabb
---
 lm_eval/models/openai_completions.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/lm_eval/models/openai_completions.py b/lm_eval/models/openai_completions.py
index 32a088e8d75..13bd5cf44f3 100644
--- a/lm_eval/models/openai_completions.py
+++ b/lm_eval/models/openai_completions.py
@@ -8,6 +8,7 @@
 from lm_eval.models.api_models import TemplateAPI
 from lm_eval.models.utils import handle_stop_sequences

+
 eval_logger = logging.getLogger(__name__)


@@ -97,6 +98,12 @@ def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]:
                 x = ""
                 if choices["text"] is not None:
                     x = choices["text"]
+                else:
+                    eval_logger.warning(
+                        f"Received empty response for choice {choices['index']}. "
+                        "This can happen when using reasoning models if the model spends all the token limit on reasoning. "
+                        "Consider increasing the number of allowed tokens."
+                    )
                 tmp[choices["index"]] = x
             res = res + tmp
         return res

@@ -172,6 +179,12 @@ def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]:
                 x = ""
                 if choices["message"]["content"] is not None:
                     x = choices["message"]["content"]
+                else:
+                    eval_logger.warning(
+                        f"Received empty response for choice {choices['index']}. "
+                        "This can happen when using reasoning models if the model spends all the token limit on reasoning. "
+                        "Consider increasing the number of allowed tokens."
+                    )
                 tmp[choices["index"]] = x
             res = res + tmp
         return res

From c01f509e2e5430f5dc894434d2394e49a64b575b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=C3=B8rstad?=
Date: Fri, 3 Oct 2025 11:53:28 +0200
Subject: [PATCH 3/3] Improve code readability and check for empty string
 instead of just None

---
 lm_eval/models/openai_completions.py | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/lm_eval/models/openai_completions.py b/lm_eval/models/openai_completions.py
index 13bd5cf44f3..96aa7ea0f08 100644
--- a/lm_eval/models/openai_completions.py
+++ b/lm_eval/models/openai_completions.py
@@ -95,16 +95,15 @@ def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]:
         for out in outputs:
             tmp = [None] * len(out["choices"])
             for choices in out["choices"]:
-                x = ""
-                if choices["text"] is not None:
-                    x = choices["text"]
-                else:
+                x = choices["text"]
+                content = x if x is not None else ""
+                if not content:
                     eval_logger.warning(
                         f"Received empty response for choice {choices['index']}. "
-                        "This can happen when using reasoning models if the model spends all the token limit on reasoning. "
+                        "This can happen when using reasoning models if the model spends the entire token budget on reasoning. "
                         "Consider increasing the number of allowed tokens."
                     )
-                tmp[choices["index"]] = x
+                tmp[choices["index"]] = content
             res = res + tmp
         return res

@@ -176,15 +175,15 @@ def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]:
         for out in outputs:
             tmp = [None] * len(out["choices"])
             for choices in out["choices"]:
-                x = ""
-                if choices["message"]["content"] is not None:
-                    x = choices["message"]["content"]
-                else:
+                x = choices["message"]["content"]
+                content = x if x is not None else ""
+                if not content:
                     eval_logger.warning(
                         f"Received empty response for choice {choices['index']}. "
-                        "This can happen when using reasoning models if the model spends all the token limit on reasoning. "
+                        "This can happen when using reasoning models if the model spends the entire token budget on reasoning. "
                         "Consider increasing the number of allowed tokens."
                     )
-                tmp[choices["index"]] = x
+                tmp[choices["index"]] = content
             res = res + tmp
         return res
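
Illustration (not part of the patch series): a minimal, self-contained sketch
of the Completions-side behavior the final patch converges on, so the
None-to-empty-string coercion and the warning can be exercised outside the
harness. The function name parse_choice_texts and the sample payload are
hypothetical stand-ins for the real parse_generations and an actual API
response; only the coercion and warning logic mirror the patched code.

import logging
from typing import Dict, List, Union

logging.basicConfig(level=logging.WARNING)
eval_logger = logging.getLogger(__name__)


def parse_choice_texts(outputs: Union[Dict, List[Dict]]) -> List[str]:
    # Hypothetical stand-in for the patched parse_generations (Completions).
    res = []
    if not isinstance(outputs, list):
        outputs = [outputs]
    for out in outputs:
        # Place each choice by its "index" field, as the harness does.
        tmp = [None] * len(out["choices"])
        for choice in out["choices"]:
            x = choice["text"]
            # Coerce None to "" so downstream scoring never sees None; warn,
            # since an empty completion often means a reasoning model spent
            # its entire token budget on reasoning.
            content = x if x is not None else ""
            if not content:
                eval_logger.warning(
                    f"Received empty response for choice {choice['index']}."
                )
            tmp[choice["index"]] = content
        res = res + tmp
    return res


# A choice whose "text" is None (token budget exhausted) now parses to "".
payload = {"choices": [{"index": 0, "text": None}, {"index": 1, "text": "42"}]}
assert parse_choice_texts(payload) == ["", "42"]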