Skip to content

Commit 344dda4

Browse files
Merge pull request #47 from lucaromagnoli/feat/model-benchmarks
tests(bench): add test_benchmarks.cpp to the test suite and remove the obsolete tests/integration/test_benchmarks.cpp
2 parents da886cb + c97bfa8 commit 344dda4

File tree

5 files changed

+125
-143
lines changed

5 files changed

+125
-143
lines changed

include/openai/OpenAITypes.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -677,7 +677,8 @@ struct ResponsesRequest {
677677
auto modelEnum = modelFromString(model);
678678

679679
// Reasoning models (O-series + GPT-5) have different parameter support
680-
if (modelEnum == Model::GPT_5 || modelEnum == Model::O3 || modelEnum == Model::O3_Mini ||
680+
if (modelEnum == Model::GPT_5 || modelEnum == Model::GPT_5_Mini ||
681+
modelEnum == Model::GPT_5_Nano || modelEnum == Model::O3 || modelEnum == Model::O3_Mini ||
681682
modelEnum == Model::O1 || modelEnum == Model::O1_Mini ||
682683
modelEnum == Model::O1_Preview || modelEnum == Model::O1_Pro ||
683684
modelEnum == Model::O4_Mini || modelEnum == Model::O4_Mini_Deep_Research) {
@@ -1024,8 +1025,11 @@ std::string getRecommendedApiForModel(const std::string& model);
10241025

10251026
// Model lists for different APIs
10261027
const std::vector<std::string> RESPONSES_MODELS = {
1027-
"gpt-5", "gpt-4o", "gpt-4o-mini", "gpt-4.1", "gpt-4.1-nano", "gpt-4.1-mini",
1028-
"gpt-image-1", "o1", "o3-mini", "o3", "o4-mini", "computer-use-preview"};
1028+
"gpt-5", "gpt-5-mini", "gpt-5-nano",
1029+
"gpt-4o", "gpt-4o-mini",
1030+
"gpt-4.1", "gpt-4.1-nano", "gpt-4.1-mini",
1031+
"gpt-image-1", "o1", "o3-mini", "o3",
1032+
"o4-mini", "computer-use-preview"};
10291033

10301034
const std::vector<std::string> CHAT_COMPLETION_MODELS = {"gpt-4", "gpt-4-turbo", "gpt-4o",
10311035
"gpt-4o-mini", "gpt-3.5-turbo"};

tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ add_executable(llmcpp_tests
3535
${UNIT_TEST_SOURCES}
3636
${INTEGRATION_TEST_SOURCES}
3737
bench/benchmark_core.cpp
38+
bench/test_benchmarks.cpp
3839
)
3940

4041
# Link against the library and test framework
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#!/usr/bin/env bash
# Benchmark runner: builds the test binary in Release mode and emits a CSV of
# per-model latency and token usage from the OpenAI structured-output benchmarks.
set -euo pipefail

# Resolve repo root (prefer git); fall back to two levels up from this script.
if ! repo_root=$(git rev-parse --show-toplevel 2>/dev/null); then
  script_dir="$(cd "$(dirname "$0")" && pwd)"
  repo_root="$(cd "$script_dir/../.." && pwd)"
fi

cd "$repo_root"

# Load API key from .env if present, then require it to be set.
# (The previous `set -a && source .env 2>/dev/null || true && set +a` chain
# relied on `&&`/`||` left-to-right precedence under `set -e` and could also
# proceed with the key still unset when .env existed but lacked it.)
if [[ -f .env ]]; then
  set -a
  # shellcheck disable=SC1091
  source .env
  set +a
fi
if [[ -z "${OPENAI_API_KEY:-}" ]]; then
  echo "ERROR: OPENAI_API_KEY not set and .env missing." >&2
  exit 1
fi

build_dir=cmake-build-bench
cmake -S . -B "$build_dir" -G Ninja -DCMAKE_BUILD_TYPE=Release -DLLMCPP_BUILD_TESTS=ON >/dev/null
cmake --build "$build_dir" -j >/dev/null

echo "model,ms,status,input_tokens,output_tokens,total_tokens"
# Keep only CSV rows that begin with a model name. Note: the previous pattern
# (/^gpt|^o[13]-|^o4-mini/) required a dash after the digit and therefore
# silently dropped the bare "o1" and "o3" rows.
LLMCPP_RUN_BENCHMARKS=1 "$build_dir"/tests/llmcpp_tests "[openai][integration][benchmark]" --reporter compact 2>/dev/null |
  awk -F, '/^(gpt|o[0-9])/ { print }'

tests/bench/test_benchmarks.cpp

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
#include <catch2/catch_test_macros.hpp>
2+
#include <chrono>
3+
#include <cstdlib>
4+
#include <iostream>
5+
#include <string>
6+
7+
#include "openai/OpenAIClient.h"
8+
#include "openai/OpenAITypes.h"
9+
10+
using namespace std::chrono;
11+
12+
// True when the model routes through the reasoning-model parameter set
// (O-series + GPT-5 family). This mirrors the reasoning-model check in
// ResponsesRequest (OpenAITypes.h), which also treats O4_Mini_Deep_Research
// as a reasoning model — the original version of this helper omitted it;
// keep the two lists in sync.
static bool isReasoningModel(OpenAI::Model model) {
    switch (model) {
        case OpenAI::Model::GPT_5:
        case OpenAI::Model::GPT_5_Mini:
        case OpenAI::Model::GPT_5_Nano:
        case OpenAI::Model::O1:
        case OpenAI::Model::O1_Mini:
        case OpenAI::Model::O1_Preview:
        case OpenAI::Model::O1_Pro:
        case OpenAI::Model::O3:
        case OpenAI::Model::O3_Mini:
        case OpenAI::Model::O4_Mini:
        case OpenAI::Model::O4_Mini_Deep_Research:  // added for parity with OpenAITypes.h
            return true;
        default:
            return false;
    }
}
19+
20+
// Models listed in RESPONSES_MODELS that cannot run the JSON-schema benchmark
// (image generation and computer-use previews produce no structured text).
static bool isExcludedModel(const std::string &modelName) {
    const bool imageModel = (modelName == "gpt-image-1");
    const bool computerUsePreview = (modelName == "computer-use-preview");
    return imageModel || computerUsePreview;
}
23+
24+
// Identifies the GPT-5 family (base / mini / nano).
// NOTE(review): not referenced anywhere in this file yet — a file-local
// `static` function that is never called triggers -Wunused-function, so mark
// it [[maybe_unused]] until a benchmark actually needs the family split.
[[maybe_unused]] static bool isGpt5Family(OpenAI::Model model) {
    return model == OpenAI::Model::GPT_5 || model == OpenAI::Model::GPT_5_Mini ||
           model == OpenAI::Model::GPT_5_Nano;
}
28+
29+
TEST_CASE("OpenAI model benchmarks (structured outputs)", "[openai][integration][benchmark]") {
    // Opt-in guard: these tests hit the live API, so they only run when
    // LLMCPP_RUN_BENCHMARKS=1 is exported in the environment.
    const char* benchFlag = std::getenv("LLMCPP_RUN_BENCHMARKS");
    if (benchFlag == nullptr || std::string(benchFlag) != "1") {
        SUCCEED("Benchmarks skipped. Set LLMCPP_RUN_BENCHMARKS=1 to enable.");
        return;
    }

    const char* key = std::getenv("OPENAI_API_KEY");
    REQUIRE(key != nullptr);

    OpenAIClient client(key);

    // Smallest schema that still exercises structured outputs: one required
    // string field, no additional properties allowed.
    json answerSchema = {{"type", "object"},
                         {"properties", {{"answer", {{"type", "string"}}}}},
                         {"required", json::array({"answer"})},
                         {"additionalProperties", false}};

    // Prompt phrased so that every model can satisfy the schema trivially.
    auto prompt = OpenAI::ResponsesInput::fromText(
        "Return a JSON object that conforms to the provided schema with answer set to 'OK'.");

    // Benchmark every Responses-API model except those that cannot produce
    // JSON-schema output.
    for (const auto& name : OpenAI::RESPONSES_MODELS) {
        if (isExcludedModel(name)) {
            std::cout << "[BENCH] skipping model=" << name << " (not supported for JSON schema bench)" << std::endl;
            continue;
        }
        DYNAMIC_SECTION("Benchmark model: " << name) {
            OpenAI::ResponsesRequest request;
            request.model = name;
            request.input = prompt;
            request.text = OpenAI::TextOutputConfig("bench_schema", answerSchema, true);
            // max_output_tokens is intentionally left unset so the server
            // default applies uniformly to every model.
            auto enumValue = OpenAI::modelFromString(name);

            // Reasoning-capable models accept an effort knob; keep it low to
            // bound cost and latency.
            if (isReasoningModel(enumValue)) {
                request.reasoning = json{{"effort", "low"}};
            }

            const auto t0 = steady_clock::now();
            auto resp = client.sendResponsesRequest(request);
            const auto t1 = steady_clock::now();

            const auto millis = duration_cast<milliseconds>(t1 - t0).count();
            const bool succeeded = resp.isCompleted() && !resp.hasError();
            const int promptTokens = resp.usage.inputTokens;
            const int completionTokens = resp.usage.outputTokens;
            // CSV row consumed by the benchmark runner script.
            std::cout << name << "," << millis << "," << (succeeded ? "ok" : "fail")
                      << "," << promptTokens << "," << completionTokens << ","
                      << (promptTokens + completionTokens) << std::endl;

            // Sanity only: assert a response object came back, but do not
            // require success — avoids flakes on transient API errors.
            REQUIRE(!resp.id.empty());
        }
    }
}

tests/integration/test_benchmarks.cpp

Lines changed: 0 additions & 140 deletions
This file was deleted.

0 commit comments

Comments
 (0)