Commit 04c7900

feat: vlm benchmark with structured output (#666)
Co-authored-by: Maciej Majek <[email protected]>
Parent: 49a869e

29 files changed: +798 -299 lines

docs/simulation_and_benchmarking/rai_bench.md

Lines changed: 0 additions & 1 deletion
```diff
@@ -127,7 +127,6 @@ Tasks of this benchmark are grouped by type:
 
 - Basic - basic usage of tools
 - Navigation
-- Spatial reasoning - questions about surroundings with images attached
 - Manipulation
 - Custom Interfaces - requires using messages with custom interfaces
```

docs/tutorials/benchmarking.md

Lines changed: 0 additions & 1 deletion
```diff
@@ -94,7 +94,6 @@ if __name__ == "__main__":
         extra_tool_calls=5, # how many extra tool calls allowed to still pass
         task_types=[ # what types of tasks to include
             "basic",
-            "spatial_reasoning",
             "manipulation",
         ],
         repeats=1,
```

src/rai_bench/README.md

Lines changed: 16 additions & 0 deletions
````diff
@@ -163,6 +163,22 @@ python src/rai_bench/rai_bench/examples/tool_calling_agent/main.py --model-name
 > [!NOTE]
 > The configs of vendors are defined in [config.toml](../../config.toml) Change ithem if needed.
 
+## VLM Benchmark
+
+The VLM Benchmark is a benchmark for VLM models. It includes a set of tasks containing questions related to images and evaluates the performance of the agent that returns the answer in the structured format.
+
+### Running
+
+To set up tracing backends, please follow the instructions in the [tracing.md](../../docs/tracing.md) document.
+
+To run the benchmark:
+
+```bash
+cd rai
+source setup_shell.sh
+python src/rai_bench/rai_bench/examples/vlm_benchmark.py --model-name gemma3:4b --vendor ollama
+```
+
 ## Testing Models
 
 To test multiple models, different benchamrks or couple repeats in one go - use script [test_models](./rai_bench/examples/test_models.py)
````
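
The added section does not show what the "structured format" looks like. As a rough, hypothetical illustration (not rai_bench's actual implementation), a pydantic schema combined with LangChain's `with_structured_output` is one way an agent can be forced to return a parseable answer; `SpatialAnswer` and the question text below are invented for the sketch:

```python
# Hypothetical sketch only: constrain a VLM's answer to a schema.
# `SpatialAnswer` and the prompt are illustrative, not taken from rai_bench.
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field


class SpatialAnswer(BaseModel):
    """Structured answer to a yes/no question about an image."""

    answer: bool = Field(description="True if the statement about the image holds")


llm = ChatOpenAI(model="gpt-4o-mini")
structured_llm = llm.with_structured_output(SpatialAnswer)

result = structured_llm.invoke(
    [HumanMessage(content="Is the cup to the left of the plate?")]
)
print(result.answer)  # a validated bool, not free-form text
```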

src/rai_bench/rai_bench/__init__.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -22,6 +22,7 @@
     get_llm_for_benchmark,
     parse_manipulation_o3de_benchmark_args,
     parse_tool_calling_benchmark_args,
+    parse_vlm_benchmark_args,
 )
 
 __all__ = [
@@ -31,6 +32,7 @@
     "get_llm_for_benchmark",
     "parse_manipulation_o3de_benchmark_args",
     "parse_tool_calling_benchmark_args",
+    "parse_vlm_benchmark_args",
     "test_dual_agents",
     "test_models",
 ]
```
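
`parse_vlm_benchmark_args` itself lives in `rai_bench.utils` and is not part of this diff. The new example script (later in this commit) reads only `args.model_name`, `args.vendor`, and `args.out_dir`, so a minimal equivalent parser would look roughly like the following; the default output directory is a guess:

```python
# Hypothetical reconstruction; the real parser is in rai_bench/utils.py
# and is not shown in this diff.
import argparse


def parse_vlm_benchmark_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Run the VLM benchmark.")
    parser.add_argument("--model-name", type=str, required=True)
    parser.add_argument("--vendor", type=str, required=True)
    parser.add_argument("--out-dir", type=str, default="out/vlm_benchmark")
    return parser.parse_args()
```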

src/rai_bench/rai_bench/docs/tool_calling_agent_benchmark.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -14,4 +14,4 @@ Implementations can be found:
 
 - Validators [Validators](../tool_calling_agent/validators.py)
 - Subtasks [Validators](../tool_calling_agent/tasks/subtasks.py)
-- Tasks, including navigation, spatial, custom interfaces and other [Tasks](../tool_calling_agent/tasks/)
+- Tasks, including navigation, custom interfaces and other [Tasks](../tool_calling_agent/tasks/)
```

src/rai_bench/rai_bench/examples/benchmarking_models.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -35,7 +35,6 @@
         extra_tool_calls=5, # how many extra tool calls allowed to still pass
         task_types=[ # what types of tasks to include
             "basic",
-            "spatial_reasoning",
             "manipulation",
         ],
         repeats=1,
```

src/rai_bench/rai_bench/examples/dual_agent.py

Lines changed: 1 addition & 7 deletions
```diff
@@ -16,7 +16,6 @@
 
 from rai_bench import (
     ManipulationO3DEBenchmarkConfig,
-    ToolCallingAgentBenchmarkConfig,
     test_dual_agents,
 )
 
@@ -29,11 +28,6 @@
 
 tool_llm = ChatOpenAI(model="gpt-4o-mini", base_url="https://api.openai.com/v1/")
 # Define benchmarks that will be used
-tool_conf = ToolCallingAgentBenchmarkConfig(
-    extra_tool_calls=0, # how many extra tool calls allowed to still pass
-    task_types=["spatial_reasoning"],
-    repeats=15,
-)
 
 man_conf = ManipulationO3DEBenchmarkConfig(
     o3de_config_path="src/rai_bench/rai_bench/manipulation_o3de/predefined/configs/o3de_config.yaml", # path to your o3de config
@@ -48,6 +42,6 @@
 test_dual_agents(
     multimodal_llms=[m_llm],
     tool_calling_models=[tool_llm],
-    benchmark_configs=[man_conf, tool_conf],
+    benchmark_configs=[man_conf],
     out_dir=out_dir,
 )
```
src/rai_bench/rai_bench/examples/vlm_benchmark.py

Lines changed: 48 additions & 0 deletions
```diff
@@ -0,0 +1,48 @@
+# Copyright (C) 2025 Robotec.AI
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+
+from rai_bench import (
+    define_benchmark_logger,
+    parse_vlm_benchmark_args,
+)
+from rai_bench.utils import get_llm_for_benchmark
+from rai_bench.vlm_benchmark import get_spatial_tasks, run_benchmark
+
+if __name__ == "__main__":
+    args = parse_vlm_benchmark_args()
+    experiment_dir = Path(args.out_dir)
+    experiment_dir.mkdir(parents=True, exist_ok=True)
+    bench_logger = define_benchmark_logger(out_dir=experiment_dir)
+    try:
+        tasks = get_spatial_tasks()
+        for task in tasks:
+            task.set_logger(bench_logger)
+
+        llm = get_llm_for_benchmark(
+            model_name=args.model_name,
+            vendor=args.vendor,
+        )
+        run_benchmark(
+            llm=llm,
+            out_dir=experiment_dir,
+            tasks=tasks,
+            bench_logger=bench_logger,
+        )
+    except Exception as e:
+        bench_logger.critical(
+            msg=f"Benchmark failed with error: {e}",
+            exc_info=True,
+        )
```
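
The script can also serve as a template for driving the benchmark programmatically instead of via the CLI wrapper. This sketch only reuses calls that appear in the file above; the output path is arbitrary:

```python
# Minimal programmatic run, assuming the same public API as the script above.
from pathlib import Path

from rai_bench import define_benchmark_logger
from rai_bench.utils import get_llm_for_benchmark
from rai_bench.vlm_benchmark import get_spatial_tasks, run_benchmark

out_dir = Path("out/vlm_gemma3_4b")  # arbitrary output directory
out_dir.mkdir(parents=True, exist_ok=True)
logger = define_benchmark_logger(out_dir=out_dir)

# Attach the benchmark logger to every task before running.
tasks = get_spatial_tasks()
for task in tasks:
    task.set_logger(logger)

llm = get_llm_for_benchmark(model_name="gemma3:4b", vendor="ollama")
run_benchmark(llm=llm, out_dir=out_dir, tasks=tasks, bench_logger=logger)
```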

src/rai_bench/rai_bench/results_processing/langfuse_scores_tracing.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -47,15 +47,15 @@ def send_score(
     if isinstance(callback, CallbackHandler):
         callback.langfuse.score(
             trace_id=str(run_id),
-            name="tool calls result",
+            name="result",
             value=score,
             comment=comment,
         )
         return None
     if isinstance(callback, LangChainTracer):
         callback.client.create_feedback(
             run_id=run_id,
-            key="tool calls result",
+            key="result",
             score=score,
             comment=comment,
         )
```
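
The rename drops the tool-calling-specific label, presumably so VLM benchmark scores can reuse the same reporting path. A call site might look like the sketch below; the `send_score` parameter names beyond those visible in the hunk are assumptions, as is the Langfuse import path:

```python
# Hypothetical call site; parameter names inferred from the hunk above.
from uuid import uuid4

from langfuse.callback import CallbackHandler  # import path assumed (SDK v2)

from rai_bench.results_processing.langfuse_scores_tracing import send_score

callback = CallbackHandler()  # traces LangChain runs to Langfuse
send_score(
    callback=callback,
    run_id=uuid4(),
    score=1.0,  # e.g. 1.0 for a passed task
    comment="task passed",
)
```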

src/rai_bench/rai_bench/test_models.py

Lines changed: 0 additions & 2 deletions
```diff
@@ -64,14 +64,12 @@ class ToolCallingAgentBenchmarkConfig(BenchmarkConfig):
             "manipulation",
             "navigation",
             "custom_interfaces",
-            "spatial_reasoning",
         ]
     ] = [
         "basic",
         "manipulation",
         "navigation",
         "custom_interfaces",
-        "spatial_reasoning",
     ]
 
     @property
```
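
With `spatial_reasoning` removed from the tool-calling task types (those image-based tasks now live in the new VLM benchmark), a tool-calling config is built like this; the sketch mirrors the updated `examples/benchmarking_models.py` hunk above:

```python
# Mirrors the updated examples in this commit; the import path is the one
# used by examples/dual_agent.py.
from rai_bench import ToolCallingAgentBenchmarkConfig

conf = ToolCallingAgentBenchmarkConfig(
    extra_tool_calls=5,  # how many extra tool calls allowed to still pass
    task_types=["basic", "manipulation", "navigation", "custom_interfaces"],
    repeats=1,
)
```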
