Commit 04c7900

feat: vlm benchmark with structured output (#666)
Co-authored-by: Maciej Majek <[email protected]>
Parent: 49a869e

29 files changed: +798 -299 lines

docs/simulation_and_benchmarking/rai_bench.md

Lines changed: 0 additions & 1 deletion
```diff
@@ -127,7 +127,6 @@ Tasks of this benchmark are grouped by type:
 
 - Basic - basic usage of tools
 - Navigation
-- Spatial reasoning - questions about surroundings with images attached
 - Manipulation
 - Custom Interfaces - requires using messages with custom interfaces
```

docs/tutorials/benchmarking.md

Lines changed: 0 additions & 1 deletion
```diff
@@ -94,7 +94,6 @@ if __name__ == "__main__":
         extra_tool_calls=5, # how many extra tool calls allowed to still pass
         task_types=[ # what types of tasks to include
             "basic",
-            "spatial_reasoning",
             "manipulation",
         ],
         repeats=1,
```

src/rai_bench/README.md

Lines changed: 16 additions & 0 deletions
````diff
@@ -163,6 +163,22 @@ python src/rai_bench/rai_bench/examples/tool_calling_agent/main.py --model-name
 > [!NOTE]
 > The configs of vendors are defined in [config.toml](../../config.toml) Change ithem if needed.
 
+## VLM Benchmark
+
+The VLM Benchmark is a benchmark for VLM models. It includes a set of tasks containing questions related to images and evaluates the performance of the agent that returns the answer in the structured format.
+
+### Running
+
+To set up tracing backends, please follow the instructions in the [tracing.md](../../docs/tracing.md) document.
+
+To run the benchmark:
+
+```bash
+cd rai
+source setup_shell.sh
+python src/rai_bench/rai_bench/examples/vlm_benchmark.py --model-name gemma3:4b --vendor ollama
+```
+
 ## Testing Models
 
 To test multiple models, different benchamrks or couple repeats in one go - use script [test_models](./rai_bench/examples/test_models.py)
````
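
The added section does not show what the "structured format" looks like. As a rough, hypothetical illustration (not rai_bench's actual implementation), a pydantic schema combined with LangChain's `with_structured_output` is one way an agent can be forced to return a parseable answer; `SpatialAnswer` and the question text below are invented for the sketch:

```python
# Hypothetical sketch only: constrain a VLM's answer to a schema.
# `SpatialAnswer` and the prompt are illustrative, not taken from rai_bench.
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field


class SpatialAnswer(BaseModel):
    """Structured answer to a yes/no question about an image."""

    answer: bool = Field(description="True if the statement about the image holds")


llm = ChatOpenAI(model="gpt-4o-mini")
structured_llm = llm.with_structured_output(SpatialAnswer)

result = structured_llm.invoke(
    [HumanMessage(content="Is the cup to the left of the plate?")]
)
print(result.answer)  # a validated bool, not free-form text
```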

src/rai_bench/rai_bench/__init__.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -22,6 +22,7 @@
     get_llm_for_benchmark,
     parse_manipulation_o3de_benchmark_args,
     parse_tool_calling_benchmark_args,
+    parse_vlm_benchmark_args,
 )
 
 __all__ = [
@@ -31,6 +32,7 @@
     "get_llm_for_benchmark",
     "parse_manipulation_o3de_benchmark_args",
     "parse_tool_calling_benchmark_args",
+    "parse_vlm_benchmark_args",
     "test_dual_agents",
     "test_models",
 ]
```
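
`parse_vlm_benchmark_args` itself lives in `rai_bench.utils` and is not part of this diff. The new example script (later in this commit) reads only `args.model_name`, `args.vendor`, and `args.out_dir`, so a minimal equivalent parser would look roughly like the following; the default output directory is a guess:

```python
# Hypothetical reconstruction; the real parser is in rai_bench/utils.py
# and is not shown in this diff.
import argparse


def parse_vlm_benchmark_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Run the VLM benchmark.")
    parser.add_argument("--model-name", type=str, required=True)
    parser.add_argument("--vendor", type=str, required=True)
    parser.add_argument("--out-dir", type=str, default="out/vlm_benchmark")
    return parser.parse_args()
```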

src/rai_bench/rai_bench/docs/tool_calling_agent_benchmark.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -14,4 +14,4 @@ Implementations can be found:
 
 - Validators [Validators](../tool_calling_agent/validators.py)
 - Subtasks [Validators](../tool_calling_agent/tasks/subtasks.py)
-- Tasks, including navigation, spatial, custom interfaces and other [Tasks](../tool_calling_agent/tasks/)
+- Tasks, including navigation, custom interfaces and other [Tasks](../tool_calling_agent/tasks/)
```

src/rai_bench/rai_bench/examples/benchmarking_models.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -35,7 +35,6 @@
         extra_tool_calls=5, # how many extra tool calls allowed to still pass
         task_types=[ # what types of tasks to include
             "basic",
-            "spatial_reasoning",
             "manipulation",
         ],
         repeats=1,
```

src/rai_bench/rai_bench/examples/dual_agent.py

Lines changed: 1 addition & 7 deletions
```diff
@@ -16,7 +16,6 @@
 
 from rai_bench import (
     ManipulationO3DEBenchmarkConfig,
-    ToolCallingAgentBenchmarkConfig,
     test_dual_agents,
 )
 
@@ -29,11 +28,6 @@
 
 tool_llm = ChatOpenAI(model="gpt-4o-mini", base_url="https://api.openai.com/v1/")
 # Define benchmarks that will be used
-tool_conf = ToolCallingAgentBenchmarkConfig(
-    extra_tool_calls=0, # how many extra tool calls allowed to still pass
-    task_types=["spatial_reasoning"],
-    repeats=15,
-)
 
 man_conf = ManipulationO3DEBenchmarkConfig(
     o3de_config_path="src/rai_bench/rai_bench/manipulation_o3de/predefined/configs/o3de_config.yaml", # path to your o3de config
@@ -48,6 +42,6 @@
 test_dual_agents(
     multimodal_llms=[m_llm],
     tool_calling_models=[tool_llm],
-    benchmark_configs=[man_conf, tool_conf],
+    benchmark_configs=[man_conf],
     out_dir=out_dir,
 )
```
src/rai_bench/rai_bench/examples/vlm_benchmark.py

Lines changed: 48 additions & 0 deletions
```diff
@@ -0,0 +1,48 @@
+# Copyright (C) 2025 Robotec.AI
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+
+from rai_bench import (
+    define_benchmark_logger,
+    parse_vlm_benchmark_args,
+)
+from rai_bench.utils import get_llm_for_benchmark
+from rai_bench.vlm_benchmark import get_spatial_tasks, run_benchmark
+
+if __name__ == "__main__":
+    args = parse_vlm_benchmark_args()
+    experiment_dir = Path(args.out_dir)
+    experiment_dir.mkdir(parents=True, exist_ok=True)
+    bench_logger = define_benchmark_logger(out_dir=experiment_dir)
+    try:
+        tasks = get_spatial_tasks()
+        for task in tasks:
+            task.set_logger(bench_logger)
+
+        llm = get_llm_for_benchmark(
+            model_name=args.model_name,
+            vendor=args.vendor,
+        )
+        run_benchmark(
+            llm=llm,
+            out_dir=experiment_dir,
+            tasks=tasks,
+            bench_logger=bench_logger,
+        )
+    except Exception as e:
+        bench_logger.critical(
+            msg=f"Benchmark failed with error: {e}",
+            exc_info=True,
+        )
```
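
The script can also serve as a template for driving the benchmark programmatically instead of via the CLI wrapper. This sketch only reuses calls that appear in the file above; the output path is arbitrary:

```python
# Minimal programmatic run, assuming the same public API as the script above.
from pathlib import Path

from rai_bench import define_benchmark_logger
from rai_bench.utils import get_llm_for_benchmark
from rai_bench.vlm_benchmark import get_spatial_tasks, run_benchmark

out_dir = Path("out/vlm_gemma3_4b")  # arbitrary output directory
out_dir.mkdir(parents=True, exist_ok=True)
logger = define_benchmark_logger(out_dir=out_dir)

# Attach the benchmark logger to every task before running.
tasks = get_spatial_tasks()
for task in tasks:
    task.set_logger(logger)

llm = get_llm_for_benchmark(model_name="gemma3:4b", vendor="ollama")
run_benchmark(llm=llm, out_dir=out_dir, tasks=tasks, bench_logger=logger)
```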

src/rai_bench/rai_bench/results_processing/langfuse_scores_tracing.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -47,15 +47,15 @@ def send_score(
     if isinstance(callback, CallbackHandler):
         callback.langfuse.score(
             trace_id=str(run_id),
-            name="tool calls result",
+            name="result",
             value=score,
             comment=comment,
         )
         return None
     if isinstance(callback, LangChainTracer):
         callback.client.create_feedback(
             run_id=run_id,
-            key="tool calls result",
+            key="result",
             score=score,
             comment=comment,
         )
```
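
The rename drops the tool-calling-specific label, presumably so VLM benchmark scores can reuse the same reporting path. A call site might look like the sketch below; the `send_score` parameter names beyond those visible in the hunk are assumptions, as is the Langfuse import path:

```python
# Hypothetical call site; parameter names inferred from the hunk above.
from uuid import uuid4

from langfuse.callback import CallbackHandler  # import path assumed (SDK v2)

from rai_bench.results_processing.langfuse_scores_tracing import send_score

callback = CallbackHandler()  # traces LangChain runs to Langfuse
send_score(
    callback=callback,
    run_id=uuid4(),
    score=1.0,  # e.g. 1.0 for a passed task
    comment="task passed",
)
```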

src/rai_bench/rai_bench/test_models.py

Lines changed: 0 additions & 2 deletions
```diff
@@ -64,14 +64,12 @@ class ToolCallingAgentBenchmarkConfig(BenchmarkConfig):
             "manipulation",
             "navigation",
             "custom_interfaces",
-            "spatial_reasoning",
         ]
     ] = [
         "basic",
         "manipulation",
         "navigation",
         "custom_interfaces",
-        "spatial_reasoning",
     ]
 
     @property
```
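
With `spatial_reasoning` removed from the tool-calling task types (those image-based tasks now live in the new VLM benchmark), a tool-calling config is built like this; the sketch mirrors the updated `examples/benchmarking_models.py` hunk above:

```python
# Mirrors the updated examples in this commit; the import path is the one
# used by examples/dual_agent.py.
from rai_bench import ToolCallingAgentBenchmarkConfig

conf = ToolCallingAgentBenchmarkConfig(
    extra_tool_calls=5,  # how many extra tool calls allowed to still pass
    task_types=["basic", "manipulation", "navigation", "custom_interfaces"],
    repeats=1,
)
```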
