
import rai_bench.manipulation_o3de as manipulation_o3de
import rai_bench.tool_calling_agent as tool_calling_agent
+import rai_bench.vlm_benchmark as vlm_benchmark
from rai_bench.utils import (
    define_benchmark_logger,
    get_llm_for_benchmark,
@@ -77,6 +78,25 @@ def name(self) -> str:
        return "tool_calling_agent"


+class VLMBenchmarkConfig(BenchmarkConfig):
+    complexities: List[Literal["easy", "medium", "hard"]] = ["easy", "medium", "hard"]
+    task_types: List[
+        Literal[
+            "bool_response_image_task",
+            "quantity_response_image_task",
+            "multiple_choice_image_task",
+        ]
+    ] = [
+        "bool_response_image_task",
+        "quantity_response_image_task",
+        "multiple_choice_image_task",
+    ]
+
+    @property
+    def name(self) -> str:
+        return "vlm"
+
+
def test_dual_agents(
    multimodal_llms: List[BaseChatModel],
    tool_calling_models: List[BaseChatModel],
@@ -211,6 +231,15 @@ def test_models(
                    experiment_id=experiment_id,
                    bench_logger=bench_logger,
                )
+
+            elif isinstance(bench_conf, VLMBenchmarkConfig):
+                vlm_tasks = vlm_benchmark.get_spatial_tasks()
+                vlm_benchmark.run_benchmark(
+                    llm=llm,
+                    out_dir=Path(curr_out_dir),
+                    tasks=vlm_tasks,
+                    bench_logger=bench_logger,
+                )
        except Exception as e:
            bench_logger.critical(f"BENCHMARK RUN FAILED: {e}")
            bench_logger.critical(
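Taken together, these hunks register the VLM benchmark alongside the existing manipulation and tool-calling paths: a `VLMBenchmarkConfig` selects the benchmark by `name`, and the new `elif` branch fetches the spatial tasks and hands them to `vlm_benchmark.run_benchmark`. Below is a minimal standalone sketch of the same call path. It is an illustration only: the module path for `VLMBenchmarkConfig`, the keyword arguments of `get_llm_for_benchmark` and `define_benchmark_logger`, and the model/vendor choice are assumptions, not taken from this diff.

```python
from pathlib import Path

import rai_bench.vlm_benchmark as vlm_benchmark
from rai_bench.test_models import VLMBenchmarkConfig  # module path is an assumption
from rai_bench.utils import define_benchmark_logger, get_llm_for_benchmark

# Default config: all complexities and all three spatial task types.
# Note that the branch in the diff always calls get_spatial_tasks() without
# arguments, so these fields are not used for filtering in the hunk shown.
config = VLMBenchmarkConfig()

out_dir = Path("out") / config.name  # config.name == "vlm"; path is a placeholder
bench_logger = define_benchmark_logger(out_dir=out_dir)  # keyword name assumed

# Model name and vendor are placeholders; any multimodal chat model should work.
llm = get_llm_for_benchmark(model_name="gpt-4o", vendor="openai")  # signature assumed

vlm_benchmark.run_benchmark(
    llm=llm,
    out_dir=out_dir,
    tasks=vlm_benchmark.get_spatial_tasks(),
    bench_logger=bench_logger,
)
```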