fix: correct typing after the refactor that extracted common logic for VLM task inputs and answers

Magdalena Kotynia · Magdalena Kotynia · commit 1adc3d413cbb · 2025-08-18T15:51:58.000+02:00
diff --git a/src/rai_bench/rai_bench/vlm_benchmark/interfaces.py b/src/rai_bench/rai_bench/vlm_benchmark/interfaces.py
@@ -14,7 +14,7 @@
 
 import logging
 from abc import ABC, abstractmethod
-from typing import Generic, List, Literal, Optional, TypeVar
+from typing import Any, Generic, List, Literal, Optional, TypeVar
 
 from langchain_core.messages import BaseMessage
 from langchain_core.runnables.config import DEFAULT_RECURSION_LIMIT
@@ -28,29 +28,6 @@
 IMAGE_REASONING_SYSTEM_PROMPT = "You are a helpful and knowledgeable AI assistant that specializes in interpreting and analyzing visual content. Your task is to answer questions based on the images provided to you. Please response in requested structured output format."
 
 
-class LangchainRawOutputModel(BaseModel):
-    """
-    A Pydantic model for wrapping Langchain message parsing results from a structured output agent. See documentation for more details:
-    https://github.com/langchain-ai/langchain/blob/02001212b0a2b37d90451d8493089389ea220cab/libs/core/langchain_core/language_models/chat_models.py#L1430-L1432
-
-
-    Attributes
-    ----------
-    raw : BaseMessage
-        The original raw message object from Langchain before parsing.
-    parsed : BaseModel
-        The parsed and validated Pydantic model instance derived from the raw message.
-    parsing_error : Optional[BaseException]
-        Any exception that occurred during the parsing process, None if parsing
-        was successful.
-    """
-
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-    raw: BaseMessage
-    parsed: BaseModel
-    parsing_error: Optional[BaseException]
-
-
 class TaskValidationError(Exception):
     pass
 
@@ -78,7 +55,30 @@ class ImageReasoningAnswer(BaseModel, Generic[AnswerT]):
     justification: str = Field(..., description="Justification for the answer.")
 
 
-class ImageReasoningTask(ABC, Generic[BaseModelT]):
+class LangchainRawOutputModel(BaseModel):
+    """
+    A Pydantic model for wrapping Langchain message parsing results from a structured output agent. See documentation for more details:
+    https://github.com/langchain-ai/langchain/blob/02001212b0a2b37d90451d8493089389ea220cab/libs/core/langchain_core/language_models/chat_models.py#L1430-L1432
+
+
+    Attributes
+    ----------
+    raw : BaseMessage
+        The original raw message object from Langchain before parsing.
+    parsed : ImageReasoningAnswer[Any]
+        The parsed and validated Pydantic model instance derived from the raw message.
+    parsing_error : Optional[BaseException]
+        Any exception that occurred during the parsing process, None if parsing
+        was successful.
+    """
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    raw: BaseMessage
+    parsed: ImageReasoningAnswer[Any]
+    parsing_error: Optional[BaseException]
+
+
+class ImageReasoningTask(ABC, Generic[AnswerT]):
     complexity: Literal["easy", "medium", "hard"]
     recursion_limit: int = DEFAULT_RECURSION_LIMIT
 
@@ -103,13 +103,14 @@ def __init__(
             self.logger = logging.getLogger(__name__)
         self.question: str
         self.images_paths: List[str]
+        # TODO: move the task input here
 
     def set_logger(self, logger: loggers_type):
         self.logger = logger
 
     @property
     @abstractmethod
-    def structured_output(self) -> type[BaseModelT]:
+    def structured_output(self) -> type[ImageReasoningAnswer[AnswerT]]:
         """Structured output that agent should return."""
         pass
 
@@ -141,7 +142,7 @@ def get_prompt(self) -> str:
         pass
 
     @abstractmethod
-    def validate(self, output: BaseModelT) -> float:
+    def validate(self, output: ImageReasoningAnswer[AnswerT]) -> float:
         """Validate result of the task."""
         pass
 
@@ -158,7 +159,7 @@ def get_images(self) -> List[str]:
 
     def get_structured_output_from_messages(
         self, messages: List[BaseMessage]
-    ) -> BaseModelT | None:
+    ) -> ImageReasoningAnswer[AnswerT] | None:
         """Extract and validate structured output from a list of messages.
 
         Iterates through messages in reverse order, attempting to find the message that is
diff --git a/src/rai_bench/rai_bench/vlm_benchmark/tasks/tasks.py b/src/rai_bench/rai_bench/vlm_benchmark/tasks/tasks.py
@@ -57,7 +57,7 @@ class MultipleChoiceImageTaskInput(ImageReasoningTaskInput[List[str]]):
     )
 
 
-class BoolImageTask(ImageReasoningTask[BoolAnswerWithJustification]):
+class BoolImageTask(ImageReasoningTask[bool]):
     complexity = "easy"
 
     def __init__(
@@ -87,11 +87,11 @@ def get_images(self):
         images = [preprocess_image(image_path) for image_path in self.images_paths]
         return images
 
-    def validate(self, output: BoolAnswerWithJustification) -> float:
+    def validate(self, output: ImageReasoningAnswer[bool]) -> float:
         return float(output.answer == self.expected_answer)
 
 
-class QuantityImageTask(ImageReasoningTask[QuantityAnswerWithJustification]):
+class QuantityImageTask(ImageReasoningTask[int]):
     """A task that requires counting objects in an image."""
 
     complexity = "medium"
@@ -114,7 +114,7 @@ def type(self) -> str:
     def structured_output(self) -> Type[QuantityAnswerWithJustification]:
         return QuantityAnswerWithJustification
 
-    def validate(self, output: QuantityAnswerWithJustification) -> float:
+    def validate(self, output: ImageReasoningAnswer[int]) -> float:
         return float(output.answer == self.expected_answer)
 
     def get_prompt(self) -> str:
@@ -125,9 +125,7 @@ def get_images(self):
         return images
 
 
-class MultipleChoiceImageTask(
-    ImageReasoningTask[MultipleChoiceAnswerWithJustification]
-):
+class MultipleChoiceImageTask(ImageReasoningTask[List[str]]):
     """A task that requires selecting one or more answers from a set of options."""
 
     complexity = "hard"
@@ -151,7 +149,7 @@ def type(self) -> str:
     def structured_output(self) -> Type[MultipleChoiceAnswerWithJustification]:
         return MultipleChoiceAnswerWithJustification
 
-    def validate(self, output: MultipleChoiceAnswerWithJustification) -> float:
+    def validate(self, output: ImageReasoningAnswer[List[str]]) -> float:
         answers_processed = set([answer.casefold() for answer in output.answer])
         expected_processed = set([answer.casefold() for answer in self.expected_answer])