Skip to content

Commit d3d659b

Browse files
fix(evaluation): enhance docstrings for VLM metrics and base classes
- Added detailed parameter descriptions to VQAnswer, ScoreOutput, and the various metric classes in metrics_vlm.py.
- Updated docstrings in the base classes of vlm_base.py to include parameter details and return types.
- Improved clarity and consistency across all metric-related docstrings.
1 parent 3dc944f commit d3d659b

File tree

3 files changed

+198
-19
lines changed

3 files changed

+198
-19
lines changed

src/pruna/evaluation/metrics/metrics_vlm.py

Lines changed: 120 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,14 +60,32 @@ def _process_images(images: torch.Tensor) -> List[Any]:
6060

6161
# Pydantic models for structured generation
6262
class VQAnswer(BaseModel):
63-
"""Structured output for VQA."""
63+
"""
64+
Structured output for VQA.
65+
66+
Parameters
67+
----------
68+
answer : str
69+
The VQA answer text.
70+
confidence : float, optional
71+
Confidence score. Default is 1.0.
72+
"""
6473

6574
answer: str
6675
confidence: float = 1.0
6776

6877

6978
class ScoreOutput(BaseModel):
70-
"""Structured output for scoring metrics."""
79+
"""
80+
Structured output for scoring metrics.
81+
82+
Parameters
83+
----------
84+
score : float
85+
The numeric score.
86+
reasoning : str | None, optional
87+
Optional reasoning for the score.
88+
"""
7189

7290
score: float
7391
reasoning: Optional[str] = None
@@ -89,6 +107,8 @@ class VQAMetric(StatefulMetric):
89107
90108
Parameters
91109
----------
110+
*args : Any
111+
Additional positional arguments.
92112
vlm_type : {"litellm", "transformers"}, optional
93113
VLM backend to use. Default is "litellm".
94114
model_name : str, optional
@@ -101,6 +121,8 @@ class VQAMetric(StatefulMetric):
101121
Device for transformers VLM.
102122
api_key : str | None, optional
103123
API key for litellm.
124+
call_type : str, optional
125+
Call type for the metric.
104126
**kwargs : Any
105127
Additional arguments.
106128
"""
@@ -190,10 +212,22 @@ class AlignmentScoreMetric(StatefulMetric):
190212
191213
Parameters
192214
----------
215+
*args : Any
216+
Additional positional arguments.
193217
vlm_type : {"litellm", "transformers"}, optional
194218
VLM backend. Default is "litellm".
219+
model_name : str, optional
220+
Model name. Default is "gpt-4o".
195221
structured_output : bool, optional
196222
Use structured generation. Default is True.
223+
use_outlines : bool, optional
224+
Use outlines for transformers. Default is False.
225+
device : str | torch.device | None, optional
226+
Device for transformers VLM.
227+
api_key : str | None, optional
228+
API key for litellm.
229+
call_type : str, optional
230+
Call type for the metric.
197231
**kwargs : Any
198232
Additional arguments.
199233
"""
@@ -277,6 +311,27 @@ class ImageEditScoreMetric(StatefulMetric):
277311
References
278312
----------
279313
VieScore: https://github.com/ByteDance/IEA-eval
314+
315+
Parameters
316+
----------
317+
*args : Any
318+
Additional positional arguments.
319+
vlm_type : {"litellm", "transformers"}, optional
320+
VLM backend. Default is "litellm".
321+
model_name : str, optional
322+
Model name. Default is "gpt-4o".
323+
structured_output : bool, optional
324+
Use structured generation. Default is True.
325+
use_outlines : bool, optional
326+
Use outlines for transformers. Default is False.
327+
device : str | torch.device | None, optional
328+
Device for transformers VLM.
329+
api_key : str | None, optional
330+
API key for litellm.
331+
call_type : str, optional
332+
Call type for the metric.
333+
**kwargs : Any
334+
Additional arguments.
280335
"""
281336

282337
scores: List[float]
@@ -361,6 +416,27 @@ class QAAccuracyMetric(StatefulMetric):
361416
362417
Uses VLM to answer questions about images.
363418
Higher scores indicate better image understanding.
419+
420+
Parameters
421+
----------
422+
*args : Any
423+
Additional positional arguments.
424+
vlm_type : {"litellm", "transformers"}, optional
425+
VLM backend. Default is "litellm".
426+
model_name : str, optional
427+
Model name. Default is "gpt-4o".
428+
structured_output : bool, optional
429+
Use structured generation. Default is True.
430+
use_outlines : bool, optional
431+
Use outlines for transformers. Default is False.
432+
device : str | torch.device | None, optional
433+
Device for transformers VLM.
434+
api_key : str | None, optional
435+
API key for litellm.
436+
call_type : str, optional
437+
Call type for the metric.
438+
**kwargs : Any
439+
Additional arguments.
364440
"""
365441

366442
scores: List[float]
@@ -437,6 +513,27 @@ class TextScoreMetric(StatefulMetric):
437513
438514
Uses VLM for OCR to extract text and compare with ground truth.
439515
Lower scores (edit distance) are better.
516+
517+
Parameters
518+
----------
519+
*args : Any
520+
Additional positional arguments.
521+
vlm_type : {"litellm", "transformers"}, optional
522+
VLM backend. Default is "litellm".
523+
model_name : str, optional
524+
Model name. Default is "gpt-4o".
525+
structured_output : bool, optional
526+
Use structured generation. Default is True.
527+
use_outlines : bool, optional
528+
Use outlines for transformers. Default is False.
529+
device : str | torch.device | None, optional
530+
Device for transformers VLM.
531+
api_key : str | None, optional
532+
API key for litellm.
533+
call_type : str, optional
534+
Call type for the metric.
535+
**kwargs : Any
536+
Additional arguments.
440537
"""
441538

442539
scores: List[float]
@@ -522,6 +619,27 @@ class VieScoreMetric(StatefulMetric):
522619
- Semantic score: How well image follows prompt
523620
- Quality score: Naturalness and artifacts
524621
- Overall: Geometric mean of semantic and quality
622+
623+
Parameters
624+
----------
625+
*args : Any
626+
Additional positional arguments.
627+
vlm_type : {"litellm", "transformers"}, optional
628+
VLM backend. Default is "litellm".
629+
model_name : str, optional
630+
Model name. Default is "gpt-4o".
631+
structured_output : bool, optional
632+
Use structured generation. Default is True.
633+
use_outlines : bool, optional
634+
Use outlines for transformers. Default is False.
635+
device : str | torch.device | None, optional
636+
Device for transformers VLM.
637+
api_key : str | None, optional
638+
API key for litellm.
639+
call_type : str, optional
640+
Call type for the metric.
641+
**kwargs : Any
642+
Additional arguments.
525643
"""
526644

527645
scores: List[float]

src/pruna/evaluation/metrics/vlm_base.py

Lines changed: 78 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,25 @@ def generate(
5252
response_format: Optional[Type[BaseModel]] = None,
5353
**kwargs: Any,
5454
) -> List[str]:
55-
"""Generate responses for images and prompts."""
55+
"""
56+
Generate responses for images and prompts.
57+
58+
Parameters
59+
----------
60+
images : List[Image.Image]
61+
List of PIL Images.
62+
prompts : List[str]
63+
List of text prompts.
64+
response_format : Type[BaseModel] | None
65+
Optional pydantic model for structured output.
66+
**kwargs : Any
67+
Additional arguments passed to the implementation.
68+
69+
Returns
70+
-------
71+
List[str]
72+
Generated responses.
73+
"""
5674
pass
5775

5876
@abstractmethod
@@ -63,7 +81,25 @@ def score(
6381
answers: List[str],
6482
**kwargs: Any,
6583
) -> List[float]:
66-
"""Score how well answers match images for given questions."""
84+
"""
85+
Score how well answers match images for given questions.
86+
87+
Parameters
88+
----------
89+
images : List[Image.Image]
90+
List of PIL Images.
91+
questions : List[str]
92+
List of questions.
93+
answers : List[str]
94+
List of expected answers.
95+
**kwargs : Any
96+
Additional arguments passed to the implementation.
97+
98+
Returns
99+
-------
100+
List[float]
101+
Scores for each image-question pair.
102+
"""
67103
pass
68104

69105

@@ -73,6 +109,15 @@ class LitellmVLM(BaseVLM):
73109
74110
Supports 100+ LLM providers (OpenAI, Anthropic, Azure, etc.)
75111
Default model is gpt-4o.
112+
113+
Parameters
114+
----------
115+
model_name : str, optional
116+
Model name (e.g., gpt-4o). Default is "gpt-4o".
117+
api_key : str | None, optional
118+
API key for the provider. Uses LITELLM_API_KEY or OPENAI_API_KEY env if None.
119+
**kwargs : Any
120+
Additional arguments passed to litellm.
76121
"""
77122

78123
def __init__(
@@ -111,6 +156,8 @@ def generate(
111156
List of text prompts.
112157
response_format : Type[BaseModel] | None
113158
Optional pydantic model for structured output.
159+
**kwargs : Any
160+
Additional arguments passed to litellm completion.
114161
115162
Returns
116163
-------
@@ -169,6 +216,8 @@ def score(
169216
List of questions.
170217
answers : List[str]
171218
List of expected answers.
219+
**kwargs : Any
220+
Additional arguments passed to generate.
172221
173222
Returns
174223
-------
@@ -196,6 +245,17 @@ class TransformersVLM(BaseVLM):
196245
VLM using HuggingFace Transformers for local inference.
197246
198247
Supports models like BLIP, LLaVA, etc.
248+
249+
Parameters
250+
----------
251+
model_name : str, optional
252+
HuggingFace model name. Default is "Salesforce/blip2-opt-2.7b".
253+
device : str | torch.device | None, optional
254+
Device for inference. Auto-detected if None.
255+
use_outlines : bool, optional
256+
Use outlines for constrained decoding. Default is False.
257+
**kwargs : Any
258+
Additional arguments passed to model generation.
199259
"""
200260

201261
def __init__(
@@ -244,20 +304,22 @@ def generate(
244304
"""
245305
Generate responses using local VLM.
246306
247-
Args:
248-
images: List of PIL Images
249-
prompts: List of text prompts
250-
response_format: Optional format constraint (e.g., "json", "integer")
251-
"""
252-
"""
307+
Parameters
308+
----------
309+
images : List[Image.Image]
310+
List of PIL Images.
311+
prompts : List[str]
312+
List of text prompts.
313+
response_format : str | None
314+
Optional format constraint (e.g., "json", "integer", "yes_no").
315+
**kwargs : Any
316+
Additional arguments passed to model generate.
253317
254-
Generate responses using local VLM.
255-
Args:
256-
images: List of PIL Images
257-
prompts: List of text prompts
258-
response_format: Optional format constraint (e.g., "json", "integer")
318+
Returns
319+
-------
320+
List[str]
321+
Generated responses.
259322
"""
260-
261323
self._load_model()
262324
results = []
263325
max_new_tokens = kwargs.get("max_new_tokens", 128)
@@ -347,6 +409,8 @@ def score(
347409
List of questions.
348410
answers : List[str]
349411
List of expected answers.
412+
**kwargs : Any
413+
Additional arguments passed to generate.
350414
351415
Returns
352416
-------

tests/style/test_docstrings.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,4 @@ def test_docstrings(file: str) -> None:
1414
file : str
1515
The import statement to check.
1616
"""
17-
# Skip metrics_vlm module as it uses a different docstring pattern for VLM parameters
18-
if "metrics_vlm" in file:
19-
pytest.skip("metrics_vlm uses custom VLM parameter documentation")
2017
check_docstrings_content(file)

0 commit comments

Comments
 (0)