2828
2929import base64
3030import io
31+ import math
3132import os
3233from abc import ABC , abstractmethod
3334from typing import Any , List , Literal , Optional , Type , TypeVar
@@ -129,6 +130,7 @@ def score(
129130 images : List [Image .Image ],
130131 questions : List [str ],
131132 answers : List [str ],
133+ use_probability : bool = False ,
132134 ** kwargs : Any ,
133135 ) -> List [float ]:
134136 """
@@ -142,13 +144,15 @@ def score(
142144 List of questions.
143145 answers : List[str]
144146 List of expected answers.
147+ use_probability : bool, optional
148+ If True and supported, return P(expected answer) instead of binary 0/1.
145149 **kwargs : Any
146150 Additional arguments passed to the implementation.
147151
148152 Returns
149153 -------
150154 List[float]
151- Scores for each image-question pair.
155+ Scores for each image-question pair (0-1, or probability when use_probability) .
152156 """
153157 pass
154158
@@ -253,11 +257,15 @@ def score(
253257 images : List [Image .Image ],
254258 questions : List [str ],
255259 answers : List [str ],
260+ use_probability : bool = False ,
256261 ** kwargs : Any ,
257262 ) -> List [float ]:
258263 """
259264 Score how well answers match images for given questions.
260265
266+ When use_probability=True, requests logprobs from the API and returns P(expected).
267+ Falls back to binary 0/1 if logprobs not available.
268+
261269 Parameters
262270 ----------
263271 images : List[Image.Image]
@@ -266,22 +274,80 @@ def score(
266274 List of questions.
267275 answers : List[str]
268276 List of expected answers.
277+ use_probability : bool, optional
278+ If True, return P(expected) from logprobs when available. Default is False.
269279 **kwargs : Any
270- Additional arguments passed to generate .
280+ Additional arguments passed to litellm completion .
271281
272282 Returns
273283 -------
274284 List[float]
275- Scores for each image-question pair.
285+ Scores for each image-question pair (0-1, or probability when use_probability) .
276286 """
277287 scores = []
278288 for image , question , answer in zip (images , questions , answers ):
279289 prompt = f"{ question } Please answer yes or no."
280- response = self .generate ([image ], [prompt ], ** kwargs )[0 ].lower ()
281- score = 1.0 if answer .lower () in response else 0.0
290+ if use_probability :
291+ score = self ._score_with_logprobs (image , prompt , answer , ** kwargs )
292+ else :
293+ response = self .generate ([image ], [prompt ], ** kwargs )[0 ].lower ()
294+ score = 1.0 if answer .lower () in response else 0.0
282295 scores .append (score )
283296 return scores
284297
def _score_with_logprobs(
    self,
    image: "Image.Image",
    prompt: str,
    expected: str,
    **kwargs: Any,
) -> float:
    """
    Get P(expected) from logprobs when available.

    The completion is requested with ``logprobs=True`` and ``top_logprobs=5``.
    The first generated position whose top candidates contain the expected
    answer determines the score: the probabilities of every candidate that
    equals the expected answer after normalization (case, surrounding
    whitespace and punctuation) are summed and clamped to [0, 1].

    Parameters
    ----------
    image : Image.Image
        PIL Image to score.
    prompt : str
        Question prompt.
    expected : str
        Expected answer (e.g., "Yes").
    **kwargs : Any
        Additional arguments passed to litellm completion. They are merged
        after the defaults, so they can override ``logprobs``/``top_logprobs``.

    Returns
    -------
    float
        Probability of expected answer (0-1). Binary 0/1 based on a substring
        check of the response text when no logprob candidate matches, or of
        ``self.generate`` output when the logprobs request raises.
    """
    # Characters stripped from both ends of a token before comparison, so that
    # " Yes", "yes." and "**Yes**" all compare equal to "yes".
    strip_chars = " \t\r\n.,!?:;\"'*"

    def _field(obj: Any, name: str) -> Any:
        # Accept both attribute-style objects and plain dicts.
        if isinstance(obj, dict):
            return obj.get(name)
        return getattr(obj, name, None)

    def _normalize(text: Any) -> str:
        return str(text).strip(strip_chars).lower()

    target = _normalize(expected)
    content = [
        {"type": "text", "text": prompt},
        {"type": "image_url", "image_url": {"url": self._image_to_data_url(image)}},
    ]
    completion_kwargs = {
        "model": self.model_name,
        "messages": [{"role": "user", "content": content}],
        "api_key": self.api_key,
        "logprobs": True,
        "top_logprobs": 5,
        **self.extra_kwargs,
        **kwargs,
    }
    try:
        response = self._litellm.completion(**completion_kwargs)
        choice = response.choices[0]
        logprobs = getattr(choice, "logprobs", None) or getattr(choice.message, "logprobs", None)
        positions = (_field(logprobs, "content") or []) if logprobs else []
        # An empty target would match whitespace-only tokens; skip straight to
        # the text fallback in that case.
        if target:
            for position in positions:
                matching = []
                for candidate in _field(position, "top_logprobs") or []:
                    token = _field(candidate, "token")
                    logprob = _field(candidate, "logprob")
                    # Compare with `is None`, never truthiness: a logprob of
                    # 0.0 means probability 1.0 and must not be discarded.
                    if token is None or logprob is None:
                        continue
                    # Exact match on the normalized token, so "know" or "not"
                    # is never counted as "no".
                    if _normalize(token) == target:
                        matching.append(float(logprob))
                if matching:
                    # Cap each logprob at 0 so exp() cannot overflow on
                    # malformed values; the sum is clamped to a probability.
                    probability = sum(math.exp(min(lp, 0.0)) for lp in matching)
                    return min(1.0, max(0.0, probability))
        content_str = (choice.message.content or "").lower()
        return 1.0 if expected.lower() in content_str else 0.0
    except Exception as exc:
        # Deliberate best-effort: any failure of the logprobs request or of
        # parsing its response degrades to the binary score. Record the reason
        # so the degradation is diagnosable.
        import logging

        logging.getLogger(__name__).debug(
            "Logprob scoring failed, falling back to binary scoring: %s", exc
        )
        response = self.generate([image], [prompt], **kwargs)[0].lower()
        return 1.0 if expected.lower() in response else 0.0
350+
285351 def _image_to_data_url (self , image : Image .Image ) -> str :
286352 buffer = io .BytesIO ()
287353 image .save (buffer , format = "PNG" )
@@ -458,11 +524,14 @@ def score(
458524 images : List [Image .Image ],
459525 questions : List [str ],
460526 answers : List [str ],
527+ use_probability : bool = False ,
461528 ** kwargs : Any ,
462529 ) -> List [float ]:
463530 """
464531 Score how well answers match images for given questions.
465532
533+ use_probability is not supported for TransformersVLM; uses binary 0/1.
534+
466535 Parameters
467536 ----------
468537 images : List[Image.Image]
@@ -471,13 +540,15 @@ def score(
471540 List of questions.
472541 answers : List[str]
473542 List of expected answers.
543+ use_probability : bool, optional
544+ Ignored; TransformersVLM always uses binary 0/1.
474545 **kwargs : Any
475546 Additional arguments passed to generate.
476547
477548 Returns
478549 -------
479550 List[float]
480- Scores for each image-question pair.
551+ Scores for each image-question pair (0 or 1) .
481552 """
482553 scores = []
483554 for image , question , answer in zip (images , questions , answers ):
0 commit comments