EvolvingLMMs-Lab
diff --git a/lmms_eval/api/metrics.py b/lmms_eval/api/metrics.py
Lines changed: 13 additions & 12 deletions
diff --git a/lmms_eval/models/chat/openai_compatible.py b/lmms_eval/models/chat/openai_compatible.py
Lines changed: 1 addition & 1 deletion
diff --git a/lmms_eval/models/whisper_tt.py b/lmms_eval/models/whisper_tt.py
Lines changed: 45 additions & 64 deletions
@@ -536,32 +536,33 @@ def bootstrap_chair_metric(metric_fn, xs, iters):
     print(f"bootstrapping for stddev: {metric_fn.__name__}")
     res = []
     from tqdm import tqdm
-    
+
     for _ in tqdm(range(iters), desc="Bootstrap"):
         bootstrap_sample = random.choices(xs, k=len(xs))
         metric_value = metric_fn(bootstrap_sample)
         res.append(metric_value)
-    
+
     return sample_stddev(res)
 
+
 def stderr_for_metric(metric, bootstrap_iters: int):
     if bootstrap_iters <= 0:
         # return no function (don't compute stderr) if bootstrap iters = 0
         return None
     # for coco_cap_chair
-    from lmms_eval.tasks.coco_cap_chair.utils import (
-        coco_cap_chair_aggregate_results_chair_i,
-        coco_cap_chair_aggregate_results_chair_s,
-        coco_cap_chair_aggregate_results_recall,
-    )
     # for amber_g
     from lmms_eval.tasks.amber_g.utils import (
         amber_g_aggregate_chair,
+        amber_g_aggregate_cog,
         amber_g_aggregate_cover,
         amber_g_aggregate_hal,
-        amber_g_aggregate_cog,
     )
-    
+    from lmms_eval.tasks.coco_cap_chair.utils import (
+        coco_cap_chair_aggregate_results_chair_i,
+        coco_cap_chair_aggregate_results_chair_s,
+        coco_cap_chair_aggregate_results_recall,
+    )
+
     bootstrappable = [
         median,
         matthews_corrcoef,
@@ -582,10 +583,10 @@ def stderr_for_metric(metric, bootstrap_iters: int):
     if metric in bootstrappable:
         return lambda x: bootstrap_stderr(metric, x, iters=bootstrap_iters)
 
-    if hasattr(metric, '__name__'):
-        if 'coco_cap_chair' in metric.__name__:
+    if hasattr(metric, "__name__"):
+        if "coco_cap_chair" in metric.__name__:
             return lambda x: bootstrap_chair_metric(metric, x, iters=bootstrap_iters)
-        if 'amber_g' in metric.__name__ or 'amber_' in metric.__name__:
+        if "amber_g" in metric.__name__ or "amber_" in metric.__name__:
             return lambda x: bootstrap_chair_metric(metric, x, iters=bootstrap_iters)
 
     stderr = {mean: mean_stderr, acc_all: acc_all_stderr}
 
@@ -78,7 +78,7 @@ def generate_until(self, requests) -> List[str]:
                 if "o1" in self.model_version or "o3" in self.model_version or "o4" in self.model_version or "gpt-5" in self.model_version:
                     del payload["temperature"]
                     payload.pop("max_tokens")
-                    #payload["reasoning_effort"] = "medium"
+                    # payload["reasoning_effort"] = "medium"
                     payload["response_format"] = {"type": "text"}
                     payload["max_completion_tokens"] = 5000
 
 
@@ -1,16 +1,16 @@
 import asyncio
 import base64
 import os
-from io import BytesIO
 import time
+from io import BytesIO
 from typing import List, Tuple
 
+import aiohttp
 import numpy as np
 import requests
 from accelerate import Accelerator, DistributedType
 from loguru import logger as eval_logger
 from scipy.io import wavfile
-import aiohttp
 from tqdm import tqdm
 from transformers import AutoProcessor
 
@@ -28,7 +28,7 @@
 class WhisperTT(lmms):
     """
     Whisper Audio Model - HTTP API Client
-    
+
     This implementation uses HTTP calls to the tt-media-server instead of
     direct ttnn/tt-metal execution, allowing evals to run outside docker.
     """
@@ -52,23 +52,23 @@ def __init__(
         # Log warning for unexpected kwargs but don't fail
         if kwargs:
             eval_logger.warning(f"Ignoring unexpected kwargs: {kwargs}")
-        
+
         # Get base URL from env var or argument
         self.base_url = base_url or os.getenv("OPENAI_API_BASE", "http://127.0.0.1:8000")
         self.timeout = timeout
         self.max_retries = max_retries
         self.pretrained = pretrained
-        
+
         # Get API key from environment
         self.api_key = os.getenv("OPENAI_API_KEY", "your-secret-key")
-        
+
         eval_logger.info(f"Initializing WhisperTT HTTP client with base_url: {self.base_url}")
-        
+
         # Setup processor for tokenization
         self.processor = AutoProcessor.from_pretrained(pretrained)
         self.processor.tokenizer.set_prefix_tokens(language=language, task=task)
         self._tokenizer = self.processor.tokenizer
-        
+
         # Setup accelerator for distributed evaluation
         accelerator = Accelerator()
         if accelerator.num_processes > 1:
@@ -79,7 +79,7 @@ def __init__(
             self._device = device
             self._rank = 0
             self._world_size = 1
-        
+
         self.batch_size_per_gpu = int(batch_size)
         self.use_cache = use_cache
 
@@ -110,41 +110,41 @@ def world_size(self):
     def encode_audio_to_base64_wav(self, audio_array: np.ndarray, sampling_rate: int) -> str:
         """
         Convert audio numpy array to base64-encoded WAV format.
-        
+
         Args:
             audio_array: Audio data as numpy array
             sampling_rate: Sampling rate of the audio
-            
+
         Returns:
             Base64-encoded WAV file string
         """
         # Ensure float32 to create 32-bit WAV files (not 64-bit)
         # This prevents "Unsupported bit depth: 64" errors on the server
         audio_array = audio_array.astype(np.float32)
-        
+
         # Create WAV file in memory
         wav_buffer = BytesIO()
         wavfile.write(wav_buffer, sampling_rate, audio_array)
         wav_bytes = wav_buffer.getvalue()
-        
+
         # Encode to base64
-        base64_str = base64.b64encode(wav_bytes).decode('utf-8')
+        base64_str = base64.b64encode(wav_bytes).decode("utf-8")
         return base64_str
 
     def transcribe_audio(self, audio_array: np.ndarray, sampling_rate: int) -> str:
         """
         Transcribe audio using the tt-media-server HTTP API.
-        
+
         Args:
             audio_array: Audio data as numpy array
             sampling_rate: Sampling rate of the audio
-            
+
         Returns:
             Transcription text
         """
         # Encode audio to base64 WAV
         base64_audio = self.encode_audio_to_base64_wav(audio_array, sampling_rate)
-        
+
         # Prepare request
         url = f"{self.base_url}/audio/transcriptions"
         headers = {
@@ -153,58 +153,50 @@ def transcribe_audio(self, audio_array: np.ndarray, sampling_rate: int) -> str:
         }
         if self.api_key:
             headers["Authorization"] = f"Bearer {self.api_key}"
-        
-        payload = {
-            "file": base64_audio,
-            "stream": False
-        }
-        
+
+        payload = {"file": base64_audio, "stream": False}
+
         # Make request with retries
         for attempt in range(self.max_retries):
             try:
-                response = requests.post(
-                    url,
-                    json=payload,
-                    headers=headers,
-                    timeout=self.timeout
-                )
+                response = requests.post(url, json=payload, headers=headers, timeout=self.timeout)
                 response.raise_for_status()
-                
+
                 # Parse response
                 result = response.json()
-                
+
                 # Extract transcription text from response
                 # The response format should contain the transcription
                 if isinstance(result, dict):
                     # Try common keys for transcription text
-                    transcription = result.get('text') or result.get('transcription') or result.get('result')
+                    transcription = result.get("text") or result.get("transcription") or result.get("result")
                     if transcription:
                         return transcription
                     # If no known key, return the entire dict as string
                     eval_logger.warning(f"Unexpected response format: {result}")
                     return str(result)
                 else:
                     return str(result)
-                    
+
             except requests.exceptions.RequestException as e:
                 if attempt < self.max_retries - 1:
                     eval_logger.warning(f"Request failed (attempt {attempt + 1}/{self.max_retries}): {e}")
                     continue
                 else:
                     eval_logger.error(f"All retry attempts failed: {e}")
                     raise
-        
+
         return ""
 
     async def _generate_audio_transcription(self, session, audio_array: np.ndarray, sampling_rate: int, audio_index: int = None) -> str:
         """
         Transcribe audio using the tt-media-server HTTP API.
-        
+
         Args:
             audio_array: Audio data as numpy array
             sampling_rate: Sampling rate of the audio
             audio_index: Index of the audio for logging purposes
-            
+
         Returns:
             Transcription text
         """
@@ -216,38 +208,27 @@ async def _generate_audio_transcription(self, session, audio_array: np.ndarray,
 
         # Prepare request
         url = f"{self.base_url}/audio/transcriptions"
-        headers = {
-            "accept": "application/json",
-            "Content-Type": "application/json"
-        }
+        headers = {"accept": "application/json", "Content-Type": "application/json"}
         if self.api_key:
             headers["Authorization"] = f"Bearer {self.api_key}"
-        
-        payload = {
-            "file": base64_audio,
-            "stream": False
-        }
+
+        payload = {"file": base64_audio, "stream": False}
 
         try:
-            async with session.post(
-                f"{self.base_url}/audio/transcriptions",
-                json=payload,
-                headers=headers,
-                timeout=aiohttp.ClientTimeout(total=15000)
-            ) as response:
+            async with session.post(f"{self.base_url}/audio/transcriptions", json=payload, headers=headers, timeout=aiohttp.ClientTimeout(total=15000)) as response:
                 elapsed = time.time() - start_time
 
                 if response.status != 200:
                     eval_logger.info(f"❌ Audio transcription failed with status: {response.status}")
                     return ""
 
                 result = await response.json()
-                
+
                 # Extract transcription text from response
                 # The response format should contain the transcription
                 if isinstance(result, dict):
                     # Try common keys for transcription text
-                    transcription = result.get('text') or result.get('transcription') or result.get('result')
+                    transcription = result.get("text") or result.get("transcription") or result.get("result")
                     eval_logger.info(f"Transcription result for audio {audio_index}: {transcription}")
                     if transcription:
                         return transcription
@@ -282,18 +263,18 @@ def _collate(x):
             return -len(toks), x[0]
 
         pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")
-        
+
         # Group requests by their generation_kwargs
         re_ords = utils.Collator([reg.args for reg in requests], _collate, grouping=True)
         chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
-        
+
         # Collect all audios from all chunks first
         all_audios = []
         all_contexts = []
         all_gen_kwargs_list = []
-        
+
         time_start = time.time()
-        
+
         for chunk in chunks:
             contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split = zip(*chunk)
             task = task[0]
@@ -318,12 +299,12 @@ def _collate(x):
             sampling_rate = self.processor.feature_extractor.sampling_rate
             assert sampling_rate == SAMPLING_RATE, f"Expected sampling rate {SAMPLING_RATE}, but got {sampling_rate}"
             audios = [downsample_audio(audio["array"], audio["sampling_rate"], sampling_rate) for audio in flattened_audios]
-            
+
             # Collect all data
             all_audios.extend(audios)
             all_contexts.extend(contexts)
             all_gen_kwargs_list.extend([gen_kwargs] * len(contexts))
-            
+
         time_end_prep = time.time()
         eval_logger.info(f"Preparation time for {len(all_audios)} requests: {time_end_prep - time_start:.2f}s")
 
@@ -334,7 +315,7 @@ async def run_transcriptions():
                 return await asyncio.gather(*tasks)
 
         answers = asyncio.run(run_transcriptions())
-        
+
         time_end_process = time.time()
 
         eval_logger.info(f"Total time for {len(all_audios)} requests across all chunks {time_end_process - time_start:.2f}s")
@@ -348,11 +329,11 @@ async def run_transcriptions():
                 until = gen_kwargs["until"]
                 if isinstance(until, str):
                     until = [until]
-            
+
             for term in until:
                 if len(term) > 0:
                     ans = ans.split(term)[0]
-            
+
             processed_answers.append(ans)
 
         for ans, context, gen_kwargs in zip(processed_answers, all_contexts, all_gen_kwargs_list):
@@ -364,9 +345,9 @@ async def run_transcriptions():
         res = re_ords.get_original(res)
 
         pbar.close()
-        
+
         time_end_process = time.time()
-        
+
         eval_logger.info(f"Total time for {len(all_audios)} requests across all chunks {time_end_process - time_start:.2f}s")
 
         return res