embeddings-benchmark · isaac-chung · Jan 8, 2026 · Jan 1, 2026 · Jan 5, 2026 · Jan 5, 2026
diff --git a/mteb/models/model_implementations/msclap_models.py b/mteb/models/model_implementations/msclap_models.py
@@ -1,4 +1,6 @@
 import logging
+import os
+import tempfile
 import warnings
 from typing import Any
 
@@ -58,6 +60,7 @@ def get_audio_embeddings(
         show_progress_bar: bool = True,
         **kwargs: Any,
     ) -> np.ndarray:
+        import soundfile as sf
         import torchaudio
 
         all_embeddings = []
@@ -66,34 +69,50 @@ def get_audio_embeddings(
             inputs,
             disable=not show_progress_bar,
         ):
-            audio_arrays = []
-            for a in batch["audio"]:
-                array = torch.tensor(a["array"], dtype=torch.float32)
-                sr = a.get("sampling_rate", None)
-                if sr is None:
-                    warnings.warn(
-                        f"No sampling_rate provided for an audio sample. "
-                        f"Assuming {self.sampling_rate} Hz (model default)."
+            temp_files = []
+            try:
+                for a in batch["audio"]:
+                    array = torch.tensor(a["array"], dtype=torch.float32)
+                    sr = a.get("sampling_rate", None)
+                    if sr is None:
+                        warnings.warn(
+                            f"No sampling_rate provided for an audio sample. "
+                            f"Assuming {self.sampling_rate} Hz (model default)."
+                        )
+                        sr = self.sampling_rate
+
+                    if sr != self.sampling_rate:
+                        resampler = torchaudio.transforms.Resample(
+                            orig_freq=sr, new_freq=self.sampling_rate
+                        )
+                        array = resampler(array)
+
+                    # Write to temp file - msclap expects file paths
+                    temp_file = tempfile.NamedTemporaryFile(
+                        suffix=".wav", delete=False
                     )
-                    sr = self.sampling_rate
-
-                if sr != self.sampling_rate:
-                    resampler = torchaudio.transforms.Resample(
-                        orig_freq=sr, new_freq=self.sampling_rate
+                    temp_files.append(temp_file.name)
+                    sf.write(temp_file.name, array.numpy(), self.sampling_rate)
+
+                with torch.no_grad():
+                    # Use the official msclap API that expects file paths
+                    # https://github.com/microsoft/CLAP#api
+                    audio_features = self.model.get_audio_embeddings(
+                        temp_files, resample=False
                     )
-                    array = resampler(array)
-                audio_arrays.append(array.numpy())
-
-            with torch.no_grad():
-                # Use the internal audio encoder directly
-                # [0] gives audio embeddings, [1] gives class probabilities
-                audio_features = self.model.clap.audio_encoder(audio_arrays)[0]
+                    # Normalize embeddings
+                    audio_features = audio_features / audio_features.norm(
+                        dim=-1, keepdim=True
+                    )
+                    all_embeddings.append(audio_features.cpu().detach().numpy())
+            finally:
+                # Clean up temp files
 
-                # Normalize embeddings
-                audio_features = audio_features / audio_features.norm(
-                    dim=-1, keepdim=True
-                )
-                all_embeddings.append(audio_features.cpu().detach().numpy())
+                for f in temp_files:
+                    try:
+                        os.unlink(f)
+                    except OSError:
+                        pass
 
         return np.vstack(all_embeddings)
 
@@ -162,7 +181,7 @@ def encode(
     loader=MSClapWrapper,
     name="microsoft/msclap-2022",
     languages=["eng-Latn"],
-    revision="N/A",
+    revision="no_revision",
     release_date="2022-12-01",
     modalities=["audio", "text"],
     n_parameters=196_000_000,
@@ -184,7 +203,7 @@ def encode(
     loader=MSClapWrapper,
     name="microsoft/msclap-2023",
     languages=["eng-Latn"],
-    revision="N/A",
+    revision="no_revision",
     release_date="2023-09-01",
     modalities=["audio", "text"],
     n_parameters=160_000_000,