@@ -82,6 +82,72 @@ def test_imagetext2text_generation(self):
8282
8383 print (generated_text [0 ])
8484
85+ @never_test ()
86+ def test_automatic_speech_recognition (self ):
87+ # clear&&NEVERTEST=1 python _unittests/ut_torch_models/try_tasks.py -k automatic_speech
88+ # https://huggingface.co/openai/whisper-tiny
89+
90+ from transformers import WhisperProcessor , WhisperForConditionalGeneration
91+ from datasets import load_dataset
92+
93+ """
94+ kwargs=dict(
95+ cache_position:T7s4,
96+ past_key_values:EncoderDecoderCache(
97+ self_attention_cache=DynamicCache[serialized](#2[#0[],#0[]]),
98+ cross_attention_cache=DynamicCache[serialized](#2[#0[],#0[]])
99+ ),
100+ decoder_input_ids:T7s1x4,
101+ encoder_outputs:dict(last_hidden_state:T1s1x1500x384),
102+ use_cache:bool,return_dict:bool
103+ )
104+ kwargs=dict(
105+ cache_position:T7s1,
106+ past_key_values:EncoderDecoderCache(
107+ self_attention_cache=DynamicCache[serialized](#2[
108+ #4[T1s1x6x4x64,T1s1x6x4x64,T1s1x6x4x64,T1s1x6x4x64],
109+ #4[T1s1x6x4x64,T1s1x6x4x64,T1s1x6x4x64,T1s1x6x4x64]
110+ ]),
111+ cross_attention_cache=DynamicCache[serialized](#2[
112+ #4[T1s1x6x1500x64,T1s1x6x1500x64,T1s1x6x1500x64,T1s1x6x1500x64],
113+ #4[T1s1x6x1500x64,T1s1x6x1500x64,T1s1x6x1500x64,T1s1x6x1500x64]
114+ ]),
115+ ),
116+ decoder_input_ids:T7s1x1,
117+ encoder_outputs:dict(last_hidden_state:T1s1x1500x384),
118+ use_cache:bool,return_dict:bool
119+ )
120+ """
121+
122+ # load model and processor
123+ processor = WhisperProcessor .from_pretrained ("openai/whisper-tiny" )
124+ model = WhisperForConditionalGeneration .from_pretrained ("openai/whisper-tiny" )
125+ forced_decoder_ids = processor .get_decoder_prompt_ids (
126+ language = "english" , task = "transcribe"
127+ )
128+
129+ # load streaming dataset and read first audio sample
130+ ds = load_dataset (
131+ "hf-internal-testing/librispeech_asr_dummy" , "clean" , split = "validation"
132+ )
133+ sample = ds [0 ]["audio" ]
134+ input_features = processor (
135+ sample ["array" ], sampling_rate = sample ["sampling_rate" ], return_tensors = "pt"
136+ ).input_features
137+
138+ # generate token ids
139+ print ()
140+ with steel_forward (model ):
141+ predicted_ids = model .generate (
142+ input_features , forced_decoder_ids = forced_decoder_ids
143+ )
144+
145+ # decode token ids to text
146+ transcription = processor .batch_decode (predicted_ids , skip_special_tokens = False )
147+ print ("--" , transcription )
148+ transcription = processor .batch_decode (predicted_ids , skip_special_tokens = True )
149+ print ("--" , transcription )
150+
85151
# Run this module's tests directly with verbose per-test output.
if __name__ == "__main__":
    unittest.main(verbosity=2)
0 commit comments