From 6aae23850394c88408207778abeaf52640cec3a7 Mon Sep 17 00:00:00 2001 From: BujSet Date: Mon, 21 Jul 2025 17:41:22 +0000 Subject: [PATCH 1/4] Adding in whisper tiny export script in examples --- examples/models/__init__.py | 2 ++ examples/models/whisper_tiny/__init__.py | 11 +++++++ examples/models/whisper_tiny/model.py | 39 ++++++++++++++++++++++++ requirements-examples.txt | 1 + 4 files changed, 53 insertions(+) create mode 100644 examples/models/whisper_tiny/__init__.py create mode 100644 examples/models/whisper_tiny/model.py diff --git a/examples/models/__init__.py b/examples/models/__init__.py index 76469846608..329580594aa 100644 --- a/examples/models/__init__.py +++ b/examples/models/__init__.py @@ -37,6 +37,7 @@ class Model(str, Enum): EfficientSam = "efficient_sam" Qwen25 = "qwen2_5" Phi4Mini = "phi_4_mini" + WhisperTiny = "whisper_tiny" def __str__(self) -> str: return self.value @@ -82,6 +83,7 @@ def __str__(self) -> str: str(Model.EfficientSam): ("efficient_sam", "EfficientSAM"), str(Model.Qwen25): ("qwen2_5", "Qwen2_5Model"), str(Model.Phi4Mini): ("phi_4_mini", "Phi4MiniModel"), + str(Model.WhisperTiny): ("whisper_tiny", "WhisperTinyModel"), } __all__ = [ diff --git a/examples/models/whisper_tiny/__init__.py b/examples/models/whisper_tiny/__init__.py new file mode 100644 index 00000000000..ca800c7cad4 --- /dev/null +++ b/examples/models/whisper_tiny/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from .model import WhisperTinyModel + +__all__ = [ + "WhisperTinyModel", +] diff --git a/examples/models/whisper_tiny/model.py b/examples/models/whisper_tiny/model.py new file mode 100644 index 00000000000..c93e9e1266f --- /dev/null +++ b/examples/models/whisper_tiny/model.py @@ -0,0 +1,39 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import logging + +import torch + +from transformers import AutoFeatureExtractor, WhisperModel # @manual +from datasets import load_dataset + +from ..model_base import EagerModelBase + + +class WhisperTinyModel(EagerModelBase): + def __init__(self): + pass + + def get_eager_model(self) -> torch.nn.Module: + logging.info("Loading whipser-tiny model") + # pyre-ignore + model = WhisperModel.from_pretrained("openai/whisper-tiny", return_dict=False) + model.eval() + logging.info("Loaded whisper-tiny model") + return model + + def get_example_inputs(self): + feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-tiny") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt") + print(inputs) + print(inputs.input_features) + return (inputs.input_features,) + # Raw audio input: 1 second of 16kHz audio + #input_values = torch.randn(1, 16000) + #print(input_values) + #return (input_values,) diff --git a/requirements-examples.txt b/requirements-examples.txt index 7426df861a2..3cab53469c3 100644 --- a/requirements-examples.txt +++ b/requirements-examples.txt @@ -5,3 +5,4 @@ timm == 1.0.7 torchsr == 1.0.4 torchtune >= 0.6.1 transformers >= 4.53.1 +librosa >= 0.11.0 From 9c119ea95c4297c6e4e212807b1cf79665e8b9df Mon Sep 17 00:00:00 2001 From: BujSet Date: Mon, 21 Jul 2025 23:20:14 +0000 Subject: [PATCH 2/4] Using WhisperForConditionalGeneration instead of WhisperModel; seems to be the more correct thing --- examples/models/whisper_tiny/model.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/examples/models/whisper_tiny/model.py b/examples/models/whisper_tiny/model.py index c93e9e1266f..317f13ebe75 100644 --- a/examples/models/whisper_tiny/model.py +++ b/examples/models/whisper_tiny/model.py @@ -9,6 +9,7 @@ import torch from transformers import AutoFeatureExtractor, WhisperModel # @manual +from transformers import AutoProcessor, WhisperForConditionalGeneration # @manual from datasets import load_dataset from ..model_base import EagerModelBase @@ -21,18 +22,29 @@ def __init__(self): def get_eager_model(self) -> torch.nn.Module: logging.info("Loading whipser-tiny model") # pyre-ignore - model = WhisperModel.from_pretrained("openai/whisper-tiny", return_dict=False) + model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", return_dict=False) model.eval() logging.info("Loaded whisper-tiny model") return model def get_example_inputs(self): - feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-tiny") + processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en") + model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", return_dict=False) ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt") - print(inputs) - print(inputs.input_features) - return (inputs.input_features,) + inputs = processor(ds[0]["audio"]["array"], return_tensors="pt") + input_features = inputs.input_features + #generated_ids = model.generate(inputs=input_features) + return (input_features[0],) #(generated_ids,) + + #feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-tiny") + #ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + #inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt") + #print(inputs) + #print(inputs.input_features) + #print(inputs.input_features.shape) + #decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id + + #return (inputs.input_features,decoder_input_ids) # Raw audio input: 1 second of 16kHz audio #input_values = torch.randn(1, 16000) #print(input_values) From f06e66a813c259a0dd26b7f5dbf6de7dde8b8948 Mon Sep 17 00:00:00 2001 From: BujSet Date: Tue, 22 Jul 2025 18:34:21 +0000 Subject: [PATCH 3/4] input dimensions seem correct, but getting a value error --- examples/models/whisper_tiny/model.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/examples/models/whisper_tiny/model.py b/examples/models/whisper_tiny/model.py index 317f13ebe75..e7efc27fa2f 100644 --- a/examples/models/whisper_tiny/model.py +++ b/examples/models/whisper_tiny/model.py @@ -17,6 +17,8 @@ class WhisperTinyModel(EagerModelBase): def __init__(self): + #self.max_cache_length=1024 + #self.batch_size=1 pass def get_eager_model(self) -> torch.nn.Module: @@ -28,13 +30,27 @@ def get_eager_model(self) -> torch.nn.Module: return model def get_example_inputs(self): + #input_ids = torch.tensor([[0]], dtype=torch.long) + #encoder_hidden_states = torch.rand(1, 1500, 384) + #cache_position = torch.tensor([0], dtype=torch.long) + #atten_mask = torch.full((1, self.max_cache_length), torch.tensor(-255.0)) + #atten_mask *= torch.arange(self.max_cache_length) > cache_position.reshape( + # -1, 1 + #) + #atten_mask = atten_mask[None, None, :, :].expand(self.batch_size, 1, -1, -1) + #return (input_ids, atten_mask, encoder_hidden_states, cache_position) + processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en") model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", return_dict=False) ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") inputs = processor(ds[0]["audio"]["array"], return_tensors="pt") input_features = inputs.input_features + #expected_shape = (1, processor.feature_extractor.feature_size, processor.feature_extractor.nb_max_frames) + #print("Expected shape: " + str(expected_shape)) + print("Input features has shape: " + str(input_features.shape)) #generated_ids = model.generate(inputs=input_features) - return (input_features[0],) #(generated_ids,) + #return (torch.rand(expected_shape),) #(input_features,) #(generated_ids,) + return (input_features,) #(generated_ids,) #feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-tiny") #ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") From 2df3bbdb229057e09e45ff5c77189e746d6997a8 Mon Sep 17 00:00:00 2001 From: BujSet Date: Tue, 22 Jul 2025 18:38:41 +0000 Subject: [PATCH 4/4] Adding export log file --- export.log | 1 + 1 file changed, 1 insertion(+) create mode 100644 export.log diff --git a/export.log b/export.log new file mode 100644 index 00000000000..f3fa834ece2 --- /dev/null +++ b/export.log @@ -0,0 +1 @@ +Input features has shape: torch.Size([1, 80, 3000])