 from __future__ import annotations
 
 import numpy as np
+import sounddevice as sd
 import torch
 from scipy.signal import resample_poly
 from transformers.models.whisper import WhisperConfig
@@ -56,6 +57,7 @@ def __init__(
 
         self.feature_extractor = get_feature_extractor(hf_model_id)
         self.tokenizer = get_tokenizer(hf_model_id)
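+        # Special token ids (e.g., timestamp markers); assumed here to delimit
+        # segments so that stream() can decide when text is safe to flush.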
+        self.clip_segment_tokens = set(self.tokenizer.all_special_ids)
 
     def predict(self, *args, **kwargs):
         # See transcribe.
@@ -82,23 +84,10 @@ def transcribe(
         -------
         List of audio arrays, chunked into N arrays of model_chunk_seconds seconds.
         """
-        if isinstance(audio, str):
-            import audio2numpy as a2n  # import here, as this requires ffmpeg to be installed on host machine
-
-            audio, audio_sample_rate = a2n.audio_from_file(audio)
-        else:
-            assert audio_sample_rate is not None
-        assert isinstance(audio, np.ndarray)
-        assert isinstance(audio_sample_rate, int)
-        with torch.no_grad():
-            trans = " ".join(
-                self._transcribe_single_chunk(x)
-                for x in chunk_and_resample_audio(audio, audio_sample_rate)
-            )
-
-        return trans
+        tokens = self.transcribe_tokens(audio, audio_sample_rate)
+        return self.tokenizer.decode(tokens, skip_special_tokens=True).strip()
 
-    def _transcribe_single_chunk(self, audio: np.ndarray) -> str:
+    def _transcribe_single_chunk(self, audio: np.ndarray) -> list[int]:
         """
         Transcribe an audio chunk to text.
 
@@ -110,8 +99,7 @@ def _transcribe_single_chunk(self, audio: np.ndarray) -> str:
             The maximum length of this audio must be self.max_audio_samples.
 
         Returns:
-
-        - transcribed texts
+            list of token ids
         """
         # feature
         input_features = self.feature_extractor(
@@ -213,8 +201,123 @@ def _transcribe_single_chunk(self, audio: np.ndarray) -> str:
             # update position_ids
             position_ids += 1
 
-        # Exclude start / end tokens
-        return self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
+        return output_ids[0].tolist()
+
+    def stream(self, device: int = 2, audio_chunk_size_seconds: int = 5) -> None:
+        """
+        Stream audio from the given audio device and transcribe it in real time.
+
+        Parameters:
+            device:
+                Audio device index (see sounddevice.query_devices()).
+            audio_chunk_size_seconds:
+                Number of seconds to record between each transcription attempt.
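+
+        Example (hypothetical usage; pick a device index from sounddevice.query_devices()):
+            app.stream(device=1, audio_chunk_size_seconds=5)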
+        """
+        tokens: list[int] = []
+
+        def callback(audio: np.ndarray, frames, time, status):
+            nonlocal tokens
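+            # sounddevice delivers a (frames, channels) float32 block, so drop
+            # the mono channel axis before transcribing this chunk.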
+            curr_tokens = self.transcribe_tokens(audio.squeeze(-1), SAMPLE_RATE)
+            tokens.extend(curr_tokens)
+
+            if not curr_tokens:
+                # This audio was empty, so it's safe to decode previous tokens.
+                print(
+                    self.tokenizer.decode(tokens, skip_special_tokens=True),
+                    end="",
+                    flush=True,
+                )
+                tokens = []
+            else:
+                split_start = 0
+                decode_splits = []
+                token_idx = 0
+                # Every time 2 "clip segment tokens" (timestamp tokens)
+                # appear in sequence, we're safe to decode the previous tokens.
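+                # Illustration: in [..., tok, <|ts|>, <|ts|>, tok, ...] the run
+                # of two timestamp tokens means everything up to and including
+                # the first <|ts|> is a finished segment that can be flushed.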
+                while token_idx < len(tokens):
+                    if tokens[token_idx] in self.clip_segment_tokens:
+                        next_non_clip_idx = token_idx + 1
+                        while (
+                            next_non_clip_idx < len(tokens)
+                            and tokens[next_non_clip_idx] in self.clip_segment_tokens
+                        ):
+                            next_non_clip_idx += 1
+
+                        if next_non_clip_idx >= token_idx + 2:
+                            split_end = token_idx + 1
+                            if split_end > split_start:
+                                decode_splits.append((split_start, split_end))
+                            split_start = next_non_clip_idx
+
+                        token_idx = next_non_clip_idx + 1
+                    else:
+                        token_idx += 1
+
+                for split in decode_splits:
+                    print(
+                        self.tokenizer.decode(
+                            tokens[split[0] : split[1]], skip_special_tokens=True
+                        ),
+                        end="",
+                        flush=True,
+                    )
+                if split_start != 0:
+                    tokens = tokens[split_start:]
+
+        print("Listening...")
+        print("Text can take up to 20 seconds before printing.")
+        with sd.InputStream(
+            device=device,
+            channels=1,
+            blocksize=audio_chunk_size_seconds * SAMPLE_RATE,
+            callback=callback,
+            samplerate=SAMPLE_RATE,
+        ):
+            while True:
+                response = input("Press ctrl+c or q/Q to quit.\n")
+                if response in ("q", "Q"):
+                    break
+
+    def transcribe_tokens(
+        self, audio: np.ndarray | str, audio_sample_rate: int | None = None
+    ) -> list[int]:
+        """
+        Transcribe the provided audio into a list of token ids.
+
+        Parameters
+        ----------
+        audio: numpy array | str
+            Path to audio file if a string.
+            Raw audio array of shape (# of samples) if a numpy array.
+
+        audio_sample_rate: int | None
+            The sample rate of the provided audio, in samples / second.
+            This must be provided if audio is a numpy array.
+            If audio is a file path, it is ignored; the sample rate is read
+            from the audio file instead.
+
+        Returns
+        -------
+        transcribed tokens
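+
+        Example (hypothetical; `samples` is a 1-D float array at 16 kHz):
+            tokens = app.transcribe_tokens(samples, 16000)
+            text = app.tokenizer.decode(tokens, skip_special_tokens=True)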
+        """
+        if isinstance(audio, str):
+            import audio2numpy as a2n  # import here, as this requires ffmpeg to be installed on host machine
+
+            audio, audio_sample_rate = a2n.audio_from_file(audio)
+        if isinstance(audio, np.ndarray) and audio.ndim == 2:
+            # Audio is multi-channel (e.g., stereo); collapse it to one channel.
+            audio = audio.mean(-1)
+
+        assert audio_sample_rate is not None
+        assert isinstance(audio, np.ndarray)
+
+        out_chunked_tokens: list[list[int]] = [
+            self._transcribe_single_chunk(x)
+            for x in chunk_and_resample_audio(audio, audio_sample_rate)
+        ]
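+        # Flatten the per-chunk token sequences into a single list.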
+        out_tokens: list[int] = []
+        for chunk_tokens in out_chunked_tokens:
+            out_tokens.extend(chunk_tokens)
+        return out_tokens
 
 
 def chunk_and_resample_audio(