Merge pull request #167 from ks6088ts-labs/feature/issue-166_whisper

ks6088ts · web-flow · commit bc7d669583f1 · 2024-10-09T14:20:49.000+09:00
add whisper example
diff --git a/apps/16_whisper_transcription/README.md b/apps/16_whisper_transcription/README.md
@@ -0,0 +1,4 @@
+# References
+
+- [openai/whisper](https://github.com/openai/whisper)
+- [Improve --model argument handling and help message #1764](https://github.com/openai/whisper/pull/1764)
diff --git a/apps/16_whisper_transcription/main.py b/apps/16_whisper_transcription/main.py
@@ -0,0 +1,70 @@
+import argparse
+import logging
+
+import whisper
+from dotenv import load_dotenv
+
+
+def init_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        prog="whisper_transcription",
+        description="Transcript with Whisper model",
+    )
+    parser.add_argument(
+        "-m",
+        "--model",
+        default="turbo",
+        help="Model name",
+    )
+    parser.add_argument(
+        "-f",
+        "--file",
+        default="dist/sample_audio.wav",
+        help="Audio file",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+    )
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = init_args()
+
+    # Set verbose mode
+    if args.verbose:
+        logging.basicConfig(level=logging.DEBUG)
+
+    # Parse .env file and set environment variables
+    load_dotenv()
+
+    model = whisper.load_model(name=args.model)
+
+    # load audio and pad/trim it to fit 30 seconds
+    audio = whisper.load_audio(
+        file=args.file,
+    )
+    audio = whisper.pad_or_trim(
+        array=audio,
+        length=30 * 16000,
+    )
+
+    # make log-Mel spectrogram and move to the same device as the model
+    # https://github.com/openai/whisper/pull/1764
+    mel = whisper.log_mel_spectrogram(
+        audio=audio,
+        n_mels=128,
+    ).to(model.device)
+
+    # detect the spoken language
+    _, probs = model.detect_language(mel)
+    print(f"Detected language: {max(probs, key=probs.get)}")
+
+    # decode the audio
+    options = whisper.DecodingOptions()
+    result = whisper.decode(model, mel, options)
+
+    # print the recognized text
+    print(result.text)
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -38,6 +38,7 @@ lxml = "^5.3.0"
 nest-asyncio = "^1.6.0"
 typer = "^0.12.5"
 azure-cognitiveservices-speech = "^1.40.0"
+openai-whisper = "^20240930"
 
 [tool.poetry.group.dev.dependencies]
 pre-commit = "^4.0.0"