Skip to content

Commit bc7d669

Browse files
authored
Merge pull request #167 from ks6088ts-labs/feature/issue-166_whisper
add whisper example
2 parents dbd2247 + 9b28859 commit bc7d669

File tree

4 files changed

+163
-7
lines changed

4 files changed

+163
-7
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
# References

- [openai/whisper](https://github.com/openai/whisper)
- [Improve --model argument handling and help message #1764](https://github.com/openai/whisper/pull/1764)
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import argparse
2+
import logging
3+
4+
import whisper
5+
from dotenv import load_dotenv
6+
7+
8+
def init_args(argv: "list[str] | None" = None) -> argparse.Namespace:
    """Parse the command-line options for the Whisper transcription example.

    Args:
        argv: Argument strings to parse. When ``None`` (the default),
            argparse falls back to ``sys.argv[1:]``, so existing
            ``init_args()`` callers behave exactly as before. Passing an
            explicit list makes the parser usable from other code and tests.

    Returns:
        Namespace with the attributes ``model`` (default ``"turbo"``),
        ``file`` (default ``"dist/sample_audio.wav"``) and ``verbose``
        (``False`` unless ``-v``/``--verbose`` is given).
    """
    parser = argparse.ArgumentParser(
        prog="whisper_transcription",
        description="Transcript with Whisper model",
    )
    parser.add_argument(
        "-m",
        "--model",
        default="turbo",
        help="Model name",
    )
    parser.add_argument(
        "-f",
        "--file",
        default="dist/sample_audio.wav",
        help="Audio file",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Enable debug logging",
    )
    return parser.parse_args(argv)
31+
32+
33+
if __name__ == "__main__":
    # Script entry point: transcribe the first 30 seconds of an audio file
    # with a Whisper model and print the detected language and the text.
    args = init_args()

    # Set verbose mode
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)

    # Parse .env file and set environment variables
    load_dotenv()

    model = whisper.load_model(name=args.model)

    # Load the audio and pad/trim it to fit 30 seconds
    # (30 s * 16000 samples per second).
    audio = whisper.load_audio(
        file=args.file,
    )
    audio = whisper.pad_or_trim(
        array=audio,
        length=30 * 16000,
    )

    # Make the log-Mel spectrogram and move it to the same device as the model.
    # The number of mel bins must match the loaded checkpoint, so read it from
    # the model instead of hard-coding 128: a fixed value only works for the
    # checkpoints that were trained with that many bins and fails for any
    # other --model choice.
    # https://github.com/openai/whisper/pull/1764
    mel = whisper.log_mel_spectrogram(
        audio=audio,
        n_mels=model.dims.n_mels,
    ).to(model.device)

    # Detect the spoken language and report the most probable one
    _, probs = model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")

    # Decode the audio
    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)

    # Print the recognized text
    print(result.text)

poetry.lock

Lines changed: 88 additions & 7 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ lxml = "^5.3.0"
3838
nest-asyncio = "^1.6.0"
3939
typer = "^0.12.5"
4040
azure-cognitiveservices-speech = "^1.40.0"
41+
openai-whisper = "^20240930"
4142

4243
[tool.poetry.group.dev.dependencies]
4344
pre-commit = "^4.0.0"

0 commit comments

Comments
 (0)