18 changes: 10 additions & 8 deletions whisper/faster-whisper-v2/config.yaml
@@ -1,8 +1,6 @@
description: Faster Whisper v2
environment_variables: {}
external_package_dirs: []
model_cache:
- repo_id: Systran/faster-whisper-large-v2
model_metadata:
  avatar_url: https://cdn.baseten.co/production/static/openai.png
  cover_image_url: https://cdn.baseten.co/production/static/whisper.png
@@ -12,16 +10,20 @@ model_metadata:
  pretty_name: Whisper
  tags:
  - speech-recognition
model_cache:
- repo_id: Systran/faster-whisper-large-v2
model_name: Faster Whisper v2
python_version: py39
python_version: py310
requirements:
- torch==2.1.1
- faster-whisper==1.0.3
- ctranslate2==4.4.0
- torch==2.4.1
- httpx==0.27.2
- faster-whisper@https://github.com/SYSTRAN/faster-whisper/archive/refs/heads/master.tar.gz
- ffmpeg-python==0.2.0
resources:
  accelerator: T4
  accelerator: L4
  cpu: 500m
  memory: 512Mi
  use_gpu: true
secrets: {}
system_packages: []
system_packages:
- ffmpeg
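
A note on these pins: ffmpeg-python is only a thin wrapper around the ffmpeg binary, which is why ffmpeg now appears under system_packages, and faster-whisper is installed straight from the master tarball rather than a tagged release. A quick sanity check for the resulting image (a sketch, assuming it is run inside the built container):

import shutil
from importlib.metadata import version

import torch

# The ffmpeg binary comes from system_packages, not from the pip requirements
assert shutil.which("ffmpeg"), "ffmpeg binary not found on PATH"
print("torch:", version("torch"))  # expect 2.4.1
print("ctranslate2:", version("ctranslate2"))  # expect 4.4.0
print("faster-whisper:", version("faster-whisper"))  # whatever master resolved to at build time
print("CUDA available:", torch.cuda.is_available())  # expect True on the L4
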
200 changes: 163 additions & 37 deletions whisper/faster-whisper-v2/model/model.py
@@ -1,24 +1,125 @@
import base64
from tempfile import NamedTemporaryFile
from typing import Any, Dict
import io
import logging
import tempfile
import time
from typing import Dict

import requests
from faster_whisper import WhisperModel
import ffmpeg
import httpx
import numpy as np
import torch
from faster_whisper import BatchedInferencePipeline, WhisperModel

DEFAULT_BATCH_SIZE = 8
DEFAULT_WORD_LEVEL_TIMESTAMPS = False
DEFAULT_PROMPT = None
DEFAULT_TEMPERATURE = 0
DEFAULT_BEAM_SIZE = 5
DEFAULT_BEST_OF = 5
DEFAULT_LANGUAGE = None
DEFAULT_CONDITION_ON_PREVIOUS_TEXT = False


class Model:
    def __init__(self, **kwargs) -> None:
        self._data_dir = kwargs["data_dir"]
        self._config = kwargs["config"]
        self._secrets = kwargs["secrets"]
        self._model = None
    def __init__(self, **kwargs):
        self.model = None
        self.batched_model = None

    def load(self):
        self._model = WhisperModel(self._config["model_metadata"]["model_id"])
        self.model = WhisperModel("large-v2", device="cuda", compute_type="float16")
        self.batched_model = BatchedInferencePipeline(model=self.model)

    def base64_to_wav(self, base64_string):
        binary_data = base64.b64decode(base64_string)
        with tempfile.NamedTemporaryFile(
            suffix=".wav", delete=False
        ) as output_file_path:
            output_file_path.write(binary_data)
            output_file_path.flush()
        return output_file_path.name

    def audio_url_to_waveform(self, path_or_url: str):
        sampling_rate = 16000
        # Use ffmpeg to read the audio file and convert it to mono, 16 kHz audio
        out, _ = (
            ffmpeg.input(
                path_or_url, seekable=0
            )  # Disable HTTP seeking (range requests)
            .output("pipe:", format="wav", acodec="pcm_s16le", ac=1, ar=sampling_rate)
            .run(capture_stdout=True, capture_stderr=True)
        )

        # Convert the raw byte data into a numpy array
        waveform_np = np.frombuffer(out, dtype=np.int16)

        # Normalize the 16-bit PCM samples to floats in [-1.0, 1.0]
        waveform_np = waveform_np.astype(np.float32) / 32768.0

        # Convert the numpy array to a pytorch tensor
        waveform_tensor = torch.tensor(waveform_np, dtype=torch.float32)

        return waveform_tensor

    def base64_to_waveform(self, base64_audio: str):
        sampling_rate = 16000

        # Decode the Base64 string to get the raw audio bytes
        audio_bytes = base64.b64decode(base64_audio)

        # Convert the raw audio bytes to a binary stream for ffmpeg to process
        audio_stream = io.BytesIO(audio_bytes)

        # Use ffmpeg to process the raw bytes and convert them to mono, 16 kHz WAV
        out, _ = (
            ffmpeg.input("pipe:0")  # Input comes from stdin
            .output("pipe:", format="wav", acodec="pcm_s16le", ac=1, ar=sampling_rate)
            .run(input=audio_stream.read(), capture_stdout=True, capture_stderr=True)
        )

        # Convert the raw byte data into a numpy array
        waveform_np = np.frombuffer(out, dtype=np.int16)

        # Normalize the 16-bit PCM samples to floats in [-1.0, 1.0]
        waveform_np = waveform_np.astype(np.float32) / 32768.0

        # Convert the numpy array to a pytorch tensor
        waveform_tensor = torch.tensor(waveform_np, dtype=torch.float32)

        return waveform_tensor

    def download_file(self, url):
        with httpx.Client() as client:
            response = client.get(url, timeout=500)
            if response.status_code == 200:
                # Save the response body to a local temp file
                with tempfile.NamedTemporaryFile(
                    suffix=".wav", delete=False
                ) as output_file_path:
                    output_file_path.write(response.content)
                    output_file_path.flush()
                logging.info("File downloaded successfully.")
                return output_file_path.name
            else:
                logging.error(
                    f"Failed to download file. Status code: {response.status_code}"
                )
                return None

    def preprocess(self, request: Dict) -> Dict:
        audio_base64 = request.get("audio")
        url = request.get("url")
        word_level_timestamps = request.get(
            "word_timestamps", DEFAULT_WORD_LEVEL_TIMESTAMPS
        )
        prompt = request.get("prompt", DEFAULT_PROMPT)
        temperature = request.get("temperature", DEFAULT_TEMPERATURE)
        batch_size = request.get("batch_size", DEFAULT_BATCH_SIZE)
        beam_size = request.get("beam_size", DEFAULT_BEAM_SIZE)
        best_of = request.get("best_of", DEFAULT_BEST_OF)
        language = request.get("language", DEFAULT_LANGUAGE)

        response = {}

        if audio_base64 and url:
            return {
@@ -29,40 +130,65 @@ def preprocess(self, request: Dict) -> Dict:
"error": "Please provide either an audio file in base64 string format or a URL to an audio file.",
}

        binary_data = None

        if audio_base64:
            binary_data = base64.b64decode(audio_base64)
            waveform = self.base64_to_waveform(audio_base64)
            response["audio"] = waveform
            # file_name = self.base64_to_wav(audio_base64)
            # response["audio"] = file_name

        elif url:
            resp = requests.get(url)
            binary_data = resp.content
            start = time.time()
            # file_name = self.download_file(url)
            waveform = self.audio_url_to_waveform(url)
            logging.info(
                f"url download time: {time.time() - start}",
            )
            response["audio"] = waveform

        return {"data": binary_data}
        response["word_timestamps"] = word_level_timestamps
        response["initial_prompt"] = prompt
        response["temperature"] = temperature
        response["batch_size"] = batch_size
        response["beam_size"] = beam_size
        response["best_of"] = best_of
        response["language"] = language

    def predict(self, request: Dict) -> Dict:
        if request.get("error"):
            return request
        return response

        audio_data = request.get("data")
        result_segments = []
    def predict(self, model_input: Dict):
        # Pass error payloads from preprocess straight through (e.g. both "audio" and "url" set);
        # otherwise the pop("audio") below raises a KeyError
        if "error" in model_input:
            return model_input

        start = time.time()

        with NamedTemporaryFile() as fp:
            fp.write(audio_data)
            segments, info = self._model.transcribe(
                fp.name,
                temperature=0,
                best_of=5,
                beam_size=5,
            )
        all_segments = []
        full_transcript = ""
        audio = model_input.pop("audio")

        for seg in segments:
            result_segments.append(
                {"text": seg.text, "start": seg.start, "end": seg.end}
            )
        segments, info = self.batched_model.transcribe(audio, **model_input)
        for segment in segments:
            segment_information = {
                "text": segment.text,
                "start": segment.start,
                "end": segment.end,
            }

            words = []
            if segment.words:
                for word in segment.words:
                    words.append(
                        {"start": word.start, "end": word.end, "word": word.word}
                    )

            segment_information["words"] = words

            all_segments.append(segment_information)
            full_transcript += segment.text

        language = info.language
        end = time.time()
        transcription_time = end - start

        return {
            "language": info.language,
            "language_probability": info.language_probability,
            "duration": info.duration,
            "segments": result_segments,
            "segments": all_segments,
            "language": language,
            "transcript": full_transcript,
            "transcription_time": transcription_time,
        }
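
For reference, a hypothetical client call against the updated model (the deployment URL and API key are placeholders; the payload keys and response fields mirror preprocess() and predict() above):

import httpx

payload = {
    "url": "https://example.com/sample.wav",  # or send base64 audio under "audio" instead
    "word_timestamps": True,
    "batch_size": 8,
    "beam_size": 5,
}

resp = httpx.post(
    "https://model-<MODEL_ID>.api.baseten.co/production/predict",  # placeholder endpoint
    headers={"Authorization": "Api-Key <YOUR_API_KEY>"},  # placeholder credentials
    json=payload,
    timeout=600.0,
)
result = resp.json()

print(result["language"], f"({result['transcription_time']:.1f}s)")
for seg in result["segments"]:
    print(f"[{seg['start']:.2f}-{seg['end']:.2f}] {seg['text']}")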