diff --git a/whisper/faster-whisper-v2/config.yaml b/whisper/faster-whisper-v2/config.yaml
index da604f48e..4b3b55549 100644
--- a/whisper/faster-whisper-v2/config.yaml
+++ b/whisper/faster-whisper-v2/config.yaml
@@ -1,8 +1,6 @@
 description: Faster Whisper v2
 environment_variables: {}
 external_package_dirs: []
-model_cache:
-- repo_id: Systran/faster-whisper-large-v2
 model_metadata:
   avatar_url: https://cdn.baseten.co/production/static/openai.png
   cover_image_url: https://cdn.baseten.co/production/static/whisper.png
@@ -12,16 +10,20 @@ model_metadata:
   pretty_name: Whisper
   tags:
   - speech-recognition
+model_cache:
+  - repo_id: Systran/faster-whisper-large-v2
 model_name: Faster Whisper v2
-python_version: py39
+python_version: py310
 requirements:
-- torch==2.1.1
-- faster-whisper==1.0.3
-- ctranslate2==4.4.0
+  - torch==2.4.1
+  - httpx==0.27.2
+  - faster-whisper@https://github.com/SYSTRAN/faster-whisper/archive/refs/heads/master.tar.gz
+  - ffmpeg-python==0.2.0
 resources:
-  accelerator: T4
+  accelerator: L4
   cpu: 500m
   memory: 512Mi
   use_gpu: true
 secrets: {}
-system_packages: []
+system_packages:
+  - ffmpeg
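
Note on the requirements change: faster-whisper is installed from the master-branch tarball because the model below uses BatchedInferencePipeline, which was not yet in the previously pinned 1.0.3 release. Master is a moving target, so builds are not reproducible. If that matters, the same requirement can point at a fixed ref instead; a sketch, where the tag is a placeholder for any release new enough to include the batched pipeline:

requirements:
  - faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/refs/tags/<tag>.tar.gz
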
diff --git a/whisper/faster-whisper-v2/model/model.py b/whisper/faster-whisper-v2/model/model.py
index ec486e433..99f76c3a9 100644
--- a/whisper/faster-whisper-v2/model/model.py
+++ b/whisper/faster-whisper-v2/model/model.py
@@ -1,24 +1,125 @@
 import base64
-from tempfile import NamedTemporaryFile
-from typing import Any, Dict
+import io
+import logging
+import tempfile
+import time
+from typing import Dict
 
-import requests
-from faster_whisper import WhisperModel
+import ffmpeg
+import httpx
+import numpy as np
+import torch
+from faster_whisper import BatchedInferencePipeline, WhisperModel
+
+DEFAULT_BATCH_SIZE = 8
+DEFAULT_WORD_LEVEL_TIMESTAMPS = False
+DEFAULT_PROMPT = None
+DEFAULT_TEMPERATURE = 0
+DEFAULT_BEAM_SIZE = 5
+DEFAULT_BEST_OF = 5
+DEFAULT_LANGUAGE = None
+DEFAULT_CONDITION_ON_PREVIOUS_TEXT = False
 
 
 class Model:
-    def __init__(self, **kwargs) -> None:
-        self._data_dir = kwargs["data_dir"]
-        self._config = kwargs["config"]
-        self._secrets = kwargs["secrets"]
-        self._model = None
+    def __init__(self, **kwargs):
+        self.model = None
+        self.batched_model = None
 
     def load(self):
-        self._model = WhisperModel(self._config["model_metadata"]["model_id"])
+        self.model = WhisperModel("large-v2", device="cuda", compute_type="float16")
+        self.batched_model = BatchedInferencePipeline(model=self.model)
+
+    def base64_to_wav(self, base64_string):
+        binary_data = base64.b64decode(base64_string)
+        with tempfile.NamedTemporaryFile(
+            suffix=".wav", delete=False
+        ) as output_file_path:
+            output_file_path.write(binary_data)
+            output_file_path.flush()
+        return output_file_path.name
+
+    def audio_url_to_waveform(self, path_or_url: str):
+        sampling_rate = 16000
+        # Use ffmpeg to read the audio file and convert it to mono 16kHz audio
+        out, _ = (
+            ffmpeg.input(
+                path_or_url, seekable=0
+            )  # Disable HTTP seekable (range requests)
+            .output("pipe:", format="wav", acodec="pcm_s16le", ac=1, ar=sampling_rate)
+            .run(capture_stdout=True, capture_stderr=True)
+        )
+
+        # Convert the raw byte data into a numpy array
+        waveform_np = np.frombuffer(out, dtype=np.int16)
+
+        # Normalize the waveform data
+        waveform_np = waveform_np.astype(np.float32) / 32768.0
+
+        # Convert the numpy array to a pytorch tensor
+        waveform_tensor = torch.tensor(waveform_np, dtype=torch.float32)
+
+        return waveform_tensor
+
+    def base64_to_waveform(self, base64_audio: str):
+        sampling_rate = 16000
+
+        # Decode the Base64 string to get the raw audio bytes
+        audio_bytes = base64.b64decode(base64_audio)
+
+        # Convert the raw audio bytes to a binary stream for ffmpeg to process
+        audio_stream = io.BytesIO(audio_bytes)
+
+        # Use ffmpeg to process the raw bytes and convert to mono 16kHz wav format
+        out, _ = (
+            ffmpeg.input("pipe:0")  # Input is from pipe
+            .output("pipe:", format="wav", acodec="pcm_s16le", ac=1, ar=sampling_rate)
+            .run(input=audio_stream.read(), capture_stdout=True, capture_stderr=True)
+        )
+
+        # Convert the raw byte data into a numpy array
+        waveform_np = np.frombuffer(out, dtype=np.int16)
+
+        # Normalize the waveform data
+        waveform_np = waveform_np.astype(np.float32) / 32768.0
+
+        # Convert the numpy array to a pytorch tensor
+        waveform_tensor = torch.tensor(waveform_np, dtype=torch.float32)
+
+        return waveform_tensor
+
+    def download_file(self, url):
+        with httpx.Client() as client:
+            response = client.get(url, timeout=500)
+            if response.status_code == 200:
+                # Save the downloaded bytes to a local temp file
+                with tempfile.NamedTemporaryFile(
+                    suffix=".wav", delete=False
+                ) as output_file_path:
+                    output_file_path.write(response.content)
+                    output_file_path.flush()
+                logging.info("File downloaded successfully.")
+                return output_file_path.name
+            else:
+                logging.info(
+                    f"Failed to download file. Status code: {response.status_code}"
+                )
+                return None
 
     def preprocess(self, request: Dict) -> Dict:
         audio_base64 = request.get("audio")
         url = request.get("url")
+        word_level_timestamps = request.get(
+            "word_timestamps", DEFAULT_WORD_LEVEL_TIMESTAMPS
+        )
+        prompt = request.get("prompt", DEFAULT_PROMPT)
+        temperature = request.get("temperature", DEFAULT_TEMPERATURE)
+        batch_size = request.get("batch_size", DEFAULT_BATCH_SIZE)
+        beam_size = request.get("beam_size", DEFAULT_BEAM_SIZE)
+        best_of = request.get("best_of", DEFAULT_BEST_OF)
+        language = request.get("language", DEFAULT_LANGUAGE)
+
+        response = {}
 
         if audio_base64 and url:
             return {
@@ -29,40 +130,65 @@ def preprocess(self, request: Dict) -> Dict:
                 "error": "Please provide either an audio file in base64 string format or a URL to an audio file.",
             }
 
-        binary_data = None
-
         if audio_base64:
-            binary_data = base64.b64decode(audio_base64)
+            waveform = self.base64_to_waveform(audio_base64)
+            response["audio"] = waveform
+            # file_name = self.base64_to_wav(audio_base64)
+            # response["audio"] = file_name
+
         elif url:
-            resp = requests.get(url)
-            binary_data = resp.content
+            start = time.time()
+            # file_name = self.download_file(url)
+            waveform = self.audio_url_to_waveform(url)
+            logging.info(
+                f"url download time: {time.time() - start}",
+            )
+            response["audio"] = waveform
 
-        return {"data": binary_data}
+        response["word_timestamps"] = word_level_timestamps
+        response["initial_prompt"] = prompt
+        response["temperature"] = temperature
+        response["batch_size"] = batch_size
+        response["beam_size"] = beam_size
+        response["best_of"] = best_of
+        response["language"] = language
 
-    def predict(self, request: Dict) -> Dict:
-        if request.get("error"):
-            return request
+        return response
 
-        audio_data = request.get("data")
-        result_segments = []
+    def predict(self, model_input: Dict):
+        # preprocess returns an error dict with no "audio" key on invalid
+        # input; surface it instead of crashing on the pop() below
+        if model_input.get("error"):
+            return model_input
+
+        start = time.time()
 
-        with NamedTemporaryFile() as fp:
-            fp.write(audio_data)
-            segments, info = self._model.transcribe(
-                fp.name,
-                temperature=0,
-                best_of=5,
-                beam_size=5,
-            )
+        all_segments = []
+        full_transcript = ""
+        audio = model_input.pop("audio")
 
-            for seg in segments:
-                result_segments.append(
-                    {"text": seg.text, "start": seg.start, "end": seg.end}
-                )
+        segments, info = self.batched_model.transcribe(audio, **model_input)
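+        # `segments` is a lazy generator from faster-whisper: the actual
+        # transcription runs as the loop below consumes it, while `info`
+        # carries the detected language and related metadata.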
+
+        for segment in segments:
+            segment_information = {
+                "text": segment.text,
+                "start": segment.start,
+                "end": segment.end,
+            }
+
+            words = []
+            if segment.words:
+                for word in segment.words:
+                    words.append(
+                        {"start": word.start, "end": word.end, "word": word.word}
+                    )
+
+            segment_information["words"] = words
+
+            all_segments.append(segment_information)
+            full_transcript += segment.text
+
+        language = info.language
+        end = time.time()
+        transcription_time = end - start
 
         return {
-            "language": info.language,
-            "language_probability": info.language_probability,
-            "duration": info.duration,
-            "segments": result_segments,
+            "segments": all_segments,
+            "language": language,
+            "transcript": full_transcript,
+            "transcription_time": transcription_time,
         }
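
For reference, a minimal sketch of a client call against the deployed model. The endpoint URL and API key are placeholders for a Baseten deployment; the request keys mirror what preprocess reads, and the response keys mirror what predict returns:

import base64

import httpx

# Placeholder deployment URL and key; substitute your deployment's values.
ENDPOINT = "https://model-<model-id>.api.baseten.co/production/predict"
API_KEY = "<api-key>"

# Either "audio" (a base64-encoded file) or "url" may be set, not both.
with open("sample.wav", "rb") as f:
    audio_b64 = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "audio": audio_b64,
    "word_timestamps": True,  # include per-word timings in each segment
    "batch_size": 8,          # forwarded to BatchedInferencePipeline.transcribe
    "language": "en",         # omit to let the model auto-detect
}

resp = httpx.post(
    ENDPOINT,
    headers={"Authorization": f"Api-Key {API_KEY}"},
    json=payload,
    timeout=600.0,
)
resp.raise_for_status()
result = resp.json()
print(result["language"], result["transcription_time"])
print(result["transcript"])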