From b0e39bee131a7fbc4d99d6c495766dc088ed2d10 Mon Sep 17 00:00:00 2001
From: Het Trivedi
Date: Thu, 19 Sep 2024 17:13:27 -0700
Subject: [PATCH 1/4] Updating faster whisper v2 to use batching

---
 whisper/faster-whisper-v2/config.yaml    |  11 +-
 whisper/faster-whisper-v2/model/model.py | 159 ++++++++++++++++-------
 2 files changed, 119 insertions(+), 51 deletions(-)

diff --git a/whisper/faster-whisper-v2/config.yaml b/whisper/faster-whisper-v2/config.yaml
index 7bade046e..4c787a274 100644
--- a/whisper/faster-whisper-v2/config.yaml
+++ b/whisper/faster-whisper-v2/config.yaml
@@ -1,8 +1,6 @@
 description: Faster Whisper v2
 environment_variables: {}
 external_package_dirs: []
-model_cache:
-- repo_id: Systran/faster-whisper-large-v2
 model_metadata:
   avatar_url: https://cdn.baseten.co/production/static/openai.png
   cover_image_url: https://cdn.baseten.co/production/static/whisper.png
@@ -13,12 +11,13 @@ model_metadata:
   tags:
   - speech-recognition
 model_name: Faster Whisper v2
-python_version: py39
+python_version: py310
 requirements:
-- torch==2.1.0
-- faster-whisper==0.10.0
+- torch
+- httpx
+- faster-whisper@https://github.com/SYSTRAN/faster-whisper/archive/refs/heads/master.tar.gz
 resources:
-  accelerator: T4
+  accelerator: L4
   cpu: 500m
   memory: 512Mi
   use_gpu: true
diff --git a/whisper/faster-whisper-v2/model/model.py b/whisper/faster-whisper-v2/model/model.py
index ec486e433..0d2a570a5 100644
--- a/whisper/faster-whisper-v2/model/model.py
+++ b/whisper/faster-whisper-v2/model/model.py
@@ -1,24 +1,65 @@
+import os
+
+from faster_whisper import WhisperModel, BatchedInferencePipeline
+import httpx
+import tempfile
+import time
+from typing import Dict
 import base64
-from tempfile import NamedTemporaryFile
-from typing import Any, Dict
+import logging
 
-import requests
-from faster_whisper import WhisperModel
+DEFAULT_BATCH_SIZE = 8
+DEFAULT_WORD_LEVEL_TIMESTAMPS = False
+DEFAULT_PROMPT = None
+DEFAULT_TEMPERATURE = 0
+DEFAULT_BEAM_SIZE = 5
+DEFAULT_BEST_OF = 5
+DEFAULT_LANGUAGE = None
+DEFAULT_CONDITION_ON_PREVIOUS_TEXT = False
 
 
 class Model:
-    def __init__(self, **kwargs) -> None:
-        self._data_dir = kwargs["data_dir"]
-        self._config = kwargs["config"]
-        self._secrets = kwargs["secrets"]
-        self._model = None
+    def __init__(self, **kwargs):
+        self.model = None
+        self.batched_model = None
 
     def load(self):
-        self._model = WhisperModel(self._config["model_metadata"]["model_id"])
+        self.model = WhisperModel("large-v2", device="cuda", compute_type="float16")
+        self.batched_model = BatchedInferencePipeline(model=self.model)
+
+    def base64_to_wav(self, base64_string):
+        binary_data = base64.b64decode(base64_string)
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file_path:
+            output_file_path.write(binary_data)
+            output_file_path.flush()
+        return output_file_path.name
+
+    def download_file(self, url):
+        with httpx.Client() as client:
+            response = client.get(url, timeout=500)
+            if response.status_code == 200:
+                # Save the file to a local file
+                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file_path:
+                    output_file_path.write(response.content)
+                    output_file_path.flush()
+                logging.info("File downloaded successfully.")
+                return output_file_path.name
+            else:
+                logging.info(f"Failed to download file. Status code: {response.status_code}")
+                return None
 
     def preprocess(self, request: Dict) -> Dict:
         audio_base64 = request.get("audio")
         url = request.get("url")
+        word_level_timestamps = request.get("word_timestamps", DEFAULT_WORD_LEVEL_TIMESTAMPS)
+        prompt = request.get("prompt", DEFAULT_PROMPT)
+        temperature = request.get("temperature", DEFAULT_TEMPERATURE)
+        batch_size = request.get("batch_size", DEFAULT_BATCH_SIZE)
+        beam_size = request.get("beam_size", DEFAULT_BEAM_SIZE)
+        best_of = request.get("best_of", DEFAULT_BEST_OF)
+        language = request.get("language", DEFAULT_LANGUAGE)
+
+        response = {}
 
         if audio_base64 and url:
             return {
@@ -29,40 +70,68 @@
             "error": "Please provide either an audio file in base64 string format or a URL to an audio file.",
         }
 
-        binary_data = None
-
         if audio_base64:
-            binary_data = base64.b64decode(audio_base64)
+            file_name = self.base64_to_wav(audio_base64)
+            response['audio'] = file_name
+
         elif url:
-            resp = requests.get(url)
-            binary_data = resp.content
-
-        return {"data": binary_data}
-
-    def predict(self, request: Dict) -> Dict:
-        if request.get("error"):
-            return request
-
-        audio_data = request.get("data")
-        result_segments = []
-
-        with NamedTemporaryFile() as fp:
-            fp.write(audio_data)
-            segments, info = self._model.transcribe(
-                fp.name,
-                temperature=0,
-                best_of=5,
-                beam_size=5,
-            )
-
-            for seg in segments:
-                result_segments.append(
-                    {"text": seg.text, "start": seg.start, "end": seg.end}
-                )
-
-        return {
-            "language": info.language,
-            "language_probability": info.language_probability,
-            "duration": info.duration,
-            "segments": result_segments,
-        }
+            start = time.time()
+            file_name = self.download_file(url)
+            logging.info(f"url download time: {time.time() - start}",)
+            response['audio'] = file_name
+
+        response['word_timestamps'] = word_level_timestamps
+        response['initial_prompt'] = prompt
+        response['temperature'] = temperature
+        response['batch_size'] = batch_size
+        response['beam_size'] = beam_size
+        response['best_of'] = best_of
+        response['language'] = language
+
+        return response
+
+    def predict(self, model_input: Dict):
+        start = time.time()
+
+        all_segments = []
+        full_transcript = ""
+        audio = model_input.pop("audio")
+
+        if audio:
+            segments, info = self.batched_model.transcribe(audio, **model_input)
+            for segment in segments:
+                segment_information = {
+                    "text": segment.text,
+                    "start": segment.start,
+                    "end": segment.end
+                }
+
+                words = []
+                if segment.words:
+                    for word in segment.words:
+                        words.append(
+                            {
+                                "start": word.start,
+                                "end": word.end,
+                                "word": word.word
+                            }
+                        )
+
+                segment_information['words'] = words
+
+                all_segments.append(segment_information)
+                full_transcript += segment.text
+
+            language = info.language
+            end = time.time()
+            transcription_time = end - start
+
+            return {
+                "segments": all_segments,
+                "language": language,
+                "transcript": full_transcript,
+                "transcription_time": transcription_time
+            }
+
+        return model_input
+

From 4b6c223958114b1f3104844a61b06b1066f72b70 Mon Sep 17 00:00:00 2001
From: Het Trivedi
Date: Fri, 20 Sep 2024 09:27:06 -0700
Subject: [PATCH 2/4] fixing lint, adding caching

---
 whisper/faster-whisper-v2/config.yaml    |  2 +
 whisper/faster-whisper-v2/model/model.py | 74 +++++++++++++-----------
 2 files changed, 43 insertions(+), 33 deletions(-)

diff --git a/whisper/faster-whisper-v2/config.yaml b/whisper/faster-whisper-v2/config.yaml
index 4c787a274..b9f3bfe5d 100644
--- a/whisper/faster-whisper-v2/config.yaml
+++ b/whisper/faster-whisper-v2/config.yaml
@@ -10,6 +10,8 @@ model_metadata:
   pretty_name: Whisper
   tags:
   - speech-recognition
+model_cache:
+  - repo_id: Systran/faster-whisper-large-v2
 model_name: Faster Whisper v2
 python_version: py310
 requirements:
diff --git a/whisper/faster-whisper-v2/model/model.py b/whisper/faster-whisper-v2/model/model.py
index 0d2a570a5..4d802496b 100644
--- a/whisper/faster-whisper-v2/model/model.py
+++ b/whisper/faster-whisper-v2/model/model.py
@@ -1,12 +1,12 @@
+import base64
+import logging
 import os
-
-from faster_whisper import WhisperModel, BatchedInferencePipeline
-import httpx
 import tempfile
 import time
 from typing import Dict
-import base64
-import logging
+
+import httpx
+from faster_whisper import BatchedInferencePipeline, WhisperModel
 
 DEFAULT_BATCH_SIZE = 8
 DEFAULT_WORD_LEVEL_TIMESTAMPS = False
@@ -29,7 +29,9 @@ def load(self):
 
     def base64_to_wav(self, base64_string):
         binary_data = base64.b64decode(base64_string)
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file_path:
+        with tempfile.NamedTemporaryFile(
+            suffix=".wav", delete=False
+        ) as output_file_path:
             output_file_path.write(binary_data)
             output_file_path.flush()
         return output_file_path.name
@@ -39,19 +41,25 @@ def download_file(self, url):
             response = client.get(url, timeout=500)
             if response.status_code == 200:
                 # Save the file to a local file
-                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file_path:
+                with tempfile.NamedTemporaryFile(
+                    suffix=".wav", delete=False
+                ) as output_file_path:
                     output_file_path.write(response.content)
                     output_file_path.flush()
                 logging.info("File downloaded successfully.")
                 return output_file_path.name
             else:
-                logging.info(f"Failed to download file. Status code: {response.status_code}")
+                logging.info(
+                    f"Failed to download file. Status code: {response.status_code}"
+                )
+                return None
 
     def preprocess(self, request: Dict) -> Dict:
         audio_base64 = request.get("audio")
         url = request.get("url")
-        word_level_timestamps = request.get("word_timestamps", DEFAULT_WORD_LEVEL_TIMESTAMPS)
+        word_level_timestamps = request.get(
+            "word_timestamps", DEFAULT_WORD_LEVEL_TIMESTAMPS
+        )
         prompt = request.get("prompt", DEFAULT_PROMPT)
         temperature = request.get("temperature", DEFAULT_TEMPERATURE)
         batch_size = request.get("batch_size", DEFAULT_BATCH_SIZE)
@@ -72,21 +80,23 @@ def preprocess(self, request: Dict) -> Dict:
 
         if audio_base64:
             file_name = self.base64_to_wav(audio_base64)
-            response['audio'] = file_name
+            response["audio"] = file_name
 
         elif url:
             start = time.time()
             file_name = self.download_file(url)
-            logging.info(f"url download time: {time.time() - start}",)
-            response['audio'] = file_name
-
-        response['word_timestamps'] = word_level_timestamps
-        response['initial_prompt'] = prompt
-        response['temperature'] = temperature
-        response['batch_size'] = batch_size
-        response['beam_size'] = beam_size
-        response['best_of'] = best_of
-        response['language'] = language
+            logging.info(
+                f"url download time: {time.time() - start}",
+            )
+            response["audio"] = file_name
+
+        response["word_timestamps"] = word_level_timestamps
+        response["initial_prompt"] = prompt
+        response["temperature"] = temperature
+        response["batch_size"] = batch_size
+        response["beam_size"] = beam_size
+        response["best_of"] = best_of
+        response["language"] = language
 
         return response
 
@@ -103,21 +113,17 @@ def predict(self, model_input: Dict):
                 segment_information = {
                     "text": segment.text,
                     "start": segment.start,
-                    "end": segment.end
+                    "end": segment.end,
                 }
 
                 words = []
                 if segment.words:
                     for word in segment.words:
                         words.append(
-                            {
-                                "start": word.start,
-                                "end": word.end,
-                                "word": word.word
-                            }
+                            {"start": word.start, "end": word.end, "word": word.word}
                         )
 
-                segment_information['words'] = words
+                segment_information["words"] = words
 
                 all_segments.append(segment_information)
                 full_transcript += segment.text
@@ -126,12 +132,14 @@ def predict(self, model_input: Dict):
             end = time.time()
             transcription_time = end - start
 
+            # Delete temp file
+            os.remove(audio)
+
             return {
-                "segments": all_segments,
-                "language": language,
-                "transcript": full_transcript,
-                "transcription_time": transcription_time
-            }
+                "segments": all_segments,
+                "language": language,
+                "transcript": full_transcript,
+                "transcription_time": transcription_time,
+            }
 
         return model_input
-

From 457714ab6ee348e4b7686b92347290ea053ed742 Mon Sep 17 00:00:00 2001
From: Het Trivedi
Date: Tue, 8 Oct 2024 10:27:50 -0700
Subject: [PATCH 3/4] Updating code for faster whisper

---
 whisper/faster-whisper-v2/config.yaml    |  10 +-
 whisper/faster-whisper-v2/model/model.py | 126 ++++++++++++++++-------
 2 files changed, 93 insertions(+), 43 deletions(-)

diff --git a/whisper/faster-whisper-v2/config.yaml b/whisper/faster-whisper-v2/config.yaml
index b9f3bfe5d..63aa7578d 100644
--- a/whisper/faster-whisper-v2/config.yaml
+++ b/whisper/faster-whisper-v2/config.yaml
@@ -15,13 +15,15 @@ model_cache:
 model_name: Faster Whisper v2
 python_version: py310
 requirements:
-- torch
-- httpx
-- faster-whisper@https://github.com/SYSTRAN/faster-whisper/archive/refs/heads/master.tar.gz
+  - torch==2.4.1
+  - httpx==0.27.2
+  - faster-whisper@https://github.com/SYSTRAN/faster-whisper/archive/refs/heads/master.tar.gz
+  - ffmpeg-python==0.2.0
 resources:
   accelerator: L4
   cpu: 500m
   memory: 512Mi
   use_gpu: true
 secrets: {}
-system_packages: []
+system_packages:
+  - ffmpeg
diff --git a/whisper/faster-whisper-v2/model/model.py b/whisper/faster-whisper-v2/model/model.py
index 4d802496b..a9fa3a520 100644
--- a/whisper/faster-whisper-v2/model/model.py
+++ b/whisper/faster-whisper-v2/model/model.py
@@ -1,11 +1,13 @@
 import base64
 import logging
-import os
 import tempfile
 import time
 from typing import Dict
-
+import numpy as np
+import torch
 import httpx
+import ffmpeg
+import io
 from faster_whisper import BatchedInferencePipeline, WhisperModel
 
 DEFAULT_BATCH_SIZE = 8
@@ -30,19 +32,68 @@ def load(self):
     def base64_to_wav(self, base64_string):
         binary_data = base64.b64decode(base64_string)
         with tempfile.NamedTemporaryFile(
-            suffix=".wav", delete=False
+                suffix=".wav", delete=False
         ) as output_file_path:
             output_file_path.write(binary_data)
             output_file_path.flush()
         return output_file_path.name
 
+    def audio_url_to_waveform(self, path_or_url: str):
+        sampling_rate = 16000
+        # Use ffmpeg to read the audio file and convert it to mono 16kHz audio
+        out, _ = (
+            ffmpeg.input(
+                path_or_url, seekable=0
+            )  # Disable HTTP seekable (range requests)
+            .output("pipe:", format="wav", acodec="pcm_s16le", ac=1, ar=sampling_rate)
+            .run(capture_stdout=True, capture_stderr=True)
+        )
+
+        # Convert the raw byte data into a numpy array
+        waveform_np = np.frombuffer(out, dtype=np.int16)
+
+        # Normalize the waveform data
+        waveform_np = waveform_np.astype(np.float32) / 32768.0
+
+        # Convert the numpy array to a pytorch tensor
+        waveform_tensor = torch.tensor(waveform_np, dtype=torch.float32)
+
+        return waveform_tensor
+
+    def base64_to_waveform(self, base64_audio: str):
+        sampling_rate = 16000
+
+        # Decode the Base64 string to get the raw audio bytes
+        audio_bytes = base64.b64decode(base64_audio)
+
+        # Convert the raw audio bytes to a binary stream for ffmpeg to process
+        audio_stream = io.BytesIO(audio_bytes)
+
+        # Use ffmpeg to process the raw bytes and convert to monochannel, 16kHz wav format
+        out, _ = (
+            ffmpeg.input('pipe:0')  # Input is from pipe
+            .output("pipe:", format="wav", acodec="pcm_s16le", ac=1, ar=sampling_rate)
+            .run(input=audio_stream.read(), capture_stdout=True, capture_stderr=True)
+        )
+
+        # Convert the raw byte data into a numpy array
+        waveform_np = np.frombuffer(out, dtype=np.int16)
+
+        # Normalize the waveform data
+        waveform_np = waveform_np.astype(np.float32) / 32768.0
+
+        # Convert the numpy array to a pytorch tensor
+        waveform_tensor = torch.tensor(waveform_np, dtype=torch.float32)
+
+        return waveform_tensor
+
     def download_file(self, url):
         with httpx.Client() as client:
             response = client.get(url, timeout=500)
             if response.status_code == 200:
                 # Save the file to a local file
                 with tempfile.NamedTemporaryFile(
-                    suffix=".wav", delete=False
+                        suffix=".wav", delete=False
                 ) as output_file_path:
                     output_file_path.write(response.content)
                     output_file_path.flush()
@@ -79,16 +130,19 @@ def preprocess(self, request: Dict) -> Dict:
         }
 
         if audio_base64:
-            file_name = self.base64_to_wav(audio_base64)
-            response["audio"] = file_name
+            waveform = self.base64_to_waveform(audio_base64)
+            response["audio"] = waveform
+            # file_name = self.base64_to_wav(audio_base64)
+            # response["audio"] = file_name
 
         elif url:
             start = time.time()
-            file_name = self.download_file(url)
+            # file_name = self.download_file(url)
+            waveform = self.audio_url_to_waveform(url)
             logging.info(
                 f"url download time: {time.time() - start}",
             )
-            response["audio"] = file_name
+            response["audio"] = waveform
 
         response["word_timestamps"] = word_level_timestamps
         response["initial_prompt"] = prompt
@@ -107,39 +161,33 @@ def predict(self, model_input: Dict):
         full_transcript = ""
         audio = model_input.pop("audio")
 
-        if audio:
-            segments, info = self.batched_model.transcribe(audio, **model_input)
-            for segment in segments:
-                segment_information = {
-                    "text": segment.text,
-                    "start": segment.start,
-                    "end": segment.end,
-                }
-
-                words = []
-                if segment.words:
-                    for word in segment.words:
-                        words.append(
-                            {"start": word.start, "end": word.end, "word": word.word}
-                        )
-
-                segment_information["words"] = words
+        segments, info = self.batched_model.transcribe(audio, **model_input)
+        for segment in segments:
+            segment_information = {
+                "text": segment.text,
+                "start": segment.start,
+                "end": segment.end,
+            }
 
-                all_segments.append(segment_information)
-                full_transcript += segment.text
+            words = []
+            if segment.words:
+                for word in segment.words:
+                    words.append(
+                        {"start": word.start, "end": word.end, "word": word.word}
+                    )
 
-            language = info.language
-            end = time.time()
-            transcription_time = end - start
+            segment_information["words"] = words
 
-            # Delete temp file
-            os.remove(audio)
+            all_segments.append(segment_information)
+            full_transcript += segment.text
 
-            return {
-                "segments": all_segments,
-                "language": language,
-                "transcript": full_transcript,
-                "transcription_time": transcription_time,
-            }
+        language = info.language
+        end = time.time()
+        transcription_time = end - start
 
-        return model_input
+        return {
+            "segments": all_segments,
+            "language": language,
+            "transcript": full_transcript,
+            "transcription_time": transcription_time,
+        }

From ede16e7e90d68d6e3dfd2038fa876e89ab109b4f Mon Sep 17 00:00:00 2001
From: Het Trivedi
Date: Tue, 8 Oct 2024 10:30:39 -0700
Subject: [PATCH 4/4] fixing lint

---
 whisper/faster-whisper-v2/config.yaml    |  2 +-
 whisper/faster-whisper-v2/model/model.py | 13 +++++++------
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/whisper/faster-whisper-v2/config.yaml b/whisper/faster-whisper-v2/config.yaml
index 63aa7578d..4b3b55549 100644
--- a/whisper/faster-whisper-v2/config.yaml
+++ b/whisper/faster-whisper-v2/config.yaml
@@ -25,5 +25,5 @@ resources:
   memory: 512Mi
   use_gpu: true
 secrets: {}
-system_packages: 
+system_packages:
   - ffmpeg
diff --git a/whisper/faster-whisper-v2/model/model.py b/whisper/faster-whisper-v2/model/model.py
index a9fa3a520..99f76c3a9 100644
--- a/whisper/faster-whisper-v2/model/model.py
+++ b/whisper/faster-whisper-v2/model/model.py
@@ -1,13 +1,14 @@
 import base64
+import io
 import logging
 import tempfile
 import time
 from typing import Dict
+
+import ffmpeg
+import httpx
 import numpy as np
 import torch
-import httpx
-import ffmpeg
-import io
 from faster_whisper import BatchedInferencePipeline, WhisperModel
 
 DEFAULT_BATCH_SIZE = 8
@@ -32,7 +33,7 @@ def load(self):
     def base64_to_wav(self, base64_string):
         binary_data = base64.b64decode(base64_string)
         with tempfile.NamedTemporaryFile(
-                suffix=".wav", delete=False
+            suffix=".wav", delete=False
         ) as output_file_path:
             output_file_path.write(binary_data)
             output_file_path.flush()
@@ -71,7 +72,7 @@ def base64_to_waveform(self, base64_audio: str):
 
         # Use ffmpeg to process the raw bytes and convert to monochannel, 16kHz wav format
         out, _ = (
-            ffmpeg.input('pipe:0')  # Input is from pipe
+            ffmpeg.input("pipe:0")  # Input is from pipe
             .output("pipe:", format="wav", acodec="pcm_s16le", ac=1, ar=sampling_rate)
             .run(input=audio_stream.read(), capture_stdout=True, capture_stderr=True)
         )
@@ -93,7 +94,7 @@ def download_file(self, url):
             if response.status_code == 200:
                 # Save the file to a local file
                 with tempfile.NamedTemporaryFile(
-                        suffix=".wav", delete=False
+                    suffix=".wav", delete=False
                 ) as output_file_path:
                     output_file_path.write(response.content)
                     output_file_path.flush()
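---
Usage note (illustrative, not part of the series): with all four patches applied, the
model accepts either a base64-encoded "audio" field or a "url" field, plus the optional
decoding parameters read in preprocess() (word_timestamps, prompt, temperature,
batch_size, beam_size, best_of, language). The sketch below shows one way a client
might call the deployed model; the endpoint URL shape and Api-Key header follow
Baseten's usual serving convention, and <MODEL_ID> / <API_KEY> are placeholders, so
treat it as an assumption rather than something defined by these patches.

    import base64

    import httpx

    MODEL_URL = "https://model-<MODEL_ID>.api.baseten.co/production/predict"
    HEADERS = {"Authorization": "Api-Key <API_KEY>"}


    def transcribe_url(audio_url: str) -> dict:
        # Remote file: the server streams it through ffmpeg into a 16kHz waveform.
        payload = {
            "url": audio_url,          # mutually exclusive with "audio"
            "word_timestamps": True,   # per-word timings inside each segment
            "batch_size": 8,           # forwarded to BatchedInferencePipeline
        }
        resp = httpx.post(MODEL_URL, json=payload, headers=HEADERS, timeout=600)
        resp.raise_for_status()
        # predict() returns: segments, language, transcript, transcription_time
        return resp.json()


    def transcribe_file(path: str) -> dict:
        # Local file: send the raw bytes as a base64 string in the "audio" field.
        with open(path, "rb") as f:
            audio_b64 = base64.b64encode(f.read()).decode("utf-8")
        resp = httpx.post(
            MODEL_URL, json={"audio": audio_b64}, headers=HEADERS, timeout=600
        )
        resp.raise_for_status()
        return resp.json()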