import base64
import gc
import os
import re
from tempfile import NamedTemporaryFile

import torch
from async_batcher.batcher import AsyncBatcher
from huggingface_hub import snapshot_download
from run import WhisperTRTLLM
from torch import Tensor
from whisper_utils import log_mel_spectrogram

TEXT_PREFIX = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"

# Number of beams: how many candidate transcriptions beam search keeps while decoding
NUM_BEAMS = 3

# Maximum time, in seconds, to wait for the batch to fill before running inference
MAX_QUEUE_TIME = 0.25

# Maximum size of the batch. This is dictated by the compiled engine.
MAX_BATCH_SIZE = 8


class MlBatcher(AsyncBatcher[list[Tensor], list[str]]):
    def __init__(self, model, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model: WhisperTRTLLM = model

    def process_batch(self, batch: list[Tensor]) -> list[str]:
        # Combine the queued mel features into a single half-precision batch for the engine
        features = torch.cat(batch, dim=0).type(torch.float16)
        return self.model.process_batch(features, TEXT_PREFIX, NUM_BEAMS)


class Model:
    def __init__(self, **kwargs):
        self._data_dir = kwargs["data_dir"]
        self._model = None
        self._batcher = None
        # Freeze everything the GC currently tracks (move it to the permanent
        # generation) so later collections skip these objects during inference
        gc.freeze()

    def load(self):
        # Download the compiled model from the Hugging Face Hub
        snapshot_download(
            "baseten/trtllm-whisper-a10g-large-v2-1",
            local_dir=self._data_dir,
            max_workers=4,
        )

        self._model = WhisperTRTLLM(f"{self._data_dir}")
        self._batcher = MlBatcher(
            model=self._model,
            max_batch_size=MAX_BATCH_SIZE,
            max_queue_time=MAX_QUEUE_TIME,
        )

    def base64_to_wav(self, base64_string, output_file_path):
        binary_data = base64.b64decode(base64_string)
        with open(output_file_path, "wb") as wav_file:
            wav_file.write(binary_data)
        return output_file_path

    async def predict(self, model_input: dict):
        # TODO: figure out what the normalizer is for
        normalizer = None
        with NamedTemporaryFile() as fp:
            # Decode the base64 payload into a temporary wav file
            self.base64_to_wav(model_input["audio"], fp.name)
            mel, total_duration = log_mel_spectrogram(
                fp.name,
                self._model.n_mels,
                device="cuda",
                return_duration=True,
                mel_filters_dir=f"{self._data_dir}/assets",
            )
            mel = mel.type(torch.float16)
            # Add a leading batch dimension before handing the features to the batcher
            mel = mel.unsqueeze(0)
            prediction = await self._batcher.process(item=mel)

        # Remove all special tokens (e.g. <|notimestamps|>) from the prediction
        prediction = re.sub(r"<\|.*?\|>", "", prediction)
        if normalizer:
            prediction = normalizer(prediction)
        return {"text": prediction.strip(), "duration": total_duration}
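
For reference, predict() expects a JSON body with a single "audio" field holding base64-encoded WAV bytes and returns the transcription plus the audio duration. A minimal client-side sketch follows; the file name, endpoint URL, and use of the requests library are illustrative placeholders, not part of this model:

import base64

import requests  # any HTTP client works

# Build the payload predict() expects ("sample.wav" and the URL below are placeholders)
with open("sample.wav", "rb") as f:
    payload = {"audio": base64.b64encode(f.read()).decode("utf-8")}

resp = requests.post("https://<your-model-endpoint>/predict", json=payload)
print(resp.json())  # e.g. {"text": "...", "duration": ...}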
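
On batching behavior: each call to predict() awaits self._batcher.process(...), and the batcher holds concurrent requests for up to MAX_QUEUE_TIME seconds, or until MAX_BATCH_SIZE items arrive, before calling process_batch once for the whole group. Below is a rough standalone sketch of that pattern, using only the constructor arguments and methods exercised above; the async-batcher library's start-up/shutdown details are glossed over, so treat it as illustrative rather than exact:

import asyncio

from async_batcher.batcher import AsyncBatcher


class DoubleBatcher(AsyncBatcher[list[int], list[int]]):
    def process_batch(self, batch: list[int]) -> list[int]:
        # Invoked once per collected batch, not once per item
        print(f"processing a batch of {len(batch)} items")
        return [x * 2 for x in batch]


async def main():
    batcher = DoubleBatcher(max_batch_size=8, max_queue_time=0.25)
    # Four concurrent callers submitted together should share one process_batch call
    results = await asyncio.gather(*(batcher.process(item=i) for i in range(4)))
    print(results)  # expected: [0, 2, 4, 6]


asyncio.run(main())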