
Commit c7d2151

pankajroark and aspctu authored
Move mythomax model from sq to fp8 (#230)
Co-authored-by: Abu Qader <[email protected]>
1 parent 5e01087 commit c7d2151

File tree: 14 files changed, +45 -22 lines

llama/mythomax-13b-trt-sq/README.md renamed to llama/mythomax-13b-trt-fp8/README.md

Lines changed: 4 additions & 5 deletions
@@ -2,10 +2,9 @@
 
 Based on https://huggingface.co/Gryphe/MythoMax-L2-13b
 
-int8 quantized using smoothquant using `https://huggingface.co/datasets/royallab/PIPPA-cleaned` dataset
-smoothquant alpha value used: 0.5
+fp8 quantized using `https://huggingface.co/datasets/royallab/PIPPA-cleaned` dataset
 
-TensorRT-LLM engine is here: https://huggingface.co/baseten/Gryphe_MythoMax-L2-13b_v0.7.1_H100-80GB-HBM3_2ff724
+TensorRT-LLM engine is here: https://huggingface.co/baseten/Gryphe_MythoMax-L2-13b_v0.7.1_H100-80GB-HBM3_fp8
 
 Max input tokens: 3000
 Max output tokens: 2000
@@ -25,15 +24,15 @@ First, clone this repository:
 
 ```sh
 git clone https://github.com/basetenlabs/truss-examples/
-cd llama/mythomax-13b-trt-sq
+cd llama/mythomax-13b-trt-fp8
 ```
 
 Before deployment:
 
 1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
 2. Install the latest version of Truss: `pip install --upgrade truss`
 
-With `mythomax-13b-trt-sq` as your working directory, you can deploy the model with:
+With `mythomax-13b-trt-fp8` as your working directory, you can deploy the model with:
 
 ```sh
 truss push --publish
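
After `truss push --publish` completes, the published model can be called over HTTP. Below is a minimal sketch in Python, assuming the standard Baseten predict endpoint and two placeholder values, `MODEL_ID` and `BASETEN_API_KEY`, that you supply yourself; the payload mirrors the `example_model_input` in config.yaml.

```python
# Hedged sketch: call the deployed MythoMax model on Baseten.
# MODEL_ID and BASETEN_API_KEY are placeholders, not values from this commit.
import os

import requests

model_id = os.environ["MODEL_ID"]        # e.g. the id shown after `truss push`
api_key = os.environ["BASETEN_API_KEY"]

resp = requests.post(
    f"https://model-{model_id}.api.baseten.co/production/predict",
    headers={"Authorization": f"Api-Key {api_key}"},
    json={"prompt": "What's the meaning of life?", "max_tokens": 1024},
)
print(resp.json())
```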
File renamed without changes.

llama/mythomax-13b-trt-sq/config.yaml renamed to llama/mythomax-13b-trt-fp8/config.yaml

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 base_image:
-  image: docker.io/baseten/trtllm-server:r23.12_baseten_v0.7.1
+  image: docker.io/baseten/trtllm-server:r23.12_baseten_v0.9.0.dev2024022000
   python_executable_path: /usr/bin/python3
 description: Generate text from a prompt with this seven billion parameter language
   model.
@@ -9,7 +9,7 @@ external_package_dirs: []
 model_metadata:
   avatar_url: https://cdn.baseten.co/production/static/explore/meta.png
   cover_image_url: https://cdn.baseten.co/production/static/explore/llama.png
-  engine_repository: baseten/Gryphe_MythoMax-L2-13b_v0.7.1_H100-80GB-HBM3_2ff724
+  engine_repository: baseten/Gryphe_MythoMax-L2-13b_v0.7.1_H100-80GB-HBM3_fp8
   example_model_input:
     max_tokens: 1024
     prompt: What's the meaning of life?
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

llama/mythomax-13b-trt-sq/packages/inflight_batcher_llm/postprocessing/1/model.py renamed to llama/mythomax-13b-trt-fp8/packages/inflight_batcher_llm/postprocessing/1/model.py

Lines changed: 39 additions & 15 deletions
@@ -32,6 +32,8 @@
 import triton_python_backend_utils as pb_utils
 from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer
 
+INVALID_UNICODE_CHAR = "�"
+
 
 class TritonPythonModel:
     """Your Python model must use the same class name. Every Python model
@@ -55,7 +57,8 @@ def initialize(self, args):
         """
         # Parse model configs
         model_config = json.loads(args["model_config"])
-        tokenizer_dir = os.environ["triton_tokenizer_repository"]
+        # NOTE: Keep this in sync with the truss model.py variable
+        tokenizer_dir = os.environ["TRITON_TOKENIZER_REPOSITORY"]
         tokenizer_type = model_config["parameters"]["tokenizer_type"]["string_value"]
 
         if tokenizer_type == "t5":
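
The NOTE above refers to the Truss-side model.py (not part of this diff), which launches the Triton server and must export the same variable name this backend reads. A hedged sketch of that handoff; the repository value and launch command are illustrative, not taken from this commit:

```python
# Hypothetical launcher-side handoff (not the actual truss model.py).
import os
import subprocess

env = os.environ.copy()
# Must stay in sync with postprocessing/1/model.py:
#   tokenizer_dir = os.environ["TRITON_TOKENIZER_REPOSITORY"]
env["TRITON_TOKENIZER_REPOSITORY"] = "Gryphe/MythoMax-L2-13b"  # illustrative value

# The Triton server process inherits the variable from its parent.
subprocess.Popen(
    ["tritonserver", "--model-repository=/packages/inflight_batcher_llm"],
    env=env,
)
```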
@@ -115,24 +118,48 @@ def execute(self, requests):
                 .as_numpy()
                 .flatten()
             )
+
             if len(tokens_batch) == 0:
                 continue
 
             # Postprocess output data
-            prev_token = self._get_prev_token(request_id)
-            self._store_prev_token(request_id, tokens_batch[-1])
+            prev_token = self._get_var(request_id, "prev_token")
+            token_buffer = self._get_var(request_id, "token_buffer")
+            token_buffer = token_buffer if token_buffer is not None else []
+            current_tokens = np.concatenate(
+                (np.array(token_buffer, dtype=int), tokens_batch), dtype=int
+            )
+            current_tokens_decoded = self.tokenizer.decode(current_tokens)
+
+            if len(current_tokens_decoded) == 0:
+                responses.append(pb_utils.InferenceResponse())
+                continue
+
+            if current_tokens_decoded[-1] == INVALID_UNICODE_CHAR:
+                # If the last token is invalid, we need to keep it in the buffer
+                # for the next request to see if this is a multi-token unicode
+                # character.
+                self._store_var(request_id, "token_buffer", current_tokens)
+                responses.append(pb_utils.InferenceResponse())
+                continue
+
             if prev_token is None:
-                delta = self.tokenizer.decode(tokens_batch)
+                delta = current_tokens_decoded
             else:
                 # TODO(pankaj) Figure out how to make tokenizer.decode not
                 # ignore initial whitespace so we can avoid this hack.
                 # Get string with and without previous token and diff. This hack
                 # is needed because tokenizer.decode strips initial whitespace.
-                old_string = self.tokenizer.decode([prev_token])
-                with_prev_token = np.concatenate(([prev_token], tokens_batch))
+                old_string = self.tokenizer.decode(prev_token)
+                with_prev_token = np.concatenate((prev_token, current_tokens))
                 new_string = self.tokenizer.decode(with_prev_token)
                 delta = self._compute_delta(old_string, new_string)
 
+            # The previous token is the last character of the decoded sequence
+            # which includes the multi-token unicode character.
+            self._store_var(request_id, "prev_token", current_tokens)
+            self._store_var(request_id, "token_buffer", None)
+
             # Create output tensor
             output_tensor = pb_utils.Tensor(
                 "OUTPUT", np.array([delta]).astype(self.output_dtype)
@@ -147,22 +174,19 @@ def execute(self, requests):
     def finalize(self):
         print("Cleaning up...")
 
-    def _store_prev_token(self, request_id, token):
+    def _store_var(self, request_id, var_name, var):
         if request_id in self.state_dict:
-            self.state_dict[request_id]["prev_token"] = token
-
-            # Move request ID to end of queue to prevent it from being evicted
+            self.state_dict[request_id][var_name] = var
             self.state_dict.move_to_end(request_id)
         else:
-            # Evict least recently used item if cache is full
             if len(self.state_dict) > self.cache_size:
                 self.state_dict.popitem(last=False)
+            self.state_dict[request_id] = {"prev_token": None, "token_buffer": None}
+            self.state_dict[request_id][var_name] = var
 
-            self.state_dict[request_id] = {"prev_token": token}
-
-    def _get_prev_token(self, request_id):
+    def _get_var(self, request_id, var_name):
         if request_id in self.state_dict:
-            return self.state_dict[request_id]["prev_token"]
+            return self.state_dict[request_id][var_name]
         return None
 
     def _compute_delta(self, prev_str, new_str):
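
The `_store_var`/`_get_var` pair above is a small per-request LRU cache built on `collections.OrderedDict`; `self.state_dict` and `self.cache_size` are set up elsewhere in the file. A hedged standalone sketch of the same pattern, with the cache size chosen arbitrarily for illustration:

```python
# Illustrative per-request state cache with least-recently-used eviction,
# mirroring the OrderedDict pattern used by the postprocessing model.
from collections import OrderedDict

class RequestStateCache:
    def __init__(self, cache_size: int = 1024):
        self.cache_size = cache_size
        self.state_dict: "OrderedDict[str, dict]" = OrderedDict()

    def store_var(self, request_id: str, var_name: str, var) -> None:
        if request_id in self.state_dict:
            self.state_dict[request_id][var_name] = var
            # Mark this request as most recently used.
            self.state_dict.move_to_end(request_id)
        else:
            # Evict the least recently used request if the cache is full.
            if len(self.state_dict) > self.cache_size:
                self.state_dict.popitem(last=False)
            self.state_dict[request_id] = {"prev_token": None, "token_buffer": None}
            self.state_dict[request_id][var_name] = var

    def get_var(self, request_id: str, var_name: str):
        if request_id in self.state_dict:
            return self.state_dict[request_id][var_name]
        return None
```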
File renamed without changes.
