
Commit 61280fe

flan t5 trtllm follow up (#222)
* Stop early: added a stopping criteria for this; couldn't find another way
* Print inference time
* Use hf_access_token from secrets
1 parent c7d2151 commit 61280fe

File tree

3 files changed: +26 -2 lines changed


tensorrt-llm/flan-t5-trt-llm/config.yaml

Lines changed: 2 additions & 0 deletions
@@ -16,3 +16,5 @@ resources:
   use_gpu: true
 runtime:
   predict_concurrency: 1
+secrets:
+  hf_access_token: placeholder__bound_at_runtime
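The placeholder__bound_at_runtime value is only a stand-in; the real Hugging Face token is bound to the hf_access_token secret when the model is deployed, and the model reads it through the constructor kwargs. A minimal sketch of that pattern, assuming Truss passes secrets as a dict-like object under kwargs["secrets"] (the class and attribute names here are illustrative, not part of the commit):

# Sketch only: reading an optional HF token from Truss-style secrets.
# An empty or unset value is normalized to None so that downloads of
# public repos fall back to anonymous access.
class Model:
    def __init__(self, **kwargs):
        secrets = kwargs["secrets"]           # dict-like, per the diff below
        token = secrets["hf_access_token"]    # placeholder is replaced at runtime
        self._hf_access_token = token or None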

tensorrt-llm/flan-t5-trt-llm/model/model.py

Lines changed: 14 additions & 2 deletions
@@ -1,9 +1,11 @@
+import time
+
 import torch
 from enc_dec.enc_dec_model import TRTLLMEncDecModel
 from huggingface_hub import snapshot_download
 from transformers import AutoConfig, AutoTokenizer

-HF_MODEL_NAME = "google-t5/t5-large"
+HF_MODEL_NAME = "google/flan-t5-large"
 DEFAULT_MAX_NEW_TOKENS = 20


@@ -14,9 +16,17 @@ def __init__(self, **kwargs):
         self._engine_repo = model_metadata["engine_repository"]
         self._engine_name = model_metadata["engine_name"]
         self._beam_width = model_metadata["beam_width"]
+        self._secrets = kwargs["secrets"]
+        self._hf_access_token = self._secrets["hf_access_token"]
+        if not self._hf_access_token:
+            self._hf_access_token = None

     def load(self):
-        snapshot_download(repo_id=self._engine_repo, local_dir=self._engine_dir)
+        snapshot_download(
+            repo_id=self._engine_repo,
+            local_dir=self._engine_dir,
+            token=self._hf_access_token,
+        )
         self._tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_NAME)
         model_config = AutoConfig.from_pretrained(HF_MODEL_NAME)
         self._decoder_start_token_id = model_config.decoder_start_token_id
@@ -25,6 +35,7 @@ def load(self):
         )

     def predict(self, model_input):
+        start_time = time.time()
         try:
             input_text = model_input.pop("prompt")
             max_new_tokens = model_input.pop("max_new_tokens", DEFAULT_MAX_NEW_TOKENS)
@@ -57,6 +68,7 @@ def predict(self, model_input):
                 output_ids, skip_special_tokens=True
             )
             decoded_output.append(output_text)
+            print(f"Inference time: {(time.time() - start_time)*1000}ms")
             return {"status": "success", "data": decoded_output}
         except Exception as exc:
             return {"status": "error", "data": None, "message": str(exc)}

tensorrt-llm/flan-t5-trt-llm/packages/enc_dec/enc_dec_model.py

Lines changed: 10 additions & 0 deletions
@@ -497,6 +497,15 @@ def generate(
         )
         torch.cuda.synchronize()

+        # TODO(pankaj) Figure out a better way to stop this.
+        # Using stopping criteria is expensive, but I couldn't find
+        # another way of stopping generation early.
+        def stopping_criteria(
+            step: int, input_ids: torch.Tensor, scores: torch.Tensor
+        ) -> bool:
+            # If the generated token is EOS, stop.
+            return input_ids[0][step + 1] == eos_token_id
+
         output = self.decoder_session.decode(
             decoder_input_ids,
             decoder_input_lengths,
@@ -505,6 +514,7 @@ def generate(
             encoder_input_lengths=encoder_input_lengths,
             return_dict=return_dict,
             cross_attention_mask=cross_attention_mask,
+            stopping_criteria=stopping_criteria,
         )
         torch.cuda.synchronize()
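The callback receives the decode step and the running input_ids buffer. Assuming index 0 of input_ids[0] holds the decoder start token, the token produced at a given step sits at index step + 1, so the callback returns True as soon as that token is EOS and decoding stops early instead of running to the token limit. A toy check of that indexing (tensor values are made up, not from the model):

# Toy example only: verifying the step + 1 indexing used above.
import torch

eos_token_id = 1
# [decoder_start, tok@step0, tok@step1, tok@step2, ...]; EOS generated at step 2.
input_ids = torch.tensor([[0, 42, 17, 1, 0]])

def stopping_criteria(step: int, input_ids: torch.Tensor, scores: torch.Tensor) -> bool:
    return bool(input_ids[0][step + 1] == eos_token_id)

for step in range(4):
    if stopping_criteria(step, input_ids, None):
        print(f"stop at step {step}")  # -> stop at step 2
        break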
