11import argparse
22import os
33import torch
4+ import json
45from tensorrt_llm .runtime import ModelRunnerCpp
56from tensorrt_llm .bindings import GptJsonConfig
67import numpy as np
7-
8+ from collections import OrderedDict
9+ from pathlib import Path
810from whisper_utils import log_mel_spectrogram , get_tokenizer
911import evaluate
1012from normalizer import data_utils
# Word-error-rate metric from the HuggingFace `evaluate` library, loaded
# once at module import time and shared by all scoring code in this file.
wer_metric = evaluate.load("wer")
def read_config(component, engine_dir):
    """Load and flatten the TensorRT-LLM engine config for one component.

    Reads ``<engine_dir>/<component>/config.json`` and merges its
    ``pretrained_config`` and ``build_config`` sections into a single
    flat mapping.  On duplicate keys, ``build_config`` values win
    because they are applied last.

    Args:
        component: Engine sub-directory name, e.g. ``'encoder'`` or
            ``'decoder'``.
        engine_dir: Root engine directory (``str`` or ``Path``).

    Returns:
        OrderedDict: The merged configuration.

    Raises:
        FileNotFoundError: If ``config.json`` is missing.
        KeyError: If either expected section is absent from the file.
    """
    config_path = Path(engine_dir) / component / 'config.json'
    # JSON is UTF-8 by specification; be explicit instead of relying on
    # the platform default encoding (which can differ, e.g. on Windows).
    with open(config_path, 'r', encoding='utf-8') as f:
        config = json.load(f)
    model_config = OrderedDict()
    model_config.update(config['pretrained_config'])
    model_config.update(config['build_config'])
    return model_config
30+
1931class WhisperTRTLLM (object ):
2032
2133 def __init__ (self ,
2234 engine_dir ,
2335 assets_dir = "assets" ,
2436 batch_size = 64 ):
25- tokenizer_name = "multilingual"
26- assert (Path (assets_dir ) / "multilingual.tiktoken" ).exists (
27- ), "multilingual.tiktoken file is not existed in assets_dir"
28-
37+ encoder_config = read_config ('encoder' , engine_dir )
38+ decoder_config = read_config ('decoder' , engine_dir )
39+ self .n_mels = encoder_config ['n_mels' ]
40+ self .num_languages = encoder_config ['num_languages' ]
41+ is_multilingual = (decoder_config ['vocab_size' ] >= 51865 )
42+ if is_multilingual :
43+ tokenizer_name = "multilingual"
44+ assert (Path (assets_dir ) / "multilingual.tiktoken" ).exists (
45+ ), "multilingual.tiktoken file is not existed in assets_dir"
46+ else :
47+ tokenizer_name = "gpt2"
48+ assert (Path (assets_dir ) / "gpt2.tiktoken" ).exists (
49+ ), "gpt2.tiktoken file is not existed in assets_dir"
50+ self .text_prefix = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>" if is_multilingual else "<|startoftranscript|><|notimestamps|>"
2951 self .tokenizer = get_tokenizer (name = tokenizer_name ,
30- num_languages = 100 ,
52+ num_languages = self . num_languages ,
3153 tokenizer_dir = assets_dir )
3254 self .eot_id = self .tokenizer .encode (
3355 "<|endoftext|>" ,
@@ -43,7 +65,6 @@ def __init__(self,
4365 debug_mode = False ,
4466 kv_cache_free_gpu_memory_fraction = 0.9 )
4567 self .model_runner_cpp = ModelRunnerCpp .from_dir (** runner_kwargs )
46- self .n_mels = 128
4768
4869 def process_single_batch (self , mel_batch , decoder_input_ids , mel_input_lengths , max_new_tokens ):
4970 outputs = self .model_runner_cpp .generate (
@@ -66,9 +87,9 @@ def process_single_batch(self, mel_batch, decoder_input_ids, mel_input_lengths,
6687 texts .append (text )
6788 return texts
6889
69- def process_batch (self , mel , mel_input_lengths , text_prefix = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>" , num_threads = 4 , max_new_tokens = 96 ):
90+ def process_batch (self , mel , mel_input_lengths , num_threads = 4 , max_new_tokens = 96 ):
7091 prompt_id = self .tokenizer .encode (
71- text_prefix , allowed_special = self .tokenizer .special_tokens_set )
92+ self . text_prefix , allowed_special = self .tokenizer .special_tokens_set )
7293 prompt_id = torch .tensor (prompt_id )
7394 batch_size = len (mel )
7495 decoder_input_ids = prompt_id .repeat (batch_size , 1 )
0 commit comments