
Commit c96ac50

Remove local tokenizer requirement for vllm on prem throughput benchmark (meta-llama#514)
2 parents 77fc13f + 816b25f

File tree

3 files changed: +4 additions, -9 deletions


recipes/benchmarks/inference_throughput/on-prem/vllm/chat_vllm_benchmark.py

Lines changed: 2 additions & 4 deletions
@@ -40,8 +40,6 @@
 SAFE_CHECK = params["SAFE_CHECK"]
 # Threshold for tokens per second below which we deem the query to be slow
 THRESHOLD_TPS = params["THRESHOLD_TPS"]
-# Default Llama tokenizer, replace with your own tokenizer
-TOKENIZER_PATH = params["TOKENIZER_PATH"]
 TEMPERATURE = params["TEMPERATURE"]
 TOP_P = params["TOP_P"]
 # Add your model endpoints here, specify the port number. You can acquire the endpoint when creating a on-prem server like vLLM.
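As context for the THRESHOLD_TPS setting kept above: it is the tokens-per-second floor below which a query is reported as slow. A minimal sketch of that check, with hypothetical helper and argument names (the benchmark's own bookkeeping may differ):

def is_slow_request(num_output_tokens: int, latency_seconds: float, threshold_tps: float) -> bool:
    # Tokens generated per second of wall-clock latency; anything below the threshold counts as slow.
    return (num_output_tokens / latency_seconds) < threshold_tps

is_slow_request(256, 40.0, threshold_tps=7)  # True: 6.4 tokens/s is below the 7 tokens/s threshold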
@@ -55,8 +53,8 @@
 print("No available GPUs")


-# This tokenizer is downloaded from Azure model catalog for each specific models. The main purpose is to decode the reponses for token calculation
-tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH)
+# This tokenizer is downloaded from HuggingFace based on the model path you set. Note that Llama 3 uses a different tokenizer than Llama 2.
+tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)

 num_token_input_prompt = len(tokenizer.encode(PROMPT))
 print(f"Number of token for input prompt: {num_token_input_prompt}")

recipes/benchmarks/inference_throughput/on-prem/vllm/parameters.json

Lines changed: 0 additions & 1 deletion
@@ -5,7 +5,6 @@
 "MODEL_HEADERS" : {"Content-Type": "application/json"},
 "SAFE_CHECK" : true,
 "THRESHOLD_TPS" : 7,
-"TOKENIZER_PATH" : "../../tokenizer",
 "RANDOM_PROMPT_LENGTH" : 1000,
 "TEMPERATURE" : 0.6,
 "TOP_P" : 0.9,

recipes/benchmarks/inference_throughput/on-prem/vllm/pretrained_vllm_benchmark.py

Lines changed: 2 additions & 4 deletions
@@ -36,8 +36,6 @@
 SAFE_CHECK = params["SAFE_CHECK"]
 # Threshold for tokens per second below which we deem the query to be slow
 THRESHOLD_TPS = params["THRESHOLD_TPS"]
-# Replace with your own tokenizer
-TOKENIZER_PATH = params["TOKENIZER_PATH"]
 RANDOM_PROMPT_LENGTH = params["RANDOM_PROMPT_LENGTH"]
 TEMPERATURE = params["TEMPERATURE"]
 TOP_P = params["TOP_P"]
@@ -52,8 +50,8 @@
 print("No available GPUs")


-# This tokenizer is downloaded from Azure model catalog for each specific models. The main purpose is to decode the reponses for token calculation
-tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH)
+# This tokenizer is downloaded from HuggingFace based on the model path you set. Note that Llama 3 uses a different tokenizer than Llama 2.
+tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)

 # Select vocabulary that is longer than 2 tokens (closer to real words) and close to the English (not foolproof)
 vocab = [token for token in tokenizer.get_vocab().keys() if len(token) > 2 and all(ord(c) < 128 for c in token)]
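The pretrained benchmark uses that filtered vocabulary to synthesize prompts of a target token count rather than sending a fixed chat prompt. A minimal sketch of the idea, with a hypothetical helper name (the benchmark's own prompt construction may differ):

import random

def build_random_prompt(tokenizer, num_words: int) -> str:
    # Same filter as above: vocabulary entries longer than 2 characters, ASCII only.
    vocab = [t for t in tokenizer.get_vocab().keys() if len(t) > 2 and all(ord(c) < 128 for c in t)]
    return " ".join(random.choices(vocab, k=num_words))

prompt = build_random_prompt(tokenizer, num_words=1000)  # e.g. RANDOM_PROMPT_LENGTH from parameters.json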
