Commit 3d914de

WuhanMonkey authored and subramen committed
Update Azure API benchmark scripts
As part of the refactor, use the model path to retrieve the tokenizer from the Hugging Face Hub.
1 parent a29ae75 commit 3d914de

File tree

3 files changed: +12 −14 lines


recipes/benchmarks/inference_throughput/cloud-api/azure/chat_azure_api_benchmark.py

Lines changed: 5 additions & 7 deletions

```diff
@@ -10,6 +10,7 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Dict, Tuple, List
 
+# Add your own prompt in input.jsonl for testing.
 with open('input.jsonl') as input:
     prompt_data = json.load(input)
 
@@ -23,23 +24,20 @@
 CONCURRENT_LEVELS = params["CONCURRENT_LEVELS"]
 # Threshold for tokens per second below which we deem the query to be slow
 THRESHOLD_TPS = params["THRESHOLD_TPS"]
-# Default Llama 2 tokenizer, replace with your own tokenizer
-TOKENIZER_PATH = params["TOKENIZER_PATH"]
+MODEL_PATH = params["MODEL_PATH"]
 TEMPERATURE = params["TEMPERATURE"]
 TOP_P = params["TOP_P"]
 # Model endpoint provided with API provider
 MODEL_ENDPOINTS = params["MODEL_ENDPOINTS"]
 API_KEY = params["API_KEY"]
 SYS_PROMPT = params["SYS_PROMPT"]
 
-
-# This tokenizer is downloaded from Azure model catalog for each specific models. The main purpose is to decode the reponses for token calculation
-tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH)
+# This tokenizer is downloaded from huggingface based on MODEL_PATH. Llama 3 use tiktoken tokenizer which is different from Llama 2.
+tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
 
 num_token_input_prompt = len(tokenizer.encode(PROMPT))
 print(f"Number of token for input prompt: {num_token_input_prompt}")
 
-
 def generate_text() -> Tuple[int, int]:
 
     #Configure payload data sending to API endpoint
@@ -49,7 +47,7 @@ def generate_text() -> Tuple[int, int]:
         "max_tokens": MAX_NEW_TOKEN,
         "temperature": TEMPERATURE,
         "top_p" : TOP_P,
-        "stream": "False"
+        "stream": False
     }
     body = str.encode(json.dumps(payload))
     url = MODEL_ENDPOINTS
```
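
To make the refactor concrete, here is a minimal sketch of the request path the chat benchmark now follows: the tokenizer comes from the Hugging Face Hub via `MODEL_PATH`, the `stream` flag is a real boolean, and the response is decoded only to count output tokens. The `messages` shape, the auth header, and the response-parsing keys below are assumptions about the chat-completions API, not code lifted from the full script.

```python
import json
import urllib.request

import transformers

# Load benchmark parameters (keys as in parameters.json below).
with open("parameters.json") as f:
    params = json.load(f)

# Tokenizer now fetched from the Hugging Face Hub via MODEL_PATH
# (Llama 3 ships a tiktoken-based tokenizer, unlike Llama 2).
tokenizer = transformers.AutoTokenizer.from_pretrained(params["MODEL_PATH"])

payload = {
    # The "messages" shape is an assumption based on the chat-completions
    # endpoint; the diff only shows the keys below it.
    "messages": [
        {"role": "system", "content": params["SYS_PROMPT"]},
        {"role": "user", "content": "Say hello."},
    ],
    "max_tokens": params["MAX_NEW_TOKEN"],
    "temperature": params["TEMPERATURE"],
    "top_p": params["TOP_P"],
    "stream": False,  # a real boolean now, not the string "False"
}

# Auth header scheme is an assumption; some Azure deployments expect an
# "api-key" header rather than a Bearer token.
req = urllib.request.Request(
    params["MODEL_ENDPOINTS"],
    data=json.dumps(payload).encode(),
    headers={
        "Content-Type": "application/json",
        "Authorization": "Bearer " + params["API_KEY"],
    },
)
with urllib.request.urlopen(req) as resp:
    result = json.loads(resp.read())

# Decode only to count output tokens, as the benchmark does.
output_text = result["choices"][0]["message"]["content"]
print(f"Output tokens: {len(tokenizer.encode(output_text))}")
```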

recipes/benchmarks/inference_throughput/cloud-api/azure/parameters.json

Lines changed: 3 additions & 3 deletions

```diff
@@ -2,11 +2,11 @@
     "MAX_NEW_TOKEN" : 256,
     "CONCURRENT_LEVELS" : [1, 2, 4, 8, 16, 32, 64],
     "THRESHOLD_TPS" : 7,
-    "TOKENIZER_PATH" : "../../tokenizer",
-    "RANDOM_PROMPT_LENGTH" : 1000,
+    "MODEL_PATH" : "meta-llama/your-model-path",
+    "RANDOM_PROMPT_LENGTH" : 25,
     "TEMPERATURE" : 0.6,
     "TOP_P" : 0.9,
-    "MODEL_ENDPOINTS" : "https://your-endpoint.inference.ai.azure.com/v1/completions",
+    "MODEL_ENDPOINTS" : "https://your-endpoint.inference.ai.azure.com/v1/chat/completions",
     "API_KEY" : "your-auth-key",
     "SYS_PROMPT" : "You are a helpful assistant."
 }
```
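
Assembled from the hunk above (the opening `{` on line 1 sits outside the hunk but is implied), the updated parameters.json reads:

```json
{
    "MAX_NEW_TOKEN" : 256,
    "CONCURRENT_LEVELS" : [1, 2, 4, 8, 16, 32, 64],
    "THRESHOLD_TPS" : 7,
    "MODEL_PATH" : "meta-llama/your-model-path",
    "RANDOM_PROMPT_LENGTH" : 25,
    "TEMPERATURE" : 0.6,
    "TOP_P" : 0.9,
    "MODEL_ENDPOINTS" : "https://your-endpoint.inference.ai.azure.com/v1/chat/completions",
    "API_KEY" : "your-auth-key",
    "SYS_PROMPT" : "You are a helpful assistant."
}
```

`MODEL_PATH` replaces the old local `TOKENIZER_PATH`: it is a Hugging Face Hub model id (the value shown is a placeholder), and the endpoint now targets `/v1/chat/completions` rather than `/v1/completions`.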

recipes/benchmarks/inference_throughput/cloud-api/azure/pretrained_azure_api_benchmark.py

Lines changed: 4 additions & 4 deletions

```diff
@@ -11,7 +11,7 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Dict, Tuple, List
 
-# Predefined inputs
+# Predefined inputs - optional
 with open('input.jsonl') as input:
     prompt_data = json.load(input)
 
@@ -23,7 +23,7 @@
 # Threshold for tokens per second below which we deem the query to be slow
 THRESHOLD_TPS = params["THRESHOLD_TPS"]
 # Default Llama 2 tokenizer, replace with your own tokenizer
-TOKENIZER_PATH = params["TOKENIZER_PATH"]
+MODEL_PATH = params["MODEL_PATH"]
 RANDOM_PROMPT_LENGTH = params["RANDOM_PROMPT_LENGTH"]
 TEMPERATURE = params["TEMPERATURE"]
 TOP_P = params["TOP_P"]
@@ -32,8 +32,8 @@
 API_KEY = params["API_KEY"]
 
 
-# This tokenizer is downloaded from Azure model catalog for each specific models. The main purpose is to decode the reponses for token calculation
-tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH)
+# This tokenizer is downloaded from huggingface based on MODEL_PATH. Llama 3 use tiktoken tokenizer which is different from Llama 2.
+tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
 
 # Select vocabulary that is longer than 2 tokens (closer to real words) and close to the English (not foolproof)
 vocab = [token for token in tokenizer.get_vocab().keys() if len(token) > 2 and all(ord(c) < 128 for c in token)]
```
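
The `vocab` filter in that last context line feeds the random-prompt generation this script uses for pretrained models. A minimal sketch of the idea, assuming uniform sampling of `RANDOM_PROMPT_LENGTH` tokens; the helper name and the sampling details are illustrative, not taken from the script:

```python
import random

import transformers

def generate_random_prompt(model_path: str, length: int) -> str:
    """Illustrative helper: build a pseudo-English prompt of `length` words."""
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)
    # Same filter as the script: tokens longer than 2 chars, ASCII-only,
    # so samples look closer to real English words.
    vocab = [
        tok for tok in tokenizer.get_vocab().keys()
        if len(tok) > 2 and all(ord(c) < 128 for c in tok)
    ]
    # Sample with replacement; RANDOM_PROMPT_LENGTH is now 25 (was 1000).
    return " ".join(random.choices(vocab, k=length))

# Example usage (the model id is a placeholder):
# print(generate_random_prompt("meta-llama/your-model-path", 25))
```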
