Commit 3d914de

WuhanMonkey authored and subramen committed
Update Azure API benchmark scripts
As part of the refactor, use the model path to retrieve the tokenizer from the Hugging Face Hub.
1 parent a29ae75 commit 3d914de

File tree

3 files changed: +12 −14 lines


recipes/benchmarks/inference_throughput/cloud-api/azure/chat_azure_api_benchmark.py

Lines changed: 5 additions & 7 deletions

```diff
@@ -10,6 +10,7 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Dict, Tuple, List
 
+# Add your own prompt in input.jsonl for testing.
 with open('input.jsonl') as input:
     prompt_data = json.load(input)
 
@@ -23,23 +24,20 @@
 CONCURRENT_LEVELS = params["CONCURRENT_LEVELS"]
 # Threshold for tokens per second below which we deem the query to be slow
 THRESHOLD_TPS = params["THRESHOLD_TPS"]
-# Default Llama 2 tokenizer, replace with your own tokenizer
-TOKENIZER_PATH = params["TOKENIZER_PATH"]
+MODEL_PATH = params["MODEL_PATH"]
 TEMPERATURE = params["TEMPERATURE"]
 TOP_P = params["TOP_P"]
 # Model endpoint provided with API provider
 MODEL_ENDPOINTS = params["MODEL_ENDPOINTS"]
 API_KEY = params["API_KEY"]
 SYS_PROMPT = params["SYS_PROMPT"]
 
-
-# This tokenizer is downloaded from Azure model catalog for each specific models. The main purpose is to decode the reponses for token calculation
-tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH)
+# This tokenizer is downloaded from huggingface based on MODEL_PATH. Llama 3 use tiktoken tokenizer which is different from Llama 2.
+tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
 
 num_token_input_prompt = len(tokenizer.encode(PROMPT))
 print(f"Number of token for input prompt: {num_token_input_prompt}")
 
-
 def generate_text() -> Tuple[int, int]:
 
     #Configure payload data sending to API endpoint
@@ -49,7 +47,7 @@ def generate_text() -> Tuple[int, int]:
         "max_tokens": MAX_NEW_TOKEN,
         "temperature": TEMPERATURE,
         "top_p" : TOP_P,
-        "stream": "False"
+        "stream": False
     }
     body = str.encode(json.dumps(payload))
     url = MODEL_ENDPOINTS
```
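
To make the refactor concrete, here is a minimal sketch of the request path the chat benchmark now follows: the tokenizer comes from the Hugging Face Hub via `MODEL_PATH`, the `stream` flag is a real boolean, and the response is decoded only to count output tokens. The `messages` shape, the auth header, and the response-parsing keys below are assumptions about the chat-completions API, not code lifted from the full script.

```python
import json
import urllib.request

import transformers

# Load benchmark parameters (keys as in parameters.json below).
with open("parameters.json") as f:
    params = json.load(f)

# Tokenizer now fetched from the Hugging Face Hub via MODEL_PATH
# (Llama 3 ships a tiktoken-based tokenizer, unlike Llama 2).
tokenizer = transformers.AutoTokenizer.from_pretrained(params["MODEL_PATH"])

payload = {
    # The "messages" shape is an assumption based on the chat-completions
    # endpoint; the diff only shows the keys below it.
    "messages": [
        {"role": "system", "content": params["SYS_PROMPT"]},
        {"role": "user", "content": "Say hello."},
    ],
    "max_tokens": params["MAX_NEW_TOKEN"],
    "temperature": params["TEMPERATURE"],
    "top_p": params["TOP_P"],
    "stream": False,  # a real boolean now, not the string "False"
}

# Auth header scheme is an assumption; some Azure deployments expect an
# "api-key" header rather than a Bearer token.
req = urllib.request.Request(
    params["MODEL_ENDPOINTS"],
    data=json.dumps(payload).encode(),
    headers={
        "Content-Type": "application/json",
        "Authorization": "Bearer " + params["API_KEY"],
    },
)
with urllib.request.urlopen(req) as resp:
    result = json.loads(resp.read())

# Decode only to count output tokens, as the benchmark does.
output_text = result["choices"][0]["message"]["content"]
print(f"Output tokens: {len(tokenizer.encode(output_text))}")
```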

recipes/benchmarks/inference_throughput/cloud-api/azure/parameters.json

Lines changed: 3 additions & 3 deletions

```diff
@@ -2,11 +2,11 @@
     "MAX_NEW_TOKEN" : 256,
     "CONCURRENT_LEVELS" : [1, 2, 4, 8, 16, 32, 64],
     "THRESHOLD_TPS" : 7,
-    "TOKENIZER_PATH" : "../../tokenizer",
-    "RANDOM_PROMPT_LENGTH" : 1000,
+    "MODEL_PATH" : "meta-llama/your-model-path",
+    "RANDOM_PROMPT_LENGTH" : 25,
     "TEMPERATURE" : 0.6,
     "TOP_P" : 0.9,
-    "MODEL_ENDPOINTS" : "https://your-endpoint.inference.ai.azure.com/v1/completions",
+    "MODEL_ENDPOINTS" : "https://your-endpoint.inference.ai.azure.com/v1/chat/completions",
     "API_KEY" : "your-auth-key",
     "SYS_PROMPT" : "You are a helpful assistant."
 }
```
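
Assembled from the hunk above (the opening `{` on line 1 sits outside the hunk but is implied), the updated parameters.json reads:

```json
{
    "MAX_NEW_TOKEN" : 256,
    "CONCURRENT_LEVELS" : [1, 2, 4, 8, 16, 32, 64],
    "THRESHOLD_TPS" : 7,
    "MODEL_PATH" : "meta-llama/your-model-path",
    "RANDOM_PROMPT_LENGTH" : 25,
    "TEMPERATURE" : 0.6,
    "TOP_P" : 0.9,
    "MODEL_ENDPOINTS" : "https://your-endpoint.inference.ai.azure.com/v1/chat/completions",
    "API_KEY" : "your-auth-key",
    "SYS_PROMPT" : "You are a helpful assistant."
}
```

`MODEL_PATH` replaces the old local `TOKENIZER_PATH`: it is a Hugging Face Hub model id (the value shown is a placeholder), and the endpoint now targets `/v1/chat/completions` rather than `/v1/completions`.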

recipes/benchmarks/inference_throughput/cloud-api/azure/pretrained_azure_api_benchmark.py

Lines changed: 4 additions & 4 deletions

```diff
@@ -11,7 +11,7 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Dict, Tuple, List
 
-# Predefined inputs
+# Predefined inputs - optional
 with open('input.jsonl') as input:
     prompt_data = json.load(input)
 
@@ -23,7 +23,7 @@
 # Threshold for tokens per second below which we deem the query to be slow
 THRESHOLD_TPS = params["THRESHOLD_TPS"]
 # Default Llama 2 tokenizer, replace with your own tokenizer
-TOKENIZER_PATH = params["TOKENIZER_PATH"]
+MODEL_PATH = params["MODEL_PATH"]
 RANDOM_PROMPT_LENGTH = params["RANDOM_PROMPT_LENGTH"]
 TEMPERATURE = params["TEMPERATURE"]
 TOP_P = params["TOP_P"]
@@ -32,8 +32,8 @@
 API_KEY = params["API_KEY"]
 
 
-# This tokenizer is downloaded from Azure model catalog for each specific models. The main purpose is to decode the reponses for token calculation
-tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH)
+# This tokenizer is downloaded from huggingface based on MODEL_PATH. Llama 3 use tiktoken tokenizer which is different from Llama 2.
+tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
 
 # Select vocabulary that is longer than 2 tokens (closer to real words) and close to the English (not foolproof)
 vocab = [token for token in tokenizer.get_vocab().keys() if len(token) > 2 and all(ord(c) < 128 for c in token)]
```
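
The `vocab` filter in that last context line feeds the random-prompt generation this script uses for pretrained models. A minimal sketch of the idea, assuming uniform sampling of `RANDOM_PROMPT_LENGTH` tokens; the helper name and the sampling details are illustrative, not taken from the script:

```python
import random

import transformers

def generate_random_prompt(model_path: str, length: int) -> str:
    """Illustrative helper: build a pseudo-English prompt of `length` words."""
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)
    # Same filter as the script: tokens longer than 2 chars, ASCII-only,
    # so samples look closer to real English words.
    vocab = [
        tok for tok in tokenizer.get_vocab().keys()
        if len(tok) > 2 and all(ord(c) < 128 for c in tok)
    ]
    # Sample with replacement; RANDOM_PROMPT_LENGTH is now 25 (was 1000).
    return " ".join(random.choices(vocab, k=length))

# Example usage (the model id is a placeholder):
# print(generate_random_prompt("meta-llama/your-model-path", 25))
```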
