Update on-prem vllm scripts and readme

WuhanMonkey · subramen · commit 2828fd020128 · 2024-06-26T16:54:34.000-04:00
diff --git a/recipes/benchmarks/inference_throughput/on-prem/README.md b/recipes/benchmarks/inference_throughput/on-prem/README.md
@@ -37,3 +37,5 @@ To run pretrained model benchmark, follow the command below.
 ```
 python pretrained_vllm_benchmark.py
 ```
+
+For more details on vLLM benchmarks, refer to the official vLLM GitHub repo [here](https://github.com/vllm-project/vllm/tree/main/benchmarks).
diff --git a/recipes/benchmarks/inference_throughput/on-prem/vllm/chat_vllm_benchmark.py b/recipes/benchmarks/inference_throughput/on-prem/vllm/chat_vllm_benchmark.py
@@ -4,7 +4,6 @@
 import csv
 import json
 import time
-import random
 import threading
 import numpy as np
 import requests
@@ -18,7 +17,7 @@
 from azure.ai.contentsafety.models import AnalyzeTextOptions
 
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Dict, Tuple, List
+from typing import Tuple, List
 
 
 
diff --git a/recipes/benchmarks/inference_throughput/on-prem/vllm/parameters.json b/recipes/benchmarks/inference_throughput/on-prem/vllm/parameters.json
@@ -1,7 +1,7 @@
 {
     "MAX_NEW_TOKENS" : 256,
     "CONCURRENT_LEVELS" : [1, 2, 4, 8, 16, 32, 64, 128, 256],
-    "MODEL_PATH" : "meta-llama/Meta-Llama-3-70B-Instruct",
+    "MODEL_PATH" : "meta-llama/your-model-path",
     "MODEL_HEADERS" : {"Content-Type": "application/json"},
     "SAFE_CHECK" : true,
     "THRESHOLD_TPS" : 7,
diff --git a/recipes/benchmarks/inference_throughput/on-prem/vllm/pretrained_vllm_benchmark.py b/recipes/benchmarks/inference_throughput/on-prem/vllm/pretrained_vllm_benchmark.py
@@ -18,7 +18,7 @@
 from azure.ai.contentsafety.models import AnalyzeTextOptions
 
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Dict, Tuple, List
+from typing import Tuple, List
 
 
 # Predefined inputs

Original file line number	Diff line number	Diff line change
`@@ -1,7 +1,7 @@`
`1`	`1`	`{`
`2`	`2`	`"MAX_NEW_TOKENS" : 256,`
`3`	`3`	`"CONCURRENT_LEVELS" : [1, 2, 4, 8, 16, 32, 64, 128, 256],`
`4`		`- "MODEL_PATH" : "meta-llama/Meta-Llama-3-70B-Instruct",`
	`4`	`+ "MODEL_PATH" : "meta-llama/your-model-path",`
`5`	`5`	`"MODEL_HEADERS" : {"Content-Type": "application/json"},`
`6`	`6`	`"SAFE_CHECK" : true,`
`7`	`7`	`"THRESHOLD_TPS" : 7,`