Skip to content

Commit b63f4cb

Browse files
authored
Merge pull request #103 from dtrifiro/rhoai-2.19-sync-with-midstream-0.8.3.0
sync with nm-vllm-ent @ v0.8.3.0-rc0
2 parents 3db53a2 + 09cbae3 commit b63f4cb

File tree

1,375 files changed

+113556
-32336
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

1,375 files changed

+113556
-32336
lines changed

.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@ tasks:
44
- name: "gsm8k"
55
metrics:
66
- name: "exact_match,strict-match"
7-
value: 0.233
7+
value: 0.231
88
- name: "exact_match,flexible-extract"
9-
value: 0.236
9+
value: 0.22
1010
limit: 1000
1111
num_fewshot: 5

.buildkite/lm-eval-harness/test_lm_eval_correctness.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
import lm_eval
1515
import numpy
16+
import pytest
1617
import yaml
1718

1819
RTOL = 0.05
@@ -46,6 +47,10 @@ def test_lm_eval_correctness():
4647
eval_config = yaml.safe_load(
4748
Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
4849

50+
if eval_config[
51+
"model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform": #noqa: E501
52+
pytest.skip("FBGEMM is currently failing on main.")
53+
4954
# Launch eval requests.
5055
results = launch_lm_eval(eval_config)
5156

.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,13 @@ def results_to_json(latency, throughput, serving):
8484
# this result is generated via `benchmark_serving.py`
8585

8686
# attach the benchmarking command to raw_result
87-
with open(test_file.with_suffix(".commands")) as f:
88-
command = json.loads(f.read())
87+
try:
88+
with open(test_file.with_suffix(".commands")) as f:
89+
command = json.loads(f.read())
90+
except OSError as e:
91+
print(e)
92+
continue
93+
8994
raw_result.update(command)
9095

9196
# update the test name of this result
@@ -99,8 +104,13 @@ def results_to_json(latency, throughput, serving):
99104
# this result is generated via `benchmark_latency.py`
100105

101106
# attach the benchmarking command to raw_result
102-
with open(test_file.with_suffix(".commands")) as f:
103-
command = json.loads(f.read())
107+
try:
108+
with open(test_file.with_suffix(".commands")) as f:
109+
command = json.loads(f.read())
110+
except OSError as e:
111+
print(e)
112+
continue
113+
104114
raw_result.update(command)
105115

106116
# update the test name of this result
@@ -121,8 +131,13 @@ def results_to_json(latency, throughput, serving):
121131
# this result is generated via `benchmark_throughput.py`
122132

123133
# attach the benchmarking command to raw_result
124-
with open(test_file.with_suffix(".commands")) as f:
125-
command = json.loads(f.read())
134+
try:
135+
with open(test_file.with_suffix(".commands")) as f:
136+
command = json.loads(f.read())
137+
except OSError as e:
138+
print(e)
139+
continue
140+
126141
raw_result.update(command)
127142

128143
# update the test name of this result

.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -426,7 +426,7 @@ main() {
426426

427427
pip install -U transformers
428428

429-
pip install -r requirements-dev.txt
429+
pip install -r requirements/dev.txt
430430
which genai-perf
431431

432432
# check storage

.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,24 @@ set -x
1010
set -o pipefail
1111

1212
check_gpus() {
13-
# check the number of GPUs and GPU type.
14-
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
13+
if command -v nvidia-smi; then
14+
# check the number of GPUs and GPU type.
15+
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
16+
elif command -v amd-smi; then
17+
declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
18+
fi
19+
1520
if [[ $gpu_count -gt 0 ]]; then
1621
echo "GPU found."
1722
else
1823
echo "Need at least 1 GPU to run benchmarking."
1924
exit 1
2025
fi
21-
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
26+
if command -v nvidia-smi; then
27+
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
28+
elif command -v amd-smi; then
29+
declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
30+
fi
2231
echo "GPU type is $gpu_type"
2332
}
2433

@@ -90,9 +99,15 @@ kill_gpu_processes() {
9099

91100

92101
# wait until GPU memory usage smaller than 1GB
93-
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
94-
sleep 1
95-
done
102+
if command -v nvidia-smi; then
103+
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
104+
sleep 1
105+
done
106+
elif command -v amd-smi; then
107+
while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
108+
sleep 1
109+
done
110+
fi
96111

97112
# remove vllm config file
98113
rm -rf ~/.config/vllm
@@ -309,11 +324,14 @@ run_serving_tests() {
309324

310325
new_test_name=$test_name"_qps_"$qps
311326

327+
# pass the tensor parallel size to the client so that it can be displayed
328+
# on the benchmark dashboard
312329
client_command="python3 benchmark_serving.py \
313330
--save-result \
314331
--result-dir $RESULTS_FOLDER \
315332
--result-filename ${new_test_name}.json \
316333
--request-rate $qps \
334+
--metadata "tensor_parallel_size=$tp" \
317335
$client_args"
318336

319337
echo "Running test case $test_name with qps $qps"
@@ -358,7 +376,7 @@ main() {
358376
# get the current IP address, required by benchmark_serving.py
359377
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
360378
# turn of the reporting of the status of each request, to clean up the terminal output
361-
export VLLM_LOG_LEVEL="WARNING"
379+
export VLLM_LOGGING_LEVEL="WARNING"
362380

363381
# prepare for benchmarking
364382
cd benchmarks || exit 1

.buildkite/nightly-benchmarks/tests/serving-tests.json

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,10 +63,12 @@
6363
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
6464
"disable_log_requests": "",
6565
"tensor_parallel_size": 4,
66-
"swap_space": 16,
67-
"speculative_model": "turboderp/Qwama-0.5B-Instruct",
68-
"num_speculative_tokens": 4,
69-
"speculative_draft_tensor_parallel_size": 1
66+
"swap_space": 16,
67+
"speculative_config": {
68+
"model": "turboderp/Qwama-0.5B-Instruct",
69+
"num_speculative_tokens": 4,
70+
"draft_tensor_parallel_size": 1
71+
}
7072
},
7173
"client_parameters": {
7274
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",

.buildkite/nightly-benchmarks/tests/throughput-tests.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,4 +32,4 @@
3232
"backend": "vllm"
3333
}
3434
}
35-
]
35+
]

.buildkite/release-pipeline.yaml

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,23 @@
11
steps:
2+
- label: "Build wheel - CUDA 12.4"
3+
agents:
4+
queue: cpu_queue_postmerge
5+
commands:
6+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
7+
- "mkdir artifacts"
8+
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
9+
- "bash .buildkite/scripts/upload-wheels.sh"
10+
env:
11+
DOCKER_BUILDKIT: "1"
12+
213
- label: "Build wheel - CUDA 12.1"
314
agents:
415
queue: cpu_queue_postmerge
516
commands:
6-
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
17+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
718
- "mkdir artifacts"
819
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
9-
- "bash .buildkite/upload-wheels.sh"
20+
- "bash .buildkite/scripts/upload-wheels.sh"
1021
env:
1122
DOCKER_BUILDKIT: "1"
1223

@@ -20,10 +31,10 @@ steps:
2031
agents:
2132
queue: cpu_queue_postmerge
2233
commands:
23-
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
34+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
2435
- "mkdir artifacts"
2536
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
26-
- "bash .buildkite/upload-wheels.sh"
37+
- "bash .buildkite/scripts/upload-wheels.sh"
2738
env:
2839
DOCKER_BUILDKIT: "1"
2940

@@ -37,7 +48,7 @@ steps:
3748
queue: cpu_queue_postmerge
3849
commands:
3950
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
40-
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
51+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
4152
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
4253

4354
- label: "Build and publish TPU release image"
@@ -46,7 +57,7 @@ steps:
4657
agents:
4758
queue: tpu_queue_postmerge
4859
commands:
49-
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
60+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
5061
- "docker push vllm/vllm-tpu:nightly"
5162
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
5263
plugins:
@@ -71,7 +82,7 @@ steps:
7182
queue: cpu_queue_postmerge
7283
commands:
7384
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
74-
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --progress plain -f Dockerfile.cpu ."
85+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
7586
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
7687
env:
7788
DOCKER_BUILDKIT: "1"

.buildkite/run-openvino-test.sh

Lines changed: 0 additions & 16 deletions
This file was deleted.

.buildkite/run-tpu-test.sh

Lines changed: 0 additions & 26 deletions
This file was deleted.

0 commit comments

Comments
 (0)