 
 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-48-95}
+# used for TP/PP E2E test
 OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 
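Because these assignments use Bash's ${VAR:-default} fallback, a CI runner can rebind the whole suite to a different socket by exporting the variables before calling the script. A minimal sketch, assuming the script lives at .buildkite/scripts/hardware_ci/run-cpu-test.sh (the path and the socket-0 values are illustrative, not part of this diff):

    # Hypothetical invocation: pin the tests to NUMA node 0, cores 0-47
    CORE_RANGE=0-47 OMP_CORE_RANGE=0-47 NUMA_NODE=0 \
      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh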
@@ -24,8 +25,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 
 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
 
 function cpu_tests() {
   set -e
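The docker run change above drops the global --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" and instead hands the range to the container under the neutral name E2E_OMP_THREADS, so OpenMP thread binding is no longer imposed on every test; the TP/PP online-serving test below opts in by re-exporting it. A minimal sketch of the pattern, using an illustrative alpine container in place of the CI image:

    # The value travels into the container under a neutral name...
    docker run -d -e E2E_OMP_THREADS="48-95" --name env-demo alpine sleep 600
    # ...and an individual test opts in by promoting it to the variable vLLM reads
    docker exec env-demo sh -c 'VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS; echo "$VLLM_CPU_OMP_THREADS_BIND"'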
@@ -48,10 +49,16 @@ function cpu_tests() {
   # Run basic model test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
-    pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
-    pytest -v -s tests/models/language/generation -m cpu_model
-    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model
+    # Note: disabled until V1 is supported
+    # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
+    # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+
+    # Note: Bart is skipped until V1 is supported
+    pytest -v -s tests/models/language/generation -m cpu_model \
+      --ignore=tests/models/language/generation/test_bart.py
+    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
+      --ignore=tests/models/language/generation/test_bart.py
+
     pytest -v -s tests/models/language/pooling -m cpu_model
     pytest -v -s tests/models/multimodal/generation \
       --ignore=tests/models/multimodal/generation/test_mllama.py \
@@ -62,33 +69,26 @@ function cpu_tests() {
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
-      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
-      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
+      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
 
+  # Note: disabled until V1 is supported
   # Run AWQ test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    VLLM_USE_V1=0 pytest -s -v \
-      tests/quantization/test_ipex_quant.py"
-
-  # Run chunked-prefill and prefix-cache test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    pytest -s -v -k cpu_model \
-      tests/basic_correctness/test_chunked_prefill.py"
+  # docker exec cpu-test-"$NUMA_NODE" bash -c "
+  #   set -e
+  #   VLLM_USE_V1=0 pytest -s -v \
+  #     tests/quantization/test_ipex_quant.py"
 
   # online serving
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c '
     set -e
-    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
-    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
-    VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \
+    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
+    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+    python3 benchmarks/benchmark_serving.py \
       --backend vllm \
       --dataset-name random \
-      --model facebook/opt-125m \
+      --model meta-llama/Llama-3.2-3B-Instruct \
       --num-prompts 20 \
-      --endpoint /v1/completions \
-      --tokenizer facebook/opt-125m"
+      --endpoint /v1/completions'
 
   # Run multi-lora tests
   docker exec cpu-test-"$NUMA_NODE" bash -c "
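Note the quoting swap in the online-serving block: the outer quotes of the bash -c script change from double to single so that $E2E_OMP_THREADS is expanded by the shell inside the container, where docker run -e defined it, rather than by the host shell, where it is unset; the inner timeout command's quotes flip from single to double to compensate. A minimal sketch of the difference, reusing the illustrative env-demo container from above:

    docker exec env-demo sh -c "echo $E2E_OMP_THREADS"   # host expands first: prints an empty line
    docker exec env-demo sh -c 'echo $E2E_OMP_THREADS'   # container expands: prints 48-95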