66
77# Allow binding to different cores
88CORE_RANGE=${CORE_RANGE:- 48-95}
9+ OMP_CORE_RANGE=${OMP_CORE_RANGE:- 48-95}
910NUMA_NODE=${NUMA_NODE:- 1}
1011
1112export CMAKE_BUILD_PARALLEL_LEVEL=32
@@ -23,10 +24,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
2324numactl -C " $CORE_RANGE " -N " $NUMA_NODE " docker build --build-arg VLLM_CPU_DISABLE_AVX512=" true" --tag cpu-test-" $NUMA_NODE " -avx2 --target vllm-test -f docker/Dockerfile.cpu .
2425
2526# Run the image, setting --shm-size=4g for tensor parallel.
26- docker run -itd --entrypoint /bin/bash -v ~ /.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=" $CORE_RANGE " \
27- --cpuset-mems=" $NUMA_NODE " --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-" $NUMA_NODE " cpu-test-" $NUMA_NODE "
28- docker run -itd --entrypoint /bin/bash -v ~ /.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=" $CORE_RANGE " \
29- --cpuset-mems=" $NUMA_NODE " --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-" $NUMA_NODE " -avx2 cpu-test-" $NUMA_NODE " -avx2
27+ docker run -itd --cpuset-cpus=" $CORE_RANGE " --cpuset-mems=" $NUMA_NODE " --entrypoint /bin/bash -v ~ /.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND=" $OMP_CORE_RANGE " --shm-size=4g --name cpu-test-" $NUMA_NODE " cpu-test-" $NUMA_NODE "
28+ docker run -itd --cpuset-cpus=" $CORE_RANGE " --cpuset-mems=" $NUMA_NODE " --entrypoint /bin/bash -v ~ /.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND=" $OMP_CORE_RANGE " --shm-size=4g --name cpu-test-" $NUMA_NODE " -avx2 cpu-test-" $NUMA_NODE " -avx2
3029
3130function cpu_tests() {
3231 set -e
@@ -56,7 +55,7 @@ function cpu_tests() {
5655 # Run AWQ test
5756 docker exec cpu-test-" $NUMA_NODE " bash -c "
5857 set -e
59- pytest -s -v \
58+ VLLM_USE_V1=0 pytest -s -v \
6059 tests/quantization/test_ipex_quant.py"
6160
6261 # Run chunked-prefill and prefix-cache test
@@ -68,8 +67,6 @@ function cpu_tests() {
6867 # online serving
6968 docker exec cpu-test-" $NUMA_NODE " bash -c "
7069 set -e
71- export VLLM_CPU_KVCACHE_SPACE=10
72- export VLLM_CPU_OMP_THREADS_BIND=$1
7370 python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
7471 timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
7572 python3 benchmarks/benchmark_serving.py \
@@ -89,4 +86,4 @@ function cpu_tests() {
8986
9087# All CPU tests are expected to finish within the 1-hour timeout enforced below.
9188export -f cpu_tests
92- timeout 40m bash -c " cpu_tests $CORE_RANGE $NUMA_NODE "
89+ timeout 1h bash -c " cpu_tests $CORE_RANGE $NUMA_NODE "
0 commit comments