@@ -13,26 +13,27 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.
1313numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
1414
1515# Setup cleanup
16- remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
16+ remove_docker_container() { docker rm -f cpu-test-"$NUMA_NODE" cpu-test-avx2-"$NUMA_NODE" || true; }
1717trap remove_docker_container EXIT
1818remove_docker_container
1919
2020# Run the image, setting --shm-size=4g for tensor parallel.
2121docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
22- --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
22+ --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test
2323docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
24- --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
24+ --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2-"$NUMA_NODE" cpu-test-avx2
2525
2626function cpu_tests() {
2727 set -e
28+ export NUMA_NODE=$2
2829
2930 # offline inference
30- docker exec cpu-test-avx2 bash -c "
31+ docker exec cpu-test-avx2-"$NUMA_NODE" bash -c "
3132 set -e
3233 python3 examples/offline_inference.py"
3334
3435 # Run basic model test
35- docker exec cpu-test bash -c "
36+ docker exec cpu-test-"$NUMA_NODE" bash -c "
3637 set -e
3738 pip install pytest pytest-asyncio \
3839 decord einops librosa peft Pillow sentence-transformers soundfile \
@@ -45,20 +46,26 @@ function cpu_tests() {
4546 pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
4647
4748 # Run compressed-tensor test
48- docker exec cpu-test bash -c "
49+ docker exec cpu-test-"$NUMA_NODE" bash -c "
4950 set -e
5051 pytest -s -v \
5152 tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
5253 tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
5354
5455 # Run AWQ test
55- docker exec cpu-test bash -c "
56+ docker exec cpu-test-"$NUMA_NODE" bash -c "
5657 set -e
5758 pytest -s -v \
5859 tests/quantization/test_ipex_quant.py"
5960
61+ # Run chunked-prefill and prefix-cache test
62+ docker exec cpu-test-"$NUMA_NODE" bash -c "
63+ set -e
64+ pytest -s -v -k cpu_model \
65+ tests/basic_correctness/test_chunked_prefill.py"
66+
6067 # online inference
61- docker exec cpu-test bash -c "
68+ docker exec cpu-test-"$NUMA_NODE" bash -c "
6269 set -e
6370 export VLLM_CPU_KVCACHE_SPACE=10
6471 export VLLM_CPU_OMP_THREADS_BIND=$1
@@ -75,4 +82,4 @@ function cpu_tests() {
7582
7683# All of CPU tests are expected to be finished less than 25 mins.
7784export -f cpu_tests
78- timeout 25m bash -c "cpu_tests $CORE_RANGE"
85+ timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
0 commit comments