
Commit 5eda21e

[Hardware][CPU] compressed-tensor INT8 W8A8 AZP support (#9344)
1 parent 8e1cddc commit 5eda21e
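
Background for the title: in INT8 W8A8 quantization with an asymmetric activation zero point (AZP), activations quantize as q_a = round(x / s_a) + azp while weights stay symmetric (q_w = round(w / s_w)). The GEMM can fold the zero point out of the inner loop, because sum_k (q_a[k] - azp) * q_w[k] = sum_k q_a[k] * q_w[k] - azp * sum_k q_w[k]. The scalar sketch below illustrates that identity only; it is not kernel code from this commit, and all values are made up.

// Scalar illustration of the W8A8 AZP identity (not this commit's kernels).
#include <cstdint>
#include <cstdio>

int main() {
  constexpr int K = 4;
  int8_t q_a[K] = {12, -3, 7, 100};  // asymmetric: q_a = round(x/s_a) + azp
  int8_t q_w[K] = {5, -2, 9, 1};     // symmetric:  q_w = round(w/s_w)
  float s_a = 0.02f, s_w = 0.01f;
  int32_t azp = 8;

  int32_t acc = 0, w_sum = 0;
  for (int k = 0; k < K; ++k) {
    acc += int32_t(q_a[k]) * q_w[k];
    w_sum += q_w[k];  // depends only on weights; precomputable per column
  }
  // y = s_a * s_w * (sum(q_a * q_w) - azp * sum(q_w))
  float y = s_a * s_w * float(acc - azp * w_sum);

  // Reference: dequantize first, then accumulate in float.
  float y_ref = 0.f;
  for (int k = 0; k < K; ++k)
    y_ref += (s_a * (q_a[k] - azp)) * (s_w * q_w[k]);

  printf("fused: %f  reference: %f\n", y, y_ref);  // both print 0.025000
  return 0;
}

Because sum(q_w) depends only on the weights, the azp correction can be precomputed once per output column, keeping the hot loop a plain int8 dot product.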

File tree

7 files changed: 452 additions, 96 deletions


.buildkite/run-cpu-test.sh

Lines changed: 4 additions & 4 deletions

@@ -32,10 +32,10 @@ docker exec cpu-test bash -c "
   --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
 
 # Run compressed-tensor test
-# docker exec cpu-test bash -c "
-#   pytest -s -v \
-#   tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
-#   tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token"
+docker exec cpu-test bash -c "
+  pytest -s -v \
+  tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
+  tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
 
 # Run AWQ test
 docker exec cpu-test bash -c "

Dockerfile.cpu

Lines changed: 0 additions & 13 deletions

@@ -33,19 +33,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     pip install --upgrade pip && \
     pip install -r requirements-build.txt
 
-# install oneDNN
-RUN git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git
-
-RUN --mount=type=cache,target=/root/.cache/ccache \
-    cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \
-    -DONEDNN_BUILD_DOC=OFF \
-    -DONEDNN_BUILD_EXAMPLES=OFF \
-    -DONEDNN_BUILD_TESTS=OFF \
-    -DONEDNN_BUILD_GRAPH=OFF \
-    -DONEDNN_ENABLE_WORKLOAD=INFERENCE \
-    -DONEDNN_ENABLE_PRIMITIVE=MATMUL && \
-    cmake --build ./oneDNN/build --target install --config Release
-
 FROM cpu-test-1 AS build
 
 WORKDIR /workspace/vllm

cmake/cpu_extension.cmake

Lines changed: 34 additions & 6 deletions

@@ -1,5 +1,8 @@
+include(FetchContent)
+
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
-set(CMAKE_CXX_STANDARD 17)
 
 #
 # Define environment variables for special configurations
@@ -82,15 +85,40 @@ else()
     message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.")
 endif()
 
+#
+# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 platforms)
+#
+if (AVX512_FOUND AND NOT AVX512_DISABLED)
+    FetchContent_Declare(
+        oneDNN
+        GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
+        GIT_TAG v3.5.3
+        GIT_PROGRESS TRUE
+        GIT_SHALLOW TRUE
+    )
+
+    set(ONEDNN_LIBRARY_TYPE "STATIC")
+    set(ONEDNN_BUILD_DOC "OFF")
+    set(ONEDNN_BUILD_EXAMPLES "OFF")
+    set(ONEDNN_BUILD_TESTS "OFF")
+    set(ONEDNN_ENABLE_WORKLOAD "INFERENCE")
+    set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
+    set(ONEDNN_BUILD_GRAPH "OFF")
+    set(ONEDNN_ENABLE_JIT_PROFILING "OFF")
+    set(ONEDNN_ENABLE_ITT_TASKS "OFF")
+    set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
+    set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
+    set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
+
+    FetchContent_MakeAvailable(oneDNN)
+
+    list(APPEND LIBS dnnl)
+endif()
+
 message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
 
 list(APPEND LIBS numa)
 
-# Appending the dnnl library for the AVX2 and AVX512, as it is not utilized by Power architecture.
-if (AVX2_FOUND OR AVX512_FOUND)
-    list(APPEND LIBS dnnl)
-endif()
-
 #
 # _C extension
 #
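
The hunk above replaces the hand-built oneDNN from Dockerfile.cpu with a pinned FetchContent build, enabling only the MATMUL and REORDER primitives the W8A8 path needs. For orientation, here is a minimal standalone sketch of the oneDNN v3 C++ API surface involved: an s8 x s8 -> f32 matmul with a runtime per-tensor source zero point. This is an assumption-laden illustration, not vLLM's wrapper code; shapes and values are invented.

// Standalone sketch: s8 x s8 -> f32 oneDNN matmul with a runtime per-tensor
// activation zero point (oneDNN v3 C++ API). Illustration only -- not vLLM's
// wrapper code. Build: g++ -std=c++17 example.cc -ldnnl
#include <oneapi/dnnl/dnnl.hpp>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  using namespace dnnl;
  engine eng(engine::kind::cpu, 0);
  stream strm(eng);

  const memory::dim M = 2, K = 8, N = 4;
  memory::desc a_md({M, K}, memory::data_type::s8, memory::format_tag::ab);
  memory::desc w_md({K, N}, memory::data_type::s8, memory::format_tag::ab);
  memory::desc c_md({M, N}, memory::data_type::f32, memory::format_tag::ab);

  // Declare that SRC carries a per-tensor zero point supplied at run time.
  primitive_attr attr;
  attr.set_zero_points_mask(DNNL_ARG_SRC, /*mask=*/0);

  matmul::primitive_desc pd(eng, a_md, w_md, c_md, attr);
  matmul mm(pd);

  std::vector<int8_t> a(M * K, 3), w(K * N, 1);  // q_a = 3, q_w = 1
  std::vector<float> c(M * N, 0.f);
  std::vector<int32_t> azp = {2};                // activation zero point

  memory a_m(a_md, eng, a.data()), w_m(w_md, eng, w.data()),
      c_m(c_md, eng, c.data());
  memory::desc zp_md({1}, memory::data_type::s32, memory::format_tag::x);
  memory zp_m(zp_md, eng, azp.data());

  mm.execute(strm, {{DNNL_ARG_SRC, a_m},
                    {DNNL_ARG_WEIGHTS, w_m},
                    {DNNL_ARG_DST, c_m},
                    {DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC, zp_m}});
  strm.wait();

  printf("c[0] = %.1f\n", c[0]);  // (3 - 2) * 1 summed over K = 8 -> 8.0
  return 0;
}

Supplying the zero point at execution time (DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC) lets a dynamically computed per-batch AZP be passed without rebuilding the primitive.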

csrc/cpu/cpu_types_x86.hpp

Lines changed: 39 additions & 2 deletions

@@ -265,6 +265,30 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
   void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); }
 };
 
+#ifdef __AVX512F__
+struct INT32Vec16: public Vec<INT32Vec16> {
+  constexpr static int VEC_ELEM_NUM = 16;
+  union AliasReg {
+    __m512i reg;
+    int32_t values[VEC_ELEM_NUM];
+  };
+
+  __m512i reg;
+
+  explicit INT32Vec16(const void* data_ptr) : reg(_mm512_loadu_epi32(data_ptr)) {}
+
+  void save(int32_t* ptr) const {
+    _mm512_storeu_epi32(ptr, reg);
+  }
+
+  void save(int32_t* ptr, const int elem_num) const {
+    constexpr uint32_t M = 0xFFFFFFFF;
+    __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
+    _mm512_mask_storeu_epi32(ptr, mask, reg);
+  }
+};
+#endif
+
 #ifdef __AVX512F__
 struct FP32Vec16 : public Vec<FP32Vec16> {
   constexpr static int VEC_ELEM_NUM = 16;
@@ -283,8 +307,6 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
 
   explicit FP32Vec16(__m512 data) : reg(data) {}
 
-  explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {}
-
   explicit FP32Vec16(const FP32Vec4 &data)
       : reg((__m512)_mm512_inserti32x4(
             _mm512_inserti32x4(
@@ -303,6 +325,9 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
 
   explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
 
+  explicit FP32Vec16(const INT32Vec16 &v)
+      : reg(_mm512_cvt_roundepi32_ps(v.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {}
+
   FP32Vec16 operator*(const FP32Vec16 &b) const {
     return FP32Vec16(_mm512_mul_ps(reg, b.reg));
   }
@@ -333,6 +358,16 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
     return FP32Vec16(_mm512_mask_max_ps(reg, mask, reg, b.reg));
   }
 
+  FP32Vec16 min(const FP32Vec16& b) const {
+    return FP32Vec16(_mm512_min_ps(reg, b.reg));
+  }
+
+  FP32Vec16 min(const FP32Vec16& b, const int elem_num) const {
+    constexpr uint32_t M = 0xFFFFFFFF;
+    __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
+    return FP32Vec16(_mm512_mask_min_ps(reg, mask, reg, b.reg));
+  }
+
   FP32Vec16 abs() const {
     return FP32Vec16(_mm512_abs_ps(reg));
   }
@@ -341,6 +376,8 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
 
   float reduce_max() const { return _mm512_reduce_max_ps(reg); }
 
+  float reduce_min() const { return _mm512_reduce_min_ps(reg); }
+
   template <int group_size> float reduce_sub_sum(int idx) {
     static_assert(VEC_ELEM_NUM % group_size == 0);
     constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
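
Taken together, the new INT32Vec16 type and the FP32Vec16 additions give the CPU kernels what AZP needs: loading s32 GEMM accumulators, converting them to f32, scanning per-token min/max for dynamic quantization, and masked stores for ragged tails. The standalone sketch below mirrors those helpers with raw AVX-512 intrinsics rather than vLLM's classes; the scale/zero-point convention shown (mapping [min, max] onto [-128, 127]) is an assumed illustration, not necessarily vLLM's exact formula. Requires an AVX-512 machine; compile with -mavx512f.

// Standalone AVX-512 sketch mirroring the new vector helpers (assumptions noted).
#include <immintrin.h>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  alignas(64) int32_t acc[16];
  for (int i = 0; i < 16; ++i) acc[i] = i * 37 - 100;  // fake s32 accumulators

  __m512i vi = _mm512_loadu_si512(acc);
  // s32 -> f32 with round-to-nearest, as in the new FP32Vec16(INT32Vec16&).
  __m512 vf = _mm512_cvt_roundepi32_ps(
      vi, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);

  // Per-token range, as exposed by reduce_min()/reduce_max().
  float vmin = _mm512_reduce_min_ps(vf);
  float vmax = _mm512_reduce_max_ps(vf);

  // Assumed convention: q = round(x/scale) + azp maps [vmin,vmax] to [-128,127].
  float scale = (vmax - vmin) / 255.0f;
  int32_t azp = (int32_t)std::nearbyint(-128.0f - vmin / scale);

  // Tail handling, as in the masked save(ptr, elem_num): write only 5 lanes.
  alignas(64) float out[16] = {};
  const int elem_num = 5;
  __mmask16 mask = _cvtu32_mask16(0xFFFFFFFFu >> (32 - elem_num));
  _mm512_mask_storeu_ps(out, mask, vf);

  printf("min=%.1f max=%.1f scale=%.4f azp=%d out[4]=%.1f out[5]=%.1f\n",
         vmin, vmax, scale, azp, out[4], out[5]);  // out[5] stays 0
  return 0;
}

The masked variants matter because a row length is rarely a multiple of 16: the tail mask writes only elem_num lanes, exactly the job of the new save(int32_t*, elem_num) and min(b, elem_num) overloads.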
