Skip to content

Commit 922d839

Browse files
committed
ggml : add repack testing support
This commit adds support for testing the ggml-cpu repack feature which enables the repackaging of quantized data into a more optimal layout for matrix multiplication on specific hardware architectures. The motivation is to enable the testing of a cpu backend that uses repacked data against a reference cpu backend that does not use repacked data. Building: ```console $ cmake -B build \ -DGGML_CPU_REF_BACKEND=ON -DGGML_BACKEND_DL=ON \ -DGGML_CPU_ALL_VARIANTS=ON ``` List available cpu architectures/variants: ```console $ ./build/bin/test-backend-ops cpu-variants --list CPU variants: CPU-alderlake - 12th Gen Intel(R) Core(TM) i7-1260P ``` Run tests: ```console ./build-ref/bin/test-backend-ops cpu-variants \ --variant CPU-alderlake \ -o "MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1)" Testing CPU variant 'CPU-alderlake' against cpu-ref backend... repack: repack tensor a with q4_0_8x8 MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK repack: repack tensor a with q4_0_8x8 MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK 14491/14491 tests passed ``` All matrix multiplication tests can be run by specifying `-o "MUL_MAT"` but it may be harder to spot the ones that use repacking.
1 parent b6f2ff9 commit 922d839

File tree

6 files changed

+194
-3
lines changed

6 files changed

+194
-3
lines changed

ggml/CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,9 @@ add_subdirectory(src)
286286

287287
if (GGML_BUILD_TESTS)
288288
enable_testing()
289-
add_subdirectory(tests)
289+
if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/tests")
290+
add_subdirectory(tests)
291+
endif ()
290292
endif ()
291293

292294
if (GGML_BUILD_EXAMPLES)

ggml/src/ggml-cpu/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -502,6 +502,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
502502
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_REPACK)
503503
endif()
504504

505+
if (GGML_BUILD_TESTS)
506+
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_BUILD_TESTS)
507+
endif()
508+
505509
if (GGML_CPU_KLEIDIAI)
506510
message(STATUS "Using KleidiAI optimized kernels if applicable")
507511

ggml/src/ggml-cpu/ggml-cpu.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,11 @@ std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_type
5555
if (ggml_backend_cpu_repack_buffer_type()) {
5656
bufts.push_back(ggml_backend_cpu_repack_buffer_type());
5757
}
58+
#ifdef GGML_BUILD_TESTS
59+
if (ggml_backend_cpu_repack_test_buffer_type()) {
60+
bufts.push_back(ggml_backend_cpu_repack_test_buffer_type());
61+
}
62+
#endif // GGML_BUILD_TESTS
5863
#endif
5964

6065
return bufts;
@@ -98,7 +103,7 @@ struct ggml_backend_cpu_context {
98103
};
99104

100105
static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
101-
return "CPU";
106+
return GGML_CPU_VARIANT_NAME;
102107

103108
GGML_UNUSED(backend);
104109
}

ggml/src/ggml-cpu/repack.cpp

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1980,3 +1980,149 @@ ggml_backend_buffer_type_t ggml_backend_cpu_repack_buffer_type(void) {
19801980

19811981
return &ggml_backend_cpu_buffer_type_repack;
19821982
}
1983+
1984+
#ifdef GGML_BUILD_TESTS
1985+
// Test repack wrapper buffer type that stores original data before repacking.
1986+
// The motivation for this type is that when testing repack when set_tensor is
1987+
// called the data of the tensor is repacked and the original data is lost.
1988+
//
1989+
// In test-backend-ops.cpp we want to compare the results of a backend using
1990+
// repacked input data, and compare against a backend that uses non-repacked data.
1991+
// The problem arises in `ggml_backend_compare_graph_backend` where the graphs
1992+
// are copied and ggml_backend_buffer_repack_buffer_type does not implement
1993+
// the get_tensor function, but even if it did it would return the repacked data
1994+
// which is not what we want to compare against. This type allows proper
1995+
// comparison between repack and non-repack data.
1996+
1997+
#include <unordered_map>
1998+
#include <vector>
1999+
2000+
// Context for the test repack wrapper buffer: wraps a plain CPU buffer and
// snapshots the pre-repack bytes of any tensor whose data gets repacked, so
// that get_tensor can hand back the original (comparable) data.
struct test_repack_wrapper_context {
    // Underlying plain CPU buffer that owns the actual tensor storage.
    ggml_backend_buffer_t cpu_buffer;

    // This map stores the original (non repacked) data so that when the graph
    // is copied by ggml_backend_compare_graph_backend we can return the original
    // data in get_tensor.
    std::unordered_map<struct ggml_tensor *, std::vector<uint8_t>> original_data;
};
2008+
2009+
// Release the wrapper context: free the wrapped CPU buffer (if any) and then
// the context itself, which also drops all stored original-data snapshots.
static void ggml_backend_cpu_repack_test_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    auto * w_ctx = static_cast<test_repack_wrapper_context *>(buffer->context);
    if (w_ctx->cpu_buffer != nullptr) {
        ggml_backend_buffer_free(w_ctx->cpu_buffer);
    }
    delete w_ctx;
}
2016+
2017+
// Tensor initialization hook: attach the optimal repack traits to the weight
// operand (src[0]) of matmul ops, mirroring what the real repack buffer type
// does, then delegate to the wrapped CPU buffer's init hook if it has one.
static enum ggml_status ggml_backend_cpu_repack_test_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    const bool repack_mul_mat    = tensor->op == GGML_OP_MUL_MAT    && ggml_n_dims(tensor->src[0]) == 2;
    const bool repack_mul_mat_id = tensor->op == GGML_OP_MUL_MAT_ID && ggml_n_dims(tensor->src[0]) == 3;
    if (repack_mul_mat || repack_mul_mat_id) {
        tensor->src[0]->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(ggml_repack_get_optimal_repack_type(tensor->src[0]));
    }

    // Not really sure if this is strictly needed as the cpu buffer does not
    // initialize anything at the moment, but keeping this just in case that changes.
    auto * w_ctx = static_cast<test_repack_wrapper_context *>(buffer->context);
    if (w_ctx->cpu_buffer->iface.init_tensor != nullptr) {
        return w_ctx->cpu_buffer->iface.init_tensor(w_ctx->cpu_buffer, tensor);
    }
    return GGML_STATUS_SUCCESS;
}
2033+
2034+
// Write tensor data into the buffer. For tensors tagged with repack traits the
// data is snapshotted (so get_tensor can later return the original bytes) and
// then repacked in place; everything else is forwarded to the wrapped CPU
// buffer unchanged.
static void ggml_backend_cpu_repack_test_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    test_repack_wrapper_context * w_ctx = (test_repack_wrapper_context *) buffer->context;
    GGML_ASSERT(w_ctx != nullptr);

    auto tensor_traits = (ggml::cpu::repack::tensor_traits_base *) tensor->extra;
    if (tensor_traits) {
        // repack() consumes the whole tensor and ignores the offset, so a
        // partial write would silently corrupt both the repacked data and the
        // snapshot stored below - reject it explicitly.
        GGML_ASSERT(offset == 0);
        w_ctx->original_data[tensor] = std::vector<uint8_t>((const uint8_t *)data, (const uint8_t *)data + size);
        auto OK = tensor_traits->repack(tensor, data, size);
        GGML_ASSERT(OK == 0);
    } else {
        // Forward to underlying CPU buffer (no repacking)
        w_ctx->cpu_buffer->iface.set_tensor(w_ctx->cpu_buffer, tensor, data, offset, size);
    }
}
2048+
2049+
// The wrapper owns no storage of its own - the base pointer comes from the
// wrapped plain CPU buffer.
static void * ggml_backend_cpu_repack_test_buffer_get_base(ggml_backend_buffer_t buffer) {
    auto * w_ctx = static_cast<test_repack_wrapper_context *>(buffer->context);
    return ggml_backend_buffer_get_base(w_ctx->cpu_buffer);
}
2053+
2054+
// Read tensor data back out of the buffer. For repacked tensors this returns
// the original (pre-repack) bytes saved by set_tensor, so that
// ggml_backend_compare_graph_backend can copy the graph with comparable data;
// everything else is forwarded to the wrapped CPU buffer.
static void ggml_backend_cpu_repack_test_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    test_repack_wrapper_context * w_ctx = (test_repack_wrapper_context *) buffer->context;

    auto tensor_traits = (ggml::cpu::repack::tensor_traits_base *) tensor->extra;
    if (tensor_traits) {
        // Return the original data for repacked tensor data. This is here so
        // that when the graph is copied we can still get the original data which
        // would otherwise be lost.
        auto it = w_ctx->original_data.find(const_cast<struct ggml_tensor *>(tensor));
        // A repacked tensor must have been written through set_tensor first;
        // silently returning here would leave the caller's buffer uninitialized.
        GGML_ASSERT(it != w_ctx->original_data.end());
        const auto& original = it->second;
        // Guard the subtraction below: an offset past the stored size would
        // otherwise underflow copy_size to a huge value.
        GGML_ASSERT(offset <= original.size());
        size_t copy_size = std::min(size, original.size() - offset);
        std::memcpy(data, original.data() + offset, copy_size);
    } else {
        // For non-repacked data just forward to the underlying CPU buffer.
        w_ctx->cpu_buffer->iface.get_tensor(w_ctx->cpu_buffer, tensor, data, offset, size);
    }
}
2073+
2074+
// Human-readable name of this buffer type; test-backend-ops matches on this
// string to find the test repack wrapper.
static const char * ggml_backend_cpu_repack_test_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);
    return "CPU_REPACK_TEST";
}
2078+
2079+
// Allocate a wrapper buffer: the actual storage lives in a plain CPU buffer,
// and the wrapper interface snapshots/restores original data around repacking.
static ggml_backend_buffer_t ggml_backend_cpu_repack_test_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    GGML_UNUSED(buft);

    ggml_backend_buffer_t backing = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
    if (backing == nullptr) {
        return nullptr;
    }

    auto * w_ctx = new test_repack_wrapper_context;
    w_ctx->cpu_buffer = backing;

    static const struct ggml_backend_buffer_i ggml_backend_cpu_repack_test_buffer_i = {
        /* .free_buffer     = */ ggml_backend_cpu_repack_test_buffer_free_buffer,
        /* .get_base        = */ ggml_backend_cpu_repack_test_buffer_get_base,
        /* .init_tensor     = */ ggml_backend_cpu_repack_test_buffer_init_tensor,
        /* .memset_tensor   = */ nullptr,
        /* .set_tensor      = */ ggml_backend_cpu_repack_test_buffer_set_tensor,
        /* .get_tensor      = */ ggml_backend_cpu_repack_test_buffer_get_tensor,
        /* .cpy_tensor      = */ nullptr,
        /* .clear           = */ nullptr,
        /* .reset           = */ nullptr,
    };

    // This is intentionally using the repack buffer type because this type is
    // used in ggml::cpu::repack::get_tensor_traits, and without this the
    // computation will not be forwarded to repacks compute_forward function.
    return ggml_backend_buffer_init(ggml_backend_cpu_repack_buffer_type(), ggml_backend_cpu_repack_test_buffer_i, w_ctx, size);
}
2107+
2108+
// Alignment matches the plain CPU buffer type, since that is what actually
// backs the storage.
static size_t ggml_backend_cpu_repack_test_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);
    return ggml_backend_buft_get_alignment(ggml_backend_cpu_buffer_type());
}
2112+
2113+
// Singleton accessor for the test repack wrapper buffer type. The interface
// and device never change, so a function-local static suffices.
ggml_backend_buffer_type_t ggml_backend_cpu_repack_test_buffer_type(void) {
    static struct ggml_backend_buffer_type buft_repack_test = {
        /* .iface = */ {
            /* .get_name         = */ ggml_backend_cpu_repack_test_buffer_type_get_name,
            /* .alloc_buffer     = */ ggml_backend_cpu_repack_test_buffer_type_alloc_buffer,
            /* .get_alignment    = */ ggml_backend_cpu_repack_test_buffer_type_get_alignment,
            /* .get_max_size     = */ nullptr, // defaults to SIZE_MAX
            /* .get_alloc_size   = */ nullptr, // defaults to ggml_nbytes
            /* .is_host          = */ nullptr, // defaults to true
        },
        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
        /* .context = */ nullptr,
    };
    return &buft_repack_test;
}
2128+
#endif // GGML_BUILD_TESTS

ggml/src/ggml-cpu/repack.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@
1010

1111
ggml_backend_buffer_type_t ggml_backend_cpu_repack_buffer_type(void);
1212

13+
#ifdef GGML_BUILD_TESTS
14+
ggml_backend_buffer_type_t ggml_backend_cpu_repack_test_buffer_type(void);
15+
#endif
16+
1317
template <int K> constexpr int QK_0() {
1418
if constexpr (K == 4) {
1519
return QK4_0;

tests/test-backend-ops.cpp

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1137,7 +1137,37 @@ struct test_case {
11371137
add_sentinel(ctx);
11381138

11391139
// allocate
1140-
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1);
1140+
std::vector<ggml_backend_buffer_type_t> extra_buft_list;
1141+
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
1142+
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
1143+
auto get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
1144+
ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
1145+
1146+
if (get_extra_bufts_fn) {
1147+
ggml_backend_buffer_type_t * extra_bufts = get_extra_bufts_fn(cpu_dev);
1148+
while (extra_bufts && *extra_bufts) {
1149+
extra_buft_list.push_back(*extra_bufts);
1150+
++extra_bufts;
1151+
}
1152+
}
1153+
1154+
// Try to find test repack wrapper buffer type among the extra buffer types
1155+
ggml_backend_buffer_type_t test_repack_buft = nullptr;
1156+
for (auto buft : extra_buft_list) {
1157+
const char* buft_name = ggml_backend_buft_name(buft);
1158+
if (buft_name && strstr(buft_name, "CPU_REPACK_TEST")) {
1159+
test_repack_buft = buft;
1160+
break;
1161+
}
1162+
}
1163+
1164+
ggml_backend_buffer_t buf = nullptr;
1165+
if (test_repack_buft) {
1166+
buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, test_repack_buft);
1167+
} else {
1168+
// Fallback to regular allocation
1169+
buf = ggml_backend_alloc_ctx_tensors(ctx, backend1);
1170+
}
11411171

11421172
if (buf == NULL) {
11431173
printf("failed to allocate tensors [%s] ", ggml_backend_name(backend1));

0 commit comments

Comments
 (0)