
Commit 4ed3ab7

Feat/smarterquant tests (#8)

* Implement remaining SmarterQuant JSON and GGUF tests

  - Enhanced `tests/test-smarterquant.cpp` with edge cases for tensor column counts (128, 300, 512, 768) and different permutation patterns (identity, few swaps).
  - Added `tests/test-smarterquant-gguf.cpp` for end-to-end GGUF testing, including metadata writing/reading and numerical verification through the quantization and model loading pipeline.
  - Updated `todo.txt` to reflect test completion.

* docs: Analyze memory impact of SmarterQuant unpermutation buffer

  Documents the memory usage of the temporary F32 buffer used during the unpermutation step in `ggml_get_rows_smarterquant`. The buffer is stack-allocated (`alloca`) with size `n_cols * sizeof(float)`. For typical model dimensions this is a minor, short-lived memory footprint (e.g., 16-32KB). A potential concern for extremely large column counts is noted, though such widths are not typical for current LLM weights.

---------

Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com>
1 parent e55e95e commit 4ed3ab7
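A quick sanity check of the buffer sizing described in the commit message: the temporary buffer is `n_cols * sizeof(float)` bytes, so common embedding widths land in the quoted 16-32KB range. A minimal sketch (the 4096/8192 column counts are illustrative, not values from this commit):

    #include <cstdio>

    // Unpermutation buffer size = n_cols * sizeof(float), per the commit message.
    int main() {
        for (long n_cols : {4096L, 8192L}) {
            std::printf("n_cols = %ld -> %ld KiB\n", n_cols, n_cols * (long) sizeof(float) / 1024);
        }
        return 0;
    }

Output: 4096 columns -> 16 KiB, 8192 columns -> 32 KiB.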

File tree

3 files changed: +533 -122 lines changed

tests/test-smarterquant-gguf.cpp

Lines changed: 332 additions & 0 deletions
@@ -0,0 +1,332 @@
// End-to-end tests for SmarterQuant GGUF functionality
// This test covers:
// 1. Writing SmarterQuant metadata to GGUF during quantization.
// 2. Reading SmarterQuant metadata from GGUF during model loading.
// 3. Numerical correctness of dequantization through the model loading path.

#include "ggml.h"
#include "ggml-cpu.h"
#include "llama.h"
#include "llama-quant.h"
#include "llama-model-loader.h" // For llama_model_loader, to inspect GGUF metadata directly if needed
#include "gguf.h"               // For gguf_init_empty, gguf_add_tensor, gguf_write_to_file, etc.
#include "json.hpp"             // For nlohmann::json to create dummy smarterquant json

#undef NDEBUG // Ensure asserts are enabled
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <string>
#include <vector>
#include <numeric>
#include <algorithm>
#include <iostream>
#include <iomanip>
#include <fstream>   // For std::ofstream, std::ifstream
#include <stdexcept> // For std::runtime_error
#include <cstdio>    // For remove()
#include <cstring>   // For memcpy()

// Helper from test-smarterquant.cpp
static bool compare_float_arrays(const float* arr1, const float* arr2, size_t size, float tolerance, const std::string& test_name) {
    for (size_t i = 0; i < size; ++i) {
        if (fabs(arr1[i] - arr2[i]) > tolerance) {
            std::cerr << std::fixed << std::setprecision(8)
                      << "Test: " << test_name << " - Mismatch at index " << i << ": arr1 = " << arr1[i]
                      << ", arr2 = " << arr2[i] << ", diff = " << fabs(arr1[i] - arr2[i])
                      << ", tolerance = " << tolerance << std::endl;
            return false;
        }
    }
    return true;
}

// Helper to create a dummy FP32 GGUF file
static bool create_dummy_fp32_gguf(
        const std::string& filename,
        const std::vector<std::pair<std::string, std::vector<int64_t>>>& tensor_infos, // name, dims
        const std::vector<std::vector<float>>& tensor_data // data for each tensor
) {
    gguf_context_ptr ctx_out { gguf_init_empty() };
    if (!ctx_out) {
        fprintf(stderr, "Failed to initialize GGUF context for %s\n", filename.c_str());
        return false;
    }

    // Add some minimal GGUF metadata
    gguf_set_val_str(ctx_out.get(), "general.architecture", "dummy");
    gguf_set_val_u32(ctx_out.get(), "dummy.block_count", 1);
    gguf_set_val_u32(ctx_out.get(), "dummy.tensor_count", tensor_infos.size());

    for (size_t i = 0; i < tensor_infos.size(); ++i) {
        const auto& info = tensor_infos[i];
        const auto& data = tensor_data[i];

        // no_alloc must be false here: the context has to actually allocate the
        // tensor data so the memcpy below has a valid destination.
        struct ggml_init_params params = { data.size() * sizeof(float) + ggml_tensor_overhead(), NULL, false };
        struct ggml_context * tensor_ctx = ggml_init(params);
        if (!tensor_ctx) {
            fprintf(stderr, "Failed to create ggml context for tensor %s\n", info.first.c_str());
            return false;
        }

        struct ggml_tensor * t = nullptr;
        if (info.second.size() == 1) {
            t = ggml_new_tensor_1d(tensor_ctx, GGML_TYPE_F32, info.second[0]);
        } else if (info.second.size() == 2) {
            t = ggml_new_tensor_2d(tensor_ctx, GGML_TYPE_F32, info.second[0], info.second[1]);
        } else if (info.second.size() == 3) {
            t = ggml_new_tensor_3d(tensor_ctx, GGML_TYPE_F32, info.second[0], info.second[1], info.second[2]);
        } else {
            fprintf(stderr, "Unsupported tensor dimension count %zu for %s\n", info.second.size(), info.first.c_str());
            ggml_free(tensor_ctx);
            return false;
        }
        ggml_set_name(t, info.first.c_str());
        memcpy(t->data, data.data(), ggml_nbytes(t));

        gguf_add_tensor(ctx_out.get(), t);
        ggml_free(tensor_ctx);
    }

    if (!gguf_write_to_file(ctx_out.get(), filename.c_str(), false)) {
        fprintf(stderr, "Failed to write GGUF file %s\n", filename.c_str());
        return false;
    }
    printf(" Successfully created dummy FP32 GGUF: %s\n", filename.c_str());
    return true;
}

// Helper to create a dummy smarterquant.json file
static bool create_dummy_smarterquant_json(
        const std::string& filename,
        const std::string& tensor_name_1, int64_t n_cols_1, const std::vector<int32_t>& perm_1,
        const std::string& tensor_name_2, int64_t n_cols_2, const std::vector<int32_t>& perm_2
) {
    nlohmann::json j;

    nlohmann::json t1_config = nlohmann::json::array();
    nlohmann::json t1_types = nlohmann::json::array({GGML_TYPE_Q4_0, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0, GGML_TYPE_Q2_K});
    nlohmann::json t1_perm = nlohmann::json::array();
    if (!perm_1.empty()) {
        for (int32_t idx : perm_1) t1_perm.push_back(idx);
    } else { // identity
        for (int64_t i = 0; i < n_cols_1; ++i) t1_perm.push_back(i);
    }
    t1_config.push_back(t1_types);
    t1_config.push_back(t1_perm);
    j[tensor_name_1] = t1_config;

    nlohmann::json t2_config = nlohmann::json::array();
    // Use different types for the second tensor for variety
    nlohmann::json t2_types = nlohmann::json::array({GGML_TYPE_Q8_0, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K});
    nlohmann::json t2_perm = nlohmann::json::array();
    if (!perm_2.empty()) {
        for (int32_t idx : perm_2) t2_perm.push_back(idx);
    } else { // identity
        for (int64_t i = 0; i < n_cols_2; ++i) t2_perm.push_back(i);
    }
    t2_config.push_back(t2_types);
    t2_config.push_back(t2_perm);
    j[tensor_name_2] = t2_config;

    std::ofstream ofs(filename);
    if (!ofs.is_open()) {
        fprintf(stderr, "Failed to open %s for writing.\n", filename.c_str());
        return false;
    }
    ofs << j.dump(4);
    ofs.close();
    printf(" Successfully created dummy smarterquant JSON: %s\n", filename.c_str());
    return true;
}

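// For reference, the helper above produces JSON shaped like the following
// (illustrative; each tensor name maps to [compression_types, column_permutation],
// and the type IDs are the integer values of the ggml_type enum as serialized
// by nlohmann::json, e.g. GGML_TYPE_Q4_0 == 2 in upstream ggml):
//
//   {
//     "tensor_one": [[2, 7, 8, 10], [511, 510, ..., 0]],
//     "tensor_two": [[8, 12, 13, 14], [0, 1, ..., 1279]]
//   }
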
int main(int argc, char **argv) {
    GGML_UNUSED(argc);
    GGML_UNUSED(argv);
    printf("Testing SmarterQuant GGUF end-to-end functionality...\n");
    int overall_status = 0;

    const std::string dummy_fp32_gguf_name = "dummy_fp32_input.gguf";
    const std::string dummy_sq_json_name = "dummy_smarterquant.json";
    const std::string dummy_quantized_gguf_name = "dummy_quantized_output.gguf";

    // Define tensors for the dummy model
    std::string tensor1_name = "tensor_one";
    std::vector<int64_t> tensor1_dims = {512, 2}; // 512 cols, 2 rows
    std::vector<float> tensor1_data(tensor1_dims[0] * tensor1_dims[1]);
    for (size_t i = 0; i < tensor1_data.size(); ++i) tensor1_data[i] = static_cast<float>(i % 128) - 64.f;

    std::string tensor2_name = "tensor_two";
    std::vector<int64_t> tensor2_dims = {1280, 1}; // 1280 cols, 1 row (5 * 256)
    std::vector<float> tensor2_data(tensor2_dims[0] * tensor2_dims[1]);
    for (size_t i = 0; i < tensor2_data.size(); ++i) tensor2_data[i] = static_cast<float>((i % 200) * (i%2==0 ? 1 : -1)) * 0.5f;

    // Create dummy FP32 GGUF
    if (!create_dummy_fp32_gguf(dummy_fp32_gguf_name,
                                {{tensor1_name, tensor1_dims}, {tensor2_name, tensor2_dims}},
                                {tensor1_data, tensor2_data})) {
        fprintf(stderr, "Failed to create dummy FP32 GGUF.\n");
        return 1;
    }

    // Create dummy smarterquant.json
    std::vector<int32_t> perm1(tensor1_dims[0]); // Reverse for tensor1
    for (int64_t i = 0; i < tensor1_dims[0]; ++i) perm1[i] = (tensor1_dims[0] - 1) - i;
    std::vector<int32_t> perm2; // Identity for tensor2 (empty means identity in helper)

    if (!create_dummy_smarterquant_json(dummy_sq_json_name,
                                        tensor1_name, tensor1_dims[0], perm1,
                                        tensor2_name, tensor2_dims[0], perm2)) {
        fprintf(stderr, "Failed to create dummy smarterquant.json.\n");
        remove(dummy_fp32_gguf_name.c_str());
        return 1;
    }

    // Quantize
    printf(" Attempting quantization...\n");
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype = LLAMA_FTYPE_MOSTLY_Q8_0; // Base type, SmarterQuant will override
    qparams.smarter_quant_json_path = dummy_sq_json_name.c_str(); // Specify our JSON

    // We need a kv_overrides vector even if empty, for the SmarterQuant metadata to be added to.
    // The element with an empty key acts as the terminator of the override list.
    std::vector<llama_model_kv_override> kv_overrides;
    kv_overrides.emplace_back(); // Add the null terminator
    kv_overrides.back().key[0] = 0;
    qparams.kv_overrides = &kv_overrides;

    // Declared before the first `goto cleanup` so the jumps below do not cross
    // any initialized declarations (which C++ forbids).
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = nullptr;

    try {
        llama_model_quantize_impl(dummy_fp32_gguf_name, dummy_quantized_gguf_name, &qparams);
        printf(" Quantization call completed.\n");
    } catch (const std::exception& e) {
        fprintf(stderr, " ERROR: Quantization failed with exception: %s\n", e.what());
        overall_status = 1;
        goto cleanup;
    }

    // Load the quantized model and verify
    printf(" Loading quantized GGUF and verifying...\n");
    model = llama_load_model_from_file(dummy_quantized_gguf_name.c_str(), mparams);

    if (!model) {
        fprintf(stderr, " ERROR: Failed to load quantized GGUF model %s.\n", dummy_quantized_gguf_name.c_str());
        overall_status = 1;
        goto cleanup;
    }

    // Verify tensor1
    {
        const ggml_tensor* t1 = llama_get_model_tensor(model, tensor1_name.c_str());
        if (!t1) {
            fprintf(stderr, " ERROR: Tensor '%s' not found in quantized model.\n", tensor1_name.c_str());
            overall_status = 1;
        } else {
            if (!t1->sq_info || !t1->sq_info->enabled) {
                fprintf(stderr, " ERROR: Tensor '%s' does not have SmarterQuant info enabled after loading.\n", tensor1_name.c_str());
                overall_status = 1;
            } else {
                printf(" Tensor '%s' SmarterQuant info loaded successfully.\n", tensor1_name.c_str());
                // Check types (example for first block)
                if (t1->sq_info->compression_types[0] != GGML_TYPE_Q4_0) {
                    fprintf(stderr, " ERROR: Tensor '%s' expected type0 %d, got %d.\n", tensor1_name.c_str(), GGML_TYPE_Q4_0, t1->sq_info->compression_types[0]);
                    overall_status = 1;
                }
                // Check permutation (example for first element)
                if (t1->sq_info->column_permutation[0] != perm1[0]) {
                    fprintf(stderr, " ERROR: Tensor '%s' expected perm[0] %d, got %d.\n", tensor1_name.c_str(), perm1[0], t1->sq_info->column_permutation[0]);
                    overall_status = 1;
                }

                // Numerical check for tensor1
                std::vector<float> t1_dequant_data(tensor1_dims[0] * tensor1_dims[1]);
                // Simulate getting rows (simplified for test - assumes CPU context and direct call)
                // In a real scenario, this would be through ggml_compute_forward or similar.
                for (int64_t r = 0; r < tensor1_dims[1]; ++r) {
                    // Calculate the byte offset for the current row in the ggml_tensor's data.
                    // This is a simplified calculation. A real scenario might need to consider
                    // the actual byte layout if rows are not simply (total_size / num_rows).
                    // llama_tensor_quantize_smarter_blocks writes data sequentially, so this should be okay for this test.
                    size_t row_byte_size = 0;
                    for (int64_t c_seg = 0; c_seg < tensor1_dims[0]; c_seg += 256) {
                        int64_t seg_cols = std::min((int64_t)256, tensor1_dims[0] - c_seg);
                        int block_idx_in_row = c_seg / 256;
                        ggml_type seg_type = (block_idx_in_row < 4) ? (ggml_type)t1->sq_info->compression_types[block_idx_in_row] : (ggml_type)t1->sq_info->compression_types[3];
                        row_byte_size += ggml_type_size(seg_type) * (seg_cols / ggml_blck_size(seg_type));
                    }
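                    // Worked example for tensor_one (512 cols, first two segment types Q4_0 then Q5_1):
                    //   segment 0, cols   0..255 -> (256 / ggml_blck_size(GGML_TYPE_Q4_0)) * ggml_type_size(GGML_TYPE_Q4_0)
                    //   segment 1, cols 256..511 -> (256 / ggml_blck_size(GGML_TYPE_Q5_1)) * ggml_type_size(GGML_TYPE_Q5_1)
                    // Byte counts are taken from the ggml API rather than hard-coded,
                    // since they depend on each type's block layout.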

                    const char * t1_row_data = (const char*)t1->data + r * row_byte_size;
                    float* t1_dequant_row_ptr = t1_dequant_data.data() + r * tensor1_dims[0];
                    ggml_get_rows_smarterquant(t1, t1_row_data, t1_dequant_row_ptr);
                }
                if (!compare_float_arrays(tensor1_data.data(), t1_dequant_data.data(), tensor1_data.size(), 0.15f, tensor1_name)) {
                    fprintf(stderr, " ERROR: Numerical mismatch for tensor '%s'.\n", tensor1_name.c_str());
                    overall_status = 1;
                } else {
                    printf(" Tensor '%s' numerical check PASSED.\n", tensor1_name.c_str());
                }
            }
        }
    }

    // Verify tensor2
    {
        const ggml_tensor* t2 = llama_get_model_tensor(model, tensor2_name.c_str());
        if (!t2) {
            fprintf(stderr, " ERROR: Tensor '%s' not found in quantized model.\n", tensor2_name.c_str());
            overall_status = 1;
        } else {
            if (!t2->sq_info || !t2->sq_info->enabled) {
                fprintf(stderr, " ERROR: Tensor '%s' does not have SmarterQuant info enabled after loading.\n", tensor2_name.c_str());
                overall_status = 1;
            } else {
                printf(" Tensor '%s' SmarterQuant info loaded successfully.\n", tensor2_name.c_str());
                if (t2->sq_info->compression_types[0] != GGML_TYPE_Q8_0) { // Matching dummy_smarterquant.json
                    fprintf(stderr, " ERROR: Tensor '%s' expected type0 %d, got %d.\n", tensor2_name.c_str(), GGML_TYPE_Q8_0, t2->sq_info->compression_types[0]);
                    overall_status = 1;
                }
                // Check permutation (identity for tensor2)
                if (t2->sq_info->column_permutation[0] != 0) {
                    fprintf(stderr, " ERROR: Tensor '%s' expected perm[0] 0 (identity), got %d.\n", tensor2_name.c_str(), t2->sq_info->column_permutation[0]);
                    overall_status = 1;
                }
                // Numerical check for tensor2
                std::vector<float> t2_dequant_data(tensor2_dims[0] * tensor2_dims[1]);
                for (int64_t r = 0; r < tensor2_dims[1]; ++r) {
                    size_t row_byte_size = 0;
                    for (int64_t c_seg = 0; c_seg < tensor2_dims[0]; c_seg += 256) {
                        int64_t seg_cols = std::min((int64_t)256, tensor2_dims[0] - c_seg);
                        int block_idx_in_row = c_seg / 256;
                        ggml_type seg_type = (block_idx_in_row < 4) ? (ggml_type)t2->sq_info->compression_types[block_idx_in_row] : (ggml_type)t2->sq_info->compression_types[3];
                        row_byte_size += ggml_type_size(seg_type) * (seg_cols / ggml_blck_size(seg_type));
                    }
                    const char * t2_row_data = (const char*)t2->data + r * row_byte_size;
                    float* t2_dequant_row_ptr = t2_dequant_data.data() + r * tensor2_dims[0];
                    ggml_get_rows_smarterquant(t2, t2_row_data, t2_dequant_row_ptr);
                }
                if (!compare_float_arrays(tensor2_data.data(), t2_dequant_data.data(), tensor2_data.size(), 0.15f, tensor2_name)) {
                    fprintf(stderr, " ERROR: Numerical mismatch for tensor '%s'.\n", tensor2_name.c_str());
                    overall_status = 1;
                } else {
                    printf(" Tensor '%s' numerical check PASSED.\n", tensor2_name.c_str());
                }
            }
        }
    }

    if (model) {
        llama_free_model(model);
    }

cleanup:
    // Clean up dummy files
    remove(dummy_fp32_gguf_name.c_str());
    remove(dummy_sq_json_name.c_str());
    remove(dummy_quantized_gguf_name.c_str());

    printf("SmarterQuant GGUF end-to-end test finished.\n");
    return overall_status;
}
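
For context on the memory analysis in the commit message, here is a minimal sketch of the unpermutation step it describes. This is not the actual `ggml_get_rows_smarterquant` implementation; it assumes the temporary buffer is stack-allocated via `alloca` and that `column_permutation[i]` gives the source column of permuted position `i` (the real mapping direction lives in the kernel):

    #include <alloca.h>  // may be <malloc.h> on some platforms
    #include <cstring>
    #include <cstdint>

    // Sketch only: restore the original column order of a dequantized row in place.
    static void unpermute_row_sketch(float * row, const int32_t * column_permutation, int64_t n_cols) {
        // The short-lived n_cols * sizeof(float) buffer discussed in the commit message.
        float * tmp = (float *) alloca(n_cols * sizeof(float));
        for (int64_t i = 0; i < n_cols; ++i) {
            tmp[column_permutation[i]] = row[i]; // assumed mapping direction
        }
        memcpy(row, tmp, n_cols * sizeof(float));
    }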
