// End-to-end tests for SmarterQuant GGUF functionality
// This test covers:
// 1. Writing SmarterQuant metadata to GGUF during quantization.
// 2. Reading SmarterQuant metadata from GGUF during model loading.
// 3. Numerical correctness of dequantization through the model loading path.

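// For reference, the SmarterQuant JSON config consumed here (as constructed by
// create_dummy_smarterquant_json below) maps each tensor name to two arrays: the
// ggml_type values used for the first four 256-column blocks of each row, and the
// column permutation applied before quantization. Schematically (the numeric type
// values follow the ggml_type enum of the build; "..." elides the full permutations):
//
//   {
//     "tensor_one": [[2, 7, 8, 10], [511, 510, ..., 0]],
//     "tensor_two": [[8, 12, 13, 14], [0, 1, ..., 1279]]
//   }
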
#include "ggml.h"
#include "ggml-cpu.h"
#include "ggml-cpp.h"           // For gguf_context_ptr (RAII wrapper over gguf_context)
#include "llama.h"
#include "llama-quant.h"
#include "llama-model-loader.h" // For llama_model_loader, to inspect GGUF metadata directly if needed
#include "gguf.h"               // For gguf_init_empty, gguf_add_tensor, gguf_write_to_file, etc.
#include "json.hpp"             // For nlohmann::json, to create the dummy SmarterQuant config

#undef NDEBUG // Ensure asserts are enabled
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <string>
#include <vector>
#include <numeric>
#include <algorithm>
#include <iostream>
#include <iomanip>
#include <fstream>   // For std::ofstream
#include <stdexcept> // For std::runtime_error
#include <cstring>   // For memcpy
#include <cstdio>    // For remove()

// Helper from test-smarterquant.cpp
static bool compare_float_arrays(const float* arr1, const float* arr2, size_t size, float tolerance, const std::string& test_name) {
    for (size_t i = 0; i < size; ++i) {
        if (fabs(arr1[i] - arr2[i]) > tolerance) {
            std::cerr << std::fixed << std::setprecision(8)
                      << "Test: " << test_name << " - Mismatch at index " << i << ": arr1 = " << arr1[i]
                      << ", arr2 = " << arr2[i] << ", diff = " << fabs(arr1[i] - arr2[i])
                      << ", tolerance = " << tolerance << std::endl;
            return false;
        }
    }
    return true;
}

// Helper to create a dummy FP32 GGUF file
static bool create_dummy_fp32_gguf(
    const std::string& filename,
    const std::vector<std::pair<std::string, std::vector<int64_t>>>& tensor_infos, // name, dims
    const std::vector<std::vector<float>>& tensor_data // data for each tensor
) {
    gguf_context_ptr ctx_out { gguf_init_empty() };
    if (!ctx_out) {
        fprintf(stderr, "Failed to initialize GGUF context for %s\n", filename.c_str());
        return false;
    }

    // Add some minimal GGUF metadata
    gguf_set_val_str(ctx_out.get(), "general.architecture", "dummy");
    gguf_set_val_u32(ctx_out.get(), "dummy.block_count", 1);
    gguf_set_val_u32(ctx_out.get(), "dummy.tensor_count", (uint32_t) tensor_infos.size());

    // All tensors live in a single ggml context that stays alive until gguf_write_to_file()
    // has run: gguf_add_tensor() records a pointer to the tensor data, it does not copy it.
    // no_alloc must be false because the data buffers are populated via memcpy below.
    size_t ctx_size = 0;
    for (const auto& data : tensor_data) {
        ctx_size += data.size() * sizeof(float) + ggml_tensor_overhead();
    }
    struct ggml_init_params params = { ctx_size, NULL, false };
    struct ggml_context * tensor_ctx = ggml_init(params);
    if (!tensor_ctx) {
        fprintf(stderr, "Failed to create ggml context for tensors of %s\n", filename.c_str());
        return false;
    }

    for (size_t i = 0; i < tensor_infos.size(); ++i) {
        const auto& info = tensor_infos[i];
        const auto& data = tensor_data[i];

        struct ggml_tensor * t = nullptr;
        if (info.second.size() == 1) {
            t = ggml_new_tensor_1d(tensor_ctx, GGML_TYPE_F32, info.second[0]);
        } else if (info.second.size() == 2) {
            t = ggml_new_tensor_2d(tensor_ctx, GGML_TYPE_F32, info.second[0], info.second[1]);
        } else if (info.second.size() == 3) {
            t = ggml_new_tensor_3d(tensor_ctx, GGML_TYPE_F32, info.second[0], info.second[1], info.second[2]);
        } else {
            fprintf(stderr, "Unsupported tensor dimension count %zu for %s\n", info.second.size(), info.first.c_str());
            ggml_free(tensor_ctx);
            return false;
        }
        ggml_set_name(t, info.first.c_str());
        memcpy(t->data, data.data(), ggml_nbytes(t));

        gguf_add_tensor(ctx_out.get(), t);
    }

    const bool ok = gguf_write_to_file(ctx_out.get(), filename.c_str(), false);
    ggml_free(tensor_ctx);
    if (!ok) {
        fprintf(stderr, "Failed to write GGUF file %s\n", filename.c_str());
        return false;
    }
    printf(" Successfully created dummy FP32 GGUF: %s\n", filename.c_str());
    return true;
}

// Helper to create a dummy smarterquant.json file
static bool create_dummy_smarterquant_json(
    const std::string& filename,
    const std::string& tensor_name_1, int64_t n_cols_1, const std::vector<int32_t>& perm_1,
    const std::string& tensor_name_2, int64_t n_cols_2, const std::vector<int32_t>& perm_2
) {
    nlohmann::json j;

    nlohmann::json t1_config = nlohmann::json::array();
    nlohmann::json t1_types = nlohmann::json::array({GGML_TYPE_Q4_0, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0, GGML_TYPE_Q2_K});
    nlohmann::json t1_perm = nlohmann::json::array();
    if (!perm_1.empty()) {
        for (int32_t idx : perm_1) t1_perm.push_back(idx);
    } else { // identity
        for (int64_t i = 0; i < n_cols_1; ++i) t1_perm.push_back(i);
    }
    t1_config.push_back(t1_types);
    t1_config.push_back(t1_perm);
    j[tensor_name_1] = t1_config;

    nlohmann::json t2_config = nlohmann::json::array();
    // Use different types for the second tensor for variety
    nlohmann::json t2_types = nlohmann::json::array({GGML_TYPE_Q8_0, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K});
    nlohmann::json t2_perm = nlohmann::json::array();
    if (!perm_2.empty()) {
        for (int32_t idx : perm_2) t2_perm.push_back(idx);
    } else { // identity
        for (int64_t i = 0; i < n_cols_2; ++i) t2_perm.push_back(i);
    }
    t2_config.push_back(t2_types);
    t2_config.push_back(t2_perm);
    j[tensor_name_2] = t2_config;

    std::ofstream ofs(filename);
    if (!ofs.is_open()) {
        fprintf(stderr, "Failed to open %s for writing.\n", filename.c_str());
        return false;
    }
    ofs << j.dump(4);
    ofs.close();
    printf(" Successfully created dummy smarterquant JSON: %s\n", filename.c_str());
    return true;
}

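// Test-local helper, not a library API: computes the packed byte size of one SmarterQuant
// row. This mirrors the layout that llama_tensor_quantize_smarter_blocks is assumed to
// write: 256-column segments stored back-to-back, with segments past the fourth reusing
// the last of the four compression types. Used by the per-row checks in main() below.
static size_t sq_row_byte_size(const ggml_tensor * t, int64_t n_cols) {
    size_t row_bytes = 0;
    for (int64_t c_seg = 0; c_seg < n_cols; c_seg += 256) {
        const int64_t   seg_cols         = std::min((int64_t) 256, n_cols - c_seg);
        const int       block_idx_in_row = (int) (c_seg / 256);
        const ggml_type seg_type         = (block_idx_in_row < 4)
            ? (ggml_type) t->sq_info->compression_types[block_idx_in_row]
            : (ggml_type) t->sq_info->compression_types[3];
        row_bytes += ggml_type_size(seg_type) * (seg_cols / ggml_blck_size(seg_type));
    }
    return row_bytes;
}
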
int main(int argc, char **argv) {
    GGML_UNUSED(argc);
    GGML_UNUSED(argv);
    printf("Testing SmarterQuant GGUF end-to-end functionality...\n");
    int overall_status = 0;

    llama_backend_init(); // initialize llama.cpp backends before any model loading

    const std::string dummy_fp32_gguf_name      = "dummy_fp32_input.gguf";
    const std::string dummy_sq_json_name        = "dummy_smarterquant.json";
    const std::string dummy_quantized_gguf_name = "dummy_quantized_output.gguf";

    // Define tensors for the dummy model
    std::string tensor1_name = "tensor_one";
    std::vector<int64_t> tensor1_dims = {512, 2}; // 512 cols, 2 rows
    std::vector<float> tensor1_data(tensor1_dims[0] * tensor1_dims[1]);
    for (size_t i = 0; i < tensor1_data.size(); ++i) tensor1_data[i] = static_cast<float>(i % 128) - 64.f;

    std::string tensor2_name = "tensor_two";
    std::vector<int64_t> tensor2_dims = {1280, 1}; // 1280 cols, 1 row (5 segments of 256)
    std::vector<float> tensor2_data(tensor2_dims[0] * tensor2_dims[1]);
    for (size_t i = 0; i < tensor2_data.size(); ++i) tensor2_data[i] = static_cast<float>((i % 200) * (i % 2 == 0 ? 1 : -1)) * 0.5f;

    // Create dummy FP32 GGUF
    if (!create_dummy_fp32_gguf(dummy_fp32_gguf_name,
                                {{tensor1_name, tensor1_dims}, {tensor2_name, tensor2_dims}},
                                {tensor1_data, tensor2_data})) {
        fprintf(stderr, "Failed to create dummy FP32 GGUF.\n");
        return 1;
    }

    // Create dummy smarterquant.json
    std::vector<int32_t> perm1(tensor1_dims[0]); // Reversed permutation for tensor1
    for (int64_t i = 0; i < tensor1_dims[0]; ++i) perm1[i] = (tensor1_dims[0] - 1) - i;
    std::vector<int32_t> perm2; // Identity for tensor2 (empty means identity in the helper)

    if (!create_dummy_smarterquant_json(dummy_sq_json_name,
                                        tensor1_name, tensor1_dims[0], perm1,
                                        tensor2_name, tensor2_dims[0], perm2)) {
        fprintf(stderr, "Failed to create dummy smarterquant.json.\n");
        remove(dummy_fp32_gguf_name.c_str());
        return 1;
    }

    // Quantize
    printf(" Attempting quantization...\n");
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype = LLAMA_FTYPE_MOSTLY_Q8_0;                      // Base type; SmarterQuant overrides it per tensor
    qparams.smarter_quant_json_path = dummy_sq_json_name.c_str(); // Point quantization at our JSON config

    // A kv_overrides vector is required even when empty, so that the SmarterQuant
    // metadata can be appended to it during quantization.
    std::vector<llama_model_kv_override> kv_overrides;
    kv_overrides.emplace_back();
    kv_overrides.back().key[0] = 0; // empty key terminates the list
    qparams.kv_overrides = &kv_overrides;

    // Declared before the first `goto cleanup` so the jumps below do not cross an initialization.
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = nullptr;

    try {
        llama_model_quantize_impl(dummy_fp32_gguf_name, dummy_quantized_gguf_name, &qparams);
        printf(" Quantization call completed.\n");
    } catch (const std::exception& e) {
        fprintf(stderr, " ERROR: Quantization failed with exception: %s\n", e.what());
        overall_status = 1;
        goto cleanup;
    }

    // Load the quantized model and verify
    printf(" Loading quantized GGUF and verifying...\n");
    model = llama_load_model_from_file(dummy_quantized_gguf_name.c_str(), mparams);

    if (!model) {
        fprintf(stderr, " ERROR: Failed to load quantized GGUF model %s.\n", dummy_quantized_gguf_name.c_str());
        overall_status = 1;
        goto cleanup;
    }

    // Verify tensor1
    {
        const ggml_tensor* t1 = llama_get_model_tensor(model, tensor1_name.c_str());
        if (!t1) {
            fprintf(stderr, " ERROR: Tensor '%s' not found in quantized model.\n", tensor1_name.c_str());
            overall_status = 1;
        } else {
            if (!t1->sq_info || !t1->sq_info->enabled) {
                fprintf(stderr, " ERROR: Tensor '%s' does not have SmarterQuant info enabled after loading.\n", tensor1_name.c_str());
                overall_status = 1;
            } else {
                printf(" Tensor '%s' SmarterQuant info loaded successfully.\n", tensor1_name.c_str());
                // Check types (example for first block)
                if (t1->sq_info->compression_types[0] != GGML_TYPE_Q4_0) {
                    fprintf(stderr, " ERROR: Tensor '%s' expected type0 %d, got %d.\n", tensor1_name.c_str(), GGML_TYPE_Q4_0, t1->sq_info->compression_types[0]);
                    overall_status = 1;
                }
                // Check permutation (example for first element)
                if (t1->sq_info->column_permutation[0] != perm1[0]) {
                    fprintf(stderr, " ERROR: Tensor '%s' expected perm[0] %d, got %d.\n", tensor1_name.c_str(), perm1[0], t1->sq_info->column_permutation[0]);
                    overall_status = 1;
                }

                // Numerical check for tensor1. Rows are dequantized directly instead of going
                // through ggml_compute_forward; llama_tensor_quantize_smarter_blocks writes row
                // data sequentially, so each row starts at r * row_byte_size.
                std::vector<float> t1_dequant_data(tensor1_dims[0] * tensor1_dims[1]);
                const size_t t1_row_byte_size = sq_row_byte_size(t1, tensor1_dims[0]);
                for (int64_t r = 0; r < tensor1_dims[1]; ++r) {
                    const char * t1_row_data = (const char*) t1->data + r * t1_row_byte_size;
                    float* t1_dequant_row_ptr = t1_dequant_data.data() + r * tensor1_dims[0];
                    ggml_get_rows_smarterquant(t1, t1_row_data, t1_dequant_row_ptr);
                }
                if (!compare_float_arrays(tensor1_data.data(), t1_dequant_data.data(), tensor1_data.size(), 0.15f, tensor1_name)) {
                    fprintf(stderr, " ERROR: Numerical mismatch for tensor '%s'.\n", tensor1_name.c_str());
                    overall_status = 1;
                } else {
                    printf(" Tensor '%s' numerical check PASSED.\n", tensor1_name.c_str());
                }
            }
        }
    }

    // Verify tensor2
    {
        const ggml_tensor* t2 = llama_get_model_tensor(model, tensor2_name.c_str());
        if (!t2) {
            fprintf(stderr, " ERROR: Tensor '%s' not found in quantized model.\n", tensor2_name.c_str());
            overall_status = 1;
        } else {
            if (!t2->sq_info || !t2->sq_info->enabled) {
                fprintf(stderr, " ERROR: Tensor '%s' does not have SmarterQuant info enabled after loading.\n", tensor2_name.c_str());
                overall_status = 1;
            } else {
                printf(" Tensor '%s' SmarterQuant info loaded successfully.\n", tensor2_name.c_str());
                // Check types (first block should match dummy_smarterquant.json)
                if (t2->sq_info->compression_types[0] != GGML_TYPE_Q8_0) {
                    fprintf(stderr, " ERROR: Tensor '%s' expected type0 %d, got %d.\n", tensor2_name.c_str(), GGML_TYPE_Q8_0, t2->sq_info->compression_types[0]);
                    overall_status = 1;
                }
                // Check permutation (identity for tensor2)
                if (t2->sq_info->column_permutation[0] != 0) {
                    fprintf(stderr, " ERROR: Tensor '%s' expected perm[0] 0 (identity), got %d.\n", tensor2_name.c_str(), t2->sq_info->column_permutation[0]);
                    overall_status = 1;
                }

                // Numerical check for tensor2 (same sequential row-layout assumption as tensor1)
                std::vector<float> t2_dequant_data(tensor2_dims[0] * tensor2_dims[1]);
                const size_t t2_row_byte_size = sq_row_byte_size(t2, tensor2_dims[0]);
                for (int64_t r = 0; r < tensor2_dims[1]; ++r) {
                    const char * t2_row_data = (const char*) t2->data + r * t2_row_byte_size;
                    float* t2_dequant_row_ptr = t2_dequant_data.data() + r * tensor2_dims[0];
                    ggml_get_rows_smarterquant(t2, t2_row_data, t2_dequant_row_ptr);
                }
                if (!compare_float_arrays(tensor2_data.data(), t2_dequant_data.data(), tensor2_data.size(), 0.15f, tensor2_name)) {
                    fprintf(stderr, " ERROR: Numerical mismatch for tensor '%s'.\n", tensor2_name.c_str());
                    overall_status = 1;
                } else {
                    printf(" Tensor '%s' numerical check PASSED.\n", tensor2_name.c_str());
                }
            }
        }
    }

cleanup:
    // Free the model here so the early-exit paths above also release it (model is
    // nullptr-safe to check after its up-front initialization).
    if (model) {
        llama_free_model(model);
    }

    // Clean up dummy files
    remove(dummy_fp32_gguf_name.c_str());
    remove(dummy_sq_json_name.c_str());
    remove(dummy_quantized_gguf_name.c_str());

    llama_backend_free();

    printf("SmarterQuant GGUF end-to-end test finished.\n");
    return overall_status;
}