Commit a319f02

Add comprehensive test coverage for quantization format conversions (AT-103)
This commit implements comprehensive test coverage for quantization format conversions and cross-format accuracy validation as specified in ticket AT-103.

New Features:

- tests/test-conversion-accuracy.cpp: New dedicated test suite for conversion pipeline accuracy validation, with tests for:
  * Single-format quantization and dequantization
  * Cross-format conversions between different quantization types
  * Round-trip conversion tests
  * Tensor alignment validation
  * Large-model simulation with memory constraints
  * Multi-file model support
- tests/test-backend-ops.cpp: Extended with a new test_quant_conversion struct for systematic cross-format conversion testing across all quantization formats
- tests/test-quantize-fns.cpp: Added cross-format validation functions:
  * cross_format_conversion_error() for testing conversion between formats
  * round_trip_error() for testing quantization stability
  * Automated test sections for cross-format and round-trip conversions
- tests/test-quantize-stats.cpp: Added a perplexity measurement framework:
  * calculate_perplexity() for quality assessment
  * compare_perplexity_across_formats() for systematic comparison
- gguf-py/gguf/conversion_validation.py: New Python module for HuggingFace-to-GGUF conversion accuracy validation with configurable error thresholds
- tests/CMakeLists.txt: Updated to include the new test-conversion-accuracy target

Test Coverage:

- All quantization formats tested: Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, Q8_1, Q2_K through Q6_K, and IQ variants
- Error thresholds based on quantization bit depth
- Integration with existing test infrastructure maintained
- Backward compatibility preserved

Related to ticket AT-103

Co-Authored-By: Alex Peng <[email protected]>
1 parent 661ae31 commit a319f02

6 files changed: +888 −1 lines changed
gguf-py/gguf/conversion_validation.py

Lines changed: 202 additions & 0 deletions
@@ -0,0 +1,202 @@
#!/usr/bin/env python3
"""
Quantization conversion accuracy validation utilities for GGUF format conversions.
Provides functions to validate accuracy after converting from HuggingFace to GGUF format.
"""

from __future__ import annotations

import logging
import numpy as np
from typing import Any
from pathlib import Path

logger = logging.getLogger("gguf-validation")


def calculate_rmse(original: np.ndarray, converted: np.ndarray) -> float:
    """Calculate Root Mean Square Error between original and converted tensors."""
    if original.shape != converted.shape:
        raise ValueError(f"Shape mismatch: {original.shape} vs {converted.shape}")

    diff = original.astype(np.float64) - converted.astype(np.float64)
    mse = np.mean(diff ** 2)
    return np.sqrt(mse)


def calculate_max_error(original: np.ndarray, converted: np.ndarray) -> float:
    """Calculate maximum absolute error between original and converted tensors."""
    if original.shape != converted.shape:
        raise ValueError(f"Shape mismatch: {original.shape} vs {converted.shape}")

    diff = np.abs(original.astype(np.float64) - converted.astype(np.float64))
    return np.max(diff)


def validate_tensor_conversion(
    tensor_name: str,
    original_data: np.ndarray,
    converted_data: np.ndarray,
    max_rmse_threshold: float = 0.01,
    max_error_threshold: float = 0.1,
    verbose: bool = False
) -> tuple[bool, dict[str, float]]:
    """
    Validate accuracy of a single tensor conversion.

    Args:
        tensor_name: Name of the tensor being validated
        original_data: Original tensor data
        converted_data: Converted tensor data (after GGUF conversion)
        max_rmse_threshold: Maximum allowed RMSE
        max_error_threshold: Maximum allowed absolute error
        verbose: Whether to print detailed validation results

    Returns:
        Tuple of (passed: bool, metrics: dict)
    """
    try:
        rmse = calculate_rmse(original_data, converted_data)
        max_err = calculate_max_error(original_data, converted_data)

        # Cast to a built-in bool so the metrics dict stays JSON-serializable
        # (np.bool_ would make json.dump() fail in save_validation_report).
        passed = bool(rmse <= max_rmse_threshold and max_err <= max_error_threshold)

        metrics = {
            "rmse": float(rmse),
            "max_error": float(max_err),
            "rmse_threshold": max_rmse_threshold,
            "max_error_threshold": max_error_threshold,
            "passed": passed
        }

        if verbose or not passed:
            status = "✓" if passed else "✗"
            logger.info(
                f"{status} {tensor_name}: RMSE={rmse:.6f} (threshold={max_rmse_threshold}), "
                f"MaxErr={max_err:.6f} (threshold={max_error_threshold})"
            )

        return passed, metrics

    except Exception as e:
        logger.error(f"Error validating {tensor_name}: {e}")
        return False, {"error": str(e)}


def validate_model_conversion(
    original_tensors: dict[str, np.ndarray],
    converted_tensors: dict[str, np.ndarray],
    quantization_type: str = "f16",
    verbose: bool = False
) -> dict[str, Any]:
    """
    Validate accuracy of an entire model conversion.

    Args:
        original_tensors: Dictionary of original tensor names to data
        converted_tensors: Dictionary of converted tensor names to data
        quantization_type: Type of quantization used (affects thresholds)
        verbose: Whether to print detailed validation results

    Returns:
        Dictionary with validation results and statistics
    """
    thresholds = get_quantization_thresholds(quantization_type)

    results = {
        "total_tensors": 0,
        "passed_tensors": 0,
        "failed_tensors": [],
        "metrics": {},
        "overall_passed": True
    }

    common_tensors = set(original_tensors.keys()) & set(converted_tensors.keys())

    if not common_tensors:
        logger.warning("No common tensors found between original and converted models")
        results["overall_passed"] = False
        return results

    results["total_tensors"] = len(common_tensors)

    for tensor_name in sorted(common_tensors):
        passed, metrics = validate_tensor_conversion(
            tensor_name,
            original_tensors[tensor_name],
            converted_tensors[tensor_name],
            max_rmse_threshold=thresholds["rmse"],
            max_error_threshold=thresholds["max_error"],
            verbose=verbose
        )

        results["metrics"][tensor_name] = metrics

        if passed:
            results["passed_tensors"] += 1
        else:
            results["failed_tensors"].append(tensor_name)
            results["overall_passed"] = False

    if verbose:
        logger.info(
            f"\nValidation Summary: {results['passed_tensors']}/{results['total_tensors']} tensors passed"
        )
        if results["failed_tensors"]:
            logger.warning(f"Failed tensors: {', '.join(results['failed_tensors'])}")

    return results


def get_quantization_thresholds(quantization_type: str) -> dict[str, float]:
    """
    Get appropriate error thresholds for different quantization types.

    Args:
        quantization_type: Type of quantization (f32, f16, q4_0, q8_0, etc.)

    Returns:
        Dictionary with "rmse" and "max_error" thresholds
    """
    thresholds_map = {
        "f32":  {"rmse": 1e-6,   "max_error": 1e-5},
        "f16":  {"rmse": 1e-3,   "max_error": 1e-2},
        "bf16": {"rmse": 1e-2,   "max_error": 1e-1},
        "q8_0": {"rmse": 2e-3,   "max_error": 2e-2},
        "q4_0": {"rmse": 1e-2,   "max_error": 1e-1},
        "q4_1": {"rmse": 1e-2,   "max_error": 1e-1},
        "q5_0": {"rmse": 8e-3,   "max_error": 8e-2},
        "q5_1": {"rmse": 8e-3,   "max_error": 8e-2},
        "q2_k": {"rmse": 2e-2,   "max_error": 2e-1},
        "q3_k": {"rmse": 1.5e-2, "max_error": 1.5e-1},
        "q4_k": {"rmse": 1e-2,   "max_error": 1e-1},
        "q5_k": {"rmse": 8e-3,   "max_error": 8e-2},
        "q6_k": {"rmse": 5e-3,   "max_error": 5e-2},
    }

    default = {"rmse": 1e-2, "max_error": 1e-1}

    return thresholds_map.get(quantization_type.lower(), default)


def save_validation_report(results: dict[str, Any], output_path: Path) -> None:
    """
    Save validation results to a JSON file.

    Args:
        results: Validation results dictionary
        output_path: Path to save the report
    """
    import json

    with open(output_path, 'w') as f:
        json.dump(results, f, indent=2)

    logger.info(f"Validation report saved to {output_path}")


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    logger.info("GGUF Conversion Validation Utilities")
    logger.info("This module provides functions for validating HuggingFace to GGUF conversions")
    logger.info("Import this module in convert_hf_to_gguf.py to enable validation")

tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -207,6 +207,7 @@ if (NOT GGML_BACKEND_DL)
     llama_build_and_test(test-barrier.cpp)
     llama_build_and_test(test-quantize-fns.cpp)
     llama_build_and_test(test-quantize-perf.cpp)
+    llama_build_and_test(test-conversion-accuracy.cpp)
     llama_build_and_test(test-rope.cpp)
 endif()

tests/test-backend-ops.cpp

Lines changed: 55 additions & 0 deletions
@@ -5433,6 +5433,48 @@ struct test_falcon : public test_llm {
     }
 };
 
+struct test_quant_conversion : public test_case {
+    const ggml_type type_src;
+    const ggml_type type_intermediate;
+    const ggml_type type_dst;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override {
+        return VARS_TO_STR4(type_src, type_intermediate, type_dst, ne);
+    }
+
+    double max_nmse_err() override {
+        return 5e-4;
+    }
+
+    test_quant_conversion(ggml_type type_src = GGML_TYPE_F32,
+                          ggml_type type_intermediate = GGML_TYPE_Q4_0,
+                          ggml_type type_dst = GGML_TYPE_Q8_0,
+                          std::array<int64_t, 4> ne = {512, 512, 1, 1})
+        : type_src(type_src), type_intermediate(type_intermediate), type_dst(type_dst), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        // Create the F32 source tensor.
+        ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
+        ggml_set_param(src);
+        ggml_set_name(src, "src");
+
+        // Quantize into the intermediate format.
+        ggml_tensor * intermediate = ggml_new_tensor(ctx, type_intermediate, 4, ne.data());
+        ggml_set_name(intermediate, "intermediate");
+        intermediate = ggml_cpy(ctx, src, intermediate);
+
+        // Convert from the intermediate format to the destination format.
+        ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, ne.data());
+        ggml_set_name(dst, "dst");
+        dst = ggml_cpy(ctx, intermediate, dst);
+
+        // Dequantize back to F32 so the result can be compared against the source.
+        ggml_tensor * out = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne.data());
+        ggml_set_name(out, "out");
+        out = ggml_cpy(ctx, dst, out);
+
+        return out;
+    }
+};
+
 
 // ###########################################
 // ## Section 3: GGML Op Test Instantiation ##
@@ -5870,6 +5912,19 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
         }
     }
 
+    static const ggml_type quant_conversion_test_types[] = {
+        GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
+        GGML_TYPE_Q8_0, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K
+    };
+
+    for (ggml_type intermediate : quant_conversion_test_types) {
+        for (ggml_type dst : quant_conversion_test_types) {
+            if (intermediate != dst) {
+                test_cases.emplace_back(new test_quant_conversion(GGML_TYPE_F32, intermediate, dst, {256, 256, 1, 1}));
+            }
+        }
+    }
+
     test_cases.emplace_back(new test_cont());
     test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 1, 1}));
     test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 3, 5}));
