
Commit eab8708

Minor factoring for efficiency and correctness
1 parent 556f6b0 commit eab8708


src/llama-quant.cpp

Lines changed: 60 additions & 66 deletions
@@ -596,7 +596,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float *
     return new_size;
 }
 
-// Returns per-tensor type overrides to meet target BPW at lowest error
+// Returns tensor type overrides to meet a global bpw target
 static std::unordered_map<std::string, ggml_type> target_bpw_type(
         llama_model_loader & ml,
         std::vector<no_init<uint8_t>> & buffer,
@@ -650,6 +650,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     };
 
     constexpr double epsilon = 1e-12;
+    constexpr double infinity = std::numeric_limits<double>::infinity();
 
     auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t {
         const int64_t n_per_row = t->ne[0];
@@ -680,7 +681,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
     auto name_tn = LLM_TN(model.arch);
     auto can_quantize = [&](const ggml_tensor * t) -> bool {
-        // This list should be kept in sync with llama_tensor_quantize_impl()
+        // This list should be kept in sync with llama_tensor_quantize_impl() to avoid drift
         const std::string name = ggml_get_name(t);
         bool q = name.rfind("weight") == name.size() - 6;
         q &= ggml_n_dims(t) >= 2;
@@ -730,9 +731,15 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1;
 
         const size_t sample_element_count = f32_sample.size();
-        const size_t sample_row_count = sample_element_count / (size_t)n_per_row;
+        const size_t sample_row_count = n_per_row > 0 ? sample_element_count / (size_t)n_per_row : 0;
         if (sample_row_count == 0) { return 0.0; }
 
+        size_t expected_rows = 0;
+        for (int64_t s = 0; s < ne2; ++s) {
+            expected_rows += (size_t)sample_rows_per_slice[s];
+        }
+        if (expected_rows != sample_row_count) { return infinity; }
+
         const size_t row_sz = ggml_row_size(quant_type, n_per_row);
         const size_t buffer_sz = row_sz * sample_row_count;
 
@@ -750,15 +757,15 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 const float * activations = activations_sample + s * n_per_row;
                 double denom = 0.0;
                 for (int64_t j = 0; j < n_per_row; ++j) {
+                    const double w = values ? std::max(0.0f, values[j]) : 1.0;
                     const double a = activations[j];
-                    const double w = values ? values[j] : 1.0;
                     denom += w * a * a;
                 }
                 bias_denominator_per_slice[s] = denom;
             }
         }
 
-        // Compute per-row squared norms with weighting (if values are provided)
+        // Per-row squared norms with weighting
         std::vector<double> row_sq_norm(sample_row_count, 0.0);
         {
             size_t offset = 0;
@@ -768,15 +775,14 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 if (rs == 0) { continue; }
 
                 const float * values = has_values ? values_sample + s * n_per_row : nullptr;
-
                 for (int64_t r = 0; r < rs; ++r, ++row_idx) {
                     const float * x = f32_sample.data() + offset;
                     double rsn = 0.0;
                     if (values) {
                         for (int64_t j = 0; j < n_per_row; ++j) {
-                            const double v = values[j];
+                            const double w = std::max(0.0f, values[j]);
                             const double xx = x[j];
-                            rsn += v * xx * xx;
+                            rsn += w * xx * xx;
                         }
                     } else {
                         for (int64_t j = 0; j < n_per_row; ++j) {
@@ -790,7 +796,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             }
         }
 
-        // Quantize sampled rows slice-by-slice into quantized_buffer
+        // Quantize sampled rows per slice -> quantized_buffer
         {
             size_t q_offset = 0;
             size_t f_offset = 0;
@@ -800,70 +806,66 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
                 const float * value = has_values ? values_sample + slice * n_per_row : nullptr;
                 (void)ggml_quantize_chunk(quant_type, f32_sample.data() + f_offset, quantized_buffer.data() + q_offset, 0, rs, n_per_row, value);
-
                 q_offset += row_sz * (size_t)rs;
                 f_offset += (size_t)rs * (size_t)n_per_row;
             }
         }
 
-        // Dequantize into dequantized_buffer
+        // quantized_buffer -> dequantized_buffer
         {
             const ggml_type_traits * traits = ggml_get_type_traits(quant_type);
-            auto row_to_float = [&](size_t r) {
-                uint8_t * src = quantized_buffer.data() + r * row_sz;
-                float * dst = dequantized_buffer.data() + r * (size_t)n_per_row;
-                if (quant_type == GGML_TYPE_F16) {
-                    ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row);
-                } else if (quant_type == GGML_TYPE_BF16) {
-                    ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row);
-                } else {
-                    if (!traits || !traits->to_float) {
-                        LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(quant_type));
-                        return false;
+
+            const bool is_fp16 = quant_type == GGML_TYPE_F16;
+            const bool is_bf16 = quant_type == GGML_TYPE_BF16;
+            if (!is_fp16 && !is_bf16 && traits && traits->to_float) {
+                traits->to_float(quantized_buffer.data(), dequantized_buffer.data(), (int)(sample_row_count * (size_t)n_per_row));
+            } else {
+                for (size_t r = 0; r < sample_row_count; ++r) {
+                    uint8_t * src = quantized_buffer.data() + r * row_sz;
+                    float * dst = dequantized_buffer.data() + r * (size_t) n_per_row;
+                    if (is_fp16) {
+                        ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int)n_per_row);
+                    } else if (is_bf16) {
+                        ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int)n_per_row);
+                    } else {
+                        if (!traits || !traits->to_float) { return infinity; }
+                        traits->to_float(src, dst, (int)n_per_row);
                     }
-                    traits->to_float(src, dst, (int)n_per_row);
                 }
-
-                return true;
-            };
-
-            for (size_t r = 0; r < sample_row_count; ++r) {
-                if (!row_to_float(r)) { return 1e35; }
             }
         }
 
         // Compute error
         size_t offset = 0;
         size_t row_idx = 0;
         double total_err = 0.0;
+
         for (int64_t slice = 0; slice < ne2; ++slice) {
             const int64_t rs = sample_rows_per_slice[slice];
             if (rs == 0) { continue; }
 
             const float * values = has_values ? values_sample + slice * n_per_row : nullptr;
             const float * activations = has_activations ? activations_sample + slice * n_per_row : nullptr;
             const double bias_denom = has_activations ? bias_denominator_per_slice[slice] : 0.0;
-
             double slice_err = 0.0;
-
             for (int64_t r = 0; r < rs; ++r, ++row_idx) {
                 const float * x = f32_sample.data() + offset;
                 const float * y = dequantized_buffer.data() + offset;
                 double weighted_mse = 0.0;
                 double bias_num = 0.0;
                 if (values && activations) {
                     for (int64_t j = 0; j < n_per_row; ++j) {
-                        const double v = values[j];
+                        const double w = std::max(0.0f, values[j]);
                         const double e = y[j] - x[j];
                         const double a = activations[j];
-                        weighted_mse += v * e * e;
-                        bias_num += v * e * a;
+                        weighted_mse += w * e * e;
+                        bias_num += w * e * a;
                     }
                 } else if (values) {
                     for (int64_t j = 0; j < n_per_row; ++j) {
-                        const double v = values[j];
+                        const double w = std::max(0.0f, values[j]);
                         const double e = y[j] - x[j];
-                        weighted_mse += v * e * e;
+                        weighted_mse += w * e * e;
                     }
                 } else if (activations) {
                     for (int64_t j = 0; j < n_per_row; ++j) {
@@ -881,26 +883,28 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
                 double err_num = weighted_mse;
                 if (activations && bias_lambda != 0.0f) {
-                    const double proj = bias_num * bias_num / (bias_denom + epsilon);
-                    err_num += (double)bias_lambda * proj;
+                    if (bias_denom > 0.0) {
+                        const double proj = bias_num * bias_num / (bias_denom + epsilon);
+                        err_num += bias_lambda * proj;
+                    }
                 }
 
-                const double err_den = row_sq_norm[row_idx] + epsilon;
-                slice_err += err_num / err_den;
+                const double denom = row_sq_norm[row_idx] + epsilon;
+                slice_err += err_num / denom;
                 offset += (size_t)n_per_row;
             }
 
             const double scale_rows = (double)nrows / std::max(1.0, (double)rs);
             total_err += slice_err * scale_rows;
+            if (!std::isfinite(total_err)) { return infinity; }
         }
 
-        return std::isfinite(total_err) ? total_err : 1e35;
+        return std::isfinite(total_err) ? total_err : infinity;
     };
 
+    // Scaling factor to increase lambda when activations are concentrated
     auto directional_scale = [&](const float * values, const float * activations, int64_t n_per_row) {
         if (!activations) { return 1.0f; }
-        // Compute dominance = ||sqrt(v).*a||_2 / (RMS(a)*sqrt(sum(v)))
-        // If no values, use v=1
         double sum_v = 0.0;
         double sum_aw2 = 0.0;
         double sum_a2 = 0.0;
@@ -915,24 +919,19 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         const double denom = std::sqrt(std::max(epsilon, sum_v)) * std::max(epsilon, rms_a);
         const double scale = denom > 0.0 ? std::sqrt(sum_aw2) / denom : 1.0;
 
-        // Clamp to a reasonable range
         return (float)std::clamp(scale, 0.5, 2.0);
     };
 
-    // Returns an adaptive lambda for this tensor using a small probe set
-    // bias_lambda adjusts the trade-off between systematic bias (introduced by block-wise scaling) and MSE
-    // larger value favours quantisation types that produce smaller bias even if the MSE is slightly larger
+    // Higher precision but much longer to compute
     auto precise_lambda = [&](const ggml_tensor * t,
         const std::vector<float> & f32_sample,
         const std::vector<int64_t> & sample_rows_per_slice,
         const float * values,
         const float * activations,
         const std::vector<ggml_type> & compatible_candidates) -> float
     {
-        // No activations => no projection term
         if (!activations) { return 0.0f; }
 
-        // pick a tiny probe set: try to spread around mid-range types
         std::vector<ggml_type> probes;
         probes.reserve(3);
         auto push_if = [&](const ggml_type tiny) {
@@ -941,7 +940,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             }
         };
 
-        // Prefer family-consistent probes; fall back to whatever exists
         push_if(GGML_TYPE_Q4_K);
         push_if(GGML_TYPE_Q3_K);
         push_if(GGML_TYPE_Q5_K);
@@ -953,19 +951,18 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         }
         if (probes.empty()) { return 0.0f; }
 
-        // Scratch buffers (reused)
+        // Scratch buffers
         const int64_t n_per_row = t->ne[0];
         const size_t total_sampled_rows = f32_sample.size() / n_per_row;
         size_t max_row_sz = 0;
         for (auto pt : probes) {
             max_row_sz = std::max(max_row_sz, ggml_row_size(pt, n_per_row));
         }
+
         std::vector<uint8_t> quantized_buffer(max_row_sz * total_sampled_rows);
         std::vector<float> dequantized_buffer(f32_sample.size());
-
         std::vector<double> ratios;
         ratios.reserve(probes.size());
-
         for (const auto pt : probes) {
             // err at lambda=0 => pure weighted MSE part
             double err0 = estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 0.0f);
@@ -984,17 +981,17 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         std::nth_element(ratios.begin(), ratios.begin() + ratios.size() / 2, ratios.end());
         double lambda = ratios[ratios.size() / 2];
 
-        // activations directional scale
         const float scale = directional_scale(values, activations, n_per_row);
         lambda *= scale;
-
-        // clamp to safe range
         lambda = std::clamp(lambda, 0.0, 8.0);
+
         return (float)lambda;
     };
 
+    // Faster to compute but lower precision. Best option for the vast majority of models
     auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row) {
         if (!activations) { return 0.0f; }
+
         double s = 0.0;
         double s2 = 0.0;
         for (int64_t j = 0; j < n_per_row; ++j) {
@@ -1004,17 +1001,13 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             s += aw2;
             s2 += aw2 * aw2;
         }
+
         if (s2 <= 0.0) { return 0.0f; }
         const auto d = (double)n_per_row;
-        //const double p = s * s / (d * s2 + epsilon);
-        //const double lambda = 8.0 * std::clamp(1.0 - p, 0.0, 1.0);
-        // Map p in (0,1] to lambda in [0,8] decreasing
         double base = 1.0 - s * s / (d * s2 + epsilon);
         base = std::clamp(base, 0.0, 1.0);
 
-        // activations directional scale
         const double scale = directional_scale(values, activations, n_per_row);
-        // clamp to safe range
         const double lambda = std::clamp(base * scale, 0.0, 1.0) * 8.0;
 
         return (float)lambda;
@@ -1036,13 +1029,13 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         }
         ml.load_data_for(t);
 
-        // Dequantize only sampled rows into f32_sample
+        // Dequantize sampled rows into f32_sample
        const int64_t n_per_row = t->ne[0];
        const int64_t nrows_total = t->ne[1];
        const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1;
 
-        // Larger sample_rows_per_expert values may result in more accurate error estimates, but will take longer to compute
-        constexpr int sample_rows_per_expert = 384;
+        // Larger sample_rows_per_expert values may result in more accurate error estimates, but it will take much longer to compute
+        constexpr int sample_rows_per_expert = 256;
         std::vector<float> f32_sample;
         f32_sample.reserve((size_t)ne2 * (size_t)std::min<int64_t>(nrows_total, sample_rows_per_expert) * (size_t)n_per_row);
 
@@ -1096,6 +1089,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             const std::string key = remap_imatrix(tensor_name, mapped);
             const auto it = m->find(key);
             if (it == m->end()) { return {nullptr, 0}; }
+
             return { it->second.data(), it->second.size() };
         };
 
@@ -1104,7 +1098,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             const size_t want = (size_t)ne2 * (size_t)n_per_row;
             dst.clear();
             if (!src || src_sz == 0) { return; }
-
             if (src_sz == want) {
                 dst.resize(want);
                 std::memcpy(dst.data(), src, want * sizeof(float));
@@ -1160,7 +1153,8 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         std::sort(compatible_candidates.begin(), compatible_candidates.end());
         compatible_candidates.erase(std::unique(compatible_candidates.begin(), compatible_candidates.end()), compatible_candidates.end());
 
-        // Compute adaptive bias_lambda for this tensor
+        // Adjusts the trade-off between systematic bias (introduced by block-wise scaling) and MSE.
+        // Larger values favour quantisation types that produce smaller bias even if the MSE is slightly bigger
         float bias_lambda = 0.0f;
         {
             const float * values = values_sample.empty() ? nullptr : values_sample.data();
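
For readers skimming the diff, here is a minimal standalone sketch (not part of this commit and not the llama.cpp API; row_error, w, a, and lambda are illustrative names) of the per-row error metric the patch refines: importance weights are clamped to be non-negative, the bias-projection term is only added when its denominator is positive, and invalid or non-finite results map to infinity instead of an arbitrary large constant.

    // Hypothetical, self-contained illustration; compiles with any C++17 compiler.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <limits>
    #include <vector>

    // x = original row, y = dequantized row, w = importance values (clamped, since a
    // malformed imatrix may contain negatives), a = mean activations, lambda = bias weight.
    static double row_error(const std::vector<float> & x, const std::vector<float> & y,
                            const std::vector<float> & w, const std::vector<float> & a,
                            double lambda) {
        constexpr double epsilon  = 1e-12;
        constexpr double infinity = std::numeric_limits<double>::infinity();
        if (x.size() != y.size()) { return infinity; }

        double weighted_mse = 0.0, bias_num = 0.0, bias_den = 0.0, row_sq_norm = 0.0;
        for (size_t j = 0; j < x.size(); ++j) {
            const double wj = j < w.size() ? std::max(0.0f, w[j]) : 1.0f;  // clamp negative weights
            const double aj = j < a.size() ? a[j] : 0.0f;
            const double e  = (double)y[j] - (double)x[j];
            weighted_mse += wj * e * e;
            bias_num     += wj * e * aj;
            bias_den     += wj * aj * aj;
            row_sq_norm  += wj * (double)x[j] * (double)x[j];
        }

        double err = weighted_mse;
        if (lambda != 0.0 && bias_den > 0.0) {
            err += lambda * bias_num * bias_num / (bias_den + epsilon);  // penalise systematic bias
        }
        const double rel = err / (row_sq_norm + epsilon);                 // relative to the row norm
        return std::isfinite(rel) ? rel : infinity;
    }

    int main() {
        const std::vector<float> x = {1.0f, -2.0f, 0.5f, 3.0f};
        const std::vector<float> y = {1.1f, -1.9f, 0.4f, 3.2f};
        const std::vector<float> w = {0.8f, -0.1f, 1.2f, 0.5f};  // the negative weight is clamped to 0
        const std::vector<float> a = {0.3f, 0.7f, 0.1f, 0.9f};
        std::printf("relative row error: %g\n", row_error(x, y, w, a, /*lambda=*/1.0));
        return 0;
    }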
