Commit a369469

Replace fast_bias() with a per-slice version and remove precise_bias()

1 parent 14fae69 commit a369469

File tree

1 file changed: +58 -109 lines changed

src/llama-quant.cpp

Lines changed: 58 additions & 109 deletions
@@ -868,8 +868,9 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         size_t row_idx = 0;
         double total_mse = 0.0;
         double total_proj = 0.0;
+        double total_bias = 0.0;
         for (int64_t slice = 0; slice < ne2; ++slice) {
-            const int64_t rs = sample_rows_per_slice[slice];
+            const int64_t rs = rows_sample[slice];
             if (rs == 0) { continue; }
 
             const float * values = has_values ? values_sample + slice * n_per_row : nullptr;
@@ -918,21 +919,24 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                     }
                     row_proj_norm.push_back(p_norm);
                 }
+
                 offset += (size_t)n_per_row;
             }
 
             // Trimmed sum to avoid outlier rows dominating the results
             auto trimmed_sum = [&](std::vector<double> & v) -> double {
                 if (v.empty()) { return 0.0; }
+
                 const int64_t n = (int64_t)v.size();
                 if (n < 50) {
                     double s = 0.0;
                     for (const double z : v) { s += z; }
+
                     return s;
                 }
 
-                int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% on each side
-                k = std::max<int64_t>(0, std::min<int64_t>(k, n / 32)); // but not more than 3.125%
+                int64_t k = (int64_t)std::floor(0.02 * (double)n); // trim 2% each side
+                k = std::max<int64_t>(0, std::min<int64_t>(k, n / 32)); // cap at ~3.125%
                 std::nth_element(v.begin(), v.begin() + k, v.end());
                 std::nth_element(v.begin() + k, v.begin() + (n - k), v.end());
                 double s = 0.0;
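
For reference, the trimming logic above reads as the following standalone sketch. Names are illustrative (the real code is a lambda inside target_bpw_type), the vector is taken by value here so the caller's data is not reordered, and the final summation loop is inferred from the surrounding code, which this hunk truncates at "double s = 0.0;".

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Trimmed sum: drop the k smallest and k largest entries (k = 2% per side,
// capped at n / 32, i.e. ~3.125%) and sum the remaining middle segment.
static double trimmed_sum_sketch(std::vector<double> v) {
    if (v.empty()) { return 0.0; }
    const int64_t n = (int64_t)v.size();
    if (n < 50) {
        double s = 0.0;
        for (const double z : v) { s += z; }
        return s; // too few samples to trim meaningfully
    }
    int64_t k = (int64_t)std::floor(0.02 * (double)n);      // trim 2% each side
    k = std::max<int64_t>(0, std::min<int64_t>(k, n / 32)); // cap at ~3.125%
    // Partition so v[k .. n-k) holds the middle values (order inside is arbitrary)
    std::nth_element(v.begin(), v.begin() + k, v.end());
    std::nth_element(v.begin() + k, v.begin() + (n - k), v.end());
    double s = 0.0;
    for (int64_t i = k; i < n - k; ++i) { s += v[(size_t)i]; }
    return s;
}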
@@ -944,11 +948,17 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             };
 
             const double scale_rows = (double)nrows / std::max(1.0, (double)rs);
+            const double slice_mse = trimmed_sum(row_mse_norm) * scale_rows;
+            const double slice_proj = activations ? trimmed_sum(row_proj_norm) * scale_rows : 0.0;
 
-            total_mse += trimmed_sum(row_mse_norm) * scale_rows;
-            if (activations) { total_proj += trimmed_sum(row_proj_norm) * scale_rows; }
+            total_mse += slice_mse;
+            total_proj += slice_proj;
 
-            if (!std::isfinite(total_mse) || !std::isfinite(total_proj)) {
+            // per-slice lambda if provided, otherwise use scalar
+            const double bl = slice_bias_lambda ? (double)std::max(0.0f, slice_bias_lambda[slice]) : (double)tensor_bias_lambda;
+            total_bias += bl * slice_proj;
+
+            if (!std::isfinite(total_mse) || !std::isfinite(total_proj) || !std::isfinite(total_bias)) {
                 if (out_mse) { *out_mse = infinity; }
                 if (out_proj) { *out_proj = 0.0; }
 
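The net effect of this hunk, together with the total_err change in the next one, is that the bias penalty can now vary per expert slice. A minimal sketch of the combined objective, assuming slice_proj holds the per-slice projection terms accumulated above (names here are illustrative, not helpers in the file):

#include <algorithm>
#include <cstddef>
#include <vector>

// err = total_mse + sum_s lambda[s] * proj[s]   when per-slice lambdas exist
// err = total_mse + lambda * sum_s proj[s]      with a single tensor-level lambda
static double combined_error_sketch(double total_mse,
                                    const std::vector<double> & slice_proj,
                                    const float * slice_lambda, // nullptr => scalar path
                                    float tensor_lambda) {
    double total_proj = 0.0;
    double total_bias = 0.0;
    for (std::size_t s = 0; s < slice_proj.size(); ++s) {
        total_proj += slice_proj[s];
        const double bl = slice_lambda ? std::max(0.0f, slice_lambda[s]) : tensor_lambda;
        total_bias += bl * slice_proj[s];
    }
    return slice_lambda ? total_mse + total_bias
                        : total_mse + (double)tensor_lambda * total_proj;
}
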
@@ -959,133 +969,76 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         if (out_mse) { *out_mse = total_mse; }
         if (out_proj) { *out_proj = total_proj; }
 
-        const double total_err = total_mse + bias_lambda * total_proj;
+        const double total_err = slice_bias_lambda ? total_mse + total_bias : total_mse + tensor_bias_lambda * total_proj;
+
         return std::isfinite(total_err) ? total_err : infinity;
     };
 
-    // Higher precision but longer to compute
-    auto precise_lambda = [&](const ggml_tensor * t,
-                              const std::vector<float> & f32_sample,
-                              const std::vector<int64_t> & sample_rows_per_slice,
-                              const float * values,
-                              const float * activations,
-                              const std::vector<ggml_type> & compatible_candidates) -> float
+    // Returns lambda per slice or 0.0 if no activations
+    auto estimate_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector<float>
     {
-        if (!activations) { return 0.0f; }
-
-        std::vector<ggml_type> probes;
-        probes.reserve(3);
-        auto push_if = [&](const ggml_type tiny) {
-            if (std::find(compatible_candidates.begin(), compatible_candidates.end(), tiny) != compatible_candidates.end()) {
-                probes.push_back(tiny);
-            }
-        };
-
-        push_if(GGML_TYPE_Q3_K);
-        push_if(GGML_TYPE_Q4_K);
-        push_if(GGML_TYPE_Q5_K);
-        if (probes.empty() && !compatible_candidates.empty()) {
-            probes.push_back(compatible_candidates[compatible_candidates.size() / 2]);
-        }
-        if (probes.size() == 1 && compatible_candidates.size() >= 2) {
-            probes.push_back(compatible_candidates.front());
-        }
-        if (probes.empty()) { return 0.0f; }
-
-        // Scratch buffers
-        const int64_t n_per_row = t->ne[0];
-        const size_t total_sampled_rows = f32_sample.size() / n_per_row;
-        size_t max_row_sz = 0;
-        for (auto pt : probes) max_row_sz = std::max(max_row_sz, ggml_row_size(pt, n_per_row));
-
-        std::vector<uint8_t> quantized_buffer(max_row_sz * total_sampled_rows);
-        std::vector<float> dequantized_buffer(f32_sample.size());
-
-        std::vector<double> ratios;
-        ratios.reserve(probes.size());
-        for (const auto pt : probes) {
-            double m = 0.0;
-            double p = 0.0;
-            (void)estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 0.0f, &m, &p);
-            if (p > epsilon && std::isfinite(m) && std::isfinite(p)) {
-                ratios.push_back(m / p);
-            }
-        }
-
-        if (ratios.empty()) { return 0.0f; }
-
-        std::nth_element(ratios.begin(), ratios.begin() + ratios.size() / 2, ratios.end());
-        const double lambda = std::clamp(ratios[ratios.size() / 2], 0.0, 8.0);
-
-        return (float)lambda;
-    };
-
-    // Faster to compute but may yield lower precision. Best option for the vast majority of cases
-    auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) {
-        if (!activations) { return 0.0f; }
-
-        double accum = 0.0;
-        int ns = 0;
+        std::vector<float> lambdas(std::max<int64_t>(1, ne2), 0.0f);
+        if (!activations) { return lambdas; }
 
         for (int64_t s = 0; s < std::max<int64_t>(1, ne2); ++s) {
             const float * v = values ? values + s * n_per_row : nullptr;
             const float * a = activations + s * n_per_row;
-
             double s1 = 0.0;
             double s2 = 0.0;
             for (int64_t j = 0; j < n_per_row; ++j) {
-                const double w = v ? std::max(0.0f, v[j]) : 1.0;
+                const double w = v ? std::max(0.0f, v[j]) : 1.0;
                 const double aw = std::sqrt(w) * a[j];
                 const double aw2 = aw * aw;
                 s1 += aw2;
                 s2 += aw2 * aw2;
             }
 
+            float l = 0.0f;
             if (s1 > 0.0) {
-                const double n = (double)n_per_row;
-                double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n);
+                const auto n = (double)n_per_row;
+                const double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n);
                 double lambda = 8.0 * (c / (c + 1.0));
-                accum += std::clamp(lambda, 0.0, 8.0);
-                ++ns;
+                l = (float)std::clamp(lambda, 0.0, 12.0);
            }
-        }
 
-        if (ns == 0) { return 0.0f; }
+            lambdas[(size_t)s] = l;
+        }
 
-        return (float)(accum / ns);
+        return lambdas;
     };
 
     std::vector<tensor_info> all;
     all.reserve(tensors.size());
     for (const auto * tw : tensors) {
         std::vector<std::thread> workers;
         workers.reserve(std::max(1, nthread));
-        ggml_tensor * t = tw->tensor;
-        const std::string name = ggml_get_name(t);
-        if (!can_quantize(t)) { continue; }
+        ggml_tensor * tensor = tw->tensor;
+        const std::string name = ggml_get_name(tensor);
+        if (!can_quantize(tensor)) { continue; }
 
-        LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(t));
+        LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(tensor));
         if (!ml.use_mmap) {
-            if (buffer.size() < ggml_nbytes(t)) { buffer.resize(ggml_nbytes(t)); }
-            t->data = buffer.data();
+            if (buffer.size() < ggml_nbytes(tensor)) { buffer.resize(ggml_nbytes(tensor)); }
+            tensor->data = buffer.data();
         }
-        ml.load_data_for(t);
+
+        ml.load_data_for(tensor);
 
         // Dequantize sampled rows into f32_sample
-        const int64_t n_per_row = t->ne[0];
-        const int64_t nrows_total = t->ne[1];
-        const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1;
+        const int64_t n_per_row = tensor->ne[0];
+        const int64_t nrows_total = tensor->ne[1];
+        const int64_t ne2 = tensor->ne[2] > 0 ? tensor->ne[2] : 1;
 
-        // Larger sample_rows_per_expert values may result in more accurate error estimates, but it will take much longer to compute
-        const int sample_rows_per_expert = activations_data ? 512 : 256;
+        // Larger rows_sample_per_expert values may result in more accurate error estimates, but it will take much longer to compute
+        const int rows_sample_per_expert = activations_data ? 512 : 256;
         std::vector<float> f32_sample;
-        f32_sample.reserve((size_t)ne2 * (size_t)std::min<int64_t>(nrows_total, sample_rows_per_expert) * (size_t)n_per_row);
+        f32_sample.reserve((size_t)ne2 * (size_t)std::min<int64_t>(nrows_total, rows_sample_per_expert) * (size_t)n_per_row);
 
-        std::vector<int64_t> sample_rows_per_slice(ne2, 0);
-        const int64_t sample_rows_max = std::max<int64_t>(1, std::min<int64_t>(nrows_total, sample_rows_per_expert));
-        const int64_t stride = std::max<int64_t>(1, nrows_total / sample_rows_max);
+        std::vector<int64_t> rows_sample(ne2, 0);
+        const int64_t rows_sample_max = std::max<int64_t>(1, std::min<int64_t>(nrows_total, rows_sample_per_expert));
+        const int64_t stride = std::max<int64_t>(1, nrows_total / rows_sample_max);
         std::vector<float> row_buffer(n_per_row);
-        const ggml_type src_type = t->type;
+        const ggml_type src_type = tensor->type;
         const ggml_type_traits *src_traits = ggml_get_type_traits(src_type);
         const bool src_is_quant = ggml_is_quantized(src_type);
         const size_t src_row_sz = ggml_row_size(src_type, n_per_row);
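
estimate_lambda replaces both the probe-based precise_lambda and the scalar fast_lambda with a single closed-form heuristic, evaluated once per slice. A self-contained sketch of the per-slice statistic (epsilon is assumed to stand in for the file's small positive constant; its value here is a placeholder):

#include <algorithm>
#include <cstdint>

// x_j = w_j * a_j^2 is the weighted activation energy of column j.
// c = max(0, sum(x^2) / sum(x)^2 - 1/n) measures how concentrated that energy
// is: c ~ 0 when it is spread evenly, larger when a few columns dominate.
// lambda = 8 * c / (c + 1), clamped to [0, 12].
static float slice_lambda_sketch(const float * v, const float * a, int64_t n_per_row) {
    constexpr double epsilon = 1e-12; // assumption: placeholder for the file's constant
    double s1 = 0.0;
    double s2 = 0.0;
    for (int64_t j = 0; j < n_per_row; ++j) {
        const double w   = v ? std::max(0.0f, v[j]) : 1.0;
        const double aw2 = w * (double)a[j] * (double)a[j]; // == (sqrt(w) * a[j])^2
        s1 += aw2;
        s2 += aw2 * aw2;
    }
    if (s1 <= 0.0) { return 0.0f; }
    const double n = (double)n_per_row;
    const double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n);
    return (float)std::clamp(8.0 * (c / (c + 1.0)), 0.0, 12.0);
}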
@@ -1199,23 +1152,20 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
         // Adjusts the trade-off between systematic bias (introduced by block-wise scaling) and MSE.
         // Larger values favour quantisation types that produce smaller bias even if the MSE is slightly bigger
-        float bias_lambda = 0.0f;
-        {
-            const float * values = values_sample.empty() ? nullptr : values_sample.data();
-            const float * activations = activations_sample.empty() ? nullptr : activations_sample.data();
-            if (params->bpw_bias == 1) {
-                bias_lambda = fast_lambda(values, activations, n_per_row, ne2);
-            } else if (params->bpw_bias == 2) {
-                bias_lambda = precise_lambda(t, f32_sample, sample_rows_per_slice, values, activations, compatible_candidates);
-            }
-        }
-
-        // Now evaluate candidates
-        std::vector<candidate_types> eval_candidates(compatible_candidates.size());
+        float tensor_lambda = 0.0f;
         const float * values = values_sample.empty() ? nullptr : values_sample.data();
         const float * activations = activations_sample.empty() ? nullptr : activations_sample.data();
+        auto lambdas = estimate_lambda(values, activations, n_per_row, ne2);
+        double acc = 0.0;
+        int ns = 0;
+        for (float l : lambdas) { acc += l; ++ns; }
+        tensor_lambda = ns ? (float)(acc / ns) : 0.0f;
+
+        // Evaluate candidates
+        std::vector<candidate_types> eval_candidates(compatible_candidates.size());
         std::vector<uint8_t> quantized_buffer(max_row_sz * total_sampled_rows);
         std::vector<float> dequantised_buffer(f32_sample.size());
+        const float * slice_lambda = lambdas.empty() ? nullptr : lambdas.data();
         int n_eval_threads = std::max(1, std::min<int>(nthread, (int)compatible_candidates.size()));
         std::atomic<size_t> cidx{0};
         std::vector<std::thread> eval_workers;
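
Taken together: the per-slice lambdas are averaged into tensor_lambda as a scalar fallback, while the raw vector is exposed as slice_lambda for the per-slice path in estimate_error (a nullptr selects the scalar path). Restated compactly as a fragment, with the same names as in the diff:

// Per-slice lambdas, their mean as the tensor-level fallback, and the pointer
// passed to estimate_error() as slice_bias_lambda (nullptr => use the scalar).
std::vector<float> lambdas = estimate_lambda(values, activations, n_per_row, ne2);
float tensor_lambda = 0.0f;
for (const float l : lambdas) { tensor_lambda += l; }
tensor_lambda = lambdas.empty() ? 0.0f : tensor_lambda / (float)lambdas.size();
const float * slice_lambda = lambdas.empty() ? nullptr : lambdas.data();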
@@ -1476,7 +1426,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         int best_j = -1;
         double best_ratio = -1.0;
         size_t best_delta = 0;
-
         for (int i = 0; i < (int)all.size(); ++i) {
             const auto & ti = all[i];
             if (ti.choice >= (int)ti.candidate.size() - 1) {
