
Commit 556f6b0

Add --precise-lambda option
1 parent 66aff8f commit 556f6b0

File tree

include/llama.h
src/llama-quant.cpp
tools/quantize/quantize.cpp

3 files changed: 23 additions, 11 deletions

include/llama.h

Lines changed: 1 addition & 0 deletions

@@ -357,6 +357,7 @@ extern "C" {
         void * tensor_types;   // pointer to vector containing tensor types
         void * prune_layers;   // pointer to vector containing layer indices to prune
         float target_bpw;      // target bits per weight (bpw)
+        bool precise_lambda;   // use precise_lambda calculation - slow computation but very accurate
     } llama_model_quantize_params;

     typedef struct llama_logit_bias {
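
For context, a caller-side sketch of how the new field can be set through the existing public API (llama_model_quantize_default_params() and llama_model_quantize() are the entry points already declared in llama.h; the file names and the 4.5 bpw target below are placeholders, not part of this commit):

    // Caller-side sketch: enable the new option via the public quantization API.
    // File names and the bpw target are illustrative placeholders.
    #include "llama.h"

    uint32_t quantize_with_precise_lambda(void) {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.target_bpw     = 4.5f;  // bpw-driven mixed quantization (pre-existing option)
        params.precise_lambda = true;  // new flag: slower but more accurate lambda estimation
        return llama_model_quantize("model-f16.gguf", "model-bpw.gguf", &params);
    }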

src/llama-quant.cpp

Lines changed: 17 additions & 10 deletions

@@ -722,7 +722,8 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
        const float * values_sample,
        const float * activations_sample,
        std::vector<uint8_t> & quantized_buffer,
-       std::vector<float> & dequantized_buffer) -> double
+       std::vector<float> & dequantized_buffer,
+       float bias_lambda) -> double
    {
        const int64_t n_per_row = t->ne[0];
        const int64_t nrows = t->ne[1];
@@ -878,10 +879,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
            }
        }

-       // bias_lambda adjusts the trade-off between systematic bias (introduced by block-wise scaling) and MSE
-       // larger value favours quantisation types that produce smaller bias even if the MSE is slightly larger
-       constexpr float bias_lambda = 1.5f;
-       constexpr double epsilon = 1e-12;
        double err_num = weighted_mse;
        if (activations && bias_lambda != 0.0f) {
            const double proj = bias_num * bias_num / (bias_denom + epsilon);
@@ -1163,6 +1160,15 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
        std::sort(compatible_candidates.begin(), compatible_candidates.end());
        compatible_candidates.erase(std::unique(compatible_candidates.begin(), compatible_candidates.end()), compatible_candidates.end());

+       // Compute adaptive bias_lambda for this tensor
+       float bias_lambda = 0.0f;
+       {
+           const float * values = values_sample.empty() ? nullptr : values_sample.data();
+           const float * activations = activations_sample.empty() ? nullptr : activations_sample.data();
+           bias_lambda = params->precise_lambda ? precise_lambda(t, f32_sample, sample_rows_per_slice, values, activations, compatible_candidates) :
+                                                  fast_lambda(values, activations, n_per_row);
+       }
+
        // Now evaluate candidates
        std::vector<candidate_types> eval_candidates(compatible_candidates.size());
        const float * values = values_sample.empty() ? nullptr : values_sample.data();
@@ -1186,7 +1192,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
            const ggml_type tt = compatible_candidates[i];
            const auto bpw = (float)tensor_bpw(t, tt);
            const size_t bytes = tensor_bytes(t, tt);
-           const auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer);
+           const auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer, bias_lambda);
            eval_candidates[i] = candidate_types{ tt, bpw, bytes, err };
        }
    });
@@ -1301,7 +1307,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
    };

    auto recompute_best_upgrade = [&]() -> upgrade {
-       const double eps = 1e-12;
        upgrade best{ -1, -1, 0.0, 0, -1.0 };
        for (int i = 0; i < (int) all.size(); ++i) {
            const auto & ti = all[i];
@@ -1653,10 +1658,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
    if (params->target_bpw != -1.0f && !params->only_copy) {
        if (params->imatrix) {
            if (params->activations) {
-               LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate\n", __func__);
+               LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate - ", __func__);
            } else {
-               LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate\n", __func__);
+               LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate - ", __func__);
            }
+           LLAMA_LOG_INFO("using %s\n", params->precise_lambda ? "precise lambda (slow)" : "fast lambda");
            LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw);
            bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread);
        } else {
@@ -1966,7 +1972,8 @@ llama_model_quantize_params llama_model_quantize_default_params() {
        /*.kv_overrides   =*/ nullptr,
        /*.tensor_type    =*/ nullptr,
        /*.prune_layers   =*/ nullptr,
-       /*.target_bpw     =*/ -1.0f
+       /*.target_bpw     =*/ -1.0f,
+       /*.precise_lambda =*/ false
    };

    return result;
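
The change replaces the previous hard-coded bias_lambda = 1.5f with a per-tensor value returned by either precise_lambda() or fast_lambda(); neither helper's body appears in this diff. Purely as a rough illustration of the kind of heuristic a fast path might use (an assumption, not this commit's implementation), a function with the same call signature could derive lambda from the spread of the activation statistics:

    // Hypothetical sketch only - the real fast_lambda() is defined elsewhere in
    // llama-quant.cpp and is not shown in this diff.
    // Assumption: lambda grows when activation energy is concentrated in a few
    // channels, where block-wise scaling bias hurts most.
    #include <algorithm>
    #include <cstdint>

    static float fast_lambda_sketch(const float * values, const float * activations, int64_t n_per_row) {
        (void) values;                    // imatrix weights could also feed a real heuristic
        if (!activations || n_per_row <= 0) {
            return 0.0f;                  // matches estimate_error(): lambda is ignored without activations
        }
        double sum = 0.0, sum_sq = 0.0;
        for (int64_t i = 0; i < n_per_row; ++i) {
            const double a = activations[i];
            sum    += a;
            sum_sq += a * a;
        }
        const double mean  = sum / (double) n_per_row;
        const double var   = std::max(sum_sq / (double) n_per_row - mean * mean, 0.0);
        const double ratio = var / (mean * mean + 1e-12);     // spiky activations -> larger ratio
        return (float) std::clamp(0.5 + 0.1 * ratio, 0.0, 3.0);
    }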

tools/quantize/quantize.cpp

Lines changed: 5 additions & 1 deletion

@@ -132,7 +132,9 @@ static void usage(const char * executable) {
    printf("          Advanced option to selectively quantize tensors. May be specified multiple times.\n");
    printf("  --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n");
    printf("          Advanced option to remove all tensors from the given layers\n");
-   printf("  --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 16.0 \n");
+   printf("  --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 16.0\n");
+   printf("          Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n");
+   printf("  --precise-lambda: given a target bpw, use a high-precision error computation at the expense of longer processing times\n");
    printf("  --keep-split: will generate quantized model in the same shards as input\n");
    printf("  --override-kv KEY=TYPE:VALUE\n");
    printf("          Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
@@ -538,6 +540,8 @@ int main(int argc, char ** argv) {
            if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) {
                usage(argv[0]);
            }
+       } else if (strcmp(argv[arg_idx], "--precise-lambda") == 0) {
+           params.precise_lambda = true;
        } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) {
            if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) {
                usage(argv[0]);
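
For reference, an illustrative invocation combining the new flag with the existing options (the binary name, file paths, and trailing fallback type follow the tool's usual usage pattern and are placeholders here; the imatrix should contain activations for the precise path to pay off):

    ./llama-quantize --imatrix imatrix.dat --target-bpw 4.5 --precise-lambda \
        model-f16.gguf model-bpw4.5.gguf Q4_K_M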
