 #include <regex>
 #include <thread>
 #include <unordered_map>
+#include <optional>
 
 // Quantization types. Changes to this struct must be replicated in quantize.cpp
 struct tensor_quantization {
@@ -623,7 +624,6 @@ static void signal_handler(int) {
 // Returns tensor type overrides to meet a global bpw target
 static std::unordered_map<std::string, ggml_type> target_bpw_type(
         llama_model_loader & ml,
-        std::vector<no_init<uint8_t>> & buffer,
         const llama_model & model,
         const std::vector<const llama_model_loader::llama_tensor_weight *> & tensors,
         const std::map<int, std::string> & mapped,
@@ -659,6 +659,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         GGML_TYPE_IQ3_XXS,
         GGML_TYPE_Q3_K,
         GGML_TYPE_IQ4_XS,
+        GGML_TYPE_IQ4_NL,
         GGML_TYPE_Q4_K,
         GGML_TYPE_Q5_K,
         GGML_TYPE_Q6_K,
@@ -1127,34 +1128,44 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
     install_signal_handlers();
     auto bpw_data = load_bpw_state();
-    std::vector<tensor_info> all;
-    all.reserve(tensors.size());
-    for (const auto * tw : tensors) {
+
+    // Significantly reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0
+    auto process_tensor = [&](const llama_model_loader::llama_tensor_weight * tw,
+                              std::vector<no_init<uint8_t>> & thread_local_buffer,
+                              std::mutex & loader_mutex,
+                              std::mutex & log_mutex) -> std::optional<tensor_info>
+    {
         ggml_tensor * tensor = tw->tensor;
         const std::string name = ggml_get_name(tensor);
-        if (!can_quantize(tensor)) { continue; }
-        check_signal_handler(all);
+        if (bpw_stop.load(std::memory_order_relaxed)) {
+            return std::nullopt;
+        }
 
-        // If we already have fully evaluatedd this tensor then reuse it
-        if (auto it_saved = bpw_data.find(name); it_saved != bpw_data.end()) {
+        // check for pre-computed results from a checkpoint file.
+        auto it_saved = bpw_data.find(name);
+        if (it_saved != bpw_data.end()) {
             tensor_info info;
             info.w = tw;
             info.candidate = it_saved->second.candidate;
             info.choice = it_saved->second.choice;
             info.min_bpw = it_saved->second.min_bpw;
             info.max_bpw = it_saved->second.max_bpw;
             info.n_elements = it_saved->second.n_elements ? it_saved->second.n_elements : (size_t)ggml_nelements(tensor);
-            all.push_back(std::move(info));
-            continue;
+            return info;
+        }
+        {
+            std::lock_guard<std::mutex> lock(log_mutex);
+            LLAMA_LOG_INFO("\ttarget_bpw_type: - processing tensor %45s\t(%12" PRId64 " elements)\n", name.c_str(), ggml_nelements(tensor));
         }
 
-        LLAMA_LOG_INFO("\t%s: - processing tensor %45s\t(%12" PRId64 " elements)\n", __func__, name.c_str(), ggml_nelements(tensor));
         if (!ml.use_mmap) {
-            if (buffer.size() < ggml_nbytes(tensor)) { buffer.resize(ggml_nbytes(tensor)); }
-            tensor->data = buffer.data();
+            if (thread_local_buffer.size() < ggml_nbytes(tensor)) { thread_local_buffer.resize(ggml_nbytes(tensor)); }
+            tensor->data = thread_local_buffer.data();
+        }
+        {
+            std::lock_guard<std::mutex> lock(loader_mutex);
+            ml.load_data_for(tensor);
         }
-
-        ml.load_data_for(tensor);
 
         // Dequantize sampled rows into f32_sample
         const int64_t n_per_row = tensor->ne[0];
@@ -1170,7 +1181,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             const int64_t max_rows = 4096;
             int64_t total_rows = std::llround(slice_budget / std::max<int64_t>(1, n));
             total_rows = std::max<int64_t>(min_rows, std::min<int64_t>(total_rows, std::min<int64_t>(rows, max_rows)));
-            if (rows <= min_rows * 2) { total_rows = rows; } // use all rows for small tensors
+            if (rows <= min_rows * 2) { total_rows = rows; }
             return total_rows;
         };
 
@@ -1191,17 +1202,16 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 return;
             }
             if (t == GGML_TYPE_F16) {
-                ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int)n_per_row);
+                ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row);
                 return;
             }
             if (t == GGML_TYPE_BF16) {
-                ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int)n_per_row);
+                ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row);
                 return;
             }
-
             if (src_is_quant) {
                 GGML_ASSERT(src_traits && src_traits->to_float);
-                src_traits->to_float(src, dst, (int) n_per_row);
+                src_traits->to_float(src, dst, (int)n_per_row);
                 return;
             }
 
@@ -1266,6 +1276,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 return;
             }
 
+            std::lock_guard<std::mutex> lock(log_mutex);
             LLAMA_LOG_WARN("%s: side data size mismatch for %s: got %zu, expected %zu or %zu; ignoring\n", func, name.c_str(), src_sz, (size_t)n_per_row, want);
         };
 
@@ -1276,12 +1287,9 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         if (values_all) { copy_or_broadcast(values_all, values_sz, values_sample); }
         if (activations_all) { copy_or_broadcast(activations_all, activations_sz, activations_sample); }
 
-        const int64_t nelem = ggml_nelements(tensor);
         tensor_info info;
         info.w = tw;
-        info.n_elements = nelem;
-
-        // Prepare scratch buffers sized for the largest candidate row size
+        info.n_elements = ggml_nelements(tensor);
         size_t total_sampled_rows = f32_sample.size() / n_per_row;
 
         // Build list of candidate types first (compatible ones)
@@ -1295,7 +1303,8 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         for (size_t i = 0; i < base_sz; ++i) {
             ggml_type ts_type = base_arr[i];
             if (is_iq(ts_type) && !has_valid_imatrix) {
-                LLAMA_LOG_WARN("%s: skipping %s for %s, no or mismatched imatrix\n", __func__, ggml_type_name(ts_type), name.c_str());
+                std::lock_guard<std::mutex> lock(log_mutex);
+                LLAMA_LOG_WARN("\t%s: skipping %s for %s, no or mismatched imatrix\n", func, ggml_type_name(ts_type), name.c_str());
                 continue;
             }
 
@@ -1325,58 +1334,38 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         std::vector<uint8_t> quantized_buffer(max_row_sz * total_sampled_rows);
         std::vector<float> dequantized_buffer(f32_sample.size());
         const float * slice_lambda = lambdas.empty() ? nullptr : lambdas.data();
-        int n_eval_threads = std::max(1, std::min<int>(nthread, (int)compatible_candidates.size()));
-        std::atomic<size_t> cidx{0};
-        std::vector<std::thread> eval_workers;
-        eval_workers.reserve(n_eval_threads);
-        for (int ti = 0; ti < n_eval_threads; ++ti) {
-            eval_workers.emplace_back([&] {
-                // thread-local scratch
-                std::vector<uint8_t> tl_quantized_buffer(quantized_buffer.size());
-                std::vector<float> tl_dequantized_buffer(dequantized_buffer.size());
-                for (;;) {
-                    if (bpw_stop.load(std::memory_order_relaxed)) { break; } // stop if a signal arrived
-                    const size_t i = cidx.fetch_add(1, std::memory_order_acq_rel);
-                    if (i >= compatible_candidates.size()) { break; }
-
-                    const ggml_type tensor_types = compatible_candidates[i];
-                    const auto bpw = (float)tensor_bpw(tensor, tensor_types);
-                    const size_t bytes = tensor_bytes(tensor, tensor_types);
-                    double mse = 0.0;
-                    double proj = 0.0;
-                    const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations,
-                        tl_quantized_buffer, tl_dequantized_buffer, tensor_lambda, slice_lambda, &mse, &proj);
-                    eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err, mse, proj };
-                }
-            });
-        }
+        for (size_t i = 0; i < compatible_candidates.size(); ++i) {
+            if (bpw_stop.load(std::memory_order_relaxed)) { break; }
 
-        for (auto &th : eval_workers) { th.join(); }
-
-        // If interruption happened mid-evaluation, exit without adding a half-baked tensor entry
-        if (bpw_stop.load(std::memory_order_relaxed) && cidx.load(std::memory_order_relaxed) < compatible_candidates.size()) {
-            check_signal_handler(all);
+            const ggml_type tensor_types = compatible_candidates[i];
+            const auto bpw = (float)tensor_bpw(tensor, tensor_types);
+            const size_t bytes = tensor_bytes(tensor, tensor_types);
+            double mse = 0.0;
+            double proj = 0.0;
+            const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations,
+                quantized_buffer, dequantized_buffer, tensor_lambda, slice_lambda, &mse, &proj);
+            eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err, mse, proj };
         }
 
+        if (bpw_stop.load(std::memory_order_relaxed)) { return std::nullopt; }
+
         // Check if biasing is needed
         bool bias_needed = false;
         if (!lambdas.empty()) {
             int min_mse = -1;
             int min_bias = -1;
-            {
-                double best_mse = std::numeric_limits<double>::infinity();
-                double best_err = std::numeric_limits<double>::infinity();
-                for (int i = 0; i < (int)eval_candidates.size(); ++i) {
-                    const auto & c = eval_candidates[i];
-                    if (c.bytes == 0) { continue; }
-                    if (c.mse < best_mse) {
-                        best_mse = c.mse;
-                        min_mse = i;
-                    }
-                    if (c.error < best_err) {
-                        best_err = c.error;
-                        min_bias = i;
-                    }
+            double best_mse = std::numeric_limits<double>::infinity();
+            double best_err = std::numeric_limits<double>::infinity();
+            for (int i = 0; i < (int)eval_candidates.size(); ++i) {
+                const auto & c = eval_candidates[i];
+                if (c.bytes == 0) { continue; }
+                if (c.mse < best_mse) {
+                    best_mse = c.mse;
+                    min_mse = i;
+                }
+                if (c.error < best_err) {
+                    best_err = c.error;
+                    min_bias = i;
                 }
             }
 
@@ -1388,8 +1377,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 if (c.bytes == 0) { continue; }
                 const double mse = std::max(c.mse, epsilon);
                 const double bias_term = std::max(0.0, c.error - c.mse);
-                const double rel = bias_term / mse;
-                max_rel_bias = std::max(rel, max_rel_bias);
+                max_rel_bias = std::max(bias_term / mse, max_rel_bias);
             }
 
             bias_needed = max_rel_bias >= 0.5; // >= 50% of MSE?
@@ -1404,7 +1392,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
         if (info.candidate.empty()) {
             // As a last resort, keep original type
-            float bpw = ggml_nbytes(tensor) * 8.0f / nelem;
+            float bpw = ggml_nbytes(tensor) * 8.0f / info.n_elements;
             info.candidate.push_back(candidate_types{ tensor->type, bpw, ggml_nbytes(tensor), 0.0 });
         }
 
@@ -1416,26 +1404,18 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             if (a.bytes != b.bytes) { return a.bytes < b.bytes; }
             return a.error < b.error;
         });
-        const auto last = std::unique(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) {
+        candidates.erase(std::unique(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) {
             return a.bytes == b.bytes;
-        });
-        candidates.erase(last, candidates.end());
-
-        // Pareto by bytes -> error
+        }), candidates.end());
         std::vector<candidate_types> pareto;
         pareto.reserve(candidates.size());
         double best_err = infinity;
-        size_t last_b = std::numeric_limits<size_t>::max();
         for (const auto & c : candidates) {
-            if (c.bytes != last_b) {
-                last_b = c.bytes;
-                if (c.error < best_err) {
-                    best_err = c.error;
-                    pareto.push_back(c);
-                }
+            if (c.error < best_err) {
+                best_err = c.error;
+                pareto.push_back(c);
             }
         }
-
         candidates.swap(pareto);
         if (candidates.size() < 3) { return; } // need at least 3 points to do convex hull
 
@@ -1470,10 +1450,43 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         info.choice = 0;
         info.min_bpw = info.candidate.front().bpw;
         info.max_bpw = info.candidate.back().bpw;
-        all.push_back(std::move(info));
-        check_signal_handler(all); // save after each tensor
+
+        return info;
+    };
+
+    std::vector<tensor_info> all; // this vector will be populated by the parallel workers
+    {
+        std::atomic<size_t> tensor_idx{0}; // shared work queue index for all threads
+        const size_t num_tensors_to_process = tensors.size();
+        std::mutex loader_mutex;
+        std::mutex log_mutex;
+        std::mutex results_mutex;
+        std::vector<std::thread> workers;
+        int num_threads_to_spawn = std::max(1, std::min<int>(nthread, (int)num_tensors_to_process));
+
+        for (int i = 0; i < num_threads_to_spawn; ++i) {
+            workers.emplace_back([&]() {
+                std::vector<no_init<uint8_t>> thread_local_buffer;
+                while (true) {
+                    const size_t current_idx = tensor_idx.fetch_add(1);
+                    if (current_idx >= num_tensors_to_process) { break; }
+                    const auto * tw = tensors[current_idx];
+                    if (!can_quantize(tw->tensor)) { continue; }
+                    // Execute the main processing logic for this tensor
+                    std::optional<tensor_info> result_info = process_tensor(tw, thread_local_buffer, loader_mutex, log_mutex);
+                    if (result_info) {
+                        std::lock_guard<std::mutex> lock(results_mutex);
+                        all.push_back(std::move(*result_info));
+                    }
+                }
+            });
+        }
+
+        for (auto & w : workers) { w.join(); }
     }
 
+    check_signal_handler(all);
+
     if (all.empty()) { return {}; }
 
     // Compute total elements across all tensors and bytes for non-quantizable tensors
@@ -1965,7 +1978,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate\n", __func__);
         }
         LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw);
-        bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread);
+        bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, params, nthread);
    } else {
         LLAMA_LOG_WARN("%s: no imatrix provided, target bpw will not apply\n", __func__);
    }
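
Note for reviewers: the per-tensor parallelism introduced above follows a common pattern, a fixed pool of worker threads pulling indices from a shared atomic counter, each thread keeping its own scratch buffer, with a mutex guarding the shared results vector. The following is a minimal, self-contained sketch of that pattern only, assuming simplified stand-in types (item, result, process); it is not the PR's actual code or types.

// Minimal work-queue sketch: N workers claim indices with fetch_add and
// append results under a mutex. Stand-in types, not the PR's code.
#include <algorithm>
#include <atomic>
#include <cstddef>
#include <mutex>
#include <optional>
#include <thread>
#include <vector>

struct item   { int id; };
struct result { int id; int value; };

static std::optional<result> process(const item & it) {
    if (it.id % 2 == 0) { return std::nullopt; } // skip some items, like !can_quantize()
    return result{ it.id, it.id * it.id };
}

int main() {
    std::vector<item> items(100);
    for (int i = 0; i < (int) items.size(); ++i) { items[i].id = i; }

    std::vector<result> all;                 // shared output
    std::mutex results_mutex;                // guards `all`
    std::atomic<std::size_t> next_idx{0};    // shared work-queue index
    const int n_threads = std::max(1u, std::thread::hardware_concurrency());

    std::vector<std::thread> workers;
    for (int t = 0; t < n_threads; ++t) {
        workers.emplace_back([&]() {
            while (true) {
                const std::size_t i = next_idx.fetch_add(1); // claim the next item
                if (i >= items.size()) { break; }            // queue exhausted
                if (auto r = process(items[i])) {
                    std::lock_guard<std::mutex> lock(results_mutex);
                    all.push_back(*r);
                }
            }
        });
    }
    for (auto & w : workers) { w.join(); }
    return (int) all.size() == 50 ? 0 : 1;
}

Because threads only contend on the atomic counter and the brief results lock, the expensive per-tensor work runs fully in parallel, which is where the compute-time reduction in this change comes from.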
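The candidate pruning simplified in this diff sorts candidates by byte size, drops duplicates of equal size, then keeps only candidates whose error strictly improves on every smaller one, i.e. a Pareto front over bytes versus error. Below is a small stand-alone sketch of that filtering step, using a simplified cand struct rather than the PR's candidate_types.

// Sketch of the bytes-vs-error Pareto filter used for candidate pruning.
// `cand` is a simplified stand-in for the PR's candidate_types struct.
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <limits>
#include <vector>

struct cand { std::size_t bytes; double error; };

static std::vector<cand> pareto_front(std::vector<cand> c) {
    // sort by size, break ties by error so the best duplicate survives unique()
    std::sort(c.begin(), c.end(), [](const cand & a, const cand & b) {
        if (a.bytes != b.bytes) { return a.bytes < b.bytes; }
        return a.error < b.error;
    });
    c.erase(std::unique(c.begin(), c.end(), [](const cand & a, const cand & b) {
        return a.bytes == b.bytes;
    }), c.end());

    // keep only candidates that strictly improve the best error seen so far
    std::vector<cand> front;
    double best_err = std::numeric_limits<double>::infinity();
    for (const auto & x : c) {
        if (x.error < best_err) {
            best_err = x.error;
            front.push_back(x);
        }
    }
    return front;
}

int main() {
    std::vector<cand> c = {
        {100, 0.50}, {100, 0.40}, {200, 0.45}, {300, 0.10}, {400, 0.20},
    };
    for (const auto & x : pareto_front(c)) {
        std::printf("%zu bytes -> %.2f error\n", x.bytes, x.error);
    }
    // prints 100 -> 0.40 and 300 -> 0.10; the 200- and 400-byte candidates are dominated
    return 0;
}

Dropping the old last_b bookkeeping is safe because std::unique has already removed equal-size entries, so the error check alone is enough to keep the front monotone.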