
Commit 12e0524

Reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0
1 parent 951de2e commit 12e0524

File tree

1 file changed: +101 additions, −88 deletions


src/llama-quant.cpp

Lines changed: 101 additions & 88 deletions
@@ -15,6 +15,7 @@
 #include <regex>
 #include <thread>
 #include <unordered_map>
+#include <optional>

 // Quantization types. Changes to this struct must be replicated in quantize.cpp
 struct tensor_quantization {
@@ -623,7 +624,6 @@ static void signal_handler(int) {
 // Returns tensor type overrides to meet a global bpw target
 static std::unordered_map<std::string, ggml_type> target_bpw_type(
         llama_model_loader & ml,
-        std::vector<no_init<uint8_t>> & buffer,
         const llama_model & model,
         const std::vector<const llama_model_loader::llama_tensor_weight *> & tensors,
         const std::map<int, std::string> & mapped,
@@ -659,6 +659,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
        GGML_TYPE_IQ3_XXS,
        GGML_TYPE_Q3_K,
        GGML_TYPE_IQ4_XS,
+       GGML_TYPE_IQ4_NL,
        GGML_TYPE_Q4_K,
        GGML_TYPE_Q5_K,
        GGML_TYPE_Q6_K,
@@ -1127,34 +1128,44 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(

     install_signal_handlers();
     auto bpw_data = load_bpw_state();
-    std::vector<tensor_info> all;
-    all.reserve(tensors.size());
-    for (const auto * tw : tensors) {
+
+    // Significantly reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0
+    auto process_tensor = [&](const llama_model_loader::llama_tensor_weight * tw,
+                              std::vector<no_init<uint8_t>> & thread_local_buffer,
+                              std::mutex & loader_mutex,
+                              std::mutex & log_mutex) -> std::optional<tensor_info>
+    {
         ggml_tensor * tensor = tw->tensor;
         const std::string name = ggml_get_name(tensor);
-        if (!can_quantize(tensor)) { continue; }
-        check_signal_handler(all);
+        if (bpw_stop.load(std::memory_order_relaxed)) {
+            return std::nullopt;
+        }

-        // If we already have fully evaluatedd this tensor then reuse it
-        if (auto it_saved = bpw_data.find(name); it_saved != bpw_data.end()) {
+        // check for pre-computed results from a checkpoint file.
+        auto it_saved = bpw_data.find(name);
+        if (it_saved != bpw_data.end()) {
             tensor_info info;
             info.w = tw;
             info.candidate = it_saved->second.candidate;
             info.choice = it_saved->second.choice;
             info.min_bpw = it_saved->second.min_bpw;
             info.max_bpw = it_saved->second.max_bpw;
             info.n_elements = it_saved->second.n_elements ? it_saved->second.n_elements : (size_t)ggml_nelements(tensor);
-            all.push_back(std::move(info));
-            continue;
+            return info;
+        }
+        {
+            std::lock_guard<std::mutex> lock(log_mutex);
+            LLAMA_LOG_INFO("\ttarget_bpw_type: - processing tensor %45s \t(%12" PRId64 " elements)\n", name.c_str(), ggml_nelements(tensor));
         }

-        LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12" PRId64 " elements)\n", __func__, name.c_str(), ggml_nelements(tensor));
         if (!ml.use_mmap) {
-            if (buffer.size() < ggml_nbytes(tensor)) { buffer.resize(ggml_nbytes(tensor)); }
-            tensor->data = buffer.data();
+            if (thread_local_buffer.size() < ggml_nbytes(tensor)) { thread_local_buffer.resize(ggml_nbytes(tensor)); }
+            tensor->data = thread_local_buffer.data();
+        }
+        {
+            std::lock_guard<std::mutex> lock(loader_mutex);
+            ml.load_data_for(tensor);
         }
-
-        ml.load_data_for(tensor);

         // Dequantize sampled rows into f32_sample
         const int64_t n_per_row = tensor->ne[0];
@@ -1170,7 +1181,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
            const int64_t max_rows = 4096;
            int64_t total_rows = std::llround(slice_budget / std::max<int64_t>(1, n));
            total_rows = std::max<int64_t>(min_rows, std::min<int64_t>(total_rows, std::min<int64_t>(rows, max_rows)));
-            if (rows <= min_rows * 2) { total_rows = rows; } // use all rows for small tensors
+            if (rows <= min_rows * 2) { total_rows = rows; }
            return total_rows;
        };

@@ -1191,17 +1202,16 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                return;
            }
            if (t == GGML_TYPE_F16) {
-                ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int)n_per_row);
+                ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row);
                return;
            }
            if (t == GGML_TYPE_BF16) {
-                ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int)n_per_row);
+                ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row);
                return;
            }
-
            if (src_is_quant) {
                GGML_ASSERT(src_traits && src_traits->to_float);
-                src_traits->to_float(src, dst, (int) n_per_row);
+                src_traits->to_float(src, dst, (int)n_per_row);
                return;
            }

@@ -1266,6 +1276,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                return;
            }

+            std::lock_guard<std::mutex> lock(log_mutex);
            LLAMA_LOG_WARN("%s: side data size mismatch for %s: got %zu, expected %zu or %zu; ignoring\n", func, name.c_str(), src_sz, (size_t)n_per_row, want);
        };

@@ -1276,12 +1287,9 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
        if (values_all) { copy_or_broadcast(values_all, values_sz, values_sample); }
        if (activations_all) { copy_or_broadcast(activations_all, activations_sz, activations_sample); }

-        const int64_t nelem = ggml_nelements(tensor);
        tensor_info info;
        info.w = tw;
-        info.n_elements = nelem;
-
-        // Prepare scratch buffers sized for the largest candidate row size
+        info.n_elements = ggml_nelements(tensor);
        size_t total_sampled_rows = f32_sample.size() / n_per_row;

        // Build list of candidate types first (compatible ones)
@@ -1295,7 +1303,8 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
        for (size_t i = 0; i < base_sz; ++i) {
            ggml_type ts_type = base_arr[i];
            if (is_iq(ts_type) && !has_valid_imatrix) {
-                LLAMA_LOG_WARN("%s: skipping %s for %s, no or mismatched imatrix\n", __func__, ggml_type_name(ts_type), name.c_str());
+                std::lock_guard<std::mutex> lock(log_mutex);
+                LLAMA_LOG_WARN("\t%s: skipping %s for %s, no or mismatched imatrix\n", func, ggml_type_name(ts_type), name.c_str());
                continue;
            }

@@ -1325,58 +1334,38 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
        std::vector<uint8_t> quantized_buffer(max_row_sz * total_sampled_rows);
        std::vector<float> dequantized_buffer(f32_sample.size());
        const float * slice_lambda = lambdas.empty() ? nullptr : lambdas.data();
-        int n_eval_threads = std::max(1, std::min<int>(nthread, (int)compatible_candidates.size()));
-        std::atomic<size_t> cidx{0};
-        std::vector<std::thread> eval_workers;
-        eval_workers.reserve(n_eval_threads);
-        for (int ti = 0; ti < n_eval_threads; ++ti) {
-            eval_workers.emplace_back([&] {
-                // thread-local scratch
-                std::vector<uint8_t> tl_quantized_buffer(quantized_buffer.size());
-                std::vector<float> tl_dequantized_buffer(dequantized_buffer.size());
-                for (;;) {
-                    if (bpw_stop.load(std::memory_order_relaxed)) { break; } // stop if a signal arrived
-                    const size_t i = cidx.fetch_add(1, std::memory_order_acq_rel);
-                    if (i >= compatible_candidates.size()) { break; }
-
-                    const ggml_type tensor_types = compatible_candidates[i];
-                    const auto bpw = (float)tensor_bpw(tensor, tensor_types);
-                    const size_t bytes = tensor_bytes(tensor, tensor_types);
-                    double mse = 0.0;
-                    double proj = 0.0;
-                    const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations,
-                        tl_quantized_buffer, tl_dequantized_buffer, tensor_lambda, slice_lambda, &mse, &proj);
-                    eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err, mse, proj };
-                }
-            });
-        }
+        for (size_t i = 0; i < compatible_candidates.size(); ++i) {
+            if (bpw_stop.load(std::memory_order_relaxed)) { break; }

-        for (auto &th : eval_workers) { th.join(); }
-
-        // If interruption happened mid-evaluation, exit without adding a half-baked tensor entry
-        if (bpw_stop.load(std::memory_order_relaxed) && cidx.load(std::memory_order_relaxed) < compatible_candidates.size()) {
-            check_signal_handler(all);
+            const ggml_type tensor_types = compatible_candidates[i];
+            const auto bpw = (float)tensor_bpw(tensor, tensor_types);
+            const size_t bytes = tensor_bytes(tensor, tensor_types);
+            double mse = 0.0;
+            double proj = 0.0;
+            const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations,
+                quantized_buffer, dequantized_buffer, tensor_lambda, slice_lambda, &mse, &proj);
+            eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err, mse, proj };
        }

+        if (bpw_stop.load(std::memory_order_relaxed)) { return std::nullopt; }
+
        // Check if biasing is needed
        bool bias_needed = false;
        if (!lambdas.empty()) {
            int min_mse = -1;
            int min_bias = -1;
-            {
-                double best_mse = std::numeric_limits<double>::infinity();
-                double best_err = std::numeric_limits<double>::infinity();
-                for (int i = 0; i < (int)eval_candidates.size(); ++i) {
-                    const auto & c = eval_candidates[i];
-                    if (c.bytes == 0) { continue; }
-                    if (c.mse < best_mse) {
-                        best_mse = c.mse;
-                        min_mse = i;
-                    }
-                    if (c.error < best_err) {
-                        best_err = c.error;
-                        min_bias = i;
-                    }
+            double best_mse = std::numeric_limits<double>::infinity();
+            double best_err = std::numeric_limits<double>::infinity();
+            for (int i = 0; i < (int)eval_candidates.size(); ++i) {
+                const auto & c = eval_candidates[i];
+                if (c.bytes == 0) { continue; }
+                if (c.mse < best_mse) {
+                    best_mse = c.mse;
+                    min_mse = i;
+                }
+                if (c.error < best_err) {
+                    best_err = c.error;
+                    min_bias = i;
                }
            }

@@ -1388,8 +1377,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                if (c.bytes == 0) { continue; }
                const double mse = std::max(c.mse, epsilon);
                const double bias_term = std::max(0.0, c.error - c.mse);
-                const double rel = bias_term / mse;
-                max_rel_bias = std::max(rel, max_rel_bias);
+                max_rel_bias = std::max(bias_term / mse, max_rel_bias);
            }

            bias_needed = max_rel_bias >= 0.5; // >= 50% of MSE?
@@ -1404,7 +1392,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(

        if (info.candidate.empty()) {
            // As a last resort, keep original type
-            float bpw = ggml_nbytes(tensor) * 8.0f / nelem;
+            float bpw = ggml_nbytes(tensor) * 8.0f / info.n_elements;
            info.candidate.push_back(candidate_types{ tensor->type, bpw, ggml_nbytes(tensor), 0.0 });
        }

@@ -1416,26 +1404,18 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
            if (a.bytes != b.bytes) { return a.bytes < b.bytes; }
            return a.error < b.error;
        });
-        const auto last = std::unique(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) {
+        candidates.erase(std::unique(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) {
            return a.bytes == b.bytes;
-        });
-        candidates.erase(last, candidates.end());
-
-        // Pareto by bytes -> error
+        }), candidates.end());
        std::vector<candidate_types> pareto;
        pareto.reserve(candidates.size());
        double best_err = infinity;
-        size_t last_b = std::numeric_limits<size_t>::max();
        for (const auto & c : candidates) {
-            if (c.bytes != last_b) {
-                last_b = c.bytes;
-                if (c.error < best_err) {
-                    best_err = c.error;
-                    pareto.push_back(c);
-                }
+            if (c.error < best_err) {
+                best_err = c.error;
+                pareto.push_back(c);
            }
        }
-
        candidates.swap(pareto);
        if (candidates.size() < 3) { return; } // need at least 3 points to do convex hull

@@ -1470,10 +1450,43 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
        info.choice = 0;
        info.min_bpw = info.candidate.front().bpw;
        info.max_bpw = info.candidate.back().bpw;
-        all.push_back(std::move(info));
-        check_signal_handler(all); // save after each tensor
+
+        return info;
+    };
+
+    std::vector<tensor_info> all; // this vector will be populated by the parallel workers
+    {
+        std::atomic<size_t> tensor_idx{0}; // shared work queue index for all threads
+        const size_t num_tensors_to_process = tensors.size();
+        std::mutex loader_mutex;
+        std::mutex log_mutex;
+        std::mutex results_mutex;
+        std::vector<std::thread> workers;
+        int num_threads_to_spawn = std::max(1, std::min<int>(nthread, (int)num_tensors_to_process));
+
+        for (int i = 0; i < num_threads_to_spawn; ++i) {
+            workers.emplace_back([&]() {
+                std::vector<no_init<uint8_t>> thread_local_buffer;
+                while (true) {
+                    const size_t current_idx = tensor_idx.fetch_add(1);
+                    if (current_idx >= num_tensors_to_process) { break; }
+                    const auto * tw = tensors[current_idx];
+                    if (!can_quantize(tw->tensor)) { continue; }
+                    // Execute the main processing logic for this tensor
+                    std::optional<tensor_info> result_info = process_tensor(tw, thread_local_buffer, loader_mutex, log_mutex);
+                    if (result_info) {
+                        std::lock_guard<std::mutex> lock(results_mutex);
+                        all.push_back(std::move(*result_info));
+                    }
+                }
+            });
+        }
+
+        for (auto & w : workers) { w.join(); }
    }

+    check_signal_handler(all);
+
    if (all.empty()) { return {}; }

    // Compute total elements across all tensors and bytes for non-quantizable tensors
@@ -1965,7 +1978,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
            LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate\n", __func__);
        }
        LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw);
-        bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread);
+        bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, params, nthread);
    } else {
        LLAMA_LOG_WARN("%s: no imatrix provided, target bpw will not apply\n", __func__);
    }
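
Note: the refactor above follows a common C++ worker-pool pattern: a fixed set of threads pulls tensor indices from a shared atomic counter, each thread reuses its own scratch buffer, results are appended to a shared vector under a mutex, and access to the single model loader and to the log is serialised with dedicated mutexes (process_tensor, tensor_idx, thread_local_buffer, loader_mutex, log_mutex, results_mutex in the diff). The sketch below is a minimal, self-contained illustration of that pattern only; it is not llama.cpp code, and names such as process_item and result are invented for the example.

    // Minimal sketch of the worker-pool pattern used in the commit (illustrative names only):
    // an atomic counter hands out work items, each thread reuses a private scratch buffer,
    // and completed results are collected under a mutex.
    #include <algorithm>
    #include <atomic>
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <mutex>
    #include <optional>
    #include <thread>
    #include <vector>

    struct result {
        std::size_t index;
        std::size_t checksum;
    };

    // Stand-in for the per-tensor work (quantization-error evaluation in the real code).
    static std::optional<result> process_item(std::size_t idx, std::vector<std::uint8_t> & scratch) {
        scratch.assign(1024, static_cast<std::uint8_t>(idx)); // thread-local scratch, reused across items
        std::size_t sum = 0;
        for (std::uint8_t b : scratch) { sum += b; }
        return result{ idx, sum };
    }

    int main() {
        const std::size_t n_items  = 64;
        const unsigned    n_threads = std::max(1u, std::thread::hardware_concurrency());

        std::atomic<std::size_t> next_idx{0};   // shared work-queue index
        std::mutex results_mutex;               // guards the shared results vector
        std::vector<result> results;
        std::vector<std::thread> workers;

        for (unsigned t = 0; t < n_threads; ++t) {
            workers.emplace_back([&]() {
                std::vector<std::uint8_t> scratch; // one buffer per thread, never shared
                while (true) {
                    const std::size_t idx = next_idx.fetch_add(1);
                    if (idx >= n_items) { break; }
                    if (auto r = process_item(idx, scratch)) {
                        std::lock_guard<std::mutex> lock(results_mutex);
                        results.push_back(*r);
                    }
                }
            });
        }
        for (auto & w : workers) { w.join(); }

        std::printf("processed %zu items on %u threads\n", results.size(), n_threads);
        return 0;
    }

Handing out indices through an atomic counter load-balances uneven per-tensor work without a task queue, and giving each worker its own scratch buffer is what allows the commit to drop the shared buffer (read_data) parameter from target_bpw_type.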
