Skip to content

Commit 5303212

Browse files
committed
Simplify tensor selection
1 parent 8da14c0 commit 5303212

File tree

1 file changed

+11
-88
lines changed

1 file changed

+11
-88
lines changed

src/llama-quant.cpp

Lines changed: 11 additions & 88 deletions
Original file line number | Diff line number | Diff line change
@@ -11,11 +11,12 @@
1111
#include <csignal>
1212
#include <fstream>
1313
#include <mutex>
14+
#include <numeric>
15+
#include <optional>
1416
#include <random>
1517
#include <regex>
1618
#include <thread>
1719
#include <unordered_map>
18-
#include <optional>
1920
#include <unordered_set>
2021

2122
// Quantization types. Changes to this struct must be replicated in quantize.cpp
@@ -1151,7 +1152,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
11511152

11521153
const auto bpw_data = load_bpw_state();
11531154

1154-
// Reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0
1155+
// Parallelize tensor processing - courtesy of https://github.com/ddh0
11551156
auto process_tensor = [&](const llama_model_loader::llama_tensor_weight * tw,
11561157
std::vector<no_init<uint8_t>> & thread_local_buffer,
11571158
std::mutex & loader_mutex,
@@ -1569,93 +1570,15 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
15691570
return emit_overrides();
15701571
}
15711572

1572-
auto tensor_depth = [&](const std::string & name) -> float {
1573-
static const std::regex layer_pattern(R"(blk\.(\d+)\.)");
1574-
std::smatch match;
1575-
1576-
// Depth component: output, embeddings & early/late layers are important
1577-
if (name == "output.weight" || name == "token_embd.weight") {
1578-
return 1.0f;
1579-
}
1580-
if (std::regex_search(name, match, layer_pattern)) {
1581-
const int layer = std::stoi(match[1]);
1582-
const float normalized_layer = (float)layer / (float)std::max(1, (int)model.hparams.n_layer - 1);
1583-
const float center_dist = std::abs(normalized_layer - 0.5f) * 2.0f;
1584-
return 0.01f + 0.9f * center_dist;
1585-
}
1586-
1587-
return 0.0f;
1588-
};
1589-
1590-
auto tensor_importance = [&](const std::vector<tensor_info> & all_tensors) -> std::unordered_map<std::string, float> {
1591-
std::unordered_map<std::string, float> scores;
1592-
for (const auto & t : all_tensors) {
1593-
const std::string name = ggml_get_name(t.w->tensor);
1594-
float total_score = 0.0f;
1595-
float depth_score = 0.0f;
1596-
float type_score = 0.0f;
1597-
1598-
// Type component: certain tensor types have more impact on model quality
1599-
const std::vector<std::pair<float, std::vector<const char*>>> tensor_scores = {
1600-
{0.9f, {".ffn_down.weight", ".ffn_down_exps.weight"}},
1601-
{0.89f, {".attn_output.weight", ".time_mix_output.weight", ".attn_o.weight"}},
1602-
{0.3f, {".ffn_up.weight", ".ffn_gate.weight", ".ffn_up_exps.weight", ".ffn_gate_exps.weight"}},
1603-
{0.29f, {".attn_q.weight", ".attn_k.weight", ".attn_v.weight", ".attn_qkv.weight"}},
1604-
{0.2f, {"token_embd.weight"}}
1605-
};
1606-
if (name == "output.weight") {
1607-
type_score = 1.0f;
1608-
} else {
1609-
for (const auto& ts : tensor_scores) {
1610-
const bool found = std::any_of(ts.second.begin(), ts.second.end(), [&](const char* pattern) {
1611-
return name.find(pattern) != std::string::npos;
1612-
});
1613-
if (found) {
1614-
type_score = ts.first;
1615-
break;
1616-
}
1617-
}
1618-
}
1619-
if (type_score > 0.0f) {
1620-
depth_score = tensor_depth(name);
1621-
}
1622-
1623-
// Weighted combination
1624-
total_score = 0.90f * type_score + 0.10f * depth_score; // 90% type + 10% depth
1625-
if (total_score != 0.0f) {
1626-
scores[name] = total_score;
1627-
LLAMA_LOG_DEBUG("\t%s: \t %45s \t depth score %.4f \t type score %.4f \t total score %.4f\n", func, name.c_str(), depth_score, type_score, total_score);
1628-
}
1629-
}
1630-
1631-
return scores;
1632-
};
1633-
1634-
auto select_tensors = [&](const std::vector<tensor_info> & all_vec) -> std::unordered_set<std::string> {
1635-
const auto scores = tensor_importance(all_vec);
1636-
1637-
// Sort by score
1638-
std::vector<std::pair<std::string, float>> sorted_scores(scores.begin(), scores.end());
1639-
std::sort(sorted_scores.begin(), sorted_scores.end(), [](const auto & a, const auto & b) { return a.second > b.second; });
1640-
1641-
// Select top percentile
1642-
const size_t n_important = std::max<size_t>(1, std::llround((double)sorted_scores.size() * 0.29f)); // 29% seems to be the pareto front
1643-
1644-
std::unordered_set<std::string> important;
1645-
for (size_t i = 0; i < std::min(n_important, sorted_scores.size()); ++i) {
1646-
important.insert(sorted_scores[i].first);
1647-
LLAMA_LOG_DEBUG("\t%s: important tensor %s (score %.4f)\n", func, sorted_scores[i].first.c_str(), sorted_scores[i].second);
1648-
}
1649-
1650-
const auto pct = 100.0 * (double)important.size() / (double)sorted_scores.size();
1651-
LLAMA_LOG_INFO("%s: prioritizing %zu out of %zu tensors (%.2f%%)\n", func, important.size(), sorted_scores.size(), pct);
1652-
return important;
1653-
};
1654-
1655-
const auto important_set = select_tensors(all);
1656-
1573+
// Certain tensors have a higher impact on model quality, so we apply a lower penalty to them
16571574
auto is_important = [&](const std::string & tensor_name) -> bool {
1658-
return important_set.count(tensor_name) > 0;
1575+
const auto important = tensor_name == "output.weight" ||
1576+
tensor_name.find(".ffn_down.weight") != std::string::npos ||
1577+
tensor_name.find(".ffn_down_exps.weight") != std::string::npos ||
1578+
tensor_name.find(".attn_output.weight") != std::string::npos ||
1579+
tensor_name.find(".time_mix_output.weight") != std::string::npos ||
1580+
tensor_name.find(".attn_o.weight") != std::string::npos;
1581+
return important;
16591582
};
16601583

16611584
// Lagrangian relaxation to minimise error subject to a bpw target constraint

0 commit comments

Comments
 (0)