Skip to content

Commit d6ccd56

Browse files
Commit d6ccd56 — "Finetune heuristics" (1 parent: 04561d5)

File tree

1 file changed: +43 additions, −38 deletions

src/llama-quant.cpp

Lines changed: 43 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -838,7 +838,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
838838
LLAMA_LOG_WARN("%s: model ID mismatch, ignoring: %s\n", func, checkpoint_file.c_str());
839839
return out;
840840
} else {
841-
LLAMA_LOG_INFO("%s: resuming tensor quantization\n", func);
841+
LLAMA_LOG_INFO("%s: state file found, resuming tensor quantization\n", func);
842842
}
843843

844844
uint64_t n = 0;
@@ -1569,54 +1569,59 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
15691569
return emit_overrides();
15701570
}
15711571

1572-
auto tensor_importance = [&](const std::vector<tensor_info> & all_vec) -> std::unordered_map<std::string, float> {
1572+
auto tensor_depth = [&](const std::string & name) -> float {
1573+
static const std::regex layer_pattern(R"(blk\.(\d+)\.)");
1574+
std::smatch match;
1575+
1576+
// Depth component: output, embeddings & early/late layers are important
1577+
if (name == "output.weight" || name == "token_embd.weight") {
1578+
return 1.0f;
1579+
}
1580+
if (std::regex_search(name, match, layer_pattern)) {
1581+
const int layer = std::stoi(match[1]);
1582+
const float normalized_layer = (float)layer / (float)std::max(1, (int)model.hparams.n_layer - 1);
1583+
const float center_dist = std::abs(normalized_layer - 0.5f) * 2.0f;
1584+
return 0.01f + 0.9f * center_dist;
1585+
}
1586+
1587+
return 0.0f;
1588+
};
1589+
1590+
auto tensor_importance = [&](const std::vector<tensor_info> & all_tensors) -> std::unordered_map<std::string, float> {
15731591
std::unordered_map<std::string, float> scores;
1574-
for (const auto & ti : all_vec) {
1575-
const std::string name = ggml_get_name(ti.w->tensor);
1592+
for (const auto & t : all_tensors) {
1593+
const std::string name = ggml_get_name(t.w->tensor);
15761594
float total_score = 0.0f;
15771595
float depth_score = 0.0f;
15781596
float type_score = 0.0f;
15791597

1580-
// Depth component: output & early/late layers are important
1598+
// Type component: certain tensor types have more impact on model quality
1599+
const std::vector<std::pair<float, std::vector<const char*>>> tensor_scores = {
1600+
{0.9f, {".ffn_down.weight", ".ffn_down_exps.weight"}},
1601+
{0.89f, {".attn_output.weight", ".time_mix_output.weight", ".attn_o.weight"}},
1602+
{0.3f, {".ffn_up.weight", ".ffn_gate.weight", ".ffn_up_exps.weight", ".ffn_gate_exps.weight"}},
1603+
{0.29f, {".attn_q.weight", ".attn_k.weight", ".attn_v.weight", ".attn_qkv.weight"}},
1604+
{0.2f, {"token_embd.weight"}}
1605+
};
15811606
if (name == "output.weight") {
1582-
depth_score = 1.0f;
1607+
type_score = 1.0f;
15831608
} else {
1584-
static const std::regex layer_pattern(R"(blk\.(\d+)\.)");
1585-
std::smatch match;
1586-
if (std::regex_search(name, match, layer_pattern)) {
1587-
const int layer = std::stoi(match[1]);
1588-
const float normalized_layer = (float)layer / (float)std::max(1, (int)model.hparams.n_layer - 1);
1589-
const float center_dist = std::abs(normalized_layer - 0.5f) * 2.0f;
1590-
depth_score = 0.9f * center_dist;
1609+
for (const auto& ts : tensor_scores) {
1610+
const bool found = std::any_of(ts.second.begin(), ts.second.end(), [&](const char* pattern) {
1611+
return name.find(pattern) != std::string::npos;
1612+
});
1613+
if (found) {
1614+
type_score = ts.first;
1615+
break;
1616+
}
15911617
}
15921618
}
1593-
1594-
// Type component: certain tensor types have more impact on model quality
1595-
if (name == "output.weight") {
1596-
type_score = 1.0f;
1597-
} else if (name.find(".ffn_down.weight") != std::string::npos ||
1598-
name.find(".ffn_down_exps.weight") != std::string::npos) {
1599-
type_score = 0.9f;
1600-
} else if (name.find(".attn_output.weight") != std::string::npos ||
1601-
name.find(".time_mix_output.weight") != std::string::npos ||
1602-
name.find(".attn_o.weight") != std::string::npos) {
1603-
type_score = 0.8f;
1604-
} else if (name.find(".ffn_up.weight") != std::string::npos ||
1605-
name.find(".ffn_gate.weight") != std::string::npos ||
1606-
name.find(".ffn_up_exps.weight") != std::string::npos ||
1607-
name.find(".ffn_gate_exps.weight") != std::string::npos) {
1608-
type_score = 0.3f;
1609-
} else if (name.find(".attn_q.weight") != std::string::npos ||
1610-
name.find(".attn_k.weight") != std::string::npos ||
1611-
name.find(".attn_v.weight") != std::string::npos ||
1612-
name.find(".attn_qkv.weight") != std::string::npos) {
1613-
type_score = 0.2f;
1614-
} else if (name.find("token_embd.weight") != std::string::npos) {
1615-
type_score = 0.1f;
1619+
if (type_score > 0.0f) {
1620+
depth_score = tensor_depth(name);
16161621
}
16171622

16181623
// Weighted combination
1619-
total_score = 0.8f * type_score + 0.2f * depth_score; // 80% type + 20% depth
1624+
total_score = 0.90f * type_score + 0.10f * depth_score; // 90% type + 10% depth
16201625
if (total_score != 0.0f) {
16211626
scores[name] = total_score;
16221627
LLAMA_LOG_DEBUG("\t%s: \t %45s \t depth score %.4f \t type score %.4f \t total score %.4f\n", func, name.c_str(), depth_score, type_score, total_score);
@@ -1634,7 +1639,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
16341639
std::sort(sorted_scores.begin(), sorted_scores.end(), [](const auto & a, const auto & b) { return a.second > b.second; });
16351640

16361641
// Select top percentile
1637-
const size_t n_important = std::max<size_t>(1, std::llround((double)sorted_scores.size() * 0.25f)); // bump top 25%
1642+
const size_t n_important = std::max<size_t>(1, std::llround((double)sorted_scores.size() * 0.29f)); // 29% seems to be the pareto front
16381643

16391644
std::unordered_set<std::string> important;
16401645
for (size_t i = 0; i < std::min(n_important, sorted_scores.size()); ++i) {

Commit comments (0)