@@ -838,7 +838,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             LLAMA_LOG_WARN("%s: model ID mismatch, ignoring: %s\n", func, checkpoint_file.c_str());
             return out;
         } else {
-            LLAMA_LOG_INFO("%s: resuming tensor quantization\n", func);
+            LLAMA_LOG_INFO("%s: state file found, resuming tensor quantization\n", func);
         }

         uint64_t n = 0;
@@ -1569,54 +1569,59 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         return emit_overrides();
     }

-    auto tensor_importance = [&](const std::vector<tensor_info> & all_vec) -> std::unordered_map<std::string, float> {
+    auto tensor_depth = [&](const std::string & name) -> float {
+        static const std::regex layer_pattern(R"(blk\.(\d+)\.)");
+        std::smatch match;
+
+        // Depth component: output, embeddings & early/late layers are important
+        if (name == "output.weight" || name == "token_embd.weight") {
+            return 1.0f;
+        }
+        if (std::regex_search(name, match, layer_pattern)) {
+            const int layer = std::stoi(match[1]);
+            const float normalized_layer = (float)layer / (float)std::max(1, (int)model.hparams.n_layer - 1);
+            const float center_dist = std::abs(normalized_layer - 0.5f) * 2.0f;
+            return 0.01f + 0.9f * center_dist;
+        }
+
+        return 0.0f;
+    };
+
+    auto tensor_importance = [&](const std::vector<tensor_info> & all_tensors) -> std::unordered_map<std::string, float> {
         std::unordered_map<std::string, float> scores;
-        for (const auto & ti : all_vec) {
-            const std::string name = ggml_get_name(ti.w->tensor);
+        for (const auto & t : all_tensors) {
+            const std::string name = ggml_get_name(t.w->tensor);
             float total_score = 0.0f;
             float depth_score = 0.0f;
             float type_score  = 0.0f;

-            // Depth component: output & early/late layers are important
+            // Type component: certain tensor types have more impact on model quality
+            const std::vector<std::pair<float, std::vector<const char *>>> tensor_scores = {
+                {0.9f,  {".ffn_down.weight", ".ffn_down_exps.weight"}},
+                {0.89f, {".attn_output.weight", ".time_mix_output.weight", ".attn_o.weight"}},
+                {0.3f,  {".ffn_up.weight", ".ffn_gate.weight", ".ffn_up_exps.weight", ".ffn_gate_exps.weight"}},
+                {0.29f, {".attn_q.weight", ".attn_k.weight", ".attn_v.weight", ".attn_qkv.weight"}},
+                {0.2f,  {"token_embd.weight"}}
+            };
             if (name == "output.weight") {
-                depth_score = 1.0f;
+                type_score = 1.0f;
             } else {
-                static const std::regex layer_pattern(R"(blk\.(\d+)\.)");
-                std::smatch match;
-                if (std::regex_search(name, match, layer_pattern)) {
-                    const int layer = std::stoi(match[1]);
-                    const float normalized_layer = (float)layer / (float)std::max(1, (int)model.hparams.n_layer - 1);
-                    const float center_dist = std::abs(normalized_layer - 0.5f) * 2.0f;
-                    depth_score = 0.9f * center_dist;
+                for (const auto & ts : tensor_scores) {
+                    const bool found = std::any_of(ts.second.begin(), ts.second.end(), [&](const char * pattern) {
+                        return name.find(pattern) != std::string::npos;
+                    });
+                    if (found) {
+                        type_score = ts.first;
+                        break;
+                    }
                 }
             }
-
-            // Type component: certain tensor types have more impact on model quality
-            if (name == "output.weight") {
-                type_score = 1.0f;
-            } else if (name.find(".ffn_down.weight") != std::string::npos ||
-                       name.find(".ffn_down_exps.weight") != std::string::npos) {
-                type_score = 0.9f;
-            } else if (name.find(".attn_output.weight") != std::string::npos ||
-                       name.find(".time_mix_output.weight") != std::string::npos ||
-                       name.find(".attn_o.weight") != std::string::npos) {
-                type_score = 0.8f;
-            } else if (name.find(".ffn_up.weight") != std::string::npos ||
-                       name.find(".ffn_gate.weight") != std::string::npos ||
-                       name.find(".ffn_up_exps.weight") != std::string::npos ||
-                       name.find(".ffn_gate_exps.weight") != std::string::npos) {
-                type_score = 0.3f;
-            } else if (name.find(".attn_q.weight") != std::string::npos ||
-                       name.find(".attn_k.weight") != std::string::npos ||
-                       name.find(".attn_v.weight") != std::string::npos ||
-                       name.find(".attn_qkv.weight") != std::string::npos) {
-                type_score = 0.2f;
-            } else if (name.find("token_embd.weight") != std::string::npos) {
-                type_score = 0.1f;
+            if (type_score > 0.0f) {
+                depth_score = tensor_depth(name);
             }

             // Weighted combination
-            total_score = 0.8f * type_score + 0.2f * depth_score; // 80% type + 20% depth
+            total_score = 0.90f * type_score + 0.10f * depth_score; // 90% type + 10% depth
             if (total_score != 0.0f) {
                 scores[name] = total_score;
                 LLAMA_LOG_DEBUG("\t%s: \t%45s \tdepth score %.4f \ttype score %.4f \ttotal score %.4f\n", func, name.c_str(), depth_score, type_score, total_score);
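Note: as a sanity check on the new weighting, the sketch below reproduces the revised scoring outside of llama.cpp. The `n_layer = 32` value, the helper names `depth_score`/`type_score`, and the trimmed-down pattern table are hypothetical stand-ins; only the constants (the type scores, the `0.01 + 0.9 * center_dist` depth curve, and the 90/10 blend) come from the patch.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

static const int n_layer = 32; // hypothetical layer count; the patch reads model.hparams.n_layer

// Depth component: output/embeddings and early/late layers score higher than middle layers
static float depth_score(const std::string & name) {
    static const std::regex layer_pattern(R"(blk\.(\d+)\.)");
    std::smatch match;
    if (name == "output.weight" || name == "token_embd.weight") {
        return 1.0f;
    }
    if (std::regex_search(name, match, layer_pattern)) {
        const int   layer       = std::stoi(match[1]);
        const float normalized  = (float) layer / (float) std::max(1, n_layer - 1);
        const float center_dist = std::abs(normalized - 0.5f) * 2.0f; // 0 at the middle, 1 at the ends
        return 0.01f + 0.9f * center_dist;
    }
    return 0.0f;
}

// Type component: a reduced version of the patch's tensor_scores table
static float type_score(const std::string & name) {
    if (name == "output.weight")                                { return 1.0f;  }
    if (name.find(".ffn_down.weight")    != std::string::npos) { return 0.9f;  }
    if (name.find(".attn_output.weight") != std::string::npos) { return 0.89f; }
    if (name.find(".ffn_up.weight")      != std::string::npos ||
        name.find(".ffn_gate.weight")    != std::string::npos) { return 0.3f;  }
    if (name.find(".attn_q.weight")      != std::string::npos ||
        name.find(".attn_k.weight")      != std::string::npos ||
        name.find(".attn_v.weight")      != std::string::npos) { return 0.29f; }
    if (name.find("token_embd.weight")   != std::string::npos) { return 0.2f;  }
    return 0.0f;
}

int main() {
    const std::vector<std::string> names = {
        "output.weight", "blk.0.ffn_down.weight", "blk.15.ffn_down.weight",
        "blk.31.attn_q.weight", "token_embd.weight",
    };
    for (const auto & name : names) {
        const float ts    = type_score(name);
        const float ds    = ts > 0.0f ? depth_score(name) : 0.0f; // depth only refines tensors with a known type
        const float total = 0.90f * ts + 0.10f * ds;              // 90% type + 10% depth, as in the patch
        std::printf("%-28s type %.2f  depth %.2f  total %.3f\n", name.c_str(), ts, ds, total);
    }
    return 0;
}
```

With these weights a mid-stack `ffn_down` still outranks any attention projection, which appears to be the intent of moving from the 80/20 to the 90/10 blend.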
@@ -1634,7 +1639,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     std::sort(sorted_scores.begin(), sorted_scores.end(), [](const auto & a, const auto & b) { return a.second > b.second; });

     // Select top percentile
-    const size_t n_important = std::max<size_t>(1, std::llround((double)sorted_scores.size() * 0.25f)); // bump top 25%
+    const size_t n_important = std::max<size_t>(1, std::llround((double)sorted_scores.size() * 0.29f)); // 29% seems to be the pareto front

     std::unordered_set<std::string> important;
     for (size_t i = 0; i < std::min(n_important, sorted_scores.size()); ++i) {
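For reference, a minimal standalone sketch of the top-percentile cut with the new 29% threshold; the sample `(name, score)` pairs are made up and only stand in for the real `scores` map built above.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <string>
#include <unordered_set>
#include <vector>

int main() {
    // Hypothetical (name, score) pairs standing in for the computed importance scores.
    std::vector<std::pair<std::string, float>> sorted_scores = {
        {"blk.0.ffn_down.weight",  0.91f}, {"output.weight",          1.00f},
        {"blk.15.attn_q.weight",   0.26f}, {"blk.31.ffn_up.weight",   0.36f},
        {"blk.7.attn_v.weight",    0.28f}, {"blk.3.ffn_gate.weight",  0.33f},
    };

    // Highest score first, as in the patch
    std::sort(sorted_scores.begin(), sorted_scores.end(),
              [](const auto & a, const auto & b) { return a.second > b.second; });

    // Keep the top 29% (but at least one tensor), mirroring the new cut-off
    const size_t n_important = std::max<size_t>(1, (size_t) std::llround((double) sorted_scores.size() * 0.29));

    std::unordered_set<std::string> important;
    for (size_t i = 0; i < std::min(n_important, sorted_scores.size()); ++i) {
        important.insert(sorted_scores[i].first);
    }

    for (const auto & name : important) {
        std::printf("important: %s\n", name.c_str()); // these tensors would be bumped to a larger quant type
    }
    return 0;
}
```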