@@ -838,7 +838,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
838838            LLAMA_LOG_WARN (" %s: model ID mismatch, ignoring: %s\n " c_str ());
839839            return  out;
840840        } else  {
841-             LLAMA_LOG_INFO (" %s: resuming tensor quantization\n " 
841+             LLAMA_LOG_INFO (" %s: state file found,  resuming tensor quantization\n " 
842842        }
843843
844844        uint64_t  n = 0 ;
@@ -1569,54 +1569,59 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
15691569        return  emit_overrides ();
15701570    }
15711571
1572-     auto  tensor_importance = [&](const  std::vector<tensor_info> & all_vec) -> std::unordered_map<std::string, float > {
1572+     auto  tensor_depth = [&](const  std::string & name) -> float  {
1573+         static  const  std::regex layer_pattern (R"( blk\.(\d+)\.)" 
1574+         std::smatch match;
1575+ 
1576+         //  Depth component: output, embeddings & early/late layers are important
1577+         if  (name == " output.weight" " token_embd.weight" 
1578+             return  1 .0f ;
1579+         }
1580+         if  (std::regex_search (name, match, layer_pattern)) {
1581+             const  int  layer = std::stoi (match[1 ]);
1582+             const  float  normalized_layer = (float )layer / (float )std::max (1 , (int )model.hparams .n_layer  - 1 );
1583+             const  float  center_dist = std::abs (normalized_layer - 0 .5f ) * 2 .0f ;
1584+             return  0 .01f  + 0 .9f  * center_dist;
1585+         }
1586+ 
1587+         return  0 .0f ;
1588+     };
1589+ 
1590+     auto  tensor_importance = [&](const  std::vector<tensor_info> & all_tensors) -> std::unordered_map<std::string, float > {
15731591        std::unordered_map<std::string, float > scores;
1574-         for  (const  auto  & ti  : all_vec ) {
1575-             const  std::string name = ggml_get_name (ti .w ->tensor );
1592+         for  (const  auto  & t  : all_tensors ) {
1593+             const  std::string name = ggml_get_name (t .w ->tensor );
15761594            float  total_score = 0 .0f ;
15771595            float  depth_score = 0 .0f ;
15781596            float  type_score = 0 .0f ;
15791597
1580-             //  Depth component: output & early/late layers are important
1598+             //  Type component: certain tensor types have more impact on model quality
1599+             const  std::vector<std::pair<float , std::vector<const  char *>>> tensor_scores = {
1600+                 {0 .9f , {" .ffn_down.weight" " .ffn_down_exps.weight" 
1601+                 {0 .89f , {" .attn_output.weight" " .time_mix_output.weight" " .attn_o.weight" 
1602+                 {0 .3f , {" .ffn_up.weight" " .ffn_gate.weight" " .ffn_up_exps.weight" " .ffn_gate_exps.weight" 
1603+                 {0 .29f , {" .attn_q.weight" " .attn_k.weight" " .attn_v.weight" " .attn_qkv.weight" 
1604+                 {0 .2f , {" token_embd.weight" 
1605+             };
15811606            if  (name == " output.weight" 
1582-                 depth_score  = 1 .0f ;
1607+                 type_score  = 1 .0f ;
15831608            } else  {
1584-                 static  const  std::regex layer_pattern (R"( blk\.(\d+)\.)" 
1585-                 std::smatch match;
1586-                 if  (std::regex_search (name, match, layer_pattern)) {
1587-                     const  int  layer = std::stoi (match[1 ]);
1588-                     const  float  normalized_layer = (float )layer / (float )std::max (1 , (int )model.hparams .n_layer  - 1 );
1589-                     const  float  center_dist = std::abs (normalized_layer - 0 .5f ) * 2 .0f ;
1590-                     depth_score = 0 .9f  * center_dist;
1609+                 for  (const  auto & ts : tensor_scores) {
1610+                     const  bool  found = std::any_of (ts.second .begin (), ts.second .end (), [&](const  char * pattern) {
1611+                         return  name.find (pattern) != std::string::npos;
1612+                     });
1613+                     if  (found) {
1614+                         type_score = ts.first ;
1615+                         break ;
1616+                     }
15911617                }
15921618            }
1593- 
1594-             //  Type component: certain tensor types have more impact on model quality
1595-             if  (name == " output.weight" 
1596-                 type_score = 1 .0f ;
1597-             } else  if  (name.find (" .ffn_down.weight" 
1598-                        name.find (" .ffn_down_exps.weight" 
1599-                 type_score = 0 .9f ;
1600-             } else  if  (name.find (" .attn_output.weight" 
1601-                        name.find (" .time_mix_output.weight" 
1602-                        name.find (" .attn_o.weight" 
1603-                 type_score = 0 .8f ;
1604-             } else  if  (name.find (" .ffn_up.weight" 
1605-                        name.find (" .ffn_gate.weight" 
1606-                        name.find (" .ffn_up_exps.weight" 
1607-                        name.find (" .ffn_gate_exps.weight" 
1608-                 type_score = 0 .3f ;
1609-             } else  if  (name.find (" .attn_q.weight" 
1610-                        name.find (" .attn_k.weight" 
1611-                        name.find (" .attn_v.weight" 
1612-                        name.find (" .attn_qkv.weight" 
1613-                 type_score = 0 .2f ;
1614-             } else  if  (name.find (" token_embd.weight" 
1615-                 type_score = 0 .1f ;
1619+             if  (type_score > 0 .0f ) {
1620+                 depth_score = tensor_depth (name);
16161621            }
16171622
16181623            //  Weighted combination
1619-             total_score = 0 .8f  * type_score + 0 .2f  * depth_score; //  80 % type + 20 % depth
1624+             total_score = 0 .90f  * type_score + 0 .10f  * depth_score; //  90 % type + 10 % depth
16201625            if  (total_score != 0 .0f ) {
16211626                scores[name] = total_score;
16221627                LLAMA_LOG_DEBUG (" \t %s: \t  %45s \t  depth score %.4f \t  type score %.4f \t  total score %.4f\n " c_str (), depth_score, type_score, total_score);
@@ -1634,7 +1639,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
16341639        std::sort (sorted_scores.begin (), sorted_scores.end (), [](const  auto  & a, const  auto  & b) { return  a.second  > b.second ; });
16351640
16361641        //  Select top percentile
1637-         const  size_t  n_important = std::max<size_t >(1 , std::llround ((double )sorted_scores.size () * 0 .25f )); //  bump top 25% 
1642+         const  size_t  n_important = std::max<size_t >(1 , std::llround ((double )sorted_scores.size () * 0 .29f )); //  29% seems to be the pareto front 
16381643
16391644        std::unordered_set<std::string> important;
16401645        for  (size_t  i = 0 ; i < std::min (n_important, sorted_scores.size ()); ++i) {
0 commit comments