|
11 | 11 | #include <csignal> |
12 | 12 | #include <fstream> |
13 | 13 | #include <mutex> |
| 14 | +#include <numeric> |
| 15 | +#include <optional> |
14 | 16 | #include <random> |
15 | 17 | #include <regex> |
16 | 18 | #include <thread> |
17 | 19 | #include <unordered_map> |
18 | | -#include <optional> |
19 | 20 | #include <unordered_set> |
20 | 21 |
|
21 | 22 | // Quantization types. Changes to this struct must be replicated in quantize.cpp |
@@ -1151,7 +1152,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type( |
1151 | 1152 |
|
1152 | 1153 | const auto bpw_data = load_bpw_state(); |
1153 | 1154 |
|
1154 | | - // Reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0 |
| 1155 | + // Parallelize tensor processing - courtesy of https://github.com/ddh0 |
1155 | 1156 | auto process_tensor = [&](const llama_model_loader::llama_tensor_weight * tw, |
1156 | 1157 | std::vector<no_init<uint8_t>> & thread_local_buffer, |
1157 | 1158 | std::mutex & loader_mutex, |
@@ -1569,93 +1570,15 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type( |
1569 | 1570 | return emit_overrides(); |
1570 | 1571 | } |
1571 | 1572 |
|
1572 | | - auto tensor_depth = [&](const std::string & name) -> float { |
1573 | | - static const std::regex layer_pattern(R"(blk\.(\d+)\.)"); |
1574 | | - std::smatch match; |
1575 | | - |
1576 | | - // Depth component: output, embeddings & early/late layers are important |
1577 | | - if (name == "output.weight" || name == "token_embd.weight") { |
1578 | | - return 1.0f; |
1579 | | - } |
1580 | | - if (std::regex_search(name, match, layer_pattern)) { |
1581 | | - const int layer = std::stoi(match[1]); |
1582 | | - const float normalized_layer = (float)layer / (float)std::max(1, (int)model.hparams.n_layer - 1); |
1583 | | - const float center_dist = std::abs(normalized_layer - 0.5f) * 2.0f; |
1584 | | - return 0.01f + 0.9f * center_dist; |
1585 | | - } |
1586 | | - |
1587 | | - return 0.0f; |
1588 | | - }; |
1589 | | - |
1590 | | - auto tensor_importance = [&](const std::vector<tensor_info> & all_tensors) -> std::unordered_map<std::string, float> { |
1591 | | - std::unordered_map<std::string, float> scores; |
1592 | | - for (const auto & t : all_tensors) { |
1593 | | - const std::string name = ggml_get_name(t.w->tensor); |
1594 | | - float total_score = 0.0f; |
1595 | | - float depth_score = 0.0f; |
1596 | | - float type_score = 0.0f; |
1597 | | - |
1598 | | - // Type component: certain tensor types have more impact on model quality |
1599 | | - const std::vector<std::pair<float, std::vector<const char*>>> tensor_scores = { |
1600 | | - {0.9f, {".ffn_down.weight", ".ffn_down_exps.weight"}}, |
1601 | | - {0.89f, {".attn_output.weight", ".time_mix_output.weight", ".attn_o.weight"}}, |
1602 | | - {0.3f, {".ffn_up.weight", ".ffn_gate.weight", ".ffn_up_exps.weight", ".ffn_gate_exps.weight"}}, |
1603 | | - {0.29f, {".attn_q.weight", ".attn_k.weight", ".attn_v.weight", ".attn_qkv.weight"}}, |
1604 | | - {0.2f, {"token_embd.weight"}} |
1605 | | - }; |
1606 | | - if (name == "output.weight") { |
1607 | | - type_score = 1.0f; |
1608 | | - } else { |
1609 | | - for (const auto& ts : tensor_scores) { |
1610 | | - const bool found = std::any_of(ts.second.begin(), ts.second.end(), [&](const char* pattern) { |
1611 | | - return name.find(pattern) != std::string::npos; |
1612 | | - }); |
1613 | | - if (found) { |
1614 | | - type_score = ts.first; |
1615 | | - break; |
1616 | | - } |
1617 | | - } |
1618 | | - } |
1619 | | - if (type_score > 0.0f) { |
1620 | | - depth_score = tensor_depth(name); |
1621 | | - } |
1622 | | - |
1623 | | - // Weighted combination |
1624 | | - total_score = 0.90f * type_score + 0.10f * depth_score; // 90% type + 10% depth |
1625 | | - if (total_score != 0.0f) { |
1626 | | - scores[name] = total_score; |
1627 | | - LLAMA_LOG_DEBUG("\t%s: \t %45s \t depth score %.4f \t type score %.4f \t total score %.4f\n", func, name.c_str(), depth_score, type_score, total_score); |
1628 | | - } |
1629 | | - } |
1630 | | - |
1631 | | - return scores; |
1632 | | - }; |
1633 | | - |
1634 | | - auto select_tensors = [&](const std::vector<tensor_info> & all_vec) -> std::unordered_set<std::string> { |
1635 | | - const auto scores = tensor_importance(all_vec); |
1636 | | - |
1637 | | - // Sort by score |
1638 | | - std::vector<std::pair<std::string, float>> sorted_scores(scores.begin(), scores.end()); |
1639 | | - std::sort(sorted_scores.begin(), sorted_scores.end(), [](const auto & a, const auto & b) { return a.second > b.second; }); |
1640 | | - |
1641 | | - // Select top percentile |
1642 | | - const size_t n_important = std::max<size_t>(1, std::llround((double)sorted_scores.size() * 0.29f)); // 29% seems to be the pareto front |
1643 | | - |
1644 | | - std::unordered_set<std::string> important; |
1645 | | - for (size_t i = 0; i < std::min(n_important, sorted_scores.size()); ++i) { |
1646 | | - important.insert(sorted_scores[i].first); |
1647 | | - LLAMA_LOG_DEBUG("\t%s: important tensor %s (score %.4f)\n", func, sorted_scores[i].first.c_str(), sorted_scores[i].second); |
1648 | | - } |
1649 | | - |
1650 | | - const auto pct = 100.0 * (double)important.size() / (double)sorted_scores.size(); |
1651 | | - LLAMA_LOG_INFO("%s: prioritizing %zu out of %zu tensors (%.2f%%)\n", func, important.size(), sorted_scores.size(), pct); |
1652 | | - return important; |
1653 | | - }; |
1654 | | - |
1655 | | - const auto important_set = select_tensors(all); |
1656 | | - |
| 1573 | + // Certain tensors have a higher impact on model quality, so we apply a lower penalty to them |
1657 | 1574 | auto is_important = [&](const std::string & tensor_name) -> bool { |
1658 | | - return important_set.count(tensor_name) > 0; |
| 1575 | + const auto important = tensor_name == "output.weight" || |
| 1576 | + tensor_name.find(".ffn_down.weight") != std::string::npos || |
| 1577 | + tensor_name.find(".ffn_down_exps.weight") != std::string::npos || |
| 1578 | + tensor_name.find(".attn_output.weight") != std::string::npos || |
| 1579 | + tensor_name.find(".time_mix_output.weight") != std::string::npos || |
| 1580 | + tensor_name.find(".attn_o.weight") != std::string::npos; |
| 1581 | + return important; |
1659 | 1582 | }; |
1660 | 1583 |
|
1661 | 1584 | // Lagrangian relaxation to minimise error subject to a bpw target constraint |
|
0 commit comments