Remove allowed tensor checks

EAddario · EAddario · commit 61bb6e2640a6 · 2025-05-11T13:56:01.000+01:00
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
@@ -14,6 +14,12 @@
 #include <thread>
 #include <unordered_map>
 
+// Quantization types. Changes to this struct must be replicated in tools/quantize/quantize.cpp
+struct tensor_quantization {
+    std::string name;
+    ggml_type quant = GGML_TYPE_COUNT;
+};
+
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
     for (size_t i = 0; i < n; ++i) {
@@ -793,20 +799,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                     const std::string tensor_name(tensor->name);
                     for (const auto & [tname, qtype] : tensor_types) {
                         if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
-                            for (const auto & allowed : ALLOWED_TENSOR_TYPE) {
-                                if (tensor_name.find(allowed) != std::string::npos) {
-                                    if  (qtype != new_type) {
-                                        LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
-                                        new_type = qtype;
-                                        break;
-                                    }
-                                }
+                            if  (qtype != new_type) {
+                                LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
+                                new_type = qtype;
+                                break; // if two or more types are specified for the tensor, first match wins
                             }
-                            goto loop_exit; // if two or more types are specified for the tensor, first match wins
                         }
                     }
                 }
-                loop_exit:;
             }
 
             if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
diff --git a/src/llama-quant.h b/src/llama-quant.h
@@ -1,66 +1 @@
 #pragma once
-
-#include <string>
-#include <vector>
-
-#include "ggml.h"
-
-// Allowed tensors for arbitrary quantization with --tensor-type option
-static const std::vector<std::string> ALLOWED_TENSOR_TYPE = {
-  "attn_k",
-  "attn_k_b",
-  "attn_kv_a_mqa",
-  "attn_kv_b",
-  "attn_o",
-  "attn_output",
-  "attn_q",
-  "attn_q_a",
-  "attn_q_b",
-  "attn_qkv",
-  "attn_rel_b",
-  "attn_v",
-  "attn_v_b",
-  "channel_mix_key",
-  "channel_mix_receptance",
-  "channel_mix_value",
-  "cls",
-  "cls.output",
-  "conv1",
-  "conv1d",
-  "conv2",
-  "cross_attn_k",
-  "cross_attn_o",
-  "cross_attn_q",
-  "cross_attn_rel_b",
-  "cross_attn_v",
-  "dw",
-  "ffn_down",
-  "ffn_down_exps",
-  "ffn_down_shexp",
-  "ffn_gate",
-  "ffn_gate_exps",
-  "ffn_gate_shexp",
-  "ffn_up",
-  "ffn_up_exps",
-  "ffn_up_shexp",
-  "pw1",
-  "pw1",
-  "ssm_a",
-  "ssm_conv1d",
-  "ssm_dt",
-  "ssm_in",
-  "ssm_out",
-  "ssm_x",
-  "time_mix_gate",
-  "time_mix_key",
-  "time_mix_output",
-  "time_mix_receptance",
-  "time_mix_value",
-  "token_types"
-};
-
-// Quantization types
-struct tensor_quantization {
-  std::string name;
-  ggml_type quant = GGML_TYPE_COUNT;
-};
diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
@@ -1,6 +1,5 @@
 #include "common.h"
 #include "llama.h"
-#include "llama-quant.h"
 
 #include <cstdio>
 #include <cstring>
@@ -58,6 +57,12 @@ static const std::vector<quant_option> QUANT_OPTIONS = {
     { "COPY",     LLAMA_FTYPE_ALL_F32,         "only copy tensors, no quantizing",  },
 };
 
+// Quantization types. Changes to this struct must be replicated in src/llama-quant.cpp
+struct tensor_quantization {
+    std::string name;
+    ggml_type quant = GGML_TYPE_COUNT;
+};
+
 static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE       = "quantize.imatrix.file";
 static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET    = "quantize.imatrix.dataset";
 static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES  = "quantize.imatrix.entries_count";
@@ -245,7 +250,7 @@ static ggml_type parse_ggml_type(const char * arg) {
             return type;
         }
     }
-    fprintf(stderr, "%s: invalid ggml_type '%s'\n", __func__, arg);
+    fprintf(stderr, "\n%s: invalid ggml_type '%s'\n\n", __func__, arg);
     return GGML_TYPE_COUNT;
 }