Commit 6c281ad

Move struct declaration to header and update allowed tensor list
1 parent 971f245 commit 6c281ad

File tree (3 files changed, +66 -52 lines):

examples/quantize/quantize.cpp
src/llama-quant.cpp
src/llama-quant.h

examples/quantize/quantize.cpp

Lines changed: 1 addition & 46 deletions
@@ -1,5 +1,6 @@
 #include "common.h"
 #include "llama.h"
+#include "llama-quant.h"
 
 #include <cstdio>
 #include <cstring>
@@ -248,52 +249,6 @@ static ggml_type parse_ggml_type(const char * arg) {
     return GGML_TYPE_COUNT;
 }
 
-// Allowed tensors for arbitrary quantization with --tensor-type option
-static const std::vector<std::string> ALLOWED_TENSOR_TYPE = {
-    "attn_k",
-    "attn_kv_a_mqa",
-    "attn_kv_b",
-    "attn_o",
-    "attn_output",
-    "attn_q",
-    "attn_q_a",
-    "attn_q_b",
-    "attn_qkv",
-    "attn_v",
-    "channel_mix_key",
-    "channel_mix_receptance",
-    "channel_mix_value",
-    "cls",
-    "cls.output",
-    "cross_attn_k",
-    "cross_attn_o",
-    "cross_attn_q",
-    "cross_attn_v",
-    "ffn_act",
-    "ffn_down",
-    "ffn_down_exps",
-    "ffn_down_shexp",
-    "ffn_gate",
-    "ffn_gate_exps",
-    "ffn_gate_shexp",
-    "ffn_up",
-    "ffn_up_exps",
-    "ffn_up_shexp",
-    "ssm_in",
-    "ssm_out",
-    "time_mix_gate",
-    "time_mix_key",
-    "time_mix_output",
-    "time_mix_receptance",
-    "time_mix_value",
-};
-
-// changes to this struct must be replicated in llama-quant.cpp
-struct tensor_quantization {
-    std::string name;
-    ggml_type quant = GGML_TYPE_COUNT;
-};
-
 static bool parse_tensor_type(const char * data, std::vector<tensor_quantization> & tensor_type) {
     const char * sep = strchr(data, '=');
     if (sep == nullptr) {
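
For context, parse_tensor_type (kept in quantize.cpp by this hunk) splits each --tensor-type argument at the first '='. Below is a minimal sketch of that split; only parse_tensor_type, the '=' separator, and the tensor/type names come from the diff, while the split_tensor_type helper is hypothetical and illustrative.

#include <cstring>
#include <string>

// Illustrative only: split a "--tensor-type" argument of the form
// "<tensor-name>=<quant-type>" into its two halves, mirroring the
// strchr(data, '=') check visible in parse_tensor_type above.
static bool split_tensor_type(const char * data, std::string & name, std::string & type) {
    const char * sep = strchr(data, '=');
    if (sep == nullptr) {
        return false; // no '=': malformed argument such as "attn_v" without a type
    }
    name.assign(data, sep - data); // text before '=', e.g. "attn_v"
    type.assign(sep + 1);          // text after '=',  e.g. "q6_k"
    return !name.empty() && !type.empty();
}

The resulting name would then presumably be checked against ALLOWED_TENSOR_TYPE and the type mapped through parse_ggml_type, but that logic is outside this hunk.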

src/llama-quant.cpp

Lines changed: 0 additions & 6 deletions
@@ -48,12 +48,6 @@ struct quantize_state_impl {
         {}
 };
 
-// changes to this struct must be replicated in quantize.cpp
-struct tensor_quantization {
-    std::string name;
-    ggml_type quant = GGML_TYPE_COUNT;
-};
-
 static void llama_tensor_dequantize_impl(
     ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread

src/llama-quant.h

Lines changed: 65 additions & 0 deletions
@@ -1 +1,66 @@
 #pragma once
+
+#include <string>
+#include <vector>
+
+#include "ggml.h"
+
+// Allowed tensors for arbitrary quantization with --tensor-type option
+static const std::vector<std::string> ALLOWED_TENSOR_TYPE = {
+    "attn_k",
+    "attn_k_b",
+    "attn_kv_a_mqa",
+    "attn_kv_b",
+    "attn_o",
+    "attn_output",
+    "attn_q",
+    "attn_q_a",
+    "attn_q_b",
+    "attn_qkv",
+    "attn_rel_b",
+    "attn_v",
+    "attn_v_b",
+    "channel_mix_key",
+    "channel_mix_receptance",
+    "channel_mix_value",
+    "cls",
+    "cls.output",
+    "conv1",
+    "conv1d",
+    "conv2",
+    "cross_attn_k",
+    "cross_attn_o",
+    "cross_attn_q",
+    "cross_attn_rel_b",
+    "cross_attn_v",
+    "dw",
+    "ffn_down",
+    "ffn_down_exps",
+    "ffn_down_shexp",
+    "ffn_gate",
+    "ffn_gate_exps",
+    "ffn_gate_shexp",
+    "ffn_up",
+    "ffn_up_exps",
+    "ffn_up_shexp",
+    "pw1",
+    "pw1",
+    "ssm_a",
+    "ssm_conv1d",
+    "ssm_dt",
+    "ssm_in",
+    "ssm_out",
+    "ssm_x",
+    "time_mix_gate",
+    "time_mix_key",
+    "time_mix_output",
+    "time_mix_receptance",
+    "time_mix_value",
+    "token_types"
+};
+
+// Quantization types
+struct tensor_quantization {
+    std::string name;
+    ggml_type quant = GGML_TYPE_COUNT;
+};
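
With ALLOWED_TENSOR_TYPE and tensor_quantization now declared once in llama-quant.h, both quantize.cpp and llama-quant.cpp can include the same definitions instead of keeping two copies in sync. Below is a minimal sketch of how a caller might use the two together; the is_allowed_tensor and add_override helpers are hypothetical and not part of this commit.

#include <algorithm>
#include <string>
#include <vector>

#include "llama-quant.h" // ALLOWED_TENSOR_TYPE and tensor_quantization, per the diff above

// Hypothetical helper: check a tensor name against the allowed list.
static bool is_allowed_tensor(const std::string & name) {
    return std::find(ALLOWED_TENSOR_TYPE.begin(), ALLOWED_TENSOR_TYPE.end(), name) != ALLOWED_TENSOR_TYPE.end();
}

// Hypothetical usage: record one override, e.g. for "--tensor-type ffn_down=q6_k".
static bool add_override(std::vector<tensor_quantization> & overrides, const std::string & name, ggml_type type) {
    if (!is_allowed_tensor(name)) {
        return false; // name is not in ALLOWED_TENSOR_TYPE
    }
    tensor_quantization tq;
    tq.name  = name;
    tq.quant = type;
    overrides.push_back(tq);
    return true;
}

Since ALLOWED_TENSOR_TYPE is a static vector defined in the header, each translation unit that includes llama-quant.h carries its own copy of the list.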
