|
#include "ggml.h"
#include "gguf.h"

#include <climits>
#include <cstdarg>
#include <cstdint>
#include <cstdio>
#include <map>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
| 7 | + |
| 8 | +// Internal header for clip.cpp |
| 9 | + |
// Metadata key names (string literals), presumably looked up in the GGUF
// key/value section of a model file — confirm against clip.cpp.
//
// Keys that contain "%s" are printf-style templates and must be formatted
// before use; the substituted value is presumably "text" or "vision", matching
// the fixed "clip.text.*" / "clip.vision.*" keys below — TODO confirm.

// general model info
#define KEY_FTYPE               "general.file_type"
#define KEY_NAME                "general.name"
#define KEY_DESCRIPTION         "general.description"

// "has_*" / "use_*" keys
#define KEY_HAS_TEXT_ENC        "clip.has_text_encoder"
#define KEY_HAS_VIS_ENC         "clip.has_vision_encoder"
#define KEY_HAS_LLAVA_PROJ      "clip.has_llava_projector"
#define KEY_HAS_MINICPMV_PROJ   "clip.has_minicpmv_projector"
#define KEY_HAS_GLM_PROJ        "clip.has_glm_projector"
#define KEY_MINICPMV_VERSION    "clip.minicpmv_version"
#define KEY_HAS_QWEN2VL_MERGER  "clip.has_qwen2vl_merger"
#define KEY_USE_GELU            "clip.use_gelu"
#define KEY_USE_SILU            "clip.use_silu"

// templated keys: "%s" must be filled in by the caller
#define KEY_N_EMBD              "clip.%s.embedding_length"
#define KEY_N_FF                "clip.%s.feed_forward_length"
#define KEY_N_BLOCK             "clip.%s.block_count"
#define KEY_N_HEAD              "clip.%s.attention.head_count"
#define KEY_LAYER_NORM_EPS      "clip.%s.attention.layer_norm_epsilon"
#define KEY_PROJ_DIM            "clip.%s.projection_dim"

// fixed tokenizer / text / vision keys
#define KEY_TOKENS              "tokenizer.ggml.tokens"
#define KEY_N_POSITIONS         "clip.text.context_length"
#define KEY_IMAGE_SIZE          "clip.vision.image_size"
#define KEY_PATCH_SIZE          "clip.vision.patch_size"
#define KEY_IMAGE_MEAN          "clip.vision.image_mean"
#define KEY_IMAGE_STD           "clip.vision.image_std"
#define KEY_PROJ_TYPE           "clip.projector_type"
#define KEY_FEATURE_LAYER       "clip.vision.feature_layer"

// additional vision keys
#define KEY_MM_PATCH_MERGE_TYPE   "clip.vision.mm_patch_merge_type"
#define KEY_IMAGE_GRID_PINPOINTS  "clip.vision.image_grid_pinpoints"
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
| 40 | + |
| 41 | + |
| 42 | +// |
| 43 | +// tensor name constants |
| 44 | +// |
| 45 | + |
// Tensor name strings. Names containing "%s" / "%d" are printf-style templates
// that must be formatted before use (e.g. with string_format()).
// NOTE(review): the leading "%s" is presumably a model prefix such as "v", the
// "%d" a block/layer index and the trailing "%s" "weight" or "bias" — confirm
// against the call sites in clip.cpp.
#define TN_TOKEN_EMBD      "%s.token_embd.weight"
#define TN_POS_EMBD        "%s.position_embd.weight"
#define TN_CLASS_EMBD      "v.class_embd"
#define TN_PATCH_EMBD      "v.patch_embd.weight"   // not renamed with a ".0" postfix, for backward compatibility
#define TN_PATCH_EMBD_1    "v.patch_embd.weight.1"
#define TN_PATCH_BIAS      "v.patch_embd.bias"
#define TN_ATTN_K          "%s.blk.%d.attn_k.%s"
#define TN_ATTN_Q          "%s.blk.%d.attn_q.%s"
#define TN_ATTN_V          "%s.blk.%d.attn_v.%s"
#define TN_ATTN_OUTPUT     "%s.blk.%d.attn_out.%s"
#define TN_FFN_DOWN        "%s.blk.%d.ffn_down.%s"
#define TN_FFN_UP          "%s.blk.%d.ffn_up.%s"
#define TN_LN_1            "%s.blk.%d.ln1.%s"
#define TN_LN_2            "%s.blk.%d.ln2.%s"
#define TN_LN_PRE          "%s.pre_ln.%s"
#define TN_LN_POST         "%s.post_ln.%s"
#define TN_TEXT_PROJ       "text_projection.weight"
#define TN_VIS_PROJ        "visual_projection.weight"
#define TN_LLAVA_PROJ      "mm.%d.%s"
#define TN_MVLM_PROJ_MLP   "mm.model.mlp.%d.%s"
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
#define TN_MVLM_PROJ_PEG   "mm.model.peg.%d.%s"
#define TN_IMAGE_NEWLINE   "model.image_newline"
#define TN_MM_INP_PROJ     "mm.input_projection.weight" // gemma3
#define TN_MM_SOFT_EMB_N   "mm.soft_emb_norm.weight"    // gemma3

// minicpmv
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
#define TN_MINICPMV_QUERY      "resampler.query"
#define TN_MINICPMV_PROJ       "resampler.proj.weight"
#define TN_MINICPMV_KV_PROJ    "resampler.kv.weight"
#define TN_MINICPMV_ATTN       "resampler.attn.%s.%s"
#define TN_MINICPMV_LN         "resampler.ln_%s.%s"

// glm adapter
#define TN_GLM_ADAPTER_CONV     "adapter.conv.%s"
// Misspelled legacy name ("ADAPER"); kept as an alias so existing users keep compiling.
// Prefer TN_GLM_ADAPTER_CONV in new code.
#define TN_GLM_ADAPER_CONV      TN_GLM_ADAPTER_CONV
#define TN_GLM_ADAPTER_LINEAR   "adapter.linear.linear.%s"
#define TN_GLM_ADAPTER_NORM_1   "adapter.linear.norm1.%s"
#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
#define TN_GLM_ADAPTER_GATE     "adapter.linear.gate.%s"
#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
#define TN_GLM_BOI_W            "adapter.boi"
#define TN_GLM_EOI_W            "adapter.eoi"
| 88 | + |
// Projector kinds known to this header.
// The string name of each kind (where one exists) lives in PROJECTOR_TYPE_NAMES.
enum projector_type {
    PROJECTOR_TYPE_MLP,
    PROJECTOR_TYPE_MLP_NORM, // NOTE(review): has no entry in PROJECTOR_TYPE_NAMES, so name lookup never yields it — presumably assigned by other code; confirm
    PROJECTOR_TYPE_LDP,
    PROJECTOR_TYPE_LDPV2,
    PROJECTOR_TYPE_RESAMPLER,
    PROJECTOR_TYPE_GLM_EDGE,
    PROJECTOR_TYPE_MERGER,
    PROJECTOR_TYPE_GEMMA3,
    PROJECTOR_TYPE_UNKNOWN, // returned by clip_projector_type_from_string() when no name matches
};
| 100 | + |
// Projector type -> name string. clip_projector_type_from_string() searches
// this table by value to map a name back to its enum.
// PROJECTOR_TYPE_MLP_NORM and PROJECTOR_TYPE_UNKNOWN have no entry here.
// Being `static` at namespace scope in a header, each translation unit that
// includes this file gets its own copy of the map.
static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_MLP, "mlp" },
    { PROJECTOR_TYPE_LDP, "ldp" },
    { PROJECTOR_TYPE_LDPV2, "ldpv2"},
    { PROJECTOR_TYPE_RESAMPLER, "resampler"},
    { PROJECTOR_TYPE_GLM_EDGE, "adapter"},
    { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
    { PROJECTOR_TYPE_GEMMA3, "gemma3"},
};
| 110 | + |
| 111 | +static projector_type clip_projector_type_from_string(const std::string & str) { |
| 112 | + for (const auto & pair : PROJECTOR_TYPE_NAMES) { |
| 113 | + if (pair.second == str) { |
| 114 | + return pair.first; |
| 115 | + } |
| 116 | + } |
| 117 | + return PROJECTOR_TYPE_UNKNOWN; |
| 118 | +} |
| 119 | + |
| 120 | +// |
| 121 | +// common utils |
| 122 | +// |
| 123 | + |
| 124 | +static std::string string_format(const char * fmt, ...) { |
| 125 | + va_list ap; |
| 126 | + va_list ap2; |
| 127 | + va_start(ap, fmt); |
| 128 | + va_copy(ap2, ap); |
| 129 | + int size = vsnprintf(NULL, 0, fmt, ap); |
| 130 | + GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT |
| 131 | + std::vector<char> buf(size + 1); |
| 132 | + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); |
| 133 | + GGML_ASSERT(size2 == size); |
| 134 | + va_end(ap2); |
| 135 | + va_end(ap); |
| 136 | + return std::string(buf.data(), buf.size()); |
| 137 | +} |
| 138 | + |
// Replace every non-overlapping occurrence of `search` in `s` with `replace`,
// scanning left to right. `s` is modified in place. An empty `search` is a
// no-op. Replaced text is never re-scanned.
static void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return;
    }

    std::string result;
    result.reserve(s.length());

    size_t cursor = 0; // start of the not-yet-copied tail of s
    for (size_t hit = s.find(search, cursor); hit != std::string::npos; hit = s.find(search, cursor)) {
        // copy the untouched text before the match, then the replacement
        result.append(s, cursor, hit - cursor);
        result.append(replace);
        cursor = hit + search.length();
    }

    // whatever follows the last match
    result.append(s, cursor, std::string::npos);
    s = std::move(result);
}
| 155 | + |
| 156 | +// |
| 157 | +// gguf utils |
| 158 | +// |
| 159 | + |
| 160 | +static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) { |
| 161 | + switch (type) { |
| 162 | + case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]); |
| 163 | + case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]); |
| 164 | + case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]); |
| 165 | + case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]); |
| 166 | + case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]); |
| 167 | + case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]); |
| 168 | + case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]); |
| 169 | + case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]); |
| 170 | + case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]); |
| 171 | + case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]); |
| 172 | + case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false"; |
| 173 | + default: return string_format("unknown type %d", type); |
| 174 | + } |
| 175 | +} |
| 176 | + |
| 177 | +static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { |
| 178 | + const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i); |
| 179 | + |
| 180 | + switch (type) { |
| 181 | + case GGUF_TYPE_STRING: |
| 182 | + return gguf_get_val_str(ctx_gguf, i); |
| 183 | + case GGUF_TYPE_ARRAY: |
| 184 | + { |
| 185 | + const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i); |
| 186 | + int arr_n = gguf_get_arr_n(ctx_gguf, i); |
| 187 | + const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i); |
| 188 | + std::stringstream ss; |
| 189 | + ss << "["; |
| 190 | + for (int j = 0; j < arr_n; j++) { |
| 191 | + if (arr_type == GGUF_TYPE_STRING) { |
| 192 | + std::string val = gguf_get_arr_str(ctx_gguf, i, j); |
| 193 | + // escape quotes |
| 194 | + string_replace_all(val, "\\", "\\\\"); |
| 195 | + string_replace_all(val, "\"", "\\\""); |
| 196 | + ss << '"' << val << '"'; |
| 197 | + } else if (arr_type == GGUF_TYPE_ARRAY) { |
| 198 | + ss << "???"; |
| 199 | + } else { |
| 200 | + ss << gguf_data_to_str(arr_type, data, j); |
| 201 | + } |
| 202 | + if (j < arr_n - 1) { |
| 203 | + ss << ", "; |
| 204 | + } |
| 205 | + } |
| 206 | + ss << "]"; |
| 207 | + return ss.str(); |
| 208 | + } |
| 209 | + default: |
| 210 | + return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0); |
| 211 | + } |
| 212 | +} |
0 commit comments