-
Notifications
You must be signed in to change notification settings - Fork 13.4k
clip : refactor clip_init, add tests #12757
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 9 commits
Commits
Show all changes
16 commits
Select commit
Hold shift + click to select a range
44adfae
refactor clip_init
ngxson 79c5656
fix loading file
ngxson dd50832
fix style
ngxson 7b9e7b8
test ok
ngxson ee1fadd
better test with report
ngxson b41acc4
add missing headers
ngxson 6fe6846
clarify
ngxson eeea35a
add KEY_MM_PATCH_MERGE_TYPE
ngxson 17be2f9
remove bool has_* pattern
ngxson 853705e
Apply suggestions from code review
ngxson 376f80a
Update examples/llava/clip.cpp
ngxson 84b35d2
use ggml_soft_max_ext
ngxson 88aec68
refactor logging system
ngxson c4bb063
add minicpm-v-o 2.6 for testing
ngxson 9d4baa6
use nullptr everywhere
ngxson 13b2d8c
fix Yi-VL model
ngxson File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,215 @@ | ||
#include "ggml.h" | ||
#include "gguf.h" | ||
|
||
#include <climits> | ||
#include <stdarg.h> | ||
#include <string> | ||
#include <map> | ||
#include <sstream> | ||
#include <vector> | ||
|
||
// Internal header for clip.cpp

//
// gguf metadata key constants
//
// NOTE(review): keys containing "%s" are printf-style templates; the
// placeholder is presumably filled with the modality prefix ("text" or
// "vision") at the call site via string_format — confirm in clip.cpp.

// general file metadata
#define KEY_FTYPE "general.file_type"
#define KEY_NAME "general.name"
#define KEY_DESCRIPTION "general.description"
// model capability / architecture flags
#define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
#define KEY_HAS_GLM_PROJ "clip.has_glm_projector"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger"
// activation function selection
#define KEY_USE_GELU "clip.use_gelu"
#define KEY_USE_SILU "clip.use_silu"
// per-modality hyperparameters (templated with "%s")
#define KEY_N_EMBD "clip.%s.embedding_length"
#define KEY_N_FF "clip.%s.feed_forward_length"
#define KEY_N_BLOCK "clip.%s.block_count"
#define KEY_N_HEAD "clip.%s.attention.head_count"
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
#define KEY_PROJ_DIM "clip.%s.projection_dim"
// text-side keys
#define KEY_TOKENS "tokenizer.ggml.tokens"
#define KEY_N_POSITIONS "clip.text.context_length"
// vision-side keys
#define KEY_IMAGE_SIZE "clip.vision.image_size"
#define KEY_PATCH_SIZE "clip.vision.patch_size"
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
#define KEY_IMAGE_STD "clip.vision.image_std"
#define KEY_PROJ_TYPE "clip.projector_type"
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"

// llava-style multi-crop / patch-merge configuration
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"

//
// tensor name constants
//
// NOTE(review): names containing "%s"/"%d" are printf-style templates
// (modality prefix, block index, "weight"/"bias" suffix) filled in at
// the call site — confirm in clip.cpp.

#define TN_TOKEN_EMBD "%s.token_embd.weight"
#define TN_POS_EMBD "%s.position_embd.weight"
#define TN_CLASS_EMBD "v.class_embd"
#define TN_PATCH_EMBD "v.patch_embd.weight" // tensor deliberately not renamed with a ".0" postfix, for backward compat
#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
#define TN_PATCH_BIAS "v.patch_embd.bias"
// per-block transformer tensors
#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
#define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
#define TN_LN_1 "%s.blk.%d.ln1.%s"
#define TN_LN_2 "%s.blk.%d.ln2.%s"
#define TN_LN_PRE "%s.pre_ln.%s"
#define TN_LN_POST "%s.post_ln.%s"
// projection heads
#define TN_TEXT_PROJ "text_projection.weight"
#define TN_VIS_PROJ "visual_projection.weight"
#define TN_LLAVA_PROJ "mm.%d.%s"
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
#define TN_IMAGE_NEWLINE "model.image_newline"
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3

// minicpmv resampler projector
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
#define TN_MINICPMV_QUERY "resampler.query"
#define TN_MINICPMV_PROJ "resampler.proj.weight"
#define TN_MINICPMV_KV_PROJ "resampler.kv.weight"
#define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
#define TN_MINICPMV_LN "resampler.ln_%s.%s"

// glm-edge adapter projector
// NOTE(review): "ADAPER" (missing 'T') looks like a typo, but renaming the
// macro would break callers — leave as-is.
#define TN_GLM_ADAPER_CONV "adapter.conv.%s"
#define TN_GLM_ADAPTER_LINEAR "adapter.linear.linear.%s"
#define TN_GLM_ADAPTER_NORM_1 "adapter.linear.norm1.%s"
#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
#define TN_GLM_BOI_W "adapter.boi"
#define TN_GLM_EOI_W "adapter.eoi"
|
||
// Supported multimodal projector architectures — the module that maps
// vision-encoder embeddings into the LLM embedding space.
// String names (as stored under KEY_PROJ_TYPE) live in PROJECTOR_TYPE_NAMES.
enum projector_type {
    PROJECTOR_TYPE_MLP,
    PROJECTOR_TYPE_MLP_NORM, // MLP variant; has no entry in PROJECTOR_TYPE_NAMES
    PROJECTOR_TYPE_LDP,
    PROJECTOR_TYPE_LDPV2,
    PROJECTOR_TYPE_RESAMPLER, // gguf name "resampler" (minicpmv tensors use the "resampler." prefix)
    PROJECTOR_TYPE_GLM_EDGE, // gguf name "adapter"
    PROJECTOR_TYPE_MERGER, // gguf name "qwen2vl_merger"
    PROJECTOR_TYPE_GEMMA3,
    PROJECTOR_TYPE_UNKNOWN, // sentinel for unrecognized names
};
|
||
// Mapping from projector_type to the name stored in gguf metadata
// (KEY_PROJ_TYPE). PROJECTOR_TYPE_MLP_NORM has no entry here —
// presumably it is detected by other means; verify against clip.cpp.
// PROJECTOR_TYPE_UNKNOWN is the not-found sentinel and is never serialized.
static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_MLP, "mlp" },
    { PROJECTOR_TYPE_LDP, "ldp" },
    { PROJECTOR_TYPE_LDPV2, "ldpv2"},
    { PROJECTOR_TYPE_RESAMPLER, "resampler"},
    { PROJECTOR_TYPE_GLM_EDGE, "adapter"},
    { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
    { PROJECTOR_TYPE_GEMMA3, "gemma3"},
};
|
||
// Reverse lookup into PROJECTOR_TYPE_NAMES: map a gguf projector name
// back to its enum value.
// Returns PROJECTOR_TYPE_UNKNOWN when the name is not registered.
static projector_type clip_projector_type_from_string(const std::string & str) {
    for (const auto & [type, name] : PROJECTOR_TYPE_NAMES) {
        if (name == str) {
            return type;
        }
    }
    return PROJECTOR_TYPE_UNKNOWN;
}
|
||
// | ||
// common utils | ||
// | ||
|
||
// printf-style formatting into a std::string.
// Two vsnprintf passes: the first (with a NULL buffer) measures the
// required length, the second writes into an exactly-sized buffer.
static std::string string_format(const char * fmt, ...) {
    va_list ap;
    va_list ap2;
    va_start(ap, fmt);
    va_copy(ap2, ap); // ap is consumed by the measuring pass below
    int size = vsnprintf(NULL, 0, fmt, ap);
    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
    std::vector<char> buf(size + 1); // +1 for the terminating NUL
    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
    GGML_ASSERT(size2 == size);
    va_end(ap2);
    va_end(ap);
    // use `size`, not buf.size(): buf.size() == size + 1, which would embed
    // the trailing NUL terminator as a real character of the returned string
    return std::string(buf.data(), size);
}
|
||
// Replace every non-overlapping occurrence of `search` in `s` with
// `replace`, scanning left to right. Text inserted by a replacement is
// never rescanned, so e.g. replacing "a" with "aa" terminates.
// An empty `search` is a no-op.
static void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return; // nothing to look for
    }
    size_t pos = 0;
    while ((pos = s.find(search, pos)) != std::string::npos) {
        s.replace(pos, search.length(), replace);
        pos += replace.length(); // skip past the inserted text
    }
}
|
||
// | ||
// gguf utils | ||
// | ||
|
||
static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) { | ||
switch (type) { | ||
case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]); | ||
case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]); | ||
case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]); | ||
case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]); | ||
case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]); | ||
case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]); | ||
case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]); | ||
case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]); | ||
case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]); | ||
case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]); | ||
case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false"; | ||
default: return string_format("unknown type %d", type); | ||
} | ||
} | ||
|
||
static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { | ||
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i); | ||
|
||
switch (type) { | ||
case GGUF_TYPE_STRING: | ||
return gguf_get_val_str(ctx_gguf, i); | ||
case GGUF_TYPE_ARRAY: | ||
{ | ||
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i); | ||
int arr_n = gguf_get_arr_n(ctx_gguf, i); | ||
const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i); | ||
std::stringstream ss; | ||
ss << "["; | ||
for (int j = 0; j < arr_n; j++) { | ||
if (arr_type == GGUF_TYPE_STRING) { | ||
std::string val = gguf_get_arr_str(ctx_gguf, i, j); | ||
// escape quotes | ||
string_replace_all(val, "\\", "\\\\"); | ||
string_replace_all(val, "\"", "\\\""); | ||
ss << '"' << val << '"'; | ||
} else if (arr_type == GGUF_TYPE_ARRAY) { | ||
ss << "???"; | ||
} else { | ||
ss << gguf_data_to_str(arr_type, data, j); | ||
} | ||
if (j < arr_n - 1) { | ||
ss << ", "; | ||
} | ||
} | ||
ss << "]"; | ||
return ss.str(); | ||
} | ||
default: | ||
return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0); | ||
} | ||
} |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.