
Commit 8ec8060

Refactor projector_type enum to enum class for type safety and clarity
- Added explicit handling for `ProjectorType::UNKNOWN` for robustness; this should resolve issue #7073.
1 parent ae9818e commit 8ec8060
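
As background for reviewers, here is a minimal, self-contained sketch (illustrative only, not code from this commit) of what `enum class` buys over the old plain `enum`: the enumerators are scoped, values no longer convert to `int` implicitly, and a `switch` with no `default` case lets the compiler (e.g. via `-Wswitch`) flag any enumerator that is added later but not handled.

// Standalone illustration of enum class type safety; compile with -Wswitch.
#include <cstdio>

enum class ProjectorType { MLP, MLP_NORM, LDP, LDPV2, RESAMPLER, UNKNOWN };

static const char * projector_name(ProjectorType t) {
    switch (t) {                       // no default: a new enumerator that is
    case ProjectorType::MLP:           // not handled here becomes a warning
        return "mlp";
    case ProjectorType::MLP_NORM:
        return "mlp_norm";
    case ProjectorType::LDP:
        return "ldp";
    case ProjectorType::LDPV2:
        return "ldpv2";
    case ProjectorType::RESAMPLER:
        return "resampler";
    case ProjectorType::UNKNOWN:
        return "unknown";
    }
    return "unhandled";                // out-of-range values land here
}

int main() {
    ProjectorType t = ProjectorType::LDP;
    // int i = t;      // error: no implicit conversion to int, unlike a plain enum
    // if (t == 2) {}  // error: cannot compare a scoped enum against a bare int
    std::printf("%s\n", projector_name(t));
    return 0;
}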

File tree: 1 file changed (+62, −45 lines)


examples/llava/clip.cpp

Lines changed: 62 additions & 45 deletions
@@ -153,20 +153,20 @@ static std::string format(const char * fmt, ...) {
 #define TN_MINICPMV_LN "resampler.ln_%s.%s"
 
 
-enum projector_type {
-    PROJECTOR_TYPE_MLP,
-    PROJECTOR_TYPE_MLP_NORM,
-    PROJECTOR_TYPE_LDP,
-    PROJECTOR_TYPE_LDPV2,
-    PROJECTOR_TYPE_RESAMPLER,
-    PROJECTOR_TYPE_UNKNOWN,
+enum class ProjectorType {
+    MLP,
+    MLP_NORM,
+    LDP,
+    LDPV2,
+    RESAMPLER,
+    UNKNOWN,
 };
 
-static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
-    { PROJECTOR_TYPE_MLP, "mlp" },
-    { PROJECTOR_TYPE_LDP, "ldp" },
-    { PROJECTOR_TYPE_LDPV2, "ldpv2"},
-    { PROJECTOR_TYPE_RESAMPLER, "resampler"},
+static std::map<ProjectorType, std::string> ProjectorTypeNames = {
+    { ProjectorType::MLP, "mlp" },
+    { ProjectorType::LDP, "ldp" },
+    { ProjectorType::LDPV2, "ldpv2"},
+    { ProjectorType::RESAMPLER, "resampler"},
 };
 
 
@@ -287,13 +287,13 @@ static void print_tensor_info(const ggml_tensor * tensor, const char * prefix =
             tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
 }
 
-static projector_type clip_projector_type_from_string(const std::string & name) {
-    for (const auto & kv : PROJECTOR_TYPE_NAMES) { // NOLINT
+static ProjectorType clip_projector_type_from_string(const std::string & name) {
+    for (const auto & kv : ProjectorTypeNames) { // NOLINT
         if (kv.second == name) {
             return kv.first;
         }
     }
-    return PROJECTOR_TYPE_UNKNOWN;
+    return ProjectorType::UNKNOWN;
 }
 
 #ifdef CLIP_DEBUG_FUNCTIONS
@@ -552,7 +552,7 @@ struct clip_ctx {
     int minicpmv_version = 2;
 
     struct clip_vision_model vision_model;
-    projector_type proj_type = PROJECTOR_TYPE_MLP;
+    ProjectorType proj_type = ProjectorType::MLP;
 
     float image_mean[3];
     float image_std[3];
@@ -790,15 +790,15 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     // print_tensor_info(embeddings, "embeddings");
 
     // llava projector
-    if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
+    if (ctx->proj_type == ProjectorType::MLP) {
         embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
         embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
 
         embeddings = ggml_gelu(ctx0, embeddings);
         embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
         embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
     }
-    else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+    else if (ctx->proj_type == ProjectorType::MLP_NORM) {
         embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
         embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
         // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
@@ -819,7 +819,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
                               model.mm_4_b);
     }
-    else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
+    else if (ctx->proj_type == ProjectorType::LDP) {
         // MobileVLM projector
         int n_patch = 24;
         struct ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
@@ -929,7 +929,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         }
         embeddings = block_1;
     }
-    else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2)
+    else if (ctx->proj_type == ProjectorType::LDPV2)
     {
         int n_patch = 24;
         struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
@@ -960,7 +960,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     // minicpmv projector
     else if (ctx->has_minicpmv_projector)
     {
-        if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+        if (ctx->proj_type == ProjectorType::RESAMPLER) {
             struct ggml_tensor * q = model.mm_model_query;
             { // layernorm
                 q = ggml_norm(ctx0, q, eps);
@@ -1139,12 +1139,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             const std::string proj_type = gguf_get_val_str(ctx, idx);
             new_clip->proj_type = clip_projector_type_from_string(proj_type);
         } else {
-            new_clip->proj_type = PROJECTOR_TYPE_MLP;
+            new_clip->proj_type = ProjectorType::MLP;
         }
 
-        if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
+        if (new_clip->proj_type == ProjectorType::MLP) {
            if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) {
-                new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM;
+                new_clip->proj_type = ProjectorType::MLP_NORM;
            }
        }
    }
@@ -1387,7 +1387,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
 
         // LLaVA projection
-        if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+        switch (new_clip->proj_type)
+        {
+        case ProjectorType::MLP:
+        case ProjectorType::MLP_NORM:
+        {
             vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
             vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
             try {
@@ -1414,7 +1418,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
                 vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
                 // LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__);
             } catch (std::runtime_error & /*e*/) { }
-        } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
+            break;
+        }
+        case ProjectorType::LDP:
+        {
             // MobileVLM projection
             vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));
             vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias"));
@@ -1440,8 +1447,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
             vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
             vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
+            break;
         }
-        else if (new_clip->proj_type == PROJECTOR_TYPE_LDPV2)
+        case ProjectorType::LDPV2:
         {
             // MobilVLM_V2 projection
             vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "weight"));
@@ -1450,8 +1458,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             vision_model.mm_model_mlp_2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "bias"));
             vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
             vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
+            break;
         }
-        else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+        case ProjectorType::RESAMPLER:
+        {
             // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
             vision_model.mm_model_pos_embed_k = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD_K);
             vision_model.mm_model_query = get_tensor(new_clip->ctx_data, TN_MINICPMV_QUERY);
@@ -1471,10 +1481,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             vision_model.mm_model_ln_kv_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "bias"));
             vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
             vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
+            break;
+        }
+        case ProjectorType::UNKNOWN:
+        {
+            LOG_ERR("%s: unknown ProjectorType\n", __func__);
+            clip_free(new_clip);
+            gguf_free(ctx);
+            std::terminate();
         }
-        else {
-            std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
-            throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
         }
 
         vision_model.layers.resize(hparams.n_layer);
@@ -2189,9 +2204,9 @@ int clip_n_patches(const struct clip_ctx * ctx) {
 
     int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
 
-    if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
+    if (ctx->proj_type == ProjectorType::LDP || ctx->proj_type == ProjectorType::LDPV2) {
         n_patches /= 4;
-    } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+    } else if (ctx->proj_type == ProjectorType::RESAMPLER) {
         if (ctx->minicpmv_version == 2) {
             n_patches = 96;
         }
@@ -2597,29 +2612,31 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
 }
 
 int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
-    if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
+    switch (ctx->proj_type)
+    {
+    case ProjectorType::LDP:
         return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
-    }
-    if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
+    case ProjectorType::LDPV2:
         return ctx->vision_model.mm_model_peg_0_b->ne[0];
-    }
-    if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
+    case ProjectorType::MLP:
         return ctx->vision_model.mm_2_b->ne[0];
-    }
-    if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+    case ProjectorType::MLP_NORM:
         return ctx->vision_model.mm_3_b->ne[0];
-    }
-    if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+    case ProjectorType::RESAMPLER:
         if (ctx->minicpmv_version == 2) {
             return 4096;
         }
         else if (ctx->minicpmv_version == 3) {
             return 3584;
         }
+        [[fallthrough]];
+    case ProjectorType::UNKNOWN:
+        LOG_ERR("%s: unknown ProjectorType\n", __func__);
+        std::terminate();
     }
-
-    std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
-    throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
+    // Handle unexpected ProjectorType values explicitly; the switch over the
+    // enum class deliberately has no default case, so the compiler can warn
+    // about unhandled enumerators.
+    LOG_ERR("%s: Unhandled ProjectorType\n", __func__);
+    std::terminate();
 }
 
 int clip_is_minicpmv(const struct clip_ctx * ctx) {
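
For reference, here is a hedged, standalone sketch (again illustrative, not code from this commit) of the pattern `clip_n_mmproj_embd` now uses: a case that can run off its end marks the drop into the error case with `[[fallthrough]]`, which silences `-Wimplicit-fallthrough`, while the deliberately absent `default` keeps `-Wswitch` warnings alive for any enumerator added later.

// Standalone illustration of the fallthrough-into-error-case pattern (C++17).
#include <cstdio>
#include <cstdlib>

enum class Mode { A, B, UNKNOWN };

static int embed_dim(Mode m, int version) {
    switch (m) {                  // no default: new enumerators trigger -Wswitch
    case Mode::B:
        return 1024;
    case Mode::A:
        if (version == 2) { return 4096; }
        if (version == 3) { return 3584; }
        [[fallthrough]];          // unrecognized version: treat like UNKNOWN
    case Mode::UNKNOWN:
        std::fprintf(stderr, "unknown mode or version\n");
        std::abort();
    }
    std::abort();                 // value outside the enum's range
}

int main() {
    std::printf("%d\n", embed_dim(Mode::A, 3)); // prints 3584
    return 0;
}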

Comments (0)