@@ -440,15 +440,14 @@ struct clip_graph {
440440
441441 if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
442442 const int batch_size = 1 ;
443- const int mm_tokens_per_image = 256 ; // default value for gemma3
444- const int tokens_per_side = sqrt (mm_tokens_per_image);
445- const int patches_per_image = sqrt (n_patches);
446- const int kernel_size = patches_per_image / tokens_per_side;
443+ GGML_ASSERT (n_patches_x == n_patches_y);
444+ const int patches_per_image = n_patches_x;
445+ const int kernel_size = hparams.proj_scale_factor ;
447446
448447 cur = ggml_cont (ctx0, ggml_transpose (ctx0, cur));
449448 cur = ggml_reshape_4d (ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
450449
451- // doing a pool2d to reduce the number of output tokens to 256
450+ // 2D average pooling (kernel = stride = proj_scale_factor) to reduce the number of output tokens
452451 cur = ggml_pool_2d (ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0 , 0 );
453452 cur = ggml_reshape_3d (ctx0, cur, cur->ne [0 ] * cur->ne [0 ], n_embd, batch_size);
454453 cur = ggml_cont (ctx0, ggml_transpose (ctx0, cur));
@@ -1795,6 +1794,14 @@ struct clip_model_loader {
17951794 hparams.rope_theta = 10000 .0f ;
17961795 get_u32 (KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size , false );
17971796 } break ;
1797+ case PROJECTOR_TYPE_GEMMA3:
1798+ {
1799+ // default value (used by all model sizes in gemma 3 family)
1800+ // number of patches for each **side** is reduced by a factor of 4
1801+ hparams.proj_scale_factor = 4 ;
1802+ // test model (tinygemma3) has a different value, we optionally read it
1803+ get_u32 (KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor , false );
1804+ } break ;
17981805 case PROJECTOR_TYPE_QWEN25VL:
17991806 {
18001807 get_u32 (KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern );
@@ -1804,6 +1811,14 @@ struct clip_model_loader {
18041811 }
18051812
18061813 LOG_INF (" %s: projector: %s\n " , __func__, proj_type.c_str ());
1814+ LOG_INF (" %s: n_embd: %d\n " , __func__, hparams.n_embd );
1815+ LOG_INF (" %s: n_head: %d\n " , __func__, hparams.n_head );
1816+ LOG_INF (" %s: n_ff: %d\n " , __func__, hparams.n_ff );
1817+ LOG_INF (" %s: n_layer: %d\n " , __func__, hparams.n_layer );
1818+ LOG_INF (" %s: projection_dim: %d\n " , __func__, hparams.projection_dim );
1819+ LOG_INF (" %s: image_size: %d\n " , __func__, hparams.image_size );
1820+ LOG_INF (" %s: patch_size: %d\n " , __func__, hparams.patch_size );
1821+ LOG_INF (" \n " );
18071822 LOG_INF (" %s: has_llava_proj: %d\n " , __func__, ctx_clip.has_llava_projector );
18081823 LOG_INF (" %s: minicpmv_version: %d\n " , __func__, ctx_clip.minicpmv_version );
18091824 LOG_INF (" %s: proj_scale_factor: %d\n " , __func__, hparams.proj_scale_factor );
@@ -2990,11 +3005,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
29903005 int y_patch = img->ny / patch_size + (int )(img->ny % patch_size > 0 );
29913006 n_patches = x_patch * y_patch;
29923007 } else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
2993- n_patches = 256 ;
3008+ int n_per_side = params.image_size / params.patch_size ;
3009+ int n_per_side_2d_pool = n_per_side / params.proj_scale_factor ;
3010+ n_patches = n_per_side_2d_pool * n_per_side_2d_pool;
29943011 } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
2995- n_patches /= ctx-> vision_model . hparams .proj_scale_factor ;
3012+ n_patches /= params .proj_scale_factor ;
29963013 } else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
2997- int n_merge = ctx-> vision_model . hparams .spatial_merge_size ;
3014+ int n_merge = params .spatial_merge_size ;
29983015 int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1 );
29993016 int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? n_merge : 1 );
30003017 n_patches = n_patches_y*n_patches_x + n_patches_y - 1 ; // + one [IMG_BREAK] per row, except the last row
0 commit comments