@@ -3648,8 +3648,9 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
3648
3648
int clip_n_output_tokens (const struct clip_ctx * ctx, struct clip_image_f32 * img) {
3649
3649
const auto & params = ctx->model .hparams ;
3650
3650
3651
- // only for models using fixed size square images
3652
- int n_patches_sq = (params.image_size / params.patch_size ) * (params.image_size / params.patch_size );
3651
+ // for models with a fixed image size, the input image has already been pre-processed and resized to a square
3652
+ int patch_size = params.patch_size ;
3653
+ int n_patches = (img->nx / patch_size) * (img->ny / patch_size);
3653
3654
3654
3655
projector_type proj = ctx->proj_type ();
3655
3656
@@ -3663,27 +3664,27 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
3663
3664
case PROJECTOR_TYPE_LDPV2:
3664
3665
case PROJECTOR_TYPE_GLM_EDGE:
3665
3666
{
3666
- n_patches_sq /= 4 ;
3667
+ n_patches /= 4 ;
3667
3668
if (ctx->model .mm_glm_tok_boi ) {
3668
- n_patches_sq += 2 ; // for BOI and EOI token embeddings
3669
+ n_patches += 2 ; // for BOI and EOI token embeddings
3669
3670
}
3670
3671
} break ;
3671
3672
case PROJECTOR_TYPE_MINICPMV:
3672
3673
{
3673
3674
// Use actual config value if available, otherwise fall back to hardcoded values
3674
3675
if (params.minicpmv_query_num > 0 ) {
3675
- n_patches_sq = params.minicpmv_query_num ;
3676
+ n_patches = params.minicpmv_query_num ;
3676
3677
} else {
3677
3678
// Fallback to hardcoded values for legacy models
3678
3679
if (params.minicpmv_version == 2 ) {
3679
- n_patches_sq = 96 ;
3680
+ n_patches = 96 ;
3680
3681
} else if (params.minicpmv_version == 3 ) {
3681
- n_patches_sq = 64 ;
3682
+ n_patches = 64 ;
3682
3683
} else if (params.minicpmv_version == 4 ) {
3683
- n_patches_sq = 64 ;
3684
+ n_patches = 64 ;
3684
3685
} else if (params.minicpmv_version == 5 ) {
3685
3686
// MiniCPM-V 4.0
3686
- n_patches_sq = 64 ;
3687
+ n_patches = 64 ;
3687
3688
} else {
3688
3689
GGML_ABORT (" Unknown minicpmv version" );
3689
3690
}
@@ -3692,67 +3693,56 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
3692
3693
case PROJECTOR_TYPE_QWEN2VL:
3693
3694
case PROJECTOR_TYPE_QWEN25VL:
3694
3695
{
3695
- // dynamic size
3696
+ // dynamic size (2 conv, so double patch size)
3696
3697
int patch_size = params.patch_size * 2 ;
3697
3698
int x_patch = img->nx / patch_size + (int )(img->nx % patch_size > 0 );
3698
3699
int y_patch = img->ny / patch_size + (int )(img->ny % patch_size > 0 );
3699
- n_patches_sq = x_patch * y_patch;
3700
+ n_patches = x_patch * y_patch;
3700
3701
} break ;
3701
3702
case PROJECTOR_TYPE_GEMMA3:
3702
- {
3703
- int n_per_side = params.image_size / params.patch_size ;
3704
- int n_per_side_2d_pool = n_per_side / params.proj_scale_factor ;
3705
- n_patches_sq = n_per_side_2d_pool * n_per_side_2d_pool;
3706
- } break ;
3707
3703
case PROJECTOR_TYPE_IDEFICS3:
3708
3704
case PROJECTOR_TYPE_INTERNVL:
3705
+ case PROJECTOR_TYPE_LLAMA4:
3706
+ case PROJECTOR_TYPE_LFM2:
3709
3707
{
3710
3708
// both W and H are divided by proj_scale_factor
3711
- n_patches_sq /= (params.proj_scale_factor * params.proj_scale_factor );
3709
+ int scale_factor = ctx->model .hparams .proj_scale_factor ;
3710
+ n_patches /= (scale_factor * scale_factor);
3712
3711
} break ;
3713
3712
case PROJECTOR_TYPE_PIXTRAL:
3714
3713
{
3715
3714
// dynamic size
3716
3715
int n_merge = params.spatial_merge_size ;
3717
- int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1 );
3718
- int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? n_merge : 1 );
3719
- n_patches_sq = n_patches_y * n_patches_x + n_patches_y - 1 ; // + one [IMG_BREAK] per row, except the last row
3720
- } break ;
3721
- case PROJECTOR_TYPE_LLAMA4:
3722
- {
3723
- int scale_factor = ctx->model .hparams .proj_scale_factor ;
3724
- n_patches_sq /= (scale_factor * scale_factor);
3716
+ int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1 );
3717
+ int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1 );
3718
+ n_patches = n_patches_y * n_patches_x + n_patches_y - 1 ; // + one [IMG_BREAK] per row, except the last row
3725
3719
} break ;
3726
3720
case PROJECTOR_TYPE_VOXTRAL:
3727
3721
case PROJECTOR_TYPE_ULTRAVOX:
3728
3722
case PROJECTOR_TYPE_QWEN2A:
3729
3723
{
3730
- n_patches_sq = img->nx ;
3724
+ n_patches = img->nx ;
3731
3725
3732
3726
const int proj_stack_factor = ctx->model .hparams .proj_stack_factor ;
3733
3727
if (ctx->model .audio_has_stack_frames ()) {
3734
3728
GGML_ASSERT (proj_stack_factor > 0 );
3735
- const int n_len = CLIP_ALIGN (n_patches_sq , proj_stack_factor);
3736
- n_patches_sq = n_len / proj_stack_factor;
3729
+ const int n_len = CLIP_ALIGN (n_patches , proj_stack_factor);
3730
+ n_patches = n_len / proj_stack_factor;
3737
3731
}
3738
3732
3739
3733
// whisper downscales input token by half after conv1d
3740
- n_patches_sq /= 2 ;
3734
+ n_patches /= 2 ;
3741
3735
3742
3736
if (ctx->model .audio_has_avgpool ()) {
3743
3737
// divide by 2 because of nn.AvgPool1d(2, stride=2)
3744
- n_patches_sq /= 2 ;
3738
+ n_patches /= 2 ;
3745
3739
}
3746
3740
} break ;
3747
- case PROJECTOR_TYPE_LFM2:
3748
- {
3749
- n_patches_sq = (img->nx / (params.patch_size * params.proj_scale_factor )) * (img->ny / (params.patch_size * params.proj_scale_factor ));
3750
- } break ;
3751
3741
default :
3752
3742
GGML_ABORT (" unsupported projector type" );
3753
3743
}
3754
3744
3755
- return n_patches_sq ;
3745
+ return n_patches ;
3756
3746
}
3757
3747
3758
3748
static std::vector<std::vector<std::vector<float >>> get_1d_sincos_pos_embed_from_grid_new (int embed_dim, const std::vector<std::vector<float >> & pos) {
0 commit comments