1010#include " ggml-backend.h"
1111#include " gguf.h"
1212
13+ #include < algorithm>
1314#include < cassert>
1415#include < cmath>
1516#include < cstdlib>
@@ -1116,9 +1117,8 @@ struct clip_model_loader {
11161117 case PROJECTOR_TYPE_LFM2:
11171118 {
11181119 get_u32 (KEY_PROJ_SCALE_FACTOR, hparams.n_merge , false );
1119- // ref: https://huggingface.co/LiquidAI/LFM2-VL-3B/blob/main/preprocessor_config.json
1120- // config above specifies number of tokens after downsampling, while here it is before, relax lowerbound to 64
1121- hparams.set_limit_image_tokens (64 , 1024 );
1120+ // ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json
1121+ hparams.set_limit_image_tokens (64 , 256 );
11221122 } break ;
11231123 case PROJECTOR_TYPE_PIXTRAL:
11241124 case PROJECTOR_TYPE_LIGHTONOCR:
@@ -2807,6 +2807,119 @@ struct llava_uhd {
28072807 }
28082808};
28092809
2810+ // ref: https://github.com/huggingface/transformers/blob/v5.1.0/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py
2811+ // some of the logic is similar to llava_uhd, but with different hyperparameters and some logic is unique (e.g. grid layout)
2812+ struct lfm2_vl_image_processor {
2813+ // ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json
2814+ static constexpr int min_tiles = 2 ;
2815+ static constexpr int max_tiles = 10 ;
2816+ static constexpr float max_pixels_tolerance = 2 .0f ;
2817+ static constexpr int tile_size = 512 ;
2818+
2819+ static llava_uhd::slice_instructions get_slice_instructions (struct clip_ctx * ctx, const clip_image_size & original_size) {
2820+ llava_uhd::slice_instructions inst;
2821+ const auto & params = ctx->model .hparams ;
2822+ const int align_size = params.patch_size * params.n_merge ;
2823+
2824+ inst.interpolation_overview = img_tool::RESIZE_ALGO_BILINEAR;
2825+ inst.interpolation_refined = img_tool::RESIZE_ALGO_BILINEAR;
2826+ inst.overview_size = img_tool::calc_size_preserved_ratio (original_size, align_size, params.image_min_pixels , params.image_max_pixels );
2827+
2828+ // tile if either dimension exceeds tile_size with tolerance
2829+ const bool needs_tiling = original_size.width > tile_size * max_pixels_tolerance || original_size.height > tile_size * max_pixels_tolerance;
2830+
2831+ if (!needs_tiling) {
2832+ inst.refined_size = clip_image_size{0 , 0 };
2833+ inst.grid_size = clip_image_size{0 , 0 };
2834+ return inst;
2835+ }
2836+
2837+ const clip_image_size grid = get_grid_layout (original_size.height , original_size.width );
2838+
2839+ inst.grid_size = grid;
2840+ inst.refined_size = clip_image_size{tile_size * grid.width , tile_size * grid.height };
2841+
2842+ LOG_DBG (" %s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n " ,
2843+ __func__,
2844+ original_size.width , original_size.height ,
2845+ inst.overview_size .width , inst.overview_size .height ,
2846+ inst.refined_size .width , inst.refined_size .height ,
2847+ grid.width , grid.height );
2848+
2849+ for (int row = 0 ; row < grid.height ; row++) {
2850+ for (int col = 0 ; col < grid.width ; col++) {
2851+ llava_uhd::slice_coordinates slice;
2852+ slice.x = col * tile_size;
2853+ slice.y = row * tile_size;
2854+ slice.size = clip_image_size{tile_size, tile_size};
2855+ inst.slices .push_back (slice);
2856+ LOG_DBG (" %s: slice %d: x=%d, y=%d, size=%d x %d\n " ,
2857+ __func__, (int )inst.slices .size () - 1 ,
2858+ slice.x , slice.y , slice.size .width , slice.size .height );
2859+ }
2860+ }
2861+
2862+ return inst;
2863+ }
2864+
2865+ private:
2866+ static clip_image_size find_closest_aspect_ratio (
2867+ float aspect_ratio,
2868+ const std::vector<clip_image_size> & target_ratios,
2869+ int width, int height) {
2870+ float best_ratio_diff = std::numeric_limits<float >::max ();
2871+ clip_image_size best_ratio = {1 , 1 };
2872+ const float area = static_cast <float >(width * height);
2873+
2874+ for (const auto & ratio : target_ratios) {
2875+ const float target_aspect_ratio = static_cast <float >(ratio.width ) / ratio.height ;
2876+ const float ratio_diff = std::abs (aspect_ratio - target_aspect_ratio);
2877+ if (ratio_diff < best_ratio_diff) {
2878+ best_ratio_diff = ratio_diff;
2879+ best_ratio = ratio;
2880+ } else if (ratio_diff == best_ratio_diff) {
2881+ const float target_area = static_cast <float >(tile_size * tile_size * ratio.width * ratio.height );
2882+ if (area > 0 .5f * target_area) {
2883+ best_ratio = ratio;
2884+ }
2885+ }
2886+ }
2887+ return best_ratio;
2888+ }
2889+
2890+ static std::vector<clip_image_size> get_target_ratios () {
2891+ std::vector<clip_image_size> ratios;
2892+ for (int n = min_tiles; n <= max_tiles; n++) {
2893+ for (int w = 1 ; w <= n; w++) {
2894+ for (int h = 1 ; h <= n; h++) {
2895+ if (w * h >= min_tiles && w * h <= max_tiles) {
2896+ bool found = false ;
2897+ for (const auto & r : ratios) {
2898+ if (r.width == w && r.height == h) {
2899+ found = true ;
2900+ break ;
2901+ }
2902+ }
2903+ if (!found) {
2904+ ratios.push_back ({w, h});
2905+ }
2906+ }
2907+ }
2908+ }
2909+ }
2910+ std::sort (ratios.begin (), ratios.end (), [](const clip_image_size & a, const clip_image_size & b) {
2911+ return a.width * a.height < b.width * b.height ;
2912+ });
2913+ return ratios;
2914+ }
2915+
2916+ static clip_image_size get_grid_layout (int height, int width) {
2917+ const float aspect_ratio = static_cast <float >(width) / height;
2918+ const auto ratios = get_target_ratios ();
2919+ return find_closest_aspect_ratio (aspect_ratio, ratios, width, height);
2920+ }
2921+ };
2922+
28102923// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
28112924// res_imgs memory is being allocated here, previous allocations will be freed if found
28122925bool clip_image_preprocess (struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
@@ -3021,6 +3134,20 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
30213134 } break ;
30223135
30233136 case PROJECTOR_TYPE_LFM2:
3137+ {
3138+ auto const inst = lfm2_vl_image_processor::get_slice_instructions (ctx, original_size);
3139+ std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image (img, inst);
3140+
3141+ for (size_t i = 0 ; i < imgs.size (); ++i) {
3142+ clip_image_f32_ptr res (clip_image_f32_init ());
3143+ normalize_image_u8_to_f32 (*imgs[i], *res, params.image_mean , params.image_std );
3144+ res_imgs->entries .push_back (std::move (res));
3145+ }
3146+
3147+ res_imgs->grid_x = inst.grid_size .width ;
3148+ res_imgs->grid_y = inst.grid_size .height ;
3149+ } break ;
3150+
30243151 case PROJECTOR_TYPE_KIMIVL:
30253152 {
30263153 GGML_ASSERT (params.image_min_pixels > 0 && params.image_max_pixels > 0 );
@@ -3032,8 +3159,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
30323159 const std::array<uint8_t , 3 > pad_color = {122 , 116 , 104 };
30333160
30343161 clip_image_u8 resized_img;
3035- const bool pad = (ctx->proj_type () != PROJECTOR_TYPE_LFM2);
3036- img_tool::resize (*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, pad, pad_color);
3162+ img_tool::resize (*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, true , pad_color);
30373163 clip_image_f32_ptr res (clip_image_f32_init ());
30383164 normalize_image_u8_to_f32 (resized_img, *res, params.image_mean , params.image_std );
30393165 res_imgs->entries .push_back (std::move (res));
0 commit comments