@@ -362,6 +362,10 @@ struct Config
     float image_mean[3];
     float image_std[3];
     bool vision_use_head;
+
+    int max_num_crops;
+    int min_crop_size;
+    float min_ratio_to_activate;
 };
 
 class PatchEmbedding : public Block
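
The three new Config fields mirror the pan-and-scan settings of the HF Gemma 3 processor: an image may be split into at most max_num_crops crops, but only when its aspect ratio reaches min_ratio_to_activate and each crop would still span at least min_crop_size pixels. A minimal sketch of that activation rule, loosely following the transformers pan_and_scan reference (the helper name and rounding details are illustrative, not what this diff calls):

    #include <algorithm>
    #include <cmath>

    // Sketch only: approximates the HF pan_and_scan activation heuristic.
    static bool pan_and_scan_activates(int w, int h, const Config &cfg)
    {
        const int longer = std::max(w, h), shorter = std::min(w, h);
        const float ratio = (float)longer / (float)shorter;
        if (ratio < cfg.min_ratio_to_activate) return false;   // not elongated enough
        const int crops = std::min((int)std::floor(ratio + 0.5f), cfg.max_num_crops);
        return (crops > 1) && (longer / crops >= cfg.min_crop_size);
    }
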
@@ -377,7 +381,7 @@ class PatchEmbedding : public Block
     ggml::tensor *forward(ComputeContext *ctx, ggml::tensor *input) override
     {
         auto embedding = patch_embedding.forward(ctx, input);
-        embedding = ggml::reshape_2d(ctx, embedding, ggml::get_dim(embedding, 0) * ggml::get_dim(embedding, 1), ggml::get_dim(embedding, 2));
+        embedding = ggml::reshape_3d(ctx, embedding, ggml::get_dim(embedding, 0) * ggml::get_dim(embedding, 1), ggml::get_dim(embedding, 2), ggml::get_dim(embedding, 3));
         embedding = ggml::transpose(ctx, embedding);
         embedding = ggml::cont(ctx, embedding);
         embedding = ggml::add(ctx, embedding, position_embedding);
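
reshape_2d collapsed the convolution output to (patches, hidden) and dropped dimension 3, which was harmless while exactly one image went through the tower; with pan-and-scan each crop occupies its own slot along that dimension, so reshape_3d preserves it. The intended shapes in ggml's (dim0, dim1, ...) order, as I read the change:

    // conv patch embedding:   (patches_w, patches_h, hidden, n_crops)
    // reshape_3d:             (patches_w * patches_h, hidden, n_crops)
    // transpose + cont:       (hidden, patches_w * patches_h, n_crops)
    // add:                    position_embedding broadcast along n_crops
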
@@ -589,6 +593,11 @@ class VisualEmbeddingGeneration
             vis_config.image_std[i] = 0.5f;
         }
 
+        // ref: https://github.com/huggingface/transformers/blob/9487765f07ef4e5500d6ec21cad99aed4a037a3d/src/transformers/models/gemma3/processing_gemma3.py#L36
+        vis_config.min_crop_size = 256;
+        vis_config.max_num_crops = 4;
+        vis_config.min_ratio_to_activate = 1.2f;
+
         const size_t tensor_ovhd = ggml_tensor_overhead();
         const size_t num_tensors = 7 + vis_config.num_hidden_layers * 17;
         const size_t ctx_size = num_tensors * tensor_ovhd;
@@ -608,18 +617,21 @@ class VisualEmbeddingGeneration
         if ((vis_model.get() == nullptr) || (tok->media_emb.size() < 1)) return;
         if (!vis_model->is_loaded()) return;
 
-        run_model(gen_config, tok, dtype, buf);
+        for (auto &image : tok->media_emb)
+        {
+            run_model(gen_config, tok, dtype, image, buf);
+        }
     }
 
 protected:
-    bool run_model(const GenerationConfig &gen_config, BaseTokenizer *tok, ggml::type dtype, std::vector<uint8_t> &buf)
+    bool run_model(const GenerationConfig &gen_config, BaseTokenizer *tok, ggml::type dtype, const BaseTokenizer::MediaAsEmbeddingVector &image, std::vector<uint8_t> &buf)
     {
         ForwardContext ctx(&backend_context);
         ctx.gctx = GGMLContext({.mem_size = backend_context.buf_compute_meta.size(), .mem_buffer = backend_context.buf_compute_meta.data(), .no_alloc = true});
         ctx.gf = ggml::new_graph_custom(&ctx, GRAPH_SIZE, false);
 
         ctx.move_to_layer(LayerAllocatorManager::MiscLayer::Prolog);
-        ggml::tensor *media_emb = ggml::new_tensor_4d(&ctx, ggml::type::GGML_TYPE_F32, vis_config.image_size, vis_config.image_size, 3, tok->media_emb.size());
+        ggml::tensor *media_emb = ggml::new_tensor_3d(&ctx, ggml::type::GGML_TYPE_F32, vis_config.image_size, vis_config.image_size, 3);
 
         dbg_ctx = &ctx;
@@ -642,18 +654,14 @@ class VisualEmbeddingGeneration
             exit(-1);
         }
 
-        size_t offset = 0;
-        for (auto &image : tok->media_emb)
-        {
-            size_t size = image.data.size() * sizeof(image.data[0]);
-            Backend::write_tensor_data(media_emb, image.data.data(), offset, size);
-            offset += size;
-        }
+        Backend::write_tensor_data(media_emb, image.data.data(), 0, image.data.size() * sizeof(image.data[0]));
 
         ctx.compute();
 
-        buf.resize(ggml::nbytes(r));
-        Backend::read_tensor_data(r, buf.data());
+        size_t offset = buf.size();
+        buf.resize(offset + ggml::nbytes(r));
+        Backend::read_tensor_data(r, buf.data() + offset);
+
         ctx.reset();
 
         return true;
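
Since run_model() now appends to buf instead of overwriting it, the caller accumulates the crops' embeddings back to back, in the order the crops were pushed into tok->media_emb. A usage sketch with the names from this diff:

    // After the loop in generate(), buf holds one ggml::nbytes(r)-sized block
    // per crop -- a [mm_tokens_per_image x hidden] matrix each -- matching the
    // placeholder ids that append_image() emits.
    std::vector<uint8_t> buf;
    for (auto &image : tok->media_emb)
        run_model(gen_config, tok, dtype, image, buf);
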
@@ -691,6 +699,8 @@ class ChatHistoryEncoder : public v1::ChatHistoryEncoder
 {
 public:
     void append_user(int round_idx, const Content &user, std::vector<int> &ids) const override;
+protected:
+    bool append_image(const vision::image_pixels_t pixels, const int w, const int h, std::vector<int> &ids) const;
 public:
     const siglip::Config *vis_config = nullptr;
     int MAX_PATCH_NUM = 0;
@@ -717,6 +727,7 @@ class Tokenizer : public v1::Tokenizer
 public:
     int boi_token_id;
     int eoi_token_id;
+    bool do_pan_and_scan = false;
 };
 
 template <int sliding_window_len> class Gemma3SWASelfAttention : public QKNormedAttention<RMSNorm, SlidingWindowAttentionImpl<sliding_window_len>>
@@ -860,6 +871,17 @@ class ConditionalGeneration : public BaseModelForConditionalGeneration
         return r;
     }
 
+    void set_additional_args(const std::map<std::string, std::string> &args) override
+    {
+        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
+        auto it = args.find("do_pan_and_scan");
+        if (it == args.end()) it = args.find("do-pan-and-scan");
+        if (it != args.end())
+        {
+            tok->do_pan_and_scan = it->second != "0";
+        }
+    }
+
     void before_generate(const GenerationConfig &gen_config) override
     {
         std::vector<uint8_t> buf;
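
set_additional_args() is the generic key/value channel, so pan-and-scan needs no dedicated setter: any value other than "0" enables it, and both underscore and dash spellings are accepted. A hypothetical call site (outside this diff):

    // Enable pan-and-scan before generation.
    model->set_additional_args({{"do_pan_and_scan", "1"}});
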
@@ -885,6 +907,41 @@ class ConditionalGeneration : public BaseModelForConditionalGeneration
     siglip::VisualEmbeddingGeneration visual;
 };
 
+bool ChatHistoryEncoder::append_image(const vision::image_pixels_t pixels, const int w, const int h, std::vector<int> &ids) const
+{
+    const int patch_size = vis_config->patch_size;
+    Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
+    std::vector<float> scaled;
+
+    vision::image_rescale(pixels, scaled);
+
+    vision::image_normalize(scaled, vis_config->image_mean, vis_config->image_std);
+
+    tok->media_emb.push_back({.grid_width = w / patch_size, .grid_height = h / patch_size, .patch_size = patch_size, .data = {}});
+
+    auto &image = tok->media_emb.back();
+
+    vision::image_arrange(scaled, w, patch_size, image.data, vision::PatchesFormat::ChannelsRGB_PixelsLeftRightDown);
+
+    image.emb_vec_number = vis_config->mm_tokens_per_image;
+
+    const int total_patches = tok->get_image_total_emb_vectors();
+    CHATLLM_CHECK(total_patches <= MAX_PATCH_NUM) << "too many image patches!";
+
+    ids.push_back(tok->nl_token_id);
+    ids.push_back(tok->nl_token_id);
+    ids.push_back(tok->boi_token_id);
+    int id = total_patches - image.emb_vec_number + tok->vocab_size;
+    for (int j = 0; j < image.emb_vec_number; j++)
+    {
+        ids.push_back(id++);
+    }
+    ids.push_back(tok->eoi_token_id);
+    ids.push_back(tok->nl_token_id);
+    ids.push_back(tok->nl_token_id);
+
+    return true;
+}
 
 void ChatHistoryEncoder::append_user(int round_idx, const Content &user, std::vector<int> &ids) const
 {
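
append_image() emits a fixed token run per crop: two newlines, <start_of_image>, mm_tokens_per_image placeholder ids, <end_of_image>, and two more newlines. Placeholders start at vocab_size, so any id at or above vocab_size can later be resolved to one row of the accumulated embedding buffer. Layout for a single crop, as a sketch (256 stands in for mm_tokens_per_image):

    //   \n \n <start_of_image> p0 p1 ... p255 <end_of_image> \n \n
    // where p_k = vocab_size + total_emb_vectors_before_this_crop + k
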
@@ -902,40 +959,51 @@ void ChatHistoryEncoder::append_user(int round_idx, const Content &user, std::vector<int> &ids) const
         {
             CHATLLM_CHECK(vit_loaded) << "Vision model not loaded";
 
-            vision::Resize resize(vis_config->image_size, vis_config->image_size);
-
-            int w, h;
-            std::vector<uint8_t> pixels;
-            const int patch_size = vis_config->patch_size;
-            vision::image_load(piece.content.c_str(), pixels, w, h, patch_size);
-
-            if (w <= 0) continue;
+            if (tok->do_pan_and_scan)
+            {
+                std::vector<vision::image_pixels_t> crops;
 
-            std::vector<float> scaled;
-            vision::image_rescale(pixels, scaled);
+                int splits_cols_num = 0;
+                vision::PanScanDir dir = vision::PanScanDir::Horizontal;
 
-            vision::image_normalize(scaled, vis_config->image_mean, vis_config->image_std);
+                vision::image_load_pan_and_scan(piece.content.c_str(),
+                    crops, tok->do_pan_and_scan,
+                    vis_config->min_crop_size, vis_config->max_num_crops, vis_config->min_ratio_to_activate,
+                    vis_config->image_size, vis_config->image_size,
+                    dir);
 
-            tok->media_emb.push_back({.grid_width = w / patch_size, .grid_height = h / patch_size, .patch_size = patch_size, .data = {}});
+                printf("crops: %d\n", (int)crops.size());
+                if (crops.size() < 1) continue;
 
-            auto &image = tok->media_emb.back();
+                if (crops.size() == 1)
+                {
+                    append_image(crops[0], vis_config->image_size, vis_config->image_size, ids);
+                    continue;
+                }
 
-            vision::image_arrange(scaled, w, patch_size, image.data, vision::PatchesFormat::ChannelsRGB_PixelsLeftRightDown);
+                tok->encode("Here is the original image ", ids, false, false);
+                append_image(crops[0], vis_config->image_size, vis_config->image_size, ids);
+                tok->encode("and here are some crops to help you see better", ids, false, false);
 
-            image.emb_vec_number = vis_config->mm_tokens_per_image;
+                for (size_t i = 1; i < crops.size(); i++)
+                {
+                    tok->encode(" ", ids, false, false);
+                    append_image(crops[i], vis_config->image_size, vis_config->image_size, ids);
+                }
+            }
+            else
+            {
+                vision::Resize resize(vis_config->image_size, vis_config->image_size);
 
-            CHATLLM_CHECK(image.emb_vec_number) << "too many image patches!";
+                int w, h;
+                std::vector<uint8_t> pixels;
+                const int patch_size = vis_config->patch_size;
+                vision::image_load(piece.content.c_str(), pixels, w, h, patch_size);
 
-            const int total_patches = tok->get_image_total_emb_vectors();
-            CHATLLM_CHECK(total_patches <= MAX_PATCH_NUM) << "too many image patches!";
+                if (w <= 0) continue;
 
-            ids.push_back(tok->boi_token_id);
-            int id = total_patches - image.emb_vec_number + tok->vocab_size;
-            for (int j = 0; j < image.emb_vec_number; j++)
-            {
-                ids.push_back(id++);
+                append_image(pixels, w, h, ids);
             }
-            ids.push_back(tok->eoi_token_id);
         }
         else
         {
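
With pan-and-scan active, a user image therefore expands to the resized original followed by its crops, each wrapped in the append_image() token run above. Roughly, as prompt text (phrasing follows the HF Gemma 3 processor; exact token boundaries are a sketch):

    Here is the original image <image tokens> and here are some crops to
    help you see better <crop 1 tokens> <crop 2 tokens> ...

When only a single crop comes back, the image is embedded exactly as in the non-pan-and-scan path.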