docs for kimi-vl

foldl · foldl · commit bed565d89762 · 2025-06-25T09:30:24.000+08:00
diff --git a/docs/models.md b/docs/models.md
@@ -305,6 +305,11 @@ Please use `--format completion` for these models.
 * Kimi (`KimiVLForConditionalGeneration`)
     * [x] VL: [A3B-Instruct](https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/tree/7a3c132a7b0f1f1677f5a72f258bd3afded7d357), [A3B-Thinking](https://huggingface.co/moonshotai/Kimi-VL-A3B-Thinking/commit/16681d8ac24e505088698e4e34ea494dd6e24400), [A3B-Thinking-2506](https://huggingface.co/moonshotai/Kimi-VL-A3B-Thinking-2506/tree/f124f44fb6ab5778cfac5117e3902ef03e860ad4)
 
+    Additional options (use `--set X Y` to change values):
+    * `video_max_frames`: default `20`.
+    * `native_resolution`: whether to use native resolution; when `false`, inputs are resized to 896x896. Default `false`, because native resolution seems sensitive to quantization.
+    * `fps`: default `1.0`.
+
 * Qwen (`Qwen2AudioForConditionalGeneration`)
     * [x] Qwen2-Audio: [7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct/tree/0a095220c30b7b31434169c3086508ef3ea5bf0a)
 
diff --git a/models/kimi.cpp b/models/kimi.cpp
@@ -587,9 +587,9 @@ namespace vl
         int media_end_token_id;
         int media_pad_token_id;
 
-        int video_max_frames = 20;
-        bool arbitrary_resolution = false;
-        double fps = 1.0;
+        int     video_max_frames = 20;
+        bool    native_resolution = false;
+        double  fps = 1.0;
     };
 
     void ChatHistoryEncoder::append_ai(int round_idx, const std::string &ai, std::vector<int> &ids) const
@@ -686,7 +686,7 @@ namespace vl
         {
             Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
             tok->video_max_frames       = utils::get_opt(args, "video_max_frames", tok->video_max_frames);
-            tok->arbitrary_resolution   = utils::get_opt(args, "arbitrary_resolution", false);
+            tok->native_resolution      = utils::get_opt(args, "native_resolution", tok->native_resolution);
             tok->fps                    = utils::get_opt(args, "fps", tok->fps);
         }
 
@@ -714,7 +714,7 @@ namespace vl
         std::unique_ptr<vision::Resize> resize;
         std::unique_ptr<vision::PreMaxImageSize> max_size;
 
-        if (!tok->arbitrary_resolution)
+        if (!tok->native_resolution)
             resize.reset(new vision::Resize(896, 896));
 
         // expand video into images