
Commit 5ffcc30

update readme and upload models

1 parent 2654253 commit 5ffcc30

File tree

4 files changed: +33 -0 lines changed

README.md

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ pure C++ implementation based on [@ggerganov](https://github.com/ggerganov)'s [g
 
 **What's New:**
 
+* 2025-06-03: Kimi-VL
 * 2025-05-28: Gemma3 fully supported
 * 2025-05-23: [I can see](./docs/multimodal.md): Fuyu
 * 2025-05-21: Re-quantization when loading (e.g. `--re-quantize q4_k`)

docs/gpu.md

Lines changed: 9 additions & 0 deletions
@@ -51,6 +51,15 @@ The full format of `-ngl` is `-ngl [id:]layer_specs[;id:layer_specs]..`. `id` is
 Suppose device 0 is the GPU and device 1 is the CPU: `-ngl 1:5;0:10` will put the first 5 layers on the CPU, the next 10 layers on the GPU,
 and all remaining layers on the CPU by default.
 
+You can use `-mgl` (`--model_gpu_layers`) to specify the number of layers of a specific model to deploy to different backend devices.
+The syntax is `-mgl MODEL N`, in which `N` shares the same syntax as `-ngl` and `MODEL` can be one of:
+
+* `main`: the main model.
+* `vis`: the vision accessory model (which typically projects images/videos into the LLM).
+* `any`: any model.
+
+`-ngl N` is equivalent to `-mgl any N`.
+
 Tip: Use `--show_devices` to list all available devices and `--show` to check the basic hyperparameters of a model.
 
 ## Known issues
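The `-ngl` spec format documented above (`[id:]layer_specs[;id:layer_specs]..`) can be sketched as a tiny parser. This is an illustration only, not chatllm.cpp's actual implementation; the function name `parse_ngl` and the fall-back to device 0 when no `id:` prefix is given are assumptions:

```python
def parse_ngl(spec: str) -> list[tuple[int, int]]:
    """Split an `-ngl` spec such as "1:5;0:10" into (device_id, layer_count)
    pairs. Hypothetical helper: a part without an explicit "id:" prefix is
    assumed to target device 0."""
    pairs = []
    for part in spec.split(";"):
        if ":" in part:
            dev, layers = part.split(":", 1)
            pairs.append((int(dev), int(layers)))
        else:
            pairs.append((0, int(part)))
    return pairs

# `-ngl 1:5;0:10`: first 5 layers to device 1, next 10 layers to device 0.
print(parse_ngl("1:5;0:10"))  # → [(1, 5), (0, 10)]
```

Under this reading, any layers not covered by the spec stay on the default (CPU) device, matching the example in the diff above.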

docs/models.md

Lines changed: 3 additions & 0 deletions
@@ -297,6 +297,9 @@ Please use `--format completion` for these models.
 
 Note: Only download `tokenizer.model` and DO NOT download `tokenizer.json` when converting.
 
+* Kimi (`KimiVLForConditionalGeneration`)
+    * [x] VL: [A3B-Instruct](https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/tree/7a3c132a7b0f1f1677f5a72f258bd3afded7d357), [A3B-Thinking](https://huggingface.co/moonshotai/Kimi-VL-A3B-Thinking/commit/16681d8ac24e505088698e4e34ea494dd6e24400)
+
 ## RAG Models
 
 * Text Embedding (`XLMRobertaModel`)

scripts/models.json

Lines changed: 20 additions & 0 deletions
@@ -2776,5 +2776,25 @@
         }
       }
     }
+  },
+  "kimi-vl": {
+    "brief": "Kimi-VL, an efficient open-source Mixture-of-Experts (MoE) vision-language model (VLM) that offers advanced multimodal reasoning, long-context understanding, and strong agent capabilities.",
+    "default": "a3b-instruct",
+    "license": "MIT",
+    "variants": {
+      "a3b-instruct": {
+        "default": "q8",
+        "quantized": {
+          "q8": {
+            "size": 17566398608,
+            "url": "chatllm_quantized_kimi-vl/kimi-vl.bin"
+          },
+          "q4_1": {
+            "size": 10447149072,
+            "url": "chatllm_quantized_kimi-vl/kimi-vl-q4_1.bin"
+          }
+        }
+      }
+    }
   }
 }
