
Commit 07144a8

support fp8 models; upload 1 model.
1 parent 0ee543a

3 files changed: +40 -1 lines changed

convert.py

Lines changed: 23 additions & 1 deletion
@@ -273,6 +273,7 @@ class TokenizerType(Enum):
 g_tokenizer_type = TokenizerType.BPE1
 
 g_special_tokens: Dict = {}
+g_do_dequantization: bool = False
 
 def pad_to_len(l: list, to_len: int, v = 0) -> list:
     assert len(l) <= to_len
@@ -605,7 +606,22 @@ def format_time(t) -> str:
         remain = (total - i - 1) * per_item
         print_progress_bar(i + 1, total, prefix=desc, suffix=f"({i}/{total}) {format_time(per_item)}/it rem: {format_time(remain)}")
 
+def dequantize(state_dict: dict) -> dict:
+    r = {}
+    for k in state_dict.keys():
+        t: torch.Tensor = state_dict[k]
+        if k.endswith('.weight_scale_inv'):
+            k = k.replace('.weight_scale_inv', '.weight')
+            assert k in state_dict
+            r[k] = state_dict[k].float() * t
+            continue
+        if k not in r:
+            r[k] = t
+    return r
+
 def dump_state_dict(f, weight_names, model_files, ggml_type, config, state_dict_pp, loader_fun = None):
+    global g_do_dequantization
+
     tensor_info = []
     converted_names = []
@@ -618,6 +634,10 @@ def dump_state_dict(f, weight_names, model_files, ggml_type, config, state_dict_pp, loader_fun = None):
 
     for state_dict in loader_fun(model_files):
         this_round = []
+
+        if g_do_dequantization:
+            state_dict = dequantize(state_dict)
+
         state_dict = state_dict_pp(config, state_dict)
 
         for x in state_dict:
@@ -8462,7 +8482,7 @@ def load_some_model(path: Path, fallback_files: list[Path] = []) -> List[Path]:
     return [path]
 
 def main():
-    global g_lora
+    global g_lora, g_do_dequantization
 
     parser = argparse.ArgumentParser("chatllm-convert")
     parser.add_argument("-i", "--model_name_or_path", type=str)
@@ -8499,6 +8519,8 @@ def main():
     else:
         config = AttributeDict({})
 
+    g_do_dequantization = config.quantization_config is not None
+
     if arch == '':
         if config.architectures is None:
             if "model_type" in config:

docs/models.md

Lines changed: 1 addition & 0 deletions
@@ -399,6 +399,7 @@ Please use `--format completion` for these models.
   [3B-Reasoning-2512](https://huggingface.co/mistralai/Ministral-3-3B-Reasoning-2512/tree/039f888eb54340b5e9870721f3c249fbc809b8e8),
   [8B-Instruct-2512](https://huggingface.co/mistralai/Ministral-3-8B-Instruct-2512-BF16/tree/bde2b3370dbf8ad77ceab25a5a43bc9013cda350),
   [8B-Reasoning-2512](https://huggingface.co/mistralai/Ministral-3-8B-Reasoning-2512/tree/f511871f6402ba68dadfb42a94a7a7e13499fd65)
+* [x] Devstral-Small-2: [24B-Instruct-2512](https://huggingface.co/mistralai/Devstral-Small-2-24B-Instruct-2512/tree/8d27a0d2120f1563c11dc91d494e99f9678ecf79)
 
 * Qwen (`Qwen2AudioForConditionalGeneration`, `Qwen2_5_VLForConditionalGeneration`)
   * [x] Qwen2-Audio: [7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct/tree/0a095220c30b7b31434169c3086508ef3ea5bf0a)

scripts/models.json

Lines changed: 16 additions & 0 deletions
@@ -285,6 +285,22 @@
             }
         }
     },
+    "devstral-small-2": {
+        "brief": "Devstral Small 2 excels at using tools to explore codebases, editing multiple files, and powering software engineering agents.",
+        "default": "24b-2512",
+        "license": "Apache License Version 2.0",
+        "variants": {
+            "24b-2512": {
+                "default": "q4_1",
+                "quantized": {
+                    "q4_1": {
+                        "size": 15015421232,
+                        "url": "chatllm_quantized_ministral-3/devstral-small-2-24b-2512-q4_1.bin"
+                    }
+                }
+            }
+        }
+    },
     "mistral0.1": {
         "brief": "The Mistral-7B-Instruct-v0.1 Large Language Model (LLM) is an instruct fine-tuned version of the Mistral-7B-v0.1 generative text model using a variety of publicly available conversation datasets.",
         "default": "7b",
