15 changes: 13 additions & 2 deletions convert_hf_to_gguf.py
@@ -351,6 +351,16 @@ def prepare_tensors(self):
                     data_qtype = gguf.GGMLQuantizationType.TQ1_0
                 elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
                     data_qtype = gguf.GGMLQuantizationType.TQ2_0
+                elif self.ftype == gguf.LlamaFileType.MOSTLY_SOURCE:
+                    if old_dtype == torch.float16:
+                        data_qtype = gguf.GGMLQuantizationType.F16
+                    elif old_dtype == torch.bfloat16:
+                        data_qtype = gguf.GGMLQuantizationType.BF16
+                    elif old_dtype == torch.float32:
+                        data_qtype = gguf.GGMLQuantizationType.F32
+                    else:
+                        logger.warning(f"Cannot find destination type matching {old_dtype}: Using F16")
+                        data_qtype = gguf.GGMLQuantizationType.F16
                 else:
                     raise ValueError(f"Unknown file type: {self.ftype.name}")

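Pulled out of the method for readability, the new branch is just a dtype-to-qtype lookup with an F16 fallback. A minimal standalone sketch (the helper name is ours, not the PR's):

    import torch
    import gguf

    def source_qtype(old_dtype: torch.dtype) -> gguf.GGMLQuantizationType:
        # Pick the GGML type that stores the source dtype unchanged.
        if old_dtype == torch.float16:
            return gguf.GGMLQuantizationType.F16
        if old_dtype == torch.bfloat16:
            return gguf.GGMLQuantizationType.BF16
        if old_dtype == torch.float32:
            return gguf.GGMLQuantizationType.F32
        # No direct GGML equivalent: fall back to F16, mirroring the
        # warning branch in prepare_tensors() above.
        return gguf.GGMLQuantizationType.F16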
@@ -8164,8 +8174,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "source", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, source to keep each tensor's source dtype unchanged, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
         "--bigendian", action="store_true",
@@ -8308,6 +8318,7 @@ def main() -> None:
         "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
         "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
         "auto": gguf.LlamaFileType.GUESSED,
+        "source": gguf.LlamaFileType.MOSTLY_SOURCE,
     }

     is_split = args.split_max_tensors > 0 or args.split_max_size != "0"
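Both "auto" and "source" defer the output type, but differently: "auto" (GUESSED) commits to a single highest-fidelity 16-bit float type based on the first loaded tensor, while "source" resolves per tensor. A toy illustration, reusing the source_qtype() sketch above (tensor names and dtypes are hypothetical):

    import torch

    tensors = {
        "token_embd.weight": torch.float32,    # hypothetical mixed-dtype model
        "blk.0.attn_q.weight": torch.bfloat16,
    }

    # --outtype source: each tensor keeps a type matching its own dtype ...
    print({name: source_qtype(dtype).name for name, dtype in tensors.items()})
    # -> {'token_embd.weight': 'F32', 'blk.0.attn_q.weight': 'BF16'}
    # ... whereas --outtype auto would pick one 16-bit type from the first
    # tensor and apply it throughout.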
1 change: 1 addition & 0 deletions gguf-py/gguf/constants.py
@@ -2760,6 +2760,7 @@ class LlamaFileType(IntEnum):
     MOSTLY_TQ2_0 = 37 # except 1d tensors

     GUESSED = 1024 # not specified in the model file
+    MOSTLY_SOURCE = 1025 # not specified in the model file


 class GGUFEndian(IntEnum):
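Like GUESSED, the new member sits well above the contiguous block of quantization ftypes, marking it as a conversion-time sentinel rather than an ordinary output format. A quick check against the values added above:

    import gguf

    assert gguf.LlamaFileType.GUESSED == 1024
    assert gguf.LlamaFileType.MOSTLY_SOURCE == 1025  # new in this PR
    assert gguf.LlamaFileType.MOSTLY_SOURCE > gguf.LlamaFileType.MOSTLY_TQ2_0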
1 change: 1 addition & 0 deletions include/llama.h
@@ -154,6 +154,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors

         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
+        LLAMA_FTYPE_MOSTLY_SOURCE = 1025, // not specified in the model file
     };

     enum llama_rope_scaling_type {