diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a215f4ed729..bb146c55ffb 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -351,6 +351,16 @@ def prepare_tensors(self): data_qtype = gguf.GGMLQuantizationType.TQ1_0 elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0: data_qtype = gguf.GGMLQuantizationType.TQ2_0 + elif self.ftype == gguf.LlamaFileType.MOSTLY_SOURCE: + if old_dtype == torch.float16: + data_qtype = gguf.GGMLQuantizationType.F16 + elif old_dtype == torch.bfloat16: + data_qtype = gguf.GGMLQuantizationType.BF16 + elif old_dtype == torch.float32: + data_qtype = gguf.GGMLQuantizationType.F32 + else: + logger.warning(f"Cannot find destination type matching {old_dtype}: Using F16") + data_qtype = gguf.GGMLQuantizationType.F16 else: raise ValueError(f"Unknown file type: {self.ftype.name}") @@ -8164,8 +8174,8 @@ def parse_args() -> argparse.Namespace: help="path to write to; default: based on input. {ftype} will be replaced by the outtype.", ) parser.add_argument( - "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16", - help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", + "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "source", "auto"], default="f16", + help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, source to keep it unchanged, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", ) parser.add_argument( "--bigendian", action="store_true", @@ -8308,6 +8318,7 @@ def main() -> None: "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0, "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0, "auto": gguf.LlamaFileType.GUESSED, + "source": gguf.LlamaFileType.MOSTLY_SOURCE, } is_split = 
args.split_max_tensors > 0 or args.split_max_size != "0" diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index e2d81dd9891..f1ce4a846e0 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -2760,6 +2760,7 @@ class LlamaFileType(IntEnum): MOSTLY_TQ2_0 = 37 # except 1d tensors GUESSED = 1024 # not specified in the model file + MOSTLY_SOURCE = 1025 # keep tensors in their source dtype class GGUFEndian(IntEnum): diff --git a/include/llama.h b/include/llama.h index 2cbe18d8cfb..ec6212886df 100644 --- a/include/llama.h +++ b/include/llama.h @@ -154,6 +154,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file + LLAMA_FTYPE_MOSTLY_SOURCE = 1025, // keep tensors in their source dtype }; enum llama_rope_scaling_type {