Skip to content

Commit 5ae5971

Browse files
committed
Revamp Q2_K and Q3_K quants
Q3_K_XL takes the place of Q3_K_L. Q3_K_L becomes an intermediary between Q3_K_M and Q3_K_XL.
1 parent 1bde168 commit 5ae5971

File tree

4 files changed

+97
-50
lines changed

4 files changed

+97
-50
lines changed

examples/quantize/quantize.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
4141
{ "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization", },
4242
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", },
4343
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B", },
44-
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B", },
44+
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.10 bpw quantization mix", },
45+
{ "Q3_K_XL", LLAMA_FTYPE_MOSTLY_Q3_K_XL, " 4.03G, +0.5562 ppl @ Llama-3-8B", },
4546
{ "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", },
4647
{ "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
4748
{ "IQ4_XSR", LLAMA_FTYPE_MOSTLY_IQ4_XSR, " 4.xx bpw non-linear quantization", },

gguf-py/gguf/constants.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1227,7 +1227,7 @@ class LlamaFileType(IntEnum):
12271227
MOSTLY_Q2_K = 10 # except 1d tensors
12281228
MOSTLY_Q3_K_S = 11 # except 1d tensors
12291229
MOSTLY_Q3_K_M = 12 # except 1d tensors
1230-
MOSTLY_Q3_K_L = 13 # except 1d tensors
1230+
MOSTLY_Q3_K_XL = 13 # except 1d tensors
12311231
MOSTLY_Q4_K_S = 14 # except 1d tensors
12321232
MOSTLY_Q4_K_M = 15 # except 1d tensors
12331233
MOSTLY_Q5_K_S = 16 # except 1d tensors
@@ -1257,6 +1257,7 @@ class LlamaFileType(IntEnum):
12571257
MOSTLY_IQ1_XL = 42 # except 1d tensors
12581258
MOSTLY_IQ4_XSR = 43 # except 1d tensors
12591259
MOSTLY_IQ3_XXL = 44 # except 1d tensors
1260+
MOSTLY_Q3_K_L = 45 # except 1d tensors
12601261

12611262
GUESSED = 1024 # not specified in the model file
12621263

include/llama.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ extern "C" {
143143
LLAMA_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
144144
LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, // except 1d tensors
145145
LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, // except 1d tensors
146-
LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, // except 1d tensors
146+
LLAMA_FTYPE_MOSTLY_Q3_K_XL = 13, // except 1d tensors
147147
LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, // except 1d tensors
148148
LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, // except 1d tensors
149149
LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
@@ -173,6 +173,7 @@ extern "C" {
173173
LLAMA_FTYPE_MOSTLY_IQ1_XL = 42, // except 1d tensors
174174
LLAMA_FTYPE_MOSTLY_IQ4_XSR = 43, // except 1d tensors
175175
LLAMA_FTYPE_MOSTLY_IQ3_XXL = 44, // except 1d tensors
176+
LLAMA_FTYPE_MOSTLY_Q3_K_L = 45, // except 1d tensors
176177
LLAMA_FTYPE_CQS = 99, // except 1d tensors
177178

178179
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file

0 commit comments

Comments (0)