diff --git a/docs/source/en/gguf.md b/docs/source/en/gguf.md
index 359ed4d5e1e8..16e990f77f6d 100644
--- a/docs/source/en/gguf.md
+++ b/docs/source/en/gguf.md
@@ -53,9 +53,10 @@ on the Hub.
 - Q5_K
 - Q6_K
 - Q8_0
+- IQ2_XXS
 
 We take example from the excellent [99991/pygguf](https://github.com/99991/pygguf) Python parser to dequantize the
-weights.
+weights in k-quants.
 
 ### Supported model architectures
 
diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py
index 7da09be841e1..ca1670dc9ae5 100644
--- a/src/transformers/integrations/ggml.py
+++ b/src/transformers/integrations/ggml.py
@@ -44,6 +44,7 @@
     "Q4_K": 12,
     "Q5_K": 13,
     "Q6_K": 14,
+    "IQ2_XXS": 16,
 }
 
 # The Blocksizes are reported in bytes
@@ -58,6 +59,7 @@
     "Q2_K": 256 // 16 + 256 // 4 + 2 + 2,
     "Q3_K": 256 // 8 + 256 // 4 + 12 + 2,
     "Q5_K": 2 + 2 + 12 + 256 // 8 + 256 // 2,
+    "IQ2_XXS": 2 + 256 // 8 * 2,
 }
 
 # Listed here: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
@@ -487,6 +489,48 @@ def dequantize_q5_k(data, n_bytes: int):
     )
 
 
+def dequantize_iq2_xxs(data, n_bytes):
+    # C implementation
+    # https://github.com/ggerganov/ggml/blob/3f5a4bbe59285c0f679b376f6259187d5514ff9c/src/ggml-quants.c#L3311
+    # C struct definition
+    # https://github.com/ggerganov/ggml/blob/3f5a4bbe59285c0f679b376f6259187d5514ff9c/src/ggml-common.h#L314-L321
+    def _dequantize_iq2xxs_column(qs_block, d):
+        """
+        The qs matrix can be split into 8 sub-blocks per row (4 int16 values each):
+        | Block_11 | Block_12 | Block_13 | Block_14 | Block_15 | Block_16 | Block_17 | Block_18 |
+        | Block_21 | Block_22 | Block_23 | Block_24 | Block_25 | Block_26 | Block_27 | Block_28 |
+        ...
+        | Block_n1 | Block_n2 | Block_n3 | Block_n4 | Block_n5 | Block_n6 | Block_n7 | Block_n8 |
+
+        This function processes one sub-block column across all n rows at a time (Block_11 to Block_n1, then Block_12 to Block_n2, etc.).
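+
+        Each IQ2_XXS block packs 256 weights into 66 bytes (GGML_BLOCK_SIZES["IQ2_XXS"]
+        above): one fp16 scale followed by 32 uint16 values, so each row of the qs
+        matrix holds one block's 32-value payload.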
+ """ + from .ggml_utils import IQ2XXS_GRID, KMASK_IQ2XS, KSIGNS_IQ2XS + + aux32 = np.frombuffer(qs_block, dtype=np.uint32).reshape(num_blocks, 2) + aux8 = np.frombuffer(qs_block, dtype=np.uint8).reshape(num_blocks, 8) + + l = np.arange(4) + db = d * (0.5 + (aux32[:, [1]] >> 28)) * 0.25 + + grid = np.frombuffer(np.ascontiguousarray(IQ2XXS_GRID[aux8[:, l]]), dtype=np.uint8).reshape(num_blocks, 32) + signs = KSIGNS_IQ2XS[(aux32[:, [1]] >> 7 * l) & 127] + + y = db * grid * np.where(signs.repeat(8, axis=1) & np.tile(KMASK_IQ2XS, 4), -1, 1) + return y + + num_blocks = n_bytes // GGML_BLOCK_SIZES["IQ2_XXS"] + + data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, GGML_BLOCK_SIZES["IQ2_XXS"] // 2) + data_i16 = np.frombuffer(data, dtype=np.int16).reshape(num_blocks, GGML_BLOCK_SIZES["IQ2_XXS"] // 2) + + d = data_f16[:, 0].reshape(num_blocks, 1).astype(np.float32) + qs = data_i16[:, 1:].reshape(num_blocks, 32) + + y = [_dequantize_iq2xxs_column(np.ascontiguousarray(qs[:, 4 * i : 4 * (i + 1)]), d) for i in range(8)] + y = np.concatenate(y, axis=1) + return y + + def load_dequant_gguf_tensor(shape, ggml_type, data, n_bytes): if ggml_type == GGML_TYPES["F32"]: values = data @@ -506,6 +550,8 @@ def load_dequant_gguf_tensor(shape, ggml_type, data, n_bytes): values = dequantize_q3_k(data, n_bytes) elif ggml_type == GGML_TYPES["Q5_K"]: values = dequantize_q5_k(data, n_bytes) + elif ggml_type == GGML_TYPES["IQ2_XXS"]: + values = dequantize_iq2_xxs(data, n_bytes) else: raise NotImplementedError( f"ggml_type {ggml_type} not implemented - please raise an issue on huggingface transformers: https://github.com/huggingface/transformers/issues/new/choose" diff --git a/src/transformers/integrations/ggml_utils.py b/src/transformers/integrations/ggml_utils.py new file mode 100644 index 000000000000..59ba176a8725 --- /dev/null +++ b/src/transformers/integrations/ggml_utils.py @@ -0,0 +1,84 @@ +# Constants for ggml imatrix dequantization +# migrate from https://github.com/ggerganov/ggml/blob/3f5a4bbe59285c0f679b376f6259187d5514ff9c/src/ggml-common.h#L437 +import numpy as np + + +IQ2XXS_GRID = np.array([ + 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, + 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808, + 0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819, + 0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819, + 0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b, + 0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808, + 0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08, + 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b, + 0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819, + 0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08, + 0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, + 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08, + 0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808, + 0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808, + 0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919, + 0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819, + 0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08, + 0x08082b1908081908, 
+    0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
+    0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
+    0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
+    0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
+    0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
+    0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
+    0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
+    0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
+    0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
+    0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
+    0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
+    0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
+    0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
+    0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
+    0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
+    0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
+    0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
+    0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
+    0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
+    0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
+    0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
+    0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
+    0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
+    0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
+    0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
+    0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
+    0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
+    0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
+    0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
+    0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
+    0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
+    0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
+    0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
+    0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
+    0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
+    0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
+    0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
+    0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
+    0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
+    0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
+    0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
+    0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
+    0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
+    0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
+    0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
+    0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
+])
+
+KSIGNS_IQ2XS = np.array([
+    0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
+    144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
+    160, 33, 34, 163, 36, 165, 166, 39, 40, 169, 170, 43, 172, 45, 46, 175,
+    48, 177, 178, 51, 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63,
+    192, 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77, 78, 207,
+    80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90, 219, 92, 221, 222, 95,
+    96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
+    240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
+])
+
+KMASK_IQ2XS = np.array([1, 2, 4, 8, 16, 32, 64, 128])
diff --git a/tests/quantization/ggml/test_ggml.py b/tests/quantization/ggml/test_ggml.py
index e42900a1d51b..a7931bcb3e09 100644
--- a/tests/quantization/ggml/test_ggml.py
+++ b/tests/quantization/ggml/test_ggml.py
@@ -34,6 +34,7 @@ class GgufIntegrationTests(unittest.TestCase):
     qwen2_model_id = "Qwen/Qwen1.5-0.5B-Chat-GGUF"
     llama3_model_id = "NousResearch/Meta-Llama-3-8B-GGUF"
     tinyllama_model_id = "PenutChen/TinyLlama-1.1B-Chat-v1.0-GGUF"
+    imatrix_model_id = "duyntnet/TinyLlama-1.1B-Chat-v1.0-imatrix-GGUF"
 
     q4_0_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q4_0.gguf"
     q4_k_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
@@ -42,6 +43,7 @@ class GgufIntegrationTests(unittest.TestCase):
     q5_k_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q5_K_M.gguf"
     q6_k_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q6_K.gguf"
     q8_0_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q8_0.gguf"
+    iq2_xxs_gguf_model_id = "TinyLlama-1.1B-Chat-v1.0-IQ2_XXS.gguf"
 
     q4_0_mistral_model_id = "mistral-7b-instruct-v0.2.Q4_0.gguf"
     q4_0_qwen2_model_id = "qwen1_5-0_5b-chat-q4_0.gguf"
@@ -163,6 +165,18 @@ def test_f16(self):
         EXPECTED_TEXT = "Hello, World!\n\n5. Node.js"
         self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)
 
+    def test_iq2_xxs(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.imatrix_model_id, gguf_file=self.iq2_xxs_gguf_model_id)
+        model = AutoModelForCausalLM.from_pretrained(
+            self.imatrix_model_id, gguf_file=self.iq2_xxs_gguf_model_id
+        ).to(torch_device)
+
+        text = tokenizer(self.example_text, return_tensors="pt").to(torch_device)
+        out = model.generate(**text, max_new_tokens=10)
+
+        EXPECTED_TEXT = "Hello, I'm a software engineer. I'"
+        self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)
+
     def test_mistral_q4_0(self):
         tokenizer = AutoTokenizer.from_pretrained(self.mistral_model_id, gguf_file=self.q4_0_mistral_model_id)
         model = AutoModelForCausalLM.from_pretrained(
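
For reference, a minimal usage sketch of the new IQ2_XXS path, mirroring `test_iq2_xxs` above. The repo and file names are the ones used in the test; swap in any other IQ2_XXS GGUF checkpoint as needed.

```python
# Sketch mirroring test_iq2_xxs: loading an IQ2_XXS GGUF checkpoint runs
# dequantize_iq2_xxs() on every IQ2_XXS tensor at load time.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "duyntnet/TinyLlama-1.1B-Chat-v1.0-imatrix-GGUF"
gguf_file = "TinyLlama-1.1B-Chat-v1.0-IQ2_XXS.gguf"

tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=gguf_file)
model = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=gguf_file)

inputs = tokenizer("Hello", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=10)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```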