2 changes: 1 addition & 1 deletion src/transformers/integrations/ggml.py
@@ -330,7 +330,7 @@
"rope.dimension_count": None,
"rope.freq_base": "rope_theta",
"attention.head_count": "num_attention_heads",
"attention.head_count_kv": "num_key_value_heads",
"attention.head_count_kv": "num_kv_heads",
"attention.layer_norm_rms_epsilon": "rms_norm_eps",
"vocab_size": "vocab_size",
},
39 changes: 39 additions & 0 deletions src/transformers/modeling_gguf_pytorch_utils.py
@@ -69,6 +69,42 @@ def process(self, weights, name, **kwargs):
        return GGUFTensor(weights, name, {})


class FalconTensorProcessor(TensorProcessor):
    def __init__(self, config=None):
        super().__init__(config=config)

    def process(self, weights, name, **kwargs):
        if "qkv" in name:
            shape = weights.shape
            weights_copy = weights.copy()
            parsed_parameters = kwargs.get("parsed_parameters")
            num_attention_heads = parsed_parameters["config"]["num_attention_heads"]
            num_key_value_heads = parsed_parameters["config"]["num_kv_heads"]
            hidden_size = parsed_parameters["config"]["hidden_size"]
            head_dim = hidden_size // num_attention_heads

            # Split the weights array into q, k, v
            split_indices = [
                num_attention_heads * head_dim,
                num_attention_heads * head_dim + num_key_value_heads * head_dim,
            ]

            q, k, v = np.split(weights_copy, split_indices)

            # Reshape q, k, and v as needed
            q = q.reshape(num_key_value_heads, num_attention_heads // num_key_value_heads, head_dim, hidden_size)
            k = k.reshape(num_key_value_heads, 1, head_dim, hidden_size)
            v = v.reshape(num_key_value_heads, 1, head_dim, hidden_size)

            # Concatenate q, k, and v along the second dimension
            qkv = np.concatenate((q, k, v), axis=1)

            # Reshape qkv back to the original shape
            weights = qkv.reshape(shape)

        return GGUFTensor(weights, name, {})


class LlamaTensorProcessor(TensorProcessor):
    def __init__(self, config=None):
        super().__init__(config=config)
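For intuition, here is a small standalone sketch (toy dimensions, not part of the change) of the regrouping that FalconTensorProcessor.process performs: the fused qkv tensor arrives with all query heads, then all key heads, then all value heads, and the processor rearranges the rows so that each KV head's group of query heads is followed by its key and value head.

import numpy as np

# Toy sizes chosen only for illustration: 4 query heads, 2 KV heads, hidden_size 4 -> head_dim 1.
num_attention_heads, num_kv_heads, hidden_size = 4, 2, 4
head_dim = hidden_size // num_attention_heads

# Label each row by its role in the source layout: all Q heads, then all K heads, then all V heads.
rows = ["q0", "q1", "q2", "q3", "k0", "k1", "v0", "v1"]
weights = np.array([[i] * hidden_size for i in range(len(rows))], dtype=np.float32)

split_indices = [
    num_attention_heads * head_dim,
    num_attention_heads * head_dim + num_kv_heads * head_dim,
]
q, k, v = np.split(weights, split_indices)

q = q.reshape(num_kv_heads, num_attention_heads // num_kv_heads, head_dim, hidden_size)
k = k.reshape(num_kv_heads, 1, head_dim, hidden_size)
v = v.reshape(num_kv_heads, 1, head_dim, hidden_size)
qkv = np.concatenate((q, k, v), axis=1).reshape(weights.shape)

# Rows are now grouped per KV head: ['q0', 'q1', 'k0', 'v0', 'q2', 'q3', 'k1', 'v1']
print([rows[int(row[0])] for row in qkv])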
@@ -246,6 +282,7 @@ def process(self, weights, name, **kwargs):
"t5encoder": T5TensorProcessor,
"gpt2": GPT2TensorProcessor,
"mamba": MambaTensorProcessor,
"falcon": FalconTensorProcessor,
}


@@ -321,6 +358,8 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False):
f"From file name, cannot determine the number of parameters for {architecture} architecture"
)
model_size = m.group().strip("-") # only keeps `7b`
if model_size == "40b":
parsed_parameters["config"]["new_decoder_architecture"] = True
Comment on lines +361 to +362
Member:
Can you add tests for the 40b even if we won't run them on the CI (add @Skip).
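For reference, a minimal sketch of what such a skipped test could look like, assuming the GgufIntegrationTests class and the imports already present in tests/quantization/ggml/test_ggml.py; the model/file IDs and expected text are taken from the test diff below, and the skip reason is only a placeholder:

    @unittest.skip("40b model is too large to run on the CI")
    def test_falcon40b_q2_k(self):
        tokenizer = AutoTokenizer.from_pretrained(self.falcon40b_model_id, gguf_file=self.q2_falcon40b_id)
        model = AutoModelForCausalLM.from_pretrained(
            self.falcon40b_model_id, gguf_file=self.q2_falcon40b_id
        ).to(torch_device)

        text = tokenizer(self.example_text, return_tensors="pt").to(torch_device)
        out = model.generate(**text, max_new_tokens=10)

        EXPECTED_TEXT = "Hello All,\nOn ne sait plus quoi manger,"
        self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)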

@Isotr0py (Collaborator) on Dec 10, 2024:
In fact, I'm thinking about avoiding the model_size extracted from the file name, because some user fine-tuned models may use a custom filename that doesn't contain "40b" explicitly. (I also removed the model_size logic in #34385.)

Considering "new_decoder_architecture" having 2 layernorm for attn in decoder layer, I will prefer to check the existence of attn_norm_2 to determine "new_decoder_architecture".

You can refer to https://huggingface.co/maddes8cht/tiiuae-falcon-40b-instruct-gguf?show_file_info=tiiuae-falcon-40b-instruct-Q2_K.gguf, which indeed has attn_norm_2 params.

Collaborator:
Suggested change
if model_size == "40b":
parsed_parameters["config"]["new_decoder_architecture"] = True
new_decoder_architecture = any("attn_norm_2" in tensor.name for tensor in reader.tensors)
parsed_parameters["config"]["new_decoder_architecture"] = new_decoder_architecture

    if architecture + model_size not in GGUF_SUPPORTED_ARCHITECTURES:
        raise ValueError(f"Architecture {architecture + model_size} not supported")
4 changes: 3 additions & 1 deletion tests/quantization/ggml/test_ggml.py
@@ -64,6 +64,7 @@ class GgufIntegrationTests(unittest.TestCase):
    mamba_model_id = "jpodivin/mamba-2.8b-hf-GGUF"
    nemotron_original_model_id = "nvidia/Nemotron-Mini-4B-Instruct"
    nemotron_model_id = "bartowski/Nemotron-Mini-4B-Instruct-GGUF"
    falcon40b_model_id = "tensorblock/falcon-40b-GGUF"

    # standard quants
    q4_0_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q4_0.gguf"
@@ -111,6 +112,7 @@ class GgufIntegrationTests(unittest.TestCase):
    fp16_mamba_model_id = "ggml-model-f16.gguf"
    q6_k_nemotron_model_id = "Nemotron-Mini-4B-Instruct-Q6_K.gguf"
    fp16_nemotron_model_id = "Nemotron-Mini-4B-Instruct-f16.gguf"
    q2_falcon40b_id = "falcon-40b-Q2_K.gguf"

    example_text = "Hello"

@@ -612,7 +614,7 @@ def test_falcon40b_q2_k(self):
        text = tokenizer(self.example_text, return_tensors="pt").to(torch_device)
        out = model.generate(**text, max_new_tokens=10)

-        EXPECTED_TEXT = "Hello All,\nI am new to this forum."
+        EXPECTED_TEXT = "Hello All,\nOn ne sait plus quoi manger,"
        self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)

    def test_falcon7b_q2_k(self):