Commit 7c671f2

Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	.github/workflows/docker.yml
#	examples/cvector-generator/mean.hpp
#	examples/cvector-generator/pca.hpp
#	examples/export-lora/export-lora.cpp
#	examples/rpc/rpc-server.cpp
#	examples/run/README.md
#	examples/run/run.cpp
#	examples/server/CMakeLists.txt
#	examples/server/README.md
#	ggml/src/CMakeLists.txt
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml-vulkan/ggml-vulkan.cpp
#	scripts/compare-llama-bench.py
#	scripts/hf.sh
#	tests/test-chat-template.cpp
2 parents: 29afdb7 + d79d8f3

27 files changed: +25899 / -13403 lines

convert_hf_to_gguf.py

Lines changed: 210 additions & 1 deletion
@@ -529,9 +529,19 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
             else:
                 token: str = reverse_vocab[i]
                 if token in added_vocab:
+                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
+                    if not tokenizer.added_tokens_decoder[i].normalized:
+                        previous_token = token
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        if previous_token != token:
+                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
                     if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
                         toktypes.append(gguf.TokenType.CONTROL)
                     else:
+                        # NOTE: this was added for Gemma.
+                        # Encoding and decoding the tokens above isn't sufficient for this case.
                         token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
                         toktypes.append(gguf.TokenType.USER_DEFINED)
                 else:
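A minimal standalone sketch of the normalization round-trip added above, assuming an installed transformers package; the repo id and the loop are illustrative only (the commit itself only touches Model.get_vocab_base):

# Sketch only: "some-org/some-model" is a placeholder repo id.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("some-org/some-model")

for tok_id, added_token in tokenizer.added_tokens_decoder.items():
    token = added_token.content
    if not added_token.normalized:
        # Round-trip through the tokenizer so the stored text matches what
        # encoding and decoding the token actually produces.
        normalized = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
        if normalized != token:
            print(f"{token!r} is encoded and decoded back to {normalized!r}")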
@@ -575,6 +585,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
             # ref: https://huggingface.co/tiiuae/falcon-7b
             res = "falcon"
+        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
+            res = "falcon3"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
             res = "bert-bge"
@@ -671,6 +684,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb":
             # ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct
             res = "gigachat"
+        if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1":
+            # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct
+            res = "megrez"
 
         if res is None:
             logger.warning("\n")
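Both new branches key on a chkhsh value. A sketch of how that hash is derived in get_vocab_base_pre, with a shortened stand-in for the real check string (the actual chktxt in convert_hf_to_gguf.py is much longer, so this placeholder will not reproduce the hashes added above):

# Sketch only: chktxt below is a placeholder, not the real check string.
from hashlib import sha256
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("tiiuae/Falcon3-7B-Base")

chktxt = "\n \n\n \t example text 3.14 llama.cpp"  # stand-in check string
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
print(chkhsh)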
@@ -1679,6 +1695,184 @@ def prepare_tensors(self):
             raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@Model.register("DeciLMForCausalLM")
+class DeciModel(Model):
+    model_arch = gguf.MODEL_ARCH.DECI
+
+    @staticmethod
+    def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int:
+        # DeciLM-specific code
+        intermediate_size = int(2 * ffn_mult * n_embd / 3)
+        return DeciModel._find_multiple(intermediate_size, 256)
+
+    @staticmethod
+    def _find_multiple(n: int, k: int) -> int:
+        # DeciLM-specific code
+        if n % k == 0:
+            return n
+        return n + k - (n % k)
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        if "block_configs" in self.hparams:  # Llama-3_1-Nemotron-51B
+            _block_configs: list[dict[str, Any]] = self.hparams["block_configs"]
+            assert self.block_count == len(_block_configs)
+            self._num_kv_heads = list()
+            self._num_heads = list()
+            _ffn_multipliers = list()
+            # ***linear attention layer***
+            # if n_heads_in_group is None and replace_with_linear is True
+            # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads
+            # ***attention-free layer***
+            # if n_heads_in_group is None and replace_with_linear is False
+            # then _num_kv_heads[il] is 0 and _num_heads[il] is 0
+            # ***normal attention-layer***
+            # if n_heads_in_group is not None, then
+            # _num_kv_heads[il] is num_attention_head // n_heads_in_group and
+            # _num_heads[il] is num_attention_head
+            for il in range(len(_block_configs)):
+                if _block_configs[il]["attention"]["n_heads_in_group"] is None:
+                    if _block_configs[il]["attention"]["replace_with_linear"] is True:
+                        self._num_kv_heads.append(0)
+                        self._num_heads.append(self.hparams["num_attention_heads"])
+                    else:
+                        self._num_kv_heads.append(0)
+                        self._num_heads.append(0)
+                else:
+                    self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"])
+                    self._num_heads.append(self.hparams["num_attention_heads"])
+                _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
+            assert self.block_count == len(self._num_kv_heads)
+            assert self.block_count == len(self._num_heads)
+            assert self.block_count == len(_ffn_multipliers)
+            assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
+            assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int)
+            assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float)
+            self._ffn_dims: list[int] = [
+                DeciModel._ffn_mult_to_intermediate_size(multiplier, self.hparams["hidden_size"])
+                for multiplier in _ffn_multipliers
+            ]
+
+    def set_vocab(self):
+        # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's
+        # eos_token from '|eot_id|' to '|end_of_text|'
+        if self.hparams.get("vocab_size", 128256) == 128256:
+            tokens, toktypes, tokpre = self.get_vocab_base()
+            self.gguf_writer.add_tokenizer_model("gpt2")
+            self.gguf_writer.add_tokenizer_pre(tokpre)
+            self.gguf_writer.add_token_list(tokens)
+            self.gguf_writer.add_token_types(toktypes)
+
+            special_vocab = gguf.SpecialVocab(
+                self.dir_model, load_merges=True,
+                special_token_types = ['bos', 'eos', 'eom', 'eot']
+            )
+            special_vocab._set_special_token("bos", 128000)
+            special_vocab._set_special_token("eos", 128001)
+            special_vocab._set_special_token("eom", 128008)
+            special_vocab._set_special_token("eot", 128009)
+            special_vocab.add_to_gguf(self.gguf_writer)
+        else:
+            # DeciLM-7B
+            self._set_vocab_llama_hf()
+            # self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        if "block_configs" in self.hparams:  # Llama-3_1-Nemotron-51B
+            assert self.block_count == len(self._num_kv_heads)
+            assert self.block_count == len(self._num_heads)
+            assert self.block_count == len(self._ffn_dims)
+            self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+            self.gguf_writer.add_head_count(self._num_heads)
+            self.gguf_writer.add_feed_forward_length(self._ffn_dims)
+            self.gguf_writer.add_block_count(self.block_count)
+            self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
+            self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+            self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+            self.gguf_writer.add_key_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+            self.gguf_writer.add_value_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+            self.gguf_writer.add_file_type(self.ftype)
+        else:  # DeciLM-7B
+            super().set_gguf_parameters()
+            if "num_key_value_heads_per_layer" in self.hparams:  # DeciLM-7B
+                self._num_kv_heads: list[int] = self.hparams["num_key_value_heads_per_layer"]
+                assert self.block_count == len(self._num_kv_heads)
+                self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        if bid is not None:
+            if "num_key_value_heads_per_layer" in self.hparams:
+                n_kv_head = self.hparams["num_key_value_heads_per_layer"][bid]
+            elif "block_configs" in self.hparams:
+                n_kv_head = self._num_kv_heads[bid]
+                n_head = self._num_heads[bid]
+            else:
+                n_kv_head = self.hparams.get("num_key_value_heads")
+        else:
+            n_kv_head = self.hparams.get("num_key_value_heads")
+
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = DeciModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = DeciModel.permute(data_torch, n_head, n_kv_head)
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10000.0)
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 8.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+                assert low_freq_wavelen != high_freq_wavelen
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+
 @Model.register("BitnetForCausalLM")
 class BitnetModel(Model):
     model_arch = gguf.MODEL_ARCH.BITNET
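To make the per-layer mapping in DeciModel.__init__ concrete, a small self-contained sketch with made-up block_configs entries; the head count and hidden size are illustrative assumptions, not values taken from a real Nemotron-51B config:

# Sketch only: n_heads and hidden_size are assumed values for illustration.
n_heads, hidden_size = 64, 8192

def ffn_size(ffn_mult: float, n_embd: int, k: int = 256) -> int:
    # mirrors _ffn_mult_to_intermediate_size + _find_multiple above
    n = int(2 * ffn_mult * n_embd / 3)
    return n if n % k == 0 else n + k - (n % k)

block_configs = [
    {"attention": {"n_heads_in_group": 8, "replace_with_linear": False}, "ffn": {"ffn_mult": 1.3}},     # normal attention
    {"attention": {"n_heads_in_group": None, "replace_with_linear": True}, "ffn": {"ffn_mult": 1.3}},   # linear attention
    {"attention": {"n_heads_in_group": None, "replace_with_linear": False}, "ffn": {"ffn_mult": 0.5}},  # attention-free
]

for cfg in block_configs:
    attn = cfg["attention"]
    if attn["n_heads_in_group"] is None:
        n_kv, n_q = 0, (n_heads if attn["replace_with_linear"] else 0)
    else:
        n_kv, n_q = n_heads // attn["n_heads_in_group"], n_heads
    print((n_kv, n_q, ffn_size(cfg["ffn"]["ffn_mult"], hidden_size)))
# prints (8, 64, 7168), (0, 64, 7168), (0, 0, 2816)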
@@ -2628,7 +2822,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         return [(self.map_tensor_name(name), data_torch)]
 
 
-@Model.register("BertModel", "CamembertModel")
+@Model.register("BertModel", "BertForMaskedLM", "CamembertModel")
 class BertModel(Model):
     model_arch = gguf.MODEL_ARCH.BERT
 
@@ -2694,10 +2888,25 @@ def phantom(tok):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
 
+        if name.startswith("bert."):
+            name = name[5:]
+
+        if name.endswith(".gamma"):
+            name = name[:-6] + ".weight"
+
+        if name.endswith(".beta"):
+            name = name[:-5] + ".bias"
+
         # we are only using BERT for embeddings so we don't need the pooling layer
         if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
             return []  # we don't need these
 
+        if name.startswith("cls.predictions"):
+            return []
+
+        if name.startswith("cls.seq_relationship"):
+            return []
+
         return [(self.map_tensor_name(name), data_torch)]
 
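The BertModel changes above let raw BertForMaskedLM checkpoints (a "bert." prefix, old-style ".gamma"/".beta" LayerNorm names, and "cls.*" heads) convert cleanly. A small sketch of the same renaming applied to hypothetical tensor names:

# Sketch only: the tensor names below are hypothetical examples.
def remap(name: str) -> str | None:
    if name.startswith("bert."):
        name = name[5:]
    if name.endswith(".gamma"):
        name = name[:-6] + ".weight"
    if name.endswith(".beta"):
        name = name[:-5] + ".bias"
    if name.startswith(("cls.predictions", "cls.seq_relationship")):
        return None  # prediction heads are not needed for embeddings
    return name

for n in ("bert.embeddings.LayerNorm.gamma",
          "bert.encoder.layer.0.attention.output.LayerNorm.beta",
          "cls.predictions.transform.dense.weight"):
    print(n, "->", remap(n))
# -> embeddings.LayerNorm.weight, encoder.layer.0.attention.output.LayerNorm.bias, None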

convert_hf_to_gguf_update.py

Lines changed: 2 additions & 0 deletions
@@ -72,6 +72,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
     {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
     {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
+    {"name": "falcon3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon3-7B-Base", },
     {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
     {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
     {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
@@ -105,6 +106,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
     {"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
     {"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
+    {"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
 ]
 