
Commit 9a5c859

Merge branch 'master' into Nexes_PR_Quants

2 parents: b39aa8e + c8ddce8

File tree: 24 files changed, +685 / -72 lines changed


.github/workflows/bench.yml renamed to .github/workflows/bench.yml.disabled

Lines changed: 3 additions & 0 deletions
@@ -1,3 +1,6 @@
+# TODO: there have been some issues with the workflow, so disabling for now
+# https://github.com/ggerganov/llama.cpp/issues/7893
+#
 # Benchmark
 name: Benchmark

README.md

Lines changed: 1 addition & 0 deletions
@@ -105,6 +105,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
 - [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
 - [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
+- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)

 (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))

common/common.cpp

Lines changed: 34 additions & 8 deletions
@@ -110,8 +110,34 @@ int32_t cpu_get_num_physical_cores() {
     if (result == 0) {
         return num_physical_cores;
     }
-#elif defined(_WIN32)
-    //TODO: Implement
+#elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
+    // TODO: windows + arm64 + mingw64
+    unsigned int n_threads_win = std::thread::hardware_concurrency();
+    unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4;
+
+    DWORD buffer_size = 0;
+    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) {
+        if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+            return default_threads;
+        }
+    }
+
+    std::vector<char> buffer(buffer_size);
+    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
+        return default_threads;
+    }
+
+    int32_t num_physical_cores = 0;
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
+    while (buffer_size > 0) {
+        if (info->Relationship == RelationProcessorCore) {
+            num_physical_cores += info->Processor.GroupCount;
+        }
+        buffer_size -= info->Size;
+        info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char*>(info) + info->Size);
+    }
+
+    return num_physical_cores > 0 ? num_physical_cores : default_threads;
 #endif
     unsigned int n_threads = std::thread::hardware_concurrency();
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
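The new Windows branch walks the RelationProcessorCore records returned by GetLogicalProcessorInformationEx and, if the query fails, falls back to a thread-count heuristic: use every logical thread when there are at most four, otherwise half of them, and assume four when nothing can be detected. The same fallback heuristic, sketched in Python purely for illustration (this assumes the third-party psutil package and is not part of the commit):

```python
import os

import psutil  # assumption: third-party dependency, used only for this sketch


def default_thread_count() -> int:
    # Prefer the physical-core count when the platform reports it.
    physical = psutil.cpu_count(logical=False)
    if physical:
        return physical
    # Otherwise mirror the C++ fallback: all logical threads up to four,
    # half of them beyond that, and four if nothing can be detected.
    logical = os.cpu_count() or 0
    if logical == 0:
        return 4
    return logical if logical <= 4 else logical // 2
```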
@@ -1735,7 +1761,13 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
     if (params.n_threads_batch != -1) {
         os << " (n_threads_batch = " << params.n_threads_batch << ")";
     }
+#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
+    // TODO: windows + arm64 + mingw64
+    DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
+    os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
+#else
     os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
+#endif

     return os.str();
 }
@@ -2710,12 +2742,6 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
     return text;
 }

-bool llama_should_add_bos_token(const llama_model * model) {
-    const int add_bos = llama_add_bos_token(model);
-
-    return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
-}
-
 //
 // Chat template utils
 //

common/common.h

Lines changed: 0 additions & 4 deletions
@@ -382,10 +382,6 @@ std::string llama_detokenize(
         const std::vector<llama_token> & tokens,
         bool special = true);

-// Uses the value from the model metadata if possible, otherwise
-// defaults to true when model type is SPM, otherwise false.
-bool llama_should_add_bos_token(const llama_model * model);
-
 //
 // Chat template utils
 //

convert_hf_to_gguf.py

Lines changed: 122 additions & 1 deletion
@@ -590,6 +590,15 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
             # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
             res = "smollm"
+        if chkhsh == "3c30d3ad1d6b64202cd222813e7736c2db6e1bd6d67197090fc1211fbc612ae7":
+            # ref: https://huggingface.co/bigscience/bloom
+            res = "bloom"
+        if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21":
+            # ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small
+            res = "gpt3-finnish"
+        if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
+            # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
+            res = "exaone"

         if res is None:
             logger.warning("\n")
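Each chkhsh value above is a fingerprint of a tokenizer's pre-tokenization behaviour: the conversion script encodes a fixed test string and hashes the resulting token ids, so models whose tokenizers split text identically map to the same res name. A rough sketch of that idea (assuming the transformers package; the real test string and plumbing live in convert_hf_to_gguf.py, so treat this as illustrative rather than the exact implementation):

```python
from hashlib import sha256

from transformers import AutoTokenizer  # assumption: available in the conversion environment


def pretokenizer_hash(repo: str, test_text: str) -> str:
    # Tokenizers with different pre-tokenizer rules produce different id
    # sequences for the same text, so the hash identifies the rule set.
    tok = AutoTokenizer.from_pretrained(repo)
    ids = tok.encode(test_text)
    return sha256(str(ids).encode()).hexdigest()


# e.g. pretokenizer_hash("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", some_test_text)
```

New entries in this block are normally regenerated by convert_hf_to_gguf_update.py, which downloads each listed tokenizer and recomputes its hash.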
@@ -893,7 +902,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         return tensors


-@Model.register("BloomForCausalLM")
+@Model.register("BloomForCausalLM", "BloomModel")
 class BloomModel(Model):
     model_arch = gguf.MODEL_ARCH.BLOOM

@@ -3734,6 +3743,118 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         name = name.removeprefix("transformer.")
         return [(self.map_tensor_name(name), data_torch)]

+
+@Model.register("NemotronForCausalLM")
+class NemotronModel(Model):
+    model_arch = gguf.MODEL_ARCH.NEMOTRON
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+        self.gguf_writer.add_pad_token_id(0)
+        self.gguf_writer.add_unk_token_id(1)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        f_norm_eps = self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon", "norm_eps"])
+        self.gguf_writer.add_layer_norm_eps(f_norm_eps)
+
+        # * Partial RoPE
+        rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
+        n_embd = self.find_hparam(["hidden_size", "n_embd"])
+        n_head = self.find_hparam(["num_attention_heads", "n_head"])
+        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
+
+        # * RopeScaling for Nemotron
+        if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        else:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side
+        # model.layers.{l}.input_layernorm.weight
+        # model.layers.{l}.post_attention_layernorm.weight
+        # model.norm.weight
+        if name.endswith("norm.weight"):
+            data_torch = data_torch + 1
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
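The +1 adjustment in NemotronModel.modify_tensors works because layernorm1p computes x_hat * (1 + w) + b; exporting w + 1 therefore lets a plain LayerNorm kernel reproduce the same output with no GGML-side changes. A minimal PyTorch check of that equivalence (illustrative sketch, not part of the diff):

```python
import torch
import torch.nn.functional as F

x = torch.randn(2, 8)
w = torch.randn(8)   # layernorm1p weight as stored in the HF checkpoint
b = torch.randn(8)

x_hat = F.layer_norm(x, (8,))               # normalization only, no affine part
layernorm1p = x_hat * (1 + w) + b           # what the Nemotron module computes
standard = F.layer_norm(x, (8,), w + 1, b)  # plain LayerNorm with the exported weight
assert torch.allclose(layernorm1p, standard, atol=1e-5)
```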
@Model.register("ExaoneForCausalLM")
3789+
class ExaoneModel(Model):
3790+
model_arch = gguf.MODEL_ARCH.EXAONE
3791+
3792+
def set_gguf_parameters(self):
3793+
hparams = self.hparams
3794+
3795+
assert(hparams["activation_function"] == "silu")
3796+
3797+
max_position_embeddings = hparams["max_position_embeddings"]
3798+
embed_dim = hparams["hidden_size"]
3799+
num_heads = hparams["num_attention_heads"]
3800+
num_kv_heads = hparams.get("num_key_value_heads", num_heads)
3801+
layer_norm_eps = hparams["layer_norm_epsilon"]
3802+
intermediate_size = hparams["intermediate_size"] if "intermediate_size" in hparams else 4 * embed_dim
3803+
num_layers = hparams["num_layers"]
3804+
# ignore for now as EXAONE-3.0-7.8B-Instruct attentino_dropout is 0.0
3805+
# attention_dropout_rate = hparams["attention_dropout"]
3806+
# ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0
3807+
# embed_dropout_rate = hparams["embed_dropout"]
3808+
self.gguf_writer.add_embedding_length(embed_dim)
3809+
self.gguf_writer.add_head_count(num_heads)
3810+
self.gguf_writer.add_head_count_kv(num_kv_heads)
3811+
self.gguf_writer.add_context_length(max_position_embeddings)
3812+
self.gguf_writer.add_layer_norm_rms_eps(layer_norm_eps)
3813+
self.gguf_writer.add_feed_forward_length(intermediate_size)
3814+
self.gguf_writer.add_block_count(num_layers)
3815+
self.gguf_writer.add_file_type(self.ftype)
3816+
3817+
if (rope_theta := self.hparams.get("rope_theta")) is not None:
3818+
self.gguf_writer.add_rope_freq_base(rope_theta)
3819+
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
3820+
rotary_factor = rotary_factor if rotary_factor is not None else 1.0
3821+
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
3822+
if hparams.get("rope_scaling") is not None and "factor" in hparams["rope_scaling"]:
3823+
if hparams["rope_scaling"].get("type") == "linear":
3824+
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
3825+
self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
3826+
3827+
def prepare_tensors(self):
3828+
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
3829+
if rope_scaling.get("rope_type", '').lower() == "llama3":
3830+
base = self.hparams.get("rope_theta", 10000.0)
3831+
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
3832+
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
3833+
3834+
factor = rope_scaling.get("factor", 8.0)
3835+
low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
3836+
high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
3837+
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
3838+
3839+
low_freq_wavelen = old_context_len / low_freq_factor
3840+
high_freq_wavelen = old_context_len / high_freq_factor
3841+
assert low_freq_wavelen != high_freq_wavelen
3842+
3843+
rope_factors = []
3844+
for freq in freqs:
3845+
wavelen = 2 * math.pi / freq
3846+
if wavelen < high_freq_wavelen:
3847+
rope_factors.append(1)
3848+
elif wavelen > low_freq_wavelen:
3849+
rope_factors.append(factor)
3850+
else:
3851+
smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
3852+
rope_factors.append(1 / ((1 - smooth) / factor + smooth))
3853+
3854+
self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
3855+
3856+
super().prepare_tensors()
3857+
37373858
###### CONVERSION LOGIC ######
37383859

37393860

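The llama3-style loop in ExaoneModel.prepare_tensors leaves short-wavelength frequencies untouched (factor 1), scales wavelengths beyond the original context window by `factor`, and interpolates smoothly in between. A small numeric sketch re-running the same formula outside the converter, using the default hyperparameters from the code above (illustrative only):

```python
import math

factor, low_freq_factor, high_freq_factor, old_ctx = 8.0, 1.0, 4.0, 8192
low_freq_wavelen = old_ctx / low_freq_factor     # 8192.0
high_freq_wavelen = old_ctx / high_freq_factor   # 2048.0

def rope_factor(wavelen: float) -> float:
    # Same three-way split as the converter: keep, scale by `factor`, or blend.
    if wavelen < high_freq_wavelen:
        return 1.0
    if wavelen > low_freq_wavelen:
        return factor
    smooth = (old_ctx / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
    return 1.0 / ((1.0 - smooth) / factor + smooth)

print(rope_factor(1024.0))   # 1.0  (short wavelength: untouched)
print(rope_factor(4096.0))   # 2.4  (transition band: smoothly interpolated)
print(rope_factor(16384.0))  # 8.0  (long wavelength: fully scaled)
```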
convert_hf_to_gguf_update.py

Lines changed: 3 additions & 0 deletions
@@ -94,6 +94,9 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
     {"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
     {"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
+    {'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
+    {'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
+    {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
 ]

examples/cvector-generator/cvector-generator.cpp

Lines changed: 1 addition & 1 deletion
@@ -271,7 +271,7 @@ struct tokenized_prompt {
     size_t max_seq_len;

     tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
-        const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+        const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
         tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true);
         tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true);
         max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());

examples/eval-callback/eval-callback.cpp

Lines changed: 1 addition & 1 deletion
@@ -127,7 +127,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
 }

 static bool run(llama_context * ctx, const gpt_params & params) {
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));

     std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);

examples/imatrix/imatrix.cpp

Lines changed: 2 additions & 2 deletions
@@ -434,8 +434,8 @@ static void process_logits(
 }

 static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
+    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
     const int n_ctx = llama_n_ctx(ctx);

     auto tim1 = std::chrono::high_resolution_clock::now();

examples/infill/infill.cpp

Lines changed: 2 additions & 2 deletions
@@ -203,8 +203,8 @@ int main(int argc, char ** argv) {
         LOG_TEE("\n");
         LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
     }
-    const bool add_bos = llama_should_add_bos_token(model);
-    GGML_ASSERT(llama_add_eos_token(model) != 1);
+    const bool add_bos = llama_add_bos_token(model);
+    GGML_ASSERT(!llama_add_eos_token(model));
     LOG("add_bos: %d\n", add_bos);

     std::vector<llama_token> embd_inp;
