
Commit 3c1e250

Merge branch 'master' into imatrix
2 parents: 2b45dca + 9008328


44 files changed, +3244 −1515 lines changed

.devops/nix/package.nix

Lines changed: 2 additions & 1 deletion
@@ -47,6 +47,7 @@ let
   inherit (lib)
     cmakeBool
     cmakeFeature
+    optionalAttrs
     optionals
     strings
     ;
@@ -197,7 +198,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
   ];

   # Environment variables needed for ROCm
-  env = optionals useRocm {
+  env = optionalAttrs useRocm {
     ROCM_PATH = "${rocmPackages.clr}";
     HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
   };

common/common.cpp

Lines changed: 9 additions & 0 deletions
@@ -448,6 +448,15 @@ void string_replace_all(std::string & s, const std::string & search, const std::
 bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
     return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
 }
+
+bool string_remove_suffix(std::string & str, const std::string_view & suffix) {
+    bool has_suffix = string_ends_with(str, suffix);
+    if (has_suffix) {
+        str = str.substr(0, str.size() - suffix.size());
+    }
+    return has_suffix;
+}
+
 size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
     if (!str.empty() && !stop.empty()) {
         const char text_last_char = str.back();
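
For reference, a minimal, self-contained sketch of how the new helper behaves (the `main` harness and the sample string below are illustrative and not part of the commit; the two helper bodies are copied from the diff above):

```cpp
#include <cstdio>
#include <string>
#include <string_view>

// Helper pair as added to common/common.cpp by this commit.
static bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
    return str.size() >= suffix.size() && str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
}

static bool string_remove_suffix(std::string & str, const std::string_view & suffix) {
    bool has_suffix = string_ends_with(str, suffix);
    if (has_suffix) {
        str = str.substr(0, str.size() - suffix.size());
    }
    return has_suffix;
}

int main() {
    std::string text = "Hello world<|endoftext|>";  // sample text, purely illustrative

    // First call strips the suffix in place and reports that it was present.
    if (string_remove_suffix(text, "<|endoftext|>")) {
        std::printf("stripped: '%s'\n", text.c_str());  // stripped: 'Hello world'
    }

    // A second call finds nothing to strip, leaves the string untouched and returns false.
    std::printf("again: %d\n", string_remove_suffix(text, "<|endoftext|>") ? 1 : 0);  // again: 0
}
```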

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -535,6 +535,7 @@ static bool string_starts_with(const std::string & str,

 // While we wait for C++20's std::string::ends_with...
 bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
+bool string_remove_suffix(std::string & str, const std::string_view & suffix);
 size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);

 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);

convert_hf_to_gguf.py

Lines changed: 161 additions & 1 deletion
@@ -843,6 +843,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
             # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
             res = "lfm2"
+        if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
+            # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
+            res = "exaone4"

         if res is None:
             logger.warning("\n")
@@ -2861,7 +2864,8 @@ def set_gguf_parameters(self):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         num_heads = self.hparams["num_attention_heads"]
         num_kv_heads = self.hparams["num_key_value_heads"]
-        head_dim = self.hparams["head_dim"]
+        if (head_dim := self.hparams.get("head_dim")) is None:
+            head_dim = self.hparams["hidden_size"] // num_heads

         if "ernie." in name:
             name = name.replace("ernie.", "model.")
@@ -2894,6 +2898,93 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]


+@ModelBase.register("Ernie4_5_MoeForCausalLM")
+class Ernie4_5MoeModel(Ernie4_5Model):
+    model_arch = gguf.MODEL_ARCH.ERNIE4_5_MOE
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._experts = [{} for _ in range(self.block_count)]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_expert_count(self.hparams["moe_num_experts"])
+        self.gguf_writer.add_expert_used_count(self.hparams["moe_k"])
+        self.gguf_writer.add_interleave_moe_layer_step(self.hparams["moe_layer_interval"])
+        self.gguf_writer.add_leading_dense_block_count(self.hparams["moe_layer_start_index"])
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+        if (shared_expert_count := self.hparams.get('moe_num_shared_experts')) is not None:
+            self.gguf_writer.add_expert_shared_count(shared_expert_count)
+            if shared_expert_count > 0 and (shared_expert_intermediate_size := self.hparams.get('intermediate_size')) is not None and (num_key_value_heads := self.hparams.get('num_key_value_heads')) is not None:
+                self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size // num_key_value_heads)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Modify correction bias name as in DeepseekV2
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+
+        # skip Multi-Token Prediction (MTP) layers (again, same as DeepseekV2)
+        match = re.match(r"model.mtp_block.(\d+)", name)
+        if match:
+            return []
+
+        # skip all other MTP tensors for now
+        match = re.match(r"model.mtp_emb_norm.(\d+)", name)
+        if match:
+            return []
+
+        match = re.match(r"model.mtp_hidden_norm.(\d+)", name)
+        if match:
+            return []
+
+        match = re.match(r"model.mtp_linear_proj.(\d+)", name)
+        if match:
+            return []
+
+        # process the experts separately
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["moe_num_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["gate_proj", "up_proj", "down_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename_to_retrieve])
+                        del self._experts[bid][ename_to_retrieve]
+
+                    data_torch = torch.stack(datas, dim=0)
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+                    new_name = self.map_tensor_name(merged_name)
+                    tensors.append((new_name, data_torch))
+
+                return tensors
+            else:
+                return []
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @ModelBase.register(
     "Qwen2VLModel",
     "Qwen2VLForConditionalGeneration",
@@ -6692,6 +6783,75 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
             yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))


+@ModelBase.register("Exaone4ForCausalLM")
+class Exaone4Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.EXAONE4
+
+    def set_vocab(self):
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        if hparams.get("sliding_window") is not None:
+            self.gguf_writer.add_sliding_window(hparams["sliding_window"])
+            if "layer_types" in hparams:
+                self.gguf_writer.add_sliding_window_pattern([t == "sliding_attention" for t in hparams["layer_types"]])
+            elif "sliding_window_pattern" in hparams:
+                sliding_window_pattern = []
+                if isinstance(hparams["sliding_window_pattern"], str):  # e.g. LLLG
+                    for i in range(hparams["num_hidden_layers"]):
+                        sliding_window_pattern.append(hparams["sliding_window_pattern"][i % len(hparams["sliding_window_pattern"])] == "L")
+                if isinstance(hparams["sliding_window_pattern"], int):  # e.g. 4
+                    for i in range(hparams["num_hidden_layers"]):
+                        sliding_window_pattern.append((i + 1) % hparams["sliding_window_pattern"] != 0)
+                if len(sliding_window_pattern) == hparams["num_hidden_layers"]:
+                    self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
+
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10_000.0)
+                if (dim := self.hparams.get("head_dim")) is None:
+                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 16.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
+
+
 @ModelBase.register("GraniteForCausalLM")
 class GraniteModel(LlamaModel):
     """Conversion for IBM's GraniteForCausalLM"""

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
@@ -129,6 +129,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "a.x-4.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", },
     {"name": "midm-2.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
     {"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
+    {"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions

docs/build.md

Lines changed: 18 additions & 22 deletions
@@ -305,9 +305,8 @@ On Linux it is possible to use unified memory architecture (UMA) to share main m

 ## Vulkan

-**Windows**
-
-### w64devkit
+### For Windows Users:
+**w64devkit**

 Download and extract [`w64devkit`](https://github.com/skeeto/w64devkit/releases).

@@ -334,7 +333,7 @@ cmake -B build -DGGML_VULKAN=ON
 cmake --build build --config Release
 ```

-### Git Bash MINGW64
+**Git Bash MINGW64**

 Download and install [`Git-SCM`](https://git-scm.com/downloads/win) with the default settings

@@ -357,7 +356,8 @@ Now you can load the model in conversation mode using `Vulkan`
 build/bin/Release/llama-cli -m "[PATH TO MODEL]" -ngl 100 -c 16384 -t 10 -n -2 -cnv
 ```

-### MSYS2
+**MSYS2**
+
 Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
 ```sh
 pacman -S git \
@@ -373,9 +373,9 @@ cmake -B build -DGGML_VULKAN=ON
 cmake --build build --config Release
 ```

-**With docker**:
+### For Docker users:

-You don't need to install Vulkan SDK. It will be installed inside the container.
+You don't need to install the Vulkan SDK. It will be installed inside the container.

 ```sh
 # Build the image
@@ -385,32 +385,28 @@ docker build -t llama-cpp-vulkan --target light -f .devops/vulkan.Dockerfile .
 docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
 ```

-**Without docker**:
+### For Linux users:

-Firstly, you need to make sure you have installed [Vulkan SDK](https://vulkan.lunarg.com/doc/view/latest/linux/getting_started_ubuntu.html)
+First, follow the official [Getting Started with the Linux Tarball Vulkan SDK](https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html) guide.

-For example, on Ubuntu 22.04 (jammy), use the command below:
+> [!IMPORTANT]
+> After completing the first step, ensure that you have used the `source` command on the `setup_env.sh` file inside of the Vulkan SDK in your current terminal session. Otherwise, the build won't work. Additionally, if you close out of your terminal, you must perform this step again if you intend to perform a build. However, there are ways to make this persistent. Refer to the Vulkan SDK guide linked in the first step for more information about any of this.

+Second, after verifying that you have done everything in the Vulkan SDK guide provided in the first step, run the following command to verify that everything is set up correctly:
 ```bash
-wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add -
-wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
-apt update -y
-apt-get install -y vulkan-sdk
-# To verify the installation, use the command below:
 vulkaninfo
 ```

-Alternatively your package manager might be able to provide the appropriate libraries.
-For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
-For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.
-
-Then, build llama.cpp using the cmake command below:
-
+Then, assuming you have `cd` into your llama.cpp folder and there are no errors with running `vulkaninfo`, you can proceed to build llama.cpp using the CMake commands below:
 ```bash
 cmake -B build -DGGML_VULKAN=1
 cmake --build build --config Release
+```
+
+Finally, after finishing your build, you should be able to do this:
+```bash
 # Test the output binary (with "-ngl 33" to offload all layers to GPU)
-./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
+./build/bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4

 # You should see in the output, ggml_vulkan detected your GPU. For example:
 # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32

examples/parallel/parallel.cpp

Lines changed: 13 additions & 1 deletion
@@ -184,6 +184,9 @@ int main(int argc, char ** argv) {
     // extra text to insert in each client's prompt in order to make it larger
     const int32_t n_junk = std::max(1, params.n_junk);

+    // signed seed, use negative values to indicate different seeds for the different clients
+    const int32_t & sseed = params.sampling.seed;
+
     // init llama.cpp
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -219,12 +222,21 @@ int main(int argc, char ** argv) {

     const int n_ctx = llama_n_ctx(ctx);

+    if (sseed >= 0) {
+        LOG_INF("%s: initializing all samplers with the same RNG seed: %d (use a negative seed to have different seeds)\n", __func__, sseed);
+    } else {
+        LOG_INF("%s: initializing samplers with different RNG seeds, starting from %d\n", __func__, sseed);
+    }
+
     std::vector<client> clients(n_clients);
     for (size_t i = 0; i < clients.size(); ++i) {
         auto & client = clients[i];
         client.id = i;
         client.smpl = common_sampler_init(model, params.sampling);
-        //params.sampling.seed++;
+
+        if (sseed < 0) {
+            params.sampling.seed--;
+        }
     }
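
To make the seeding policy above concrete, here is a small standalone sketch (illustrative only; just the decrement rule mirrors the loop above) of which RNG seed each client's sampler ends up with:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int n_clients = 4;

    for (int32_t sseed : { 42, -1 }) {
        int32_t seed = sseed;
        std::vector<int32_t> client_seeds;

        for (int i = 0; i < n_clients; ++i) {
            client_seeds.push_back(seed);  // seed in effect when this client's sampler is created
            if (sseed < 0) {
                seed--;                    // a negative request yields a distinct seed per client
            }
        }

        std::printf("sseed = %3d -> seeds:", sseed);
        for (int32_t s : client_seeds) {
            std::printf(" %d", s);
        }
        std::printf("\n");  // prints: 42 -> 42 42 42 42, and -1 -> -1 -2 -3 -4
    }
}
```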

ggml/src/ggml-alloc.c

Lines changed: 0 additions & 15 deletions
@@ -22,21 +22,6 @@ static bool ggml_is_view(const struct ggml_tensor * t) {
     return t->view_src != NULL;
 }

-static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
-    if (a->type != b->type) {
-        return false;
-    }
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        if (a->ne[i] != b->ne[i]) {
-            return false;
-        }
-        if (a->nb[i] != b->nb[i]) {
-            return false;
-        }
-    }
-    return true;
-}
-
 // ops that return true for this function must not use restrict pointers for their backend implementations
 static bool ggml_op_can_inplace(enum ggml_op op) {
     switch (op) {
