
Commit ad6ab17

Merge branch 'ggml-org:master' into quant_types
2 parents: a82a8c1 + d3bd719


50 files changed: +1687 / -217 lines

.devops/llama-cli-cann.Dockerfile

Lines changed: 2 additions & 2 deletions
@@ -1,12 +1,12 @@
-ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
+ARG ASCEND_VERSION=8.1.RC1.alpha001-910b-openeuler22.03-py3.10
 
 FROM ascendai/cann:$ASCEND_VERSION AS build
 
 WORKDIR /app
 
 COPY . .
 
-RUN yum install -y gcc g++ cmake make
+RUN yum install -y gcc g++ cmake make libcurl-devel
 ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
 ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
 ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}

.github/workflows/build.yml

Lines changed: 2 additions & 2 deletions
@@ -1771,7 +1771,7 @@ jobs:
     strategy:
       matrix:
         cann:
-          - '8.0.rc3.beta1-910b-openeuler22.03-py3.10'
+          - '8.1.RC1.alpha001-910b-openeuler22.03-py3.10'
         device:
           - 'ascend910b3'
         build:
@@ -1784,7 +1784,7 @@ jobs:
       - name: Dependencies
         run: |
          yum update -y
-         yum install -y git gcc gcc-c++ make cmake
+         yum install -y git gcc gcc-c++ make cmake libcurl-devel
 
       - name: Build
         run: |

.github/workflows/docker.yml

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ jobs:
           # Multi-stage build
           - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false}
           - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
-          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: true}
           - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
           - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
           # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete

README.md

Lines changed: 1 addition & 7 deletions
@@ -9,13 +9,6 @@
 
 Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
 
-> [!IMPORTANT]
-> New `llama.cpp` package location: [ggml-org/llama.cpp](https://github.com/ggml-org/llama.cpp/pkgs/container/llama.cpp)
->
-> Update your container URLs to: `ghcr.io/ggml-org/llama.cpp`
->
-> More info: https://github.com/ggml-org/llama.cpp/discussions/11801
-
 ## Recent API changes
 
 - [Changelog for `libllama` API](https://github.com/ggml-org/llama.cpp/issues/9289)
@@ -247,6 +240,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [Vulkan](docs/build.md#vulkan) | GPU |
 | [CANN](docs/build.md#cann) | Ascend NPU |
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
+| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/examples/rpc) | All |
 
 ## Building the project

common/arg.cpp

Lines changed: 2 additions & 0 deletions
@@ -163,6 +163,8 @@ struct common_hf_file_res {
 # if !defined(PATH_MAX)
 # define PATH_MAX MAX_PATH
 # endif
+#elif defined(_AIX)
+#include <sys/limits.h>
 #else
 #include <sys/syslimits.h>
 #endif

convert_hf_to_gguf.py

Lines changed: 74 additions & 4 deletions
@@ -714,6 +714,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "96a5f08be6259352137b512d4157e333e21df7edd3fcd152990608735a65b224":
             # ref: https://huggingface.co/inclusionAI/Ling-lite
             res = "bailingmoe"
+        if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406":
+            # ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
+            res = "llama4"
 
         if res is None:
             logger.warning("\n")
@@ -1608,6 +1611,7 @@ def prepare_tensors(self):
 @Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
 class LlamaModel(Model):
     model_arch = gguf.MODEL_ARCH.LLAMA
+    undo_permute = True
 
     def set_vocab(self):
         try:
@@ -1672,10 +1676,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
 
-        if name.endswith(("q_proj.weight", "q_proj.bias")):
-            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
-        if name.endswith(("k_proj.weight", "k_proj.bias")):
-            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+        if self.undo_permute:
+            if name.endswith(("q_proj.weight", "q_proj.bias")):
+                data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+            if name.endswith(("k_proj.weight", "k_proj.bias")):
+                data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
 
         # process the experts separately
         if name.find("block_sparse_moe.experts") != -1:
@@ -1752,6 +1757,61 @@ def prepare_tensors(self):
             raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@Model.register("Llama4ForConditionalGeneration")
+class Llama4Model(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.LLAMA4
+    has_vision: bool = False
+    undo_permute = False
+
+    # TODO @ngxson : avoid duplicate this code everywhere by at least support "text_config"
+    # same with llama, but we need to merge the text_config into the root level of hparams
+    def __init__(self, *args, **kwargs):
+        hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0])
+        if "text_config" in hparams:
+            hparams = {**hparams, **hparams["text_config"]}
+        kwargs["hparams"] = hparams
+        super().__init__(*args, **kwargs)
+        if "vision_config" in hparams:
+            logger.info("Has vision encoder, but it will be ignored")
+            self.has_vision = True
+        # IMPORTANT: the normal "intermediate_size" is renamed to "intermediate_size_mlp", we need to undo this
+        self.hparams["intermediate_size_moe"] = self.hparams["intermediate_size"]
+        self.hparams["intermediate_size"] = self.hparams["intermediate_size_mlp"]
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+        self.gguf_writer.add_add_bos_token(True)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_interleave_moe_layer_step(self.hparams["interleave_moe_layer_step"])
+        self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        name = name.replace("language_model.", "")
+        name = name.replace("feed_forward.", "mlp.")  # a bit hacky for now
+        name = name.replace(".router.weight", ".gate.weight")  # a bit hacky for now
+
+        # split the gate_up into gate and up
+        if "gate_up_proj" in name:
+            name_up = name.replace("gate_up_proj", "up_proj.weight")
+            name_gate = name.replace("gate_up_proj", "gate_proj.weight")
+            dim_half = data_torch.shape[-1] // 2
+            gate_proj_weight, up_proj_weight = data_torch.transpose(-1, -2).split(dim_half, dim=-2)
+            return [
+                (self.map_tensor_name(name_gate), gate_proj_weight),
+                (self.map_tensor_name(name_up), up_proj_weight)
+            ]
+
+        if name.endswith("down_proj"):
+            name += ".weight"
+            data_torch = data_torch.transpose(-1, -2)
+
+        if "multi_modal_projector" in name or "vision_model" in name:
+            return []
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @Model.register("Mistral3ForConditionalGeneration")
 class Mistral3Model(LlamaModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
@@ -2399,6 +2459,16 @@ def prepare_tensors(self):
             raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@Model.register("Qwen3ForCausalLM")
+class Qwen3Model(Qwen2Model):
+    model_arch = gguf.MODEL_ARCH.QWEN3
+
+
+@Model.register("Qwen3MoeForCausalLM")
+class Qwen3MoeModel(Qwen2MoeModel):
+    model_arch = gguf.MODEL_ARCH.QWEN3MOE
+
+
 @Model.register("GPT2LMHeadModel")
 class GPT2Model(Model):
     model_arch = gguf.MODEL_ARCH.GPT2
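
A note on the `gate_up_proj` handling in `Llama4Model.modify_tensors` above: the checkpoint stores each expert's gate and up projections fused along the last dimension, so the converter transposes the tensor and splits that dimension in half. Below is a minimal sketch of the same reshaping on a toy tensor; the shapes are illustrative only, not taken from the real model.

```python
import torch

# Toy fused expert weight laid out like the checkpoint tensor: [n_expert, n_embd, 2 * n_ff]
n_expert, n_embd, n_ff = 2, 4, 3
gate_up = torch.arange(n_expert * n_embd * 2 * n_ff, dtype=torch.float32)
gate_up = gate_up.reshape(n_expert, n_embd, 2 * n_ff)

# Same steps as the converter: half-width of the fused dimension, transpose the
# last two dims, then split the former last dimension into gate and up halves.
dim_half = gate_up.shape[-1] // 2
gate_w, up_w = gate_up.transpose(-1, -2).split(dim_half, dim=-2)

print(gate_w.shape, up_w.shape)  # torch.Size([2, 3, 4]) torch.Size([2, 3, 4])
```

The transpose mirrors the separate `down_proj` branch, which is also transposed before being handed to `map_tensor_name`; both tensors appear to be stored with input and output dimensions swapped relative to what the GGUF writer expects.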

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
@@ -113,6 +113,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "superbpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k", },
     {"name": "trillion", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", },
     {"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
+    {"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
 ]
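
For context, the `chkhsh` value that `get_vocab_base_pre()` in convert_hf_to_gguf.py matches for `llama4` is generated by this update script by hashing the token ids that each tokenizer produces for a fixed test string. A rough sketch of that idea is below; `chktxt` here is a stand-in for the shared test string defined in the script, not its actual contents, and the exact hashing recipe should be read from the script itself.

```python
from hashlib import sha256

from transformers import AutoTokenizer  # assumes the transformers package is available

# Stand-in for the real test string, which mixes digits, unusual whitespace,
# emoji and non-Latin text so that pre-tokenizer differences change the hash.
chktxt = "example text 3333333"

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-4-Scout-17B-16E-Instruct")
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
print(chkhsh)  # with the real chktxt this should match the hash registered for "llama4"
```

Any change to the pre-tokenizer (merges, regex, added tokens) changes the token ids and therefore the hash, which is how the converter detects an unknown or modified tokenizer.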

docs/backend/SYCL.md

Lines changed: 4 additions & 4 deletions
@@ -425,13 +425,13 @@ Examples:
 - Use device 0:
 
 ```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
 ```
 
 - Use multiple devices:
 
 ```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
 ```
 
 *Notes:*
@@ -697,13 +697,13 @@ Examples:
 - Use device 0:
 
 ```
-build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
+build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
 ```
 
 - Use multiple devices:
 
 ```
-build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
+build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
 ```

examples/llava/clip.cpp

Lines changed: 19 additions & 5 deletions
@@ -331,7 +331,6 @@ struct clip_ctx {
     float image_std[3];
     bool use_gelu = false;
     bool use_silu = false;
-    int32_t ftype = 1;
 
     struct gguf_context * ctx_gguf = nullptr;
     struct ggml_context * ctx_data = nullptr;
@@ -380,6 +379,7 @@ struct clip_ctx {
         if (backend_cpu != backend) {
             ggml_backend_free(backend_cpu);
         }
+        clip_image_size_free(load_image_size);
     }
 };
 
@@ -1141,9 +1141,6 @@ struct clip_model_loader {
 
     // print gguf info
     {
-        int ftype = -1;
-        get_u32(KEY_FTYPE, ftype, false);
-        const std::string ftype_str = ggml_type_name(static_cast<ggml_type>(ftype));
         std::string name;
         get_string(KEY_NAME, name, false);
         std::string description;
@@ -1154,7 +1151,6 @@ struct clip_model_loader {
         LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx_gguf.get()));
         LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
         LOG_INF("%s: n_kv: %d\n", __func__, (int)gguf_get_n_kv(ctx_gguf.get()));
-        LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str());
         LOG_INF("\n");
     }
 
@@ -1618,6 +1614,12 @@ struct clip_image_f32 * clip_image_f32_init() {
     return new clip_image_f32();
 }
 
+void clip_image_size_free(struct clip_image_size * load_image_size) {
+    if (load_image_size == nullptr) {
+        return;
+    }
+    delete load_image_size;
+}
 void clip_image_u8_free(struct clip_image_u8 * img) { delete img; }
 void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
 void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) {
@@ -2270,6 +2272,9 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
 }
 
 void clip_free(clip_ctx * ctx) {
+    if (ctx == nullptr) {
+        return;
+    }
     delete ctx;
 }
 
@@ -2840,10 +2845,19 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
 bool clip_is_glm(const struct clip_ctx * ctx) {
     return ctx->has_glm_projector;
 }
+
 bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
     return ctx->has_qwen2vl_merger;
 }
 
+bool clip_is_llava(const struct clip_ctx * ctx) {
+    return ctx->has_llava_projector;
+}
+
+bool clip_is_gemma3(const struct clip_ctx * ctx) {
+    return ctx->proj_type == PROJECTOR_TYPE_GEMMA3;
+}
+
 // Determine the number of encoder layers to iterate over
 int get_deepest_feature_layer(const struct clip_ctx * ctx) {
     // Get the index of the second to last layer; this is the

examples/llava/clip.h

Lines changed: 3 additions & 0 deletions
@@ -77,6 +77,7 @@ CLIP_API struct clip_image_size * clip_image_size_init();
 CLIP_API struct clip_image_u8 * clip_image_u8_init ();
 CLIP_API struct clip_image_f32 * clip_image_f32_init();
 
+CLIP_API void clip_image_size_free (struct clip_image_size * img_size);
 CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
 CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
 CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
@@ -106,6 +107,8 @@ CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out
 CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
+CLIP_API bool clip_is_llava(const struct clip_ctx * ctx);
+CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx);
 
 CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
