
Commit 6140bc0

Merge branch 'master' into ci-build-cross
2 parents bb93531 + 5dec47d commit 6140bc0


64 files changed: 3835 additions, 1520 deletions

ci/README.md

Lines changed: 39 additions & 0 deletions

@@ -26,4 +26,43 @@ GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 # with SYCL support
 source /opt/intel/oneapi/setvars.sh
 GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+
+# with MUSA support
+GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+```
+
+## Running MUSA CI in a Docker Container
+
+Assuming `$PWD` is the root of the `llama.cpp` repository, follow these steps to set up and run MUSA CI in a Docker container:
+
+### 1. Create a local directory to store cached models, configuration files and venv:
+
+```bash
+mkdir -p $HOME/llama.cpp/ci-cache
+```
+
+### 2. Create a local directory to store CI run results:
+
+```bash
+mkdir -p $HOME/llama.cpp/ci-results
+```
+
+### 3. Start a Docker container and run the CI:
+
+```bash
+docker run --privileged -it \
+    -v $HOME/llama.cpp/ci-cache:/ci-cache \
+    -v $HOME/llama.cpp/ci-results:/ci-results \
+    -v $PWD:/ws -w /ws \
+    mthreads/musa:rc3.1.1-devel-ubuntu22.04
 ```
+
+Inside the container, execute the following commands:
+
+```bash
+apt update -y && apt install -y bc cmake git python3.10-venv time unzip wget
+git config --global --add safe.directory /ws
+GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache
+```
+
+This setup ensures that the CI runs within an isolated Docker environment while maintaining cached files and results across runs.

ci/run.sh

Lines changed: 24 additions & 6 deletions

@@ -16,6 +16,9 @@
 # # with VULKAN support
 # GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
+# # with MUSA support
+# GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+#
 
 if [ -z "$2" ]; then
     echo "usage: $0 <output-dir> <mnt-dir>"

@@ -52,13 +55,22 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
         echo "source /opt/intel/oneapi/setvars.sh"
         exit 1
     fi
-
+    # Use only main GPU
+    export ONEAPI_DEVICE_SELECTOR="level_zero:0"
+    # Enable sysman for correct memory reporting
+    export ZES_ENABLE_SYSMAN=1
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
 fi
 
 if [ ! -z ${GG_BUILD_VULKAN} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
 fi
+
+if [ ! -z ${GG_BUILD_MUSA} ]; then
+    # Use qy1 by default (MTT S80)
+    MUSA_ARCH=${MUSA_ARCH:-21}
+    CMAKE_EXTRA="-DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
+fi
 ## helpers
 
 # download a file if it does not exist or if it is outdated

@@ -808,7 +820,7 @@ export LLAMA_LOG_PREFIX=1
 export LLAMA_LOG_TIMESTAMPS=1
 
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
-    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
+    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models
     rm -rf ${SRC}/models-mnt
     mnt_models=${MNT}/models
     mkdir -p ${mnt_models}

@@ -826,16 +838,20 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
 fi
 
 ret=0
-
-test $ret -eq 0 && gg_run ctest_debug
+if [ -z ${GG_BUILD_SYCL} ]; then
+    # SYCL build breaks with debug build flags
+    test $ret -eq 0 && gg_run ctest_debug
+fi
 test $ret -eq 0 && gg_run ctest_release
 
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
     test $ret -eq 0 && gg_run embd_bge_small
     test $ret -eq 0 && gg_run rerank_tiny
 
     if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
-        test $ret -eq 0 && gg_run test_scripts_debug
+        if [ -z ${GG_BUILD_SYCL} ]; then
+            test $ret -eq 0 && gg_run test_scripts_debug
+        fi
         test $ret -eq 0 && gg_run test_scripts_release
     fi
 

@@ -846,7 +862,9 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
         test $ret -eq 0 && gg_run pythia_2_8b
         #test $ret -eq 0 && gg_run open_llama_7b_v2
     fi
-    test $ret -eq 0 && gg_run ctest_with_model_debug
+    if [ -z ${GG_BUILD_SYCL} ]; then
+        test $ret -eq 0 && gg_run ctest_with_model_debug
+    fi
     test $ret -eq 0 && gg_run ctest_with_model_release
 fi
 fi

common/CMakeLists.txt

Lines changed: 2 additions & 2 deletions

@@ -114,8 +114,8 @@ if (LLAMA_LLGUIDANCE)
 
     ExternalProject_Add(llguidance_ext
         GIT_REPOSITORY https://github.com/guidance-ai/llguidance
-        # v0.6.12:
-        GIT_TAG ced1c9023d47ec194fa977932d35ce65c2ebfc09
+        # v0.7.10:
+        GIT_TAG 0309d2a6bf40abda35344a362edc71e06d5009f8
         PREFIX ${CMAKE_BINARY_DIR}/llguidance
         SOURCE_DIR ${LLGUIDANCE_SRC}
         BUILD_IN_SOURCE TRUE

common/llguidance.cpp

Lines changed: 30 additions & 47 deletions

@@ -11,25 +11,24 @@ struct llama_sampler_llg {
     std::string grammar_kind;
     std::string grammar_data;
     LlgTokenizer * tokenizer;
-    LlgConstraint * grammar;
-    LlgMaskResult llg_res;
-    bool has_llg_res;
+    LlgMatcher * grammar;
 };
 
-static LlgConstraint * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
-                                             const char * grammar_data) {
+static LlgMatcher * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
+                                          const char * grammar_data) {
     LlgConstraintInit cinit;
     llg_constraint_init_set_defaults(&cinit, tokenizer);
     const char * log_level = getenv("LLGUIDANCE_LOG_LEVEL");
     if (log_level && *log_level) {
         cinit.log_stderr_level = atoi(log_level);
     }
-    auto c = llg_new_constraint_any(&cinit, grammar_kind, grammar_data);
-    if (llg_get_error(c)) {
-        LOG_ERR("llg error: %s\n", llg_get_error(c));
-        llg_free_constraint(c);
+    auto c = llg_new_matcher(&cinit, grammar_kind, grammar_data);
+    if (llg_matcher_get_error(c)) {
+        LOG_ERR("llg error: %s\n", llg_matcher_get_error(c));
+        llg_free_matcher(c);
         return nullptr;
     }
+
     return c;
 }
 

@@ -40,54 +39,39 @@ static const char * llama_sampler_llg_name(const llama_sampler * /*smpl*/) {
 static void llama_sampler_llg_accept_impl(llama_sampler * smpl, llama_token token) {
     auto * ctx = (llama_sampler_llg *) smpl->ctx;
     if (ctx->grammar) {
-        LlgCommitResult res;
-        llg_commit_token(ctx->grammar, token, &res);
-        ctx->has_llg_res = false;
+        llg_matcher_consume_token(ctx->grammar, token);
     }
 }
 
 static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_llg *) smpl->ctx;
     if (ctx->grammar) {
-        if (!ctx->has_llg_res) {
-            if (llg_compute_mask(ctx->grammar, &ctx->llg_res) == 0) {
-                ctx->has_llg_res = true;
+        const uint32_t * mask = llg_matcher_get_mask(ctx->grammar);
+        if (mask == nullptr) {
+            if (llg_matcher_compute_mask(ctx->grammar) == 0) {
+                mask = llg_matcher_get_mask(ctx->grammar);
             } else {
-                LOG_ERR("llg error: %s\n", llg_get_error(ctx->grammar));
-                llg_free_constraint(ctx->grammar);
+                LOG_ERR("llg error: %s\n", llg_matcher_get_error(ctx->grammar));
+                llg_free_matcher(ctx->grammar);
                 ctx->grammar = nullptr;
+                return;
             }
         }
-        if (ctx->has_llg_res) {
-            if (ctx->llg_res.is_stop) {
-                for (size_t i = 0; i < cur_p->size; ++i) {
-                    if (!llama_vocab_is_eog(ctx->vocab, cur_p->data[i].id)) {
-                        cur_p->data[i].logit = -INFINITY;
-                    }
-                }
-            } else {
-                const uint32_t * mask = ctx->llg_res.sample_mask;
-                for (size_t i = 0; i < cur_p->size; ++i) {
-                    auto token = cur_p->data[i].id;
-                    if ((mask[token / 32] & (1 << (token % 32))) == 0) {
-                        cur_p->data[i].logit = -INFINITY;
-                    }
-                }
+
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            auto token = cur_p->data[i].id;
+            if ((mask[token / 32] & (1 << (token % 32))) == 0) {
+                cur_p->data[i].logit = -INFINITY;
             }
         }
     }
 }
 
 static void llama_sampler_llg_reset(llama_sampler * smpl) {
     auto * ctx = (llama_sampler_llg *) smpl->ctx;
-    if (!ctx->grammar) {
-        return;
+    if (ctx->grammar) {
+        llg_matcher_reset(ctx->grammar);
     }
-
-    auto * grammar_new = llama_sampler_llg_new(ctx->tokenizer, ctx->grammar_kind.c_str(), ctx->grammar_data.c_str());
-    llg_free_constraint(ctx->grammar);
-    ctx->grammar = grammar_new;
-    ctx->has_llg_res = false;
 }
 
 static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {

@@ -102,7 +86,7 @@ static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
     if (ctx->grammar) {
         result_ctx->grammar_kind = ctx->grammar_kind;
         result_ctx->grammar_data = ctx->grammar_data;
-        result_ctx->grammar = llg_clone_constraint(ctx->grammar);
+        result_ctx->grammar = llg_clone_matcher(ctx->grammar);
         result_ctx->tokenizer = llg_clone_tokenizer(ctx->tokenizer);
     }
 }

@@ -114,7 +98,7 @@ static void llama_sampler_llg_free(llama_sampler * smpl) {
     const auto * ctx = (llama_sampler_llg *) smpl->ctx;
 
     if (ctx->grammar) {
-        llg_free_constraint(ctx->grammar);
+        llg_free_matcher(ctx->grammar);
         llg_free_tokenizer(ctx->tokenizer);
     }
 

@@ -239,25 +223,24 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * g
             /* .grammar_data = */ grammar_data,
             /* .tokenizer = */ tokenizer,
             /* .grammar = */ llama_sampler_llg_new(tokenizer, grammar_kind, grammar_data),
-            /* .llg_res = */ {},
-            /* .has_llg_res = */ false,
         };
+        if (ctx->grammar) {
+            GGML_ASSERT(((size_t) llama_vocab_n_tokens(vocab) + 31) / 32 * 4 ==
+                        llg_matcher_get_mask_byte_size(ctx->grammar));
+        }
     } else {
         *ctx = {
             /* .vocab = */ vocab,
             /* .grammar_kind = */ {},
             /* .grammar_data = */ {},
             /* .tokenizer = */ nullptr,
             /* .grammar = */ nullptr,
-            /* .llg_res = */ {},
-            /* .has_llg_res = */ false,
        };
     }
 
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_llg_i,
-        /* .ctx = */ ctx
-    );
+        /* .ctx = */ ctx);
 }
 
 #else
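
Note on the mask layout used by the rewritten `llama_sampler_llg_apply` and the new `GGML_ASSERT`: the matcher exposes the set of allowed tokens as a packed bitmask, one bit per vocab token stored in 32-bit words, which is why the expected size is `(n_vocab + 31) / 32 * 4` bytes. A minimal, self-contained C++ sketch of that indexing (the toy vocab size and token ids below are illustrative, not taken from the diff):

```cpp
// Packed token mask: one bit per vocab token, stored in 32-bit words,
// so a vocab of n_vocab tokens needs (n_vocab + 31) / 32 * 4 bytes.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const uint32_t n_vocab = 100;                        // toy vocab size (illustrative)
    std::vector<uint32_t> mask((n_vocab + 31) / 32, 0);  // 4 words = 16 bytes

    // mark tokens 3 and 42 as allowed; every other bit stays 0 (forbidden)
    for (uint32_t tok : {3u, 42u}) {
        mask[tok / 32] |= 1u << (tok % 32);
    }

    printf("mask size: %zu bytes\n", mask.size() * sizeof(uint32_t));

    // the same test the sampler applies: a cleared bit means the logit is set to -INFINITY
    for (uint32_t tok = 0; tok < n_vocab; ++tok) {
        if ((mask[tok / 32] & (1u << (tok % 32))) == 0) {
            continue; // token is masked out
        }
        printf("token %u is allowed\n", tok);
    }
    return 0;
}
```

With this 100-token toy vocab the mask occupies (100 + 31) / 32 = 4 words, i.e. 16 bytes, matching the formula checked by the assertion above.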

convert_hf_to_gguf.py

Lines changed: 34 additions & 6 deletions

@@ -705,6 +705,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e":
             # ref: https://huggingface.co/Xenova/gpt-4o
             res = "gpt-4o"
+        if chkhsh == "7dec86086fcc38b66b7bc1575a160ae21cf705be7718b9d5598190d7c12db76f":
+            # ref: https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k
+            res = "superbpe"
 
         if res is None:
             logger.warning("\n")

@@ -1749,7 +1752,7 @@ class Mistral3Model(LlamaModel):
 
     # we need to merge the text_config into the root level of hparams
     def __init__(self, *args, **kwargs):
-        hparams = Model.load_hparams(kwargs["dir_model"])
+        hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0])
         if "text_config" in hparams:
             hparams = {**hparams, **hparams["text_config"]}
         kwargs["hparams"] = hparams

@@ -2266,7 +2269,7 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
 
 
-@Model.register("Qwen2VLForConditionalGeneration")
+@Model.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
 class Qwen2VLModel(Model):
     model_arch = gguf.MODEL_ARCH.QWEN2VL
 

@@ -3382,7 +3385,7 @@ class Gemma3Model(Model):
 
     # we need to merge the text_config into the root level of hparams
    def __init__(self, *args, **kwargs):
-        hparams = Model.load_hparams(kwargs["dir_model"])
+        hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0])
         if "text_config" in hparams:
             hparams = {**hparams, **hparams["text_config"]}
         kwargs["hparams"] = hparams

@@ -3800,8 +3803,6 @@ def set_gguf_parameters(self):
     _tok_embd = None
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        del bid  # unused
-
         output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
         tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)
 

@@ -3811,6 +3812,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             logger.debug("A_log --> A ==> " + new_name)
             data_torch = -torch.exp(data_torch)
 
+        # [4 1 8192 1] -> [4 8192 1 1]
+        if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
+            data_torch = data_torch.squeeze()
+
         # assuming token_embd.weight is seen before output.weight
         if self._tok_embd is not None and new_name == output_name:
             if torch.equal(self._tok_embd, data_torch):

@@ -4414,6 +4419,29 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@Model.register("PLMForCausalLM")
+class PLMModel(Model):
+    model_arch = gguf.MODEL_ARCH.PLM
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
+        self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_value_length(hparams["v_head_dim"])
+        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+
 @Model.register("T5WithLMHeadModel")
 @Model.register("T5ForConditionalGeneration")
 @Model.register("MT5ForConditionalGeneration")

@@ -5355,7 +5383,7 @@ def main() -> None:
             logger.error(f"Model {model_architecture} is not supported")
             sys.exit(1)
 
-        model_instance = model_class(dir_model=dir_model, ftype=output_type, fname_out=fname_out,
+        model_instance = model_class(dir_model, output_type, fname_out,
                                      is_big_endian=args.bigendian, use_temp_file=args.use_temp_file,
                                      eager=args.no_lazy,
                                      metadata_override=args.metadata, model_name=args.model_name,

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions

@@ -110,6 +110,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
     {"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
     {"name": "gpt-4o", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o", },
+    {"name": "superbpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k", },
 ]
 