
Commit f2da75d

Merge pull request #108 from menloresearch/update-dev-from-master-2025-05-29-00-08
Sync master with upstream release b5527
2 parents 94e6f6d + 763d06e

24 files changed: 368 additions, 325 deletions

.editorconfig

Lines changed: 1 addition & 1 deletion
@@ -49,6 +49,6 @@ charset = unset
 trim_trailing_whitespace = unset
 insert_final_newline = unset
 
-[tools/mtmd/miniaudio.h]
+[tools/mtmd/vendor/miniaudio.h]
 trim_trailing_whitespace = unset
 insert_final_newline = unset

.github/workflows/build-linux-cross.yml

Lines changed: 15 additions & 15 deletions
@@ -26,12 +26,12 @@ jobs:
           sudo apt-get install -y --no-install-recommends \
             build-essential \
             gcc-14-riscv64-linux-gnu \
-            g++-14-riscv64-linux-gnu \
-            libcurl4-openssl-dev:riscv64
+            g++-14-riscv64-linux-gnu
 
       - name: Build
         run: |
-          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+          cmake -B build -DLLAMA_CURL=OFF \
+            -DCMAKE_BUILD_TYPE=Release \
             -DGGML_OPENMP=OFF \
             -DLLAMA_BUILD_EXAMPLES=ON \
             -DLLAMA_BUILD_TOOLS=ON \
@@ -72,12 +72,12 @@ jobs:
             glslc \
             gcc-14-riscv64-linux-gnu \
             g++-14-riscv64-linux-gnu \
-            libvulkan-dev:riscv64 \
-            libcurl4-openssl-dev:riscv64
+            libvulkan-dev:riscv64
 
       - name: Build
         run: |
-          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+          cmake -B build -DLLAMA_CURL=OFF \
+            -DCMAKE_BUILD_TYPE=Release \
             -DGGML_VULKAN=ON \
             -DGGML_OPENMP=OFF \
             -DLLAMA_BUILD_EXAMPLES=ON \
@@ -118,12 +118,12 @@ jobs:
             build-essential \
             glslc \
             crossbuild-essential-arm64 \
-            libvulkan-dev:arm64 \
-            libcurl4-openssl-dev:arm64
+            libvulkan-dev:arm64
 
       - name: Build
         run: |
-          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+          cmake -B build -DLLAMA_CURL=OFF \
+            -DCMAKE_BUILD_TYPE=Release \
             -DGGML_VULKAN=ON \
             -DGGML_OPENMP=OFF \
             -DLLAMA_BUILD_EXAMPLES=ON \
@@ -163,12 +163,12 @@ jobs:
           sudo apt-get install -y --no-install-recommends \
             build-essential \
             gcc-14-powerpc64le-linux-gnu \
-            g++-14-powerpc64le-linux-gnu \
-            libcurl4-openssl-dev:ppc64el
+            g++-14-powerpc64le-linux-gnu
 
       - name: Build
         run: |
-          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+          cmake -B build -DLLAMA_CURL=OFF \
+            -DCMAKE_BUILD_TYPE=Release \
             -DGGML_OPENMP=OFF \
             -DLLAMA_BUILD_EXAMPLES=ON \
             -DLLAMA_BUILD_TOOLS=ON \
@@ -209,12 +209,12 @@ jobs:
             glslc \
             gcc-14-powerpc64le-linux-gnu \
             g++-14-powerpc64le-linux-gnu \
-            libvulkan-dev:ppc64el \
-            libcurl4-openssl-dev:ppc64el
+            libvulkan-dev:ppc64el
 
       - name: Build
         run: |
-          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+          cmake -B build -DLLAMA_CURL=OFF \
+            -DCMAKE_BUILD_TYPE=Release \
             -DGGML_VULKAN=ON \
             -DGGML_OPENMP=OFF \
             -DLLAMA_BUILD_EXAMPLES=ON \

common/common.cpp

Lines changed: 9 additions & 6 deletions
@@ -903,13 +903,16 @@ struct common_init_result common_init_from_params(common_params & params) {
         ok = false;
     }
 
-    if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
-        LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
-        ok = false;
-    }
+    bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
+    bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
 
-    if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
-        LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
+    if (!has_eos && !has_sep) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+        ok = false;
+    } else if (!has_eos) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
+    } else if (!has_sep) {
+        LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
         ok = false;
     }

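The net effect: SEP is now the hard requirement for reranking, while a missing EOS merely warns and falls back to SEP. A minimal standalone sketch of that decision table (plain C++; the booleans are hypothetical stand-ins for the llama_vocab_eos/llama_vocab_sep checks, not the real API calls):

#include <cstdio>

// Hypothetical stand-ins for llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL
// and llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL.
static bool rerank_tokens_ok(bool has_eos, bool has_sep) {
    if (!has_eos && !has_sep) {
        fprintf(stderr, "no EOS or SEP token: reranking will not work\n");
        return false;
    }
    if (!has_eos) {
        fprintf(stderr, "no EOS token: using SEP token as fallback\n");
    }
    if (has_eos && !has_sep) {
        fprintf(stderr, "no SEP token: reranking will not work\n");
        return false;
    }
    return true;
}

int main() {
    // SEP alone is sufficient (EOS falls back to SEP); EOS alone is not.
    printf("SEP only: %d\n", rerank_tokens_ok(false, true));  // 1
    printf("EOS only: %d\n", rerank_tokens_ok(true, false));  // 0
}
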
convert_hf_to_gguf.py

Lines changed: 24 additions & 11 deletions
@@ -423,19 +423,19 @@ def load_hparams(dir_model: Path):
         try:
             # for security reason, we don't allow loading remote code by default
             # if a model need remote code, we will fallback to config.json
-            return AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
+            config = AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
         except Exception as e:
             logger.warning(f"Failed to load model config from {dir_model}: {e}")
             logger.warning("Trying to load config.json instead")
             with open(dir_model / "config.json", "r", encoding="utf-8") as f:
                 config = json.load(f)
-            if "llm_config" in config:
-                # rename for InternVL
-                config["text_config"] = config["llm_config"]
-            if "thinker_config" in config:
-                # rename for Qwen2.5-Omni
-                config["text_config"] = config["thinker_config"]["text_config"]
-            return config
+        if "llm_config" in config:
+            # rename for InternVL
+            config["text_config"] = config["llm_config"]
+        if "thinker_config" in config:
+            # rename for Qwen2.5-Omni
+            config["text_config"] = config["thinker_config"]["text_config"]
+        return config
 
     @classmethod
     def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
@@ -1207,7 +1207,7 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_audio_block_count(self.find_aparam(self.n_block_keys))
             self.gguf_writer.add_audio_head_count(self.find_aparam(["num_attention_heads"]))
 
-        else:
+        if not self.has_vision_encoder and not self.has_audio_encoder:
             raise ValueError("MmprojModel must have either vision or audio encoder")
 
     def write_vocab(self):
@@ -1841,7 +1841,8 @@ def prepare_tensors(self):
                     "MistralForCausalLM",
                     "MixtralForCausalLM",
                     "VLlama3ForCausalLM",
-                    "LlavaForConditionalGeneration")
+                    "LlavaForConditionalGeneration",
+                    "LlamaModel")
 class LlamaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
     undo_permute = True
@@ -1921,6 +1922,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 
         if is_vision_tensor:
             return [] # skip vision tensors
+        elif self.hf_arch == "LlamaModel":
+            name = "model." + name
         elif name.startswith("model.text_model"):
             name = name.replace("text_model.", "") # for SmolVLM
         elif name.startswith("language_model."):
@@ -2169,6 +2172,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
             # process vision tensors
             if "positional_embedding_vlm" in name and ".weight" not in name:
                 name += ".weight"
+            if "multi_modal_projector.linear_1" in name:
+                # despite the name with number postfix, this is a single fully connected layer
+                return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC], data_torch)]
             return [(self.map_tensor_name(name), data_torch)]
         return []
@@ -3676,7 +3682,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         return [(self.map_tensor_name(name), data_torch)]
 
 
-@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel")
+@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel", "BertForSequenceClassification")
 class BertModel(TextModel):
     model_arch = gguf.MODEL_ARCH.BERT
 
@@ -3739,6 +3745,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         if name.startswith("cls.seq_relationship"):
             return []
 
+        # For BertForSequenceClassification (direct projection layer)
+        if name == "classifier.weight":
+            name = "classifier.out_proj.weight"
+
+        if name == "classifier.bias":
+            name = "classifier.out_proj.bias"
+
         return [(self.map_tensor_name(name), data_torch)]
 
     def _xlmroberta_tokenizer_init(self) -> None:

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 1 deletion
@@ -288,7 +288,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
 
     tests = [
         "ied 4 ½ months",
-        "Führer",
+        "Äpfel",
        "",
        " ",
        "  ",

ggml/src/ggml-cann/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -30,6 +30,7 @@ string(TOLOWER ${SOC_TYPE} SOC_VERSION) # SOC_VERSION need lower
 string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
 set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
 string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)
+message(STATUS "CANN: SOC_VERSION = ${SOC_VERSION}")
 
 if (CANN_INSTALL_DIR)
     # Only Support Linux.

ggml/src/ggml-cuda/fattn-common.cuh

Lines changed: 2 additions & 2 deletions
@@ -623,8 +623,8 @@ static __global__ void flash_attn_combine_results(
     __builtin_assume(tid < D);
 
     extern __shared__ float2 meta[];
-    if (tid < 2*parallel_blocks) {
-        ((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + tid];
+    for (int i = tid; i < 2*parallel_blocks; i += D) {
+        ((float *) meta)[i] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + i];
     }
 
     __syncthreads();

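Why this fix matters: the removed guard copies at most one element per thread, so whenever 2*parallel_blocks exceeded the block size D, the tail of VKQ_meta was never written to shared memory. The block-strided loop covers the whole range for any ratio of sizes. A plain C++ sketch of the same indexing, with each outer iteration playing the role of one thread and the sizes invented for illustration:

#include <cstdio>
#include <vector>

int main() {
    const int D = 4;               // "threads" per block (made small on purpose)
    const int parallel_blocks = 4; // so n = 8 > D: the case the old guard missed
    const int n = 2 * parallel_blocks;

    std::vector<float> src(n), dst(n, -1.0f);
    for (int i = 0; i < n; ++i) src[i] = (float) i;

    for (int tid = 0; tid < D; ++tid) {    // one iteration = one thread
        for (int i = tid; i < n; i += D) { // the strided loop from the patch
            dst[i] = src[i];
        }
    }

    // With the old per-thread copy, dst[4..7] would still be -1 here.
    for (int i = 0; i < n; ++i) printf("dst[%d] = %g\n", i, dst[i]);
}
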
gguf-py/gguf/tensor_mapping.py

Lines changed: 0 additions & 1 deletion
@@ -902,7 +902,6 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_MMPROJ_FC: (
             "model.connector.modality_projection.proj", # SmolVLM
-            "multi_modal_projector.linear_1", # llama 4
         ),
 
         MODEL_TENSOR.V_MMPROJ_MLP: (

src/llama-graph.cpp

Lines changed: 19 additions & 14 deletions
@@ -455,7 +455,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
 }
 
 int64_t llm_graph_context::n_pos_per_embd() const {
-    return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
+    return hparams.rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1;
 }
 
 void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const {
@@ -1562,20 +1562,25 @@ void llm_graph_context::build_pooling(
                 ggml_tensor * inp_cls = build_inp_cls();
                 inp = ggml_get_rows(ctx0, inp, inp_cls);
 
-                // classification head
-                // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
-                GGML_ASSERT(cls != nullptr);
-                GGML_ASSERT(cls_b != nullptr);
-
-                cur = ggml_add (ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b);
-                cur = ggml_tanh(ctx0, cur);
-
-                // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
-                // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
-                if (cls_out) {
+                if (cls != nullptr && cls_b != nullptr) {
+                    // classification head
+                    // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
+                    cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b);
+                    cur = ggml_tanh(ctx0, cur);
+
+                    // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+                    // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
+                    if (cls_out) {
+                        GGML_ASSERT(cls_out_b != nullptr);
+                        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b);
+                    }
+                } else if (cls_out) {
+                    // Single layer classification head (direct projection)
+                    // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
                     GGML_ASSERT(cls_out_b != nullptr);
-
-                    cur = ggml_add (ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b);
+                    cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, inp), cls_out_b);
+                } else {
+                    GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b");
                 }
             } break;
         default:

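For reference, the two head shapes this dispatch distinguishes: the RoBERTa-style head computes tanh(cls·x + cls_b) and then optionally projects with cls_out, while the BertForSequenceClassification-style head projects the pooled embedding directly. A scalar C++ sketch under assumed one-dimensional weights (the real code runs these through ggml_mul_mat on tensors):

#include <cmath>
#include <cstdio>

int main() {
    const float x = 0.5f;                      // pooled embedding, 1-d for illustration
    const float cls_w = 1.2f, cls_b = 0.1f;    // dense layer of the two-layer head
    const float out_w = 0.8f, out_b = -0.2f;   // projection to the rank score

    // cls + cls_b present: tanh dense layer, then the optional cls_out projection
    const float h = std::tanh(cls_w * x + cls_b);
    const float score_two_layer = out_w * h + out_b;

    // only cls_out present: direct projection of the pooled embedding
    const float score_direct = out_w * x + out_b;

    printf("two-layer head: %f, direct head: %f\n", score_two_layer, score_direct);
}
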
src/llama-kv-cache.cpp

Lines changed: 10 additions & 2 deletions
@@ -757,11 +757,19 @@ ggml_tensor * llama_kv_cache_unified::build_rope_shift(
     const auto & yarn_beta_slow = cparams.yarn_beta_slow;
 
     const auto & n_rot = hparams.n_rot;
-    const auto & rope_type = hparams.rope_type;
+    const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE
+        // @ngxson : this is a workaround
+        // for M-RoPE, we want to rotate the whole vector when doing KV shift
+        // a normal RoPE should work, we just need to use the correct ordering
+        // ref: https://github.com/ggml-org/llama.cpp/pull/13870
+        ? LLAMA_ROPE_TYPE_NEOX
+        : hparams.rope_type;
 
     // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
     // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
-    const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2 ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor;
+    const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2
+        ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale))
+        : cparams.yarn_attn_factor;
 
     ggml_tensor * tmp;