
Commit f2da75d

Merge pull request #108 from menloresearch/update-dev-from-master-2025-05-29-00-08
Sync master with upstream release b5527
2 parents 94e6f6d + 763d06e

24 files changed: 368 additions, 325 deletions

.editorconfig

Lines changed: 1 addition & 1 deletion
@@ -49,6 +49,6 @@ charset = unset
 trim_trailing_whitespace = unset
 insert_final_newline = unset
 
-[tools/mtmd/miniaudio.h]
+[tools/mtmd/vendor/miniaudio.h]
 trim_trailing_whitespace = unset
 insert_final_newline = unset

.github/workflows/build-linux-cross.yml

Lines changed: 15 additions & 15 deletions
@@ -26,12 +26,12 @@ jobs:
           sudo apt-get install -y --no-install-recommends \
             build-essential \
             gcc-14-riscv64-linux-gnu \
-            g++-14-riscv64-linux-gnu \
-            libcurl4-openssl-dev:riscv64
+            g++-14-riscv64-linux-gnu
 
       - name: Build
         run: |
-          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+          cmake -B build -DLLAMA_CURL=OFF \
+            -DCMAKE_BUILD_TYPE=Release \
             -DGGML_OPENMP=OFF \
             -DLLAMA_BUILD_EXAMPLES=ON \
             -DLLAMA_BUILD_TOOLS=ON \
@@ -72,12 +72,12 @@ jobs:
             glslc \
             gcc-14-riscv64-linux-gnu \
             g++-14-riscv64-linux-gnu \
-            libvulkan-dev:riscv64 \
-            libcurl4-openssl-dev:riscv64
+            libvulkan-dev:riscv64
 
       - name: Build
         run: |
-          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+          cmake -B build -DLLAMA_CURL=OFF \
+            -DCMAKE_BUILD_TYPE=Release \
             -DGGML_VULKAN=ON \
             -DGGML_OPENMP=OFF \
             -DLLAMA_BUILD_EXAMPLES=ON \
@@ -118,12 +118,12 @@ jobs:
             build-essential \
             glslc \
             crossbuild-essential-arm64 \
-            libvulkan-dev:arm64 \
-            libcurl4-openssl-dev:arm64
+            libvulkan-dev:arm64
 
       - name: Build
         run: |
-          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+          cmake -B build -DLLAMA_CURL=OFF \
+            -DCMAKE_BUILD_TYPE=Release \
             -DGGML_VULKAN=ON \
             -DGGML_OPENMP=OFF \
             -DLLAMA_BUILD_EXAMPLES=ON \
@@ -163,12 +163,12 @@ jobs:
           sudo apt-get install -y --no-install-recommends \
             build-essential \
             gcc-14-powerpc64le-linux-gnu \
-            g++-14-powerpc64le-linux-gnu \
-            libcurl4-openssl-dev:ppc64el
+            g++-14-powerpc64le-linux-gnu
 
       - name: Build
         run: |
-          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+          cmake -B build -DLLAMA_CURL=OFF \
+            -DCMAKE_BUILD_TYPE=Release \
             -DGGML_OPENMP=OFF \
             -DLLAMA_BUILD_EXAMPLES=ON \
             -DLLAMA_BUILD_TOOLS=ON \
@@ -209,12 +209,12 @@ jobs:
             glslc \
             gcc-14-powerpc64le-linux-gnu \
             g++-14-powerpc64le-linux-gnu \
-            libvulkan-dev:ppc64el \
-            libcurl4-openssl-dev:ppc64el
+            libvulkan-dev:ppc64el
 
       - name: Build
         run: |
-          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+          cmake -B build -DLLAMA_CURL=OFF \
+            -DCMAKE_BUILD_TYPE=Release \
             -DGGML_VULKAN=ON \
             -DGGML_OPENMP=OFF \
             -DLLAMA_BUILD_EXAMPLES=ON \

common/common.cpp

Lines changed: 9 additions & 6 deletions
@@ -903,13 +903,16 @@ struct common_init_result common_init_from_params(common_params & params) {
         ok = false;
     }
 
-    if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
-        LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
-        ok = false;
-    }
+    bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
+    bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
 
-    if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
-        LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
+    if (!has_eos && !has_sep) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+        ok = false;
+    } else if (!has_eos) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
+    } else if (!has_sep) {
+        LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
         ok = false;
     }

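The net effect: SEP is now the hard requirement for reranking, while a missing EOS merely warns and falls back to SEP. A minimal standalone sketch of that decision table (plain C++; the booleans are hypothetical stand-ins for the llama_vocab_eos/llama_vocab_sep checks, not the real API calls):

#include <cstdio>

// Hypothetical stand-ins for llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL
// and llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL.
static bool rerank_tokens_ok(bool has_eos, bool has_sep) {
    if (!has_eos && !has_sep) {
        fprintf(stderr, "no EOS or SEP token: reranking will not work\n");
        return false;
    }
    if (!has_eos) {
        fprintf(stderr, "no EOS token: using SEP token as fallback\n");
    }
    if (has_eos && !has_sep) {
        fprintf(stderr, "no SEP token: reranking will not work\n");
        return false;
    }
    return true;
}

int main() {
    // SEP alone is sufficient (EOS falls back to SEP); EOS alone is not.
    printf("SEP only: %d\n", rerank_tokens_ok(false, true));  // 1
    printf("EOS only: %d\n", rerank_tokens_ok(true, false));  // 0
}
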
convert_hf_to_gguf.py

Lines changed: 24 additions & 11 deletions
@@ -423,19 +423,19 @@ def load_hparams(dir_model: Path):
         try:
             # for security reason, we don't allow loading remote code by default
             # if a model need remote code, we will fallback to config.json
-            return AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
+            config = AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
         except Exception as e:
             logger.warning(f"Failed to load model config from {dir_model}: {e}")
             logger.warning("Trying to load config.json instead")
             with open(dir_model / "config.json", "r", encoding="utf-8") as f:
                 config = json.load(f)
-            if "llm_config" in config:
-                # rename for InternVL
-                config["text_config"] = config["llm_config"]
-            if "thinker_config" in config:
-                # rename for Qwen2.5-Omni
-                config["text_config"] = config["thinker_config"]["text_config"]
-            return config
+        if "llm_config" in config:
+            # rename for InternVL
+            config["text_config"] = config["llm_config"]
+        if "thinker_config" in config:
+            # rename for Qwen2.5-Omni
+            config["text_config"] = config["thinker_config"]["text_config"]
+        return config
 
     @classmethod
     def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
@@ -1207,7 +1207,7 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_audio_block_count(self.find_aparam(self.n_block_keys))
             self.gguf_writer.add_audio_head_count(self.find_aparam(["num_attention_heads"]))
 
-        else:
+        if not self.has_vision_encoder and not self.has_audio_encoder:
             raise ValueError("MmprojModel must have either vision or audio encoder")
 
     def write_vocab(self):
@@ -1841,7 +1841,8 @@ def prepare_tensors(self):
                     "MistralForCausalLM",
                     "MixtralForCausalLM",
                     "VLlama3ForCausalLM",
-                    "LlavaForConditionalGeneration")
+                    "LlavaForConditionalGeneration",
+                    "LlamaModel")
 class LlamaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
     undo_permute = True
@@ -1921,6 +1922,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 
         if is_vision_tensor:
             return [] # skip vision tensors
+        elif self.hf_arch == "LlamaModel":
+            name = "model." + name
         elif name.startswith("model.text_model"):
             name = name.replace("text_model.", "") # for SmolVLM
         elif name.startswith("language_model."):
@@ -2169,6 +2172,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
             # process vision tensors
             if "positional_embedding_vlm" in name and ".weight" not in name:
                 name += ".weight"
+            if "multi_modal_projector.linear_1" in name:
+                # despite the name with number postfix, this is a single fully connected layer
+                return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC], data_torch)]
             return [(self.map_tensor_name(name), data_torch)]
         return []
@@ -3676,7 +3682,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         return [(self.map_tensor_name(name), data_torch)]
 
 
-@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel")
+@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel", "BertForSequenceClassification")
 class BertModel(TextModel):
     model_arch = gguf.MODEL_ARCH.BERT
 
@@ -3739,6 +3745,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         if name.startswith("cls.seq_relationship"):
             return []
 
+        # For BertForSequenceClassification (direct projection layer)
+        if name == "classifier.weight":
+            name = "classifier.out_proj.weight"
+
+        if name == "classifier.bias":
+            name = "classifier.out_proj.bias"
+
         return [(self.map_tensor_name(name), data_torch)]
 
     def _xlmroberta_tokenizer_init(self) -> None:

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 1 deletion
@@ -288,7 +288,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
 
     tests = [
         "ied 4 ½ months",
-        "Führer",
+        "Äpfel",
        "",
        " ",
        "  ",

ggml/src/ggml-cann/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -30,6 +30,7 @@ string(TOLOWER ${SOC_TYPE} SOC_VERSION) # SOC_VERSION need lower
 string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
 set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
 string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)
+message(STATUS "CANN: SOC_VERSION = ${SOC_VERSION}")
 
 if (CANN_INSTALL_DIR)
     # Only Support Linux.

ggml/src/ggml-cuda/fattn-common.cuh

Lines changed: 2 additions & 2 deletions
@@ -623,8 +623,8 @@ static __global__ void flash_attn_combine_results(
     __builtin_assume(tid < D);
 
     extern __shared__ float2 meta[];
-    if (tid < 2*parallel_blocks) {
-        ((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + tid];
+    for (int i = tid; i < 2*parallel_blocks; i += D) {
+        ((float *) meta)[i] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + i];
     }
 
     __syncthreads();

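Why this fix matters: the removed guard copies at most one element per thread, so whenever 2*parallel_blocks exceeded the block size D, the tail of VKQ_meta was never written to shared memory. The block-strided loop covers the whole range for any ratio of sizes. A plain C++ sketch of the same indexing, with each outer iteration playing the role of one thread and the sizes invented for illustration:

#include <cstdio>
#include <vector>

int main() {
    const int D = 4;               // "threads" per block (made small on purpose)
    const int parallel_blocks = 4; // so n = 8 > D: the case the old guard missed
    const int n = 2 * parallel_blocks;

    std::vector<float> src(n), dst(n, -1.0f);
    for (int i = 0; i < n; ++i) src[i] = (float) i;

    for (int tid = 0; tid < D; ++tid) {    // one iteration = one thread
        for (int i = tid; i < n; i += D) { // the strided loop from the patch
            dst[i] = src[i];
        }
    }

    // With the old per-thread copy, dst[4..7] would still be -1 here.
    for (int i = 0; i < n; ++i) printf("dst[%d] = %g\n", i, dst[i]);
}
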
gguf-py/gguf/tensor_mapping.py

Lines changed: 0 additions & 1 deletion
@@ -902,7 +902,6 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_MMPROJ_FC: (
             "model.connector.modality_projection.proj", # SmolVLM
-            "multi_modal_projector.linear_1", # llama 4
         ),
 
         MODEL_TENSOR.V_MMPROJ_MLP: (

src/llama-graph.cpp

Lines changed: 19 additions & 14 deletions
@@ -455,7 +455,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
 }
 
 int64_t llm_graph_context::n_pos_per_embd() const {
-    return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
+    return hparams.rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1;
 }
 
 void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const {
@@ -1562,20 +1562,25 @@ void llm_graph_context::build_pooling(
                 ggml_tensor * inp_cls = build_inp_cls();
                 inp = ggml_get_rows(ctx0, inp, inp_cls);
 
-                // classification head
-                // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
-                GGML_ASSERT(cls != nullptr);
-                GGML_ASSERT(cls_b != nullptr);
-
-                cur = ggml_add (ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b);
-                cur = ggml_tanh(ctx0, cur);
-
-                // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
-                // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
-                if (cls_out) {
+                if (cls != nullptr && cls_b != nullptr) {
+                    // classification head
+                    // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
+                    cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b);
+                    cur = ggml_tanh(ctx0, cur);
+
+                    // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+                    // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
+                    if (cls_out) {
+                        GGML_ASSERT(cls_out_b != nullptr);
+                        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b);
+                    }
+                } else if (cls_out) {
+                    // Single layer classification head (direct projection)
+                    // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
                     GGML_ASSERT(cls_out_b != nullptr);
-
-                    cur = ggml_add (ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b);
+                    cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, inp), cls_out_b);
+                } else {
+                    GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b");
                 }
             } break;
         default:

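For reference, the two head shapes this dispatch distinguishes: the RoBERTa-style head computes tanh(cls·x + cls_b) and then optionally projects with cls_out, while the BertForSequenceClassification-style head projects the pooled embedding directly. A scalar C++ sketch under assumed one-dimensional weights (the real code runs these through ggml_mul_mat on tensors):

#include <cmath>
#include <cstdio>

int main() {
    const float x = 0.5f;                      // pooled embedding, 1-d for illustration
    const float cls_w = 1.2f, cls_b = 0.1f;    // dense layer of the two-layer head
    const float out_w = 0.8f, out_b = -0.2f;   // projection to the rank score

    // cls + cls_b present: tanh dense layer, then the optional cls_out projection
    const float h = std::tanh(cls_w * x + cls_b);
    const float score_two_layer = out_w * h + out_b;

    // only cls_out present: direct projection of the pooled embedding
    const float score_direct = out_w * x + out_b;

    printf("two-layer head: %f, direct head: %f\n", score_two_layer, score_direct);
}
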
src/llama-kv-cache.cpp

Lines changed: 10 additions & 2 deletions
@@ -757,11 +757,19 @@ ggml_tensor * llama_kv_cache_unified::build_rope_shift(
     const auto & yarn_beta_slow = cparams.yarn_beta_slow;
 
     const auto & n_rot = hparams.n_rot;
-    const auto & rope_type = hparams.rope_type;
+    const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE
+        // @ngxson : this is a workaround
+        // for M-RoPE, we want to rotate the whole vector when doing KV shift
+        // a normal RoPE should work, we just need to use the correct ordering
+        // ref: https://github.com/ggml-org/llama.cpp/pull/13870
+        ? LLAMA_ROPE_TYPE_NEOX
+        : hparams.rope_type;
 
     // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
     // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
-    const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2 ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor;
+    const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2
+        ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale))
+        : cparams.yarn_attn_factor;
 
     ggml_tensor * tmp;