diff --git a/.editorconfig b/.editorconfig
index 316448c7e93ce..47111c72d7529 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -49,6 +49,6 @@ charset = unset
 trim_trailing_whitespace = unset
 insert_final_newline = unset
 
-[tools/mtmd/miniaudio.h]
+[tools/mtmd/vendor/miniaudio.h]
 trim_trailing_whitespace = unset
 insert_final_newline = unset
diff --git a/.github/workflows/build-linux-cross.yml b/.github/workflows/build-linux-cross.yml
index dbd31e589be3e..92dc41f9d729c 100644
--- a/.github/workflows/build-linux-cross.yml
+++ b/.github/workflows/build-linux-cross.yml
@@ -26,12 +26,12 @@ jobs:
           sudo apt-get install -y --no-install-recommends \
                   build-essential \
                   gcc-14-riscv64-linux-gnu \
-                  g++-14-riscv64-linux-gnu \
-                  libcurl4-openssl-dev:riscv64
+                  g++-14-riscv64-linux-gnu
 
       - name: Build
         run: |
-          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+          cmake -B build -DLLAMA_CURL=OFF \
+                         -DCMAKE_BUILD_TYPE=Release \
                          -DGGML_OPENMP=OFF \
                          -DLLAMA_BUILD_EXAMPLES=ON \
                          -DLLAMA_BUILD_TOOLS=ON \
@@ -72,12 +72,12 @@ jobs:
                   glslc \
                   gcc-14-riscv64-linux-gnu \
                   g++-14-riscv64-linux-gnu \
-                  libvulkan-dev:riscv64 \
-                  libcurl4-openssl-dev:riscv64
+                  libvulkan-dev:riscv64
 
       - name: Build
         run: |
-          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+          cmake -B build -DLLAMA_CURL=OFF \
+                         -DCMAKE_BUILD_TYPE=Release \
                          -DGGML_VULKAN=ON \
                          -DGGML_OPENMP=OFF \
                          -DLLAMA_BUILD_EXAMPLES=ON \
@@ -118,12 +118,12 @@ jobs:
                   build-essential \
                   glslc \
                   crossbuild-essential-arm64 \
-                  libvulkan-dev:arm64 \
-                  libcurl4-openssl-dev:arm64
+                  libvulkan-dev:arm64
 
       - name: Build
         run: |
-          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+          cmake -B build -DLLAMA_CURL=OFF \
+                         -DCMAKE_BUILD_TYPE=Release \
                          -DGGML_VULKAN=ON \
                          -DGGML_OPENMP=OFF \
                          -DLLAMA_BUILD_EXAMPLES=ON \
@@ -163,12 +163,12 @@ jobs:
           sudo apt-get install -y --no-install-recommends \
                   build-essential \
                   gcc-14-powerpc64le-linux-gnu \
-                  g++-14-powerpc64le-linux-gnu \
-                  libcurl4-openssl-dev:ppc64el
+                  g++-14-powerpc64le-linux-gnu
 
       - name: Build
         run: |
-          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+          cmake -B build -DLLAMA_CURL=OFF \
+                         -DCMAKE_BUILD_TYPE=Release \
                          -DGGML_OPENMP=OFF \
                          -DLLAMA_BUILD_EXAMPLES=ON \
                          -DLLAMA_BUILD_TOOLS=ON \
@@ -209,12 +209,12 @@ jobs:
                   glslc \
                   gcc-14-powerpc64le-linux-gnu \
                   g++-14-powerpc64le-linux-gnu \
-                  libvulkan-dev:ppc64el \
-                  libcurl4-openssl-dev:ppc64el
+                  libvulkan-dev:ppc64el
 
       - name: Build
         run: |
-          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+          cmake -B build -DLLAMA_CURL=OFF \
+                         -DCMAKE_BUILD_TYPE=Release \
                          -DGGML_VULKAN=ON \
                          -DGGML_OPENMP=OFF \
                          -DLLAMA_BUILD_EXAMPLES=ON \
diff --git a/common/common.cpp b/common/common.cpp
index 2afa9b2d641d4..d80c47bc393fa 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -903,13 +903,16 @@ struct common_init_result common_init_from_params(common_params & params) {
             ok = false;
         }
 
-        if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
-            ok = false;
-        }
+        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
+        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
 
-        if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: vocab does not have a  SEP token, reranking will not work\n", __func__);
+        if (!has_eos && !has_sep) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+            ok = false;
+        } else if (!has_eos) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
+        } else if (!has_sep) {
+            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
             ok = false;
         }
 
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index a015ecee08328..227ae7bc235ee 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -423,19 +423,19 @@ def load_hparams(dir_model: Path):
         try:
             # for security reason, we don't allow loading remote code by default
             # if a model need remote code, we will fallback to config.json
-            return AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
+            config = AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
         except Exception as e:
             logger.warning(f"Failed to load model config from {dir_model}: {e}")
             logger.warning("Trying to load config.json instead")
             with open(dir_model / "config.json", "r", encoding="utf-8") as f:
                 config = json.load(f)
-                if "llm_config" in config:
-                    # rename for InternVL
-                    config["text_config"] = config["llm_config"]
-                if "thinker_config" in config:
-                    # rename for Qwen2.5-Omni
-                    config["text_config"] = config["thinker_config"]["text_config"]
-                return config
+        if "llm_config" in config:
+            # rename for InternVL
+            config["text_config"] = config["llm_config"]
+        if "thinker_config" in config:
+            # rename for Qwen2.5-Omni
+            config["text_config"] = config["thinker_config"]["text_config"]
+        return config
 
     @classmethod
     def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
@@ -1207,7 +1207,7 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_audio_block_count(self.find_aparam(self.n_block_keys))
             self.gguf_writer.add_audio_head_count(self.find_aparam(["num_attention_heads"]))
 
-        else:
+        if not self.has_vision_encoder and not self.has_audio_encoder:
             raise ValueError("MmprojModel must have either vision or audio encoder")
 
     def write_vocab(self):
@@ -1841,7 +1841,8 @@ def prepare_tensors(self):
     "MistralForCausalLM",
     "MixtralForCausalLM",
     "VLlama3ForCausalLM",
-    "LlavaForConditionalGeneration")
+    "LlavaForConditionalGeneration",
+    "LlamaModel")
 class LlamaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
     undo_permute = True
@@ -1921,6 +1922,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
         if is_vision_tensor:
             return [] # skip vision tensors
+        elif self.hf_arch == "LlamaModel":
+            name = "model." + name
         elif name.startswith("model.text_model"):
             name = name.replace("text_model.", "") # for SmolVLM
         elif name.startswith("language_model."):
@@ -2169,6 +2172,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             # process vision tensors
             if "positional_embedding_vlm" in name and ".weight" not in name:
                 name += ".weight"
+            if "multi_modal_projector.linear_1" in name:
+                # despite the name with number postfix, this is a single fully connected layer
+                return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC], data_torch)]
             return [(self.map_tensor_name(name), data_torch)]
         return []
 
@@ -3676,7 +3682,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]
 
 
-@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel")
+@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel", "BertForSequenceClassification")
 class BertModel(TextModel):
     model_arch = gguf.MODEL_ARCH.BERT
 
@@ -3739,6 +3745,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if name.startswith("cls.seq_relationship"):
             return []
 
+        # For BertForSequenceClassification (direct projection layer)
+        if name == "classifier.weight":
+            name = "classifier.out_proj.weight"
+
+        if name == "classifier.bias":
+            name = "classifier.out_proj.bias"
+
         return [(self.map_tensor_name(name), data_torch)]
 
     def _xlmroberta_tokenizer_init(self) -> None:
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index 5993a4c9836b5..c66ec547bca26 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -288,7 +288,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
 
 tests = [
     "ied 4 ½ months",
-    "Führer",
+    "Äpfel",
     "",
     " ",
     "  ",
diff --git a/ggml/src/ggml-cann/CMakeLists.txt b/ggml/src/ggml-cann/CMakeLists.txt
index 0d8e483b291c7..7742b39153f88 100755
--- a/ggml/src/ggml-cann/CMakeLists.txt
+++ b/ggml/src/ggml-cann/CMakeLists.txt
@@ -30,6 +30,7 @@ string(TOLOWER ${SOC_TYPE} SOC_VERSION) # SOC_VERSION need lower
 string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
 set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
 string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)
+message(STATUS "CANN: SOC_VERSION =  ${SOC_VERSION}")
 
 if (CANN_INSTALL_DIR)
     # Only Support Linux.
diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
index a4fbd823638fa..cfab2b5ebaccc 100644
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -623,8 +623,8 @@ static __global__ void flash_attn_combine_results(
     __builtin_assume(tid < D);
 
     extern __shared__ float2 meta[];
-    if (tid < 2*parallel_blocks) {
-        ((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + tid];
+    for (int i = tid; i < 2*parallel_blocks; i += D) {
+        ((float *) meta)[i] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + i];
     }
 
     __syncthreads();
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 000ffd00615b5..48167dd648c0a 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -902,7 +902,6 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_MMPROJ_FC: (
             "model.connector.modality_projection.proj", # SmolVLM
-            "multi_modal_projector.linear_1", # llama 4
         ),
 
         MODEL_TENSOR.V_MMPROJ_MLP: (
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index cdd5887de961c..7c383e2eb3f27 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -455,7 +455,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     }
 
 int64_t llm_graph_context::n_pos_per_embd() const {
-    return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
+    return hparams.rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1;
 }
 
 void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const {
@@ -1562,20 +1562,25 @@ void llm_graph_context::build_pooling(
                 ggml_tensor * inp_cls = build_inp_cls();
                 inp = ggml_get_rows(ctx0, inp, inp_cls);
 
-                // classification head
-                // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
-                GGML_ASSERT(cls   != nullptr);
-                GGML_ASSERT(cls_b != nullptr);
-
-                cur = ggml_add (ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b);
-                cur = ggml_tanh(ctx0, cur);
-
-                // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
-                // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
-                if (cls_out) {
+                if (cls != nullptr && cls_b != nullptr) {
+                    // classification head
+                    // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
+                    cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b);
+                    cur = ggml_tanh(ctx0, cur);
+
+                    // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+                    // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
+                    if (cls_out) {
+                        GGML_ASSERT(cls_out_b != nullptr);
+                        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b);
+                    }
+                } else if (cls_out) {
+                    // Single layer classification head (direct projection)
+                    // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
                     GGML_ASSERT(cls_out_b != nullptr);
-
-                    cur = ggml_add (ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b);
+                    cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, inp), cls_out_b);
+                } else {
+                    GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b");
                 }
             } break;
         default:
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 4a42d6ecdc455..766f8d079afb2 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -757,11 +757,19 @@ ggml_tensor * llama_kv_cache_unified::build_rope_shift(
     const auto & yarn_beta_slow  = cparams.yarn_beta_slow;
 
     const auto & n_rot     = hparams.n_rot;
-    const auto & rope_type = hparams.rope_type;
+    const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE
+                                // @ngxson : this is a workaround
+                                // for M-RoPE, we want to rotate the whole vector when doing KV shift
+                                // a normal RoPE should work, we just need to use the correct ordering
+                                // ref: https://github.com/ggml-org/llama.cpp/pull/13870
+                                ? LLAMA_ROPE_TYPE_NEOX
+                                : hparams.rope_type;
 
     // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
     // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
-    const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2 ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor;
+    const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2
+                                    ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale))
+                                    : cparams.yarn_attn_factor;
 
     ggml_tensor * tmp;
 
diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index c3024cec18360..33e251d3b87ff 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -1,48 +1,54 @@
 # mtmd
 
-# compile mtmd-audio separately to avoid long compile times with miniaudio.h
-# TODO @ngxson : move miniaudio.h and stb_image.h to mtmd-helper.cpp, then compile the helper as a separate library
-add_library(mtmd_audio STATIC mtmd-audio.cpp mtmd-audio.h)
-if (BUILD_SHARED_LIBS)
-    set_target_properties(mtmd_audio PROPERTIES POSITION_INDEPENDENT_CODE ON)
-endif()
-target_link_libraries(mtmd_audio PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(mtmd_audio PRIVATE cxx_std_17)
-target_include_directories(mtmd_audio PRIVATE .)
-
 add_library(mtmd OBJECT
             mtmd.cpp
-            mtmd-helper.cpp
+            mtmd-audio.cpp
             mtmd.h
             clip.cpp
             clip.h
             clip-impl.h
             )
 
-target_link_libraries(mtmd PRIVATE ggml llama mtmd_audio ${CMAKE_THREAD_LIBS_INIT})
-
+target_link_libraries(mtmd PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(mtmd PUBLIC .)
 target_include_directories(mtmd PRIVATE ../..)
-target_include_directories(mtmd PRIVATE ../../common) # for stb_image.h
-
 target_compile_features(mtmd PRIVATE cxx_std_17)
 
-add_library(mtmd_static STATIC $<TARGET_OBJECTS:mtmd>)
+# compile the helper separately, to avoid long compile times with miniaudio.h and stb_image.h
+
+add_library(mtmd_helper OBJECT
+            mtmd-helper.cpp
+            mtmd-helper.h
+            )
+
+target_link_libraries(mtmd_helper PRIVATE ggml llama mtmd ${CMAKE_THREAD_LIBS_INIT})
+target_include_directories(mtmd_helper PUBLIC .)
+target_include_directories(mtmd_helper PRIVATE ./vendor)
+target_include_directories(mtmd_helper PRIVATE ../..)
+target_compile_features(mtmd_helper PRIVATE cxx_std_17)
+
 if (BUILD_SHARED_LIBS)
     set_target_properties(mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
     target_compile_definitions(mtmd PRIVATE LLAMA_SHARED LLAMA_BUILD)
     add_library(mtmd_shared SHARED $<TARGET_OBJECTS:mtmd>)
-    target_link_libraries(mtmd_shared PRIVATE ggml llama mtmd_audio ${CMAKE_THREAD_LIBS_INIT})
+    target_link_libraries(mtmd_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
     install(TARGETS mtmd_shared LIBRARY)
+
+    set_target_properties(mtmd_helper PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_compile_definitions(mtmd_helper PRIVATE LLAMA_SHARED LLAMA_BUILD)
+    add_library(mtmd_helper_shared SHARED $<TARGET_OBJECTS:mtmd>)
+    target_link_libraries(mtmd_helper_shared PRIVATE ggml llama mtmd ${CMAKE_THREAD_LIBS_INIT})
+    install(TARGETS mtmd_helper_shared LIBRARY)
 endif()
 
 if (NOT MSVC)
-    target_compile_options(mtmd PRIVATE -Wno-cast-qual) # stb_image.h
-    target_compile_options(mtmd_audio PRIVATE -Wno-cast-qual) # miniaudio.h
+    # for stb_image.h and miniaudio.h
+    target_compile_options(mtmd_helper PRIVATE -Wno-cast-qual)
 endif()
 
 if(TARGET BUILD_INFO)
     add_dependencies(mtmd BUILD_INFO)
+    add_dependencies(mtmd_helper BUILD_INFO)
 endif()
 
 add_executable(llama-llava-cli    deprecation-warning.cpp)
@@ -54,5 +60,5 @@ set(TARGET llama-mtmd-cli)
 add_executable(${TARGET} mtmd-cli.cpp)
 set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common mtmd mtmd_helper ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 6ae2c2ce46fd2..c25bacc17769b 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -11,9 +11,6 @@
 #include "ggml-backend.h"
 #include "gguf.h"
 
-#define STB_IMAGE_IMPLEMENTATION
-#include "stb_image.h"
-
 #include <cassert>
 #include <cmath>
 #include <cstdlib>
@@ -2786,30 +2783,6 @@ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny
     memcpy(img->buf.data(), rgb_pixels, img->buf.size());
 }
 
-bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
-    int nx, ny, nc;
-    auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
-    if (!data) {
-        LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
-        return false;
-    }
-    clip_build_img_from_pixels(data, nx, ny, img);
-    stbi_image_free(data);
-    return true;
-}
-
-bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
-    int nx, ny, nc;
-    auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
-    if (!data) {
-        LOG_ERR("%s: failed to decode image bytes\n", __func__);
-        return false;
-    }
-    clip_build_img_from_pixels(data, nx, ny, img);
-    stbi_image_free(data);
-    return true;
-}
-
 // Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
 static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
     dst.nx = src.nx;
diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp
index ae06a695d9dde..4d053895cdae9 100644
--- a/tools/mtmd/mtmd-audio.cpp
+++ b/tools/mtmd/mtmd-audio.cpp
@@ -1,28 +1,5 @@
-// fix problem with std::min and std::max
-#if defined(_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-#   define NOMINMAX
-#endif
-#include <windows.h>
-#endif
-
 #include "mtmd-audio.h"
 
-//#define MTMD_AUDIO_DEBUG
-
-#define MINIAUDIO_IMPLEMENTATION
-#ifndef MTMD_AUDIO_DEBUG
-#   define MA_NO_ENCODING
-#endif
-#define MA_NO_DEVICE_IO
-#define MA_NO_RESOURCE_MANAGER
-#define MA_NO_NODE_GRAPH
-#define MA_NO_ENGINE
-#define MA_NO_GENERATION
-#define MA_API static
-#include "miniaudio.h"
-
 #define _USE_MATH_DEFINES // for M_PI
 #include <cmath>
 #include <cstdint>
@@ -359,69 +336,6 @@ bool preprocess_audio(
 } // namespace whisper_preprocessor
 
 
-namespace audio_helpers {
-
-bool is_audio_file(const char * buf, size_t len) {
-    if (len < 12) {
-        return false;
-    }
-
-    // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
-    // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
-    bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0;
-    bool is_mp3 = len >= 3 && (
-        memcmp(buf, "ID3", 3) == 0 ||
-        // Check for MPEG sync word (simplified check)
-        ((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0)
-    );
-    bool is_flac = memcmp(buf, "fLaC", 4) == 0;
-
-    return is_wav || is_mp3 || is_flac;
-}
-
-// returns true if the buffer is a valid audio file
-bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) {
-    ma_result result;
-    const int channels = 1;
-    ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate);
-    ma_decoder decoder;
-
-    result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder);
-    if (result != MA_SUCCESS) {
-        return false;
-    }
-
-    ma_uint64 frame_count;
-    ma_uint64 frames_read;
-    result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count);
-    if (result != MA_SUCCESS) {
-        ma_decoder_uninit(&decoder);
-        return false;
-    }
-
-    pcmf32_mono.resize(frame_count);
-    result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read);
-    if (result != MA_SUCCESS) {
-        ma_decoder_uninit(&decoder);
-        return false;
-    }
-
-#ifdef MTMD_AUDIO_DEBUG
-    // save audio to wav file
-    ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
-    ma_encoder encoder;
-    ma_encoder_init_file("output.wav", &config, &encoder);
-    ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
-    ma_encoder_uninit(&encoder);
-#endif
-
-    ma_decoder_uninit(&decoder);
-    return true;
-}
-
-} // namespace wav_utils
-
-
 // precalculated mel filter banks
 // values are multiplied by 1000.0 to save space, and will be divided by 1000.0 in the end of the function
 //
diff --git a/tools/mtmd/mtmd-audio.h b/tools/mtmd/mtmd-audio.h
index 348d11dcaf959..b7b940affb570 100644
--- a/tools/mtmd/mtmd-audio.h
+++ b/tools/mtmd/mtmd-audio.h
@@ -32,7 +32,7 @@ struct whisper_filters {
     std::vector<float> data;
 };
 
-extern bool preprocess_audio(
+bool preprocess_audio(
         const float * samples,
         size_t n_samples,
         const whisper_filters & filters,
@@ -40,23 +40,8 @@ extern bool preprocess_audio(
 
 } // namespace whisper_preprocessor
 
-
-// TODO @ngxson : move this helper to mtmd-helpers.cpp
-namespace audio_helpers {
-
-extern bool is_audio_file(const char * buf, size_t len);
-
-extern bool decode_audio_from_buf(
-        const unsigned char * buf_in,
-        size_t len,
-        int target_sampler_rate,
-        std::vector<float> & pcmf32_mono);
-
-} // namespace audio_helpers
-
-
 namespace whisper_precalc_filters {
 
-extern whisper_preprocessor::whisper_filters get_128_bins();
+whisper_preprocessor::whisper_filters get_128_bins();
 
 } // namespace whisper_precalc_filters
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index a70f11ca9d718..508a64c586de1 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -7,6 +7,7 @@
 #include "console.h"
 #include "chat.h"
 #include "mtmd.h"
+#include "mtmd-helper.h"
 
 #include <vector>
 #include <limits.h>
@@ -143,7 +144,7 @@ struct mtmd_cli_context {
     }
 
     bool load_media(const std::string & fname) {
-        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(fname.c_str()));
+        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str()));
         if (!bmp.ptr) {
             return false;
         }
diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp
index e6c926080cde3..058323818cd79 100644
--- a/tools/mtmd/mtmd-helper.cpp
+++ b/tools/mtmd/mtmd-helper.cpp
@@ -1,10 +1,37 @@
+// fix problem with std::min and std::max
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#   define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
 #include "mtmd.h"
+#include "mtmd-helper.h"
 #include "llama.h"
 
 #include <algorithm>
 #include <cinttypes>
 #include <vector>
 
+//#define MTMD_AUDIO_DEBUG
+
+#define MINIAUDIO_IMPLEMENTATION
+#ifndef MTMD_AUDIO_DEBUG
+#   define MA_NO_ENCODING
+#endif
+#define MA_NO_DEVICE_IO
+#define MA_NO_RESOURCE_MANAGER
+#define MA_NO_NODE_GRAPH
+#define MA_NO_ENGINE
+#define MA_NO_GENERATION
+#define MA_API static
+#include "vendor/miniaudio.h"
+
+#define STB_IMAGE_IMPLEMENTATION
+#include "vendor/stb_image.h"
+
 #define LOG_INF(...) fprintf(stdout, __VA_ARGS__)
 #define LOG_ERR(...) fprintf(stderr, __VA_ARGS__)
 
@@ -315,3 +342,118 @@ int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
 
     return 0;
 }
+
+namespace audio_helpers {
+
+static bool is_audio_file(const char * buf, size_t len) {
+    if (len < 12) {
+        return false;
+    }
+
+    // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
+    // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
+    bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0;
+    bool is_mp3 = len >= 3 && (
+        memcmp(buf, "ID3", 3) == 0 ||
+        // Check for MPEG sync word (simplified check)
+        ((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0)
+    );
+    bool is_flac = memcmp(buf, "fLaC", 4) == 0;
+
+    return is_wav || is_mp3 || is_flac;
+}
+
+// returns true if the buffer is a valid audio file
+static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) {
+    ma_result result;
+    const int channels = 1;
+    ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate);
+    ma_decoder decoder;
+
+    result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder);
+    if (result != MA_SUCCESS) {
+        return false;
+    }
+
+    ma_uint64 frame_count;
+    ma_uint64 frames_read;
+    result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count);
+    if (result != MA_SUCCESS) {
+        ma_decoder_uninit(&decoder);
+        return false;
+    }
+
+    pcmf32_mono.resize(frame_count);
+    result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read);
+    if (result != MA_SUCCESS) {
+        ma_decoder_uninit(&decoder);
+        return false;
+    }
+
+#ifdef MTMD_AUDIO_DEBUG
+    // save audio to wav file
+    ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
+    ma_encoder encoder;
+    ma_encoder_init_file("output.wav", &config, &encoder);
+    ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
+    ma_encoder_uninit(&encoder);
+#endif
+
+    ma_decoder_uninit(&decoder);
+    return true;
+}
+
+} // namespace audio_helpers
+
+mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len) {
+    if (audio_helpers::is_audio_file((const char *)buf, len)) {
+        std::vector<float> pcmf32;
+        int bitrate = mtmd_get_audio_bitrate(ctx);
+        if (bitrate < 0) {
+            LOG_ERR("This model does not support audio input\n");
+            return nullptr;
+        }
+        if (!audio_helpers::decode_audio_from_buf(buf, len, bitrate, pcmf32)) {
+            LOG_ERR("Unable to read WAV audio file from buffer\n");
+            return nullptr;
+        }
+        return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
+    }
+
+    // otherwise, we assume it's an image
+    mtmd_bitmap * result = nullptr;
+    {
+        int nx, ny, nc;
+        auto * data = stbi_load_from_memory(buf, len, &nx, &ny, &nc, 3);
+        if (!data) {
+            LOG_ERR("%s: failed to decode image bytes\n", __func__);
+            return nullptr;
+        }
+        result = mtmd_bitmap_init(nx, ny, data);
+        stbi_image_free(data);
+    }
+    return result;
+}
+
+mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) {
+    std::vector<unsigned char> buf;
+    FILE * f = fopen(fname, "rb");
+    if (!f) {
+        LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
+        return nullptr;
+    }
+
+    fseek(f, 0, SEEK_END);
+    long file_size = ftell(f);
+    fseek(f, 0, SEEK_SET);
+    buf.resize(file_size);
+
+    size_t n_read = fread(buf.data(), 1, file_size, f);
+    fclose(f);
+    if (n_read != (size_t)file_size) {
+        LOG_ERR("Failed to read entire file %s", fname);
+        return nullptr;
+    }
+
+    return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size());
+}
diff --git a/tools/mtmd/mtmd-helper.h b/tools/mtmd/mtmd-helper.h
new file mode 100644
index 0000000000000..5c0edc6937eee
--- /dev/null
+++ b/tools/mtmd/mtmd-helper.h
@@ -0,0 +1,91 @@
+#ifndef MTMD_HELPER_H
+#define MTMD_HELPER_H
+
+#include "ggml.h"
+#include "llama.h"
+#include "mtmd.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//
+// libmtmd helper functions
+//
+// Please note that these helpers are not guaranteed to be stable.
+// BREAKING CHANGES are expected.
+//
+
+// helper function to construct a mtmd_bitmap from a file
+// it calls mtmd_helper_bitmap_init_from_buf() internally
+// returns nullptr on failure
+// this function is thread-safe
+MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname);
+
+// helper function to construct a mtmd_bitmap from a buffer containing a file
+// supported formats:
+//     image: formats supported by stb_image: jpg, png, bmp, gif, etc.
+//     audio: formats supported by miniaudio: wav, mp3, flac
+// note: audio files will be auto-detected based on magic bytes
+// returns nullptr on failure
+// this function is thread-safe
+MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len);
+
+// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
+MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
+
+// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
+// normally, n_pos is equal to n_tokens, but for M-RoPE it is different
+MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
+
+// helper function that automatically:
+// 1. run llama_decode() on text chunks
+// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
+// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
+// otherwise, returns 0 on success
+// this function is NOT thread-safe
+MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
+                                         struct llama_context * lctx,
+                                         const mtmd_input_chunks * chunks,
+                                         llama_pos n_past,
+                                         llama_seq_id seq_id,
+                                         int32_t n_batch,
+                                         bool logits_last,
+                                         llama_pos * new_n_past);
+
+// works like mtmd_helper_eval_chunks(), but only for a single chunk
+// this function is NOT thread-safe
+MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
+                                               struct llama_context * lctx,
+                                               const mtmd_input_chunk * chunk,
+                                               llama_pos n_past,
+                                               llama_seq_id seq_id,
+                                               int32_t n_batch,
+                                               bool logits_last,
+                                               llama_pos * new_n_past);
+
+// helper function to decode an image whose embeddings have already been calculated
+// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
+// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
+MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
+                                                struct llama_context * lctx,
+                                                const mtmd_input_chunk * chunk,
+                                                float * encoded_embd,
+                                                llama_pos n_past,
+                                                llama_seq_id seq_id,
+                                                int32_t n_batch,
+                                                llama_pos * new_n_past);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+//
+// C++ wrappers
+//
+
+#endif
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 52bf71e2c9dc0..8573f11437f1b 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -819,53 +819,12 @@ bool mtmd_support_audio(mtmd_context * ctx) {
     return ctx->ctx_a != nullptr;
 }
 
-// these 2 helpers below use internal clip_image_u8_ptr,
-// so unfortunately they cannot moved to mtmd-helper.h
-// however, in theory, user can decode image file to bitmap using
-// whichever library they want, and then use mtmd_bitmap_init() to create bitmap
-
-mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len) {
-    if (audio_helpers::is_audio_file((const char *)buf, len)) {
-        std::vector<float> pcmf32;
-        if (!audio_helpers::decode_audio_from_buf(buf, len, COMMON_SAMPLE_RATE, pcmf32)) {
-            LOG_ERR("Unable to read WAV audio file from buffer\n");
-            return nullptr;
-        }
-        return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
-    }
-
-    clip_image_u8_ptr img_u8(clip_image_u8_init());
-    bool ok = clip_image_load_from_bytes(buf, len, img_u8.get());
-    if (!ok) {
-        LOG_ERR("Unable to load image from buffer\n");
-        return nullptr;
+int mtmd_get_audio_bitrate(mtmd_context * ctx) {
+    if (!ctx->ctx_a) {
+        return -1;
     }
-    uint32_t nx, ny;
-    unsigned char * data = clip_image_u8_get_data(img_u8.get(), &nx, &ny);
-    return mtmd_bitmap_init(nx, ny, data);
-}
-
-mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname) {
-    std::vector<unsigned char> buf;
-    FILE * f = fopen(fname, "rb");
-    if (!f) {
-        LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
-        return nullptr;
-    }
-
-    fseek(f, 0, SEEK_END);
-    long file_size = ftell(f);
-    fseek(f, 0, SEEK_SET);
-    buf.resize(file_size);
-
-    size_t n_read = fread(buf.data(), 1, file_size, f);
-    fclose(f);
-    if (n_read != (size_t)file_size) {
-        LOG_ERR("Failed to read entire file %s", fname);
-        return nullptr;
-    }
-
-    return mtmd_helper_bitmap_init_from_buf(buf.data(), buf.size());
+    // for now, we assume that all audio models have the same bitrate
+    return 16000; // 16kHz
 }
 
 //
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index b53f215a2fafd..541918e095dbb 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -109,6 +109,10 @@ MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
 // whether the current model supports audio input
 MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
 
+// get audio bitrate in Hz, for example 16000 for Whisper
+// return -1 if audio is not supported
+MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
+
 // mtmd_bitmap
 //
 // if bitmap is image:
@@ -209,75 +213,6 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
 
 /////////////////////////////////////////
 
-//
-// Helper functions (can be implemented based on other functions)
-//
-// Please note that these helpers are not guaranteed to be stable.
-// BREAKING CHANGES are expected.
-//
-
-// helper function to construct a mtmd_bitmap from a file
-// it calls mtmd_helper_bitmap_init_from_buf() internally
-// returns nullptr on failure
-// this function is thread-safe
-MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname);
-
-// helper function to construct a mtmd_bitmap from a buffer containing a file
-// supported formats:
-//     image: formats supported by stb_image: jpg, png, bmp, gif, etc.
-//     audio: formats supported by miniaudio: wav, mp3, flac
-// note: audio files will be auto-detected based on magic bytes
-// returns nullptr on failure
-// this function is thread-safe
-MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len);
-
-// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
-MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
-
-// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
-// normally, n_pos is equal to n_tokens, but for M-RoPE it is different
-MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
-
-// helper function that automatically:
-// 1. run llama_decode() on text chunks
-// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
-// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
-// otherwise, returns 0 on success
-// this function is NOT thread-safe
-MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
-                                         struct llama_context * lctx,
-                                         const mtmd_input_chunks * chunks,
-                                         llama_pos n_past,
-                                         llama_seq_id seq_id,
-                                         int32_t n_batch,
-                                         bool logits_last,
-                                         llama_pos * new_n_past);
-
-// works like mtmd_helper_eval_chunks(), but only for a single chunk
-// this function is NOT thread-safe
-MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
-                                               struct llama_context * lctx,
-                                               const mtmd_input_chunk * chunk,
-                                               llama_pos n_past,
-                                               llama_seq_id seq_id,
-                                               int32_t n_batch,
-                                               bool logits_last,
-                                               llama_pos * new_n_past);
-
-// helper function to decode an image whose embeddings have already been calculated
-// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
-// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
-MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
-                                                struct llama_context * lctx,
-                                                const mtmd_input_chunk * chunk,
-                                                float * encoded_embd,
-                                                llama_pos n_past,
-                                                llama_seq_id seq_id,
-                                                int32_t n_batch,
-                                                llama_pos * new_n_past);
-
-/////////////////////////////////////////
-
 // test function, to be used in test-mtmd-c-api.c
 MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
 
diff --git a/tools/mtmd/miniaudio.h b/tools/mtmd/vendor/miniaudio.h
similarity index 100%
rename from tools/mtmd/miniaudio.h
rename to tools/mtmd/vendor/miniaudio.h
diff --git a/common/stb_image.h b/tools/mtmd/vendor/stb_image.h
similarity index 100%
rename from common/stb_image.h
rename to tools/mtmd/vendor/stb_image.h
diff --git a/tools/server/CMakeLists.txt b/tools/server/CMakeLists.txt
index 17109fddbd307..08597145c88d4 100644
--- a/tools/server/CMakeLists.txt
+++ b/tools/server/CMakeLists.txt
@@ -36,7 +36,7 @@ install(TARGETS ${TARGET} RUNTIME)
 
 target_include_directories(${TARGET} PRIVATE ../llava)
 target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
-target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common mtmd mtmd_helper ${CMAKE_THREAD_LIBS_INIT})
 
 if (LLAMA_SERVER_SSL)
     find_package(OpenSSL REQUIRED)
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index fe6c685ec76b6..96d1e4adf2446 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -9,6 +9,7 @@
 #include "sampling.h"
 #include "speculative.h"
 #include "mtmd.h"
+#include "mtmd-helper.h"
 
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
@@ -4187,7 +4188,7 @@ int main(int argc, char ** argv) {
                     throw std::runtime_error("This server does not support multimodal");
                 }
                 for (auto & file : files) {
-                    mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file.data(), file.size()));
+                    mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(ctx_server.mctx, file.data(), file.size()));
                     if (!bmp.ptr) {
                         throw std::runtime_error("Failed to load image or audio file");
                     }
diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp
index 8456a02e6fa29..58b679d759bb9 100644
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -6,6 +6,7 @@
 #include "arg.h" // common_remote_get_content
 #include "base64.hpp"
 #include "mtmd.h"
+#include "mtmd-helper.h"
 
 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
@@ -264,13 +265,19 @@ static size_t validate_utf8(const std::string& text) {
 static llama_tokens format_rerank(const struct llama_vocab * vocab, const llama_tokens & query, const llama_tokens & doc) {
     llama_tokens result;
 
+    // Get EOS token - use SEP token as fallback if EOS is not available
+    llama_token eos_token = llama_vocab_eos(vocab);
+    if (eos_token == LLAMA_TOKEN_NULL) {
+        eos_token = llama_vocab_sep(vocab);
+    }
+
     result.reserve(doc.size() + query.size() + 4);
     result.push_back(llama_vocab_bos(vocab));
     result.insert(result.end(), query.begin(), query.end());
-    result.push_back(llama_vocab_eos(vocab));
+    result.push_back(eos_token);
     result.push_back(llama_vocab_sep(vocab));
     result.insert(result.end(), doc.begin(), doc.end());
-    result.push_back(llama_vocab_eos(vocab));
+    result.push_back(eos_token);
 
     return result;
 }
@@ -573,7 +580,7 @@ struct oaicompat_parser_options {
 
 // used by /chat/completions endpoint
 static json oaicompat_chat_params_parse(
-    const json & body, /* openai api json semantics */
+    json & body, /* openai api json semantics */
     const oaicompat_parser_options & opt,
     std::vector<raw_buffer> & out_files)
 {
@@ -624,7 +631,7 @@ static json oaicompat_chat_params_parse(
     if (!body.contains("messages")) {
         throw std::runtime_error("'messages' is required");
     }
-    json messages = body.at("messages");
+    json & messages = body.at("messages");
     if (!messages.is_array()) {
         throw std::runtime_error("Expected 'messages' to be an array");
     }