2 changes: 1 addition & 1 deletion .editorconfig
@@ -49,6 +49,6 @@ charset = unset
trim_trailing_whitespace = unset
insert_final_newline = unset

[tools/mtmd/miniaudio.h]
[tools/mtmd/vendor/miniaudio.h]
trim_trailing_whitespace = unset
insert_final_newline = unset
30 changes: 15 additions & 15 deletions .github/workflows/build-linux-cross.yml
@@ -26,12 +26,12 @@ jobs:
sudo apt-get install -y --no-install-recommends \
build-essential \
gcc-14-riscv64-linux-gnu \
g++-14-riscv64-linux-gnu \
libcurl4-openssl-dev:riscv64
g++-14-riscv64-linux-gnu

- name: Build
run: |
cmake -B build -DCMAKE_BUILD_TYPE=Release \
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
@@ -72,12 +72,12 @@ jobs:
glslc \
gcc-14-riscv64-linux-gnu \
g++-14-riscv64-linux-gnu \
libvulkan-dev:riscv64 \
libcurl4-openssl-dev:riscv64
libvulkan-dev:riscv64

- name: Build
run: |
cmake -B build -DCMAKE_BUILD_TYPE=Release \
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
@@ -118,12 +118,12 @@ jobs:
build-essential \
glslc \
crossbuild-essential-arm64 \
libvulkan-dev:arm64 \
libcurl4-openssl-dev:arm64
libvulkan-dev:arm64

- name: Build
run: |
cmake -B build -DCMAKE_BUILD_TYPE=Release \
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
@@ -163,12 +163,12 @@ jobs:
sudo apt-get install -y --no-install-recommends \
build-essential \
gcc-14-powerpc64le-linux-gnu \
g++-14-powerpc64le-linux-gnu \
libcurl4-openssl-dev:ppc64el
g++-14-powerpc64le-linux-gnu

- name: Build
run: |
cmake -B build -DCMAKE_BUILD_TYPE=Release \
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
@@ -209,12 +209,12 @@ jobs:
glslc \
gcc-14-powerpc64le-linux-gnu \
g++-14-powerpc64le-linux-gnu \
libvulkan-dev:ppc64el \
libcurl4-openssl-dev:ppc64el
libvulkan-dev:ppc64el

- name: Build
run: |
cmake -B build -DCMAKE_BUILD_TYPE=Release \
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
15 changes: 9 additions & 6 deletions common/common.cpp
@@ -903,13 +903,16 @@ struct common_init_result common_init_from_params(common_params & params) {
ok = false;
}

if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
ok = false;
}
bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;

if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
if (!has_eos && !has_sep) {
LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
ok = false;
} else if (!has_eos) {
LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
} else if (!has_sep) {
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
ok = false;
}
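For context, the rewritten check makes a missing EOS token non-fatal when a SEP token exists, since SEP can stand in as the separator for reranking; initialization still fails when SEP is absent or when both tokens are missing. Below is a minimal, self-contained C++ sketch of that decision in isolation, not part of the diff; -1 stands in for LLAMA_TOKEN_NULL and the helper name is made up.

// Illustrative sketch of the EOS/SEP reranking check above (not part of the change).
#include <cstdio>

enum class rerank_support { ok, ok_sep_fallback, unsupported };

static rerank_support check_rerank_tokens(int eos_id, int sep_id) {
    const bool has_eos = eos_id != -1;
    const bool has_sep = sep_id != -1;
    if (!has_eos && !has_sep) return rerank_support::unsupported;     // neither token present
    if (!has_eos)             return rerank_support::ok_sep_fallback; // SEP stands in for EOS
    if (!has_sep)             return rerank_support::unsupported;     // SEP is still required
    return rerank_support::ok;
}

int main() {
    // a vocab with SEP (e.g. id 102) but no EOS now falls back instead of failing outright
    std::printf("%d\n", (int) check_rerank_tokens(/*eos_id=*/-1, /*sep_id=*/102));
}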

35 changes: 24 additions & 11 deletions convert_hf_to_gguf.py
@@ -423,19 +423,19 @@ def load_hparams(dir_model: Path):
try:
# for security reason, we don't allow loading remote code by default
# if a model need remote code, we will fallback to config.json
return AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
except Exception as e:
logger.warning(f"Failed to load model config from {dir_model}: {e}")
logger.warning("Trying to load config.json instead")
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
config = json.load(f)
if "llm_config" in config:
# rename for InternVL
config["text_config"] = config["llm_config"]
if "thinker_config" in config:
# rename for Qwen2.5-Omni
config["text_config"] = config["thinker_config"]["text_config"]
return config
if "llm_config" in config:
# rename for InternVL
config["text_config"] = config["llm_config"]
if "thinker_config" in config:
# rename for Qwen2.5-Omni
config["text_config"] = config["thinker_config"]["text_config"]
return config

@classmethod
def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
@@ -1207,7 +1207,7 @@ def set_gguf_parameters(self):
self.gguf_writer.add_audio_block_count(self.find_aparam(self.n_block_keys))
self.gguf_writer.add_audio_head_count(self.find_aparam(["num_attention_heads"]))

else:
if not self.has_vision_encoder and not self.has_audio_encoder:
raise ValueError("MmprojModel must have either vision or audio encoder")

def write_vocab(self):
@@ -1841,7 +1841,8 @@ def prepare_tensors(self):
"MistralForCausalLM",
"MixtralForCausalLM",
"VLlama3ForCausalLM",
"LlavaForConditionalGeneration")
"LlavaForConditionalGeneration",
"LlamaModel")
class LlamaModel(TextModel):
model_arch = gguf.MODEL_ARCH.LLAMA
undo_permute = True
@@ -1921,6 +1922,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter

if is_vision_tensor:
return [] # skip vision tensors
elif self.hf_arch == "LlamaModel":
name = "model." + name
elif name.startswith("model.text_model"):
name = name.replace("text_model.", "") # for SmolVLM
elif name.startswith("language_model."):
@@ -2169,6 +2172,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
# process vision tensors
if "positional_embedding_vlm" in name and ".weight" not in name:
name += ".weight"
if "multi_modal_projector.linear_1" in name:
# despite the name with number postfix, this is a single fully connected layer
return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC], data_torch)]
return [(self.map_tensor_name(name), data_torch)]
return []

@@ -3676,7 +3682,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
return [(self.map_tensor_name(name), data_torch)]


@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel")
@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel", "BertForSequenceClassification")
class BertModel(TextModel):
model_arch = gguf.MODEL_ARCH.BERT

@@ -3739,6 +3745,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
if name.startswith("cls.seq_relationship"):
return []

# For BertForSequenceClassification (direct projection layer)
if name == "classifier.weight":
name = "classifier.out_proj.weight"

if name == "classifier.bias":
name = "classifier.out_proj.bias"

return [(self.map_tensor_name(name), data_torch)]

def _xlmroberta_tokenizer_init(self) -> None:
2 changes: 1 addition & 1 deletion convert_hf_to_gguf_update.py
@@ -288,7 +288,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:

tests = [
"ied 4 ½ months",
"Führer",
"Äpfel",
"",
" ",
" ",
1 change: 1 addition & 0 deletions ggml/src/ggml-cann/CMakeLists.txt
@@ -30,6 +30,7 @@ string(TOLOWER ${SOC_TYPE} SOC_VERSION) # SOC_VERSION need lower
string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)
message(STATUS "CANN: SOC_VERSION = ${SOC_VERSION}")

if (CANN_INSTALL_DIR)
# Only Support Linux.
4 changes: 2 additions & 2 deletions ggml/src/ggml-cuda/fattn-common.cuh
@@ -623,8 +623,8 @@ static __global__ void flash_attn_combine_results(
__builtin_assume(tid < D);

extern __shared__ float2 meta[];
if (tid < 2*parallel_blocks) {
((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + tid];
for (int i = tid; i < 2*parallel_blocks; i += D) {
((float *) meta)[i] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + i];
}

__syncthreads();
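For context, the previous single predicated store only staged the first blockDim.x entries of VKQ_meta, so it silently dropped the tail once 2*parallel_blocks exceeded D; the strided loop covers any count. Below is a small host-side C++ sketch of the same stride pattern, not part of the diff; the names and the thread-simulation loop are illustrative, not CUDA code.

// Host-side illustration of the strided staging loop used in the fix above.
// T plays the role of blockDim.x (D); N plays the role of 2*parallel_blocks.
#include <cassert>
#include <vector>

static void stage_meta(std::vector<float> & dst, const std::vector<float> & src, int T) {
    const int N = (int) src.size();
    for (int tid = 0; tid < T; ++tid) {    // each iteration stands in for one thread
        for (int i = tid; i < N; i += T) { // same "i = tid; i < N; i += T" stride as the kernel
            dst[i] = src[i];
        }
    }
}

int main() {
    std::vector<float> src(10, 1.0f), dst(10, 0.0f);
    stage_meta(dst, src, /*T=*/4);         // N = 10 > T = 4: a single predicated store would miss 6 entries
    for (float v : dst) assert(v == 1.0f);
    return 0;
}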
1 change: 0 additions & 1 deletion gguf-py/gguf/tensor_mapping.py
@@ -902,7 +902,6 @@ class TensorNameMap:

MODEL_TENSOR.V_MMPROJ_FC: (
"model.connector.modality_projection.proj", # SmolVLM
"multi_modal_projector.linear_1", # llama 4
),

MODEL_TENSOR.V_MMPROJ_MLP: (
33 changes: 19 additions & 14 deletions src/llama-graph.cpp
@@ -455,7 +455,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
}

int64_t llm_graph_context::n_pos_per_embd() const {
return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
return hparams.rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1;
}

void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const {
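The first hunk keys the 4-component position layout on the rope type rather than on the Qwen2-VL architecture, so any M-RoPE model picks it up. Below is a rough C++ sketch of what the returned count typically drives, not part of the diff; the names are made up and the assumption is that the value sizes the per-token position data, as the function name suggests.

// Rough sketch of how a per-token position count like this is consumed (illustrative only).
#include <cstdio>
#include <vector>

enum class rope_kind { norm, neox, mrope };

static int n_pos_per_embd(rope_kind rt) {
    return rt == rope_kind::mrope ? 4 : 1; // mirrors the hunk: keyed on rope type, not model arch
}

int main() {
    const int n_tokens = 8;
    std::vector<int> pos(n_tokens * n_pos_per_embd(rope_kind::mrope)); // 32 position slots instead of 8
    std::printf("%zu\n", pos.size());
}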
@@ -1562,20 +1562,25 @@ void llm_graph_context::build_pooling(
ggml_tensor * inp_cls = build_inp_cls();
inp = ggml_get_rows(ctx0, inp, inp_cls);

// classification head
// https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
GGML_ASSERT(cls != nullptr);
GGML_ASSERT(cls_b != nullptr);

cur = ggml_add (ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b);
cur = ggml_tanh(ctx0, cur);

// some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
// https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
if (cls_out) {
if (cls != nullptr && cls_b != nullptr) {
// classification head
// https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b);
cur = ggml_tanh(ctx0, cur);

// some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
// https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
if (cls_out) {
GGML_ASSERT(cls_out_b != nullptr);
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b);
}
} else if (cls_out) {
// Single layer classification head (direct projection)
// https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
GGML_ASSERT(cls_out_b != nullptr);

cur = ggml_add (ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b);
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, inp), cls_out_b);
} else {
GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b");
}
} break;
default:
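Taken together, build_pooling now accepts two classifier-head layouts for RANK pooling: the existing two-layer head (cls, tanh, then optional cls_out) and a single-layer head where cls_out projects the pooled embedding directly, as in BertForSequenceClassification; anything else aborts. Below is a scalar C++ sketch of the two layouts, not part of the diff; the scalar weights are stand-ins for the cls/cls_out matrix multiplies, not ggml code.

// Scalar illustration of the two RANK-pooling head layouts handled above.
#include <cmath>
#include <cstdio>

// two-layer head: cls -> tanh -> cls_out (cls_out is optional in the real code)
static float head_two_layer(float x, float w1, float b1, float w2, float b2) {
    return w2 * std::tanh(w1 * x + b1) + b2;
}

// single-layer head: cls_out applied directly to the pooled embedding
static float head_single_layer(float x, float w, float b) {
    return w * x + b;
}

int main() {
    std::printf("%f %f\n", head_two_layer(0.5f, 1.0f, 0.0f, 2.0f, 0.1f),
                           head_single_layer(0.5f, 2.0f, 0.1f));
}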
12 changes: 10 additions & 2 deletions src/llama-kv-cache.cpp
@@ -757,11 +757,19 @@ ggml_tensor * llama_kv_cache_unified::build_rope_shift(
const auto & yarn_beta_slow = cparams.yarn_beta_slow;

const auto & n_rot = hparams.n_rot;
const auto & rope_type = hparams.rope_type;
const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE
// @ngxson : this is a workaround
// for M-RoPE, we want to rotate the whole vector when doing KV shift
// a normal RoPE should work, we just need to use the correct ordering
// ref: https://github.com/ggml-org/llama.cpp/pull/13870
? LLAMA_ROPE_TYPE_NEOX
: hparams.rope_type;

// See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2 ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor;
const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2
? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale))
: cparams.yarn_attn_factor;

ggml_tensor * tmp;

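The in-hunk comment states the intent: when the cache uses M-RoPE, the KV shift should rotate the whole vector, so a regular NEOX-ordered RoPE is substituted for the shift only. Below is a minimal C++ sketch of that selection on its own, not part of the diff; the enumerators are stand-ins for the real LLAMA_ROPE_TYPE_* constants.

// Stand-alone sketch of the rope-type override used for the KV shift above.
#include <cstdio>

enum rope_type { ROPE_TYPE_NORM, ROPE_TYPE_NEOX, ROPE_TYPE_MROPE };

static rope_type shift_rope_type(rope_type model_rope_type) {
    // M-RoPE caches shift with a plain NEOX-ordered rotation; everything else is unchanged
    return model_rope_type == ROPE_TYPE_MROPE ? ROPE_TYPE_NEOX : model_rope_type;
}

int main() {
    std::printf("%d\n", (int) shift_rope_type(ROPE_TYPE_MROPE)); // prints 1 (NEOX)
}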
46 changes: 26 additions & 20 deletions tools/mtmd/CMakeLists.txt
@@ -1,48 +1,54 @@
# mtmd

# compile mtmd-audio separately to avoid long compile times with miniaudio.h
# TODO @ngxson : move miniaudio.h and stb_image.h to mtmd-helper.cpp, then compile the helper as a separate library
add_library(mtmd_audio STATIC mtmd-audio.cpp mtmd-audio.h)
if (BUILD_SHARED_LIBS)
set_target_properties(mtmd_audio PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
target_link_libraries(mtmd_audio PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(mtmd_audio PRIVATE cxx_std_17)
target_include_directories(mtmd_audio PRIVATE .)

add_library(mtmd OBJECT
mtmd.cpp
mtmd-helper.cpp
mtmd-audio.cpp
mtmd.h
clip.cpp
clip.h
clip-impl.h
)

target_link_libraries(mtmd PRIVATE ggml llama mtmd_audio ${CMAKE_THREAD_LIBS_INIT})

target_link_libraries(mtmd PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(mtmd PUBLIC .)
target_include_directories(mtmd PRIVATE ../..)
target_include_directories(mtmd PRIVATE ../../common) # for stb_image.h

target_compile_features(mtmd PRIVATE cxx_std_17)

add_library(mtmd_static STATIC $<TARGET_OBJECTS:mtmd>)
# compile the helper separately, to avoid long compile times with miniaudio.h and stb_image.h

add_library(mtmd_helper OBJECT
mtmd-helper.cpp
mtmd-helper.h
)

target_link_libraries(mtmd_helper PRIVATE ggml llama mtmd ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(mtmd_helper PUBLIC .)
target_include_directories(mtmd_helper PRIVATE ./vendor)
target_include_directories(mtmd_helper PRIVATE ../..)
target_compile_features(mtmd_helper PRIVATE cxx_std_17)

if (BUILD_SHARED_LIBS)
set_target_properties(mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(mtmd PRIVATE LLAMA_SHARED LLAMA_BUILD)
add_library(mtmd_shared SHARED $<TARGET_OBJECTS:mtmd>)
target_link_libraries(mtmd_shared PRIVATE ggml llama mtmd_audio ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(mtmd_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS mtmd_shared LIBRARY)

set_target_properties(mtmd_helper PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(mtmd_helper PRIVATE LLAMA_SHARED LLAMA_BUILD)
add_library(mtmd_helper_shared SHARED $<TARGET_OBJECTS:mtmd>)
target_link_libraries(mtmd_helper_shared PRIVATE ggml llama mtmd ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS mtmd_helper_shared LIBRARY)
endif()

if (NOT MSVC)
target_compile_options(mtmd PRIVATE -Wno-cast-qual) # stb_image.h
target_compile_options(mtmd_audio PRIVATE -Wno-cast-qual) # miniaudio.h
# for stb_image.h and miniaudio.h
target_compile_options(mtmd_helper PRIVATE -Wno-cast-qual)
endif()

if(TARGET BUILD_INFO)
add_dependencies(mtmd BUILD_INFO)
add_dependencies(mtmd_helper BUILD_INFO)
endif()

add_executable(llama-llava-cli deprecation-warning.cpp)
@@ -54,5 +60,5 @@ set(TARGET llama-mtmd-cli)
add_executable(${TARGET} mtmd-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common mtmd mtmd_helper ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)