
Commit 04b8f51

Merge branch 'master' into compilade/test-model-random
2 parents: 352703b + e434e69


61 files changed: +1701 −335 lines

.github/workflows/build.yml

Lines changed: 2 additions & 1 deletion

@@ -693,7 +693,7 @@ jobs:
           - build: 'openblas-x64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
           - build: 'vulkan-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
+            defines: '-DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
           - build: 'llvm-arm64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
           - build: 'llvm-arm64-opencl-adreno'
@@ -778,6 +778,7 @@ jobs:
           cmake -S . -B build ${{ matrix.defines }} `
             -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
           cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
+          cp $env:CURL_PATH/bin/libcurl-*.dll build/bin/Release

       - name: Add libopenblas.dll
         id: add_libopenblas_dll

ci/run.sh

Lines changed: 1 addition & 1 deletion

@@ -39,7 +39,7 @@ sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`

-CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=OFF"
+CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"

 if [ ! -z ${GG_BUILD_METAL} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"

common/arg.cpp

Lines changed: 3 additions & 6 deletions

@@ -988,10 +988,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         params.tensor_buft_overrides.push_back({nullptr, nullptr});
     }

-    if (params.reranking && params.embedding) {
-        throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
-    }
-
     if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
         throw std::runtime_error(string_format(
             "error: the supplied chat template is not supported: %s%s\n",
@@ -2747,9 +2743,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
     add_opt(common_arg(
         {"--reranking", "--rerank"},
-        string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
+        string_format("enable reranking endpoint on server (default: %s)", "disabled"),
         [](common_params & params) {
-            params.reranking = true;
+            params.embedding = true;
+            params.pooling_type = LLAMA_POOLING_TYPE_RANK;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
     add_opt(common_arg(
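
With this change, `--reranking` / `--rerank` no longer toggles a dedicated `params.reranking` flag; it enables embeddings and selects the RANK pooling type, and downstream code (see common/common.cpp below) detects reranking from the context's pooling type. A minimal sketch of the resulting checks, not part of the commit itself, assuming the model and context are created the usual way:

// Sketch only: how reranking is now inferred from common_params and the
// created context instead of a dedicated boolean flag.
#include "common.h"
#include "llama.h"

static bool reranking_requested(const common_params & params) {
    // after this commit, "--rerank" sets exactly these two fields
    return params.embedding && params.pooling_type == LLAMA_POOLING_TYPE_RANK;
}

static bool reranking_active(llama_context * lctx) {
    // common_init_from_params() keys its vocab sanity checks on the context's pooling type
    return llama_pooling_type(lctx) == LLAMA_POOLING_TYPE_RANK;
}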

common/chat-parser.cpp

Lines changed: 5 additions & 0 deletions

@@ -49,6 +49,7 @@ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::

     // LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
     result_.tool_calls.emplace_back(tool_call);
+
     return true;
 }
 bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
@@ -378,3 +379,7 @@ std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parse
         /* .is_partial = */ found_healing_marker,
     };
 }
+
+void common_chat_msg_parser::clear_tools() {
+    result_.tool_calls.clear();
+}

common/chat-parser.h

Lines changed: 2 additions & 0 deletions

@@ -115,4 +115,6 @@ class common_chat_msg_parser {
         const std::vector<std::vector<std::string>> & args_paths = {},
         const std::vector<std::vector<std::string>> & content_paths = {}
     );
+
+    void clear_tools();
 };

common/chat.cpp

Lines changed: 4 additions & 2 deletions

@@ -1838,7 +1838,7 @@ static common_chat_params common_chat_templates_apply_legacy(
     if (res < 0) {
         // if the custom "tmpl" is not supported, we throw an error
         // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
-        throw std::runtime_error("this custom template is not supported");
+        throw std::runtime_error("this custom template is not supported, try using --jinja");
     }

     // if it turns out that our buffer is too small, we resize it
@@ -1921,7 +1921,9 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
     } catch (const common_chat_msg_partial_exception & ex) {
         LOG_DBG("Partial parse: %s\n", ex.what());
         if (!is_partial) {
-            throw std::runtime_error(ex.what());
+            builder.clear_tools();
+            builder.move_to(0);
+            common_chat_parse_content_only(builder);
         }
     }
     auto msg = builder.result();
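
The second hunk changes what a caller of `common_chat_parse` observes when a complete (non-partial) response contains a malformed tool-call section: instead of an exception, the parser drops the partially collected tool calls, rewinds, and re-parses the input as plain content. A rough usage sketch, with the syntax object and the malformed input assumed purely for illustration:

// Sketch only: the common_chat_syntax setup and the broken model output are assumed.
#include "chat.h"
#include <string>

void example() {
    common_chat_syntax syntax;                    // format/template settings for a tool-call capable template, assumed configured elsewhere
    std::string raw = "<tool_call>{ \"name\": ";  // hypothetical truncated/invalid tool-call text

    // is_partial == false: previously the parse failure was rethrown as std::runtime_error;
    // now the parser falls back to content-only parsing.
    common_chat_msg msg = common_chat_parse(raw, /* is_partial = */ false, syntax);

    // msg.tool_calls ends up empty and msg.content carries the raw text instead.
}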

common/common.cpp

Lines changed: 41 additions & 37 deletions

@@ -767,6 +767,9 @@ bool fs_validate_filename(const std::string & filename) {
     return true;
 }

+#include <iostream>
+
+
 // returns true if successful, false otherwise
 bool fs_create_directory_with_parents(const std::string & path) {
 #ifdef _WIN32
@@ -784,9 +787,16 @@ bool fs_create_directory_with_parents(const std::string & path) {
     // process path from front to back, procedurally creating directories
     while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
         const std::wstring subpath = wpath.substr(0, pos_slash);
-        const wchar_t * test = subpath.c_str();

-        const bool success = CreateDirectoryW(test, NULL);
+        pos_slash += 1;
+
+        // skip the drive letter, in some systems it can return an access denied error
+        if (subpath.length() == 2 && subpath[1] == ':') {
+            continue;
+        }
+
+        const bool success = CreateDirectoryW(subpath.c_str(), NULL);
+
         if (!success) {
             const DWORD error = GetLastError();

@@ -800,8 +810,6 @@ bool fs_create_directory_with_parents(const std::string & path) {
                 return false;
             }
         }
-
-        pos_slash += 1;
     }

     return true;
@@ -897,34 +905,6 @@ struct common_init_result common_init_from_params(common_params & params) {

     const llama_vocab * vocab = llama_model_get_vocab(model);

-    if (params.reranking) {
-        bool ok = true;
-
-        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
-            ok = false;
-        }
-
-        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
-        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
-
-        if (!has_eos && !has_sep) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
-            ok = false;
-        } else if (!has_eos) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
-        } else if (!has_sep) {
-            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
-            ok = false;
-        }
-
-        if (!ok) {
-            llama_model_free(model);
-
-            return iparams;
-        }
-    }
-
     auto cparams = common_context_params_to_llama(params);

     llama_context * lctx = llama_init_from_model(model, cparams);
@@ -966,6 +946,35 @@ struct common_init_result common_init_from_params(common_params & params) {
         }
     }

+    if (llama_pooling_type(lctx) == LLAMA_POOLING_TYPE_RANK) {
+        bool ok = true;
+
+        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
+        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
+
+        if (!has_eos && !has_sep) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+            ok = false;
+        } else if (!has_eos) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
+        } else if (!has_sep) {
+            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        if (!ok) {
+            llama_free(lctx);
+            llama_model_free(model);
+
+            return iparams;
+        }
+    }
+
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
         llama_adapter_lora_ptr lora;
@@ -1143,11 +1152,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.op_offload = !params.no_op_offload;
     cparams.swa_full = params.swa_full;

-    if (params.reranking) {
-        cparams.embeddings = true;
-        cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
-    }
-
     cparams.type_k = params.cache_type_k;
     cparams.type_v = params.cache_type_v;

common/common.h

Lines changed: 0 additions & 1 deletion

@@ -355,7 +355,6 @@ struct common_params {
     int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
     std::string embd_sep = "\n"; // separator of embeddings
-    bool reranking = false; // enable reranking support on server

     // server params
     int32_t port = 8080; // server listens on this network port

convert_hf_to_gguf.py

Lines changed: 71 additions & 1 deletion

@@ -519,7 +519,7 @@ def prepare_metadata(self, vocab_only: bool):
     def set_gguf_parameters(self):
         self.gguf_writer.add_block_count(self.block_count)

-        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions"], optional=True)) is not None:
+        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length"], optional=True)) is not None:
             self.gguf_writer.add_context_length(n_ctx)
             logger.info(f"gguf: context length = {n_ctx}")

@@ -2020,6 +2020,20 @@ def prepare_tensors(self):
             raise ValueError(f"Unprocessed experts: {experts}")


+@ModelBase.register("ArceeForCausalLM")
+class ArceeModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.ARCEE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self._try_set_pooling_type()
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+
+
 @ModelBase.register(
     "LlavaForConditionalGeneration", # pixtral
     "Mistral3ForConditionalGeneration", # mistral small 3.1
@@ -4062,6 +4076,34 @@ def _is_tokenizer_xlmroberta(self) -> bool:
         raise ValueError(f"unknown tokenizer: {toktyp}")


+@ModelBase.register("NeoBERT", "NeoBERTLMHead", "NeoBERTForSequenceClassification")
+class NeoBert(BertModel):
+    model_arch = gguf.MODEL_ARCH.NEO_BERT
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        # NeoBERT uses 2/3 of the intermediate size as feed forward length
+        self.gguf_writer.add_feed_forward_length(int(2 * self.hparams["intermediate_size"] / 3))
+        self.gguf_writer.add_rope_freq_base(10000.0) # default value for NeoBERT
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+        f_rms_eps = self.hparams.get("norm_eps", 1e-6) # default value for NeoBERT
+        self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
+        logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
+
+        self.gguf_writer.add_pooling_type(gguf.PoolingType.CLS) # https://huggingface.co/chandar-lab/NeoBERT#how-to-use
+
+    def modify_tensors(self, data_torch, name, bid):
+        if name.startswith("decoder."):
+            return []
+
+        if name.startswith("model."):
+            name = name[6:]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
 class XLMRobertaModel(BertModel):
     model_arch = gguf.MODEL_ARCH.BERT
@@ -5262,6 +5304,34 @@ def prepare_tensors(self):
             raise ValueError(f"Unprocessed experts: {experts}")


+@ModelBase.register("Dots1ForCausalLM")
+class Dots1Model(Qwen2MoeModel):
+    model_arch = gguf.MODEL_ARCH.DOTS1
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.hparams["num_experts"] = self.hparams["n_routed_experts"]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"])
+        self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"])
+        self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
+        self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
+
+        if self.hparams["scoring_func"] == "noaux_tc":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+        else:
+            raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}")
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+        if "shared_experts" in name:
+            return [(self.map_tensor_name(name), data_torch)]
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("PLMForCausalLM")
 class PLMModel(TextModel):
     model_arch = gguf.MODEL_ARCH.PLM

docs/function-calling.md

Lines changed: 1 addition & 1 deletion

@@ -11,7 +11,7 @@ Function calling is supported for all models (see https://github.com/ggml-org/ll
 - Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2
 - Functionary v3.1 / v3.2
 - Hermes 2/3, Qwen 2.5
-- Qwen 2.5 Coder (WIP: https://github.com/ggml-org/llama.cpp/pull/12034)
+- Qwen 2.5 Coder
 - Mistral Nemo
 - Firefunction v2
 - Command R7B
