Commit a0e54cf

Merge branch 'master' into dev-refactoring

2 parents: beff5c4 + 13b4548

146 files changed: +11941 -8673 lines changed

.clang-tidy

Lines changed: 1 addition & 0 deletions

@@ -13,6 +13,7 @@ Checks: >
     -readability-magic-numbers,
     -readability-uppercase-literal-suffix,
     -readability-simplify-boolean-expr,
+    -readability-math-missing-parentheses,
     clang-analyzer-*,
     -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
     performance-*,
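For reference, `readability-math-missing-parentheses` warns when operators of different precedence are mixed without explicit grouping. A minimal illustration of the kind of expression the now-disabled check flags; the snippet is hypothetical, not from the repository:

// hypothetical snippet: readability-math-missing-parentheses would warn on
// the first statement, where '*' binds tighter than '+' with no grouping
int cost(int base, int count, int unit_price) {
    int total  = base + count * unit_price;   // warning: missing parentheses
    int total2 = base + (count * unit_price); // OK: precedence made explicit
    return total + total2;
}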

.github/workflows/build.yml

Lines changed: 3 additions & 2 deletions

@@ -601,8 +601,9 @@ jobs:
           -DGGML_SYCL_F16=ON
       cmake --build build --config Release -j $(nproc)

-  build-linux-cross:
-    uses: ./.github/workflows/build-linux-cross.yml
+  # Disabled for now due to sporadic issue syncing.
+  # build-linux-cross:
+  #   uses: ./.github/workflows/build-linux-cross.yml

   macOS-latest-cmake-ios:
     runs-on: macos-latest

README.md

Lines changed: 1 addition & 0 deletions

@@ -16,6 +16,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ## Hot topics

+- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli` and `gemma3-cli` https://github.com/ggml-org/llama.cpp/pull/13012, `libllava` will be deprecated
 - **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggml-org/llama.cpp/pull/11427
 - **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
 - Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639

SECURITY.md

Lines changed: 2 additions & 1 deletion

@@ -40,7 +40,8 @@ To protect sensitive data from potential leaks or unauthorized access, it is cru
 ### Untrusted environments or networks

 If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions:
-* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value
+* Do not use the RPC backend, [rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/examples/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/examples/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061).
+* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value.
 * Encrypt your data if sending it over the network.

 ### Multi-Tenant environments
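The hash precaution kept above is easy to automate. Below is a minimal sketch that checks a downloaded GGUF against a published SHA-256 digest using OpenSSL's EVP API; the file name and expected digest are placeholders, and this is an illustration rather than anything shipped in the repository:

// sha256_check.cpp -- compare a downloaded artifact against a known-good digest.
// Build (assuming OpenSSL is available): g++ sha256_check.cpp -lcrypto
#include <openssl/evp.h>
#include <cstdio>
#include <fstream>
#include <string>
#include <vector>

// stream the file through SHA-256 and return the digest as lowercase hex
static std::string sha256_hex(const std::string & path) {
    std::ifstream f(path, std::ios::binary);
    if (!f) return "";
    EVP_MD_CTX * ctx = EVP_MD_CTX_new();
    EVP_DigestInit_ex(ctx, EVP_sha256(), nullptr);
    std::vector<char> buf(1 << 16);
    while (f.read(buf.data(), buf.size()) || f.gcount() > 0) {
        EVP_DigestUpdate(ctx, buf.data(), f.gcount());
    }
    unsigned char md[EVP_MAX_MD_SIZE];
    unsigned int  len = 0;
    EVP_DigestFinal_ex(ctx, md, &len);
    EVP_MD_CTX_free(ctx);
    std::string out;
    char hex[3];
    for (unsigned int i = 0; i < len; i++) {
        snprintf(hex, sizeof(hex), "%02x", md[i]);
        out += hex;
    }
    return out;
}

int main() {
    // placeholders: the artifact you downloaded and the digest its publisher
    // lists as known-good
    const std::string path     = "model-q4_k_m.gguf";
    const std::string expected = "<known-good sha256 hex digest>";
    const std::string actual   = sha256_hex(path);
    if (actual != expected) {
        fprintf(stderr, "hash mismatch: do not use this file\n");
        return 1;
    }
    printf("hash matches known-good value\n");
    return 0;
}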

common/arg.cpp

Lines changed: 60 additions & 22 deletions

@@ -38,6 +38,11 @@

 using json = nlohmann::ordered_json;

+std::initializer_list<enum llama_example> mmproj_examples = {
+    LLAMA_EXAMPLE_LLAVA,
+    // TODO: add LLAMA_EXAMPLE_SERVER when it's ready
+};
+
 common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
     this->examples = std::move(examples);
     return *this;
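The new `mmproj_examples` list is shared below by the `--mmproj*` option registrations and by the download gate in `common_params_parse_ex`. A self-contained sketch of that pattern, with simplified names (illustrative only, not the commit's code):

#include <algorithm>
#include <cstdio>
#include <initializer_list>

enum llama_example { LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LLAVA };

// one list decides both which examples expose the mmproj options and
// whether the mmproj file should be downloaded for the running example
const std::initializer_list<llama_example> mmproj_examples = {
    LLAMA_EXAMPLE_LLAVA,
    // the diff leaves a TODO to add LLAMA_EXAMPLE_SERVER when it's ready
};

static bool uses_mmproj(llama_example ex) {
    return std::find(mmproj_examples.begin(), mmproj_examples.end(), ex)
           != mmproj_examples.end();
}

int main() {
    printf("llava:  %s\n", uses_mmproj(LLAMA_EXAMPLE_LLAVA)  ? "yes" : "no"); // yes
    printf("server: %s\n", uses_mmproj(LLAMA_EXAMPLE_SERVER) ? "yes" : "no"); // no
}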
@@ -641,11 +646,16 @@ static struct common_hf_file_res common_get_hf_file(const std::string &, const s
 // utils
 //

-static void common_params_handle_model(
+struct handle_model_result {
+    bool found_mmproj = false;
+    common_params_model mmproj;
+};
+
+static handle_model_result common_params_handle_model(
         struct common_params_model & model,
         const std::string & bearer_token,
-        const std::string & model_path_default,
-        bool is_mmproj = false) { // TODO: move is_mmproj to an enum when we have more files?
+        const std::string & model_path_default) {
+    handle_model_result result;
     // handle pre-fill default model path and url based on hf_repo and hf_file
     {
         if (!model.hf_repo.empty()) {
@@ -657,7 +667,12 @@ static void common_params_handle_model(
                 exit(1); // built without CURL, error message already printed
             }
             model.hf_repo = auto_detected.repo;
-            model.hf_file = is_mmproj ? auto_detected.mmprojFile : auto_detected.ggufFile;
+            model.hf_file = auto_detected.ggufFile;
+            if (!auto_detected.mmprojFile.empty()) {
+                result.found_mmproj   = true;
+                result.mmproj.hf_repo = model.hf_repo;
+                result.mmproj.hf_file = auto_detected.mmprojFile;
+            }
         } else {
             model.hf_file = model.path;
         }
@@ -694,6 +709,8 @@ static void common_params_handle_model(
             exit(1);
         }
     }
+
+    return result;
 }

 const std::vector<ggml_type> kv_cache_types = {
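The signature change above swaps the `is_mmproj` flag (and a second call for the projector) for a result struct: one call resolves the text model in place and reports any companion projector it finds. A stripped-down sketch of that shape, with illustrative stand-in types:

#include <cassert>
#include <string>

// illustrative stand-ins for common_params_model / handle_model_result
struct model_ref { std::string hf_repo, hf_file; };

struct resolve_result {
    bool      found_mmproj = false;
    model_ref mmproj;
};

// resolves `model` in place; reports a discovered projector via the result
// instead of taking an is_mmproj flag and being called twice
static resolve_result resolve_model(model_ref & model, const std::string & detected_mmproj_file) {
    resolve_result result;
    model.hf_file = "model.gguf"; // stand-in for the auto-detected GGUF
    if (!detected_mmproj_file.empty()) {
        result.found_mmproj   = true;
        result.mmproj.hf_repo = model.hf_repo;
        result.mmproj.hf_file = detected_mmproj_file;
    }
    return result;
}

int main() {
    model_ref model{ "user/model", "" };
    resolve_result res = resolve_model(model, "mmproj-f16.gguf");
    assert(res.found_mmproj && res.mmproj.hf_repo == "user/model");
}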
@@ -827,16 +844,25 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
     }

-    common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
-    common_params_handle_model(params.speculative.model, params.hf_token, "");
-    common_params_handle_model(params.vocoder.model, params.hf_token, "");
-
-    // allow --mmproj to be set from -hf
-    // assuming that mmproj is always in the same repo as text model
-    if (!params.model.hf_repo.empty() && ctx_arg.ex == LLAMA_EXAMPLE_LLAVA) {
-        params.mmproj.hf_repo = params.model.hf_repo;
+    // handle model and download
+    {
+        auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
+        if (params.no_mmproj) {
+            params.mmproj = {};
+        } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
+            // optionally, handle mmproj model when -hf is specified
+            params.mmproj = res.mmproj;
+        }
+        // only download mmproj if the current example is using it
+        for (auto & ex : mmproj_examples) {
+            if (ctx_arg.ex == ex) {
+                common_params_handle_model(params.mmproj, params.hf_token, "");
+                break;
+            }
+        }
+        common_params_handle_model(params.speculative.model, params.hf_token, "");
+        common_params_handle_model(params.vocoder.model, params.hf_token, "");
     }
-    common_params_handle_model(params.mmproj, params.hf_token, "", true);

     if (params.escape) {
         string_process_escapes(params.prompt);
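Condensed, the new block gives `--no-mmproj` top priority, lets an explicit `--mmproj`/`--mmproj-url` override the auto-detected file, and only then adopts the projector found next to the `-hf` model. A minimal sketch of that precedence, using hypothetical types:

#include <cassert>
#include <string>

struct proj_ref { std::string path, url, hf_repo, hf_file; };

// precedence distilled from the parsing block above:
// --no-mmproj > explicit --mmproj/--mmproj-url > projector found via -hf
static proj_ref resolve_mmproj(bool no_mmproj, const proj_ref & explicit_mmproj,
                               bool found_in_repo, const proj_ref & repo_mmproj) {
    if (no_mmproj) {
        return {}; // explicitly disabled: clear any projector
    }
    if (!explicit_mmproj.path.empty() || !explicit_mmproj.url.empty()) {
        return explicit_mmproj; // user-supplied file or URL wins
    }
    if (found_in_repo) {
        return repo_mmproj; // adopt the projector detected next to the -hf model
    }
    return {};
}

int main() {
    proj_ref repo{ "", "", "user/model", "mmproj-f16.gguf" };
    assert(resolve_mmproj(true,  {}, true, repo).hf_file.empty());              // disabled
    assert(resolve_mmproj(false, {}, true, repo).hf_file == "mmproj-f16.gguf"); // adopted
}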
@@ -968,28 +994,25 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
         "llama-embedding",
         "llama-eval-callback",
         "llama-export-lora",
-        "llama-gbnf-validator",
         "llama-gen-docs",
         "llama-gguf",
         "llama-gguf-hash",
         "llama-gguf-split",
         "llama-gritlm",
         "llama-imatrix",
         "llama-infill",
-        "llama-llava-cli",
+        "llama-mtmd-cli",
         "llama-llava-clip-quantize-cli",
         "llama-lookahead",
         "llama-lookup",
         "llama-lookup-create",
         "llama-lookup-merge",
         "llama-lookup-stats",
-        "llama-minicpmv-cli",
         "llama-parallel",
         "llama-passkey",
         "llama-perplexity",
         "llama-q8dot",
         "llama-quantize",
-        "llama-quantize-stats",
         "llama-qwen2vl-cli",
         "llama-retrieval",
         "llama-run",
@@ -2096,18 +2119,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
     add_opt(common_arg(
         {"--mmproj"}, "FILE",
-        "path to a multimodal projector file for LLaVA. see examples/llava/README.md",
+        "path to a multimodal projector file. see examples/llava/README.md",
         [](common_params & params, const std::string & value) {
             params.mmproj.path = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
+    ).set_examples(mmproj_examples));
     add_opt(common_arg(
         {"--mmproj-url"}, "URL",
-        "URL to a multimodal projector file for LLaVA. see examples/llava/README.md",
+        "URL to a multimodal projector file. see examples/llava/README.md",
         [](common_params & params, const std::string & value) {
             params.mmproj.url = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
+    ).set_examples(mmproj_examples));
+    add_opt(common_arg(
+        {"--no-mmproj"},
+        "explicitly disable multimodal projector, useful when using -hf",
+        [](common_params & params) {
+            params.no_mmproj = true;
+        }
+    ).set_examples(mmproj_examples));
+    add_opt(common_arg(
+        {"--no-mmproj-offload"},
+        "do not offload multimodal projector to GPU",
+        [](common_params & params) {
+            params.mmproj_use_gpu = false;
+        }
+    ).set_examples(mmproj_examples));
     add_opt(common_arg(
         {"--image"}, "FILE",
         "path to an image file. use with multimodal models. Specify multiple times for batching",
@@ -2382,6 +2419,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
         "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
+        "mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n"
         "example: unsloth/phi-4-GGUF:q4_k_m\n"
         "(default: unused)",
         [](common_params & params, const std::string & value) {
@@ -2726,7 +2764,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.chat_template = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LLAVA}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
     add_opt(common_arg(
         {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
         string_format(

common/common.h

Lines changed: 2 additions & 0 deletions

@@ -342,6 +342,8 @@ struct common_params {

     // multimodal models (see examples/llava)
     struct common_params_model mmproj;
+    bool mmproj_use_gpu = true;  // use GPU for multimodal model
+    bool no_mmproj = false;      // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)

     // embedding
