Commit 1606fef

Merge pull request #67 from menloresearch/update-dev-from-master-2025-04-25-00-08
Sync master with upstream release b5186
2 parents a977fc6 + 13be08d commit 1606fef

34 files changed: +488 −191 lines

.clang-tidy

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ Checks: >
     -readability-magic-numbers,
     -readability-uppercase-literal-suffix,
     -readability-simplify-boolean-expr,
+    -readability-math-missing-parentheses,
     clang-analyzer-*,
     -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
     performance-*,

common/arg.cpp

Lines changed: 58 additions & 19 deletions
@@ -38,6 +38,11 @@

 using json = nlohmann::ordered_json;

+std::initializer_list<enum llama_example> mmproj_examples = {
+    LLAMA_EXAMPLE_LLAVA,
+    // TODO: add LLAMA_EXAMPLE_SERVER when it's ready
+};
+
 common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
     this->examples = std::move(examples);
     return *this;
@@ -641,11 +646,16 @@ static struct common_hf_file_res common_get_hf_file(const std::string &, const s
 // utils
 //

-static void common_params_handle_model(
+struct handle_model_result {
+    bool found_mmproj = false;
+    common_params_model mmproj;
+};
+
+static handle_model_result common_params_handle_model(
         struct common_params_model & model,
         const std::string & bearer_token,
-        const std::string & model_path_default,
-        bool is_mmproj = false) { // TODO: move is_mmproj to an enum when we have more files?
+        const std::string & model_path_default) {
+    handle_model_result result;
     // handle pre-fill default model path and url based on hf_repo and hf_file
     {
         if (!model.hf_repo.empty()) {
@@ -657,7 +667,12 @@ static void common_params_handle_model(
                         exit(1); // built without CURL, error message already printed
                     }
                     model.hf_repo = auto_detected.repo;
-                    model.hf_file = is_mmproj ? auto_detected.mmprojFile : auto_detected.ggufFile;
+                    model.hf_file = auto_detected.ggufFile;
+                    if (!auto_detected.mmprojFile.empty()) {
+                        result.found_mmproj = true;
+                        result.mmproj.hf_repo = model.hf_repo;
+                        result.mmproj.hf_file = auto_detected.mmprojFile;
+                    }
                 } else {
                     model.hf_file = model.path;
                 }
@@ -694,6 +709,8 @@ static void common_params_handle_model(
            exit(1);
        }
    }
+
+    return result;
 }

 const std::vector<ggml_type> kv_cache_types = {
@@ -827,16 +844,25 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
    }

-    common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
-    common_params_handle_model(params.speculative.model, params.hf_token, "");
-    common_params_handle_model(params.vocoder.model, params.hf_token, "");
-
-    // allow --mmproj to be set from -hf
-    // assuming that mmproj is always in the same repo as text model
-    if (!params.model.hf_repo.empty() && ctx_arg.ex == LLAMA_EXAMPLE_LLAVA) {
-        params.mmproj.hf_repo = params.model.hf_repo;
+    // handle model and download
+    {
+        auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
+        if (params.no_mmproj) {
+            params.mmproj = {};
+        } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
+            // optionally, handle mmproj model when -hf is specified
+            params.mmproj = res.mmproj;
+        }
+        // only download mmproj if the current example is using it
+        for (auto & ex : mmproj_examples) {
+            if (ctx_arg.ex == ex) {
+                common_params_handle_model(params.mmproj, params.hf_token, "");
+                break;
+            }
+        }
+        common_params_handle_model(params.speculative.model, params.hf_token, "");
+        common_params_handle_model(params.vocoder.model, params.hf_token, "");
    }
-    common_params_handle_model(params.mmproj, params.hf_token, "", true);

    if (params.escape) {
        string_process_escapes(params.prompt);
@@ -968,7 +994,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
        "llama-embedding",
        "llama-eval-callback",
        "llama-export-lora",
-        "llama-gbnf-validator",
        "llama-gen-docs",
        "llama-gguf",
        "llama-gguf-hash",
@@ -988,7 +1013,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
        "llama-perplexity",
        "llama-q8dot",
        "llama-quantize",
-        "llama-quantize-stats",
        "llama-qwen2vl-cli",
        "llama-retrieval",
        "llama-run",
@@ -2095,18 +2119,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
    add_opt(common_arg(
        {"--mmproj"}, "FILE",
-        "path to a multimodal projector file for LLaVA. see examples/llava/README.md",
+        "path to a multimodal projector file. see examples/llava/README.md",
        [](common_params & params, const std::string & value) {
            params.mmproj.path = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
+    ).set_examples(mmproj_examples));
    add_opt(common_arg(
        {"--mmproj-url"}, "URL",
-        "URL to a multimodal projector file for LLaVA. see examples/llava/README.md",
+        "URL to a multimodal projector file. see examples/llava/README.md",
        [](common_params & params, const std::string & value) {
            params.mmproj.url = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
+    ).set_examples(mmproj_examples));
+    add_opt(common_arg(
+        {"--no-mmproj"},
+        "explicitly disable multimodal projector, useful when using -hf",
+        [](common_params & params) {
+            params.no_mmproj = true;
+        }
+    ).set_examples(mmproj_examples));
+    add_opt(common_arg(
+        {"--no-mmproj-offload"},
+        "do not offload multimodal projector to GPU",
+        [](common_params & params) {
+            params.mmproj_use_gpu = false;
+        }
+    ).set_examples(mmproj_examples));
    add_opt(common_arg(
        {"--image"}, "FILE",
        "path to an image file. use with multimodal models. Specify multiple times for batching",
@@ -2381,6 +2419,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    add_opt(common_arg(
        {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
        "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
+        "mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n"
        "example: unsloth/phi-4-GGUF:q4_k_m\n"
        "(default: unused)",
        [](common_params & params, const std::string & value) {
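
In short, a single -hf download can now supply both the text model and its projector: the text-model handler reports any mmproj it auto-detects, --no-mmproj overrides it, and the projector is only fetched for examples listed in mmproj_examples. A condensed sketch of that flow, simplified from the hunks above (illustration only, error handling omitted):

    // resolve the text model; the handler reports any auto-detected mmproj
    handle_model_result res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);

    // --no-mmproj wins; otherwise adopt the detected projector unless the user
    // already passed --mmproj or --mmproj-url
    if (params.no_mmproj) {
        params.mmproj = {};
    } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
        params.mmproj = res.mmproj;
    }

    // download the projector only for examples that actually use it
    for (auto & ex : mmproj_examples) {
        if (ctx_arg.ex == ex) {
            common_params_handle_model(params.mmproj, params.hf_token, "");
            break;
        }
    }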

common/common.h

Lines changed: 2 additions & 0 deletions
@@ -342,6 +342,8 @@ struct common_params {

     // multimodal models (see examples/llava)
     struct common_params_model mmproj;
+    bool mmproj_use_gpu = true; // use GPU for multimodal model
+    bool no_mmproj = false;     // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)

     // embedding

examples/CMakeLists.txt

Lines changed: 0 additions & 9 deletions
@@ -21,11 +21,6 @@ else()
     add_subdirectory(embedding)
     add_subdirectory(eval-callback)

-    if (NOT WIN32)
-        # disabled on Windows because it uses internal functions not exported with LLAMA_API
-        add_subdirectory(gbnf-validator)
-    endif()
-
     add_subdirectory(gguf-hash)
     add_subdirectory(gguf-split)
     add_subdirectory(gguf)
@@ -58,10 +53,6 @@ else()
     add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(cvector-generator)
     add_subdirectory(export-lora)
-    if (NOT WIN32)
-        # disabled on Windows because it uses internal functions not exported with LLAMA_API
-        add_subdirectory(quantize-stats)
-    endif()
     add_subdirectory(llava)
     if (GGML_RPC)
         add_subdirectory(rpc)

examples/embedding/embedding.cpp

Lines changed: 7 additions & 1 deletion
@@ -89,6 +89,13 @@ int main(int argc, char ** argv) {
     common_init();

     params.embedding = true;
+
+    // utilize the full context
+    if (params.n_batch < params.n_ctx) {
+        LOG_WRN("%s: setting batch size to %d\n", __func__, params.n_ctx);
+        params.n_batch = params.n_ctx;
+    }
+
     // For non-causal models, batch size must be equal to ubatch size
     params.n_ubatch = params.n_batch;

@@ -134,7 +141,6 @@ int main(int argc, char ** argv) {

     // max batch size
     const uint64_t n_batch = params.n_batch;
-    GGML_ASSERT(params.n_batch >= params.n_ctx);

     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
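
The effect of this change: rather than asserting that the batch is at least as large as the context, the embedding example now widens the batch so a full context-length prompt fits in one pass. A minimal standalone sketch of the same clamp (hypothetical values, not taken from the commit):

    int n_ctx   = 4096;
    int n_batch = 2048;      // smaller than the context
    if (n_batch < n_ctx) {
        n_batch = n_ctx;     // widen the batch to cover the full context
    }
    int n_ubatch = n_batch;  // non-causal embedding models require ubatch == batch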

examples/gbnf-validator/CMakeLists.txt

Lines changed: 0 additions & 5 deletions
This file was deleted.

examples/llava/clip-impl.h

Lines changed: 0 additions & 2 deletions
@@ -90,8 +90,6 @@
 #define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
 #define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
 #define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
-#define TN_GLM_BOI_W "adapter.boi"
-#define TN_GLM_EOI_W "adapter.eoi"

 enum projector_type {
     PROJECTOR_TYPE_MLP,

examples/llava/clip.cpp

Lines changed: 1 addition & 16 deletions
@@ -244,8 +244,6 @@ struct clip_vision_model {
     //GLMV-Edge projection
     struct ggml_tensor * mm_model_adapter_conv_w = nullptr;
     struct ggml_tensor * mm_model_adapter_conv_b = nullptr;
-    struct ggml_tensor * boi_w = nullptr;
-    struct ggml_tensor * eoi_w = nullptr;

     // MobileVLM projection
     struct ggml_tensor * mm_model_mlp_1_w = nullptr;
@@ -1697,8 +1695,6 @@ struct clip_model_loader {
                     vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H,"weight"));
                     vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE,"weight"));
                     vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
-                    vision_model.boi_w = get_tensor(TN_GLM_BOI_W);
-                    vision_model.eoi_w = get_tensor(TN_GLM_EOI_W);
                 } break;
             case PROJECTOR_TYPE_MERGER:
                 {
@@ -2593,8 +2589,7 @@ void clip_free(clip_ctx * ctx) {
 }

 size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
-    int extra_tokens = ctx->has_glm_projector ? 2 : 0;
-    return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float);
+    return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }

 size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
@@ -2790,9 +2785,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     }
     if (ctx->has_glm_projector) {
         GGML_ASSERT(batch_size == 1);
-        ggml_tensor * boi = ctx->vision_model.boi_w;
-        ggml_backend_tensor_get(boi,vec,0,ggml_nbytes(boi));
-        vec = (float*)(vec+ggml_nelements(boi)); //offset for boi
     }

     // build the inference graph
@@ -3001,13 +2993,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     // copy the embeddings to the location passed by the user
     ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));

-    if (ctx->has_glm_projector) {
-        //eoi
-        ggml_tensor * eoi = ctx->vision_model.eoi_w;
-        int offset = ggml_nelements(embeddings);
-        ggml_backend_tensor_get(eoi, vec+offset, 0, ggml_nbytes(eoi));
-    }
-
     return true;
 }

examples/llava/mtmd-cli.cpp

Lines changed: 5 additions & 3 deletions
@@ -40,7 +40,8 @@ static void show_additional_info(int /*argc*/, char ** argv) {
         "Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> -p <prompt>\n\n"
         " -m and --mmproj are required\n"
         " -hf user/repo can replace both -m and --mmproj in most cases\n"
-        " --image and -p are optional, if NOT provided, the CLI will run in chat mode\n",
+        " --image and -p are optional, if NOT provided, the CLI will run in chat mode\n"
+        " to disable using GPU for mmproj model, add --no-mmproj-offload\n",
         argv[0]
     );
 }
@@ -112,10 +113,10 @@ struct mtmd_cli_context {
     void init_vision_context(common_params & params) {
         const char * clip_path = params.mmproj.path.c_str();
         ctx_vision.reset(mtmd_init_from_file(clip_path, model, mtmd_context_params{
-            /* use_gpu */ true,
+            /* use_gpu */ params.mmproj_use_gpu,
             /* timings */ true,
             /* n_threads */ params.cpuparams.n_threads,
-            /* verbosity */ GGML_LOG_LEVEL_INFO,
+            /* verbosity */ params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO,
         }));
         if (!ctx_vision.get()) {
             LOG_ERR("Failed to load vision model from %s\n", clip_path);
@@ -261,6 +262,7 @@ int main(int argc, char ** argv) {

     if (params.mmproj.path.empty()) {
         show_additional_info(argc, argv);
+        LOG_ERR("ERR: Missing --mmproj argument\n");
         return 1;
     }

examples/llava/mtmd.cpp

Lines changed: 5 additions & 0 deletions
@@ -186,6 +186,11 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
         marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
         string_replace_all(prompt_modified, ctx->image_marker, marker_modified);

+    } else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) {
+        // <|begin_of_image|> ... (image embeddings) ... <|end_of_image|>
+        marker_modified = "<|begin_of_image|>" + ctx->image_marker + "<|end_of_image|>";
+        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+
     } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
         // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
         marker_modified = "<fake_token_around_image><global-img>" + ctx->image_marker + "<fake_token_around_image>";