
Commit 19b9fe1

Merge branch 'master' into xsn/server_mtmd
2 parents: 989730c + 87616f0

30 files changed (+499, -181 lines)

.clang-tidy

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ Checks: >
     -readability-magic-numbers,
     -readability-uppercase-literal-suffix,
     -readability-simplify-boolean-expr,
+    -readability-math-missing-parentheses,
     clang-analyzer-*,
     -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
     performance-*,
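Note: readability-math-missing-parentheses warns whenever operators of different precedence are mixed without explicit parentheses, which tends to be noisy in math-heavy code and is presumably why it is disabled here. A hypothetical snippet, not from the repository, showing the kind of expression the now-disabled check flags:

static int scaled_sum(int a, int b, int c) {
    return a + b * c; // the check would suggest writing a + (b * c)
}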

common/arg.cpp

Lines changed: 58 additions & 21 deletions
@@ -38,6 +38,11 @@
 
 using json = nlohmann::ordered_json;
 
+std::initializer_list<enum llama_example> mmproj_examples = {
+    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_SERVER,
+};
+
 common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
     this->examples = std::move(examples);
     return *this;
@@ -641,11 +646,16 @@ static struct common_hf_file_res common_get_hf_file(const std::string &, const s
 // utils
 //
 
-static void common_params_handle_model(
+struct handle_model_result {
+    bool found_mmproj = false;
+    common_params_model mmproj;
+};
+
+static handle_model_result common_params_handle_model(
         struct common_params_model & model,
         const std::string & bearer_token,
-        const std::string & model_path_default,
-        bool is_mmproj = false) { // TODO: move is_mmproj to an enum when we have more files?
+        const std::string & model_path_default) {
+    handle_model_result result;
     // handle pre-fill default model path and url based on hf_repo and hf_file
     {
         if (!model.hf_repo.empty()) {
@@ -657,7 +667,12 @@ static void common_params_handle_model(
                 exit(1); // built without CURL, error message already printed
             }
             model.hf_repo = auto_detected.repo;
-            model.hf_file = is_mmproj ? auto_detected.mmprojFile : auto_detected.ggufFile;
+            model.hf_file = auto_detected.ggufFile;
+            if (!auto_detected.mmprojFile.empty()) {
+                result.found_mmproj = true;
+                result.mmproj.hf_repo = model.hf_repo;
+                result.mmproj.hf_file = auto_detected.mmprojFile;
+            }
         } else {
             model.hf_file = model.path;
         }
@@ -694,6 +709,8 @@ static void common_params_handle_model(
             exit(1);
         }
     }
+
+    return result;
 }
 
 const std::vector<ggml_type> kv_cache_types = {
@@ -827,18 +844,25 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
     }
 
-    common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
-    common_params_handle_model(params.speculative.model, params.hf_token, "");
-    common_params_handle_model(params.vocoder.model, params.hf_token, "");
-
-    // allow --mmproj to be set from -hf
-    // assuming that mmproj is always in the same repo as text model
-    if (!params.model.hf_repo.empty() && (
-        ctx_arg.ex == LLAMA_EXAMPLE_LLAVA || ctx_arg.ex == LLAMA_EXAMPLE_SERVER)) {
-        params.mmproj.hf_repo = params.model.hf_repo;
+    // handle model and download
+    {
+        auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
+        if (params.no_mmproj) {
+            params.mmproj = {};
+        } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
+            // optionally, handle mmproj model when -hf is specified
+            params.mmproj = res.mmproj;
+        }
+        // only download mmproj if the current example is using it
+        for (auto & ex : mmproj_examples) {
+            if (ctx_arg.ex == ex) {
+                common_params_handle_model(params.mmproj, params.hf_token, "");
+                break;
+            }
+        }
+        common_params_handle_model(params.speculative.model, params.hf_token, "");
+        common_params_handle_model(params.vocoder.model, params.hf_token, "");
     }
-    // TODO @ngxson : this will break non-vision model with -hf, need to fix before merging
-    common_params_handle_model(params.mmproj, params.hf_token, "", true);
 
     if (params.escape) {
         string_process_escapes(params.prompt);
@@ -970,7 +994,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
         "llama-embedding",
         "llama-eval-callback",
         "llama-export-lora",
-        "llama-gbnf-validator",
         "llama-gen-docs",
         "llama-gguf",
         "llama-gguf-hash",
@@ -990,7 +1013,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
         "llama-perplexity",
         "llama-q8dot",
         "llama-quantize",
-        "llama-quantize-stats",
        "llama-qwen2vl-cli",
        "llama-retrieval",
        "llama-run",
@@ -2097,18 +2119,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
     add_opt(common_arg(
         {"--mmproj"}, "FILE",
-        "path to a multimodal projector file for LLaVA. see examples/llava/README.md",
+        "path to a multimodal projector file. see examples/llava/README.md",
         [](common_params & params, const std::string & value) {
             params.mmproj.path = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_LLAVA, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples(mmproj_examples));
     add_opt(common_arg(
         {"--mmproj-url"}, "URL",
-        "URL to a multimodal projector file for LLaVA. see examples/llava/README.md",
+        "URL to a multimodal projector file. see examples/llava/README.md",
         [](common_params & params, const std::string & value) {
             params.mmproj.url = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_LLAVA, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples(mmproj_examples));
+    add_opt(common_arg(
+        {"--no-mmproj"},
+        "explicitly disable multimodal projector, useful when using -hf",
+        [](common_params & params) {
+            params.no_mmproj = true;
+        }
+    ).set_examples(mmproj_examples));
+    add_opt(common_arg(
+        {"--no-mmproj-offload"},
+        "do not offload multimodal projector to GPU",
+        [](common_params & params) {
+            params.mmproj_use_gpu = false;
+        }
+    ).set_examples(mmproj_examples));
     add_opt(common_arg(
         {"--image"}, "FILE",
         "path to an image file. use with multimodal models. Specify multiple times for batching",
@@ -2383,6 +2419,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
         "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
+        "mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n"
        "example: unsloth/phi-4-GGUF:q4_k_m\n"
        "(default: unused)",
        [](common_params & params, const std::string & value) {
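Taken together, the arg.cpp hunks rework the -hf flow: common_params_handle_model() now returns a handle_model_result, any mmproj file detected in the repo is recorded there, and the projector is only downloaded when the current example appears in mmproj_examples. A hedged sketch of how a caller might consume the resulting fields after parsing (load_projector is a hypothetical helper, not part of this commit):

static void maybe_load_projector(const common_params & params) {
    if (params.no_mmproj || params.mmproj.path.empty()) {
        return; // text-only run: --no-mmproj was given, or no projector was found
    }
    // mmproj.path is filled from --mmproj/--mmproj-url, or auto-detected via -hf
    load_projector(params.mmproj.path.c_str(), /*use_gpu=*/params.mmproj_use_gpu);
}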

common/common.h

Lines changed: 2 additions & 0 deletions
@@ -342,6 +342,8 @@ struct common_params {
 
     // multimodal models (see examples/llava)
     struct common_params_model mmproj;
+    bool mmproj_use_gpu = true; // use GPU for multimodal model
+    bool no_mmproj = false;     // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)
 
     // embedding

examples/CMakeLists.txt

Lines changed: 0 additions & 9 deletions
@@ -21,11 +21,6 @@ else()
     add_subdirectory(embedding)
     add_subdirectory(eval-callback)
 
-    if (NOT WIN32)
-        # disabled on Windows because it uses internal functions not exported with LLAMA_API
-        add_subdirectory(gbnf-validator)
-    endif()
-
     add_subdirectory(gguf-hash)
     add_subdirectory(gguf-split)
     add_subdirectory(gguf)
@@ -58,10 +53,6 @@ else()
     add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(cvector-generator)
     add_subdirectory(export-lora)
-    if (NOT WIN32)
-        # disabled on Windows because it uses internal functions not exported with LLAMA_API
-        add_subdirectory(quantize-stats)
-    endif()
     add_subdirectory(llava)
     if (GGML_RPC)
         add_subdirectory(rpc)

examples/gbnf-validator/CMakeLists.txt

Lines changed: 0 additions & 5 deletions
This file was deleted.

examples/llava/mtmd-cli.cpp

Lines changed: 29 additions & 10 deletions
@@ -24,7 +24,9 @@
 #include <signal.h>
 #endif
 
-static bool g_is_generating = false;
+// volatile, because of signal being an interrupt
+static volatile bool g_is_generating = false;
+static volatile bool g_is_interrupted = false;
 
 /**
  * Please note that this is NOT a production-ready stuff.
@@ -38,7 +40,8 @@ static void show_additional_info(int /*argc*/, char ** argv) {
         "Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> -p <prompt>\n\n"
         "  -m and --mmproj are required\n"
         "  -hf user/repo can replace both -m and --mmproj in most cases\n"
-        "  --image and -p are optional, if NOT provided, the CLI will run in chat mode\n",
+        "  --image and -p are optional, if NOT provided, the CLI will run in chat mode\n"
+        "  to disable using GPU for mmproj model, add --no-mmproj-offload\n",
         argv[0]
     );
 }
@@ -50,8 +53,10 @@ static void sigint_handler(int signo) {
             g_is_generating = false;
         } else {
             console::cleanup();
-            LOG("\nInterrupted by user\n");
-            _exit(130);
+            if (g_is_interrupted) {
+                _exit(1);
+            }
+            g_is_interrupted = true;
         }
     }
 }
@@ -108,10 +113,10 @@ struct mtmd_cli_context {
     void init_vision_context(common_params & params) {
         const char * clip_path = params.mmproj.path.c_str();
         ctx_vision.reset(mtmd_init_from_file(clip_path, model, mtmd_context_params{
-            /* use_gpu */   true,
+            /* use_gpu */   params.mmproj_use_gpu,
             /* timings */   true,
             /* n_threads */ params.cpuparams.n_threads,
-            /* verbosity */ GGML_LOG_LEVEL_INFO,
+            /* verbosity */ params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO,
         }));
         if (!ctx_vision.get()) {
             LOG_ERR("Failed to load vision model from %s\n", clip_path);
@@ -167,7 +172,7 @@ struct decode_embd_batch {
 static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int n_predict) {
     llama_tokens generated_tokens;
     for (int i = 0; i < n_predict; i++) {
-        if (i > n_predict || !g_is_generating) {
+        if (i > n_predict || !g_is_generating || g_is_interrupted) {
             printf("\n");
             break;
         }
@@ -184,6 +189,11 @@ static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int
         printf("%s", common_token_to_piece(ctx.lctx, token_id).c_str());
         fflush(stdout);
 
+        if (g_is_interrupted) {
+            printf("\n");
+            break;
+        }
+
         // eval the token
         common_batch_clear(ctx.batch);
         common_batch_add(ctx.batch, token_id, ctx.n_past++, {0}, true);
@@ -219,6 +229,9 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, std::vect
     text.add_special = add_bos;
     text.parse_special = true;
     mtmd_input_chunks chunks;
+
+    if (g_is_interrupted) return 0;
+
     int32_t res = mtmd_tokenize(ctx.ctx_vision.get(), chunks, text, bitmaps);
     if (res != 0) {
         LOG_ERR("Unable to tokenize prompt, res = %d\n", res);
@@ -249,6 +262,7 @@ int main(int argc, char ** argv) {
 
     if (params.mmproj.path.empty()) {
         show_additional_info(argc, argv);
+        LOG_ERR("ERR: Missing --mmproj argument\n");
         return 1;
     }
 
@@ -276,6 +290,8 @@ int main(int argc, char ** argv) {
 #endif
     }
 
+    if (g_is_interrupted) return 130;
+
     if (is_single_turn) {
         g_is_generating = true;
         if (params.prompt.find("<__image__>") == std::string::npos) {
@@ -287,7 +303,7 @@ int main(int argc, char ** argv) {
         if (eval_message(ctx, msg, params.image, true)) {
             return 1;
         }
-        if (generate_response(ctx, smpl, n_predict)) {
+        if (!g_is_interrupted && generate_response(ctx, smpl, n_predict)) {
            return 1;
        }
 
@@ -302,12 +318,13 @@ int main(int argc, char ** argv) {
         std::vector<std::string> images_fname;
         std::string content;
 
-        while (true) {
+        while (!g_is_interrupted) {
             g_is_generating = false;
             LOG("\n> ");
             console::set_display(console::user_input);
             std::string line;
             console::readline(line, false);
+            if (g_is_interrupted) break;
             console::set_display(console::reset);
             line = string_strip(line);
             if (line.empty()) {
@@ -335,6 +352,7 @@ int main(int argc, char ** argv) {
             msg.role = "user";
             msg.content = content;
             int ret = eval_message(ctx, msg, images_fname, is_first_msg);
+            if (g_is_interrupted) break;
             if (ret == 2) {
                 // non-fatal error
                 images_fname.clear();
@@ -352,6 +370,7 @@ int main(int argc, char ** argv) {
             is_first_msg = false;
         }
     }
+    if (g_is_interrupted) LOG("\nInterrupted by user\n");
     llama_perf_context_print(ctx.lctx);
-    return 0;
+    return g_is_interrupted ? 130 : 0;
 }
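The mtmd-cli.cpp changes replace the immediate _exit(130) on Ctrl-C with a volatile interrupt flag that the generation and chat loops poll, so the first interrupt shuts down cleanly and a second one exits at once. A minimal standalone sketch of the same pattern, assuming plain std::signal and none of the project's console handling (it uses sig_atomic_t, the portable counterpart of the volatile bool in the diff):

#include <csignal>
#include <cstdio>
#include <cstdlib>

static volatile std::sig_atomic_t g_interrupted = 0;

static void on_sigint(int /*signo*/) {
    if (g_interrupted) {
        std::_Exit(1);  // second Ctrl-C: give up without cleanup
    }
    g_interrupted = 1;  // first Ctrl-C: let the main loop wind down
}

int main() {
    std::signal(SIGINT, on_sigint);
    while (!g_interrupted) {
        // do one unit of work, checking g_interrupted between steps
    }
    std::printf("\nInterrupted by user\n");
    return 130;         // conventional exit status for SIGINT
}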

examples/quantize-stats/CMakeLists.txt

Lines changed: 0 additions & 6 deletions
This file was deleted.

ggml/include/ggml.h

Lines changed: 21 additions & 1 deletion
@@ -481,6 +481,7 @@ extern "C" {
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
         GGML_OP_IM2COL_BACK,
+        GGML_OP_CONV_2D_DW,
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
@@ -677,6 +678,9 @@ extern "C" {
     GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
     GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
 
+    // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
+    GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
+
     GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
     GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
@@ -1660,7 +1664,7 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
-    // depthwise
+    // depthwise (via im2col and mul_mat)
     GGML_API struct ggml_tensor * ggml_conv_2d_dw(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,  // convolution kernel
@@ -1672,6 +1676,22 @@ extern "C" {
             int                   d0,  // dilation dimension 0
             int                   d1); // dilation dimension 1
 
+    // Depthwise 2D convolution
+    // may be faster than ggml_conv_2d_dw, but not available in all backends
+    // a:   KW    KH    1    C    convolution kernel
+    // b:   W     H     C    N    input data
+    // res: W_out H_out C    N
+    GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   stride0,
+            int                   stride1,
+            int                   pad0,
+            int                   pad1,
+            int                   dilation0,
+            int                   dilation1);
+
     GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
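The new ggml_conv_2d_dw_direct entry point takes the kernel and input in the layouts spelled out in the header comment above. A hedged usage sketch (not from this commit) that only builds the op; context setup with no_alloc and running the graph on a backend are omitted:

#include "ggml.h"

// kernel: KW x KH x 1 x C, input: W x H x C x N (ne[0] is the fastest dimension)
static struct ggml_tensor * dw_conv_3x3(struct ggml_context * ctx,
                                        int64_t W, int64_t H, int64_t C, int64_t N) {
    struct ggml_tensor * kernel = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 3, 3, 1, C);
    struct ggml_tensor * input  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, W, H, C, N);
    // stride 1, padding 1, dilation 1 keeps the spatial size at W x H
    return ggml_conv_2d_dw_direct(ctx, kernel, input,
                                  /*stride0=*/1,   /*stride1=*/1,
                                  /*pad0=*/1,      /*pad1=*/1,
                                  /*dilation0=*/1, /*dilation1=*/1);
}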
