715 changes: 644 additions & 71 deletions common/arg.cpp

Large diffs are not rendered by default.

533 changes: 14 additions & 519 deletions common/common.cpp

Large diffs are not rendered by default.

49 changes: 13 additions & 36 deletions common/common.h
@@ -121,10 +121,6 @@ struct common_grammar_trigger {
common_grammar_trigger_type type;
std::string value;
llama_token token = LLAMA_TOKEN_NULL;

// T can only be nlohmann::ordered_json
template <class T> T to_json() const;
template <class T> static common_grammar_trigger from_json(const T & in);
};

// sampling parameters
@@ -184,6 +180,13 @@ struct common_params_sampling {
std::string print() const;
};

struct common_params_model {
std::string path = ""; // model local path // NOLINT
std::string url = ""; // model url to download // NOLINT
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
};

struct common_params_speculative {
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

@@ -197,19 +200,11 @@ struct common_params_speculative {
struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;

std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT

std::string model = ""; // draft model for speculative decoding // NOLINT
std::string model_url = ""; // model url to download // NOLINT
struct common_params_model model;
};

struct common_params_vocoder {
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT

std::string model = ""; // model path // NOLINT
std::string model_url = ""; // model url to download // NOLINT
struct common_params_model model;

std::string speaker_file = ""; // speaker file path // NOLINT

@@ -267,12 +262,10 @@ struct common_params {
struct common_params_speculative speculative;
struct common_params_vocoder vocoder;

std::string model = ""; // model path // NOLINT
struct common_params_model model;

std::string model_alias = ""; // model alias // NOLINT
std::string model_url = ""; // model url to download // NOLINT
std::string hf_token = ""; // HF token // NOLINT
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string prompt = ""; // NOLINT
std::string system_prompt = ""; // NOLINT
std::string prompt_file = ""; // store the external prompt file name // NOLINT
@@ -286,6 +279,7 @@ struct common_params {
std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides;
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
@@ -347,7 +341,7 @@ struct common_params {
common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;

// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector // NOLINT
struct common_params_model mmproj;
std::vector<std::string> image; // path to image file(s)

// embedding
@@ -546,23 +540,6 @@ struct llama_model_params common_model_params_to_llama ( common_params
struct llama_context_params common_context_params_to_llama(const common_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

struct llama_model * common_load_model_from_url(
const std::string & model_url,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params);

struct llama_model * common_load_model_from_hf(
const std::string & repo,
const std::string & remote_path,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params);

std::pair<std::string, std::string> common_get_hf_file(
const std::string & hf_repo_with_tag,
const std::string & hf_token);

// clear LoRA adapters from context, then apply new list of adapters
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);

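The net effect of the `common/common.h` changes: the model-source strings that were previously duplicated across `common_params`, `common_params_speculative`, and `common_params_vocoder` (plus the plain `mmproj` path) are consolidated into a single `common_params_model` struct, and the standalone loader declarations (`common_load_model_from_url`, `common_load_model_from_hf`, `common_get_hf_file`) disappear from the public header, with download handling presumably absorbed into the reworked `common/arg.cpp` above. A minimal sketch of what the consolidation buys; `describe_source` is a hypothetical helper for illustration, not part of the PR:

```cpp
#include <cstdio>
#include <string>

// Copied from the diff above (comments abbreviated).
struct common_params_model {
    std::string path;     // model local path
    std::string url;      // model url to download
    std::string hf_repo;  // HF repo
    std::string hf_file;  // HF file
};

// Hypothetical helper: with one struct per loadable artifact, a single
// function can describe the source of the main model (params.model), the
// draft model (params.speculative.model), the vocoder (params.vocoder.model),
// or the multimodal projector (params.mmproj).
static std::string describe_source(const common_params_model & m) {
    if (!m.hf_repo.empty()) {
        return "hf:" + m.hf_repo + "/" + m.hf_file;
    }
    if (!m.url.empty()) {
        return "url:" + m.url;
    }
    return "path:" + m.path;
}

int main() {
    common_params_model model;
    model.hf_repo = "ggml-org/gemma-3-4b-it-GGUF"; // repo name reused from the README below
    std::printf("%s\n", describe_source(model).c_str());
    return 0;
}
```

Call sites change mechanically: every `params.model.c_str()` in the examples below becomes `params.model.path.c_str()`.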
2 changes: 1 addition & 1 deletion examples/batched-bench/batched-bench.cpp
@@ -38,7 +38,7 @@ int main(int argc, char ** argv) {

llama_model_params model_params = common_model_params_to_llama(params);

llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);

if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
2 changes: 1 addition & 1 deletion examples/batched/batched.cpp
@@ -41,7 +41,7 @@ int main(int argc, char ** argv) {

llama_model_params model_params = common_model_params_to_llama(params);

llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);

if (model == NULL) {
LOG_ERR("%s: error: unable to load model\n" , __func__);
2 changes: 1 addition & 1 deletion examples/export-lora/export-lora.cpp
@@ -421,7 +421,7 @@ int main(int argc, char ** argv) {

g_verbose = (params.verbosity > 1);
try {
lora_merge_ctx ctx(params.model, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
lora_merge_ctx ctx(params.model.path, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
ctx.run_merge();
} catch (const std::exception & err) {
fprintf(stderr, "%s\n", err.what());
2 changes: 1 addition & 1 deletion examples/gritlm/gritlm.cpp
@@ -168,7 +168,7 @@ int main(int argc, char * argv[]) {

llama_backend_init();

llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams);
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);

// create generation context
llama_context * ctx = llama_init_from_model(model, cparams);
20 changes: 20 additions & 0 deletions examples/llava/README-gemma3.md
@@ -4,6 +4,26 @@
>
> This is very experimental, only used for demo purpose.

## Quick start

You can use the pre-quantized models from [ggml-org](https://huggingface.co/ggml-org)'s Hugging Face account:

```bash
# build
cmake -B build
cmake --build build --target llama-gemma3-cli

# alternatively, install via brew (macOS)
brew install llama.cpp

# run it
llama-gemma3-cli -hf ggml-org/gemma-3-4b-it-GGUF
llama-gemma3-cli -hf ggml-org/gemma-3-12b-it-GGUF
llama-gemma3-cli -hf ggml-org/gemma-3-27b-it-GGUF

# note: 1B model does not support vision
```

## How to get mmproj.gguf?

```bash
6 changes: 3 additions & 3 deletions examples/llava/gemma3-cli.cpp
@@ -78,7 +78,7 @@ struct gemma3_context {
}

void init_clip_model(common_params & params) {
const char * clip_path = params.mmproj.c_str();
const char * clip_path = params.mmproj.path.c_str();
ctx_clip = clip_model_load(clip_path, params.verbosity > 1);
}

@@ -232,13 +232,13 @@ int main(int argc, char ** argv) {

common_init();

if (params.mmproj.empty()) {
if (params.mmproj.path.empty()) {
show_additional_info(argc, argv);
return 1;
}

gemma3_context ctx(params);
printf("%s: %s\n", __func__, params.model.c_str());
printf("%s: %s\n", __func__, params.model.path.c_str());

bool is_single_turn = !params.prompt.empty() && !params.image.empty();

6 changes: 3 additions & 3 deletions examples/llava/llava-cli.cpp
@@ -225,7 +225,7 @@ static struct llama_model * llava_init(common_params * params) {

llama_model_params model_params = common_model_params_to_llama(*params);

llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
@@ -234,7 +234,7 @@ static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
}

static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
const char * clip_path = params->mmproj.c_str();
const char * clip_path = params->mmproj.path.c_str();

auto prompt = params->prompt;
if (prompt.empty()) {
@@ -283,7 +283,7 @@ int main(int argc, char ** argv) {

common_init();

if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
print_usage(argc, argv);
return 1;
}
6 changes: 3 additions & 3 deletions examples/llava/minicpmv-cli.cpp
@@ -31,7 +31,7 @@ static struct llama_model * llava_init(common_params * params) {

llama_model_params model_params = common_model_params_to_llama(*params);

llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
@@ -80,7 +80,7 @@ static void llava_free(struct llava_context * ctx_llava) {
}

static struct clip_ctx * clip_init_context(common_params * params) {
const char * clip_path = params->mmproj.c_str();
const char * clip_path = params->mmproj.path.c_str();

auto prompt = params->prompt;
if (prompt.empty()) {
@@ -290,7 +290,7 @@ int main(int argc, char ** argv) {

common_init();

if (params.mmproj.empty() || (params.image.empty())) {
if (params.mmproj.path.empty() || (params.image.empty())) {
show_additional_info(argc, argv);
return 1;
}
6 changes: 3 additions & 3 deletions examples/llava/qwen2vl-cli.cpp
@@ -314,7 +314,7 @@ static struct llama_model * llava_init(common_params * params) {

llama_model_params model_params = common_model_params_to_llama(*params);

llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
@@ -323,7 +323,7 @@ static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
}

static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
const char * clip_path = params->mmproj.c_str();
const char * clip_path = params->mmproj.path.c_str();

auto prompt = params->prompt;
if (prompt.empty()) {
@@ -524,7 +524,7 @@ int main(int argc, char ** argv) {

common_init();

if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
print_usage(argc, argv);
return 1;
}
4 changes: 3 additions & 1 deletion examples/parallel/parallel.cpp
@@ -106,6 +106,8 @@ int main(int argc, char ** argv) {

common_params params;

params.n_predict = 128;

if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
return 1;
}
@@ -405,7 +407,7 @@ int main(int argc, char ** argv) {
params.prompt_file = "used built-in defaults";
}
LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.path.c_str());

LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
LOG_INF("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
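One behavioral change rides along with the rename: the parallel example now seeds `n_predict = 128` before argument parsing. The ordering matters because `common_params_parse` runs afterwards, so a user-supplied `-n`/`--n-predict` still wins. A compact restatement of the pattern, assuming llama.cpp's `common.h` and `arg.h` are on the include path (a sketch, not the PR's code verbatim):

```cpp
#include "arg.h"    // common_params_parse
#include "common.h" // common_params, LLAMA_EXAMPLE_PARALLEL

int main(int argc, char ** argv) {
    common_params params;

    // example-specific default, set *before* parsing so that any
    // -n / --n-predict flag on the command line overrides it
    params.n_predict = 128;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
        return 1;
    }

    // here params.n_predict is 128 unless the command line changed it
    return 0;
}
```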
2 changes: 1 addition & 1 deletion examples/passkey/passkey.cpp
@@ -64,7 +64,7 @@ int main(int argc, char ** argv) {

llama_model_params model_params = common_model_params_to_llama(params);

llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);

if (model == NULL) {
LOG_ERR("%s: unable to load model\n" , __func__);