From 838337b814b6c79f0182c04f38eac38853bd0b02 Mon Sep 17 00:00:00 2001
From: sefgit <3461761+sefgit@users.noreply.github.com>
Date: Mon, 2 Jun 2025 18:07:46 +0700
Subject: [PATCH 1/2] add api_like_OAI feature to support file and image context

---
 llama.cpp/server/oai.h   |  8 ++++--
 llama.cpp/server/utils.h | 62 +++++++++++++++++++++++++++++++++-------
 2 files changed, 57 insertions(+), 13 deletions(-)

diff --git a/llama.cpp/server/oai.h b/llama.cpp/server/oai.h
index b82ef25d55..b84da87472 100644
--- a/llama.cpp/server/oai.h
+++ b/llama.cpp/server/oai.h
@@ -23,7 +23,7 @@ inline static json oaicompat_completion_params_parse(
     const std::string &chat_template)
 {
     json llama_params;
-
+    std::string images;
     llama_params["__oaicompat"] = true;
 
     // Map OpenAI parameters to llama.cpp parameters
@@ -35,7 +35,10 @@ inline static json oaicompat_completion_params_parse(
     // https://platform.openai.com/docs/api-reference/chat/create
     llama_sampling_params default_sparams;
    llama_params["model"] = json_value(body, "model", std::string("unknown"));
-    llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));
+    llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"), &images);
+    if (!images.empty()) {
+        llama_params["image_data"] = json::parse(images);
+    }
     llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
     llama_params["temperature"] = json_value(body, "temperature", 0.0);
     llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k);
@@ -225,4 +228,3 @@ inline static json format_embeddings_response_oaicompat(const json &request, con
     };
     return res;
 }
-
diff --git a/llama.cpp/server/utils.h b/llama.cpp/server/utils.h
index 5034a512e1..9e289929f2 100644
--- a/llama.cpp/server/utils.h
+++ b/llama.cpp/server/utils.h
@@ -3,6 +3,8 @@
 
 #pragma once
 
+#include
+#include
 #include
 #include
 #include
@@ -10,6 +12,7 @@
 #include
 #include
 #include // [jart]
+#include
 
 #include "llama.cpp/json.h"
 #include "llama.cpp/llava/clip.h"
@@ -206,20 +209,56 @@ inline bool verify_custom_template(const std::string & tmpl) {
 }
 
 // Format given chat. If tmpl is empty, we take the template from model metadata
-inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages)
+inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages, std::string * images)
 {
     size_t alloc_size = 0;
     // vector holding all allocated string to be passed to llama_chat_apply_template
-    std::vector<std::string> str(messages.size() * 2);
-    std::vector<llama_chat_message> chat(messages.size());
-
+    std::vector<std::string> str;
+    std::vector<llama_chat_message> chat;
+    int image_id = 1;
+    json images_data = json::array();
     for (size_t i = 0; i < messages.size(); ++i) {
         auto &curr_msg = messages[i];
-        str[i*2 + 0] = json_value(curr_msg, "role", std::string(""));
-        str[i*2 + 1] = json_value(curr_msg, "content", std::string(""));
-        alloc_size += str[i*2 + 1].length();
-        chat[i].role = str[i*2 + 0].c_str();
-        chat[i].content = str[i*2 + 1].c_str();
+        auto role = json_value(curr_msg, "role", std::string(""));
+        auto tmp = json_value(curr_msg, "content", json::array());
+        if (tmp.is_array()) {
+            for (auto &item : tmp) {
+                auto type = json_value(item, "type", std::string(""));
+                if (type == "text") {
+                    str.push_back(role);
+                    auto content = json_value(item, "text", std::string(""));
+                    alloc_size += content.length();
+                    str.push_back(content);
+                    llama_chat_message msg;
+                    msg.role = strdup(role.c_str());
+                    msg.content = strdup(content.c_str());
+                    chat.push_back(msg);
+                } else if (type == "image_url") {
+                    auto image_url = json_value(item, "image_url", json::object());
+                    if (image_url.is_object()) {
+                        auto url = json_value(image_url, "url", std::string(""));
+                        std::vector<std::string> parts;
+                        std::istringstream f(url);
+                        std::string s;
+                        while (getline(f, s, ',')) {
+                            parts.push_back(s);
+                        }
+                        if (parts.size() > 1) {
+                            images_data.emplace_back(json::object({{"data", parts[1]}, {"id", image_id++}}));
+                        }
+                    }
+                }
+            }
+        } else {
+            str.push_back(role);
+            auto content = json_value(curr_msg, "content", std::string(""));
+            alloc_size += content.length();
+            str.push_back(content);
+            llama_chat_message msg;
+            msg.role = strdup(role.c_str());
+            msg.content = strdup(content.c_str());
+            chat.push_back(msg);
+        }
     }
 
     const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
@@ -236,7 +275,10 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
 
     std::string formatted_chat(buf.data(), res);
     LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
-
+    if (images != nullptr && image_id > 1) {
+        *images = images_data.dump();
+        LOG_VERBOSE("images_chat", {{"text", images->c_str()}});
+    }
     return formatted_chat;
 }
 

From 4829d242d32f97e854c352c32740721353af14e9 Mon Sep 17 00:00:00 2001
From: sefgit <3461761+sefgit@users.noreply.github.com>
Date: Mon, 2 Jun 2025 18:20:20 +0700
Subject: [PATCH 2/2] fix missing device info in CUDA log

---
 llama.cpp/ggml-cuda.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp/ggml-cuda.cu b/llama.cpp/ggml-cuda.cu
index 4a810580c0..d61b789210 100644
--- a/llama.cpp/ggml-cuda.cu
+++ b/llama.cpp/ggml-cuda.cu
@@ -14409,7 +14409,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
         cudaDeviceProp prop;
        CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        GGML_CUDA_LOG_INFO("  Device %%d: %%s, compute capability %%d.%%d, VMM: %%s\n");
+        GGML_CUDA_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, (device_vmm ? "yes" : "no"));
 
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
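
Note on patch 1/2: the request shape it targets is the OpenAI chat format where a message's "content" is an array of parts and images arrive as base64 "data:" URLs inside an "image_url" part. The standalone sketch below is not code from the patch; it assumes nlohmann/json is available as <nlohmann/json.hpp> (the same library vendored in-tree as llama.cpp/json.h) and only mirrors the splitting logic the patched format_chat() uses to turn such a part into the image_data array that oai.h forwards as llama_params["image_data"].

    // Sketch: extract base64 payloads from OpenAI-style image_url parts,
    // mirroring the loop added to format_chat() in patch 1/2.
    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>
    #include <nlohmann/json.hpp>

    using json = nlohmann::json;

    int main() {
        // An OpenAI-style message whose content is an array of parts.
        json message = {
            {"role", "user"},
            {"content", json::array({
                {{"type", "text"}, {"text", "What is in this image?"}},
                {{"type", "image_url"},
                 {"image_url", {{"url", "data:image/jpeg;base64,AAAABBBB"}}}}
            })}
        };

        int image_id = 1;
        json images_data = json::array();

        for (const auto & item : message["content"]) {
            if (item.value("type", "") != "image_url") {
                continue;   // text parts go into the prompt, not image_data
            }
            const auto & image_url = item["image_url"];
            std::string url = image_url.value("url", "");

            // Split the data URL on ',' and keep the base64 payload after it.
            std::vector<std::string> parts;
            std::istringstream f(url);
            std::string s;
            while (std::getline(f, s, ',')) {
                parts.push_back(s);
            }
            if (parts.size() > 1) {
                images_data.push_back({{"data", parts[1]}, {"id", image_id++}});
            }
        }

        // Prints [{"data":"AAAABBBB","id":1}] -- the array shape the patched
        // oai.h places under llama_params["image_data"].
        std::cout << images_data.dump(2) << std::endl;
        return 0;
    }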