2 changes: 1 addition & 1 deletion llama.cpp/ggml-cuda.cu
@@ -14409,7 +14409,7 @@ static ggml_cuda_device_info ggml_cuda_init() {

        cudaDeviceProp prop;
        CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
        GGML_CUDA_LOG_INFO(" Device %%d: %%s, compute capability %%d.%%d, VMM: %%s\n");
        GGML_CUDA_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, (device_vmm ? "yes" : "no"));

        info.default_tensor_split[id] = total_vram;
        total_vram += prop.totalGlobalMem;
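The hunk above replaces a log call whose format specifiers were escaped with "%%" and had no matching arguments, so it printed the placeholders verbatim instead of the device info. A minimal printf-based sketch of the before/after behaviour; the device name and capability numbers are made up for illustration:

// Hedged sketch only; values are hypothetical, not from the patch.
#include <cstdio>

int main() {
    int id = 0, major = 8, minor = 6;
    const char *name = "NVIDIA GeForce RTX 3090";
    int device_vmm = 1;

    // Before: "%%" escapes every specifier, so the literal text " Device %d: %s, ..." is printed.
    printf(" Device %%d: %%s, compute capability %%d.%%d, VMM: %%s\n");
    // After: one argument per specifier, as in the patched GGML_CUDA_LOG_INFO call.
    printf(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
           id, name, major, minor, device_vmm ? "yes" : "no");
    return 0;
}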
8 changes: 5 additions & 3 deletions llama.cpp/server/oai.h
@@ -23,7 +23,7 @@ inline static json oaicompat_completion_params_parse(
    const std::string &chat_template)
{
    json llama_params;

    std::string *images_ptr = nullptr;
    llama_params["__oaicompat"] = true;

    // Map OpenAI parameters to llama.cpp parameters
@@ -35,7 +35,10 @@ inline static json oaicompat_completion_params_parse(
    // https://platform.openai.com/docs/api-reference/chat/create
    llama_sampling_params default_sparams;
    llama_params["model"] = json_value(body, "model", std::string("unknown"));
    llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));
    llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"), images_ptr);
    if (images_ptr != nullptr) {
        llama_params["image_data"] = *images_ptr;
    }
    llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
    llama_params["temperature"] = json_value(body, "temperature", 0.0);
    llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k);
@@ -225,4 +228,3 @@ inline static json format_embeddings_response_oaicompat(const json &request, con
    };
    return res;
}
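To illustrate the new caller-side contract in oaicompat_completion_params_parse(): when a message's "content" is an array mixing "text" and "image_url" parts, format_chat() reports the images it found as a serialized JSON array of {"data", "id"} objects, and the parser stores that serialized form under llama_params["image_data"]. A self-contained sketch of the shapes involved, using nlohmann::json as the server code does; the model name and base64 payload are illustrative placeholders, not values from the patch:

#include <iostream>
#include <string>
#include "llama.cpp/json.h"   // nlohmann::json, the same header the server uses

using json = nlohmann::json;

int main() {
    // Hypothetical OpenAI-style request body with one text part and one image part.
    json text_part;
    text_part["type"] = "text";
    text_part["text"] = "What is in this picture?";

    json image_part;
    image_part["type"] = "image_url";
    image_part["image_url"]["url"] = "data:image/png;base64,iVBORw0KGgo=";   // truncated sample payload

    json message;
    message["role"] = "user";
    message["content"] = json::array({text_part, image_part});

    json body;
    body["model"] = "llava-v1.5-7b";   // placeholder model name
    body["messages"] = json::array({message});

    // Shape that format_chat() serializes and the code above assigns to
    // llama_params["image_data"]: the base64 text after the comma, plus a 1-based id.
    json entry;
    entry["data"] = "iVBORw0KGgo=";
    entry["id"] = 1;
    json images_data = json::array();
    images_data.push_back(entry);
    std::cout << images_data.dump(2) << std::endl;
    return 0;
}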

62 changes: 52 additions & 10 deletions llama.cpp/server/utils.h
@@ -3,13 +3,16 @@

#pragma once

#include <cstddef>
#include <cstring>
#include <string>
#include <vector>
#include <set>
#include <mutex>
#include <condition_variable>
#include <unordered_map>
#include <iostream> // [jart]
#include <sstream>

#include "llama.cpp/json.h"
#include "llama.cpp/llava/clip.h"
@@ -206,20 +209,56 @@ inline bool verify_custom_template(const std::string & tmpl) {
}

// Format given chat. If tmpl is empty, we take the template from model metadata
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages)
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages, std::string *& images)
{
    size_t alloc_size = 0;
    // vector holding all allocated string to be passed to llama_chat_apply_template
    std::vector<std::string> str(messages.size() * 2);
    std::vector<llama_chat_message> chat(messages.size());

    std::vector<std::string> str;
    std::vector<llama_chat_message> chat;
    int image_id = 1;
    json images_data = json::array();
    for (size_t i = 0; i < messages.size(); ++i) {
        auto &curr_msg = messages[i];
        str[i*2 + 0] = json_value(curr_msg, "role", std::string(""));
        str[i*2 + 1] = json_value(curr_msg, "content", std::string(""));
        alloc_size += str[i*2 + 1].length();
        chat[i].role = str[i*2 + 0].c_str();
        chat[i].content = str[i*2 + 1].c_str();
        auto role = json_value(curr_msg, "role", std::string(""));
        auto tmp = json_value(curr_msg, "content", json::array());
        if (tmp.is_array()) {
            for (auto &item : tmp) {
                auto type = json_value(item, "type", std::string(""));
                if (type == "text") {
                    str.push_back(role);
                    auto content = json_value(item, "text", std::string(""));
                    alloc_size += content.length();
                    str.push_back(content);
                    llama_chat_message msg;
                    msg.role = strdup(role.c_str());
                    msg.content = strdup(content.c_str());
                    chat.push_back(msg);
                } else if (type == "image_url") {
                    auto image_url = json_value(item, "image_url", json::object());
                    if (image_url.is_object()) {
                        auto url = json_value(image_url, "url", std::string(""));
                        std::vector<std::string> parts;
                        std::istringstream f(url);
                        std::string s;
                        while (getline(f, s, ',')) {
                            parts.push_back(s);
                        }
                        if (parts.size() > 1) {
                            images_data.emplace_back(json::object({{"data", parts[1]}, {"id", image_id++}}));
                        }
                    }
                }
            }
        } else {
            str.push_back(role);
            auto content = json_value(curr_msg, "content", std::string(""));
            alloc_size += content.length();
            str.push_back(content);
            llama_chat_message msg;
            msg.role = strdup(role.c_str());
            msg.content = strdup(content.c_str());
            chat.push_back(msg);
        }
    }

    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
@@ -236,7 +275,10 @@ inline std::string format_chat(const struct llama_model * model, const std::stri

    std::string formatted_chat(buf.data(), res);
    LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});

    if (image_id > 1) {
        images = new std::string(images_data.dump());
        LOG_VERBOSE("images_chat", {{"text", images->c_str()}});
    }
    return formatted_chat;
}
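A standalone sketch of the data-URL handling that format_chat() performs above; the helper name and test strings are illustrative only. The URL is split on ',' and the part after the comma is treated as the base64 payload, while URLs without a comma (for example plain http links) yield no image entry:

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Hypothetical helper mirroring the splitting logic in the hunk above.
static std::string extract_base64_payload(const std::string & url) {
    std::vector<std::string> parts;
    std::istringstream f(url);
    std::string s;
    while (std::getline(f, s, ',')) {
        parts.push_back(s);
    }
    return parts.size() > 1 ? parts[1] : std::string();   // empty when there is no payload
}

int main() {
    std::cout << extract_base64_payload("data:image/jpeg;base64,/9j/4AAQSkZJRg==") << "\n"; // prints the payload
    std::cout << extract_base64_payload("https://example.com/cat.png") << "\n";             // prints an empty line
    return 0;
}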
