2 changes: 1 addition & 1 deletion llama.cpp/ggml-cuda.cu
@@ -14409,7 +14409,7 @@ static ggml_cuda_device_info ggml_cuda_init() {

        cudaDeviceProp prop;
        CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
        GGML_CUDA_LOG_INFO(" Device %%d: %%s, compute capability %%d.%%d, VMM: %%s\n");
        GGML_CUDA_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, (device_vmm ? "yes" : "no"));

        info.default_tensor_split[id] = total_vram;
        total_vram += prop.totalGlobalMem;
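The hunk above replaces a log call whose format specifiers were escaped with "%%" and had no matching arguments, so it printed the placeholders verbatim instead of the device info. A minimal printf-based sketch of the before/after behaviour; the device name and capability numbers are made up for illustration:

// Hedged sketch only; values are hypothetical, not from the patch.
#include <cstdio>

int main() {
    int id = 0, major = 8, minor = 6;
    const char *name = "NVIDIA GeForce RTX 3090";
    int device_vmm = 1;

    // Before: "%%" escapes every specifier, so the literal text " Device %d: %s, ..." is printed.
    printf(" Device %%d: %%s, compute capability %%d.%%d, VMM: %%s\n");
    // After: one argument per specifier, as in the patched GGML_CUDA_LOG_INFO call.
    printf(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
           id, name, major, minor, device_vmm ? "yes" : "no");
    return 0;
}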
8 changes: 5 additions & 3 deletions llama.cpp/server/oai.h
@@ -23,7 +23,7 @@ inline static json oaicompat_completion_params_parse(
    const std::string &chat_template)
{
    json llama_params;

    std::string *images_ptr = nullptr;
    llama_params["__oaicompat"] = true;

    // Map OpenAI parameters to llama.cpp parameters
@@ -35,7 +35,10 @@ inline static json oaicompat_completion_params_parse(
    // https://platform.openai.com/docs/api-reference/chat/create
    llama_sampling_params default_sparams;
    llama_params["model"] = json_value(body, "model", std::string("unknown"));
    llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));
    llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"), images_ptr);
    if (images_ptr != nullptr) {
        llama_params["image_data"] = *images_ptr;
    }
    llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
    llama_params["temperature"] = json_value(body, "temperature", 0.0);
    llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k);
@@ -225,4 +228,3 @@ inline static json format_embeddings_response_oaicompat(const json &request, con
    };
    return res;
}
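To illustrate the new caller-side contract in oaicompat_completion_params_parse(): when a message's "content" is an array mixing "text" and "image_url" parts, format_chat() reports the images it found as a serialized JSON array of {"data", "id"} objects, and the parser stores that serialized form under llama_params["image_data"]. A self-contained sketch of the shapes involved, using nlohmann::json as the server code does; the model name and base64 payload are illustrative placeholders, not values from the patch:

#include <iostream>
#include <string>
#include "llama.cpp/json.h"   // nlohmann::json, the same header the server uses

using json = nlohmann::json;

int main() {
    // Hypothetical OpenAI-style request body with one text part and one image part.
    json text_part;
    text_part["type"] = "text";
    text_part["text"] = "What is in this picture?";

    json image_part;
    image_part["type"] = "image_url";
    image_part["image_url"]["url"] = "data:image/png;base64,iVBORw0KGgo=";   // truncated sample payload

    json message;
    message["role"] = "user";
    message["content"] = json::array({text_part, image_part});

    json body;
    body["model"] = "llava-v1.5-7b";   // placeholder model name
    body["messages"] = json::array({message});

    // Shape that format_chat() serializes and the code above assigns to
    // llama_params["image_data"]: the base64 text after the comma, plus a 1-based id.
    json entry;
    entry["data"] = "iVBORw0KGgo=";
    entry["id"] = 1;
    json images_data = json::array();
    images_data.push_back(entry);
    std::cout << images_data.dump(2) << std::endl;
    return 0;
}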

62 changes: 52 additions & 10 deletions llama.cpp/server/utils.h
@@ -3,13 +3,16 @@

#pragma once

#include <cstddef>
#include <cstring>
#include <string>
#include <vector>
#include <set>
#include <mutex>
#include <condition_variable>
#include <unordered_map>
#include <iostream> // [jart]
#include <sstream>

#include "llama.cpp/json.h"
#include "llama.cpp/llava/clip.h"
@@ -206,20 +209,56 @@ inline bool verify_custom_template(const std::string & tmpl) {
}

// Format given chat. If tmpl is empty, we take the template from model metadata
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages)
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages, std::string *& images)
{
    size_t alloc_size = 0;
    // vector holding all allocated string to be passed to llama_chat_apply_template
    std::vector<std::string> str(messages.size() * 2);
    std::vector<llama_chat_message> chat(messages.size());

    std::vector<std::string> str;
    std::vector<llama_chat_message> chat;
    int image_id = 1;
    json images_data = json::array();
    for (size_t i = 0; i < messages.size(); ++i) {
        auto &curr_msg = messages[i];
        str[i*2 + 0] = json_value(curr_msg, "role", std::string(""));
        str[i*2 + 1] = json_value(curr_msg, "content", std::string(""));
        alloc_size += str[i*2 + 1].length();
        chat[i].role = str[i*2 + 0].c_str();
        chat[i].content = str[i*2 + 1].c_str();
        auto role = json_value(curr_msg, "role", std::string(""));
        auto tmp = json_value(curr_msg, "content", json::array());
        if (tmp.is_array()) {
            for (auto &item : tmp) {
                auto type = json_value(item, "type", std::string(""));
                if (type == "text") {
                    str.push_back(role);
                    auto content = json_value(item, "text", std::string(""));
                    alloc_size += content.length();
                    str.push_back(content);
                    llama_chat_message msg;
                    msg.role = strdup(role.c_str());
                    msg.content = strdup(content.c_str());
                    chat.push_back(msg);
                } else if (type == "image_url") {
                    auto image_url = json_value(item, "image_url", json::object());
                    if (image_url.is_object()) {
                        auto url = json_value(image_url, "url", std::string(""));
                        std::vector<std::string> parts;
                        std::istringstream f(url);
                        std::string s;
                        while (getline(f, s, ',')) {
                            parts.push_back(s);
                        }
                        if (parts.size() > 1) {
                            images_data.emplace_back(json::object({{"data", parts[1]}, {"id", image_id++}}));
                        }
                    }
                }
            }
        } else {
            str.push_back(role);
            auto content = json_value(curr_msg, "content", std::string(""));
            alloc_size += content.length();
            str.push_back(content);
            llama_chat_message msg;
            msg.role = strdup(role.c_str());
            msg.content = strdup(content.c_str());
            chat.push_back(msg);
        }
    }

    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
@@ -236,7 +275,10 @@ inline std::string format_chat(const struct llama_model * model, const std::stri

    std::string formatted_chat(buf.data(), res);
    LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});

    if (image_id > 1) {
        images = new std::string(images_data.dump());
        LOG_VERBOSE("images_chat", {{"text", images->c_str()}});
    }
    return formatted_chat;
}
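A standalone sketch of the data-URL handling that format_chat() performs above; the helper name and test strings are illustrative only. The URL is split on ',' and the part after the comma is treated as the base64 payload, while URLs without a comma (for example plain http links) yield no image entry:

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Hypothetical helper mirroring the splitting logic in the hunk above.
static std::string extract_base64_payload(const std::string & url) {
    std::vector<std::string> parts;
    std::istringstream f(url);
    std::string s;
    while (std::getline(f, s, ',')) {
        parts.push_back(s);
    }
    return parts.size() > 1 ? parts[1] : std::string();   // empty when there is no payload
}

int main() {
    std::cout << extract_base64_payload("data:image/jpeg;base64,/9j/4AAQSkZJRg==") << "\n"; // prints the payload
    std::cout << extract_base64_payload("https://example.com/cat.png") << "\n";             // prints an empty line
    return 0;
}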
