
Commit 15159a8

Add vision support in llama-server (ikawrakow#901)
* server: add support for vision model
* webui: add support for vision model
* server : remove hack for extra parallel slot #10187
* llama : fix KV shift for qwen2vl #13870
* add no-context-shift parameter

Co-authored-by: firecoperana <firecoperana>
1 parent 5b38d43 commit 15159a8

26 files changed: +2457 −730 lines

common/CMakeLists.txt

Lines changed: 0 additions & 2 deletions
@@ -57,8 +57,6 @@ add_library(${TARGET} STATIC
     chat-parser.cpp
     chat-parser.h
     common.cpp
-    chat.h
-    chat.cpp
     sampling.h
     sampling.cpp
     console.h

common/common.cpp

Lines changed: 39 additions & 2 deletions
@@ -270,6 +270,14 @@ static std::string parse_device_list(const std::string& value) {
     return value;
 }
 
+
+std::pair<long, std::vector<char>> common_remote_get_content(const std::string& url, const common_remote_params&) {
+    if (!url.empty()) {
+        throw std::runtime_error("error: built without CURL, cannot download file from the internet");
+    }
+    return {};
+}
+
 //
 // CLI argument parsing
 //
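This stub is the no-CURL fallback: any request for a non-empty URL throws at runtime. For orientation, a libcurl-backed counterpart would look roughly like the sketch below. This is an illustrative sketch only, not code from this commit; the name remote_get_content_curl is made up here, handling of the headers and max_size fields of common_remote_params is omitted, and curl_global_init() is assumed to have been called at startup.

    #include <curl/curl.h>
    #include <stdexcept>
    #include <string>
    #include <utility>
    #include <vector>

    // libcurl write callback: append each received chunk to a byte buffer
    static size_t write_cb(char * ptr, size_t size, size_t nmemb, void * userdata) {
        auto * buf = static_cast<std::vector<char> *>(userdata);
        buf->insert(buf->end(), ptr, ptr + size * nmemb);
        return size * nmemb;
    }

    // returns <http_code, raw_response_body>, mirroring common_remote_get_content
    std::pair<long, std::vector<char>> remote_get_content_curl(const std::string & url, long timeout_s) {
        std::vector<char> body;
        CURL * curl = curl_easy_init();
        if (!curl) {
            throw std::runtime_error("curl_easy_init() failed");
        }
        curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
        curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
        curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout_s); // 0 means no timeout, as in common_remote_params
        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb);
        curl_easy_setopt(curl, CURLOPT_WRITEDATA, &body);
        const CURLcode res = curl_easy_perform(curl);
        long http_code = 0;
        curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code);
        curl_easy_cleanup(curl);
        if (res != CURLE_OK) {
            throw std::runtime_error(curl_easy_strerror(res));
        }
        return { http_code, std::move(body) };
    }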
@@ -1727,6 +1735,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.n_junk = std::stoi(argv[i]);
         return true;
     }
+    if (arg == "--no-context-shift") {
+        CHECK_ARG
+        params.ctx_shift = false;
+        return true;
+    }
     if (arg == "--pos") {
         CHECK_ARG
         params.i_pos = std::stoi(argv[i]);
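The new flag takes no value: parsing it simply clears params.ctx_shift, so a server started with, for example, llama-server -m model.gguf --no-context-shift (an illustrative invocation) runs with context shifting disabled.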
@@ -2060,7 +2073,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "multi-modality" });
     options.push_back({ "*", "       --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
     options.push_back({ "*", "       --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" });
-
+    options.push_back({ "*", "       --no-context-shift", "disable context-shift." });
     options.push_back({ "backend" });
     options.push_back({ "*", "       --rpc SERVERS", "comma separated list of RPC servers" });
 
@@ -3311,6 +3324,29 @@ std::vector<llama_token> llama_tokenize(
     return result;
 }
 
+std::vector<llama_token> llama_tokenize(
+    const struct llama_vocab* vocab,
+    const std::string& text,
+    bool add_special,
+    bool parse_special) {
+    // upper limit for the number of tokens
+    int n_tokens = text.length() + 2 * add_special;
+    std::vector<llama_token> result(n_tokens);
+    n_tokens = llama_vocab_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    if (n_tokens == std::numeric_limits<int32_t>::min()) {
+        throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
+    }
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        int check = llama_vocab_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+        GGML_ASSERT(check == -n_tokens);
+    }
+    else {
+        result.resize(n_tokens);
+    }
+    return result;
+}
+
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::string piece;
     piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
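The new overload follows llama.cpp's usual two-pass protocol: the first llama_vocab_tokenize call may return a negative count meaning the buffer was too small and -n_tokens entries are required, so the vector is resized and the call repeated (INT32_MIN is reserved as an overflow sentinel). A minimal usage sketch follows; it assumes an already loaded llama_model * model, and llama_model_get_vocab() is upstream naming assumed to exist in this fork:

    // Usage sketch for the new vocab-based overload (assumptions noted above).
    const struct llama_vocab * vocab = llama_model_get_vocab(model);
    std::vector<llama_token> tokens = llama_tokenize(vocab, "Describe this image.", /*add_special=*/true, /*parse_special=*/true);
    for (llama_token t : tokens) {
        printf("%d ", t); // raw token ids
    }
    printf("\n");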
@@ -3343,7 +3379,7 @@ std::string llama_token_to_piece(const struct llama_model* model, llama_token to
     return piece;
 }
 
-std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+std::string llama_detokenize(const llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
     int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
@@ -3359,6 +3395,7 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
     return text;
 }
 
+
 bool llama_should_add_bos_token(const llama_model * model) {
     const int add_bos = llama_add_bos_token(model);
common/common.h

Lines changed: 24 additions & 58 deletions
@@ -53,6 +53,8 @@ struct llama_lora_adapter_container : llama_lora_adapter_info {
     struct llama_lora_adapter * adapter;
 };
 
+using llama_tokens = std::vector<llama_token>;
+
 // build info
 extern int LLAMA_BUILD_NUMBER;
 extern char const * LLAMA_COMMIT;
@@ -237,7 +239,7 @@ struct gpt_params {
     bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
     bool prompt_cache_all = false; // save user input and generations to prompt cache
     bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
-
+    bool ctx_shift = true;
     bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
     bool multiline_input = false; // reverse the usage of `\`
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
@@ -371,6 +373,9 @@ struct gpt_params {
     bool sweep_bench_output_jsonl = false;
 };
 
+
+
+
 void gpt_params_handle_hf_token(gpt_params & params);
 void gpt_params_parse_from_env(gpt_params & params);
 void gpt_params_handle_model_default(gpt_params & params);
@@ -381,6 +386,15 @@ void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
 
 std::string gpt_params_get_system_info(const gpt_params & params);
 
+
+struct common_remote_params {
+    std::vector<std::string> headers;
+    long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
+    long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB
+};
+// get remote file content, returns <http_code, raw_response_body>
+std::pair<long, std::vector<char>> common_remote_get_content(const std::string& url, const common_remote_params& params);
+
 //
 // String utils
 //
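Given these declarations, fetching a remote resource, for example an image referenced by URL in a chat request, looks roughly like the sketch below; the URL and the limits are placeholders, not values from this commit:

    // Sketch: download a remote file with the new helper (placeholder URL and limits).
    common_remote_params rp;
    rp.timeout  = 30;               // give up after 30 seconds
    rp.max_size = 10 * 1024 * 1024; // refuse responses larger than 10 MB
    std::pair<long, std::vector<char>> res = common_remote_get_content("https://example.com/cat.png", rp);
    if (res.first == 200) {
        const std::vector<char> & body = res.second; // raw bytes, e.g. for the image decoder
    }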
@@ -497,6 +511,12 @@ std::vector<llama_token> llama_tokenize(
     bool add_special,
     bool parse_special = false);
 
+std::vector<llama_token> llama_tokenize(
+    const struct llama_vocab* vocab,
+    const std::string& text,
+    bool add_special,
+    bool parse_special = false);
+
 // tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
 std::string llama_token_to_piece(
@@ -513,70 +533,16 @@ std::string llama_token_to_piece(
 // should work similar to Python's `tokenizer.decode`
 // optionally renders special/control tokens
 std::string llama_detokenize(
-    llama_context * ctx,
+    const llama_context * ctx,
     const std::vector<llama_token> & tokens,
     bool special = true);
 
+
 // Uses the value from the model metadata if possible, otherwise
 // defaults to true when model type is SPM, otherwise false.
 bool llama_should_add_bos_token(const llama_model * model);
 
-//
-// Chat template utils
-//
-//struct common_tool_call {
-//    std::string name;
-//    std::string arguments;
-//    std::string id;
-//};
-//
-//// same with llama_chat_message, but uses std::string
-//struct common_chat_msg {
-//    std::string role;
-//    std::string content;
-//    std::vector<common_tool_call> tool_calls;
-//    std::string reasoning_content = "";
-//};
-
-//// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-//bool llama_chat_verify_template(const struct llama_model* , const std::string& tmpl, bool use_jinja);
-//
-//namespace minja {
-//    class chat_template;
-//}
-//
-//typedef minja::chat_template common_chat_template;
-//
-//struct common_chat_templates {
-//    bool has_explicit_template; // Model had builtin template or template overridde was specified.
-//    std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
-//    std::unique_ptr<common_chat_template> template_tool_use;
-//};
-//
-//
-//// CPP wrapper for llama_chat_apply_template
-//// If the built-in template is not supported, we default to chatml
-//// If the custom "tmpl" is not supported, we throw an error
-//std::string llama_chat_apply_template(
-//    const struct llama_model* model,
-//    const common_chat_template& tmpl,
-//    const std::vector< common_chat_msg>& chat,
-//    bool add_ass,
-//    bool use_jinja);
-//
-//// Format single message, while taking into account the position of that message in chat history
-//std::string llama_chat_format_single(const struct llama_model* model,
-//    const common_chat_template& tmpl,
-//    const std::vector< common_chat_msg>& past_msg,
-//    const common_chat_msg& new_msg,
-//    bool add_ass,
-//    bool use_jinja);
-//
-//// Returns an example of formatted chat
-//std::string llama_chat_format_example(const struct llama_model* model,
-//    const common_chat_template& tmpl, bool use_jinja);
-//
-//common_chat_templates llama_chat_templates_from_model(const struct llama_model* model, const std::string& chat_template_override);
+
 
 
 //

examples/mtmd/clip.cpp

Lines changed: 1 addition & 1 deletion
@@ -3331,7 +3331,7 @@ struct image_manipulation {
     dst.buf.resize(3 * target_width * target_height);
 
     float Cc;
-    float C[5];
+    float C[5] = {};
     float d0, d2, d3, a0, a1, a2, a3;
     int i, j, k, jj;
     int x, y;
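The added = {} value-initializes all five interpolation coefficients to zero; presumably this fixes a read of indeterminate values (or the corresponding compiler warning) on paths that consult C[k] before every element has been written.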

examples/server/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
@@ -70,6 +70,9 @@ endif()
 target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
 target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
 
+target_include_directories(${TARGET} PRIVATE ../mtmd)
+target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
+
 if (LLAMA_SERVER_SSL)
     find_package(OpenSSL REQUIRED)
     target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
(572 KB binary file not shown)
