 #include <sstream>
 #include <random>
 
+// increase max payload length to allow use of larger context size
+#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
+// increase backlog size to avoid connection resets for >> 1 slots
+#define CPPHTTPLIB_LISTEN_BACKLOG 512
+// increase max URI length to handle longer prompts in query string
+#define CPPHTTPLIB_REQUEST_URI_MAX_LENGTH 32768
+// disable Nagle's algorithm
+#define CPPHTTPLIB_TCP_NODELAY true
+#include "httplib.h"
+
 #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
 
 using json = nlohmann::ordered_json;
@@ -411,6 +421,17 @@ static json probs_vector_to_json(const llama_context * ctx, const std::vector<co
     return out;
 }
 
+static bool server_sent_event(httplib::DataSink& sink, const json& data) {
+    const std::string str =
+        "data: " +
+        data.dump(-1, ' ', false, json::error_handler_t::replace) +
+        "\n\n"; // required by RFC 8895 - A message is terminated by a blank line (two line terminators in a row).
+
+    LOG_VERBOSE("data stream, to_send: %s", str.c_str());
+
+    return sink.write(str.c_str(), str.size());
+}
+
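// A minimal usage sketch of the helper above, assuming cpp-httplib's chunked
// content provider API; handle_stream_example and the JSON payload are
// illustrative placeholders, not code from this patch.
static void handle_stream_example(httplib::Response & res) {
    res.set_chunked_content_provider("text/event-stream",
        [](size_t /*offset*/, httplib::DataSink & sink) {
            const json chunk = {{"content", "hello"}, {"stop", false}};
            if (!server_sent_event(sink, chunk)) {
                return false; // client disconnected, abort the stream
            }
            sink.done(); // a real handler keeps writing until generation stops
            return true;
        });
}
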
 //
 // OAI utils
 //
@@ -1065,7 +1086,6 @@ struct server_tokens {
         if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE || type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
             GGML_ASSERT(has_mtmd);
             const int n_pos = mtmd_input_chunk_get_n_pos(chunk);
-            fprintf(stdout, "n_pos: %d\n", n_pos);
             llama_pos start_pos = tokens.size();
             for (int i = 0; i < n_pos; ++i) {
                 tokens.emplace_back(LLAMA_TOKEN_NULL);
@@ -1209,39 +1229,54 @@ struct server_tokens {
     }
 
     size_t get_common_prefix(const server_tokens& b) const {
-        size_t max_idx = std::min(tokens.size(), b.tokens.size());
+        const size_t max_idx = std::min(tokens.size(), b.tokens.size());
+
+        if (!has_mtmd) {
+            for (size_t i = 0; i < max_idx; ++i) {
+                if (tokens[i] == b.tokens[i]) {
+                    continue;
+                }
+                return i;
+            }
+            return max_idx;
+        }
+
         for (size_t i = 0; i < max_idx; ++i) {
-            auto & ai = tokens[i];
-            auto & bi = b.tokens[i];
+            const llama_token ai = tokens[i];
+            const llama_token bi = b.tokens[i];
 
             if (ai == LLAMA_TOKEN_NULL && bi == LLAMA_TOKEN_NULL) {
-                GGML_ASSERT(has_mtmd);
                 const auto & a_chunk = find_chunk(i);
                 const auto & b_chunk = b.find_chunk(i);
+
                 GGML_ASSERT(a_chunk && b_chunk);
-                std::string ai_id = mtmd_input_chunk_get_id(a_chunk.get());
-                std::string bi_id = mtmd_input_chunk_get_id(b_chunk.get());
-                size_t a_pos = mtmd_input_chunk_get_n_pos(a_chunk.get());
-                size_t b_pos = mtmd_input_chunk_get_n_pos(b_chunk.get());
-                if (ai_id == bi_id && a_pos == b_pos) {
-                    GGML_ASSERT(a_pos > 0 && "Invalid media chunk"); // should never happen
-                    i += a_pos - 1; // will be +1 by the for loop
+
+                const std::string id_ai = mtmd_input_chunk_get_id(a_chunk.get());
+                const std::string id_bi = mtmd_input_chunk_get_id(b_chunk.get());
+
+                const size_t pos_a = mtmd_input_chunk_get_n_pos(a_chunk.get());
+                const size_t pos_b = mtmd_input_chunk_get_n_pos(b_chunk.get());
+
+                if (id_ai == id_bi && pos_a == pos_b) {
+                    GGML_ASSERT(pos_a > 0 && "Invalid media chunk"); // should never happen
+                    i += pos_a - 1; // will be +1 by the for loop
                     continue;
                 }
-                else {
-                    return i;
-                }
+
+                return i;
             }
-            else if (ai == bi) {
+
+            if (ai == bi) {
                 continue;
             }
-            else {
-                return i;
-            }
+
+            return i;
         }
+
         return max_idx; // all tokens are equal
     }
 
+
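// Illustration of the matching above (a sketch, not part of the patch): with
// has_mtmd, a media chunk occupying n_pos positions is stored as n_pos
// LLAMA_TOKEN_NULL placeholders, e.g. a 3-position image starting at index 5:
//
//     tokens: [ t0 t1 t2 t3 t4 NULL NULL NULL t8 ... ]
//
// The placeholders carry no information of their own, so two streams can only
// share that part of the prefix when the chunk ids and lengths match, which is
// why the loop skips the whole chunk with i += pos_a - 1 instead of comparing
// placeholders one by one. A caller reusing the KV cache might use the result
// roughly like this (sketch; slot and the cache-removal call are assumptions):
//
//     const size_t n_keep = slot.cache_tokens.get_common_prefix(prompt_tokens);
//     llama_kv_cache_seq_rm(ctx, slot.seq_id, n_keep, -1); // drop divergent tail
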
     // make sure all text tokens are within the vocab range
     bool validate(const struct llama_context * ctx) const {
         const llama_model* model = llama_get_model(ctx);
@@ -1274,10 +1309,12 @@ struct server_tokens {
             llama_pos n_past,
             int32_t seq_id,
             llama_pos& n_pos_out) {
+        char buffer[512];
         auto & chunk = find_chunk(n_past);
         const char * name = mtmd_input_chunk_get_type(chunk.get()) == MTMD_INPUT_CHUNK_TYPE_IMAGE
             ? "image" : "audio";
-        LOG_INFO("processing %s...\n", name);
+        snprintf(buffer, 512, "processing: %s", name);
+        LOG_INFO(buffer, {});
         int32_t n_batch = llama_n_batch(ctx);
         int64_t t0 = ggml_time_ms();
         llama_pos new_n_past = n_past;
@@ -1288,9 +1325,11 @@ struct server_tokens {
             n_batch,
             true, // logits last
             &new_n_past);
-        LOG_INFO("processed in %" PRId64 " ms\n", ggml_time_ms() - t0);
+        snprintf(buffer, 512, "processed in %" PRId64 " ms", ggml_time_ms() - t0);
+        LOG_INFO(buffer, {});
         if (result != 0) {
-            LOG_ERROR("mtmd_helper_eval failed with status %d", result);
+            snprintf(buffer, 512, "mtmd_helper_eval failed with status %d", result);
+            LOG_ERROR(buffer, {});
             n_pos_out = n_past;
             return result;
         }
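
// A minimal sketch of a formatting wrapper that would avoid repeating the
// snprintf-into-a-stack-buffer pattern above; log_info_fmt is a placeholder
// name, assuming the LOG_INFO(msg, json) signature used in this file and that
// <cstdarg> is available.
static void log_info_fmt(const char * fmt, ...) {
    char buffer[512];
    va_list args;
    va_start(args, fmt);
    vsnprintf(buffer, sizeof(buffer), fmt, args);
    va_end(args);
    LOG_INFO(buffer, {});
}
// e.g. log_info_fmt("processed in %" PRId64 " ms", ggml_time_ms() - t0);
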
@@ -1422,7 +1461,7 @@ static std::vector<server_tokens> tokenize_input_prompts(const llama_vocab* voca
     return result;
 }
 // Assuming raw_buffer has .data() and .size() members
-inline void printFilesInfo(const std::vector<raw_buffer>& files) {
+inline void print_files_info(const std::vector<raw_buffer>& files) {
     for (size_t i = 0; i < files.size(); ++i) {
         const auto & file = files[i];
         std::cout << "File " << i << ": Size = " << file.size() << " bytes\n";