
Commit d987999

Revert "Add vision support in llama-server (ikawrakow#901)"

This reverts commit 15159a8.

1 parent: 9c91846

26 files changed: +730 -2457 lines

common/CMakeLists.txt

Lines changed: 2 additions & 0 deletions

@@ -57,6 +57,8 @@ add_library(${TARGET} STATIC
     chat-parser.cpp
     chat-parser.h
     common.cpp
+    chat.h
+    chat.cpp
     sampling.h
     sampling.cpp
     console.h

common/common.cpp

Lines changed: 2 additions & 39 deletions

@@ -270,14 +270,6 @@ static std::string parse_device_list(const std::string& value) {
     return value;
 }
 
-
-std::pair<long, std::vector<char>> common_remote_get_content(const std::string& url, const common_remote_params&) {
-    if (!url.empty()) {
-        throw std::runtime_error("error: built without CURL, cannot download file from the internet");
-    }
-    return {};
-}
-
 //
 // CLI argument parsing
 //
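For orientation, the deleted function was the no-CURL fallback for remote fetching. A caller-side sketch under that assumption; the declarations are copied from the hunks in this commit, but fetch_or_empty and its error handling are illustrative, not from this tree:

    #include <stdexcept>
    #include <string>
    #include <utility>
    #include <vector>

    // Declarations as they stood before this revert (see the common.h hunk below):
    struct common_remote_params {
        std::vector<std::string> headers;
        long timeout  = 0; // CURLOPT_TIMEOUT, in seconds; 0 means no timeout
        long max_size = 0; // max response size; unlimited if 0; max is 2GB
    };
    std::pair<long, std::vector<char>> common_remote_get_content(const std::string& url, const common_remote_params& params);

    // Hypothetical caller: returns the body on HTTP 200, empty otherwise.
    std::string fetch_or_empty(const std::string & url) {
        try {
            auto res = common_remote_get_content(url, common_remote_params{});
            if (res.first == 200) {
                return std::string(res.second.begin(), res.second.end());
            }
        } catch (const std::runtime_error &) {
            // builds without CURL throw for any non-empty URL (see the stub above)
        }
        return {};
    }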
@@ -1740,11 +1732,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.n_junk = std::stoi(argv[i]);
         return true;
     }
-    if (arg == "--no-context-shift") {
-        CHECK_ARG
-        params.ctx_shift = false;
-        return true;
-    }
     if (arg == "--pos") {
         CHECK_ARG
         params.i_pos = std::stoi(argv[i]);
@@ -2078,7 +2065,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "multi-modality" });
     options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
     options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" });
-    options.push_back({ "*", " --no-context-shift", "disable context-shift." });
+
     options.push_back({ "backend" });
     options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });
     options.push_back({ "*", "-cuda, --cuda-params", "comma separate list of cuda parameters" });
@@ -3331,29 +3318,6 @@ std::vector<llama_token> llama_tokenize(
     return result;
 }
 
-std::vector<llama_token> llama_tokenize(
-    const struct llama_vocab* vocab,
-    const std::string& text,
-    bool add_special,
-    bool parse_special) {
-    // upper limit for the number of tokens
-    int n_tokens = text.length() + 2 * add_special;
-    std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_vocab_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
-    if (n_tokens == std::numeric_limits<int32_t>::min()) {
-        throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
-    }
-    if (n_tokens < 0) {
-        result.resize(-n_tokens);
-        int check = llama_vocab_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
-        GGML_ASSERT(check == -n_tokens);
-    }
-    else {
-        result.resize(n_tokens);
-    }
-    return result;
-}
-
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::string piece;
     piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
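The overload deleted above follows the two-pass sizing convention used throughout llama.cpp: a first call may return a negated token count meaning the buffer was too small, and the caller resizes to that size and retries. A self-contained sketch of the same pattern, with a toy fill_tokens standing in for llama_vocab_tokenize (assumption for the sketch: one token per byte):

    #include <cassert>
    #include <string>
    #include <vector>

    // Stand-in for llama_vocab_tokenize: returns the token count on success,
    // or the negated required count when the destination buffer is too small
    // (assumed behavior, mirroring the removed overload above).
    static int fill_tokens(const std::string & text, int * dst, int cap) {
        const int needed = (int) text.size(); // toy model: one token per byte
        if (needed > cap) return -needed;     // signal the required capacity
        for (int i = 0; i < needed; ++i) dst[i] = (unsigned char) text[i];
        return needed;
    }

    std::vector<int> tokenize_two_pass(const std::string & text) {
        std::vector<int> result(text.size() / 4 + 1); // optimistic first guess
        int n = fill_tokens(text, result.data(), (int) result.size());
        if (n < 0) {                                  // second pass, exact size
            result.resize(-n);
            const int check = fill_tokens(text, result.data(), (int) result.size());
            assert(check == -n);
            n = check;
        }
        result.resize(n);
        return result;
    }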
@@ -3386,7 +3350,7 @@ std::string llama_token_to_piece(const struct llama_model* model, llama_token to
     return piece;
 }
 
-std::string llama_detokenize(const llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
     int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);

@@ -3402,7 +3366,6 @@ std::string llama_detokenize(const llama_context * ctx, const std::vector<llama_
     return text;
 }
 
-
 bool llama_should_add_bos_token(const llama_model * model) {
     const int add_bos = llama_add_bos_token(model);
 
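After this revert, common.cpp keeps a single context-based tokenizer plus the now non-const detokenizer. A hypothetical round trip, assuming an initialized llama_context * ctx and the ctx-taking llama_tokenize overload whose tail is visible in the hunks above:

    #include <string>
    #include <vector>
    #include "common.h" // this tree's helpers (assumed include path)

    std::string round_trip(llama_context * ctx, const std::string & prompt) {
        // overload kept by this commit (the vocab-based one above is removed)
        std::vector<llama_token> toks = llama_tokenize(ctx, prompt, /*add_special=*/true);
        // ctx is no longer const-qualified after this commit
        return llama_detokenize(ctx, toks, /*special=*/false);
    }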
common/common.h

Lines changed: 58 additions & 24 deletions

@@ -53,8 +53,6 @@ struct llama_lora_adapter_container : llama_lora_adapter_info {
     struct llama_lora_adapter * adapter;
 };
 
-using llama_tokens = std::vector<llama_token>;
-
 // build info
 extern int LLAMA_BUILD_NUMBER;
 extern char const * LLAMA_COMMIT;
@@ -240,7 +238,7 @@ struct gpt_params {
     bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
     bool prompt_cache_all = false; // save user input and generations to prompt cache
     bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
-    bool ctx_shift = true;
+
     bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
     bool multiline_input = false; // reverse the usage of `\`
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
@@ -374,9 +372,6 @@ struct gpt_params {
     bool sweep_bench_output_jsonl = false;
 };
 
-
-
-void gpt_params_handle_hf_token(gpt_params & params);
 void gpt_params_parse_from_env(gpt_params & params);
 void gpt_params_handle_model_default(gpt_params & params);
 
@@ -387,15 +382,6 @@ void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
 
 std::string gpt_params_get_system_info(const gpt_params & params);
 
-
-struct common_remote_params {
-    std::vector<std::string> headers;
-    long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
-    long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB
-};
-// get remote file content, returns <http_code, raw_response_body>
-std::pair<long, std::vector<char>> common_remote_get_content(const std::string& url, const common_remote_params& params);
-
 //
 // String utils
 //
@@ -512,12 +498,6 @@ std::vector<llama_token> llama_tokenize(
     bool add_special,
     bool parse_special = false);
 
-std::vector<llama_token> llama_tokenize(
-    const struct llama_vocab* vocab,
-    const std::string& text,
-    bool add_special,
-    bool parse_special = false);
-
 // tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
 std::string llama_token_to_piece(
@@ -534,16 +514,70 @@ std::string llama_token_to_piece(
 // should work similar to Python's `tokenizer.decode`
 // optionally renders special/control tokens
 std::string llama_detokenize(
-    const llama_context * ctx,
+    llama_context * ctx,
     const std::vector<llama_token> & tokens,
     bool special = true);
 
-
 // Uses the value from the model metadata if possible, otherwise
 // defaults to true when model type is SPM, otherwise false.
 bool llama_should_add_bos_token(const llama_model * model);
 
-
+//
+// Chat template utils
+//
+//struct common_tool_call {
+//    std::string name;
+//    std::string arguments;
+//    std::string id;
+//};
+//
+//// same with llama_chat_message, but uses std::string
+//struct common_chat_msg {
+//    std::string role;
+//    std::string content;
+//    std::vector<common_tool_call> tool_calls;
+//    std::string reasoning_content = "";
+//};
+
+//// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
+//bool llama_chat_verify_template(const struct llama_model* , const std::string& tmpl, bool use_jinja);
+//
+//namespace minja {
+//    class chat_template;
+//}
+//
+//typedef minja::chat_template common_chat_template;
+//
+//struct common_chat_templates {
+//    bool has_explicit_template; // Model had builtin template or template overridde was specified.
+//    std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
+//    std::unique_ptr<common_chat_template> template_tool_use;
+//};
+//
+//
+//// CPP wrapper for llama_chat_apply_template
+//// If the built-in template is not supported, we default to chatml
+//// If the custom "tmpl" is not supported, we throw an error
+//std::string llama_chat_apply_template(
+//    const struct llama_model* model,
+//    const common_chat_template& tmpl,
+//    const std::vector< common_chat_msg>& chat,
+//    bool add_ass,
+//    bool use_jinja);
+//
+//// Format single message, while taking into account the position of that message in chat history
+//std::string llama_chat_format_single(const struct llama_model* model,
+//    const common_chat_template& tmpl,
+//    const std::vector< common_chat_msg>& past_msg,
+//    const common_chat_msg& new_msg,
+//    bool add_ass,
+//    bool use_jinja);
+//
+//// Returns an example of formatted chat
+//std::string llama_chat_format_example(const struct llama_model* model,
+//    const common_chat_template& tmpl, bool use_jinja);
+//
+//common_chat_templates llama_chat_templates_from_model(const struct llama_model* model, const std::string& chat_template_override);
 
 
 //
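Note that the chat-template utilities come back only as commented-out declarations, i.e. dead code. If they were ever re-enabled, usage would look roughly like the sketch below; every symbol in it is inactive as of this commit, so this is purely illustrative:

    #include <string>
    #include <vector>

    // Hypothetical: mirrors the commented-out declarations above.
    std::string build_prompt(const llama_model * model) {
        common_chat_templates tmpls = llama_chat_templates_from_model(model, /*chat_template_override=*/"");
        std::vector<common_chat_msg> chat = {
            { "system", "You are a helpful assistant.", {}, "" },
            { "user",   "Hello!",                       {}, "" },
        };
        // template_default is documented above as always set (defaults to chatml)
        return llama_chat_apply_template(model, *tmpls.template_default,
                                         chat, /*add_ass=*/true, /*use_jinja=*/false);
    }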

examples/mtmd/clip.cpp

Lines changed: 1 addition & 1 deletion

@@ -3331,7 +3331,7 @@ struct image_manipulation {
     dst.buf.resize(3 * target_width * target_height);
 
     float Cc;
-    float C[5] = {};
+    float C[5];
     float d0, d2, d3, a0, a1, a2, a3;
     int i, j, k, jj;
     int x, y;
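The one-line clip.cpp change is worth flagging: the revert swaps value-initialization back for an uninitialized automatic array. In C++, float C[5] = {} zeroes all five elements, while float C[5] leaves them indeterminate, so reading any C[k] before writing it is undefined behavior; whether that matters here depends on the surrounding image-manipulation code always writing each C[k] first. A minimal illustration of the difference:

    #include <cstdio>

    int main() {
        float zeroed[5] = {}; // value-initialized: all elements are 0.0f
        float raw[5];         // automatic storage, indeterminate values:
                              // reading raw[i] before writing it is UB
        raw[0] = 1.0f;        // fine once written
        std::printf("%f %f\n", zeroed[0], raw[0]);
        return 0;
    }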

examples/server/CMakeLists.txt

Lines changed: 0 additions & 3 deletions

@@ -70,9 +70,6 @@ endif()
 target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
 target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
 
-target_include_directories(${TARGET} PRIVATE ../mtmd)
-target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
-
 if (LLAMA_SERVER_SSL)
     find_package(OpenSSL REQUIRED)
     target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
Binary file changed (-572 KB), not shown.
