2 changes: 0 additions & 2 deletions common/CMakeLists.txt
@@ -57,8 +57,6 @@ add_library(${TARGET} STATIC
chat-parser.cpp
chat-parser.h
common.cpp
chat.h
chat.cpp
sampling.h
sampling.cpp
console.h
41 changes: 39 additions & 2 deletions common/common.cpp
@@ -270,6 +270,14 @@ static std::string parse_device_list(const std::string& value) {
return value;
}


std::pair<long, std::vector<char>> common_remote_get_content(const std::string& url, const common_remote_params&) {
if (!url.empty()) {
throw std::runtime_error("error: built without CURL, cannot download file from the internet");
}
return {};
}

//
// CLI argument parsing
//
@@ -1727,6 +1735,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.n_junk = std::stoi(argv[i]);
return true;
}
if (arg == "--no-context-shift") {
CHECK_ARG
params.ctx_shift = false;
return true;
}
if (arg == "--pos") {
CHECK_ARG
params.i_pos = std::stoi(argv[i]);
@@ -2060,7 +2073,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "multi-modality" });
options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" });

options.push_back({ "*", " --no-context-shift", "disable context-shift." });
options.push_back({ "backend" });
options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });

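Note (not part of the diff): the two hunks above wire a new --no-context-shift argument that simply flips params.ctx_shift to false. As a hedged sketch of how a consumer of gpt_params might honor that flag — the helper below is hypothetical and is not code from this PR:

// Hypothetical consumer-side check (illustrative only, not from this PR):
// when ctx_shift is disabled, stop decoding once the context window is full
// instead of discarding the oldest tokens.
static bool can_continue_decoding(const gpt_params & params, int n_ctx, int n_past) {
    if (n_past < n_ctx) {
        return true;    // still room in the context window
    }
    if (!params.ctx_shift) {
        return false;   // context full and shifting disabled: caller should stop
    }
    return true;        // caller would shift out old tokens and keep going
}
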
@@ -3311,6 +3324,29 @@ std::vector<llama_token> llama_tokenize(
return result;
}

std::vector<llama_token> llama_tokenize(
const struct llama_vocab* vocab,
const std::string& text,
bool add_special,
bool parse_special) {
// upper limit for the number of tokens
int n_tokens = text.length() + 2 * add_special;
std::vector<llama_token> result(n_tokens);
n_tokens = llama_vocab_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
if (n_tokens == std::numeric_limits<int32_t>::min()) {
throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
}
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_vocab_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
GGML_ASSERT(check == -n_tokens);
}
else {
result.resize(n_tokens);
}
return result;
}

std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
std::string piece;
piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
@@ -3343,7 +3379,7 @@ std::string llama_token_to_piece(const struct llama_model* model, llama_token to
return piece;
}

std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
std::string llama_detokenize(const llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
std::string text;
text.resize(std::max(text.capacity(), tokens.size()));
int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
@@ -3359,6 +3395,7 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
return text;
}


bool llama_should_add_bos_token(const llama_model * model) {
const int add_bos = llama_add_bos_token(model);

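Note (not part of the diff): a minimal usage sketch for the new vocab-based llama_tokenize overload and the now const-qualified llama_detokenize shown above. The variables vocab and ctx are assumed to come from the usual model-loading boilerplate and are not defined in this PR.

// Round-trip sketch: tokenize a prompt with the new overload, then detokenize it.
// Assumes ctx is a valid llama_context and vocab belongs to the same model.
static std::string tokenize_round_trip(const llama_context * ctx,
                                       const llama_vocab * vocab,
                                       const std::string & prompt) {
    std::vector<llama_token> tokens = llama_tokenize(vocab, prompt,
                                                     /*add_special=*/true,
                                                     /*parse_special=*/false);
    return llama_detokenize(ctx, tokens, /*special=*/false);
}
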
82 changes: 24 additions & 58 deletions common/common.h
@@ -53,6 +53,8 @@ struct llama_lora_adapter_container : llama_lora_adapter_info {
struct llama_lora_adapter * adapter;
};

using llama_tokens = std::vector<llama_token>;

// build info
extern int LLAMA_BUILD_NUMBER;
extern char const * LLAMA_COMMIT;
@@ -237,7 +239,7 @@ struct gpt_params {
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it

bool ctx_shift = true;
bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
@@ -371,6 +373,9 @@ struct gpt_params {
bool sweep_bench_output_jsonl = false;
};



void gpt_params_handle_hf_token(gpt_params & params);
void gpt_params_parse_from_env(gpt_params & params);
void gpt_params_handle_model_default(gpt_params & params);

@@ -381,6 +386,15 @@ void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);

std::string gpt_params_get_system_info(const gpt_params & params);


struct common_remote_params {
std::vector<std::string> headers;
long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB
};
// get remote file content, returns <http_code, raw_response_body>
std::pair<long, std::vector<char>> common_remote_get_content(const std::string& url, const common_remote_params& params);

//
// String utils
//
@@ -497,6 +511,12 @@ std::vector<llama_token> llama_tokenize(
bool add_special,
bool parse_special = false);

std::vector<llama_token> llama_tokenize(
const struct llama_vocab* vocab,
const std::string& text,
bool add_special,
bool parse_special = false);

// tokenizes a token into a piece, optionally renders special/control tokens
// should work similar to Python's `tokenizer.id_to_piece`
std::string llama_token_to_piece(
@@ -513,70 +533,16 @@ std::string llama_token_to_piece(
// should work similar to Python's `tokenizer.decode`
// optionally renders special/control tokens
std::string llama_detokenize(
llama_context * ctx,
const llama_context * ctx,
const std::vector<llama_token> & tokens,
bool special = true);


// Uses the value from the model metadata if possible, otherwise
// defaults to true when model type is SPM, otherwise false.
bool llama_should_add_bos_token(const llama_model * model);

//
// Chat template utils
//
//struct common_tool_call {
// std::string name;
// std::string arguments;
// std::string id;
//};
//
//// same with llama_chat_message, but uses std::string
//struct common_chat_msg {
// std::string role;
// std::string content;
// std::vector<common_tool_call> tool_calls;
// std::string reasoning_content = "";
//};

//// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
//bool llama_chat_verify_template(const struct llama_model* , const std::string& tmpl, bool use_jinja);
//
//namespace minja {
// class chat_template;
//}
//
//typedef minja::chat_template common_chat_template;
//
//struct common_chat_templates {
// bool has_explicit_template; // Model had builtin template or template overridde was specified.
// std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
// std::unique_ptr<common_chat_template> template_tool_use;
//};
//
//
//// CPP wrapper for llama_chat_apply_template
//// If the built-in template is not supported, we default to chatml
//// If the custom "tmpl" is not supported, we throw an error
//std::string llama_chat_apply_template(
// const struct llama_model* model,
// const common_chat_template& tmpl,
// const std::vector< common_chat_msg>& chat,
// bool add_ass,
// bool use_jinja);
//
//// Format single message, while taking into account the position of that message in chat history
//std::string llama_chat_format_single(const struct llama_model* model,
// const common_chat_template& tmpl,
// const std::vector< common_chat_msg>& past_msg,
// const common_chat_msg& new_msg,
// bool add_ass,
// bool use_jinja);
//
//// Returns an example of formatted chat
//std::string llama_chat_format_example(const struct llama_model* model,
// const common_chat_template& tmpl, bool use_jinja);
//
//common_chat_templates llama_chat_templates_from_model(const struct llama_model* model, const std::string& chat_template_override);



//
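Note (not part of the diff): a hedged sketch of how the new common_remote_get_content helper might be called, assuming common.h and the usual C++ standard headers are included. In this non-CURL build the stub added to common.cpp throws for any non-empty URL, so the call is wrapped in a try/catch; the 30-second timeout and 1 MiB cap below are arbitrary example values.

// Fetch a remote file and report the HTTP status and body size (sketch only).
static void fetch_example(const std::string & url) {
    common_remote_params params;
    params.timeout  = 30;            // CURLOPT_TIMEOUT in seconds; 0 = no timeout
    params.max_size = 1024 * 1024;   // cap the response body at 1 MiB
    try {
        std::pair<long, std::vector<char>> res = common_remote_get_content(url, params);
        printf("HTTP %ld, %zu bytes\n", res.first, res.second.size());
    } catch (const std::runtime_error & err) {
        // reached when the binary was built without CURL support
        fprintf(stderr, "%s\n", err.what());
    }
}
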
2 changes: 1 addition & 1 deletion examples/mtmd/clip.cpp
@@ -3331,7 +3331,7 @@ struct image_manipulation {
dst.buf.resize(3 * target_width * target_height);

float Cc;
float C[5];
float C[5] = {};
float d0, d2, d3, a0, a1, a2, a3;
int i, j, k, jj;
int x, y;
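Note (not part of the diff): the one-line clip.cpp change value-initializes the local coefficient array, so the interpolation code no longer risks reading indeterminate floats. A standalone illustration of the difference (not code from this PR):

#include <cstdio>

int main() {
    float uninit[5];        // elements are indeterminate; reading them before
                            // assignment is undefined behavior
    float zeroed[5] = {};   // value-initialized: every element starts at 0.0f
    std::printf("%f\n", zeroed[4]);   // well-defined, prints 0.000000
    (void) uninit;                    // deliberately never read
    return 0;
}
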
3 changes: 3 additions & 0 deletions examples/server/CMakeLists.txt
@@ -70,6 +70,9 @@ endif()
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})

target_include_directories(${TARGET} PRIVATE ../mtmd)
target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})

if (LLAMA_SERVER_SSL)
find_package(OpenSSL REQUIRED)
target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
Binary file modified examples/server/public/index.html.gz
Binary file not shown.