
Commit a36e0e1

Merge branch 'master' into prune
2 parents: 2ea44c4 + bb16041

108 files changed: +6989 additions, -4722 deletions


.github/workflows/build.yml

Lines changed: 2 additions & 1 deletion
@@ -693,7 +693,7 @@ jobs:
           - build: 'openblas-x64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
           - build: 'vulkan-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
+            defines: '-DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
           - build: 'llvm-arm64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
           - build: 'llvm-arm64-opencl-adreno'
@@ -778,6 +778,7 @@ jobs:
           cmake -S . -B build ${{ matrix.defines }} `
             -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
           cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
+          cp $env:CURL_PATH/bin/libcurl-*.dll build/bin/Release

       - name: Add libopenblas.dll
         id: add_libopenblas_dll
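
The vulkan-x64 entry now sets the build type explicitly, and the packaging step copies the libcurl runtime DLL next to the produced binaries. A rough local equivalent of the updated configure/build step, as a sketch only (the generator defaults and the CURL_PATH location are assumptions, not taken from the workflow):

    # sketch of the updated vulkan-x64 configure + build (paths are placeholders)
    cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON \
          -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON
    cmake --build build --config Release -j
    cp "$CURL_PATH"/bin/libcurl-*.dll build/bin/Release   # assumes CURL_PATH points at the curl install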

ci/run.sh

Lines changed: 2 additions & 2 deletions
@@ -39,7 +39,7 @@ sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`

-CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=OFF"
+CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"

 if [ ! -z ${GG_BUILD_METAL} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
@@ -779,7 +779,7 @@ function gg_run_rerank_tiny {
     model_f16="${path_models}/ggml-model-f16.gguf"

     # for this model, the SEP token is "</s>"
-    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
+    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log

     # sample output
     # rerank score 0: 0.029
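
The rerank test prompt now separates each query/document pair with \t instead of hard-coding the model's </s> SEP tokens; the tab is the default classification separator introduced by the new --cls-separator option (see common/arg.cpp below). A minimal standalone sketch of an equivalent call, assuming the embedding example splits rank-pooling input on that separator (model path is a placeholder):

    # single tab-separated query/document pair, mirroring the CI invocation above
    ./bin/llama-embedding --model ggml-model-f16.gguf -c 0 -ngl 99 \
        --pooling rank --embd-normalize -1 \
        -p "what is panda?\tThe giant panda is a bear species endemic to China."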

common/arg.cpp

Lines changed: 36 additions & 6 deletions
@@ -988,10 +988,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         params.tensor_buft_overrides.push_back({nullptr, nullptr});
     }

-    if (params.reranking && params.embedding) {
-        throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
-    }
-
     if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
         throw std::runtime_error(string_format(
             "error: the supplied chat template is not supported: %s%s\n",
@@ -2710,6 +2706,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.embd_sep = value;
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(common_arg(
+        {"--cls-separator"}, "STRING",
+        "separator of classification sequences (default \\t) for example \"<#seq#>\"",
+        [](common_params & params, const std::string & value) {
+            params.cls_sep = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
     add_opt(common_arg(
         {"--host"}, "HOST",
         string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
@@ -2747,9 +2750,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
     add_opt(common_arg(
         {"--reranking", "--rerank"},
-        string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
+        string_format("enable reranking endpoint on server (default: %s)", "disabled"),
         [](common_params & params) {
-            params.reranking = true;
+            params.embedding = true;
+            params.pooling_type = LLAMA_POOLING_TYPE_RANK;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
     add_opt(common_arg(
@@ -3213,6 +3217,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.model.path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+    add_opt(common_arg(
+        {"-ctkd", "--cache-type-k-draft"}, "TYPE",
+        string_format(
+            "KV cache data type for K for the draft model\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.speculative.cache_type_k)
+        ),
+        [](common_params & params, const std::string & value) {
+            params.speculative.cache_type_k = kv_cache_type_from_str(value);
+        }
+    ).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT"));
+    add_opt(common_arg(
+        {"-ctvd", "--cache-type-v-draft"}, "TYPE",
+        string_format(
+            "KV cache data type for V for the draft model\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.speculative.cache_type_v)
+        ),
+        [](common_params & params, const std::string & value) {
+            params.speculative.cache_type_v = kv_cache_type_from_str(value);
+        }
+    ).set_env("LLAMA_ARG_CACHE_TYPE_V_DRAFT"));

     add_opt(common_arg(
         {"-mv", "--model-vocoder"}, "FNAME",

common/chat.cpp

Lines changed: 1 addition & 1 deletion
@@ -1838,7 +1838,7 @@ static common_chat_params common_chat_templates_apply_legacy(
     if (res < 0) {
         // if the custom "tmpl" is not supported, we throw an error
         // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
-        throw std::runtime_error("this custom template is not supported");
+        throw std::runtime_error("this custom template is not supported, try using --jinja");
     }

     // if it turns out that our buffer is too small, we resize it
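
The extended message points users at the Jinja template engine when the legacy path cannot handle a custom template. A hedged example of the suggested fallback (model path is a placeholder):

    # retry with the Jinja chat-template engine instead of the legacy formatter
    llama-cli -m model.gguf --jinja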

common/common.cpp

Lines changed: 50 additions & 37 deletions
@@ -706,11 +706,17 @@ bool fs_validate_filename(const std::string & filename) {
     // disable C++17 deprecation warning for std::codecvt_utf8
 # pragma clang diagnostic push
 # pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#elif defined(__GNUC__)
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif
+
     std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;

 #if defined(__clang__)
 # pragma clang diagnostic pop
+#elif defined(__GNUC__)
+# pragma GCC diagnostic pop
 #endif

     filename_utf32 = converter.from_bytes(filename);
@@ -767,6 +773,9 @@ bool fs_validate_filename(const std::string & filename) {
     return true;
 }

+#include <iostream>
+
+
 // returns true if successful, false otherwise
 bool fs_create_directory_with_parents(const std::string & path) {
 #ifdef _WIN32
@@ -784,9 +793,16 @@ bool fs_create_directory_with_parents(const std::string & path) {
     // process path from front to back, procedurally creating directories
     while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
         const std::wstring subpath = wpath.substr(0, pos_slash);
-        const wchar_t * test = subpath.c_str();

-        const bool success = CreateDirectoryW(test, NULL);
+        pos_slash += 1;
+
+        // skip the drive letter, in some systems it can return an access denied error
+        if (subpath.length() == 2 && subpath[1] == ':') {
+            continue;
+        }
+
+        const bool success = CreateDirectoryW(subpath.c_str(), NULL);
+
         if (!success) {
             const DWORD error = GetLastError();

@@ -800,8 +816,6 @@ bool fs_create_directory_with_parents(const std::string & path) {
                 return false;
             }
         }
-
-        pos_slash += 1;
     }

     return true;
@@ -897,34 +911,6 @@ struct common_init_result common_init_from_params(common_params & params) {

     const llama_vocab * vocab = llama_model_get_vocab(model);

-    if (params.reranking) {
-        bool ok = true;
-
-        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
-            ok = false;
-        }
-
-        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
-        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
-
-        if (!has_eos && !has_sep) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
-            ok = false;
-        } else if (!has_eos) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
-        } else if (!has_sep) {
-            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
-            ok = false;
-        }
-
-        if (!ok) {
-            llama_model_free(model);
-
-            return iparams;
-        }
-    }
-
     auto cparams = common_context_params_to_llama(params);

     llama_context * lctx = llama_init_from_model(model, cparams);
@@ -966,6 +952,35 @@ struct common_init_result common_init_from_params(common_params & params) {
         }
     }

+    if (llama_pooling_type(lctx) == LLAMA_POOLING_TYPE_RANK) {
+        bool ok = true;
+
+        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
+        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
+
+        if (!has_eos && !has_sep) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+            ok = false;
+        } else if (!has_eos) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
+        } else if (!has_sep) {
+            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        if (!ok) {
+            llama_free(lctx);
+            llama_model_free(model);
+
+            return iparams;
+        }
+    }
+
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
         llama_adapter_lora_ptr lora;
@@ -1143,11 +1158,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.op_offload = !params.no_op_offload;
     cparams.swa_full = params.swa_full;

-    if (params.reranking) {
-        cparams.embeddings = true;
-        cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
-    }
-
     cparams.type_k = params.cache_type_k;
     cparams.type_v = params.cache_type_v;

@@ -1280,6 +1290,9 @@ std::vector<llama_token> common_tokenize(
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
     n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    if (n_tokens == std::numeric_limits<int32_t>::min()) {
+        throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
+    }
     if (n_tokens < 0) {
         result.resize(-n_tokens);
         int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);

common/common.h

Lines changed: 4 additions & 1 deletion
@@ -199,6 +199,9 @@ struct common_params_speculative {
     float p_split = 0.1f; // speculative decoding split probability
     float p_min = 0.75f; // minimum speculative decoding probability (greedy)

+    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
+    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
+
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;

@@ -355,7 +358,7 @@ struct common_params {
     int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
     std::string embd_sep = "\n"; // separator of embeddings
-    bool reranking = false; // enable reranking support on server
+    std::string cls_sep = "\t"; // separator of classification sequences

     // server params
     int32_t port = 8080; // server listens on this network port
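
The new speculative cache-type fields default to GGML_TYPE_F16 and are exposed through the -ctkd/-ctvd flags and the environment variables registered in common/arg.cpp above. A hedged equivalent using the environment variables (model paths are placeholders):

    LLAMA_ARG_CACHE_TYPE_K_DRAFT=q8_0 LLAMA_ARG_CACHE_TYPE_V_DRAFT=q8_0 \
        llama-server -m target-model.gguf -md draft-model.gguf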
