Skip to content

Commit df01a89

Browse files
authored
Merge branch 'ggerganov:master' into k-shift2
2 parents 8411453 + 9830b69 commit df01a89

File tree

4 files changed

+51
-24
lines changed

4 files changed

+51
-24
lines changed

CMakePresets.json

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,10 +48,23 @@
4848
}
4949
},
5050

51+
{
52+
"name": "arm64-apple-clang", "hidden": true,
53+
"architecture": { "value": "arm64", "strategy": "external" },
54+
"toolset": { "value": "host=x64", "strategy": "external" },
55+
"cacheVariables": {
56+
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake"
57+
}
58+
},
59+
5160
{ "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
5261
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
5362
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },
5463

64+
{ "name": "arm64-apple-clang-debug" , "inherits": [ "base", "arm64-apple-clang", "debug" ] },
65+
{ "name": "arm64-apple-clang-release" , "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
66+
{ "name": "arm64-apple-clang+static-release" , "inherits": [ "base", "arm64-apple-clang", "reldbg", "static" ] },
67+
5568
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
5669
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
5770
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },

cmake/arm64-apple-clang.cmake

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
set( CMAKE_SYSTEM_NAME Darwin )
2+
set( CMAKE_SYSTEM_PROCESSOR arm64 )
3+
4+
set( target arm64-apple-darwin-macho )
5+
6+
set( CMAKE_C_COMPILER clang )
7+
set( CMAKE_CXX_COMPILER clang++ )
8+
9+
set( CMAKE_C_COMPILER_TARGET ${target} )
10+
set( CMAKE_CXX_COMPILER_TARGET ${target} )
11+
12+
set( arch_c_flags "-march=armv8.4-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
13+
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function" )
14+
15+
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
16+
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )

examples/server/server.cpp

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,7 @@ struct server_slot {
247247
if (is_processing()) {
248248
SLT_INF(*this, "stop processing: n_past = %d, truncated = %d\n", n_past, truncated);
249249

250+
t_last_used = ggml_time_us();
250251
t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
251252
state = SLOT_STATE_IDLE;
252253
callback_on_release(id);
@@ -730,7 +731,7 @@ struct server_context {
730731

731732
// find the slot that has at least n% prompt similarity
732733
if (ret == nullptr && slot_prompt_similarity != 0.0f) {
733-
int max_lcs_len = 0;
734+
int lcs_len = 0;
734735
float similarity = 0;
735736

736737
for (server_slot & slot : slots) {
@@ -745,20 +746,21 @@ struct server_context {
745746
}
746747

747748
// length of the Longest Common Subsequence between the current slot's prompt and the input prompt
748-
int lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
749+
int cur_lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
749750

750751
// fraction of the common subsequence length compared to the current slot's prompt length
751-
similarity = static_cast<float>(lcs_len) / static_cast<int>(slot.cache_tokens.size());
752+
float cur_similarity = static_cast<float>(cur_lcs_len) / static_cast<int>(slot.cache_tokens.size());
752753

753754
// select the current slot if the criteria match
754-
if (lcs_len > max_lcs_len && similarity > slot_prompt_similarity) {
755-
max_lcs_len = lcs_len;
755+
if (cur_lcs_len > lcs_len && cur_similarity > slot_prompt_similarity) {
756+
lcs_len = cur_lcs_len;
757+
similarity = cur_similarity;
756758
ret = &slot;
757759
}
758760
}
759761

760762
if (ret != nullptr) {
761-
SLT_DBG(*ret, "selected slot by lcs similarity, max_lcs_len = %d, similarity = %f\n", max_lcs_len, similarity);
763+
SLT_DBG(*ret, "selected slot by lcs similarity, lcs_len = %d, similarity = %f\n", lcs_len, similarity);
762764
}
763765
}
764766

@@ -2705,8 +2707,8 @@ int main(int argc, char ** argv) {
27052707
};
27062708

27072709
const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) {
2708-
if (ctx_server.params.embedding || ctx_server.params.reranking) {
2709-
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
2710+
if (ctx_server.params.embedding) {
2711+
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
27102712
return;
27112713
}
27122714

@@ -2811,8 +2813,8 @@ int main(int argc, char ** argv) {
28112813

28122814
// TODO: maybe merge this function with "handle_completions_generic"
28132815
const auto handle_chat_completions = [&ctx_server, &params, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) {
2814-
if (ctx_server.params.embedding || ctx_server.params.reranking) {
2815-
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
2816+
if (ctx_server.params.embedding) {
2817+
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
28162818
return;
28172819
}
28182820

@@ -2937,11 +2939,6 @@ int main(int argc, char ** argv) {
29372939
};
29382940

29392941
const auto handle_embeddings = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
2940-
// TODO: somehow clean up this checks in the future
2941-
if (!ctx_server.params.embedding || ctx_server.params.reranking) {
2942-
res_error(res, format_error_response("This server does not support embeddings. Start it with `--embeddings` and without `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
2943-
return;
2944-
}
29452942
const json body = json::parse(req.body);
29462943
bool is_openai = false;
29472944

@@ -2993,10 +2990,11 @@ int main(int argc, char ** argv) {
29932990
};
29942991

29952992
const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
2996-
if (!ctx_server.params.reranking) {
2997-
res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
2993+
if (!ctx_server.params.reranking || ctx_server.params.embedding) {
2994+
res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED));
29982995
return;
29992996
}
2997+
30002998
const json body = json::parse(req.body);
30012999

30023000
// TODO: implement

examples/server/utils.hpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -453,20 +453,20 @@ static size_t longest_common_subsequence(const llama_tokens & a, const llama_tok
453453
}
454454

455455
// get the lengths of the input sequences
456-
int a_len = a.size();
457-
int b_len = b.size();
456+
size_t a_len = a.size();
457+
size_t b_len = b.size();
458458

459459
// initialize the maximum length of the longest common subsequence (LCS)
460-
int max_length = 0;
460+
size_t max_length = 0;
461461

462462
// use two rows instead of a 2D matrix to optimize space
463-
std::vector<int> prev_row(b_len + 1, 0);
464-
std::vector<int> curr_row(b_len + 1, 0);
463+
std::vector<size_t> prev_row(b_len + 1, 0);
464+
std::vector<size_t> curr_row(b_len + 1, 0);
465465

466466
// iterate through the elements of a
467-
for (int i = 1; i <= a_len; i++) {
467+
for (size_t i = 1; i <= a_len; i++) {
468468
// iterate through the elements of b
469-
for (int j = 1; j <= b_len; j++) {
469+
for (size_t j = 1; j <= b_len; j++) {
470470
// if elements at the current positions match
471471
if (a[i - 1] == b[j - 1]) {
472472
// if it's the first element of either sequence, set LCS length to 1

0 commit comments

Comments
 (0)