diff --git a/examples/run/run.cpp b/examples/run/run.cpp index e567ad716a30d..c710d432674a9 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -634,20 +634,20 @@ class LlamaData { return path.substr(pos + 1); } - int remove_proto(std::string & model_) { - const std::string::size_type pos = model_.find("://"); + int rm_until_substring(std::string & model_, const std::string & substring) { + const std::string::size_type pos = model_.find(substring); if (pos == std::string::npos) { return 1; } - model_ = model_.substr(pos + 3); // Skip past "://" + model_ = model_.substr(pos + substring.size()); // Skip past the substring return 0; } int resolve_model(std::string & model_) { int ret = 0; if (string_starts_with(model_, "file://") || std::filesystem::exists(model_)) { - remove_proto(model_); + rm_until_substring(model_, "://"); return ret; } @@ -656,13 +656,16 @@ class LlamaData { const std::vector headers = { "--header", "Accept: application/vnd.docker.distribution.manifest.v2+json" }; if (string_starts_with(model_, "hf://") || string_starts_with(model_, "huggingface://")) { - remove_proto(model_); + rm_until_substring(model_, "://"); + ret = huggingface_dl(model_, headers, bn); + } else if (string_starts_with(model_, "hf.co/")) { + rm_until_substring(model_, "hf.co/"); ret = huggingface_dl(model_, headers, bn); } else if (string_starts_with(model_, "ollama://")) { - remove_proto(model_); + rm_until_substring(model_, "://"); ret = ollama_dl(model_, headers, bn); } else if (string_starts_with(model_, "https://")) { - download(model_, headers, bn, true); + ret = download(model_, headers, bn, true); } else { ret = ollama_dl(model_, headers, bn); } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 4cfb3c9bbd7d0..a94c3822c4dd7 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1433,6 +1433,10 @@ struct server_queue { } else { queue_tasks.push_back(std::move(task)); } + // if this is cancel task make sure to 
clean up pending tasks + if (task.type == SERVER_TASK_TYPE_CANCEL) { + cleanup_pending_task(task.id_target); + } condition_tasks.notify_one(); return task.id; } @@ -1450,6 +1454,10 @@ struct server_queue { } else { queue_tasks.push_back(std::move(task)); } + // if this is a cancel task, make sure to clean up pending tasks + if (task.type == SERVER_TASK_TYPE_CANCEL) { + cleanup_pending_task(task.id_target); + } } condition_tasks.notify_one(); return 0; } @@ -1544,6 +1552,20 @@ struct server_queue { } } } + +private: + void cleanup_pending_task(int id_task) { + // no need to lock because this is called exclusively by post() + auto rm_func = [id_task](const server_task & task) { + return task.id_target == id_task; + }; + queue_tasks.erase( + std::remove_if(queue_tasks.begin(), queue_tasks.end(), rm_func), + queue_tasks.end()); + queue_tasks_deferred.erase( + std::remove_if(queue_tasks_deferred.begin(), queue_tasks_deferred.end(), rm_func), + queue_tasks_deferred.end()); + } }; struct server_response { @@ -1579,6 +1601,12 @@ struct server_response { std::unique_lock lock(mutex_results); waiting_task_ids.erase(id_task); + // make sure to clean up all pending results + queue_results.erase( + std::remove_if(queue_results.begin(), queue_results.end(), [id_task](const server_task_result_ptr & res) { + return res->id == id_task; + }), + queue_results.end()); } void remove_waiting_task_ids(const std::unordered_set & id_tasks) { @@ -1598,7 +1626,7 @@ struct server_response { return !queue_results.empty(); }); - for (int i = 0; i < (int) queue_results.size(); i++) { + for (size_t i = 0; i < queue_results.size(); i++) { if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) { server_task_result_ptr res = std::move(queue_results[i]); queue_results.erase(queue_results.begin() + i); @@ -1615,12 +1643,6 @@ struct server_response { server_task_result_ptr recv_with_timeout(const std::unordered_set & id_tasks, int timeout) { while (true) { std::unique_lock lock(mutex_results); - bool
cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout), [&]{ - return !queue_results.empty(); - }); - if (!cr_res) { - return nullptr; - } for (int i = 0; i < (int) queue_results.size(); i++) { if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) { @@ -1629,6 +1651,11 @@ struct server_response { return res; } } + + std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout)); + if (cr_res == std::cv_status::timeout) { + return nullptr; + } } // should never reach here @@ -2376,8 +2403,8 @@ struct server_context { server_task task(SERVER_TASK_TYPE_CANCEL); task.id_target = id_task; - cancel_tasks.push_back(task); queue_results.remove_waiting_task_id(id_task); + cancel_tasks.push_back(task); } // push to beginning of the queue, so it has highest priority queue_tasks.post(cancel_tasks, true); diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index c22a662876c4a..5ec2561597161 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -2125,7 +2125,7 @@ class tinyBLAS_PPC { switch(m_rem) { case 1: mc = 1; - gemm_small(m0, m, n0, n, mc, nc); + gemv(m0, m, n0, n, mc, nc); break; case 2: mc = 2; @@ -2143,7 +2143,7 @@ class tinyBLAS_PPC { switch(n_rem) { case 1: nc = 1; - gemm_small(m0, m, n0, n, mc, nc); + gemv(m0, m, n0, n, mc, nc); break; case 2: nc = 2; @@ -2171,7 +2171,7 @@ class tinyBLAS_PPC { case 0x41: mc = 4; nc = 1; - gemm_small(m0, m, n0, n, mc, nc); + gemv(m0, m, n0, n, mc, nc); break; case 0x34: mc = 3; @@ -2191,7 +2191,7 @@ class tinyBLAS_PPC { case 0x31: mc = 3; nc = 1; - gemm_small(m0, m, n0, n, mc, nc); + gemv(m0, m, n0, n, mc, nc); break; case 0x24: mc = 2; @@ -2211,27 +2211,27 @@ class tinyBLAS_PPC { case 0x21: mc = 2; nc = 1; - gemm_small(m0, m, n0, n, mc, nc); + gemv(m0, m, n0, n, mc, nc); break; case 0x14: mc = 1; nc = 4; - gemm_small(m0, m, n0, n, mc, nc); + gemv(m0, m, n0, n, mc, nc); break; case 0x13: mc = 1; nc = 3; - 
gemm_small(m0, m, n0, n, mc, nc); + gemv(m0, m, n0, n, mc, nc); break; case 0x12: mc = 1; nc = 2; - gemm_small(m0, m, n0, n, mc, nc); + gemv(m0, m, n0, n, mc, nc); break; case 0x11: mc = 1; nc = 1; - gemm_small(m0, m, n0, n, mc, nc); + gemv(m0, m, n0, n, mc, nc); break; default: return; @@ -2285,6 +2285,53 @@ class tinyBLAS_PPC { } } + void gemv(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN) { + //printf("In gemv, RM = %d, RN = %d \n", RM, RN); + int64_t ytiles = (m - m0) / RM; + int64_t xtiles = (n - n0) / RN; + int64_t tiles = xtiles * ytiles; + int64_t duty = (tiles + nth - 1) / nth; + int64_t start = duty * ith; + int64_t end = start + duty; + if (end > tiles) + end = tiles; + for (int64_t job = start; job < end; ++job) { + int64_t ii = m0 + job / xtiles * RM; + int64_t jj = n0 + job % xtiles * RN; + vec_t vec_C[4]; + acc_t acc_0; + __builtin_mma_xxsetaccz(&acc_0); + vec_t vec_A[4], vec_B[4]; + for (int l=0; l(A+(ii)*lda+l); + packTranspose(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B); + vec_A[0] = (vec_t)vec_xl(0,a); + vec_A[1] = (vec_t)vec_splats(*((TA*)&vec_A+1)); + vec_A[2] = (vec_t)vec_splats(*((TA*)&vec_A+2)); + vec_A[3] = (vec_t)vec_splats(*((TA*)&vec_A+3)); + } else if (RN == 1) { + packTranspose(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A); + TB* b = const_cast(B+(jj)*ldb+l); + vec_B[0] = (vec_t)vec_xl(0,b); + vec_B[1] = (vec_t)vec_splats(*((TB*)&vec_B+1)); + vec_B[2] = (vec_t)vec_splats(*((TB*)&vec_B+2)); + vec_B[3] = (vec_t)vec_splats(*((TB*)&vec_B+3)); + } + __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]); + __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]); + __builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]); + __builtin_mma_xvf32gerpp(&acc_0, vec_A[3], vec_B[3]); + } + __builtin_mma_disassemble_acc(vec_C, &acc_0); + for (int I = 0; I < RM; I++) { + for (int J = 0; J < RN; J++) { + *((TC*)(C+ii+((jj+J)*ldc)+I)) = *((TC*)&vec_C[I]+J); + } + } + } + } + template NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) { int64_t 
ytiles = (m - m0) / RM; @@ -2370,9 +2417,11 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64 assert(params->ith < params->nth); // only enable sgemm for prompt processing +/*#if !defined(__MMA__) if (n < 2) return false; - +#endif +*/ if (Ctype != GGML_TYPE_F32) return false; @@ -2401,7 +2450,7 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64 (const float *)B, ldb, (float *)C, ldc}; return tb.matmul(m, n); -#elif defined(__MMA__) +/*#elif defined(__MMA__) if (k % 8) return false; tinyBLAS_PPC tb{ @@ -2411,6 +2460,7 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64 params->ith, params->nth}; tb.matmul(m, n); return true; +*/ #else return false; #endif