
Commit 94564ac

Merge branch 'master' into xsn/llava2

2 parents: 96bf95e + d3bd719

File tree: 31 files changed (+1004 / −144 lines)


.devops/llama-cli-cann.Dockerfile

Lines changed: 2 additions & 2 deletions
@@ -1,12 +1,12 @@
-ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
+ARG ASCEND_VERSION=8.1.RC1.alpha001-910b-openeuler22.03-py3.10
 
 FROM ascendai/cann:$ASCEND_VERSION AS build
 
 WORKDIR /app
 
 COPY . .
 
-RUN yum install -y gcc g++ cmake make
+RUN yum install -y gcc g++ cmake make libcurl-devel
 ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
 ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
 ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}

.github/workflows/build.yml

Lines changed: 2 additions & 2 deletions
@@ -1771,7 +1771,7 @@ jobs:
     strategy:
       matrix:
         cann:
-          - '8.0.rc3.beta1-910b-openeuler22.03-py3.10'
+          - '8.1.RC1.alpha001-910b-openeuler22.03-py3.10'
         device:
           - 'ascend910b3'
         build:
@@ -1784,7 +1784,7 @@ jobs:
     - name: Dependencies
       run: |
         yum update -y
-        yum install -y git gcc gcc-c++ make cmake
+        yum install -y git gcc gcc-c++ make cmake libcurl-devel
 
     - name: Build
       run: |

.github/workflows/docker.yml

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ jobs:
           # Multi-stage build
           - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false}
           - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
-          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: true}
           - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
           - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
           # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete

README.md

Lines changed: 1 addition & 7 deletions
@@ -9,13 +9,6 @@
 
 Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
 
-> [!IMPORTANT]
-> New `llama.cpp` package location: [ggml-org/llama.cpp](https://github.com/ggml-org/llama.cpp/pkgs/container/llama.cpp)
->
-> Update your container URLs to: `ghcr.io/ggml-org/llama.cpp`
->
-> More info: https://github.com/ggml-org/llama.cpp/discussions/11801
-
 ## Recent API changes
 
 - [Changelog for `libllama` API](https://github.com/ggml-org/llama.cpp/issues/9289)
@@ -247,6 +240,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [Vulkan](docs/build.md#vulkan) | GPU |
 | [CANN](docs/build.md#cann) | Ascend NPU |
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
+| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/examples/rpc) | All |
 
 ## Building the project
 

convert_hf_to_gguf.py

Lines changed: 10 additions & 0 deletions
@@ -2459,6 +2459,16 @@ def prepare_tensors(self):
             raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@Model.register("Qwen3ForCausalLM")
+class Qwen3Model(Qwen2Model):
+    model_arch = gguf.MODEL_ARCH.QWEN3
+
+
+@Model.register("Qwen3MoeForCausalLM")
+class Qwen3MoeModel(Qwen2MoeModel):
+    model_arch = gguf.MODEL_ARCH.QWEN3MOE
+
+
 @Model.register("GPT2LMHeadModel")
 class GPT2Model(Model):
     model_arch = gguf.MODEL_ARCH.GPT2

docs/backend/SYCL.md

Lines changed: 4 additions & 4 deletions
@@ -425,13 +425,13 @@ Examples:
 - Use device 0:
 
 ```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
 ```
 
 - Use multiple devices:
 
 ```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
 ```
 
 *Notes:*
@@ -697,13 +697,13 @@ Examples:
 - Use device 0:
 
 ```
-build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
+build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
 ```
 
 - Use multiple devices:
 
 ```
-build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
+build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
 ```
 

examples/llava/clip.cpp

Lines changed: 10 additions & 5 deletions
@@ -314,7 +314,6 @@ struct clip_ctx {
     float image_std[3];
     bool use_gelu = false;
     bool use_silu = false;
-    int32_t ftype = 1;
 
     struct gguf_context * ctx_gguf = nullptr;
     struct ggml_context * ctx_data = nullptr;
@@ -363,6 +362,7 @@ struct clip_ctx {
         if (backend_cpu != backend) {
            ggml_backend_free(backend_cpu);
         }
+        clip_image_size_free(load_image_size);
     }
 };
 
@@ -1124,9 +1124,6 @@ struct clip_model_loader {
 
         // print gguf info
         {
-            int ftype = -1;
-            get_u32(KEY_FTYPE, ftype, false);
-            const std::string ftype_str = ggml_type_name(static_cast<ggml_type>(ftype));
             std::string name;
             get_string(KEY_NAME, name, false);
             std::string description;
@@ -1137,7 +1134,6 @@
             LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx_gguf.get()));
             LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
             LOG_INF("%s: n_kv: %d\n", __func__, (int)gguf_get_n_kv(ctx_gguf.get()));
-            LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str());
             LOG_INF("\n");
         }
 
@@ -1607,6 +1603,12 @@ unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx
     return img->buf.data();
 }
 
+void clip_image_size_free(struct clip_image_size * load_image_size) {
+    if (load_image_size == nullptr) {
+        return;
+    }
+    delete load_image_size;
+}
 void clip_image_u8_free(struct clip_image_u8 * img) { delete img; }
 void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
 void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) {
@@ -2259,6 +2261,9 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
 }
 
 void clip_free(clip_ctx * ctx) {
+    if (ctx == nullptr) {
+        return;
+    }
     delete ctx;
 }

examples/llava/clip.h

Lines changed: 1 addition & 0 deletions
@@ -80,6 +80,7 @@ CLIP_API struct clip_image_f32 * clip_image_f32_init();
 // nx, ny are the output image dimensions
 CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
 
+CLIP_API void clip_image_size_free (struct clip_image_size * img_size);
 CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
 CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
 CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
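Both clip changes follow the same null-safe cleanup convention: the new `clip_image_size_free` and the guarded `clip_free` simply return when handed a null pointer, so callers can free unconditionally. The standalone sketch below illustrates that idiom only; `image_size_t` and `image_size_free` are hypothetical stand-ins, not part of the clip API.

```cpp
// Minimal sketch of the null-safe free idiom used by clip_image_size_free()
// and the guarded clip_free() above. image_size_t is a hypothetical stand-in.
#include <cstdio>

struct image_size_t {
    int width;
    int height;
};

// Freeing a null pointer is a no-op, so callers never need to check first.
static void image_size_free(image_size_t * p) {
    if (p == nullptr) {
        return;
    }
    delete p;
}

int main() {
    image_size_t * sz = new image_size_t{448, 448};
    std::printf("size: %dx%d\n", sz->width, sz->height);
    image_size_free(sz);      // releases the object
    image_size_free(nullptr); // safe: explicitly a no-op
    return 0;
}
```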

examples/server/server.cpp

Lines changed: 20 additions & 3 deletions
@@ -1705,6 +1705,8 @@ struct server_queue {
 };
 
 struct server_response {
+    bool running = true;
+
     // for keeping track of all tasks waiting for the result
     std::unordered_set<int> waiting_task_ids;
 
@@ -1759,6 +1761,10 @@
         while (true) {
             std::unique_lock<std::mutex> lock(mutex_results);
             condition_results.wait(lock, [&]{
+                if (!running) {
+                    SRV_DBG("%s : queue result stop\n", __func__);
+                    std::terminate(); // we cannot return here since the caller is HTTP code
+                }
                 return !queue_results.empty();
             });
 
@@ -1789,6 +1795,10 @@
             }
 
             std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout));
+            if (!running) {
+                SRV_DBG("%s : queue result stop\n", __func__);
+                std::terminate(); // we cannot return here since the caller is HTTP code
+            }
             if (cr_res == std::cv_status::timeout) {
                 return nullptr;
             }
@@ -1818,6 +1828,12 @@
             }
         }
     }
+
+    // terminate the waiting loop
+    void terminate() {
+        running = false;
+        condition_results.notify_all();
+    }
 };
 
 struct server_context {
@@ -4491,9 +4507,10 @@ int main(int argc, char ** argv) {
     svr->new_task_queue = [&params] { return new httplib::ThreadPool(params.n_threads_http); };
 
     // clean up function, to be called before exit
-    auto clean_up = [&svr]() {
+    auto clean_up = [&svr, &ctx_server]() {
         SRV_INF("%s: cleaning up before exit...\n", __func__);
         svr->stop();
+        ctx_server.queue_results.terminate();
         llama_backend_free();
     };
 
@@ -4534,7 +4551,7 @@
 
     if (!ctx_server.load_model(params)) {
         clean_up();
-        // t.join(); // FIXME: see below
+        t.join();
         LOG_ERR("%s: exiting due to model loading error\n", __func__);
         return 1;
     }
@@ -4582,7 +4599,7 @@
     ctx_server.queue_tasks.start_loop();
 
     clean_up();
-    // t.join(); // FIXME: http thread may stuck if there is an ongoing request. we don't need to care about this for now as the HTTP connection will already be closed at this point, but it's better to fix this
+    t.join();
 
     return 0;
 }
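The shutdown path pairs a `running` flag with `condition_results.notify_all()`: `terminate()` flips the flag and wakes every thread blocked on the result queue, and the woken waiter calls `std::terminate()` because it cannot unwind back through the HTTP handler. With the queue able to stop, the `t.join()` calls that were previously commented out as a FIXME can be restored. Below is a minimal, self-contained sketch of the same stop-flag pattern; `result_queue` is a hypothetical stand-in for `server_response`, and it returns `false` on shutdown instead of terminating.

```cpp
// Sketch (assumed names, not the server code): a condition-variable wait that
// can be unblocked on shutdown by setting a flag and calling notify_all().
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <queue>
#include <thread>

struct result_queue {
    std::mutex              mtx;
    std::condition_variable cv;
    std::queue<int>         results;
    bool                    running = true;

    // Blocks until a result arrives or terminate() is called.
    // Returns false once the queue is stopped and drained.
    bool wait_pop(int & out) {
        std::unique_lock<std::mutex> lock(mtx);
        cv.wait(lock, [&] { return !running || !results.empty(); });
        if (!results.empty()) { // drain pending results even during shutdown
            out = results.front();
            results.pop();
            return true;
        }
        return false; // stopped: the real server calls std::terminate() here instead
    }

    void push(int v) {
        { std::lock_guard<std::mutex> lock(mtx); results.push(v); }
        cv.notify_all();
    }

    void terminate() {
        { std::lock_guard<std::mutex> lock(mtx); running = false; }
        cv.notify_all(); // wake every waiter so shutdown cannot hang
    }
};

int main() {
    result_queue q;
    std::thread waiter([&] {
        int r;
        while (q.wait_pop(r)) { std::printf("got result %d\n", r); }
        std::printf("queue stopped\n");
    });
    q.push(42);
    q.terminate(); // lets the waiter exit, so join() below cannot block forever
    waiter.join();
    return 0;
}
```

In the sketch the consumer exits cleanly; the server instead aborts via `std::terminate()` because, as the diff comment notes, the waiting call sits underneath the HTTP handler and has no safe value to return.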

examples/server/tests/unit/test_embedding.py

Lines changed: 20 additions & 0 deletions
@@ -49,6 +49,26 @@ def test_embedding_multiple():
         assert len(d['embedding']) > 1
 
 
+def test_embedding_multiple_with_fa():
+    server = ServerPreset.bert_bge_small_with_fa()
+    server.pooling = 'last'
+    server.start()
+    # one of these should trigger the FA branch (i.e. context size % 256 == 0)
+    res = server.make_request("POST", "/v1/embeddings", data={
+        "input": [
+            "a "*253,
+            "b "*254,
+            "c "*255,
+            "d "*256,
+        ],
+    })
+    assert res.status_code == 200
+    assert len(res.body['data']) == 4
+    for d in res.body['data']:
+        assert 'embedding' in d
+        assert len(d['embedding']) > 1
+
+
 @pytest.mark.parametrize(
     "input,is_multi_prompt",
     [
