
Commit 61b4c86

Merge branch 'ggml-org:master' into mradermacher
2 parents bccafff + d3bd719

File tree: 32 files changed, +1017 -144 lines


.devops/llama-cli-cann.Dockerfile

Lines changed: 2 additions & 2 deletions
@@ -1,12 +1,12 @@
-ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
+ARG ASCEND_VERSION=8.1.RC1.alpha001-910b-openeuler22.03-py3.10
 
 FROM ascendai/cann:$ASCEND_VERSION AS build
 
 WORKDIR /app
 
 COPY . .
 
-RUN yum install -y gcc g++ cmake make
+RUN yum install -y gcc g++ cmake make libcurl-devel
 ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
 ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
 ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}

.github/workflows/build.yml

Lines changed: 2 additions & 2 deletions
@@ -1771,7 +1771,7 @@ jobs:
     strategy:
       matrix:
         cann:
-          - '8.0.rc3.beta1-910b-openeuler22.03-py3.10'
+          - '8.1.RC1.alpha001-910b-openeuler22.03-py3.10'
         device:
           - 'ascend910b3'
         build:
@@ -1784,7 +1784,7 @@ jobs:
       - name: Dependencies
        run: |
          yum update -y
-         yum install -y git gcc gcc-c++ make cmake
+         yum install -y git gcc gcc-c++ make cmake libcurl-devel
 
      - name: Build
        run: |

.github/workflows/docker.yml

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ jobs:
           # Multi-stage build
           - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false}
           - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
-          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: true}
           - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
           - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
           # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete

README.md

Lines changed: 1 addition & 7 deletions
@@ -9,13 +9,6 @@
 
 Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
 
-> [!IMPORTANT]
-> New `llama.cpp` package location: [ggml-org/llama.cpp](https://github.com/ggml-org/llama.cpp/pkgs/container/llama.cpp)
->
-> Update your container URLs to: `ghcr.io/ggml-org/llama.cpp`
->
-> More info: https://github.com/ggml-org/llama.cpp/discussions/11801
-
 ## Recent API changes
 
 - [Changelog for `libllama` API](https://github.com/ggml-org/llama.cpp/issues/9289)
@@ -247,6 +240,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [Vulkan](docs/build.md#vulkan) | GPU |
 | [CANN](docs/build.md#cann) | Ascend NPU |
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
+| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/examples/rpc) | All |
 
 ## Building the project
 

common/arg.cpp

Lines changed: 2 additions & 0 deletions
@@ -163,6 +163,8 @@ struct common_hf_file_res {
 # if !defined(PATH_MAX)
 # define PATH_MAX MAX_PATH
 # endif
+#elif defined(_AIX)
+#include <sys/limits.h>
 #else
 #include <sys/syslimits.h>
 #endif
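
This hunk adds an AIX branch to the `PATH_MAX` include chain, pulling the constant from `<sys/limits.h>` there instead of `<sys/syslimits.h>`. As a rough illustration of what the macro is used for, here is a minimal, hypothetical sketch (not taken from arg.cpp) that sizes a `realpath(3)` buffer with `PATH_MAX`; the `resolve_path` helper is invented for the example and assumes a POSIX system.

```cpp
// Minimal, hypothetical sketch (not from arg.cpp): once PATH_MAX is available
// through an include chain like the one above, it can size the buffer that
// realpath(3) writes the resolved path into.
#include <climits>   // PATH_MAX on most Linux systems
#include <cstdio>
#include <cstdlib>   // realpath (POSIX)
#include <string>

static std::string resolve_path(const std::string & path) {
    char buf[PATH_MAX];
    if (realpath(path.c_str(), buf) == nullptr) {
        return path; // resolution failed; fall back to the input as-is
    }
    return std::string(buf);
}

int main() {
    std::printf("%s\n", resolve_path(".").c_str());
    return 0;
}
```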

convert_hf_to_gguf.py

Lines changed: 10 additions & 0 deletions
@@ -2459,6 +2459,16 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@Model.register("Qwen3ForCausalLM")
+class Qwen3Model(Qwen2Model):
+    model_arch = gguf.MODEL_ARCH.QWEN3
+
+
+@Model.register("Qwen3MoeForCausalLM")
+class Qwen3MoeModel(Qwen2MoeModel):
+    model_arch = gguf.MODEL_ARCH.QWEN3MOE
+
+
 @Model.register("GPT2LMHeadModel")
 class GPT2Model(Model):
     model_arch = gguf.MODEL_ARCH.GPT2

docs/backend/SYCL.md

Lines changed: 4 additions & 4 deletions
@@ -425,13 +425,13 @@
 - Use device 0:
 
 ```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
 ```
 
 - Use multiple devices:
 
 ```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
 ```
 
 *Notes:*
@@ -697,13 +697,13 @@ Examples:
 - Use device 0:
 
 ```
-build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
+build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
 ```
 
 - Use multiple devices:
 
 ```
-build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
+build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
 ```
 

examples/llava/clip.cpp

Lines changed: 19 additions & 5 deletions
@@ -331,7 +331,6 @@ struct clip_ctx {
     float image_std[3];
     bool use_gelu = false;
     bool use_silu = false;
-    int32_t ftype = 1;
 
     struct gguf_context * ctx_gguf = nullptr;
     struct ggml_context * ctx_data = nullptr;
@@ -380,6 +379,7 @@ struct clip_ctx {
         if (backend_cpu != backend) {
             ggml_backend_free(backend_cpu);
         }
+        clip_image_size_free(load_image_size);
     }
 };
 
@@ -1141,9 +1141,6 @@ struct clip_model_loader {
 
         // print gguf info
         {
-            int ftype = -1;
-            get_u32(KEY_FTYPE, ftype, false);
-            const std::string ftype_str = ggml_type_name(static_cast<ggml_type>(ftype));
             std::string name;
             get_string(KEY_NAME, name, false);
             std::string description;
@@ -1154,7 +1151,6 @@ struct clip_model_loader {
             LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx_gguf.get()));
             LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
             LOG_INF("%s: n_kv: %d\n", __func__, (int)gguf_get_n_kv(ctx_gguf.get()));
-            LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str());
             LOG_INF("\n");
         }
 
@@ -1618,6 +1614,12 @@ struct clip_image_f32 * clip_image_f32_init() {
     return new clip_image_f32();
 }
 
+void clip_image_size_free(struct clip_image_size * load_image_size) {
+    if (load_image_size == nullptr) {
+        return;
+    }
+    delete load_image_size;
+}
 void clip_image_u8_free(struct clip_image_u8 * img) { delete img; }
 void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
 void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) {
@@ -2270,6 +2272,9 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
 }
 
 void clip_free(clip_ctx * ctx) {
+    if (ctx == nullptr) {
+        return;
+    }
     delete ctx;
 }
 
@@ -2840,10 +2845,19 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
 bool clip_is_glm(const struct clip_ctx * ctx) {
     return ctx->has_glm_projector;
 }
+
 bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
     return ctx->has_qwen2vl_merger;
 }
 
+bool clip_is_llava(const struct clip_ctx * ctx) {
+    return ctx->has_llava_projector;
+}
+
+bool clip_is_gemma3(const struct clip_ctx * ctx) {
+    return ctx->proj_type == PROJECTOR_TYPE_GEMMA3;
+}
+
 // Determine the number of encoder layers to iterate over
 int get_deepest_feature_layer(const struct clip_ctx * ctx) {
     // Get the index of the second to last layer; this is the

examples/llava/clip.h

Lines changed: 3 additions & 0 deletions
@@ -77,6 +77,7 @@ CLIP_API struct clip_image_size * clip_image_size_init();
 CLIP_API struct clip_image_u8 * clip_image_u8_init ();
 CLIP_API struct clip_image_f32 * clip_image_f32_init();
 
+CLIP_API void clip_image_size_free (struct clip_image_size * img_size);
 CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
 CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
 CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
@@ -106,6 +107,8 @@ CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out
 CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
+CLIP_API bool clip_is_llava(const struct clip_ctx * ctx);
+CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx);
 
 CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
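
These declarations give callers a `clip_image_size_free()` that matches `clip_image_size_init()`, plus projector queries for LLaVA and Gemma 3. Below is a short hypothetical usage sketch based only on the declarations above; the `width`/`height` field names are assumed from `clip_image_size` in clip.h, and model loading is omitted.

```cpp
#include "clip.h"

// Hypothetical sketch based only on the declarations above; width/height
// are assumed field names of clip_image_size, and model loading is omitted.
static void image_size_demo() {
    struct clip_image_size * sz = clip_image_size_init();
    sz->width  = 336;
    sz->height = 336;

    // ... hand sz to the CLIP preprocessing path here ...

    clip_image_size_free(sz); // new API; a no-op when passed nullptr
    clip_free(nullptr);       // clip_free is also nullptr-safe after this commit
}
```

Together with the nullptr checks added in clip.cpp, both free paths are safe to call unconditionally during teardown.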

examples/server/server.cpp

Lines changed: 20 additions & 3 deletions
@@ -1705,6 +1705,8 @@ struct server_queue {
 };
 
 struct server_response {
+    bool running = true;
+
     // for keeping track of all tasks waiting for the result
     std::unordered_set<int> waiting_task_ids;
 
@@ -1759,6 +1761,10 @@ struct server_response {
         while (true) {
             std::unique_lock<std::mutex> lock(mutex_results);
             condition_results.wait(lock, [&]{
+                if (!running) {
+                    SRV_DBG("%s : queue result stop\n", __func__);
+                    std::terminate(); // we cannot return here since the caller is HTTP code
+                }
                 return !queue_results.empty();
             });
 
@@ -1789,6 +1795,10 @@ struct server_response {
         }
 
         std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout));
+        if (!running) {
+            SRV_DBG("%s : queue result stop\n", __func__);
+            std::terminate(); // we cannot return here since the caller is HTTP code
+        }
         if (cr_res == std::cv_status::timeout) {
             return nullptr;
         }
@@ -1818,6 +1828,12 @@ struct server_response {
             }
         }
     }
+
+    // terminate the waiting loop
+    void terminate() {
+        running = false;
+        condition_results.notify_all();
+    }
 };
 
 struct server_context {
@@ -4491,9 +4507,10 @@ int main(int argc, char ** argv) {
     svr->new_task_queue = [&params] { return new httplib::ThreadPool(params.n_threads_http); };
 
     // clean up function, to be called before exit
-    auto clean_up = [&svr]() {
+    auto clean_up = [&svr, &ctx_server]() {
         SRV_INF("%s: cleaning up before exit...\n", __func__);
         svr->stop();
+        ctx_server.queue_results.terminate();
         llama_backend_free();
     };
 
@@ -4534,7 +4551,7 @@ int main(int argc, char ** argv) {
 
     if (!ctx_server.load_model(params)) {
         clean_up();
-        // t.join(); // FIXME: see below
+        t.join();
         LOG_ERR("%s: exiting due to model loading error\n", __func__);
         return 1;
     }
@@ -4582,7 +4599,7 @@ int main(int argc, char ** argv) {
     ctx_server.queue_tasks.start_loop();
 
     clean_up();
-    // t.join(); // FIXME: http thread may stuck if there is an on-going request. we don't need to care about this for now as the HTTP connection will already be closed at this point, but it's better to fix this
+    t.join();
 
     return 0;
 }
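
The server change pairs a `running` flag with a new `server_response::terminate()` so threads blocked on `condition_results` are woken at shutdown, which is what makes restoring the `t.join()` calls safe. Below is a standalone sketch of that wake-up pattern, not the server's actual types; the `result_queue` and `stop_waiting` names are illustrative, and the sketch returns an empty optional where server.cpp deliberately calls `std::terminate()` because its caller sits inside HTTP handler code.

```cpp
// Standalone sketch of the shutdown pattern used above: a flag guarded by a
// mutex/condition_variable pair, and a stop method that wakes all waiters.
// Names (result_queue, stop_waiting) are illustrative, not from server.cpp.
#include <condition_variable>
#include <deque>
#include <mutex>
#include <optional>

struct result_queue {
    std::mutex              m;
    std::condition_variable cv;
    std::deque<int>         results;
    bool                    running = true;

    std::optional<int> recv() {
        std::unique_lock<std::mutex> lock(m);
        // wake up when either a result arrives or shutdown is requested
        cv.wait(lock, [&] { return !running || !results.empty(); });
        if (!running) {
            return std::nullopt;  // server.cpp calls std::terminate() here instead,
                                  // since its caller sits inside HTTP handler code
        }
        int res = results.front();
        results.pop_front();
        return res;
    }

    void stop_waiting() {  // counterpart of server_response::terminate()
        {
            std::lock_guard<std::mutex> lock(m);
            running = false;
        }
        cv.notify_all();
    }
};
```

The flag is flipped while holding the mutex before `notify_all()`, so a waiter cannot observe a stale predicate between the store and the notification.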
