
Commit 94564ac

Merge branch 'master' into xsn/llava2

2 parents: 96bf95e + d3bd719

File tree: 31 files changed (+1004 / −144 lines)


.devops/llama-cli-cann.Dockerfile

Lines changed: 2 additions & 2 deletions
@@ -1,12 +1,12 @@
-ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
+ARG ASCEND_VERSION=8.1.RC1.alpha001-910b-openeuler22.03-py3.10
 
 FROM ascendai/cann:$ASCEND_VERSION AS build
 
 WORKDIR /app
 
 COPY . .
 
-RUN yum install -y gcc g++ cmake make
+RUN yum install -y gcc g++ cmake make libcurl-devel
 ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
 ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
 ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}

.github/workflows/build.yml

Lines changed: 2 additions & 2 deletions
@@ -1771,7 +1771,7 @@ jobs:
     strategy:
       matrix:
         cann:
-          - '8.0.rc3.beta1-910b-openeuler22.03-py3.10'
+          - '8.1.RC1.alpha001-910b-openeuler22.03-py3.10'
         device:
           - 'ascend910b3'
         build:
@@ -1784,7 +1784,7 @@ jobs:
     - name: Dependencies
       run: |
         yum update -y
-        yum install -y git gcc gcc-c++ make cmake
+        yum install -y git gcc gcc-c++ make cmake libcurl-devel
 
     - name: Build
       run: |

.github/workflows/docker.yml

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ jobs:
           # Multi-stage build
           - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false}
           - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
-          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: true}
           - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
           - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
           # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete

README.md

Lines changed: 1 addition & 7 deletions
@@ -9,13 +9,6 @@
 
 Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
 
-> [!IMPORTANT]
-> New `llama.cpp` package location: [ggml-org/llama.cpp](https://github.com/ggml-org/llama.cpp/pkgs/container/llama.cpp)
->
-> Update your container URLs to: `ghcr.io/ggml-org/llama.cpp`
->
-> More info: https://github.com/ggml-org/llama.cpp/discussions/11801
-
 ## Recent API changes
 
 - [Changelog for `libllama` API](https://github.com/ggml-org/llama.cpp/issues/9289)
@@ -247,6 +240,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [Vulkan](docs/build.md#vulkan) | GPU |
 | [CANN](docs/build.md#cann) | Ascend NPU |
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
+| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/examples/rpc) | All |
 
 ## Building the project
 

convert_hf_to_gguf.py

Lines changed: 10 additions & 0 deletions
@@ -2459,6 +2459,16 @@ def prepare_tensors(self):
             raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@Model.register("Qwen3ForCausalLM")
+class Qwen3Model(Qwen2Model):
+    model_arch = gguf.MODEL_ARCH.QWEN3
+
+
+@Model.register("Qwen3MoeForCausalLM")
+class Qwen3MoeModel(Qwen2MoeModel):
+    model_arch = gguf.MODEL_ARCH.QWEN3MOE
+
+
 @Model.register("GPT2LMHeadModel")
 class GPT2Model(Model):
     model_arch = gguf.MODEL_ARCH.GPT2

docs/backend/SYCL.md

Lines changed: 4 additions & 4 deletions
@@ -425,13 +425,13 @@ Examples:
 - Use device 0:
 
 ```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
 ```
 
 - Use multiple devices:
 
 ```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
 ```
 
 *Notes:*
@@ -697,13 +697,13 @@ Examples:
 - Use device 0:
 
 ```
-build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
+build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
 ```
 
 - Use multiple devices:
 
 ```
-build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
+build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
 ```
 

examples/llava/clip.cpp

Lines changed: 10 additions & 5 deletions
@@ -314,7 +314,6 @@ struct clip_ctx {
     float image_std[3];
     bool use_gelu = false;
     bool use_silu = false;
-    int32_t ftype = 1;
 
     struct gguf_context * ctx_gguf = nullptr;
     struct ggml_context * ctx_data = nullptr;
@@ -363,6 +362,7 @@ struct clip_ctx {
         if (backend_cpu != backend) {
            ggml_backend_free(backend_cpu);
         }
+        clip_image_size_free(load_image_size);
     }
 };
 
@@ -1124,9 +1124,6 @@ struct clip_model_loader {
 
         // print gguf info
         {
-            int ftype = -1;
-            get_u32(KEY_FTYPE, ftype, false);
-            const std::string ftype_str = ggml_type_name(static_cast<ggml_type>(ftype));
             std::string name;
             get_string(KEY_NAME, name, false);
             std::string description;
@@ -1137,7 +1134,6 @@
             LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx_gguf.get()));
             LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
             LOG_INF("%s: n_kv: %d\n", __func__, (int)gguf_get_n_kv(ctx_gguf.get()));
-            LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str());
             LOG_INF("\n");
         }
 
@@ -1607,6 +1603,12 @@ unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx
     return img->buf.data();
 }
 
+void clip_image_size_free(struct clip_image_size * load_image_size) {
+    if (load_image_size == nullptr) {
+        return;
+    }
+    delete load_image_size;
+}
 void clip_image_u8_free(struct clip_image_u8 * img) { delete img; }
 void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
 void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) {
@@ -2259,6 +2261,9 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
 }
 
 void clip_free(clip_ctx * ctx) {
+    if (ctx == nullptr) {
+        return;
+    }
     delete ctx;
 }

examples/llava/clip.h

Lines changed: 1 addition & 0 deletions
@@ -80,6 +80,7 @@ CLIP_API struct clip_image_f32 * clip_image_f32_init();
 // nx, ny are the output image dimensions
 CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
 
+CLIP_API void clip_image_size_free (struct clip_image_size * img_size);
 CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
 CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
 CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
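Both clip changes follow the same null-safe cleanup convention: the new `clip_image_size_free` and the guarded `clip_free` simply return when handed a null pointer, so callers can free unconditionally. The standalone sketch below illustrates that idiom only; `image_size_t` and `image_size_free` are hypothetical stand-ins, not part of the clip API.

```cpp
// Minimal sketch of the null-safe free idiom used by clip_image_size_free()
// and the guarded clip_free() above. image_size_t is a hypothetical stand-in.
#include <cstdio>

struct image_size_t {
    int width;
    int height;
};

// Freeing a null pointer is a no-op, so callers never need to check first.
static void image_size_free(image_size_t * p) {
    if (p == nullptr) {
        return;
    }
    delete p;
}

int main() {
    image_size_t * sz = new image_size_t{448, 448};
    std::printf("size: %dx%d\n", sz->width, sz->height);
    image_size_free(sz);      // releases the object
    image_size_free(nullptr); // safe: explicitly a no-op
    return 0;
}
```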

examples/server/server.cpp

Lines changed: 20 additions & 3 deletions
@@ -1705,6 +1705,8 @@ struct server_queue {
 };
 
 struct server_response {
+    bool running = true;
+
     // for keeping track of all tasks waiting for the result
     std::unordered_set<int> waiting_task_ids;
 
@@ -1759,6 +1761,10 @@
         while (true) {
             std::unique_lock<std::mutex> lock(mutex_results);
             condition_results.wait(lock, [&]{
+                if (!running) {
+                    SRV_DBG("%s : queue result stop\n", __func__);
+                    std::terminate(); // we cannot return here since the caller is HTTP code
+                }
                 return !queue_results.empty();
             });
 
@@ -1789,6 +1795,10 @@
             }
 
             std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout));
+            if (!running) {
+                SRV_DBG("%s : queue result stop\n", __func__);
+                std::terminate(); // we cannot return here since the caller is HTTP code
+            }
             if (cr_res == std::cv_status::timeout) {
                 return nullptr;
             }
@@ -1818,6 +1828,12 @@
             }
         }
     }
+
+    // terminate the waiting loop
+    void terminate() {
+        running = false;
+        condition_results.notify_all();
+    }
 };
 
 struct server_context {
@@ -4491,9 +4507,10 @@ int main(int argc, char ** argv) {
     svr->new_task_queue = [&params] { return new httplib::ThreadPool(params.n_threads_http); };
 
     // clean up function, to be called before exit
-    auto clean_up = [&svr]() {
+    auto clean_up = [&svr, &ctx_server]() {
         SRV_INF("%s: cleaning up before exit...\n", __func__);
         svr->stop();
+        ctx_server.queue_results.terminate();
         llama_backend_free();
     };
 
@@ -4534,7 +4551,7 @@
 
     if (!ctx_server.load_model(params)) {
         clean_up();
-        // t.join(); // FIXME: see below
+        t.join();
         LOG_ERR("%s: exiting due to model loading error\n", __func__);
         return 1;
     }
@@ -4582,7 +4599,7 @@
     ctx_server.queue_tasks.start_loop();
 
     clean_up();
-    // t.join(); // FIXME: http thread may stuck if there is an ongoing request. we don't need to care about this for now as the HTTP connection will already be closed at this point, but it's better to fix this
+    t.join();
 
     return 0;
 }
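The shutdown path pairs a `running` flag with `condition_results.notify_all()`: `terminate()` flips the flag and wakes every thread blocked on the result queue, and the woken waiter calls `std::terminate()` because it cannot unwind back through the HTTP handler. With the queue able to stop, the `t.join()` calls that were previously commented out as a FIXME can be restored. Below is a minimal, self-contained sketch of the same stop-flag pattern; `result_queue` is a hypothetical stand-in for `server_response`, and it returns `false` on shutdown instead of terminating.

```cpp
// Sketch (assumed names, not the server code): a condition-variable wait that
// can be unblocked on shutdown by setting a flag and calling notify_all().
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <queue>
#include <thread>

struct result_queue {
    std::mutex              mtx;
    std::condition_variable cv;
    std::queue<int>         results;
    bool                    running = true;

    // Blocks until a result arrives or terminate() is called.
    // Returns false once the queue is stopped and drained.
    bool wait_pop(int & out) {
        std::unique_lock<std::mutex> lock(mtx);
        cv.wait(lock, [&] { return !running || !results.empty(); });
        if (!results.empty()) { // drain pending results even during shutdown
            out = results.front();
            results.pop();
            return true;
        }
        return false; // stopped: the real server calls std::terminate() here instead
    }

    void push(int v) {
        { std::lock_guard<std::mutex> lock(mtx); results.push(v); }
        cv.notify_all();
    }

    void terminate() {
        { std::lock_guard<std::mutex> lock(mtx); running = false; }
        cv.notify_all(); // wake every waiter so shutdown cannot hang
    }
};

int main() {
    result_queue q;
    std::thread waiter([&] {
        int r;
        while (q.wait_pop(r)) { std::printf("got result %d\n", r); }
        std::printf("queue stopped\n");
    });
    q.push(42);
    q.terminate(); // lets the waiter exit, so join() below cannot block forever
    waiter.join();
    return 0;
}
```

In the sketch the consumer exits cleanly; the server instead aborts via `std::terminate()` because, as the diff comment notes, the waiting call sits underneath the HTTP handler and has no safe value to return.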

examples/server/tests/unit/test_embedding.py

Lines changed: 20 additions & 0 deletions
@@ -49,6 +49,26 @@ def test_embedding_multiple():
         assert len(d['embedding']) > 1
 
 
+def test_embedding_multiple_with_fa():
+    server = ServerPreset.bert_bge_small_with_fa()
+    server.pooling = 'last'
+    server.start()
+    # one of these should trigger the FA branch (i.e. context size % 256 == 0)
+    res = server.make_request("POST", "/v1/embeddings", data={
+        "input": [
+            "a "*253,
+            "b "*254,
+            "c "*255,
+            "d "*256,
+        ],
+    })
+    assert res.status_code == 200
+    assert len(res.body['data']) == 4
+    for d in res.body['data']:
+        assert 'embedding' in d
+        assert len(d['embedding']) > 1
+
+
 @pytest.mark.parametrize(
     "input,is_multi_prompt",
     [
