
Commit bf1d72b

clip : bring back GPU support
1 parent 2c9f833 commit bf1d72b

File tree

2 files changed: +127 −70 lines


examples/llava/clip.cpp

Lines changed: 118 additions & 68 deletions
@@ -4,30 +4,31 @@
 // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
 #include "clip.h"
 #include "ggml.h"
+#include "ggml-cpp.h"
 #include "ggml-cpu.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 #include "gguf.h"
 
-//#ifdef GGML_USE_CUDA
-//#include "ggml-cuda.h"
-//#endif
-//
-//#ifdef GGML_USE_SYCL
-//#include "ggml-sycl.h"
-//#endif
-//
-//#ifdef GGML_USE_METAL
-//#include "ggml-metal.h"
-//#endif
-//
-//#ifdef GGML_USE_CANN
-//#include "ggml-cann.h"
-//#endif
-//
-//#ifdef GGML_USE_VULKAN
-//#include "ggml-vulkan.h"
-//#endif
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_SYCL
+#include "ggml-sycl.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#ifdef GGML_USE_CANN
+#include "ggml-cann.h"
+#endif
+
+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
 
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"
@@ -600,18 +601,36 @@ struct clip_ctx {
     bool has_post_norm = false;
     bool has_patch_bias = false;
 
-    struct gguf_context * ctx_gguf;
-    struct ggml_context * ctx_data;
+    struct gguf_context * ctx_gguf = nullptr;
+    struct ggml_context * ctx_data = nullptr;
 
     std::vector<uint8_t> buf_compute_meta;
 
-    // memory buffers to evaluate the model
-    ggml_backend_buffer_t params_buffer = NULL;
+    ggml_backend_t backend = nullptr;
+    ggml_backend_t backend_cpu = nullptr;
+    ggml_backend_buffer_t buf = nullptr;
 
-    ggml_backend_t backend = NULL;
-    ggml_gallocr_t compute_alloc = NULL;
+    ggml_backend_sched_ptr sched;
 
     struct clip_image_size * load_image_size;
+
+    ~clip_ctx() {
+        if (ctx_data) {
+            ggml_free(ctx_data);
+        }
+        if (ctx_gguf) {
+            gguf_free(ctx_gguf);
+        }
+        if (buf) {
+            ggml_backend_buffer_free(buf);
+        }
+        if (backend) {
+            ggml_backend_free(backend);
+        }
+        if (backend_cpu) {
+            ggml_backend_free(backend_cpu);
+        }
+    }
 };
 
 static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
@@ -1184,6 +1203,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
 
 // read and create ggml_context containing the tensors and their data
 struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+    return clip_init(fname, clip_context_params{
+        /* use_gpu */   true,
+        /* verbosity */ verbosity,
+    });
+}
+
+struct clip_ctx * clip_init(const char * fname, clip_context_params ctx_params) {
+    int verbosity = ctx_params.verbosity;
     struct ggml_context * meta = NULL;
 
     struct gguf_init_params params = {
@@ -1296,36 +1323,53 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
     }
 
-//#ifdef GGML_USE_CUDA
-//    new_clip->backend = ggml_backend_cuda_init(0);
-//    LOG_INF("%s: CLIP using CUDA backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_METAL
-//    new_clip->backend = ggml_backend_metal_init();
-//    LOG_INF("%s: CLIP using Metal backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_CANN
-//    new_clip->backend = ggml_backend_cann_init(0);
-//    LOG_INF("%s: CLIP using CANN backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_VULKAN
-//    new_clip->backend = ggml_backend_vk_init(0);
-//    LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_SYCL
-//    new_clip->backend = ggml_backend_sycl_init(0);
-//    LOG_INF("%s: CLIP using SYCL backend\n", __func__);
-//#endif
+    std::vector<ggml_backend_buffer_type_t> backend_buft;
+    std::vector<ggml_backend_t> backend_ptrs;
+
+    new_clip->backend_cpu = ggml_backend_cpu_init();
+
+    if (ctx_params.use_gpu) {
+#ifdef GGML_USE_CUDA
+        new_clip->backend = ggml_backend_cuda_init(0);
+        LOG_INF("%s: CLIP using CUDA backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_METAL
+        new_clip->backend = ggml_backend_metal_init();
+        LOG_INF("%s: CLIP using Metal backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_CANN
+        new_clip->backend = ggml_backend_cann_init(0);
+        LOG_INF("%s: CLIP using CANN backend\n", __func__);
+#endif
 
-    if (!new_clip->backend) {
-        new_clip->backend = ggml_backend_cpu_init();
+#ifdef GGML_USE_VULKAN
+        new_clip->backend = ggml_backend_vk_init(0);
+        LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_SYCL
+        new_clip->backend = ggml_backend_sycl_init(0);
+        LOG_INF("%s: CLIP using SYCL backend\n", __func__);
+#endif
+    }
+
+    if (new_clip->backend) {
+        backend_ptrs.push_back(new_clip->backend);
+        backend_buft.push_back(ggml_backend_get_default_buffer_type(new_clip->backend));
+    } else {
+        new_clip->backend = new_clip->backend_cpu;
         LOG_INF("%s: CLIP using CPU backend\n", __func__);
     }
 
+    backend_ptrs.push_back(new_clip->backend_cpu);
+    backend_buft.push_back(ggml_backend_get_default_buffer_type(new_clip->backend_cpu));
+
+    new_clip->sched.reset(
+        ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false)
+    );
+
     // model size and capabilities
     {
         int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC);
@@ -1421,7 +1465,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     }
 
     // alloc memory and offload data
-    new_clip->params_buffer = ggml_backend_alloc_ctx_tensors(new_clip->ctx_data, new_clip->backend);
+    ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(new_clip->backend);
+    new_clip->buf = ggml_backend_alloc_ctx_tensors_from_buft(new_clip->ctx_data, buft);
+    ggml_backend_buffer_set_usage(new_clip->buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
     for (int i = 0; i < n_tensors; ++i) {
         const char * name = gguf_get_tensor_name(ctx, i);
         struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
@@ -1434,7 +1480,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             return nullptr;
         }
         int num_bytes = ggml_nbytes(cur);
-        if (ggml_backend_buffer_is_host(new_clip->params_buffer)) {
+        if (ggml_backend_buft_is_host(buft)) {
             // for the CPU and Metal backend, we can read directly into the tensor
             fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
         } else {
@@ -1720,14 +1766,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     // measure mem requirement and allocate
     {
         new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
-        new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
         clip_image_f32_batch batch;
         batch.size = 1;
         batch.data = nullptr;
         ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
-        ggml_gallocr_reserve(new_clip->compute_alloc, gf);
-        size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
-        LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
+        ggml_backend_sched_reserve(new_clip->sched.get(), gf);
+        for (size_t i = 0; i < backend_ptrs.size(); ++i) {
+            ggml_backend_t backend = backend_ptrs[i];
+            ggml_backend_buffer_type_t buft = backend_buft[i];
+            size_t size = ggml_backend_sched_get_buffer_size(new_clip->sched.get(), backend);
+            if (size > 1) {
+                LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
+                        ggml_backend_buft_name(buft),
+                        size / 1024.0 / 1024.0);
+            }
+        }
     }
 
     return new_clip;
@@ -2408,12 +2461,6 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
 }
 
 void clip_free(clip_ctx * ctx) {
-    ggml_free(ctx->ctx_data);
-    gguf_free(ctx->ctx_gguf);
-
-    ggml_backend_buffer_free(ctx->params_buffer);
-    ggml_backend_free(ctx->backend);
-    ggml_gallocr_free(ctx->compute_alloc);
     delete ctx;
 }
 
@@ -2609,8 +2656,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     }
 
     // build the inference graph
+    ggml_backend_sched_reset(ctx->sched.get());
     ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
-    ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
+    ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
 
     // set inputs
     const auto & model = ctx->vision_model;
@@ -2775,11 +2823,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         }
     }
 
-    if (ggml_backend_is_cpu(ctx->backend)) {
-        ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
-    }
+    ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
 
-    ggml_backend_graph_compute(ctx->backend, gf);
+    auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
+    if (status != GGML_STATUS_SUCCESS) {
+        LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status);
+        return false;
+    }
 
     // the last node is the embedding tensor
     struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
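
For context on how the pieces above fit together, here is a condensed sketch of the ggml_backend_sched lifecycle this commit adopts (reserve once at load time, then reset/alloc/compute per evaluation). It is illustrative only: it assumes a CPU-only ggml build, the `c = a + b` graph is a hypothetical stand-in for clip_image_build_graph, and the commit itself wraps the scheduler in ggml_backend_sched_ptr (from ggml-cpp.h) rather than freeing it manually.

```cpp
#include <cstdio>
#include <vector>

#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"

int main() {
    // backends to schedule over; the CPU backend always goes last as the fallback
    ggml_backend_t cpu = ggml_backend_cpu_init();
    std::vector<ggml_backend_t>             backends = { cpu };
    std::vector<ggml_backend_buffer_type_t> bufts    = { ggml_backend_get_default_buffer_type(cpu) };

    ggml_backend_sched_t sched = ggml_backend_sched_new(
        backends.data(), bufts.data(), (int) backends.size(), /*graph_size*/ 8192, /*parallel*/ false);

    // stand-in for clip_image_build_graph(): c = a + b
    struct ggml_init_params ip = {
        /*mem_size*/   ggml_tensor_overhead()*8 + ggml_graph_overhead(),
        /*mem_buffer*/ nullptr,
        /*no_alloc*/   true, // tensor data lives in backend buffers, not in this context
    };
    struct ggml_context * ctx = ggml_init(ip);
    struct ggml_tensor  * a   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor  * b   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor  * c   = ggml_add(ctx, a, b);
    struct ggml_cgraph  * gf  = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);

    // load time: reserve worst-case compute buffers and report their size
    ggml_backend_sched_reserve(sched, gf);
    printf("compute buffer: %zu bytes\n", ggml_backend_sched_get_buffer_size(sched, cpu));

    // per evaluation: reset, allocate the graph across the backends, set inputs, compute
    ggml_backend_sched_reset(sched);
    ggml_backend_sched_alloc_graph(sched, gf);
    const float av[4] = {1, 2, 3, 4}, bv[4] = {10, 20, 30, 40};
    ggml_backend_tensor_set(a, av, 0, sizeof(av));
    ggml_backend_tensor_set(b, bv, 0, sizeof(bv));
    if (ggml_backend_sched_graph_compute(sched, gf) != GGML_STATUS_SUCCESS) {
        return 1;
    }

    ggml_backend_sched_free(sched);
    ggml_free(ctx);
    ggml_backend_free(cpu);
    return 0;
}
```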

examples/llava/clip.h

Lines changed: 9 additions & 2 deletions
@@ -39,8 +39,15 @@ struct clip_image_f32_batch {
     size_t size;
 };
 
-CLIP_API struct clip_ctx * clip_model_load    (const char * fname, int verbosity);
-CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
+struct clip_context_params {
+    bool use_gpu;
+    int verbosity;
+};
+
+// deprecated, use clip_init
+CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
+
+CLIP_API struct clip_ctx * clip_init(const char * fname, clip_context_params ctx_params);
 
 CLIP_API void clip_free(struct clip_ctx * ctx);

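Usage note: a minimal sketch of calling the new clip_init entry point from client code. The GGUF path below is a hypothetical placeholder, and error handling is reduced to a null check.

```cpp
#include "clip.h"

int main() {
    // hypothetical path to a multimodal projector GGUF; substitute a real file
    const char * fname = "mmproj-model-f16.gguf";

    clip_context_params ctx_params;
    ctx_params.use_gpu   = true;  // false forces the CPU backend
    ctx_params.verbosity = 1;

    struct clip_ctx * ctx = clip_init(fname, ctx_params);
    if (ctx == nullptr) {
        return 1;
    }

    // ... encode images, query embedding sizes, etc. ...

    clip_free(ctx);
    return 0;
}
```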