 // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
 #include "clip.h"
 #include "ggml.h"
+#include "ggml-cpp.h"
 #include "ggml-cpu.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 #include "gguf.h"
 
-// #ifdef GGML_USE_CUDA
-// #include "ggml-cuda.h"
-// #endif
-//
-// #ifdef GGML_USE_SYCL
-// #include "ggml-sycl.h"
-// #endif
-//
-// #ifdef GGML_USE_METAL
-// #include "ggml-metal.h"
-// #endif
-//
-// #ifdef GGML_USE_CANN
-// #include "ggml-cann.h"
-// #endif
-//
-// #ifdef GGML_USE_VULKAN
-// #include "ggml-vulkan.h"
-// #endif
-
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"
 
@@ -600,18 +581,54 @@ struct clip_ctx {
     bool has_post_norm = false;
     bool has_patch_bias = false;
 
-    struct gguf_context * ctx_gguf;
-    struct ggml_context * ctx_data;
+    struct gguf_context * ctx_gguf = nullptr;
+    struct ggml_context * ctx_data = nullptr;
 
     std::vector<uint8_t> buf_compute_meta;
 
-    // memory buffers to evaluate the model
-    ggml_backend_buffer_t params_buffer = NULL;
+    std::vector<ggml_backend_t> backend_ptrs;
+    std::vector<ggml_backend_buffer_type_t> backend_buft;
+
+    ggml_backend_t backend     = nullptr;
+    ggml_backend_t backend_cpu = nullptr;
+    ggml_backend_buffer_t buf  = nullptr;
 
-    ggml_backend_t backend       = NULL;
-    ggml_gallocr_t compute_alloc = NULL;
+    ggml_backend_sched_ptr sched;
 
     struct clip_image_size * load_image_size;
+
+    clip_ctx(clip_context_params & ctx_params) {
+        backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
+        backend     = ctx_params.use_gpu
+                          ? ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr)
+                          : nullptr;
+
+        if (backend) {
+            LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend));
+            backend_ptrs.push_back(backend);
+            backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
+        } else {
+            backend = backend_cpu;
+            LOG_INF("%s: CLIP using CPU backend\n", __func__);
+        }
+
+        backend_ptrs.push_back(backend_cpu);
+        backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
+
+        sched.reset(
+            ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false)
+        );
+    }
+
+    ~clip_ctx() {
+        ggml_free(ctx_data);
+        gguf_free(ctx_gguf);
+        ggml_backend_buffer_free(buf);
+        ggml_backend_free(backend);
+        if (backend_cpu != backend) {
+            ggml_backend_free(backend_cpu);
+        }
+    }
 };
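// Illustrative sketch (not part of the patch): the per-graph pattern that the sched
// member above enables, mirroring how it is used further down in this file. All calls
// are the existing ggml backend-sched API; clip_image_build_graph() is this file's own
// helper.
//
//   ggml_backend_sched_reset(ctx->sched.get());              // drop the previous graph's allocations
//   ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
//   ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);    // split the graph across backends and allocate
//   ggml_backend_sched_graph_compute(ctx->sched.get(), gf);  // run, with the CPU backend as fallback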
 
 static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
@@ -1184,6 +1201,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
 
 // read and create ggml_context containing the tensors and their data
 struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+    return clip_init(fname, clip_context_params{
+        /* use_gpu */   true,
+        /* verbosity */ verbosity,
+    });
+}
+
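// Usage sketch (illustrative, not part of the patch): how a caller might use the new
// clip_init() entry point to keep the model on the CPU backend. Field names follow the
// designated-initializer comments in clip_model_load() above; the model path is a placeholder.
//
//   struct clip_context_params params;
//   params.use_gpu   = false;
//   params.verbosity = 1;
//   struct clip_ctx * ctx = clip_init("mmproj.gguf", params);
//   if (ctx) {
//       // ... clip_image_batch_encode(), etc. ...
//       clip_free(ctx);  // the destructor now releases backends and buffers
//   }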
+struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) {
+    int verbosity = ctx_params.verbosity;
     struct ggml_context * meta = NULL;
 
     struct gguf_init_params params = {
@@ -1277,7 +1302,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
     }
 
-    clip_ctx * new_clip = new clip_ctx{};
+    clip_ctx * new_clip = new clip_ctx(ctx_params);
 
     // update projector type
     {
@@ -1296,36 +1321,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
     }
 
-    // #ifdef GGML_USE_CUDA
-    //     new_clip->backend = ggml_backend_cuda_init(0);
-    //     LOG_INF("%s: CLIP using CUDA backend\n", __func__);
-    // #endif
-    //
-    // #ifdef GGML_USE_METAL
-    //     new_clip->backend = ggml_backend_metal_init();
-    //     LOG_INF("%s: CLIP using Metal backend\n", __func__);
-    // #endif
-    //
-    // #ifdef GGML_USE_CANN
-    //     new_clip->backend = ggml_backend_cann_init(0);
-    //     LOG_INF("%s: CLIP using CANN backend\n", __func__);
-    // #endif
-    //
-    // #ifdef GGML_USE_VULKAN
-    //     new_clip->backend = ggml_backend_vk_init(0);
-    //     LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
-    // #endif
-    //
-    // #ifdef GGML_USE_SYCL
-    //     new_clip->backend = ggml_backend_sycl_init(0);
-    //     LOG_INF("%s: CLIP using SYCL backend\n", __func__);
-    // #endif
-
-    if (!new_clip->backend) {
-        new_clip->backend = ggml_backend_cpu_init();
-        LOG_INF("%s: CLIP using CPU backend\n", __func__);
-    }
-
     // model size and capabilities
     {
         int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC);
@@ -1421,7 +1416,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     }
 
     // alloc memory and offload data
-    new_clip->params_buffer = ggml_backend_alloc_ctx_tensors(new_clip->ctx_data, new_clip->backend);
+    ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(new_clip->backend);
+    new_clip->buf = ggml_backend_alloc_ctx_tensors_from_buft(new_clip->ctx_data, buft);
+    ggml_backend_buffer_set_usage(new_clip->buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
     for (int i = 0; i < n_tensors; ++i) {
         const char * name = gguf_get_tensor_name(ctx, i);
         struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
@@ -1434,7 +1431,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             return nullptr;
         }
         int num_bytes = ggml_nbytes(cur);
-        if (ggml_backend_buffer_is_host(new_clip->params_buffer)) {
+        if (ggml_backend_buft_is_host(buft)) {
             // for the CPU and Metal backend, we can read directly into the tensor
             fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
         } else {
@@ -1720,14 +1717,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     // measure mem requirement and allocate
     {
         new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
-        new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
         clip_image_f32_batch batch;
         batch.size = 1;
         batch.data = nullptr;
         ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
-        ggml_gallocr_reserve(new_clip->compute_alloc, gf);
-        size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
-        LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size/1024.0/1024.0);
+        ggml_backend_sched_reserve(new_clip->sched.get(), gf);
+        for (size_t i = 0; i < new_clip->backend_ptrs.size(); ++i) {
+            ggml_backend_t backend = new_clip->backend_ptrs[i];
+            ggml_backend_buffer_type_t buft = new_clip->backend_buft[i];
+            size_t size = ggml_backend_sched_get_buffer_size(new_clip->sched.get(), backend);
+            if (size > 1) {
+                LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
+                        ggml_backend_buft_name(buft),
+                        size / 1024.0 / 1024.0);
+            }
+        }
     }
 
     return new_clip;
@@ -2408,12 +2412,6 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
 }
 
 void clip_free(clip_ctx * ctx) {
-    ggml_free(ctx->ctx_data);
-    gguf_free(ctx->ctx_gguf);
-
-    ggml_backend_buffer_free(ctx->params_buffer);
-    ggml_backend_free(ctx->backend);
-    ggml_gallocr_free(ctx->compute_alloc);
     delete ctx;
 }
 
@@ -2609,8 +2607,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     }
 
     // build the inference graph
+    ggml_backend_sched_reset(ctx->sched.get());
     ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
-    ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
+    ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
 
     // set inputs
     const auto & model = ctx->vision_model;
@@ -2775,11 +2774,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         }
     }
 
-    if (ggml_backend_is_cpu(ctx->backend)) {
-        ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
-    }
+    ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
 
-    ggml_backend_graph_compute(ctx->backend, gf);
+    auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
+    if (status != GGML_STATUS_SUCCESS) {
+        LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status);
+        return false;
+    }
 
     // the last node is the embedding tensor
     struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);