@@ -4,30 +4,31 @@
 // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
 #include "clip.h"
 #include "ggml.h"
+#include "ggml-cpp.h"
 #include "ggml-cpu.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 #include "gguf.h"

-// #ifdef GGML_USE_CUDA
-// #include "ggml-cuda.h"
-// #endif
-//
-// #ifdef GGML_USE_SYCL
-// #include "ggml-sycl.h"
-// #endif
-//
-// #ifdef GGML_USE_METAL
-// #include "ggml-metal.h"
-// #endif
-//
-// #ifdef GGML_USE_CANN
-// #include "ggml-cann.h"
-// #endif
-//
-// #ifdef GGML_USE_VULKAN
-// #include "ggml-vulkan.h"
-// #endif
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_SYCL
+#include "ggml-sycl.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#ifdef GGML_USE_CANN
+#include "ggml-cann.h"
+#endif
+
+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif

 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"
@@ -600,18 +601,36 @@ struct clip_ctx {
     bool has_post_norm  = false;
     bool has_patch_bias = false;

-    struct gguf_context * ctx_gguf;
-    struct ggml_context * ctx_data;
+    struct gguf_context * ctx_gguf = nullptr;
+    struct ggml_context * ctx_data = nullptr;

     std::vector<uint8_t> buf_compute_meta;

-    // memory buffers to evaluate the model
-    ggml_backend_buffer_t params_buffer = NULL;
+    ggml_backend_t backend     = nullptr;
+    ggml_backend_t backend_cpu = nullptr;
+    ggml_backend_buffer_t buf  = nullptr;

-    ggml_backend_t backend       = NULL;
-    ggml_gallocr_t compute_alloc = NULL;
+    ggml_backend_sched_ptr sched;

     struct clip_image_size * load_image_size;
+
+    ~clip_ctx() {
+        if (ctx_data) {
+            ggml_free(ctx_data);
+        }
+        if (ctx_gguf) {
+            gguf_free(ctx_gguf);
+        }
+        if (buf) {
+            ggml_backend_buffer_free(buf);
+        }
+        if (backend) {
+            ggml_backend_free(backend);
+        }
+        if (backend_cpu) {
+            ggml_backend_free(backend_cpu);
+        }
+    }
 };

 static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
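With this change `clip_ctx` owns its resources: the scheduler is held in a `ggml_backend_sched_ptr` (from the newly included ggml-cpp.h) and everything else is released in the explicit destructor, which is why `clip_free()` further down in this diff collapses to a plain `delete ctx`. A minimal sketch of what such a smart-pointer alias looks like, assuming ggml-cpp.h follows the usual unique_ptr-with-custom-deleter pattern:

```cpp
// Sketch only: a unique_ptr alias whose deleter calls ggml_backend_sched_free.
// This is the pattern ggml-cpp.h is assumed to use for ggml_backend_sched_ptr.
#include <memory>
#include "ggml-backend.h"

struct ggml_backend_sched_deleter {
    void operator()(ggml_backend_sched_t sched) { ggml_backend_sched_free(sched); }
};
typedef std::unique_ptr<ggml_backend_sched, ggml_backend_sched_deleter> ggml_backend_sched_ptr;
```

Calling `sched.reset(ggml_backend_sched_new(...))` later in the patch then ties the scheduler's lifetime to the context.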
@@ -1184,6 +1203,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32

 // read and create ggml_context containing the tensors and their data
 struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+    return clip_init(fname, clip_context_params{
+        /* use_gpu */   true,
+        /* verbosity */ verbosity,
+    });
+}
+
+struct clip_ctx * clip_init(const char * fname, clip_context_params ctx_params) {
+    int verbosity = ctx_params.verbosity;
     struct ggml_context * meta = NULL;

     struct gguf_init_params params = {
@@ -1296,36 +1323,53 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
     }

-// #ifdef GGML_USE_CUDA
-//     new_clip->backend = ggml_backend_cuda_init(0);
-//     LOG_INF("%s: CLIP using CUDA backend\n", __func__);
-// #endif
-//
-// #ifdef GGML_USE_METAL
-//     new_clip->backend = ggml_backend_metal_init();
-//     LOG_INF("%s: CLIP using Metal backend\n", __func__);
-// #endif
-//
-// #ifdef GGML_USE_CANN
-//     new_clip->backend = ggml_backend_cann_init(0);
-//     LOG_INF("%s: CLIP using CANN backend\n", __func__);
-// #endif
-//
-// #ifdef GGML_USE_VULKAN
-//     new_clip->backend = ggml_backend_vk_init(0);
-//     LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
-// #endif
-//
-// #ifdef GGML_USE_SYCL
-//     new_clip->backend = ggml_backend_sycl_init(0);
-//     LOG_INF("%s: CLIP using SYCL backend\n", __func__);
-// #endif
+    std::vector<ggml_backend_buffer_type_t> backend_buft;
+    std::vector<ggml_backend_t> backend_ptrs;
+
+    new_clip->backend_cpu = ggml_backend_cpu_init();
+
+    if (ctx_params.use_gpu) {
+#ifdef GGML_USE_CUDA
+        new_clip->backend = ggml_backend_cuda_init(0);
+        LOG_INF("%s: CLIP using CUDA backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_METAL
+        new_clip->backend = ggml_backend_metal_init();
+        LOG_INF("%s: CLIP using Metal backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_CANN
+        new_clip->backend = ggml_backend_cann_init(0);
+        LOG_INF("%s: CLIP using CANN backend\n", __func__);
+#endif

-    if (!new_clip->backend) {
-        new_clip->backend = ggml_backend_cpu_init();
+#ifdef GGML_USE_VULKAN
+        new_clip->backend = ggml_backend_vk_init(0);
+        LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_SYCL
+        new_clip->backend = ggml_backend_sycl_init(0);
+        LOG_INF("%s: CLIP using SYCL backend\n", __func__);
+#endif
+    }
+
+    if (new_clip->backend) {
+        backend_ptrs.push_back(new_clip->backend);
+        backend_buft.push_back(ggml_backend_get_default_buffer_type(new_clip->backend));
+    } else {
+        new_clip->backend = new_clip->backend_cpu;
         LOG_INF("%s: CLIP using CPU backend\n", __func__);
     }

+    backend_ptrs.push_back(new_clip->backend_cpu);
+    backend_buft.push_back(ggml_backend_get_default_buffer_type(new_clip->backend_cpu));
+
+    new_clip->sched.reset(
+        ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false)
+    );
+
     // model size and capabilities
     {
         int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC);
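The scheduler created here replaces the old single-backend `ggml_gallocr_t` flow. The rest of the diff then follows the standard ggml scheduler lifecycle: reserve once at load time against a worst-case graph, then reset, allocate and compute on every call. A condensed sketch of that sequence (the ggml calls are real API; the wrapper function and its arguments are illustrative only):

```cpp
#include "ggml.h"
#include "ggml-backend.h"

// Illustrative helper showing the ggml_backend_sched call sequence this patch adopts.
static bool sched_run(ggml_backend_sched_t sched, ggml_cgraph * measure_gf, ggml_cgraph * gf) {
    // at load time: size the compute buffers for a worst-case graph
    if (!ggml_backend_sched_reserve(sched, measure_gf)) {
        return false;
    }
    // per inference: clear previous assignments, allocate the real graph, run it
    ggml_backend_sched_reset(sched);
    if (!ggml_backend_sched_alloc_graph(sched, gf)) {
        return false;
    }
    return ggml_backend_sched_graph_compute(sched, gf) == GGML_STATUS_SUCCESS;
}
```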
@@ -1421,7 +1465,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     }

     // alloc memory and offload data
-    new_clip->params_buffer = ggml_backend_alloc_ctx_tensors(new_clip->ctx_data, new_clip->backend);
+    ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(new_clip->backend);
+    new_clip->buf = ggml_backend_alloc_ctx_tensors_from_buft(new_clip->ctx_data, buft);
+    ggml_backend_buffer_set_usage(new_clip->buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
     for (int i = 0; i < n_tensors; ++i) {
         const char * name = gguf_get_tensor_name(ctx, i);
         struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
@@ -1434,7 +1480,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             return nullptr;
         }
         int num_bytes = ggml_nbytes(cur);
-        if (ggml_backend_buffer_is_host(new_clip->params_buffer)) {
+        if (ggml_backend_buft_is_host(buft)) {
             // for the CPU and Metal backend, we can read directly into the tensor
             fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
         } else {
@@ -1720,14 +1766,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     // measure mem requirement and allocate
     {
         new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
-        new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
         clip_image_f32_batch batch;
         batch.size = 1;
         batch.data = nullptr;
         ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
-        ggml_gallocr_reserve(new_clip->compute_alloc, gf);
-        size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
-        LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size / 1024.0 / 1024.0);
+        ggml_backend_sched_reserve(new_clip->sched.get(), gf);
+        for (size_t i = 0; i < backend_ptrs.size(); ++i) {
+            ggml_backend_t backend = backend_ptrs[i];
+            ggml_backend_buffer_type_t buft = backend_buft[i];
+            size_t size = ggml_backend_sched_get_buffer_size(new_clip->sched.get(), backend);
+            if (size > 1) {
+                LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
+                        ggml_backend_buft_name(buft),
+                        size / 1024.0 / 1024.0);
+            }
+        }
     }

     return new_clip;
@@ -2408,12 +2461,6 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
 }

 void clip_free(clip_ctx * ctx) {
-    ggml_free(ctx->ctx_data);
-    gguf_free(ctx->ctx_gguf);
-
-    ggml_backend_buffer_free(ctx->params_buffer);
-    ggml_backend_free(ctx->backend);
-    ggml_gallocr_free(ctx->compute_alloc);
     delete ctx;
 }

@@ -2609,8 +2656,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     }

     // build the inference graph
+    ggml_backend_sched_reset(ctx->sched.get());
     ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
-    ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
+    ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);

     // set inputs
     const auto & model = ctx->vision_model;
@@ -2775,11 +2823,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         }
     }

-    if (ggml_backend_is_cpu(ctx->backend)) {
-        ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
-    }
+    ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);

-    ggml_backend_graph_compute(ctx->backend, gf);
+    auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
+    if (status != GGML_STATUS_SUCCESS) {
+        LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status);
+        return false;
+    }

     // the last node is the embedding tensor
     struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
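For callers, the visible change is the new `clip_init()` entry point; `clip_model_load()` keeps its old behaviour by delegating to it with the GPU enabled. A hypothetical usage sketch (the `clip_context_params` field names are taken from the initializer earlier in this diff, and the model path is a placeholder):

```cpp
#include "clip.h"

int main() {
    clip_context_params params;
    params.use_gpu   = false; // stay on the CPU backend even if a GPU backend was compiled in
    params.verbosity = 1;

    // "mmproj.gguf" is a placeholder path for a CLIP/projector GGUF file
    clip_ctx * ctx = clip_init("mmproj.gguf", params);
    if (ctx == nullptr) {
        return 1;
    }

    // ... image preprocessing and clip_image_batch_encode() as before ...

    clip_free(ctx); // now just deletes the context; the destructor frees backends and buffers
    return 0;
}
```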