 // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
 #include "clip.h"
 #include "ggml.h"
+#include "ggml-cpp.h"
 #include "ggml-cpu.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 #include "gguf.h"

-// #ifdef GGML_USE_CUDA
-// #include "ggml-cuda.h"
-// #endif
-//
-// #ifdef GGML_USE_SYCL
-// #include "ggml-sycl.h"
-// #endif
-//
-// #ifdef GGML_USE_METAL
-// #include "ggml-metal.h"
-// #endif
-//
-// #ifdef GGML_USE_CANN
-// #include "ggml-cann.h"
-// #endif
-//
-// #ifdef GGML_USE_VULKAN
-// #include "ggml-vulkan.h"
-// #endif
-
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"

@@ -600,18 +581,54 @@ struct clip_ctx {
     bool has_post_norm = false;
     bool has_patch_bias = false;

-    struct gguf_context * ctx_gguf;
-    struct ggml_context * ctx_data;
+    struct gguf_context * ctx_gguf = nullptr;
+    struct ggml_context * ctx_data = nullptr;

     std::vector<uint8_t> buf_compute_meta;

-    // memory buffers to evaluate the model
-    ggml_backend_buffer_t params_buffer = NULL;
+    std::vector<ggml_backend_t> backend_ptrs;
+    std::vector<ggml_backend_buffer_type_t> backend_buft;
+
+    ggml_backend_t backend     = nullptr;
+    ggml_backend_t backend_cpu = nullptr;
+    ggml_backend_buffer_t buf  = nullptr;

-    ggml_backend_t backend       = NULL;
-    ggml_gallocr_t compute_alloc = NULL;
+    ggml_backend_sched_ptr sched;

     struct clip_image_size * load_image_size;
+
+    clip_ctx(clip_context_params & ctx_params) {
+        backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
+        backend     = ctx_params.use_gpu
+                        ? ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr)
+                        : nullptr;
+
+        if (backend) {
+            LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend));
+            backend_ptrs.push_back(backend);
+            backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
+        } else {
+            backend = backend_cpu;
+            LOG_INF("%s: CLIP using CPU backend\n", __func__);
+        }
+
+        backend_ptrs.push_back(backend_cpu);
+        backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
+
+        sched.reset(
+            ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false)
+        );
+    }
+
+    ~clip_ctx() {
+        ggml_free(ctx_data);
+        gguf_free(ctx_gguf);
+        ggml_backend_buffer_free(buf);
+        ggml_backend_free(backend);
+        if (backend_cpu != backend) {
+            ggml_backend_free(backend_cpu);
+        }
+    }
 };

 static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
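The new `sched` member is why `ggml-cpp.h` is now included: that header provides RAII `std::unique_ptr` aliases for ggml handles, so the scheduler frees itself when `clip_ctx` is destroyed and only the raw C handles need explicit cleanup in the destructor. A minimal sketch of what such an alias looks like, assuming the usual ggml-cpp.h pattern (the exact spelling in the real header may differ):

    #include <memory>

    // Assumed shape of the ggml-cpp.h wrapper: a unique_ptr whose deleter
    // forwards to the matching ggml C free function.
    struct ggml_backend_sched_deleter {
        void operator()(ggml_backend_sched_t sched) { ggml_backend_sched_free(sched); }
    };
    typedef std::unique_ptr<ggml_backend_sched, ggml_backend_sched_deleter> ggml_backend_sched_ptr;
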
@@ -1184,6 +1201,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32

 // read and create ggml_context containing the tensors and their data
 struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+    return clip_init(fname, clip_context_params{
+        /* use_gpu */   true,
+        /* verbosity */ verbosity,
+    });
+}
+
+struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) {
+    int verbosity = ctx_params.verbosity;
     struct ggml_context * meta = NULL;

     struct gguf_init_params params = {
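Existing callers of clip_model_load keep the old GPU-by-default behavior, while new callers can opt out explicitly via clip_init. A hypothetical call site (the model path is illustrative only):

    // Hypothetical caller: load the projector on CPU only.
    clip_context_params params;
    params.use_gpu   = false;
    params.verbosity = 1;

    clip_ctx * ctx = clip_init("mmproj-model-f16.gguf", params);
    if (ctx == nullptr) {
        // handle load failure
    }
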
@@ -1277,7 +1302,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
     }

-    clip_ctx * new_clip = new clip_ctx{};
+    clip_ctx * new_clip = new clip_ctx(ctx_params);

     // update projector type
     {
@@ -1296,36 +1321,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
     }

-// #ifdef GGML_USE_CUDA
-//     new_clip->backend = ggml_backend_cuda_init(0);
-//     LOG_INF("%s: CLIP using CUDA backend\n", __func__);
-// #endif
-//
-// #ifdef GGML_USE_METAL
-//     new_clip->backend = ggml_backend_metal_init();
-//     LOG_INF("%s: CLIP using Metal backend\n", __func__);
-// #endif
-//
-// #ifdef GGML_USE_CANN
-//     new_clip->backend = ggml_backend_cann_init(0);
-//     LOG_INF("%s: CLIP using CANN backend\n", __func__);
-// #endif
-//
-// #ifdef GGML_USE_VULKAN
-//     new_clip->backend = ggml_backend_vk_init(0);
-//     LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
-// #endif
-//
-// #ifdef GGML_USE_SYCL
-//     new_clip->backend = ggml_backend_sycl_init(0);
-//     LOG_INF("%s: CLIP using SYCL backend\n", __func__);
-// #endif
-
-    if (!new_clip->backend) {
-        new_clip->backend = ggml_backend_cpu_init();
-        LOG_INF("%s: CLIP using CPU backend\n", __func__);
-    }
-
     // model size and capabilities
     {
         int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC);
@@ -1421,7 +1416,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }

         // alloc memory and offload data
-        new_clip->params_buffer = ggml_backend_alloc_ctx_tensors(new_clip->ctx_data, new_clip->backend);
+        ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(new_clip->backend);
+        new_clip->buf = ggml_backend_alloc_ctx_tensors_from_buft(new_clip->ctx_data, buft);
+        ggml_backend_buffer_set_usage(new_clip->buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
         for (int i = 0; i < n_tensors; ++i) {
             const char * name = gguf_get_tensor_name(ctx, i);
             struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
@@ -1434,7 +1431,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
                 return nullptr;
             }
             int num_bytes = ggml_nbytes(cur);
-            if (ggml_backend_buffer_is_host(new_clip->params_buffer)) {
+            if (ggml_backend_buft_is_host(buft)) {
                 // for the CPU and Metal backend, we can read directly into the tensor
                 fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
             } else {
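The body of the else branch falls outside this hunk's context. Presumably it follows the standard ggml upload pattern for non-host buffers, staging through a temporary host buffer and copying to the device; a sketch under that assumption (read_buf taken to be a reusable std::vector<uint8_t>):

    // Presumed continuation of the else branch (not shown in this hunk):
    // stage the tensor data in host memory, then upload to the device buffer.
    read_buf.resize(num_bytes);
    fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
    ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
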
@@ -1720,14 +1717,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     // measure mem requirement and allocate
     {
         new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
-        new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
         clip_image_f32_batch batch;
         batch.size = 1;
         batch.data = nullptr;
         ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
-        ggml_gallocr_reserve(new_clip->compute_alloc, gf);
-        size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
-        LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size / 1024.0 / 1024.0);
+        ggml_backend_sched_reserve(new_clip->sched.get(), gf);
+        for (size_t i = 0; i < new_clip->backend_ptrs.size(); ++i) {
+            ggml_backend_t backend = new_clip->backend_ptrs[i];
+            ggml_backend_buffer_type_t buft = new_clip->backend_buft[i];
+            size_t size = ggml_backend_sched_get_buffer_size(new_clip->sched.get(), backend);
+            if (size > 1) {
+                LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
+                        ggml_backend_buft_name(buft),
+                        size / 1024.0 / 1024.0);
+            }
+        }
     }

     return new_clip;
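Taken together with the encode-path changes further down, the scheduler lifecycle this commit adopts is: reserve once at load time against a worst-case graph, then reset, allocate, and compute per inference. In outline (names as used in this diff):

    // Scheduler lifecycle across this commit, in outline.
    ggml_backend_sched_reserve(sched, worst_case_graph);   // once, at model load

    ggml_backend_sched_reset(sched);                       // per encode call
    ggml_backend_sched_alloc_graph(sched, gf);
    ggml_backend_sched_graph_compute(sched, gf);
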
@@ -2408,12 +2412,6 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
 }

 void clip_free(clip_ctx * ctx) {
-    ggml_free(ctx->ctx_data);
-    gguf_free(ctx->ctx_gguf);
-
-    ggml_backend_buffer_free(ctx->params_buffer);
-    ggml_backend_free(ctx->backend);
-    ggml_gallocr_free(ctx->compute_alloc);
     delete ctx;
 }
@@ -2609,8 +2607,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     }

     // build the inference graph
+    ggml_backend_sched_reset(ctx->sched.get());
     ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
-    ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
+    ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);

     // set inputs
     const auto & model = ctx->vision_model;
@@ -2775,11 +2774,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         }
     }

-    if (ggml_backend_is_cpu(ctx->backend)) {
-        ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
-    }
+    ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);

-    ggml_backend_graph_compute(ctx->backend, gf);
+    auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
+    if (status != GGML_STATUS_SUCCESS) {
+        LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status);
+        return false;
+    }

     // the last node is the embedding tensor
     struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
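Since the compute path can now fail gracefully instead of aborting inside ggml, callers should check the bool result of clip_image_batch_encode. A hypothetical call site (sizing the output via clip_n_patches / clip_n_mmproj_embd is illustrative):

    // Hypothetical call site: the output vector is only valid on success.
    std::vector<float> emb((size_t) clip_n_patches(ctx) * clip_n_mmproj_embd(ctx));
    if (!clip_image_batch_encode(ctx, n_threads, &batch, emb.data())) {
        LOG_ERR("%s: failed to encode image batch\n", __func__);
        return false;
    }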