2828#include  < cinttypes> 
2929#include  < limits> 
3030
31- #if  defined(LLAVA_LOG_OFF)
32- #   define  LOG_INF (...)
33- #   define  LOG_WRN (...)
34- #   define  LOG_ERR (...)
35- #   define  LOG_DBG (...)
36- #else  //  defined(LLAVA_LOG_OFF)
37- #   define  LOG_INF (...) do  { fprintf (stdout, __VA_ARGS__); } while  (0 )
38- #   define  LOG_WRN (...) do  { fprintf (stderr, __VA_ARGS__); } while  (0 )
39- #   define  LOG_ERR (...) do  { fprintf (stderr, __VA_ARGS__); } while  (0 )
40- #   define  LOG_DBG (...) do  { fprintf (stdout, __VA_ARGS__); } while  (0 )
41- #endif  //  defined(LLAVA_LOG_OFF)
31+ struct  clip_logger_state  g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL };
4232
4333// #define CLIP_DEBUG_FUNCTIONS
4434
@@ -1121,7 +1111,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
11211111}
11221112
11231113struct  clip_model_loader  {
1124-     clip_log_level verbosity;
11251114    ggml_context_ptr ctx_meta;
11261115    gguf_context_ptr ctx_gguf;
11271116
@@ -1131,7 +1120,7 @@ struct clip_model_loader {
11311120    size_t  model_size; //  in bytes
11321121
11331122    //  TODO @ngxson : we should not pass clip_ctx here, it should be clip_vision_model
1134-     clip_model_loader (const  char  * fname, struct   clip_context_params  ctx_params,  clip_ctx & ctx_clip) : verbosity(ctx_params.verbosity),  ctx_clip(ctx_clip), fname(fname) {
1123+     clip_model_loader (const  char  * fname, clip_ctx & ctx_clip) : ctx_clip(ctx_clip), fname(fname) {
11351124        struct  ggml_context  * meta = NULL ;
11361125
11371126        struct  gguf_init_params  params = {
@@ -1149,7 +1138,7 @@ struct clip_model_loader {
11491138        const  int  n_tensors = gguf_get_n_tensors (ctx_gguf.get ());
11501139
11511140        //  print gguf info
1152-         if  (verbosity >= CLIP_LOG_ERROR)  {
1141+         {
11531142            int  ftype = -1 ;
11541143            get_u32 (KEY_FTYPE, ftype, false );
11551144            const  std::string ftype_str = ggml_type_name (static_cast <ggml_type>(ftype));
@@ -1176,10 +1165,8 @@ struct clip_model_loader {
11761165                struct  ggml_tensor  * cur = ggml_get_tensor (meta, name);
11771166                size_t  tensor_size = ggml_nbytes (cur);
11781167                model_size += tensor_size;
1179-                 if  (verbosity >= CLIP_LOG_DEBUG) {
1180-                     LOG_INF (" %s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%"   PRIu64 " , %"   PRIu64 " , %"   PRIu64 " , %"   PRIu64 " ], type = %s\n "  ,
1181-                         __func__, i, ggml_n_dims (cur), cur->name , tensor_size, offset, cur->ne [0 ], cur->ne [1 ], cur->ne [2 ], cur->ne [3 ], ggml_type_name (type));
1182-                 }
1168+                 LOG_DBG (" %s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%"   PRIu64 " , %"   PRIu64 " , %"   PRIu64 " , %"   PRIu64 " ], type = %s\n "  ,
1169+                     __func__, i, ggml_n_dims (cur), cur->name , tensor_size, offset, cur->ne [0 ], cur->ne [1 ], cur->ne [2 ], cur->ne [3 ], ggml_type_name (type));
11831170            }
11841171        }
11851172    }
@@ -1262,16 +1249,14 @@ struct clip_model_loader {
12621249            //  Calculate the deepest feature layer based on hparams and projector type
12631250            ctx_clip.max_feature_layer  = get_deepest_feature_layer (&ctx_clip);
12641251
1265-             if  (verbosity >= CLIP_LOG_ERROR) {
1266-                 LOG_INF (" %s: text_encoder:       %d\n "  , __func__, ctx_clip.has_text_encoder );
1267-                 LOG_INF (" %s: vision_encoder:     %d\n "  , __func__, ctx_clip.has_vision_encoder );
1268-                 LOG_INF (" %s: llava_projector:    %d\n "  , __func__, ctx_clip.has_llava_projector );
1269-                 LOG_INF (" %s: minicpmv_projector: %d\n "  , __func__, ctx_clip.has_minicpmv_projector );
1270-                 LOG_INF (" %s: minicpmv_version:   %d\n "  , __func__, ctx_clip.minicpmv_version );
1271-                 LOG_INF (" %s: glm_projector:      %d\n "  , __func__, ctx_clip.has_glm_projector );
1272-                 LOG_INF (" %s: model size:         %.2f MiB\n "  , __func__, model_size / 1024.0  / 1024.0 );
1273-                 LOG_INF (" %s: metadata size:      %.2f MiB\n "  , __func__, ggml_get_mem_size (ctx_meta.get ()) / 1024.0  / 1024.0 );
1274-             }
1252+             LOG_INF (" %s: text_encoder:       %d\n "  , __func__, ctx_clip.has_text_encoder );
1253+             LOG_INF (" %s: vision_encoder:     %d\n "  , __func__, ctx_clip.has_vision_encoder );
1254+             LOG_INF (" %s: llava_projector:    %d\n "  , __func__, ctx_clip.has_llava_projector );
1255+             LOG_INF (" %s: minicpmv_projector: %d\n "  , __func__, ctx_clip.has_minicpmv_projector );
1256+             LOG_INF (" %s: minicpmv_version:   %d\n "  , __func__, ctx_clip.minicpmv_version );
1257+             LOG_INF (" %s: glm_projector:      %d\n "  , __func__, ctx_clip.has_glm_projector );
1258+             LOG_INF (" %s: model size:         %.2f MiB\n "  , __func__, model_size / 1024.0  / 1024.0 );
1259+             LOG_INF (" %s: metadata size:      %.2f MiB\n "  , __func__, ggml_get_mem_size (ctx_meta.get ()) / 1024.0  / 1024.0 );
12751260        }
12761261    }
12771262
@@ -1495,9 +1480,7 @@ struct clip_model_loader {
14951480            }
14961481            fin.close ();
14971482
1498-             if  (verbosity >= CLIP_LOG_INFO) {
1499-                 LOG_INF (" %s: loaded %zu tensors from %s\n "  , __func__, tensors_to_load.size (), fname.c_str ());
1500-             }
1483+             LOG_DBG (" %s: loaded %zu tensors from %s\n "  , __func__, tensors_to_load.size (), fname.c_str ());
15011484        }
15021485    }
15031486
@@ -1581,18 +1564,19 @@ struct clip_model_loader {
15811564};
15821565
15831566//  read and create ggml_context containing the tensors and their data
1584- struct  clip_ctx  * clip_model_load (const  char  * fname, const  int  verbosity =  1 ) {
1567+ struct  clip_ctx  * clip_model_load (const  char  * fname, const  int  verbosity) {
15851568    return  clip_init (fname, clip_context_params{
15861569        /*  use_gpu */     true ,
1587-         /*  verbosity */   static_cast <clip_log_level >(verbosity),
1570+         /*  verbosity */   static_cast <ggml_log_level >(verbosity),
15881571    });
15891572}
15901573
15911574struct  clip_ctx  * clip_init (const  char  * fname, struct  clip_context_params  ctx_params) {
1575+     g_logger_state.verbosity_thold  = ctx_params.verbosity ;
15921576    clip_ctx * ctx_clip = new  clip_ctx (ctx_params);
15931577
15941578    try  {
1595-         clip_model_loader loader (fname, ctx_params,  *ctx_clip);
1579+         clip_model_loader loader (fname, *ctx_clip);
15961580        loader.load_hparams ();
15971581        loader.load_tensors ();
15981582        loader.alloc_compute_meta ();
@@ -1974,7 +1958,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
19741958    const  int  multiple = fmin (ceil (ratio), max_slice_nums);
19751959
19761960    std::vector<std::vector<clip_image_u8 *>> images;
1977-     LOG_INF (" %s: multiple %d\n "  , __func__, multiple);
1961+     LOG_DBG (" %s: multiple %d\n "  , __func__, multiple);
19781962    images.push_back (std::vector<clip_image_u8 *>());
19791963
19801964    if  (multiple <= 1 ) {
@@ -1989,17 +1973,17 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
19891973        clip_image_u8 * source_image = clip_image_u8_init ();
19901974        bicubic_resize (*img, *source_image, best_size.first , best_size.second );
19911975        //  source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
1992-         LOG_INF (" %s: image_size: %d %d; source_image size: %d %d\n "  , __func__, img->nx , img->ny , best_size.first , best_size.second );
1976+         LOG_DBG (" %s: image_size: %d %d; source_image size: %d %d\n "  , __func__, img->nx , img->ny , best_size.first , best_size.second );
19931977        images[images.size ()-1 ].push_back (source_image);
19941978
19951979        std::pair<int , int > best_grid = uhd_best_grid (max_slice_nums, multiple, log_ratio);
1996-         LOG_INF (" %s: image_size: %d %d; best_grid: %d %d\n "  , __func__, img->nx , img->ny , best_grid.first , best_grid.second );
1980+         LOG_DBG (" %s: image_size: %d %d; best_grid: %d %d\n "  , __func__, img->nx , img->ny , best_grid.first , best_grid.second );
19971981
19981982        auto  refine_size = uhd_get_refine_size (original_size, best_grid, scale_resolution, patch_size, true );
19991983        clip_image_u8 * refine_image = clip_image_u8_init ();
20001984        bicubic_resize (*img, *refine_image, refine_size.first , refine_size.second );
20011985
2002-         LOG_INF (" %s: refine_image_size: %d %d; refine_size: %d %d\n "  , __func__, refine_image->nx , refine_image->ny , refine_size.first , refine_size.second );
1986+         LOG_DBG (" %s: refine_image_size: %d %d; refine_size: %d %d\n "  , __func__, refine_image->nx , refine_image->ny , refine_size.first , refine_size.second );
20031987
20041988        //  split_to_patches
20051989        int  width = refine_image->nx ;
@@ -2107,7 +2091,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
21072091
21082092    bool  pad_to_square = true ;
21092093    if  (!ctx->has_vision_encoder ) {
2110-         LOG_ERR (" This gguf file seems to have no vision encoder\n "  );
2094+         LOG_ERR (" %s:  This gguf file seems to have no vision encoder\n " , __func__ );
21112095        return  false ;
21122096    }
21132097    auto  & params = ctx->vision_model .hparams ;
@@ -2444,7 +2428,7 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
24442428
24452429bool  clip_image_encode (struct  clip_ctx  * ctx, const  int  n_threads, clip_image_f32 * img, float  * vec) {
24462430    if  (!ctx->has_vision_encoder ) {
2447-         LOG_ERR (" This gguf file seems to have no vision encoder\n "  );
2431+         LOG_ERR (" %s:  This gguf file seems to have no vision encoder\n " , __func__ );
24482432        return  false ;
24492433    }
24502434
@@ -2456,7 +2440,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
24562440
24572441bool  clip_image_batch_encode (clip_ctx * ctx, const  int  n_threads, const  clip_image_f32_batch * imgs, float  * vec) {
24582442    if  (!ctx->has_vision_encoder ) {
2459-         LOG_ERR (" This gguf file seems to have no vision encoder\n "  );
2443+         LOG_ERR (" %s:  This gguf file seems to have no vision encoder\n " , __func__ );
24602444        return  false ;
24612445    }
24622446
@@ -2673,7 +2657,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
26732657
26742658    auto  * ctx_clip = clip_init (fname_inp, clip_context_params{
26752659        /*  use_gpu */     false ,
2676-         /*  verbosity */   CLIP_LOG_ERROR ,
2660+         /*  verbosity */   GGML_LOG_LEVEL_ERROR ,
26772661    });
26782662
26792663    const  auto  & ctx_src = ctx_clip->ctx_gguf ;
@@ -2751,7 +2735,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
27512735                f32_data = (float  *)conv_buf.data ();
27522736                break ;
27532737            default :
2754-                 LOG_ERR (" Please use an input file in f32 or f16\n "  );
2738+                 LOG_ERR (" %s:  Please use an input file in f32 or f16\n " , __func__ );
27552739                gguf_free (ctx_out);
27562740                return  false ;
27572741            }
0 commit comments