1414
1515//  export clip_image_u8 to bmp file for debugging
1616//  https://codereview.stackexchange.com/questions/195121/writing-a-bitmap-image-from-c
// Forward declarations for the debug-only BMP exporter, which is defined later in this file.
// clip_image_u8 is declared here explicitly because it is the type the prototype refers to.
struct clip_image_size;
struct clip_image_u8;
static int bmp_export(const struct clip_image_u8 & img, const std::string & location);
1819#endif 
1920
2021struct  clip_image_size  {
@@ -53,21 +54,21 @@ struct clip_image_f32 {
5354using  clip_image_f32_batch = std::vector<clip_image_f32>;
5455using  clip_image_f8_batch  = std::vector<clip_image_u8>;
5556
56- static   int  clip_n_patches (const  clip_context & ctx) {
57+ int  clip_n_patches (const  clip_context & ctx) {
5758    auto  & hparams = ctx.model ->hparams ;
5859    int  n_patches = (hparams.image_size  / hparams.patch_size ) * (hparams.image_size  / hparams.patch_size );
5960    return  n_patches;
6061}
6162
62- static   int  clip_n_mmproj_embd (const  clip_context & ctx) {
63+ int  clip_n_mmproj_embd (const  clip_context & ctx) {
6364    if  (ctx.model ->hparams .proj_type  == CLIP_PROJECTOR_TYPE_MLP) {
6465        return  ctx.model ->mm_b_b ->ne [0 ];
6566    } else  {
6667        GGML_ASSERT (false  && " invalid proj type" 
6768    }
6869}
6970
70- static   int  clip_n_embd (const  clip_context & ctx) {
71+ int  clip_n_embd (const  clip_context & ctx) {
7172    return  clip_n_patches (ctx) * clip_n_mmproj_embd (ctx);
7273}
7374
@@ -323,7 +324,7 @@ static bool clip_image_preprocess(const clip_context & ctx, const clip_image_u8
323324
324325    const  int  nx = temp.nx ;
325326    const  int  ny = temp.ny ;
326-     //  clip_image_save_to_bmp(* temp, "resized_vanilla.bmp");
327+     //  bmp_export( temp, "resized_vanilla.bmp");
327328
328329    const  int  nx2 = params.image_size ;
329330    const  int  ny2 = params.image_size ;
@@ -451,11 +452,11 @@ static ggml_cgraph * clip_image_build_graph(clip_context & ctx, int batch_size,
451452        embeddings = ggml_norm (ctx0, embeddings, eps);
452453        ggml_set_name (embeddings, " pre_ln" 
453454
454-         embeddings = ggml_add (ctx0, ggml_mul (ctx0, embeddings, model.pre_norm_w ), model.pre_norm_w );
455+         embeddings = ggml_add (ctx0, ggml_mul (ctx0, embeddings, model.pre_norm_w ), model.pre_norm_b );
455456    }
456457
457458    //  loop over layers
458-     for  (int  il = 0 ; il < (int )hparams.n_layer  - 1 ; il++) {
459+     for  (int  il = 0 ; il < (int )hparams.n_layer  - 2 ; il++) {
459460        struct  ggml_tensor  * cur = embeddings;
460461
461462        //  layernorm1
@@ -537,6 +538,14 @@ static ggml_cgraph * clip_image_build_graph(clip_context & ctx, int batch_size,
537538        embeddings = cur;
538539    }
539540
541+     //  post-layernorm
542+     if  (model.post_norm_w ) {
543+         embeddings = ggml_norm (ctx0, embeddings, eps);
544+         ggml_set_name (embeddings, " post_ln" 
545+ 
546+         embeddings = ggml_add (ctx0, ggml_mul (ctx0, embeddings, model.post_norm_w ), model.post_norm_b );
547+     }
548+ 
540549    //  llava projector
541550    {
542551        embeddings = ggml_reshape_2d (ctx0, embeddings, embeddings->ne [0 ], embeddings->ne [1 ]);
@@ -673,6 +682,7 @@ static int32_t encode_image_with_clip(clip_context & ctx, const llama_img img, s
673682    clip_image_u8 img_u8 (img);
674683    clip_image_f32_batch img_res_v;
675684    auto  & hparams = ctx.model ->hparams ;
685+     //  bmp_export(img_u8, "test_inp.bmp");
676686
677687    if  (!clip_image_preprocess (ctx, img_u8, img_res_v)) {
678688        LLAMA_LOG_ERROR (" %s: unable to preprocess image\n " 
@@ -724,7 +734,6 @@ int32_t llama_vision_encode_internal(clip_context & ctx, llama_img_batch * batch
724734        //  copy output embeddings to result
725735        for  (int  k = 0 ; k < n_embd; k++) {
726736            ctx.output [n_embd*i + k] = output_single[k];
727-             //  if (k<10) printf("%f\n", output_single[k]);
728737        }
729738    }
730739
@@ -735,10 +744,19 @@ int32_t llama_vision_encode_internal(clip_context & ctx, llama_img_batch * batch
735744//  for debugging
736745#ifndef  NDEBUG
737746
738- static  int  bmp_export (const  clip_image_u8 &img, const  std::string &location) {
747+ static  int  bmp_export (const  struct   clip_image_u8  &img, const  std::string &location) {
739748    const  uint32_t  width = img.nx ;
740749    const  uint32_t  height = img.ny ;
741-     const  std::vector<uint8_t > &buffer = img.buf ;
750+     //  swap red and blue channel
751+     std::vector<uint8_t > buffer (width*height*3 );
752+     for  (uint32_t  y = 0 ; y < height; y++) {
753+         for  (uint32_t  x = 0 ; x < width; x++) {
754+             size_t  base = x*3  + y*3 *width;
755+             buffer[base+2 ] = img.buf [base];
756+             buffer[base+1 ] = img.buf [base+1 ];
757+             buffer[base]   = img.buf [base+2 ];
758+         }
759+     }
742760    const  bool  hasAlphaChannel = false ;
743761
744762    std::ofstream fout (location, std::ios::out | std::ios::binary);
0 commit comments