@@ -179,9 +179,9 @@ class CLIPTokenizer {
179179
180180        auto  it = encoder.find (utf8_to_utf32 (" img</w>"  ));
181181        if  (it != encoder.end ()) {
182-             LOG_DEBUG ("   trigger word img already in vocab"  );
182+             LOG_DEBUG (" trigger word img already in vocab"  );
183183        } else  {
184-             LOG_DEBUG ("   trigger word img not in vocab yet"  );
184+             LOG_DEBUG (" trigger word img not in vocab yet"  );
185185        }
186186
187187        int  rank = 0 ;
@@ -488,14 +488,14 @@ struct CLIPLayer : public GGMLBlock {
488488        blocks[" mlp"  ] = std::shared_ptr<GGMLBlock>(new  CLIPMLP (d_model, intermediate_size));
489489    }
490490
491-     struct  ggml_tensor * forward (struct  ggml_context * ctx, struct  ggml_tensor * x, bool  mask = true ) {
491+     struct  ggml_tensor * forward (struct  ggml_context * ctx, ggml_backend_t  backend,  struct  ggml_tensor * x, bool  mask = true ) {
492492        //  x: [N, n_token, d_model]
493493        auto  self_attn   = std::dynamic_pointer_cast<MultiheadAttention>(blocks[" self_attn"  ]);
494494        auto  layer_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks[" layer_norm1"  ]);
495495        auto  layer_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks[" layer_norm2"  ]);
496496        auto  mlp         = std::dynamic_pointer_cast<CLIPMLP>(blocks[" mlp"  ]);
497497
498-         x = ggml_add (ctx, x, self_attn->forward (ctx, layer_norm1->forward (ctx, x), mask));
498+         x = ggml_add (ctx, x, self_attn->forward (ctx, backend,  layer_norm1->forward (ctx, x), mask));
499499        x = ggml_add (ctx, x, mlp->forward (ctx, layer_norm2->forward (ctx, x)));
500500        return  x;
501501    }
@@ -517,7 +517,11 @@ struct CLIPEncoder : public GGMLBlock {
517517        }
518518    }
519519
520-     struct  ggml_tensor * forward (struct  ggml_context * ctx, struct  ggml_tensor * x, int  clip_skip = -1 , bool  mask = true ) {
520+     struct  ggml_tensor * forward (struct  ggml_context * ctx,
521+                                 ggml_backend_t  backend,
522+                                 struct  ggml_tensor * x,
523+                                 int  clip_skip = -1 ,
524+                                 bool  mask     = true ) {
521525        //  x: [N, n_token, d_model]
522526        int  layer_idx = n_layer - 1 ;
523527        //  LOG_DEBUG("clip_skip %d", clip_skip);
@@ -532,7 +536,7 @@ struct CLIPEncoder : public GGMLBlock {
532536            }
533537            std::string name = " layers."   + std::to_string (i);
534538            auto  layer       = std::dynamic_pointer_cast<CLIPLayer>(blocks[name]);
535-             x                = layer->forward (ctx, x, mask);  //  [N, n_token, d_model]
539+             x                = layer->forward (ctx, backend,  x, mask);  //  [N, n_token, d_model]
536540            //  LOG_DEBUG("layer %d", i);
537541        }
538542        return  x;
@@ -718,6 +722,7 @@ class CLIPTextModel : public GGMLBlock {
718722    }
719723
720724    struct  ggml_tensor * forward (struct  ggml_context * ctx,
725+                                 ggml_backend_t  backend,
721726                                struct  ggml_tensor * input_ids,
722727                                struct  ggml_tensor * tkn_embeddings,
723728                                size_t  max_token_idx = 0 ,
@@ -728,7 +733,7 @@ class CLIPTextModel : public GGMLBlock {
728733        auto  final_layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks[" final_layer_norm"  ]);
729734
730735        auto  x = embeddings->forward (ctx, input_ids, tkn_embeddings);  //  [N, n_token, hidden_size]
731-         x      = encoder->forward (ctx, x, return_pooled ? -1  : clip_skip, true );
736+         x      = encoder->forward (ctx, backend,  x, return_pooled ? -1  : clip_skip, true );
732737        if  (return_pooled || with_final_ln) {
733738            x = final_layer_norm->forward (ctx, x);
734739        }
@@ -739,7 +744,7 @@ class CLIPTextModel : public GGMLBlock {
739744            if  (text_projection != NULL ) {
740745                pooled = ggml_nn_linear (ctx, pooled, text_projection, NULL );
741746            } else  {
742-                 LOG_DEBUG (" Missing text_projection matrix, assuming  identity... "  );
747+                 LOG_DEBUG (" identity projection "  );
743748            }
744749            return  pooled;  //  [hidden_size, 1, 1]
745750        }
@@ -780,7 +785,11 @@ class CLIPVisionModel : public GGMLBlock {
780785        blocks[" post_layernorm"  ] = std::shared_ptr<GGMLBlock>(new  LayerNorm (hidden_size));
781786    }
782787
783-     struct  ggml_tensor * forward (struct  ggml_context * ctx, struct  ggml_tensor * pixel_values, bool  return_pooled = true ) {
788+     struct  ggml_tensor * forward (struct  ggml_context * ctx,
789+                                 ggml_backend_t  backend,
790+                                 struct  ggml_tensor * pixel_values,
791+                                 bool  return_pooled = true ,
792+                                 int  clip_skip      = -1 ) {
784793        //  pixel_values: [N, num_channels, image_size, image_size]
785794        auto  embeddings     = std::dynamic_pointer_cast<CLIPVisionEmbeddings>(blocks[" embeddings"  ]);
786795        auto  pre_layernorm  = std::dynamic_pointer_cast<LayerNorm>(blocks[" pre_layernorm"  ]);
@@ -789,7 +798,7 @@ class CLIPVisionModel : public GGMLBlock {
789798
790799        auto  x = embeddings->forward (ctx, pixel_values);  //  [N, num_positions, embed_dim]
791800        x      = pre_layernorm->forward (ctx, x);
792-         x      = encoder->forward (ctx, x, - 1 , false );
801+         x      = encoder->forward (ctx, backend,  x, clip_skip , false );
793802        //  print_ggml_tensor(x, true, "ClipVisionModel x: ");
794803        auto  last_hidden_state = x;
795804        x                      = post_layernorm->forward (ctx, x);  //  [N, n_token, hidden_size]
@@ -857,29 +866,37 @@ class CLIPVisionModelProjection : public GGMLBlock {
857866        blocks[" visual_projection"  ] = std::shared_ptr<GGMLBlock>(new  CLIPProjection (hidden_size, projection_dim, transpose_proj_w));
858867    }
859868
860-     struct  ggml_tensor * forward (struct  ggml_context * ctx, struct  ggml_tensor * pixel_values) {
869+     struct  ggml_tensor * forward (struct  ggml_context * ctx,
870+                                 ggml_backend_t  backend,
871+                                 struct  ggml_tensor * pixel_values,
872+                                 bool  return_pooled = true ,
873+                                 int  clip_skip      = -1 ) {
861874        //  pixel_values: [N, num_channels, image_size, image_size]
862-         //  return: [N, projection_dim]
875+         //  return: [N, projection_dim] if return_pooled else [N, n_token, hidden_size] 
863876        auto  vision_model      = std::dynamic_pointer_cast<CLIPVisionModel>(blocks[" vision_model"  ]);
864877        auto  visual_projection = std::dynamic_pointer_cast<CLIPProjection>(blocks[" visual_projection"  ]);
865878
866-         auto  x = vision_model->forward (ctx, pixel_values);  //  [N, hidden_size]
867-         x      = visual_projection->forward (ctx, x);        //  [N, projection_dim]
879+         auto  x = vision_model->forward (ctx, backend, pixel_values, return_pooled, clip_skip);  //  [N, hidden_size] or [N, n_token, hidden_size]
868880
869-         return  x;  //  [N, projection_dim]
881+         if  (return_pooled) {
882+             x = visual_projection->forward (ctx, x);  //  [N, projection_dim]
883+         }
884+ 
885+         return  x;
870886    }
871887};
872888
873889struct  CLIPTextModelRunner  : public  GGMLRunner  {
874890    CLIPTextModel model;
875891
876892    CLIPTextModelRunner (ggml_backend_t  backend,
893+                         bool  offload_params_to_cpu,
877894                        const  String2GGMLType& tensor_types,
878895                        const  std::string prefix,
879896                        CLIPVersion version = OPENAI_CLIP_VIT_L_14,
880897                        bool  with_final_ln  = true ,
881898                        int  clip_skip_value = -1 )
882-         : GGMLRunner(backend), model(version, with_final_ln, clip_skip_value) {
899+         : GGMLRunner(backend, offload_params_to_cpu ), model(version, with_final_ln, clip_skip_value) {
883900        model.init (params_ctx, tensor_types, prefix);
884901    }
885902
@@ -896,6 +913,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
896913    }
897914
898915    struct  ggml_tensor * forward (struct  ggml_context * ctx,
916+                                 ggml_backend_t  backend,
899917                                struct  ggml_tensor * input_ids,
900918                                struct  ggml_tensor * embeddings,
901919                                size_t  max_token_idx = 0 ,
@@ -907,7 +925,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
907925            input_ids = ggml_reshape_2d (ctx, input_ids, model.n_token , input_ids->ne [0 ] / model.n_token );
908926        }
909927
910-         return  model.forward (ctx, input_ids, embeddings, max_token_idx, return_pooled);
928+         return  model.forward (ctx, backend,  input_ids, embeddings, max_token_idx, return_pooled);
911929    }
912930
913931    struct  ggml_cgraph * build_graph (struct  ggml_tensor * input_ids,
@@ -933,7 +951,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
933951            embeddings = ggml_concat (compute_ctx, token_embed_weight, custom_embeddings, 1 );
934952        }
935953
936-         struct  ggml_tensor * hidden_states = forward (compute_ctx, input_ids, embeddings, max_token_idx, return_pooled);
954+         struct  ggml_tensor * hidden_states = forward (compute_ctx, runtime_backend,  input_ids, embeddings, max_token_idx, return_pooled);
937955
938956        ggml_build_forward_expand (gf, hidden_states);
939957
0 commit comments