@@ -1558,48 +1558,20 @@ struct clip_graph {
         // add CLS token
         inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
 
-        // The larger models use a different ViT, which uses RMS norm instead of layer norm
-        // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
-        norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45) ?
-            NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B)
-            :
-            NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models)
+        // TODO: check norm type for dp-ocr-clip
+        norm_type norm_t = NORM_TYPE_NORMAL;
 
-        ggml_tensor * cur = build_vit(inp, n_pos, norm_t, hparams.ffn_op, model.position_embeddings,
-            nullptr); // shape [1024, 16, 16]
+        // for selecting learned pos embd, used by ViT
+        struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+        ggml_set_name(positions, "positions");
+        ggml_set_input(positions);
+        ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
 
-        // remove CLS token
-        cur = ggml_view_2d(ctx0, cur, n_embd, n_patches, ggml_row_size(cur->type, n_embd), 0);
 
-        // pixel shuffle
-        {
-            const int scale_factor = model.hparams.n_merge;
-            const int bsz = 1; // batch size, always 1 for now since we don't support batching
-            const int height = n_patches_y;
-            const int width = n_patches_x;
-            GGML_ASSERT(scale_factor > 0);
-            cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz);
-            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-            cur = ggml_cont_4d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor,
-                width / scale_factor, bsz);
-            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-            // flatten to 2D
-            cur = ggml_cont_2d(ctx0, cur, n_embd * scale_factor * scale_factor, cur->ne[1] * cur->ne[2]);
-        }
-
-        // projector (always using GELU activation)
-        {
-            // projector LayerNorm uses pytorch's default eps = 1e-5
-            // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
-            cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_1_b);
-            cur = ggml_gelu(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_3_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_3_b);
-        }
+        ggml_tensor * cur = build_vit(inp, n_pos, norm_t, hparams.ffn_op, learned_pos_embd,
+            nullptr); // shape [1024, 16, 16]
 
-        // build the graph
+        ggml_build_forward_expand(gf, cur);
 
         return cur;
     }
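For reference, the new positional-embedding handling amounts to gathering rows of the learned position-embedding table with `ggml_get_rows`, driven by a `positions` input tensor that is filled in at eval time. A minimal standalone sketch of that pattern follows; the helper name `select_learned_pos_embd` and the parameter `pos_embd_table` are illustrative and not part of this change:

```cpp
#include "ggml.h"

// Sketch only: gather learned positional embeddings by index, mirroring the
// pattern introduced above. Assumes an existing ggml context and a position
// embedding table of shape [n_embd, n_pos_max].
static ggml_tensor * select_learned_pos_embd(ggml_context * ctx0,
                                             ggml_tensor  * pos_embd_table, // hypothetical name
                                             int64_t        n_pos) {
    // 1D I32 tensor with one index per patch position; marked as a graph input
    // so the actual indices are supplied when the graph is evaluated
    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    // gather the rows of the table addressed by `positions`; result shape: [n_embd, n_pos]
    return ggml_get_rows(ctx0, pos_embd_table, positions);
}
```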