@@ -1558,48 +1558,20 @@ struct clip_graph {
         // add CLS token
         inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
 
-        // The larger models use a different ViT, which uses RMS norm instead of layer norm
-        // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
-        norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45) ?
-            NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B)
-            :
-            NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models)
+        // TODO: check norm type for dp-ocr-clip
+        norm_type norm_t = NORM_TYPE_NORMAL;
 
-        ggml_tensor * cur = build_vit(inp, n_pos, norm_t, hparams.ffn_op, model.position_embeddings,
-            nullptr); // shape [1024, 16, 16]
+        // for selecting learned pos embd, used by ViT
+        struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+        ggml_set_name(positions, "positions");
+        ggml_set_input(positions);
+        ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
 
-        // remove CLS token
-        cur = ggml_view_2d(ctx0, cur, n_embd, n_patches, ggml_row_size(cur->type, n_embd), 0);
 
-        // pixel shuffle
-        {
-            const int scale_factor = model.hparams.n_merge;
-            const int bsz = 1; // batch size, always 1 for now since we don't support batching
-            const int height = n_patches_y;
-            const int width = n_patches_x;
-            GGML_ASSERT(scale_factor > 0);
-            cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz);
-            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-            cur = ggml_cont_4d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor,
-                width / scale_factor, bsz);
-            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-            // flatten to 2D
-            cur = ggml_cont_2d(ctx0, cur, n_embd * scale_factor * scale_factor, cur->ne[1] * cur->ne[2]);
-        }
-
-        // projector (always using GELU activation)
-        {
-            // projector LayerNorm uses pytorch's default eps = 1e-5
-            // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
-            cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_1_b);
-            cur = ggml_gelu(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_3_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_3_b);
-        }
+        ggml_tensor * cur = build_vit(inp, n_pos, norm_t, hparams.ffn_op, learned_pos_embd,
+            nullptr); // shape [1024, 16, 16]
 
-        // build the graph
+        ggml_build_forward_expand(gf, cur);
 
         return cur;
     }
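For reference, the new positional-embedding handling amounts to gathering rows of the learned position-embedding table with `ggml_get_rows`, driven by a `positions` input tensor that is filled in at eval time. A minimal standalone sketch of that pattern follows; the helper name `select_learned_pos_embd` and the parameter `pos_embd_table` are illustrative and not part of this change:

```cpp
#include "ggml.h"

// Sketch only: gather learned positional embeddings by index, mirroring the
// pattern introduced above. Assumes an existing ggml context and a position
// embedding table of shape [n_embd, n_pos_max].
static ggml_tensor * select_learned_pos_embd(ggml_context * ctx0,
                                             ggml_tensor  * pos_embd_table, // hypothetical name
                                             int64_t        n_pos) {
    // 1D I32 tensor with one index per patch position; marked as a graph input
    // so the actual indices are supplied when the graph is evaluated
    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    // gather the rows of the table addressed by `positions`; result shape: [n_embd, n_pos]
    return ggml_get_rows(ctx0, pos_embd_table, positions);
}
```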