Commit e8b2610

Merge branch 'sf/deepseek-ocr' of github.com:sfallah/llama.cpp into sf/deepseek-ocr
2 parents: 2de3436 + 2aab52e

File tree: 2 files changed (+11 lines, -42 lines)

tools/mtmd/clip-impl.h

Lines changed: 1 addition & 4 deletions
@@ -141,10 +141,7 @@
 #define TN_SAM_FFN_UP   "sam.blk.%d.mlp.lin1.%s"
 #define TN_SAM_FFN_DOWN "sam.blk.%d.mlp.lin2.%s"
 #define TN_SAM_NECK     "sam.neck.%d.%s"
-#define TN_SAM_NET      "sam.net_%d.%s"
-
-
-#define TN_SAM_ATTN_OUT "sam.blk.%d.attn_out"
+#define TN_SAM_NET      "sam.net_%d.%s"

 // align x to upper multiple of n
 #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
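
For context, the TN_SAM_* defines touched above are printf-style templates that are expanded into concrete tensor names when the model file is loaded. A minimal sketch of that expansion, using a hypothetical tn() helper rather than the project's actual lookup code:

#include <cstdio>
#include <string>

#define TN_SAM_NET "sam.net_%d.%s"

// Hypothetical helper: expand a printf-style name template such as
// "sam.net_%d.%s" into a concrete tensor name like "sam.net_0.weight".
static std::string tn(const char * tmpl, int idx, const char * suffix) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), tmpl, idx, suffix);
    return std::string(buf);
}

int main() {
    // Prints "sam.net_2.bias" -- the name that would be looked up in the GGUF file.
    std::printf("%s\n", tn(TN_SAM_NET, 2, "bias").c_str());
    return 0;
}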

tools/mtmd/clip.cpp

Lines changed: 10 additions & 38 deletions
@@ -1558,48 +1558,20 @@ struct clip_graph {
         // add CLS token
         inp = ggml_concat(ctx0, inp, model.class_embedding, 1);

-        // The larger models use a different ViT, which uses RMS norm instead of layer norm
-        // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
-        norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45) ?
-            NORM_TYPE_RMS     // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B)
-            :
-            NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models)
+        //TODO : check norm type for dp-ocr-clip
+        norm_type norm_t = NORM_TYPE_NORMAL;

-        ggml_tensor * cur = build_vit(inp, n_pos, norm_t, hparams.ffn_op, model.position_embeddings,
-                                      nullptr); // shape [1024, 16, 16]
+        // for selecting learned pos embd, used by ViT
+        struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+        ggml_set_name(positions, "positions");
+        ggml_set_input(positions);
+        ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);

-        // remove CLS token
-        cur = ggml_view_2d(ctx0, cur, n_embd, n_patches, ggml_row_size(cur->type, n_embd), 0);

-        // pixel shuffle
-        {
-            const int scale_factor = model.hparams.n_merge;
-            const int bsz = 1; // batch size, always 1 for now since we don't support batching
-            const int height = n_patches_y;
-            const int width = n_patches_x;
-            GGML_ASSERT(scale_factor > 0);
-            cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz);
-            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-            cur = ggml_cont_4d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor,
-                               width / scale_factor, bsz);
-            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-            // flatten to 2D
-            cur = ggml_cont_2d(ctx0, cur, n_embd * scale_factor * scale_factor, cur->ne[1] * cur->ne[2]);
-        }
-
-        // projector (always using GELU activation)
-        {
-            // projector LayerNorm uses pytorch's default eps = 1e-5
-            // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
-            cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_1_b);
-            cur = ggml_gelu(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_3_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_3_b);
-        }
+        ggml_tensor * cur = build_vit(inp, n_pos, norm_t, hparams.ffn_op, learned_pos_embd,
+                                      nullptr); // shape [1024, 16, 16]

-        // build the graph
+        ggml_build_forward_expand(gf, cur);

         return cur;
     }
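
For context on the added lines: the "positions" tensor is declared as a graph input and consumed by ggml_get_rows() to gather one learned position embedding per token. A minimal sketch of how such an input is usually populated on the host side, assuming the common ggml pattern of filling input tensors after the graph's buffers have been allocated; the helper name set_positions_input and its variables are illustrative, not part of this commit:

#include <cstdint>
#include <vector>
#include "ggml.h"
#include "ggml-backend.h"

// Sketch: once the graph is built and allocated, the "positions" input created
// above is typically filled with the indices 0 .. n_pos-1, so ggml_get_rows()
// selects one row of model.position_embeddings per token (patches + CLS).
// Helper name and variables here are illustrative, not taken from this commit.
static void set_positions_input(ggml_cgraph * gf, int n_pos) {
    ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
    std::vector<int32_t> pos_ids(n_pos);
    for (int i = 0; i < n_pos; ++i) {
        pos_ids[i] = i; // sequential position ids
    }
    ggml_backend_tensor_set(positions, pos_ids.data(), 0, ggml_nbytes(positions));
}

This mirrors the usual pattern in mtmd's graph code, where each position id simply indexes a row of the learned position-embedding matrix.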
