
Commit a65ddf5

Merge branch 'sf/deepseek-ocr' of github.com:sfallah/llama.cpp into sf/deepseek-ocr
2 parents: 6c0715b + 331cea8

File tree: 3 files changed (+20 −8 lines)

    gguf-py/gguf/constants.py
    tools/mtmd/clip-impl.h
    tools/mtmd/clip.cpp

gguf-py/gguf/constants.py

Lines changed: 3 additions & 3 deletions
@@ -290,7 +290,7 @@ class Projector:
     class SAM:
         BLOCK_COUNT      = "clip.vision.sam.block_count"
         EMBEDDING_LENGTH = "clip.vision.sam.embedding_length"
-
+
     class ClipAudio:
         NUM_MEL_BINS     = "clip.audio.num_mel_bins"
         EMBEDDING_LENGTH = "clip.audio.embedding_length"
@@ -1064,8 +1064,8 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_SAM_NECK:       "v.sam.neck.{bid}",
     MODEL_TENSOR.V_SAM_NET_2:      "v.sam.net_2",
     MODEL_TENSOR.V_SAM_NET_3:      "v.sam.net_3",
-    MODEL_TENSOR.V_ENC_EMBD_IMGNL: "v.image_newline_embd",  # Deepseek-OCR
-    MODEL_TENSOR.V_ENC_EMBD_VSEP:  "v.view_separator_embd", # Deepseek-OCR
+    MODEL_TENSOR.V_ENC_EMBD_IMGNL: "model.image_newline",   # Deepseek-OCR
+    MODEL_TENSOR.V_ENC_EMBD_VSEP:  "model.view_seperator",  # Deepseek-OCR
     # audio (mtmd)
     MODEL_TENSOR.A_ENC_EMBD_POS:   "a.position_embd",
     MODEL_TENSOR.A_ENC_CONV1D:     "a.conv1d.{bid}",

tools/mtmd/clip-impl.h

Lines changed: 1 addition & 0 deletions
@@ -86,6 +86,7 @@
 #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
 #define TN_MVLM_PROJ_PEG   "mm.model.peg.%d.%s"
 #define TN_IMAGE_NEWLINE   "model.image_newline"
+#define TN_IMAGE_SEPERATOR "model.view_seperator"
 #define TN_MM_INP_NORM     "mm.input_norm.weight"
 #define TN_MM_INP_NORM_B   "mm.input_norm.bias"
 #define TN_MM_INP_PROJ     "mm.input_projection.weight" // gemma3
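
Note that the two string literals ("model.image_newline", "model.view_seperator") must stay byte-identical to the names written by gguf-py in the constants.py hunk above, otherwise the loader cannot find the tensors. A minimal, hypothetical compile-time check illustrating that coupling; it is not part of the commit, and only the two defines are taken from the diff:

// Hypothetical consistency check (requires C++17); it only demonstrates that the
// C++ defines and the gguf-py tensor names must match character for character.
#include <string_view>

#define TN_IMAGE_NEWLINE   "model.image_newline"
#define TN_IMAGE_SEPERATOR "model.view_seperator"

// The right-hand strings mirror gguf-py/gguf/constants.py after this commit.
static_assert(std::string_view(TN_IMAGE_NEWLINE)   == "model.image_newline",  "gguf name drifted");
static_assert(std::string_view(TN_IMAGE_SEPERATOR) == "model.view_seperator", "gguf name drifted");

int main() { return 0; }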

tools/mtmd/clip.cpp

Lines changed: 16 additions & 5 deletions
@@ -835,14 +835,23 @@ struct clip_graph {
         ggml_tensor * global_features_2 = build_dp_ocr_clip(inp_raw, global_features_1);

         // torch global_features = torch.cat((global_features_2[:, 1:], global_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
-        ggml_tensor * global_features = ggml_concat(ctx0, global_features_1, global_features_2, 0);
+        global_features_1 = ggml_permute(ctx0, global_features_1, 2, 1, 0, 3);
+        global_features_1 = ggml_cont(ctx0, global_features_1);
+        global_features_1 = ggml_reshape_2d(ctx0, global_features_1, n_embd, n_patches);
+        // remove CLS token
+        global_features_2 = ggml_view_2d(ctx0, global_features_2,
+                n_embd, n_patches,
+                ggml_row_size(global_features_2->type, n_embd), 0);
+
+        ggml_tensor * global_features = ggml_concat(ctx0, global_features_2, global_features_1, 1);
         global_features = build_global_local_features(
             ctx0,
             global_features,
             n_patches_y,
             n_patches_x,
             n_embd
         );
+        ggml_build_forward_expand(gf, global_features);

         return gf;
     }
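
For readers unfamiliar with ggml's reversed dimension order, the standalone sketch below reproduces the new view-then-concat step with toy sizes (n_embd = 8 and n_patches = 4 are made up). It is not code from the commit; it only shows why the concatenated tensor ends up with 2 * n_patches rows, which the assertion in the next hunk checks.

// Standalone shape sketch (assumption: toy sizes, plain CPU ggml context).
// It mirrors the commit's ggml_view_2d + ggml_concat combination and prints the
// resulting ne[] values; no graph is computed.
#include "ggml.h"
#include <cstdio>

int main() {
    const int64_t n_embd = 8, n_patches = 4;   // made-up values

    ggml_init_params params = { 16u * 1024 * 1024, nullptr, false };
    ggml_context * ctx0 = ggml_init(params);

    // local features, already shaped [n_embd, n_patches]
    ggml_tensor * global_features_1 = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_patches);
    // ViT output with one extra (CLS) token: [n_embd, n_patches + 1]
    ggml_tensor * global_features_2 = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_patches + 1);

    // the view keeps n_patches rows of global_features_2 (row stride = one embedding)
    ggml_tensor * g2 = ggml_view_2d(ctx0, global_features_2,
            n_embd, n_patches,
            ggml_row_size(global_features_2->type, n_embd), 0);

    // concat along dim 1 (the token axis): result is [n_embd, 2 * n_patches]
    ggml_tensor * cat = ggml_concat(ctx0, g2, global_features_1, 1);

    printf("cat: ne0 = %lld, ne1 = %lld\n", (long long) cat->ne[0], (long long) cat->ne[1]);

    ggml_free(ctx0);
    return 0;
}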
@@ -858,8 +867,8 @@ struct clip_graph {
             int n_dim) {
         GGML_ASSERT(model.image_newline != nullptr);
         GGML_ASSERT(model.view_seperator != nullptr);
-        GGML_ASSERT(global_features->ne[0] == (int64_t) n_dim);
-        GGML_ASSERT(global_features->ne[1] == (int64_t) (h * w));
+        GGML_ASSERT(global_features->ne[0] == static_cast<int64_t>(n_dim));
+        GGML_ASSERT(global_features->ne[1] == static_cast<int64_t>(2 * (h * w)));

         // 1) global_features: [n_dim, h*w] -> [n_dim, w, h] -> [h, w, n_dim]
         ggml_tensor * t = ggml_reshape_3d(ctx0, global_features, n_dim, w, h); // (n_dim, w, h)
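
The trailing context comment describes the layout change build_global_local_features applies to its input. A minimal sketch of that reshape-and-permute in isolation, assuming toy sizes and assuming the permutation is the usual ggml_permute(..., 2, 1, 0, 3) pattern; the commit does not show the function body here, so the axes are inferred from the comment, not copied from the source:

// Isolated layout sketch (assumptions: toy sizes; permute axes inferred from the
// "[n_dim, h*w] -> [n_dim, w, h] -> [h, w, n_dim]" comment above).
#include "ggml.h"
#include <cstdio>

int main() {
    const int64_t n_dim = 8, w = 3, h = 2;     // made-up values

    ggml_init_params params = { 16u * 1024 * 1024, nullptr, false };
    ggml_context * ctx0 = ggml_init(params);

    // start from [n_dim, h*w], as in the comment (factor 2 omitted for brevity)
    ggml_tensor * feats = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_dim, h * w);

    ggml_tensor * t = ggml_reshape_3d(ctx0, feats, n_dim, w, h);   // (n_dim, w, h)
    t = ggml_cont(ctx0, ggml_permute(ctx0, t, 2, 1, 0, 3));        // (h, w, n_dim)

    printf("t: ne = [%lld, %lld, %lld]\n",
           (long long) t->ne[0], (long long) t->ne[1], (long long) t->ne[2]);

    ggml_free(ctx0);
    return 0;
}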
@@ -1552,8 +1561,7 @@ struct clip_graph {

         // for selecting learned pos embd, used by ViT
         struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
-        ggml_set_name(positions, "positions");
-        ggml_set_input(positions);
+        cb(positions, "positions", -1);
         ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);

@@ -3607,6 +3615,9 @@ struct clip_model_loader {
                     model.net_2 = get_tensor(string_format(TN_SAM_NET, 2, "weight"));
                     model.net_3 = get_tensor(string_format(TN_SAM_NET, 3, "weight"));
                 }
+                model.image_newline  = get_tensor(TN_IMAGE_NEWLINE, false);
+                model.view_seperator = get_tensor(TN_IMAGE_SEPERATOR, false);
+
                 break;
             default:
                 GGML_ASSERT(false && "unknown projector type");
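
The second argument passed to get_tensor above appears to be a "required" flag: with false, a missing tensor is expected to come back as nullptr rather than aborting the load, which is why build_global_local_features asserts that image_newline and view_seperator are non-null before using them. A small, self-contained sketch of that optional-lookup pattern; the toy_loader, toy_tensor and weights names below are made up and only mirror the behaviour the commit relies on:

// Hypothetical illustration of the "required = false" lookup pattern; the loader
// and map are stand-ins, not the clip_model_loader implementation.
#include <cstdio>
#include <map>
#include <stdexcept>
#include <string>

struct toy_tensor { std::string name; };

struct toy_loader {
    std::map<std::string, toy_tensor> weights;   // stand-in for the gguf contents

    toy_tensor * get_tensor(const std::string & name, bool required = true) {
        auto it = weights.find(name);
        if (it == weights.end()) {
            if (required) {
                throw std::runtime_error("missing tensor: " + name);
            }
            return nullptr;                      // optional tensor is simply absent
        }
        return &it->second;
    }
};

int main() {
    toy_loader loader;
    loader.weights["model.image_newline"] = { "model.image_newline" };

    toy_tensor * image_newline  = loader.get_tensor("model.image_newline",  false);
    toy_tensor * view_seperator = loader.get_tensor("model.view_seperator", false);

    printf("image_newline:  %s\n", image_newline  ? "found" : "absent");
    printf("view_seperator: %s\n", view_seperator ? "found" : "absent");
    return 0;
}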
