Skip to content

Commit b32bb5e

Browse files
authored
Merge pull request #2 from bluebread/sf/deepseek-ocr
mtmd: DeepseekOCR Implement DeepSeek3B-MoE-A570M (LM component)
2 parents: 97e0907 + 13dc6fb; commit: b32bb5e

File tree

3 files changed

+13
-12
lines changed

3 files changed

+13
-12
lines changed

convert_hf_to_gguf.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7100,7 +7100,7 @@ def set_gguf_parameters(self):
71007100
else:
71017101
# note: deepseek2 using MLA converts into MQA (ie: GQA with 1 group)
71027102
self.hparams["num_key_value_heads"] = 1
7103-
7103+
71047104
super().set_gguf_parameters()
71057105
hparams = self.hparams
71067106
kv_lora_rank = hparams["q_lora_rank"] if hparams["q_lora_rank"] is not None else 512

examples/eval-callback/eval-callback.cpp

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -74,19 +74,19 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
7474
}
7575
}
7676
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
77-
LOG(" [\n");
77+
LOG(" [\n");
7878
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
7979
if (i2 == n && ne[2] > 2*n) {
80-
LOG(" ..., \n");
80+
LOG(" ..., \n");
8181
i2 = ne[2] - n;
8282
}
83-
LOG(" [\n");
83+
LOG(" [\n");
8484
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
8585
if (i1 == n && ne[1] > 2*n) {
86-
LOG(" ..., \n");
86+
LOG(" ..., \n");
8787
i1 = ne[1] - n;
8888
}
89-
LOG(" [");
89+
LOG(" [");
9090
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
9191
if (i0 == n && ne[0] > 2*n) {
9292
LOG("..., ");
@@ -98,10 +98,10 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
9898
}
9999
LOG("],\n");
100100
}
101-
LOG(" ],\n");
101+
LOG(" ],\n");
102102
}
103-
LOG(" ]\n");
104-
LOG(" sum = %f\n", sum);
103+
LOG(" ]\n");
104+
LOG(" sum = %f\n", sum);
105105
}
106106

107107
// TODO: make this abort configurable/optional?
@@ -136,7 +136,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
136136
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
137137
}
138138

139-
LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
139+
LOG("%s: %16s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
140140
t->name, ggml_type_name(t->type), ggml_op_desc(t),
141141
src0->name, ggml_ne_string(src0).c_str(),
142142
src1 ? src1_str : "",

src/models/deepseek2.cpp

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
4747
// self_attention
4848
if (is_ocr) {
4949
const int n_embed_head = hparams.n_embd / hparams.n_head();
50+
const int ocr_rope_type = GGML_ROPE_TYPE_NEOX;
5051
GGML_ASSERT(n_embed_head == n_embd_head_k && n_embed_head == n_embd_head_v);
5152

5253
ggml_tensor * Qcur = NULL;
@@ -65,8 +66,8 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
6566
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embed_head, n_head, n_tokens);
6667

6768
GGML_ASSERT(fabs(freq_base - 10000.0) < 1e-4);
68-
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_embed_head, rope_type, 0, freq_base, 1, 0, 1, 0, 0);
69-
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_embed_head, rope_type, 0, freq_base, 1, 0, 1, 0, 0);
69+
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_embed_head, ocr_rope_type, 0, freq_base, 1, 0, 1, 0, 0);
70+
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_embed_head, ocr_rope_type, 0, freq_base, 1, 0, 1, 0, 0);
7071
cb(Qcur, "q_pe", il);
7172
cb(Kcur, "k_pe", il);
7273

0 commit comments

Comments
 (0)