Skip to content

Commit 8586d23

Browse files
committed
minicpm working without uhd
1 parent c0d93dd commit 8586d23

File tree

9 files changed

+77
-2
lines changed

9 files changed

+77
-2
lines changed

convert_hf_to_gguf.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2339,6 +2339,7 @@ class MiniCPMVModel(Qwen2Model):
23392339
model_arch = gguf.MODEL_ARCH.QWEN2
23402340
proj_type: gguf.constants.CLIPProjectorType | None
23412341
resampler_n_embd = 0
2342+
tok_embd_tensor: Tensor | None = None
23422343

23432344
def __init__(self, *args, **kwargs):
23442345
super().__init__(*args, **kwargs)
@@ -2361,6 +2362,8 @@ def __init__(self, *args, **kwargs):
23612362
for tname, tensor in self.get_tensors():
23622363
if tname == "resampler.ln_post.bias":
23632364
self.resampler_n_embd = tensor.shape[0]
2365+
if tname.endswith("embed_tokens.weight"):
2366+
self.tok_embd_tensor = tensor
23642367
if self.resampler_n_embd < 2:
23652368
raise ValueError("Failed to detect resampler embedding size")
23662369
else:
@@ -2372,6 +2375,16 @@ def __init__(self, *args, **kwargs):
23722375
self.hparams["vision_feature_layer"] = 0
23732376
self.v_tensor_map = gguf.get_tensor_name_map(self.vision_arch, self.vparams["num_hidden_layers"])
23742377

2378+
def get_embd_of_tokens(self, map_token_to_tensor_name: Iterable[tuple[str, str]]) -> Iterable[tuple[str, Tensor]]:
    """Yield the embedding row of each requested token under a new tensor name.

    For every ``(token, tensor_name)`` pair, the token is looked up in the
    tokenizer vocabulary loaded from ``self.dir_model`` and the matching row
    of ``self.tok_embd_tensor`` (captured in ``__init__`` from the tensor
    whose name ends with ``embed_tokens.weight``) is yielded together with
    ``tensor_name``.

    This is a generator: nothing is executed, and no error is raised, until
    the result is iterated.

    Args:
        map_token_to_tensor_name: pairs of (token string, output tensor name).

    Yields:
        ``(tensor_name, row)`` tuples, in the order of the input pairs.

    Raises:
        ValueError: if no token embedding tensor was found during ``__init__``.
        KeyError: if a requested token is not in the tokenizer vocabulary.
    """
    if self.tok_embd_tensor is None:
        raise ValueError("Token embedding tensor not found")

    # Imported lazily so that transformers is only needed when this method runs.
    from transformers import AutoTokenizer

    # NOTE(review): trust_remote_code=True allows tokenizer code shipped in the
    # model directory to be executed; only convert models from trusted sources.
    tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)

    # The vocabulary does not change between iterations, so fetch it once
    # instead of calling get_vocab() again for every token.
    vocab = tokenizer.get_vocab()

    for token, tensor_name in map_token_to_tensor_name:
        try:
            tok_id = vocab[token]
        except KeyError:
            # Same exception type as a plain dict lookup, but say which token is missing.
            raise KeyError(f"Token {token!r} not found in tokenizer vocabulary") from None
        yield tensor_name, self.tok_embd_tensor[tok_id]
2387+
23752388
def set_gguf_parameters(self):
23762389
super().set_gguf_parameters()
23772390
# For vision model
@@ -2388,6 +2401,14 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
23882401
self.format_tensor_name(gguf.MODEL_TENSOR.V_RESMPL_POS_EMBD_K, is_vision=True),
23892402
torch.from_numpy(self._get_2d_sincos_pos_embed(self.resampler_n_embd, (70, 70)))
23902403
)
2404+
added_tokens = [
2405+
("<image>", gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMAGE ] + ".weight"),
2406+
("</image>", gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_END_IMAGE] + ".weight"),
2407+
("<slice>", gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_SLICE ] + ".weight"),
2408+
("</slice>", gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_END_SLICE] + ".weight"),
2409+
]
2410+
for tensor_name, tensor in self.get_embd_of_tokens(added_tokens):
2411+
yield tensor_name, tensor
23912412

23922413
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
23932414
del bid # unused
@@ -2404,6 +2425,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
24042425
name_k = name.replace("in_proj_", "in_proj_k.") # in_proj_k.(weight|bias)
24052426
name_v = name.replace("in_proj_", "in_proj_v.") # in_proj_v.(weight|bias)
24062427
return [
2428+
# TODO: permute these
24072429
(self.map_tensor_name(name_q), split_tensor[0]),
24082430
(self.map_tensor_name(name_k), split_tensor[1]),
24092431
(self.map_tensor_name(name_v), split_tensor[2]),
@@ -2413,6 +2435,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
24132435
if name == "resampler.proj" or name == "resampler.query":
24142436
name += ".weight"
24152437

2438+
if name.startswith("resampler.proj"):
2439+
data_torch = data_torch.transpose(-1, -2).contiguous()
2440+
24162441
if "post_layernorm" in name:
24172442
return [] # skip post_layernorm
24182443

examples/vision/vision.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ int main(int argc, char ** argv) {
100100
// default prompt for llava 1.5
101101
//params.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:<img_placement>\nwhat did you see?\nASSISTANT:";
102102
// default prompt for minicpmv 2.6
103-
params.prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nwhat did you see?\n<image><img_placement></image><|im_end|>\n<|im_start|>assistant\n";
103+
params.prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nwhat did you see?\n<img_placement><|im_end|>\n<|im_start|>assistant\n";
104104
params.n_predict = 64;
105105
params.n_batch = 2048;
106106
params.n_ubatch = 1024;

gguf-py/gguf/constants.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,10 @@ class MODEL_TENSOR(IntEnum):
467467
V_RESMPL_Q_NORM = auto() # minicpmv
468468
V_RESMPL_PROJ = auto() # minicpmv
469469
V_RESMPL_QUERY = auto() # minicpmv
470+
V_TOK_EMBD_IMAGE = auto() # embedding for <image> token
471+
V_TOK_EMBD_END_IMAGE = auto() # embedding for </image> token
472+
V_TOK_EMBD_SLICE = auto() # embedding for <slice> token
473+
V_TOK_EMBD_END_SLICE = auto() # embedding for </slice> token
470474

471475

472476
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -686,6 +690,10 @@ class MODEL_TENSOR(IntEnum):
686690
MODEL_TENSOR.V_RESMPL_Q_NORM: "v.resmpl.q_norm",
687691
MODEL_TENSOR.V_RESMPL_PROJ: "v.resmpl.proj",
688692
MODEL_TENSOR.V_RESMPL_QUERY: "v.resmpl.query",
693+
MODEL_TENSOR.V_TOK_EMBD_IMAGE: "v.tok_embd.image",
694+
MODEL_TENSOR.V_TOK_EMBD_END_IMAGE: "v.tok_embd.end_image",
695+
MODEL_TENSOR.V_TOK_EMBD_SLICE: "v.tok_embd.slice",
696+
MODEL_TENSOR.V_TOK_EMBD_END_SLICE: "v.tok_embd.end_slice",
689697
}
690698

691699
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -1682,6 +1690,10 @@ class MODEL_TENSOR(IntEnum):
16821690
MODEL_TENSOR.V_RESMPL_Q_NORM,
16831691
MODEL_TENSOR.V_RESMPL_PROJ,
16841692
MODEL_TENSOR.V_RESMPL_QUERY,
1693+
MODEL_TENSOR.V_TOK_EMBD_IMAGE,
1694+
MODEL_TENSOR.V_TOK_EMBD_END_IMAGE,
1695+
MODEL_TENSOR.V_TOK_EMBD_SLICE,
1696+
MODEL_TENSOR.V_TOK_EMBD_END_SLICE,
16851697
],
16861698
# TODO
16871699
}

gguf-py/gguf/tensor_mapping.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -907,6 +907,22 @@ class TensorNameMap:
907907
MODEL_TENSOR.V_RESMPL_QUERY: (
908908
"resampler.query",
909909
),
910+
911+
MODEL_TENSOR.V_TOK_EMBD_IMAGE:(
912+
"v.tok_embd.image", # tensor generated from token embeddings
913+
),
914+
915+
MODEL_TENSOR.V_TOK_EMBD_END_IMAGE:(
916+
"v.tok_embd.end_image", # tensor generated from token embeddings
917+
),
918+
919+
MODEL_TENSOR.V_TOK_EMBD_SLICE:(
920+
"v.tok_embd.slice", # tensor generated from token embeddings
921+
),
922+
923+
MODEL_TENSOR.V_TOK_EMBD_END_SLICE:(
924+
"v.tok_embd.end_slice", # tensor generated from token embeddings
925+
),
910926
}
911927

912928
# architecture-specific block mappings

src/llama-arch.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1382,6 +1382,10 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
13821382
{ LLM_TENSOR_V_RESMPL_Q_NORM, "v.resmpl.q_norm" },
13831383
{ LLM_TENSOR_V_RESMPL_PROJ, "v.resmpl.proj" },
13841384
{ LLM_TENSOR_V_RESMPL_QUERY, "v.resmpl.query" },
1385+
{ LLM_TENSOR_V_TOK_EMBD_IMAGE, "v.tok_embd.image" },
1386+
{ LLM_TENSOR_V_TOK_EMBD_END_IMAGE, "v.tok_embd.end_image" },
1387+
{ LLM_TENSOR_V_TOK_EMBD_SLICE, "v.tok_embd.slice" },
1388+
{ LLM_TENSOR_V_TOK_EMBD_END_SLICE, "v.tok_embd.end_slice" },
13851389
}
13861390
},
13871391
{

src/llama-arch.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,10 @@ enum llm_tensor {
381381
LLM_TENSOR_V_RESMPL_Q_NORM,
382382
LLM_TENSOR_V_RESMPL_PROJ,
383383
LLM_TENSOR_V_RESMPL_QUERY,
384+
LLM_TENSOR_V_TOK_EMBD_IMAGE,
385+
LLM_TENSOR_V_TOK_EMBD_END_IMAGE,
386+
LLM_TENSOR_V_TOK_EMBD_SLICE,
387+
LLM_TENSOR_V_TOK_EMBD_END_SLICE,
384388
};
385389

386390
enum llm_tensor_layer {

src/llama-model.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3549,6 +3549,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
35493549
vit.mm_model_ln_post_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_POST_NORM, "weight"), {rs_n_embd});
35503550
vit.mm_model_ln_post_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_POST_NORM, "bias" ), {rs_n_embd});
35513551

3552+
// tok embd
3553+
vit.mm_tok_embd_image = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_TOK_EMBD_IMAGE, "weight"), {n_embd});
3554+
vit.mm_tok_embd_end_image = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_TOK_EMBD_END_IMAGE, "weight"), {n_embd});
3555+
vit.mm_tok_embd_slice = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_TOK_EMBD_SLICE, "weight"), {n_embd});
3556+
vit.mm_tok_embd_end_slice = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_TOK_EMBD_END_SLICE, "weight"), {n_embd});
3557+
35523558
for (int i = 0; i < n_vlayer; ++i) {
35533559
auto & layer = vit.layers[i];
35543560

src/llama-vision.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -895,6 +895,10 @@ struct llama_vision_graph_builder {
895895
cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
896896
}
897897

898+
// add <image> and </image> token embeddings
899+
cur = ggml_concat(ctx0, model.mm_tok_embd_image, cur, 1);
900+
cur = ggml_concat(ctx0, cur, model.mm_tok_embd_end_image, 1);
901+
898902
ggml_set_name(cur, "output");
899903
ggml_build_forward_expand(gf, cur);
900904

src/llama-vision.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,11 @@ struct llama_vision_model {
129129
struct ggml_tensor * mm_model_ln_post_w = nullptr;
130130
struct ggml_tensor * mm_model_ln_post_b = nullptr;
131131

132-
struct ggml_tensor * image_newline = nullptr;
132+
// special tokens
133+
struct ggml_tensor * mm_tok_embd_image = nullptr;
134+
struct ggml_tensor * mm_tok_embd_end_image = nullptr;
135+
struct ggml_tensor * mm_tok_embd_slice = nullptr;
136+
struct ggml_tensor * mm_tok_embd_end_slice = nullptr;
133137
};
134138

135139
struct llama_vision_context {

0 commit comments

Comments
 (0)