
Commit c430c21

Merge branch 'xsn/vision' of https://github.com/ngxson/llama.cpp into wirthual/fix-vision
2 parents 3ca3898 + a88c0d5


12 files changed, +311 -133 lines changed


common/common.cpp

Lines changed: 21 additions & 0 deletions
@@ -1474,6 +1474,27 @@ std::vector<llama_token> llama_tokenize(
     return result;
 }
 
+// TODO: this function is hacky, need to be improved
+std::vector<llama_token> llama_tokenize_with_img(
+        const struct llama_context * ctx,
+        const std::string & text,
+        bool add_special,
+        bool parse_special) {
+    static const std::string IMG_PLACEMENT = "<img_placement>";
+    std::vector<std::string> parts = string_split(text, IMG_PLACEMENT);
+    std::vector<llama_token> output;
+    for (const auto & part : parts) {
+        bool add_bos = &parts.front() == &part;
+        auto tokens = llama_tokenize(ctx, part, add_special && add_bos, parse_special);
+        output.insert(output.end(), tokens.begin(), tokens.end());
+        if (&parts.back() != &part) {
+            // add image token to middle of 2 parts
+            output.push_back(TOKEN_IMG_PLACEMENT);
+        }
+    }
+    return output;
+}
+
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::string piece;
     piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
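
For orientation, a minimal usage sketch of the new helper (not part of this commit): it shows that the literal "<img_placement>" marker in a prompt never reaches the tokenizer and instead comes back as the sentinel TOKEN_IMG_PLACEMENT (-1000). The wrapper name print_prompt_tokens is made up, and ctx is assumed to be a context loaded as in examples/simple.

    #include <cstdio>
    #include <string>
    #include <vector>
    #include "common.h"

    // Hypothetical illustration only; assumes `ctx` was created elsewhere.
    static void print_prompt_tokens(struct llama_context * ctx) {
        const std::string prompt = "USER:<img_placement>\nwhat did you see?\nASSISTANT:";
        std::vector<llama_token> toks = llama_tokenize_with_img(ctx, prompt, /*add_special=*/true);
        for (llama_token id : toks) {
            if (id == TOKEN_IMG_PLACEMENT) {
                printf("[image embeddings go here]\n"); // sentinel, not a real vocab token
            } else {
                printf("%s\n", llama_token_to_piece(ctx, id).c_str());
            }
        }
    }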

common/common.h

Lines changed: 25 additions & 0 deletions
@@ -378,6 +378,20 @@ static std::vector<T> string_split(const std::string & str, char delim) {
     return values;
 }
 
+// split string by a `std::string delim` instead of `char delim`
+static std::vector<std::string> string_split(std::string s, const std::string & delimiter) {
+    std::vector<std::string> tokens;
+    size_t pos = 0;
+    std::string token;
+    while ((pos = s.find(delimiter)) != std::string::npos) {
+        token = s.substr(0, pos);
+        tokens.push_back(token);
+        s.erase(0, pos + delimiter.length());
+    }
+    tokens.push_back(s);
+    return tokens;
+}
+
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
 
@@ -447,6 +461,17 @@ std::vector<llama_token> llama_tokenize(
     bool add_special,
     bool parse_special = false);
 
+const llama_token TOKEN_IMG_PLACEMENT = -1000;
+
+// tokenize with "placeholder" for image embedding tokens
+// "<img_placement>" will be replaced with TOKEN_IMG_PLACEMENT
+// TODO: this function is hacky, need to be improved
+std::vector<llama_token> llama_tokenize_with_img(
+    const struct llama_context * ctx,
+    const std::string & text,
+    bool add_special,
+    bool parse_special = false);
+
 // tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
 std::string llama_token_to_piece(
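
Aside: the new string-delimiter overload of string_split keeps every piece between delimiters, including empty leading and trailing pieces. Below is a self-contained sketch with the same splitting logic copied out; the name split_by_string is made up so the snippet compiles on its own.

    #include <cstdio>
    #include <string>
    #include <vector>

    // Stand-alone copy of the new overload's splitting loop, for illustration only.
    static std::vector<std::string> split_by_string(std::string s, const std::string & delimiter) {
        std::vector<std::string> parts;
        size_t pos;
        while ((pos = s.find(delimiter)) != std::string::npos) {
            parts.push_back(s.substr(0, pos));    // piece before the delimiter
            s.erase(0, pos + delimiter.length()); // drop that piece plus the delimiter
        }
        parts.push_back(s);                       // remainder (possibly empty)
        return parts;
    }

    int main() {
        auto parts = split_by_string("USER:<img_placement>\nwhat did you see?", "<img_placement>");
        printf("%zu parts\n", parts.size());      // 2 parts: "USER:" and "\nwhat did you see?"
        for (const auto & p : parts) {
            printf("[%s]\n", p.c_str());
        }
    }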

common/vision.cpp

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ llama_img * load_image_from_file(const char * fname) {
     //     printf("\n");
     // }
     // printf("\n");
-    llama_img * result = llama_img_alloc(nx, ny);
+    llama_img * result = llama_img_init(nx, ny);
     memcpy(result->data, img, nx*ny*3);
     stbi_image_free(img);
     return result;

convert_hf_to_gguf.py

Lines changed: 4 additions & 2 deletions
@@ -471,7 +471,7 @@ def load_hparams(dir_model: Path):
             text_config = AutoConfig.from_pretrained(text_config["_name_or_path"]).to_dict()
             hparams = {**text_config, **hparams}
         return hparams
-
+
     @staticmethod
     def load_preprocessor_config(dir_model: Path):
         file_path = dir_model / "preprocessor_config.json"
@@ -1590,7 +1590,7 @@ def set_gguf_parameters(self):
 
         # For vision model
         if self.vparams is not None and self.preprocessor_config is not None:
-            self.gguf_writer.add_vision_type("clip")
+            self.gguf_writer.add_vision_type("clip-vit")
             self.gguf_writer.add_vision_image_size(self.vparams["image_size"])
             self.gguf_writer.add_vision_patch_size(self.vparams["patch_size"])
             self.gguf_writer.add_vision_clip_architecture("llava")
@@ -1600,6 +1600,8 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_vision_clip_head_count(self.vparams["num_attention_heads"])
             self.gguf_writer.add_vision_clip_image_mean(self.preprocessor_config["image_mean"])
             self.gguf_writer.add_vision_clip_image_std(self.preprocessor_config["image_std"])
+            self.gguf_writer.add_vision_clip_select_layer(self.hparams["vision_feature_layer"])
+            self.gguf_writer.add_vision_clip_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
             max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1
             self.gguf_writer.add_vision_clip_max_position_embeddings(max_pos_embd)
             # TODO: should not hardcode these, but they are currently missing from config.json
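
On the new select_layer value: in Hugging Face LLaVA configs, vision_feature_layer is typically a negative index (commonly -2) into the vision encoder's hidden states, counted from the end Python-style. How the llama.cpp loader resolves it is not visible in this diff, so the snippet below only illustrates that indexing convention, under an assumed 24-layer CLIP ViT-L encoder.

    #include <cstdio>

    // Resolve a Python-style (possibly negative) index against n_states hidden states.
    static int resolve_layer(int idx, int n_states) {
        return idx < 0 ? n_states + idx : idx;
    }

    int main() {
        // Assumed CLIP ViT-L/14: 24 encoder layers -> 25 hidden states (embeddings + one per layer).
        const int n_states = 25;
        printf("vision_feature_layer -2 -> hidden state %d (second-to-last layer output)\n",
               resolve_layer(-2, n_states)); // 23
        printf("vision_feature_layer -1 -> hidden state %d (final layer output)\n",
               resolve_layer(-1, n_states)); // 24
    }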

examples/simple/simple.cpp

Lines changed: 60 additions & 51 deletions
@@ -15,7 +15,9 @@ static void print_usage(int, char ** argv) {
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    params.prompt = "Hello my name is";
+    //params.prompt = "Hello my name is";
+    params.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n"
+        "USER:<img_placement>\nwhat did you see?\nASSISTANT:";
     params.n_predict = 32;
 
     if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
@@ -62,52 +64,10 @@ int main(int argc, char ** argv) {
 
     llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
 
-
-
-
-    // TODO: this is for testing; DELETE ME
-    int n_cur = 0;
-    params.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
-    {
-        llama_img_batch ibatch;
-        ibatch.n_imgs = 1;
-        ibatch.imgs = (llama_img **) malloc(1024);
-        ibatch.imgs[0] = load_image_from_file("../models/eiffel-tower-3349075_1280.jpg");
-        llama_vision_encode(ctx, &ibatch);
-
-        auto tokens = ::llama_tokenize(ctx, params.prompt, true);
-        int n_imgs = ibatch.n_imgs;
-        int n_embd = llama_n_embd(model);
-        int n_patches = llama_vision_n_patches(ctx);
-        printf("n_embd = %d ; n_patches = %d \n", n_embd, n_patches);
-        float * output_img = llama_vision_get_embeddings(ctx, 0);
-
-        n_cur += tokens.size();
-        llama_batch batch = llama_batch_init(512, 0, 1);
-        llama_batch_clear(batch);
-        for (auto t : tokens) { llama_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
-        if (llama_decode(ctx, batch) != 0) {
-            LOG("%s: llama_decode() failed\n", __func__);
-            return 1;
-        }
-
-        // for (int k = 0; k < 10; k++) printf("%f\n", output_img[k]);
-        llama_batch_clear(batch);
-        batch = {int32_t(n_patches*n_imgs), nullptr, output_img, nullptr, nullptr, nullptr, nullptr, n_cur, 1, 0, };
-        if (llama_decode(ctx, batch) != 0) {
-            LOG("%s: llama_decode() failed\n", __func__);
-            return 1;
-        }
-        n_cur += n_embd*n_imgs;
-    }
-    params.prompt = "\nwhat did you see?\nASSISTANT:";
-
-
-
     // tokenize the prompt
 
     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
+    tokens_list = ::llama_tokenize_with_img(ctx, params.prompt, true);
 
     const int n_ctx = llama_n_ctx(ctx);
     const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
@@ -127,33 +87,82 @@ int main(int argc, char ** argv) {
     LOG("\n");
 
     for (auto id : tokens_list) {
-        LOG("%s", llama_token_to_piece(ctx, id).c_str());
+        if (id == TOKEN_IMG_PLACEMENT) {
+            LOG("<img_placement>");
+        } else {
+            LOG("%s", llama_token_to_piece(ctx, id).c_str());
+        }
     }
 
+    LOG("\n\n");
+
+    // load image
+    llama_batch_img img_batch = llama_batch_img_init(1);
+    img_batch.imgs[0] = load_image_from_file("../models/eiffel-tower-3349075_1280.jpg");
+
     // create a llama_batch with size 512
     // we use this object to submit token data for decoding
 
     llama_batch batch = llama_batch_init(512, 0, 1);
 
     // evaluate the initial prompt
-    for (size_t i = 0; i < tokens_list.size(); i++) {
-        //llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
-        if (i == 0) continue;
-        llama_batch_add(batch, tokens_list[i], n_cur, { 0 }, false);
-        n_cur++;
+    int n_cur = 0;
+    int i_img = 0;
+    for (auto id : tokens_list) {
+        if (id == TOKEN_IMG_PLACEMENT) {
+            img_batch.pos[i_img] = n_cur;
+            n_cur += llama_img_n_tokens(ctx, img_batch.imgs[i_img]);
+            i_img++;
+        } else {
+            llama_batch_add(batch, id, n_cur, { 0 }, false);
+            printf("pos %d tok %d --> %s\n", n_cur, id, llama_token_to_piece(ctx, id).c_str());
+            n_cur++;
+        }
     }
 
     // llama_decode will output logits only for the last token of the prompt
     batch.logits[batch.n_tokens - 1] = true;
 
+    if (llama_encode_vision(ctx, img_batch) != 0) {
+        LOG("%s: llama_encode_vision() failed\n", __func__);
+        return 1;
+    }
+
+    n_cur = 0;
+    {
+        auto t1 = ::llama_tokenize(ctx, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", false);
+        auto t2 = ::llama_tokenize(ctx, "\nwhat did you see?\nASSISTANT:", false);
+        t1.insert(t1.begin(), 1);
+
+        n_cur = 0;
+        llama_batch_clear(batch);
+        llama_batch_add(batch, 1, 0, { 0 }, false);
+        llama_decode(ctx, batch);
+
+        n_cur = t1.size();
+        llama_batch_clear(batch);
+        llama_batch batch0 = {int32_t(576), nullptr, _test_get_img_embd(ctx), nullptr, nullptr, nullptr, nullptr, n_cur, 1, 0, };
+        llama_decode(ctx, batch0);
+
+        n_cur = 0;
+        llama_batch_clear(batch);
+        for (auto t : t1) { llama_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
+        llama_decode(ctx, batch);
+
+        n_cur = t1.size() + 576;
+        llama_batch_clear(batch);
+        printf("pos %d\n", n_cur);
+        for (auto t : t2) { llama_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
+        batch.logits[batch.n_tokens - 1] = true;
+    }
+
     if (llama_decode(ctx, batch) != 0) {
         LOG("%s: llama_decode() failed\n", __func__);
         return 1;
     }
 
     // main loop
 
-    //int n_cur = batch.n_tokens;
     int n_decode = 0;
 
     const auto t_main_start = ggml_time_us();
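
A note on the hardcoded 576 in the temporary block above: it matches the patch-token count of a CLIP ViT-L/14 encoder at 336x336 input, the encoder LLaVA-1.5 uses, so the first text token after the image lands at position n_text_tokens_before + 576. A quick sanity check under that assumption:

    #include <cstdio>

    int main() {
        // Assumed LLaVA-1.5 vision encoder geometry: 336x336 input, 14x14 patches.
        const int image_size = 336;
        const int patch_size = 14;
        const int n_patches  = (image_size / patch_size) * (image_size / patch_size); // 24 * 24 = 576
        // +1 CLS token gives the max_pos_embd value written by convert_hf_to_gguf.py
        const int max_pos_embd = n_patches + 1; // 577
        printf("n_patches = %d, max_pos_embd = %d\n", n_patches, max_pos_embd);
    }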

gguf-py/gguf/constants.py

Lines changed: 14 additions & 7 deletions
@@ -173,13 +173,15 @@ class Tokenizer:
         MIDDLE_ID = "tokenizer.ggml.middle_token_id"
         EOT_ID = "tokenizer.ggml.eot_token_id"
         EOM_ID = "tokenizer.ggml.eom_token_id"
+        IMAGE_START_ID = "tokenizer.ggml.image_start_token_id"
+        IMAGE_END_ID = "tokenizer.ggml.image_end_token_id"
 
     class Adapter:
         TYPE = "adapter.type"
         LORA_ALPHA = "adapter.lora.alpha"
 
     class Vision:
-        # only support vision.type = "clip" for now
+        # only support vision.type = "clip-vit" for now
         TYPE = "vision.type"
         IMAGE_SIZE = "vision.image_size"
         PATCH_SIZE = "vision.patch_size"
@@ -196,7 +198,10 @@ class Clip:
             PROJECTION_DIM = "vision.clip.projection_dim"
             USE_GELU = "vision.clip.use_gelu"
             MAX_POS_EMBEDDING = "vision.clip.max_position_embeddings"
+            MAX_SLICES = "vision.clip.max_slices"
             PROJECTOR_TYPE = "vision.clip.projector_type"
+            SELECT_LAYER = "vision.clip.select_layer"
+            PATCH_MERGE_TYPE = "vision.clip.patch_merge_type"
             HEAD_COUNT = "vision.clip.attention.head_count"
             LAYERNORM_EPS = "vision.clip.attention.layer_norm_epsilon"
 
@@ -370,8 +375,7 @@ class MODEL_TENSOR(IntEnum):
     ENC_FFN_UP = auto()
     ENC_OUTPUT_NORM = auto()
     # vision
-    V_MMPROJ_A = auto()
-    V_MMPROJ_B = auto()
+    V_MMPROJ = auto()
     V_ENC_EMBD_CLS = auto()
     V_ENC_EMBD_PATCH = auto()
    V_ENC_EMBD_POS = auto()
@@ -547,8 +551,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up",
     MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
     # vision
-    MODEL_TENSOR.V_MMPROJ_A: "v.mmproj_a",
-    MODEL_TENSOR.V_MMPROJ_B: "v.mmproj_b",
+    MODEL_TENSOR.V_MMPROJ: "v.mmproj_{bid}",
     MODEL_TENSOR.V_ENC_EMBD_CLS: "v.enc.embd.cls",
     MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.enc.embd.patch",
     MODEL_TENSOR.V_ENC_EMBD_POS: "v.enc.embd.pos",
@@ -1338,8 +1341,7 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_UP,
     ],
     MODEL_ARCH.LLAVA_VISION: [
-        MODEL_TENSOR.V_MMPROJ_A,
-        MODEL_TENSOR.V_MMPROJ_B,
+        MODEL_TENSOR.V_MMPROJ,
        MODEL_TENSOR.V_ENC_EMBD_CLS,
        MODEL_TENSOR.V_ENC_EMBD_PATCH,
        MODEL_TENSOR.V_ENC_EMBD_POS,
@@ -1430,6 +1432,11 @@ class CLIPProjectorType(Enum):
     MLP = 'mlp'
 
 
+class CLIPPatchMergeType(Enum):
+    FLAT = 'flat'
+    SPATIAL_UNPAD = 'spatial_unpad'
+
+
 class GGMLQuantizationType(IntEnum):
     F32 = 0
     F16 = 1
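
Not part of this commit, but for context: the new vision.clip.* entries are written as ordinary typed KV pairs, so a converted GGUF can be inspected with the plain gguf C API. A rough reader sketch follows; the gguf_* declarations live in ggml's headers and their exact location has moved between versions, so treat the include as an assumption.

    #include <cstdio>
    #include "ggml.h" // gguf_* API (header layout varies across ggml versions)

    int main(int argc, char ** argv) {
        if (argc < 2) return 1;
        struct gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ NULL };
        struct gguf_context * g = gguf_init_from_file(argv[1], params);
        if (!g) return 1;

        int k = gguf_find_key(g, "vision.clip.select_layer");
        if (k >= 0) printf("select_layer = %d\n", (int) gguf_get_val_i32(g, k));

        k = gguf_find_key(g, "vision.clip.patch_merge_type");
        if (k >= 0) printf("patch_merge_type = %s\n", gguf_get_val_str(g, k));

        gguf_free(g);
        return 0;
    }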

gguf-py/gguf/gguf_writer.py

Lines changed: 10 additions & 0 deletions
@@ -27,6 +27,7 @@
     PoolingType,
     TokenType,
     CLIPProjectorType,
+    CLIPPatchMergeType,
 )
 
 from .quants import quant_shape_from_byte_shape
@@ -848,6 +849,15 @@ def add_vision_clip_max_position_embeddings(self, value: int) -> None:
     def add_vision_clip_projector_type(self, value: CLIPProjectorType) -> None:
         self.add_string(Keys.Vision.Clip.PROJECTOR_TYPE, value.value)
 
+    def add_vision_clip_max_slices(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.Clip.MAX_SLICES, value)
+
+    def add_vision_clip_select_layer(self, value: int) -> None:
+        self.add_int32(Keys.Vision.Clip.SELECT_LAYER, value)
+
+    def add_vision_clip_patch_merge_type(self, value: CLIPPatchMergeType) -> None:
+        self.add_string(Keys.Vision.Clip.PATCH_MERGE_TYPE, value.value)
+
     def add_vision_clip_layer_norm_epsilon(self, value: float) -> None:
         self.add_float32(Keys.Vision.Clip.LAYERNORM_EPS, value)

gguf-py/gguf/tensor_mapping.py

Lines changed: 4 additions & 4 deletions
@@ -680,12 +680,12 @@ class TensorNameMap:
         "encoder.final_layer_norm", # t5
     ),
 
-    MODEL_TENSOR.V_MMPROJ_A: (
-        "multi_modal_projector.linear_1",
+    MODEL_TENSOR.V_MMPROJ: (
+        "multi_modal_projector.linear_{bid}",
     ),
 
-    MODEL_TENSOR.V_MMPROJ_B: (
-        "multi_modal_projector.linear_2",
+    MODEL_TENSOR.V_MMPROJ: (
+        "multi_modal_projector.linear_{bid}",
     ),
 
     MODEL_TENSOR.V_ENC_EMBD_CLS: (
