
Commit ca71fb9

gabe-l-hart and ngxson authored
model : Granite docling + Idefics3 preprocessing (SmolVLM) (#16206)
* feat: Add granite-docling conversion using trillion pretokenizer
* feat: Add granite-docling vocab pre enum
* fix: Use granite-docling pre
* feat: Add clip_is_idefics3
* feat: Allow multi-token boundary sequences for image templating
* feat: Add tiling support for idefics3 in clip.cpp
  This should likely be moved into llava_uhd::get_slice_instructions, but for now this avoids disrupting the logic there.
* feat: Partial support for full templating for idefics3 in mtmd
  There are still errors encoding some of the image chunks, but the token sequence now matches transformers _almost_ perfectly, except for the double newline before the global image, which shows up as two consecutive newline tokens instead of a single double-newline token. I think this is happening because the blocks are tokenized separately and then concatenated.
* feat: Fully working image preprocessing for idefics3 w/ resize and slicing
* feat: Parse the preprocessor config's longest side and add it to the mmproj hparams
* fix: Use the longest side instead of size * scale_factor
  For Granite Docling, these come out to the same value, but that was just a coincidence.
* fix: Allow batch encoding and remove clip_is_idefics3
* refactor: Remove unnecessary conditionals for empty token vectors
* refactor: Use image_manipulation util
* add test model

---------

Signed-off-by: Gabe Goodhart <[email protected]>
Co-authored-by: Xuan Son Nguyen <[email protected]>
1 parent 3526657 commit ca71fb9

10 files changed: +165 −97 lines


convert_hf_to_gguf.py

Lines changed: 10 additions & 1 deletion
@@ -891,6 +891,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206":
             # ref: https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base
             res = "llada-moe"
+        if chkhsh == "53e325976a6e142379c19b09afcae354f2f496f147afa8f9e189a33fe4e3024e":
+            # ref: https://huggingface.co/ibm-granite/granite-docling-258M
+            res = "granite-docling"

         if res is None:
             logger.warning("\n")
@@ -1325,6 +1328,7 @@ def __init__(self, *args, **kwargs):
         self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)

         # load preprocessor config
+        self.preprocessor_config = {}
         if not self.is_mistral_format:
             with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
                 self.preprocessor_config = json.load(f)
@@ -1347,7 +1351,8 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_vision_projection_dim(self.n_embd_text)

         # vision config
-        self.gguf_writer.add_vision_image_size(self.find_vparam(["image_size"]))
+        self.image_size = self.find_vparam(["image_size"])
+        self.gguf_writer.add_vision_image_size(self.image_size)
         self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
         self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
         self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
@@ -2378,6 +2383,10 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2))
         self.gguf_writer.add_vision_use_gelu(True)

+        # Add the preprocessor longest edge size
+        preproc_image_size = self.preprocessor_config.get("size", {}).get("longest_edge", self.image_size)
+        self.gguf_writer.add_vision_preproc_image_size(preproc_image_size)
+
     def tensor_force_quant(self, name, new_name, bid, n_dims):
         if ".embeddings." in name:
             return gguf.GGMLQuantizationType.F32
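For context, the new hparam is read straight from the Hugging Face preprocessor config. Below is a minimal standalone sketch of the same lookup; the directory name and the 512 fallback are illustrative, not values taken from this commit:

    import json
    from pathlib import Path

    model_dir = Path("granite-docling-258M")  # hypothetical local checkout of the HF repo
    with open(model_dir / "preprocessor_config.json", "r", encoding="utf-8") as f:
        preprocessor_config = json.load(f)

    # Idefics3/SmolVLM-style processors store the resize target under size.longest_edge;
    # fall back to the vision tower's image_size when the key is absent.
    image_size = 512  # illustrative fallback
    preproc_image_size = preprocessor_config.get("size", {}).get("longest_edge", image_size)
    print(preproc_image_size)  # written to the GGUF as clip.vision.preproc_image_size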

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
@@ -140,6 +140,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
     {"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
     {"name": "llada-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base", },
+    {"name": "granite-docling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions

gguf-py/gguf/constants.py

Lines changed: 1 addition & 0 deletions
@@ -261,6 +261,7 @@ class Clip:

     class ClipVision:
         IMAGE_SIZE          = "clip.vision.image_size"
+        PREPROC_IMAGE_SIZE  = "clip.vision.preproc_image_size"
         PATCH_SIZE          = "clip.vision.patch_size"
         EMBEDDING_LENGTH    = "clip.vision.embedding_length"
         FEED_FORWARD_LENGTH = "clip.vision.feed_forward_length"

gguf-py/gguf/gguf_writer.py

Lines changed: 3 additions & 0 deletions
@@ -1037,6 +1037,9 @@ def add_vision_attention_layernorm_eps(self, value: float) -> None:
     def add_vision_image_size(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.IMAGE_SIZE, value)

+    def add_vision_preproc_image_size(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.PREPROC_IMAGE_SIZE, value)
+
     def add_vision_image_mean(self, values: Sequence[float]) -> None:
         self.add_array(Keys.ClipVision.IMAGE_MEAN, values)
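As a usage note, the new setter just stores a uint32 under the key defined in constants.py. A hedged sketch of calling it directly through the gguf Python package (the file name and the 2048 value are made up; the conversion script normally does this via MmprojModel rather than a bare writer):

    import gguf

    writer = gguf.GGUFWriter("mmproj-example.gguf", arch="clip")
    writer.add_vision_preproc_image_size(2048)  # illustrative value
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file()
    writer.close()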

src/llama-vocab.cpp

Lines changed: 5 additions & 0 deletions
@@ -347,6 +347,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
             case LLAMA_VOCAB_PRE_TYPE_OLMO:
             case LLAMA_VOCAB_PRE_TYPE_JAIS:
             case LLAMA_VOCAB_PRE_TYPE_TRILLION:
+            case LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING:
                 regex_exprs = {
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                 };
@@ -1961,6 +1962,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "trillion") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
                 clean_spaces = false;
+            } else if (
+                tokenizer_pre == "granite-docling") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING;
+                clean_spaces = false;
             } else if (
                 tokenizer_pre == "bailingmoe" ||
                 tokenizer_pre == "llada-moe") {
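The granite-docling pre-tokenizer reuses the trillion split regex shown in the first hunk. A quick way to see what that pattern does before BPE merging is to run it with Python's third-party regex package (the standard re module does not support \p{...} classes); the sample sentence is arbitrary:

    import regex

    # Same split pattern as the TRILLION / GRANITE_DOCLING case above,
    # written as a raw Python string instead of an escaped C++ literal.
    PATTERN = r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)"

    print(regex.findall(PATTERN, "Granite Docling converts PDFs, doesn't it?"))
    # ['Granite', ' Docling', ' converts', ' PDFs', ',', ' doesn', "'t", ' it', '?']

Each fragment produced by the split is then BPE-encoded against the vocabulary.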

src/llama-vocab.h

Lines changed: 41 additions & 40 deletions
@@ -8,46 +8,47 @@

 // pre-tokenization types
 enum llama_vocab_pre_type {
-    LLAMA_VOCAB_PRE_TYPE_DEFAULT        = 0,
-    LLAMA_VOCAB_PRE_TYPE_LLAMA3         = 1,
-    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
-    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
-    LLAMA_VOCAB_PRE_TYPE_FALCON         = 4,
-    LLAMA_VOCAB_PRE_TYPE_MPT            = 5,
-    LLAMA_VOCAB_PRE_TYPE_STARCODER      = 6,
-    LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
-    LLAMA_VOCAB_PRE_TYPE_REFACT         = 8,
-    LLAMA_VOCAB_PRE_TYPE_COMMAND_R      = 9,
-    LLAMA_VOCAB_PRE_TYPE_STABLELM2      = 10,
-    LLAMA_VOCAB_PRE_TYPE_QWEN2          = 11,
-    LLAMA_VOCAB_PRE_TYPE_OLMO           = 12,
-    LLAMA_VOCAB_PRE_TYPE_DBRX           = 13,
-    LLAMA_VOCAB_PRE_TYPE_SMAUG          = 14,
-    LLAMA_VOCAB_PRE_TYPE_PORO           = 15,
-    LLAMA_VOCAB_PRE_TYPE_CHATGLM3       = 16,
-    LLAMA_VOCAB_PRE_TYPE_CHATGLM4       = 17,
-    LLAMA_VOCAB_PRE_TYPE_VIKING         = 18,
-    LLAMA_VOCAB_PRE_TYPE_JAIS           = 19,
-    LLAMA_VOCAB_PRE_TYPE_TEKKEN         = 20,
-    LLAMA_VOCAB_PRE_TYPE_SMOLLM         = 21,
-    LLAMA_VOCAB_PRE_TYPE_CODESHELL      = 22,
-    LLAMA_VOCAB_PRE_TYPE_BLOOM          = 23,
-    LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH   = 24,
-    LLAMA_VOCAB_PRE_TYPE_EXAONE         = 25,
-    LLAMA_VOCAB_PRE_TYPE_CHAMELEON      = 26,
-    LLAMA_VOCAB_PRE_TYPE_MINERVA        = 27,
-    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM  = 28,
-    LLAMA_VOCAB_PRE_TYPE_GPT4O          = 29,
-    LLAMA_VOCAB_PRE_TYPE_SUPERBPE       = 30,
-    LLAMA_VOCAB_PRE_TYPE_TRILLION       = 31,
-    LLAMA_VOCAB_PRE_TYPE_BAILINGMOE     = 32,
-    LLAMA_VOCAB_PRE_TYPE_LLAMA4         = 33,
-    LLAMA_VOCAB_PRE_TYPE_PIXTRAL        = 34,
-    LLAMA_VOCAB_PRE_TYPE_SEED_CODER     = 35,
-    LLAMA_VOCAB_PRE_TYPE_HUNYUAN        = 36,
-    LLAMA_VOCAB_PRE_TYPE_KIMI_K2        = 37,
-    LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE  = 38,
-    LLAMA_VOCAB_PRE_TYPE_GROK_2         = 39,
+    LLAMA_VOCAB_PRE_TYPE_DEFAULT         = 0,
+    LLAMA_VOCAB_PRE_TYPE_LLAMA3          = 1,
+    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM    = 2,
+    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER  = 3,
+    LLAMA_VOCAB_PRE_TYPE_FALCON          = 4,
+    LLAMA_VOCAB_PRE_TYPE_MPT             = 5,
+    LLAMA_VOCAB_PRE_TYPE_STARCODER       = 6,
+    LLAMA_VOCAB_PRE_TYPE_GPT2            = 7,
+    LLAMA_VOCAB_PRE_TYPE_REFACT          = 8,
+    LLAMA_VOCAB_PRE_TYPE_COMMAND_R       = 9,
+    LLAMA_VOCAB_PRE_TYPE_STABLELM2       = 10,
+    LLAMA_VOCAB_PRE_TYPE_QWEN2           = 11,
+    LLAMA_VOCAB_PRE_TYPE_OLMO            = 12,
+    LLAMA_VOCAB_PRE_TYPE_DBRX            = 13,
+    LLAMA_VOCAB_PRE_TYPE_SMAUG           = 14,
+    LLAMA_VOCAB_PRE_TYPE_PORO            = 15,
+    LLAMA_VOCAB_PRE_TYPE_CHATGLM3        = 16,
+    LLAMA_VOCAB_PRE_TYPE_CHATGLM4        = 17,
+    LLAMA_VOCAB_PRE_TYPE_VIKING          = 18,
+    LLAMA_VOCAB_PRE_TYPE_JAIS            = 19,
+    LLAMA_VOCAB_PRE_TYPE_TEKKEN          = 20,
+    LLAMA_VOCAB_PRE_TYPE_SMOLLM          = 21,
+    LLAMA_VOCAB_PRE_TYPE_CODESHELL       = 22,
+    LLAMA_VOCAB_PRE_TYPE_BLOOM           = 23,
+    LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH    = 24,
+    LLAMA_VOCAB_PRE_TYPE_EXAONE          = 25,
+    LLAMA_VOCAB_PRE_TYPE_CHAMELEON       = 26,
+    LLAMA_VOCAB_PRE_TYPE_MINERVA         = 27,
+    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM   = 28,
+    LLAMA_VOCAB_PRE_TYPE_GPT4O           = 29,
+    LLAMA_VOCAB_PRE_TYPE_SUPERBPE        = 30,
+    LLAMA_VOCAB_PRE_TYPE_TRILLION        = 31,
+    LLAMA_VOCAB_PRE_TYPE_BAILINGMOE      = 32,
+    LLAMA_VOCAB_PRE_TYPE_LLAMA4          = 33,
+    LLAMA_VOCAB_PRE_TYPE_PIXTRAL         = 34,
+    LLAMA_VOCAB_PRE_TYPE_SEED_CODER      = 35,
+    LLAMA_VOCAB_PRE_TYPE_HUNYUAN         = 36,
+    LLAMA_VOCAB_PRE_TYPE_KIMI_K2         = 37,
+    LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE   = 38,
+    LLAMA_VOCAB_PRE_TYPE_GROK_2          = 39,
+    LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
 };

 struct LLM_KV;

tools/mtmd/clip-impl.h

Lines changed: 1 addition & 0 deletions
@@ -31,6 +31,7 @@

 // vision-specific
 #define KEY_IMAGE_SIZE         "clip.vision.image_size"
+#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size"
 #define KEY_PATCH_SIZE         "clip.vision.patch_size"
 #define KEY_IMAGE_MEAN         "clip.vision.image_mean"
 #define KEY_IMAGE_STD          "clip.vision.image_std"

tools/mtmd/clip.cpp

Lines changed: 48 additions & 4 deletions
@@ -170,7 +170,9 @@ struct clip_hparams {
     int32_t projection_dim;
     int32_t n_head;
     int32_t n_layer;
-    int32_t proj_scale_factor = 0; // idefics3
+    // idefics3
+    int32_t preproc_image_size = 0;
+    int32_t proj_scale_factor = 0;

     float image_mean[3];
     float image_std[3];
@@ -2250,6 +2252,7 @@ struct clip_model_loader {

         if (is_vision) {
             get_u32(KEY_IMAGE_SIZE, hparams.image_size);
+            get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.preproc_image_size, false);
             get_u32(KEY_PATCH_SIZE, hparams.patch_size);
             get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
             get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
@@ -3551,10 +3554,51 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         // res_imgs->data[0] = *res;
         res_imgs->entries.push_back(std::move(img_f32));
         return true;
-    }
-    else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE
+    } else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
+        // The refined size has two steps:
+        // 1. Resize w/ aspect-ratio preserving such that the longer side is
+        //    the preprocessor longest size
+        // 2. Resize w/out preserving aspect ratio such that both sides are
+        //    multiples of image_size (always rounding up)
+        //
+        // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
+        const clip_image_size refined_size = image_manipulation::calc_size_preserved_ratio(
+            original_size, params.image_size, params.preproc_image_size);
+
+        llava_uhd::slice_instructions instructions;
+        instructions.overview_size = clip_image_size{params.image_size, params.image_size};
+        instructions.refined_size = refined_size;
+        instructions.grid_size = clip_image_size{
+            static_cast<int>(std::ceil(static_cast<float>(refined_size.width) / params.image_size)),
+            static_cast<int>(std::ceil(static_cast<float>(refined_size.height) / params.image_size)),
+        };
+        for (int y = 0; y < refined_size.height; y += params.image_size) {
+            for (int x = 0; x < refined_size.width; x += params.image_size) {
+                instructions.slices.push_back(llava_uhd::slice_coordinates{
+                    /* x    */ x,
+                    /* y    */ y,
+                    /* size */ clip_image_size{
+                        std::min(params.image_size, refined_size.width - x),
+                        std::min(params.image_size, refined_size.height - y)
+                    }
+                });
+            }
+        }
+        auto imgs = llava_uhd::slice_image(img, instructions);
+
+        // cast and normalize to f32
+        for (size_t i = 0; i < imgs.size(); ++i) {
+            // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
+            clip_image_f32_ptr res(clip_image_f32_init());
+            normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
+            res_imgs->entries.push_back(std::move(res));
+        }
+
+        res_imgs->grid_x = instructions.grid_size.width;
+        res_imgs->grid_y = instructions.grid_size.height;
+        return true;
+    } else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE
            || ctx->proj_type() == PROJECTOR_TYPE_GEMMA3
-           || ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3
            || ctx->proj_type() == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution
            ) {
         clip_image_u8 resized_image;
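To make the new Idefics3 branch easier to follow, here is a small Python sketch of the refined-size and tiling arithmetic described in the comment above. It mirrors the two steps in that comment rather than the llama.cpp helpers themselves, and the 512/2048 defaults are illustrative values, not constants from this commit:

    import math

    def idefics3_refine_and_tile(orig_w: int, orig_h: int,
                                 image_size: int = 512, longest_edge: int = 2048):
        # Step 1: aspect-ratio-preserving resize so the longer side equals
        # the preprocessor's longest edge.
        scale = longest_edge / max(orig_w, orig_h)
        w, h = round(orig_w * scale), round(orig_h * scale)
        # Step 2: stretch (aspect ratio no longer preserved) so both sides
        # become multiples of image_size, always rounding up.
        refined_w = math.ceil(w / image_size) * image_size
        refined_h = math.ceil(h / image_size) * image_size
        # Tile the refined image into an image_size x image_size grid; the
        # min() guards mirror the C++ loop, though after rounding up every
        # tile ends up exactly image_size on each side.
        grid_x = math.ceil(refined_w / image_size)
        grid_y = math.ceil(refined_h / image_size)
        slices = [(x, y,
                   min(image_size, refined_w - x),
                   min(image_size, refined_h - y))
                  for y in range(0, refined_h, image_size)
                  for x in range(0, refined_w, image_size)]
        return (refined_w, refined_h), (grid_x, grid_y), slices

    # A 1000x750 page: refined to 2048x1536, a 4x3 grid of 512x512 slices,
    # plus the separate image_size x image_size overview ("global") image.
    print(idefics3_refine_and_tile(1000, 750))

The overview image corresponds to instructions.overview_size above: the whole input resized to image_size on both sides, encoded alongside the slices.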
