
Commit 644cb87

Merge branch 'ggml-org:master' into master
2 parents: 3782433 + c55d53a

File tree

7 files changed (+72, -15 lines)


convert_hf_to_gguf.py

Lines changed: 24 additions & 2 deletions
@@ -2460,18 +2460,21 @@ def set_gguf_parameters(self):
         )
 class LlavaVisionModel(MmprojModel):
     img_break_tok_id = -1
+    use_break_tok = True
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         if self.hparams.get("model_type") == "pixtral":
             # layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py
             self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5)
-            self.img_break_tok_id = self.get_token_id("[IMG_BREAK]")
+            if self.use_break_tok:
+                self.img_break_tok_id = self.get_token_id("[IMG_BREAK]")
         elif self.is_mistral_format:
             # hparams is already vision config here so norm_eps is only defined in global_config.
             self.hparams["norm_eps"] = self.global_config.get("norm_eps", None)
             assert self.hparams["norm_eps"] is not None, "norm_eps not found in params.json"
-            self.img_break_tok_id = self.find_vparam(["image_break_token_id"])
+            if self.use_break_tok:
+                self.img_break_tok_id = self.find_vparam(["image_break_token_id"])
         else:
             raise ValueError(f"Unsupported model type: {self.hparams['model_type']}")
         logger.info(f"Image break token id: {self.img_break_tok_id}")

@@ -3962,6 +3965,10 @@ def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor:
         return torch.stack([true_row, false_row], dim=0)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if "model.vision_" in name:
+            # skip multimodal tensors
+            return []
+
         if self.is_rerank:
             is_tied_head = self.is_tied_embeddings and "embed_tokens" in name
             is_real_head = not self.is_tied_embeddings and "lm_head" in name

@@ -9435,6 +9442,21 @@ def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", "
         return super().map_tensor_name(name, try_suffixes)
 
 
+@ModelBase.register("LightOnOCRForConditionalGeneration")
+class LightOnOCRVisionModel(LlavaVisionModel):
+    is_mistral_format = False
+    use_break_tok = False
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LIGHTONOCR)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        name = name.replace("model.vision_encoder.", "vision_tower.")
+        name = name.replace("model.vision_projection.", "multi_modal_projector.")
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("KimiVLForConditionalGeneration")
 class KimiVLModel(MmprojModel):
     def __init__(self, *args, **kwargs):
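The converter changes boil down to: reuse the Pixtral conversion path, disable the [IMG_BREAK] lookup via the new use_break_tok flag, and remap the checkpoint's tensor-name prefixes. A standalone C++ sketch of that prefix remapping (the helper and the example tensor name are illustrative, not converter code):

#include <cstdio>
#include <string>

// Rewrite a leading prefix of a tensor name, mirroring the two
// str.replace() calls in LightOnOCRVisionModel.modify_tensors above
// (anchored at the start, which is where these prefixes occur).
static std::string replace_prefix(std::string name,
                                  const std::string & from, const std::string & to) {
    if (name.rfind(from, 0) == 0) {
        name.replace(0, from.size(), to);
    }
    return name;
}

int main() {
    // hypothetical checkpoint tensor name, for illustration only
    std::string name = "model.vision_encoder.layers.0.attention.q_proj.weight";
    name = replace_prefix(name, "model.vision_encoder.",    "vision_tower.");
    name = replace_prefix(name, "model.vision_projection.", "multi_modal_projector.");
    std::printf("%s\n", name.c_str()); // vision_tower.layers.0.attention.q_proj.weight
    return 0;
}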

gguf-py/gguf/constants.py

Lines changed: 1 addition & 0 deletions
@@ -3062,6 +3062,7 @@ class VisionProjectorType:
     VOXTRAL = "voxtral"
     LFM2 = "lfm2"
     KIMIVL = "kimivl"
+    LIGHTONOCR = "lightonocr"
 
 
 # Items here are (block size, type size)

src/llama-model.cpp

Lines changed: 16 additions & 10 deletions
@@ -15,7 +15,6 @@
 
 #include <algorithm>
 #include <cassert>
-#include <cmath>
 #include <cfloat>
 #include <cstring>
 #include <cmath>

@@ -438,7 +437,7 @@ struct llama_model::impl {
     llama_mlocks mlock_mmaps;
 
     // contexts where the model tensors metadata is stored as well ass the corresponding buffers:
-    std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
+    std::vector<std::pair<ggml_context_ptr, std::vector<ggml_backend_buffer_ptr>>> ctxs_bufs;
 
     buft_list_t cpu_buft_list;
     std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;

@@ -6186,7 +6185,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
             bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
 
-            ggml_backend_buffer_t buf = nullptr;
+            std::vector<ggml_backend_buffer_ptr> bufs;
             if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
                 for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                     // only the mmap region containing the tensors in the model is mapped to the backend buffer

@@ -6199,15 +6198,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         continue;
                     }
                     const size_t max_size = ggml_get_max_tensor_size(ctx);
-                    buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
+                    ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
                     if (buf == nullptr) {
                         throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
                     }
+                    bufs.emplace_back(buf);
                     buf_map.emplace(idx, buf);
                 }
             }
             else {
-                buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+                ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
                 if (buf == nullptr) {
                     throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
                 }

@@ -6217,11 +6217,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     mlock_buf->init   (ggml_backend_buffer_get_base(buf));
                     mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
                 }
+                bufs.emplace_back(buf);
                 for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                     buf_map.emplace(idx, buf);
                 }
             }
-            pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), buf);
+            pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), std::move(bufs));
 
             for (auto & buf : buf_map) {
                 // indicate that this buffer contains weights

@@ -6247,8 +6248,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     }
 
     // print memory requirements per buffer type
-    for (auto & [_, buf] : pimpl->ctxs_bufs) {
-        LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+    for (auto & [_, bufs] : pimpl->ctxs_bufs) {
+        for (auto & buf : bufs) {
+            LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n",
+                __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+        }
     }
 
     // populate tensors_by_name

@@ -6300,8 +6304,10 @@ size_t llama_model::n_devices() const {
 
 std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, size_t> ret;
-    for (const auto & [_, buf] : pimpl->ctxs_bufs) {
-        ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+    for (const auto & [_, bufs] : pimpl->ctxs_bufs) {
+        for (const auto & buf : bufs) {
+            ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+        }
     }
     return ret;
 }
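The structural change here is that one ggml context can now own several backend buffers, one per mmap'd model file, instead of exactly one. A self-contained sketch of the new layout, with plain std types standing in for ggml_context_ptr and ggml_backend_buffer_ptr so it compiles on its own:

#include <cstddef>
#include <cstdio>
#include <memory>
#include <string>
#include <utility>
#include <vector>

// stand-in for ggml_backend_buffer_ptr: only a name and a size
struct buffer { std::string name; size_t size; };
using buffer_ptr = std::unique_ptr<buffer>;

int main() {
    // mirrors the new pimpl->ctxs_bufs shape: one context -> many buffers
    std::vector<std::pair<std::string, std::vector<buffer_ptr>>> ctxs_bufs;

    // a context whose tensors are mapped from two model files
    std::vector<buffer_ptr> file_bufs;
    file_bufs.emplace_back(new buffer{"CUDA0", 4096});
    file_bufs.emplace_back(new buffer{"CUDA0", 8192});
    ctxs_bufs.emplace_back("ctx0", std::move(file_bufs));

    // same shape as the updated logging and memory_breakdown loops:
    // an inner iteration over each context's buffers
    for (const auto & [ctx, bufs] : ctxs_bufs) {
        for (const auto & buf : bufs) {
            std::printf("%s: %s buffer size = %zu bytes\n",
                        ctx.c_str(), buf->name.c_str(), buf->size);
        }
    }
    return 0;
}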

tools/mtmd/clip-impl.h

Lines changed: 2 additions & 0 deletions
@@ -139,6 +139,7 @@ enum projector_type {
     PROJECTOR_TYPE_VOXTRAL,
     PROJECTOR_TYPE_LFM2,
     PROJECTOR_TYPE_KIMIVL,
+    PROJECTOR_TYPE_LIGHTONOCR,
     PROJECTOR_TYPE_UNKNOWN,
 };
 
@@ -161,6 +162,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_VOXTRAL,   "voxtral"},
     { PROJECTOR_TYPE_LFM2,      "lfm2"},
     { PROJECTOR_TYPE_KIMIVL,    "kimivl"},
+    { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
 };
 
 static projector_type clip_projector_type_from_string(const std::string & str) {

tools/mtmd/clip.cpp

Lines changed: 23 additions & 3 deletions
@@ -621,7 +621,7 @@ struct clip_graph {
         }
 
         // arrangement of the [IMG_BREAK] token
-        {
+        if (model.token_embd_img_break) {
             // not efficient, but works
             // the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows]
             // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension

@@ -2095,6 +2095,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                 res = graph.build_siglip();
             } break;
         case PROJECTOR_TYPE_PIXTRAL:
+        case PROJECTOR_TYPE_LIGHTONOCR:
             {
                 res = graph.build_pixtral();
             } break;

@@ -2380,6 +2381,7 @@ struct clip_model_loader {
                     get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
                 } break;
             case PROJECTOR_TYPE_PIXTRAL:
+            case PROJECTOR_TYPE_LIGHTONOCR:
                 {
                     hparams.rope_theta = 10000.0f;
                     hparams.warmup_image_size = hparams.patch_size * 8;

@@ -2722,6 +2724,15 @@ struct clip_model_loader {
                     model.mm_input_norm_w   = get_tensor(TN_MM_INP_NORM, false);
                     model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
                 } break;
+            case PROJECTOR_TYPE_LIGHTONOCR:
+                {
+                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
+                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
+                    model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
+                    model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
+                    model.mm_input_norm_w   = get_tensor(TN_MM_INP_NORM, false);
+                    model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
+                } break;
             case PROJECTOR_TYPE_ULTRAVOX:
                 {
                     model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));

@@ -3622,7 +3633,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         res_imgs->entries.push_back(std::move(img_f32));
         return true;
 
-    } else if (ctx->proj_type() == PROJECTOR_TYPE_PIXTRAL) {
+    } else if (ctx->proj_type() == PROJECTOR_TYPE_PIXTRAL
+            || ctx->proj_type() == PROJECTOR_TYPE_LIGHTONOCR
+    ) {
         clip_image_u8 resized_image;
         auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size);
         image_manipulation::bilinear_resize(*img, resized_image, new_size.width, new_size.height);

@@ -3865,12 +3878,17 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                 n_patches = x_patch * y_patch;
             } break;
         case PROJECTOR_TYPE_PIXTRAL:
+        case PROJECTOR_TYPE_LIGHTONOCR:
            {
                 // dynamic size
                 int n_merge = params.spatial_merge_size;
                 int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1);
                 int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1);
-                n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
+                if (ctx->model.token_embd_img_break) {
+                    n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
+                } else {
+                    n_patches = n_patches_y * n_patches_x;
+                }
             } break;
         case PROJECTOR_TYPE_VOXTRAL:
         case PROJECTOR_TYPE_ULTRAVOX:

@@ -4247,6 +4265,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             } break;
         case PROJECTOR_TYPE_PIXTRAL:
         case PROJECTOR_TYPE_KIMIVL:
+        case PROJECTOR_TYPE_LIGHTONOCR:
             {
                 // set the 2D positions
                 int n_patches_per_col = image_size_width / patch_size;

@@ -4377,6 +4396,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->model.mm_model_peg_0_b->ne[0];
         case PROJECTOR_TYPE_MLP:
         case PROJECTOR_TYPE_PIXTRAL:
+        case PROJECTOR_TYPE_LIGHTONOCR:
             return ctx->model.mm_2_w->ne[1];
         case PROJECTOR_TYPE_MLP_NORM:
             return ctx->model.mm_3_b->ne[0];
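The clip_n_output_tokens hunk is where LightOnOCR actually diverges from Pixtral at inference time: with no [IMG_BREAK] embedding, no extra token is counted per patch row. A self-contained sketch of that count, using made-up image dimensions:

#include <cstdio>

// Sketch of the token-count logic above; has_img_break mirrors
// ctx->model.token_embd_img_break being non-null.
static int n_output_tokens(int nx, int ny, int patch_size, int n_merge, bool has_img_break) {
    const int merge = n_merge > 0 ? n_merge : 1; // same guard as in the diff
    const int n_patches_x = nx / patch_size / merge;
    const int n_patches_y = ny / patch_size / merge;
    return has_img_break
        ? n_patches_y * n_patches_x + n_patches_y - 1 // one [IMG_BREAK] per row, except the last
        : n_patches_y * n_patches_x;
}

int main() {
    // hypothetical 1024x512 input, 16-pixel patches, 2x2 spatial merge
    std::printf("pixtral:    %d tokens\n", n_output_tokens(1024, 512, 16, 2, true));  // 527
    std::printf("lightonocr: %d tokens\n", n_output_tokens(1024, 512, 16, 2, false)); // 512
    return 0;
}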

tools/mtmd/mtmd.cpp

Lines changed: 5 additions & 0 deletions
@@ -275,6 +275,11 @@ struct mtmd_context {
             img_beg = "<img>";
             img_end = "</img>";
 
+        } else if (proj == PROJECTOR_TYPE_LIGHTONOCR) {
+            // <|im_start|> ... (image embeddings) ... <|im_end|>
+            img_beg = "<|im_start|>";
+            img_end = "<|im_end|>";
+
         }
     }
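For context, img_beg and img_end are the strings mtmd splices around the image-embedding slot when assembling a multimodal prompt. A toy illustration only (the prompt text is invented and real tokenization happens inside mtmd):

#include <cstdio>
#include <string>

int main() {
    // markers chosen for PROJECTOR_TYPE_LIGHTONOCR in the diff above
    const std::string img_beg = "<|im_start|>";
    const std::string img_end = "<|im_end|>";

    // the slot between the markers is replaced by image embeddings
    std::string prompt = "OCR this page:\n" + img_beg + "(image embeddings)" + img_end;
    std::printf("%s\n", prompt.c_str());
    return 0;
}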

tools/mtmd/tests.sh

Lines changed: 1 addition & 0 deletions
@@ -70,6 +70,7 @@ add_test_vision "ggml-org/InternVL3-1B-Instruct-GGUF:Q8_0"
 add_test_vision "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"
 add_test_vision "ggml-org/LFM2-VL-450M-GGUF:Q8_0"
 add_test_vision "ggml-org/granite-docling-258M-GGUF:Q8_0"
+add_test_vision "ggml-org/LightOnOCR-1B-1025-GGUF:Q8_0"
 
 add_test_audio  "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"
 add_test_audio  "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"
