Commit 366abe7

wip paddleocr

Parent: 4926419

3 files changed: 44 additions, 1 deletion

convert_hf_to_gguf.py (35 additions, 1 deletion)
@@ -3234,7 +3234,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         yield from super().modify_tensors(data_torch, name, bid)
 
 
-@ModelBase.register("Ernie4_5_ForCausalLM", "Ernie4_5ForCausalLM")
+@ModelBase.register("Ernie4_5_ForCausalLM", "Ernie4_5ForCausalLM", "PaddleOCRVLForConditionalGeneration")
 class Ernie4_5Model(TextModel):
     model_arch = gguf.MODEL_ARCH.ERNIE4_5
 
@@ -3250,6 +3250,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if (head_dim := self.hparams.get("head_dim")) is None:
             head_dim = self.hparams["hidden_size"] // num_heads
 
+        if "mlp_AR" in name or "vision_model" in name:
+            # skip vision model and projector tensors
+            return []
+
         if "ernie." in name:
             name = name.replace("ernie.", "model.")
         # split the qkv weights
@@ -3368,6 +3372,36 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("SiglipVisionModel")
+class PaddleOCRVisionModel(MmprojModel):
+    # PaddleOCR uses Siglip under the hood
+    min_pixels: int = 0
+    max_pixels: int = 0
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        self.min_pixels = self.preprocessor_config["size"]["min_pixels"]
+        self.max_pixels = self.preprocessor_config["size"]["max_pixels"]
+        self.hparams_vision["image_size"] = int(math.sqrt(self.max_pixels))
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        assert self.hparams_vision is not None
+        hparams = self.hparams_vision
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PADDLEOCR)
+        self.gguf_writer.add_vision_max_pixels(self.max_pixels)
+        self.gguf_writer.add_vision_min_pixels(self.min_pixels)
+        self.gguf_writer.add_vision_use_gelu(True)
+        self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("rms_norm_eps", 1e-6))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+        if "vision_model" in name or "mlp_AR" in name:
+            return [(self.map_tensor_name(name), data_torch)]
+        return []  # skip other tensors
+
+
 @ModelBase.register(
     "Qwen2VLModel",
     "Qwen2VLForConditionalGeneration",

gguf-py/gguf/constants.py (3 additions, 0 deletions)
@@ -265,6 +265,8 @@ class Clip:
 
 class ClipVision:
     IMAGE_SIZE = "clip.vision.image_size"
+    MAX_PIXELS = "clip.vision.max_pixels"
+    MIN_PIXELS = "clip.vision.min_pixels"
     PREPROC_IMAGE_SIZE = "clip.vision.preproc_image_size"
     PATCH_SIZE = "clip.vision.patch_size"
     EMBEDDING_LENGTH = "clip.vision.embedding_length"
@@ -3062,6 +3064,7 @@ class VisionProjectorType:
     VOXTRAL = "voxtral"
    LFM2 = "lfm2"
     KIMIVL = "kimivl"
+    PADDLEOCR = "paddleocr"
 
 
 # Items here are (block size, type size)
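
Once a converted mmproj file has been written, the two new keys can be checked with gguf-py's GGUFReader. A minimal sketch; the filename is hypothetical, and the parts[data[0]] indexing follows gguf-py's ReaderField convention for scalar values:

from gguf import GGUFReader

reader = GGUFReader("mmproj-paddleocr.gguf")  # hypothetical path

for key in ("clip.vision.min_pixels", "clip.vision.max_pixels"):
    field = reader.fields.get(key)
    if field is None:
        print(f"{key}: missing")
        continue
    # for a scalar field, data holds the index of the value part
    print(f"{key} = {field.parts[field.data[0]][0]}")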

gguf-py/gguf/gguf_writer.py (6 additions, 0 deletions)
@@ -1029,6 +1029,12 @@ def add_vision_projection_dim(self, value: int) -> None:
     def add_vision_patch_size(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.PATCH_SIZE, value)
 
+    def add_vision_max_pixels(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.MAX_PIXELS, value)
+
+    def add_vision_min_pixels(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.MIN_PIXELS, value)
+
     def add_vision_embedding_length(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.EMBEDDING_LENGTH, value)
 
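
The new writer helpers are thin wrappers over add_uint32, so they can also be exercised standalone. A sketch, assuming an illustrative output path, arch, and pixel budget (the converter normally calls these from set_gguf_parameters):

from gguf import GGUFWriter

writer = GGUFWriter("test-vision-kv.gguf", arch="clip")  # illustrative path and arch
writer.add_vision_max_pixels(1680 * 1680)  # illustrative budget values
writer.add_vision_min_pixels(28 * 28)
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.close()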
