
Commit 6854ad4

img pre processing
1 parent a75c5c4 commit 6854ad4

6 files changed: +564 additions, -26 deletions


convert_hf_to_gguf.py

Lines changed: 5 additions & 0 deletions
@@ -1584,8 +1584,13 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_vision_clip_feed_forward_length(self.vparams["intermediate_size"])
         self.gguf_writer.add_vision_clip_head_count(self.vparams["num_attention_heads"])
         # TODO: should not hardcode these, but they are currently missing from config.json
+        self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.MLP)
         self.gguf_writer.add_vision_clip_max_position_embeddings(577)
         self.gguf_writer.add_vision_clip_layer_norm_epsilon(1e-05)
+        default_image_mean = [0.48145466, 0.4578275, 0.40821073]
+        default_image_std = [0.26862954, 0.26130258, 0.27577711]
+        self.gguf_writer.add_vision_clip_image_mean(default_image_mean)
+        self.gguf_writer.add_vision_clip_image_std(default_image_std)

     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
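
Note: the hardcoded mean/std values above are the standard OpenAI CLIP normalization constants. As a rough sketch (not part of this commit), a consumer of this metadata would normalize each RGB channel after resizing the image and scaling pixels to [0, 1]; the helper name below is hypothetical:

import numpy as np

def normalize_image(pixels,
                    mean=(0.48145466, 0.4578275, 0.40821073),
                    std=(0.26862954, 0.26130258, 0.27577711)):
    # pixels: HxWx3 uint8 RGB array, already resized to the model's input resolution
    x = pixels.astype(np.float32) / 255.0  # scale to [0, 1]
    # per-channel normalization using the mean/std stored in the GGUF metadata
    return (x - np.array(mean, dtype=np.float32)) / np.array(std, dtype=np.float32)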

gguf-py/gguf/constants.py

Lines changed: 5 additions & 0 deletions
@@ -196,6 +196,7 @@ class Clip:
             PROJECTION_DIM = "vision.clip.projection_dim"
             USE_GELU = "vision.clip.use_gelu"
             MAX_POS_EMBEDDING = "vision.clip.max_position_embeddings"
+            PROJECTOR_TYPE = "vision.clip.projector_type"
             HEAD_COUNT = "vision.clip.attention.head_count"
             LAYERNORM_EPS = "vision.clip.attention.layer_norm_epsilon"

@@ -1425,6 +1426,10 @@ class PoolingType(IntEnum):
     CLS = 2


+class CLIPProjectorType(Enum):
+    MLP = 'mlp'
+
+
 class GGMLQuantizationType(IntEnum):
     F32 = 0
     F16 = 1
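
Note: since the projector type is serialized as its string value (see gguf_writer.py below), a loader can map the stored string back onto the enum. A minimal illustration, assuming the gguf-py package layout used in this commit:

from gguf.constants import CLIPProjectorType

projector = CLIPProjectorType("mlp")  # string from the GGUF KV store -> CLIPProjectorType.MLP
assert projector is CLIPProjectorType.MLP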

gguf-py/gguf/gguf_writer.py

Lines changed: 10 additions & 0 deletions
@@ -26,6 +26,7 @@
     RopeScalingType,
     PoolingType,
     TokenType,
+    CLIPProjectorType,
 )

 from .quants import quant_shape_from_byte_shape

@@ -844,9 +845,18 @@ def add_vision_clip_head_count(self, value: int) -> None:
     def add_vision_clip_max_position_embeddings(self, value: int) -> None:
         self.add_uint32(Keys.Vision.Clip.MAX_POS_EMBEDDING, value)

+    def add_vision_clip_projector_type(self, value: CLIPProjectorType) -> None:
+        self.add_string(Keys.Vision.Clip.PROJECTOR_TYPE, value.value)
+
     def add_vision_clip_layer_norm_epsilon(self, value: float) -> None:
         self.add_float32(Keys.Vision.Clip.LAYERNORM_EPS, value)

+    def add_vision_clip_image_mean(self, value: Sequence[float]) -> None:
+        self.add_array(Keys.Vision.IMAGE_MEAN, value)
+
+    def add_vision_clip_image_std(self, value: Sequence[float]) -> None:
+        self.add_array(Keys.Vision.IMAGE_STD, value)
+
     def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
         if not isinstance(value, str):
             template_default = None
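
Note: for reference, a minimal write-side sketch combining the new setters; the output filename and architecture string are illustrative only, and the real call sites are in convert_hf_to_gguf.py above (a full conversion would also write the header and tensor data):

from gguf import GGUFWriter
from gguf.constants import CLIPProjectorType

writer = GGUFWriter("model.gguf", arch="llama")
writer.add_vision_clip_projector_type(CLIPProjectorType.MLP)  # stored as the string "mlp"
writer.add_vision_clip_image_mean([0.48145466, 0.4578275, 0.40821073])
writer.add_vision_clip_image_std([0.26862954, 0.26130258, 0.27577711])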
