Skip to content

Commit f60cc7e

Browse files
committed
scale_factor is an int
1 parent bee5200 commit f60cc7e

File tree

3 files changed: 7 additions and 7 deletions.

3 files changed

+7
-7
lines changed

convert_hf_to_gguf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1901,7 +1901,7 @@ def set_gguf_parameters(self):
19011901
super().set_gguf_parameters()
19021902
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.IDEFICS3)
19031903
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
1904-
self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2.0))
1904+
self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2))
19051905
self.gguf_writer.add_vision_use_gelu(True)
19061906

19071907
def tensor_force_quant(self, name, new_name, bid, n_dims):

examples/llava/clip.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -159,11 +159,11 @@ struct clip_hparams {
159159
int32_t projection_dim;
160160
int32_t n_head;
161161
int32_t n_layer;
162+
int32_t proj_scale_factor = 0; // idefics3
162163

163164
patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT;
164165

165166
float eps;
166-
float proj_scale_factor = 0.0; // idefics3
167167

168168
std::vector<int32_t> image_grid_pinpoints;
169169
int32_t image_crop_resolution;
@@ -518,7 +518,7 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
518518
const int bsz = 1; // batch size, always 1 for now since we don't support batching
519519
const int height = std::sqrt(seq);
520520
const int width = std::sqrt(seq);
521-
GGML_ASSERT(scale_factor != 0.0);
521+
GGML_ASSERT(scale_factor != 0);
522522
cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz);
523523
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
524524
cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
@@ -1277,7 +1277,7 @@ struct clip_model_loader {
12771277
switch (ctx_clip.proj_type) {
12781278
case PROJECTOR_TYPE_IDEFICS3:
12791279
{
1280-
get_f32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
1280+
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
12811281
} break;
12821282
default:
12831283
break;
@@ -2386,7 +2386,7 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i
23862386
} else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
23872387
n_patches = 256;
23882388
} else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
2389-
n_patches /= (int)ctx->vision_model.hparams.proj_scale_factor;
2389+
n_patches /= ctx->vision_model.hparams.proj_scale_factor;
23902390
}
23912391

23922392
return n_patches;

gguf-py/gguf/gguf_writer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -975,8 +975,8 @@ def add_vision_use_gelu(self, value: bool) -> None:
975975
def add_vision_use_silu(self, value: bool) -> None:
976976
self.add_bool(Keys.ClipVision.USE_SILU, value)
977977

978-
def add_vision_projector_scale_factor(self, value: float) -> None:
979-
self.add_float32(Keys.ClipVision.Projector.SCALE_FACTOR, value)
978+
def add_vision_projector_scale_factor(self, value: int) -> None:
979+
self.add_uint32(Keys.ClipVision.Projector.SCALE_FACTOR, value)
980980

981981
def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
982982
pack_prefix = ''

0 commit comments

Comments
 (0)