
Commit 1434730

reverting change to legacy-models script

committed · 1 parent ba079d3 · commit 1434730

File tree

1 file changed: +9 additions, −18 deletions


tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py

Lines changed: 9 additions & 18 deletions
@@ -6,10 +6,6 @@
 import torch
 import numpy as np
 from gguf import *
-from typing import cast
-from torch.nn import ModuleList
-from transformers.models.clip.modeling_clip import CLIPVisionTransformer
-from transformers import PreTrainedModel
 from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel, SiglipVisionModel

 TEXT = "clip.text"
@@ -166,13 +162,13 @@ def bytes_to_unicode():
     ftype = 0

 if args.clip_model_is_siglip:
-    model: PreTrainedModel = SiglipVisionModel.from_pretrained(dir_model)
+    model = SiglipVisionModel.from_pretrained(dir_model)
     processor = None
 elif args.clip_model_is_vision or args.clip_model_is_openclip:
-    model: PreTrainedModel = CLIPVisionModel.from_pretrained(dir_model)
+    model = CLIPVisionModel.from_pretrained(dir_model)
     processor = None
 else:
-    model: PreTrainedModel = CLIPModel.from_pretrained(dir_model)
+    model = CLIPModel.from_pretrained(dir_model)
     processor = CLIPProcessor.from_pretrained(dir_model)

 fname_middle = None
@@ -354,14 +350,9 @@ def get_non_negative_vision_feature_layers(v_hparams):
     # By default, we drop the last layer for llava projector
     # models unless we have explicitly set vision feature layers
     if feature_layers is None:
-        vision_model = cast(CLIPVisionTransformer, model.vision_model)
-        encoder_layers = vision_model.encoder.layers
-        encoder_layers.pop(-1)
+        model.vision_model.encoder.layers.pop(-1)
     else:
-        vision_model = cast(CLIPVisionTransformer, model.vision_model)
-        encoder_layers = vision_model.encoder.layers
-        encoder_layers = cast(ModuleList, encoder_layers)
-        encoder_layers.__init__(encoder_layers[:max(feature_layers)])
+        model.vision_model.encoder.layers = model.vision_model.encoder.layers[:max(feature_layers)]

     projector = torch.load(args.llava_projector)
     for name, data in projector.items():
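As a sanity check on the reverted one-liners in this hunk, a minimal sketch (assuming only PyTorch; `ToyEncoder` is a hypothetical stand-in for the CLIP/SigLIP vision encoder): `layers` is an `nn.ModuleList`, which can be truncated with `pop(-1)` in recent PyTorch releases and returns another `ModuleList` when sliced, so plain reassignment keeps only the requested feature layers:

```python
# Hypothetical sketch, not from the repository.
import torch.nn as nn

class ToyEncoder(nn.Module):
    """Stand-in for model.vision_model.encoder in the real CLIP/SigLIP models."""
    def __init__(self, n_layers: int):
        super().__init__()
        self.layers = nn.ModuleList(nn.Linear(4, 4) for _ in range(n_layers))

encoder = ToyEncoder(12)

# Drop the last block (the default llava-projector path).
# ModuleList.pop is available in recent PyTorch; `del encoder.layers[-1]` is equivalent.
encoder.layers.pop(-1)
assert len(encoder.layers) == 11

# Keep only the blocks up to the highest requested feature layer.
feature_layers = [3, 7]  # example values
# Slicing an nn.ModuleList returns a new nn.ModuleList, so reassignment just works.
encoder.layers = encoder.layers[:max(feature_layers)]
assert len(encoder.layers) == 7
```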
@@ -384,24 +375,24 @@ def get_non_negative_vision_feature_layers(v_hparams):
         continue

     name = get_tensor_name(name)
-    data = np.ascontiguousarray(data.detach().cpu().squeeze().numpy())
+    data = data.squeeze().numpy()

     n_dims = len(data.shape)

     # ftype == 0 -> float32, ftype == 1 -> float16
     ftype_cur = 0
     if n_dims == 4:
         print(f"tensor {name} is always saved in f16")
-        data = np.asarray(data, dtype=np.float16)
+        data = data.astype(np.float16)
         ftype_cur = 1
     elif ftype == 1:
         if name[-7:] == ".weight" and n_dims == 2:
             print("  Converting to float16")
-            data = np.asarray(data, dtype=np.float16)
+            data = data.astype(np.float16)
             ftype_cur = 1
         else:
             print("  Converting to float32")
-            data = np.asarray(data, dtype=np.float32)
+            data = data.astype(np.float32)
             ftype_cur = 0
     else:
         if data.dtype != np.float32:
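A minimal sketch of the dtype rule the loop above applies after the revert (not from the script; `convert_dtype` is a hypothetical helper and the tensor name is made up): 4-D tensors are always stored as f16, 2-D `.weight` tensors become f16 when f16 output was requested, and everything else falls back to f32:

```python
# Hypothetical sketch, not from the repository.
import numpy as np

def convert_dtype(name: str, data: np.ndarray, ftype: int) -> tuple[np.ndarray, int]:
    """Mirror the selection rule: returns (converted tensor, ftype_cur)."""
    n_dims = data.ndim
    if n_dims == 4:
        # 4-D tensors (e.g. patch-embedding kernels) are always saved as f16.
        return data.astype(np.float16), 1
    if ftype == 1 and name.endswith(".weight") and n_dims == 2:
        return data.astype(np.float16), 1
    return data.astype(np.float32), 0

# Example: a 2-D projection weight becomes f16 when f16 output (ftype == 1) is requested.
w = np.zeros((8, 8), dtype=np.float32)
converted, ftype_cur = convert_dtype("mm.0.weight", w, ftype=1)
assert converted.dtype == np.float16 and ftype_cur == 1
```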

0 commit comments
