@@ -6,10 +6,6 @@
 import torch
 import numpy as np
 from gguf import *
-from typing import cast
-from torch.nn import ModuleList
-from transformers.models.clip.modeling_clip import CLIPVisionTransformer
-from transformers import PreTrainedModel
 from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel, SiglipVisionModel

 TEXT = "clip.text"
@@ -166,13 +162,13 @@ def bytes_to_unicode():
     ftype = 0

 if args.clip_model_is_siglip:
-    model: PreTrainedModel = SiglipVisionModel.from_pretrained(dir_model)
+    model = SiglipVisionModel.from_pretrained(dir_model)
     processor = None
 elif args.clip_model_is_vision or args.clip_model_is_openclip:
-    model: PreTrainedModel = CLIPVisionModel.from_pretrained(dir_model)
+    model = CLIPVisionModel.from_pretrained(dir_model)
     processor = None
 else:
-    model: PreTrainedModel = CLIPModel.from_pretrained(dir_model)
+    model = CLIPModel.from_pretrained(dir_model)
     processor = CLIPProcessor.from_pretrained(dir_model)

 fname_middle = None
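
Side note on the dropped annotations: `from_pretrained` returns the concrete class it is called on, so the object is already a `SiglipVisionModel`, `CLIPVisionModel`, or `CLIPModel`, and its vision tower is reachable without a `cast`. A minimal sketch (the checkpoint name is illustrative):

```python
from transformers import CLIPModel

# from_pretrained returns the concrete subclass, not a bare PreTrainedModel,
# so the vision tower and its encoder layers are directly accessible.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
print(type(model).__name__)                    # CLIPModel
print(len(model.vision_model.encoder.layers))  # 12 transformer blocks
```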
@@ -354,14 +350,9 @@ def get_non_negative_vision_feature_layers(v_hparams):
     # By default, we drop the last layer for llava projector
     # models unless we have explicitly set vision feature layers
     if feature_layers is None:
-        vision_model = cast(CLIPVisionTransformer, model.vision_model)
-        encoder_layers = vision_model.encoder.layers
-        encoder_layers.pop(-1)
+        model.vision_model.encoder.layers.pop(-1)
     else:
-        vision_model = cast(CLIPVisionTransformer, model.vision_model)
-        encoder_layers = vision_model.encoder.layers
-        encoder_layers = cast(ModuleList, encoder_layers)
-        encoder_layers.__init__(encoder_layers[:max(feature_layers)])
+        model.vision_model.encoder.layers = model.vision_model.encoder.layers[:max(feature_layers)]

     projector = torch.load(args.llava_projector)
     for name, data in projector.items():
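
The one-liners above rely on `nn.ModuleList` behavior: `pop(-1)` removes the last block, and slicing returns a new `ModuleList`, so reassigning the attribute keeps the truncated stack registered as a submodule. A toy sketch with hypothetical layer counts and feature layers:

```python
import torch.nn as nn

encoder_layers = nn.ModuleList(nn.Linear(4, 4) for _ in range(12))

# Default llava-projector case: drop only the last layer.
encoder_layers.pop(-1)
print(len(encoder_layers))  # 11

# Explicit feature layers: keep everything up to the deepest one.
feature_layers = [3, 7, 10]  # hypothetical values
encoder_layers = encoder_layers[:max(feature_layers)]  # slice -> new ModuleList
print(len(encoder_layers))  # 10
```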
@@ -384,24 +375,24 @@ def get_non_negative_vision_feature_layers(v_hparams):
         continue

     name = get_tensor_name(name)
-    data = np.ascontiguousarray(data.detach().cpu().squeeze().numpy())
+    data = data.squeeze().numpy()

     n_dims = len(data.shape)

     # ftype == 0 -> float32, ftype == 1 -> float16
     ftype_cur = 0
     if n_dims == 4:
         print(f"tensor {name} is always saved in f16")
-        data = np.asarray(data, dtype=np.float16)
+        data = data.astype(np.float16)
         ftype_cur = 1
     elif ftype == 1:
         if name[-7:] == ".weight" and n_dims == 2:
             print("  Converting to float16")
-            data = np.asarray(data, dtype=np.float16)
+            data = data.astype(np.float16)
             ftype_cur = 1
         else:
             print("  Converting to float32")
-            data = np.asarray(data, dtype=np.float32)
+            data = data.astype(np.float32)
             ftype_cur = 0
     else:
         if data.dtype != np.float32:
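
For reference, the dtype rule implemented by this loop, restated as a standalone helper (a sketch only; `convert_for_gguf` is not a function in the script):

```python
import numpy as np

def convert_for_gguf(name: str, data: np.ndarray, ftype: int):
    """Mirror the decision above: 4-D tensors are always stored as f16;
    with ftype == 1 only 2-D .weight tensors are halved; everything else,
    including all tensors in f32 mode, is kept or upcast to f32."""
    if data.ndim == 4:
        return data.astype(np.float16), 1  # ftype_cur = 1
    if ftype == 1 and name.endswith(".weight") and data.ndim == 2:
        return data.astype(np.float16), 1
    return data.astype(np.float32), 0      # ftype_cur = 0
```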