55from .hf_vision import HFVisionTower
66from .siglip_encoder import SigLipVisionTower
77from .clip_encoder import CLIPVisionTower , CLIPVisionTowerS2
8+ from .mlcd_encoder import MLCDVisionTower , MLCDVisionTowerS2
89
910# from .eva_clip.eva_clip_encoder import EvaClipVisionTower
1011# from .dev_eva_clip.eva_vit import EvaViTWrapper
@@ -14,12 +15,20 @@ def build_vision_tower(vision_tower_cfg, **kwargs):
1415 vision_tower = getattr (vision_tower_cfg , "mm_vision_tower" , getattr (vision_tower_cfg , "vision_tower" , None ))
1516 is_absolute_path_exists = os .path .exists (vision_tower )
1617 use_s2 = getattr (vision_tower_cfg , "s2" , False )
17- if "clip" in vision_tower or "mlcd" in vision_tower or "unicom" in vision_tower or vision_tower .startswith ("openai" ) \
18+
19+ if "mlcd-vit-bigG-patch14-336" in vision_tower :
20+ if use_s2 :
21+ return MLCDVisionTowerS2 (vision_tower , args = vision_tower_cfg , ** kwargs )
22+ else :
23+ return MLCDVisionTower (vision_tower , args = vision_tower_cfg , ** kwargs )
24+
25+ elif "clip" in vision_tower or "mlcd" in vision_tower or "unicom" in vision_tower or vision_tower .startswith ("openai" ) \
1826 or vision_tower .startswith ("laion" ) or "ShareGPT4V" in vision_tower or vision_tower .startswith ("DeepGlint" ):
1927 if use_s2 :
2028 return CLIPVisionTowerS2 (vision_tower , args = vision_tower_cfg , ** kwargs )
2129 else :
2230 return CLIPVisionTower (vision_tower , args = vision_tower_cfg , ** kwargs )
31+
2332 elif "siglip" in vision_tower :
2433 return SigLipVisionTower (vision_tower , vision_tower_cfg = vision_tower_cfg , ** kwargs )
2534 elif vision_tower .startswith ("hf:" ):
0 commit comments