comfyanonymous · yousef-rafat · Sep 5, 2025 · Sep 5, 2025 · Sep 5, 2025 · Sep 5, 2025
@@ -122,9 +122,10 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
         json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_g.json")
     elif "vision_model.encoder.layers.30.layer_norm1.weight" in sd:
         json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_h.json")
-    elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd:
+    elif "vision_model.encoder.layers.11.layer_norm1.weight" in sd:
         embed_shape = sd["vision_model.embeddings.position_embedding.weight"].shape[0]
-        if sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0] == 1152:
+        norm_weight = sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0]
+        if norm_weight == 1152:
             if embed_shape == 729:
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
             elif embed_shape == 1024:
@@ -134,6 +135,8 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336_llava.json")
             else:
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json")
+        elif embed_shape == 1024 and norm_weight == 768:
+            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip2_base_512.json")
         else:
             json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json")
 

@@ -0,0 +1,14 @@
+{
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 768,
+    "image_size": 512,
+    "intermediate_size": 3072,
+    "layer_norm_eps": 1e-06,
+    "model_type": "siglip_vision_model",
+    "num_attention_heads": 12,
+    "num_channels": 3,
+    "num_hidden_layers": 12,
+    "patch_size": 16,
+    "image_mean": [0.5, 0.5, 0.5],
+    "image_std": [0.5, 0.5, 0.5]
+}
@@ -630,6 +630,10 @@ class ACEAudio(LatentFormat):
     latent_channels = 8
     latent_dimensions = 2
 
+class HunyuanFoley(LatentFormat):  # latent format settings for HunyuanFoley
+    latent_dimensions = 128  # NOTE(review): sibling ACEAudio sets latent_dimensions = 2; confirm 128 is intended here
+    latent_channels = 1024
+
 class ChromaRadiance(LatentFormat):
     latent_channels = 3