Commit 0ca2aeb

Qwen2-VL vision support (#317)

1 parent cf05733 commit 0ca2aeb

File tree

1 file changed: +91, -1 lines

loader.py

Lines changed: 91 additions & 1 deletion
@@ -3,12 +3,14 @@
 import logging
 import torch
 import gguf
+import os

 from .ops import GGMLTensor
 from .dequant import is_quantized, dequantize_tensor

 IMG_ARCH_LIST = {"flux", "sd1", "sdxl", "sd3", "aura", "hidream", "cosmos", "ltxv", "hyvid", "wan", "lumina2", "qwen_image"}
 TXT_ARCH_LIST = {"t5", "t5encoder", "llama", "qwen2vl"}
+VIS_TYPE_LIST = {"clip-vision"}

 def get_orig_shape(reader, tensor_name):
     field_key = f"comfy.gguf.orig_shape.{tensor_name}"
@@ -70,6 +72,7 @@ def gguf_sd_loader(path, handle_prefix="model.diffusion_model.", return_arch=Fal
     # detect and verify architecture
     compat = None
     arch_str = get_field(reader, "general.architecture", str)
+    type_str = get_field(reader, "general.type", str)
     if arch_str in [None, "pig"]:
         if is_text_model:
             raise ValueError(f"This text model is incompatible with llama.cpp!\nConsider using the safetensors version\n({path})")
@@ -81,7 +84,8 @@ def gguf_sd_loader(path, handle_prefix="model.diffusion_model.", return_arch=Fal
         except Exception as e:
             raise ValueError(f"This model is not currently supported - ({e})")
     elif arch_str not in TXT_ARCH_LIST and is_text_model:
-        raise ValueError(f"Unexpected text model architecture type in GGUF file: {arch_str!r}")
+        if type_str not in VIS_TYPE_LIST:
+            raise ValueError(f"Unexpected text model architecture type in GGUF file: {arch_str!r}")
     elif arch_str not in IMG_ARCH_LIST and not is_text_model:
         raise ValueError(f"Unexpected architecture type in GGUF file: {arch_str!r}")

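To see what the relaxed gate admits, a minimal standalone sketch (the metadata values are hypothetical; real mmproj files may report different architecture strings):

# Hypothetical GGUF metadata for an mmproj file: the architecture string is
# unknown to TXT_ARCH_LIST, but general.type marks it as a vision model.
TXT_ARCH_LIST = {"t5", "t5encoder", "llama", "qwen2vl"}
VIS_TYPE_LIST = {"clip-vision"}

arch_str, type_str, is_text_model = "clip", "clip-vision", True

if arch_str not in TXT_ARCH_LIST and is_text_model:
    if type_str not in VIS_TYPE_LIST:
        raise ValueError(f"Unexpected text model architecture type in GGUF file: {arch_str!r}")
# No error is raised, so the file proceeds through normal tensor loading.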
@@ -165,6 +169,19 @@ def gguf_sd_loader(path, handle_prefix="model.diffusion_model.", return_arch=Fal
     "output.weight": "lm_head.weight",
 }

+CLIP_VISION_SD_MAP = {
+    "mm.": "visual.merger.mlp.",
+    "v.post_ln.": "visual.merger.ln_q.",
+    "v.patch_embd": "visual.patch_embed.proj",
+    "v.blk.": "visual.blocks.",
+    "ffn_up": "mlp.up_proj",
+    "ffn_down": "mlp.down_proj",
+    "ffn_gate": "mlp.gate_proj",
+    "attn_out.": "attn.proj.",
+    "ln1.": "norm1.",
+    "ln2.": "norm2.",
+}
+
 def sd_map_replace(raw_sd, key_map):
     sd = {}
     for k,v in raw_sd.items():
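For reference, the renaming this map is meant to drive, assuming sd_map_replace applies each pair as a plain substring replacement (its full body is not shown in this hunk; tensor names below are illustrative):

# Assumed behavior: each (old, new) pair in the map is applied as a
# substring replacement on every tensor name.
demo_map = {
    "v.blk.": "visual.blocks.",
    "attn_out.": "attn.proj.",
    "ln1.": "norm1.",
}

def demo_replace(name, key_map):
    for old, new in key_map.items():
        name = name.replace(old, new)
    return name

print(demo_replace("v.blk.0.attn_out.weight", demo_map))  # visual.blocks.0.attn.proj.weight
print(demo_replace("v.blk.3.ln1.bias", demo_map))         # visual.blocks.3.norm1.bias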
@@ -185,6 +202,76 @@ def llama_permute(raw_sd, n_head, n_head_kv):
         sd[k] = v
     return sd

+def gguf_mmproj_loader(path):
+    # Reverse version of Qwen2VLVisionModel.modify_tensors
+    logging.info("Attempting to find mmproj file for text encoder...")
+
+    # get name to match w/o quant suffix
+    tenc_fname = os.path.basename(path)
+    tenc = os.path.splitext(tenc_fname)[0].lower()
+    for q in [x.name for x in gguf.GGMLQuantizationType]:
+        if q.lower() in tenc:
+            tenc = tenc.rsplit(q.lower(), 1)[0]
+            break
+    tenc = tenc[:-1] # drop trailing dash/underscore/etc
+
+    # try to find a matching mmproj file
+    target = []
+    root = os.path.dirname(path)
+    for fname in os.listdir(root):
+        name, ext = os.path.splitext(fname)
+        if ext.lower() != ".gguf":
+            continue
+        if "mmproj" not in name.lower():
+            continue
+        if tenc in name.lower():
+            target.append(fname)
+
+    if len(target) == 0:
+        logging.error(f"Error: Can't find mmproj file for '{tenc_fname}'! Qwen-Image-Edit will be broken!")
+        return {}
+    if len(target) > 1:
+        logging.error(f"Ambiguous mmproj for text encoder '{tenc_fname}', will use first match.")
+
+    logging.info(f"Using mmproj '{target[0]}' for text encoder '{tenc_fname}'.")
+    target = os.path.join(root, target[0])
+    vsd = gguf_sd_loader(target, is_text_model=True)
+
+    # stack the two 4D patch embedding weights into one 5D tensor
+    if "v.patch_embd.weight.1" in vsd:
+        w1 = dequantize_tensor(vsd.pop("v.patch_embd.weight"), dtype=torch.float32)
+        w2 = dequantize_tensor(vsd.pop("v.patch_embd.weight.1"), dtype=torch.float32)
+        vsd["v.patch_embd.weight"] = torch.stack([w1, w2], dim=2)
+
+    # run main replacement
+    vsd = sd_map_replace(vsd, CLIP_VISION_SD_MAP)
+
+    # handle split Q/K/V
+    if "visual.blocks.0.attn_q.weight" in vsd:
+        attns = {}
+        # filter out attentions + group
+        for k,v in vsd.items():
+            if any(x in k for x in ["attn_q", "attn_k", "attn_v"]):
+                k_attn, k_name = k.rsplit(".attn_", 1)
+                k_attn += ".attn.qkv." + k_name.split(".")[-1]
+                if k_attn not in attns:
+                    attns[k_attn] = {}
+                attns[k_attn][k_name] = dequantize_tensor(
+                    v, dtype=(torch.bfloat16 if is_quantized(v) else torch.float16)
+                )
+
+        # recombine
+        for k,v in attns.items():
+            suffix = k.split(".")[-1]
+            vsd[k] = torch.cat([
+                v[f"q.{suffix}"],
+                v[f"k.{suffix}"],
+                v[f"v.{suffix}"],
+            ], dim=0)
+        del attns
+
+    return vsd
+
 def gguf_tokenizer_loader(path, temb_shape):
     # convert gguf tokenizer to spiece
     logging.info("Attempting to recreate sentencepiece tokenizer from GGUF file metadata...")
@@ -254,6 +341,9 @@ def gguf_clip_loader(path):
         sd = sd_map_replace(sd, LLAMA_SD_MAP)
         if arch == "llama":
             sd = llama_permute(sd, 32, 8) # L3
+        if arch == "qwen2vl":
+            vsd = gguf_mmproj_loader(path)
+            sd.update(vsd)
     else:
         pass
     return sd
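Hypothetical end-to-end usage after this commit (the path is illustrative and assumes a matching mmproj-*.gguf sits in the same folder):

sd = gguf_clip_loader("models/clip/qwen2-vl-text-encoder-Q8_0.gguf")
# The returned state dict now also carries the vision tower, i.e. keys
# prefixed with "visual." merged in from the mmproj file.
print(any(key.startswith("visual.") for key in sd))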
