
Commit 1e098d6

Don't add template to qwen2.5vl when template is in prompt. (#10043)
Make the hunyuan image refiner template_end 36.
Parent commit: cd66d72

File tree (2 files changed, +36 -20 lines):
    comfy/text_encoders/hunyuan_image.py
    comfy/text_encoders/qwen_image.py

comfy/text_encoders/hunyuan_image.py (7 additions, 1 deletion)

@@ -63,7 +63,13 @@ def __init__(self, byt5=True, device="cpu", dtype=None, model_options={}):
             self.byt5_small = None
 
     def encode_token_weights(self, token_weight_pairs):
-        cond, p, extra = super().encode_token_weights(token_weight_pairs)
+        tok_pairs = token_weight_pairs["qwen25_7b"][0]
+        template_end = -1
+        if tok_pairs[0][0] == 27:
+            if len(tok_pairs) > 36:  # refiner prompt uses a fixed 36 template_end
+                template_end = 36
+
+        cond, p, extra = super().encode_token_weights(token_weight_pairs, template_end=template_end)
         if self.byt5_small is not None and "byt5" in token_weight_pairs:
             out = self.byt5_small.encode_token_weights(token_weight_pairs["byt5"])
            extra["conditioning_byt5small"] = out[0]

comfy/text_encoders/qwen_image.py (29 additions, 19 deletions)

@@ -18,13 +18,22 @@ def __init__(self, embedding_directory=None, tokenizer_data={}):
         self.llama_template_images = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n"
 
     def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], **kwargs):
-        if llama_template is None:
-            if len(images) > 0:
-                llama_text = self.llama_template_images.format(text)
-            else:
-                llama_text = self.llama_template.format(text)
+        skip_template = False
+        if text.startswith('<|im_start|>'):
+            skip_template = True
+        if text.startswith('<|start_header_id|>'):
+            skip_template = True
+
+        if skip_template:
+            llama_text = text
         else:
-            llama_text = llama_template.format(text)
+            if llama_template is None:
+                if len(images) > 0:
+                    llama_text = self.llama_template_images.format(text)
+                else:
+                    llama_text = self.llama_template.format(text)
+            else:
+                llama_text = llama_template.format(text)
         tokens = super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, disable_weights=True, **kwargs)
         key_name = next(iter(tokens))
         embed_count = 0
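
In plain terms, a prompt that already begins with "<|im_start|>" (ChatML/Qwen style) or "<|start_header_id|>" (Llama 3 style) is now tokenized verbatim instead of being wrapped in the default template again, which is what the commit title means by "when template is in prompt". A minimal sketch of that decision in isolation; the function name and argument layout are mine, only the two prefixes and the template fallback order come from the diff:

def resolve_llama_text(text, default_template, image_template, llama_template=None, images=()):
    # A prompt that already carries its own chat template is passed through untouched.
    if text.startswith("<|im_start|>") or text.startswith("<|start_header_id|>"):
        return text
    # Otherwise wrap it exactly as before: an explicit template wins, then the
    # image template when reference images are present, then the plain template.
    if llama_template is not None:
        return llama_template.format(text)
    return (image_template if len(images) > 0 else default_template).format(text)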
@@ -47,22 +56,23 @@ class QwenImageTEModel(sd1_clip.SD1ClipModel):
     def __init__(self, device="cpu", dtype=None, model_options={}):
         super().__init__(device=device, dtype=dtype, name="qwen25_7b", clip_model=Qwen25_7BVLIModel, model_options=model_options)
 
-    def encode_token_weights(self, token_weight_pairs):
+    def encode_token_weights(self, token_weight_pairs, template_end=-1):
         out, pooled, extra = super().encode_token_weights(token_weight_pairs)
         tok_pairs = token_weight_pairs["qwen25_7b"][0]
         count_im_start = 0
-        for i, v in enumerate(tok_pairs):
-            elem = v[0]
-            if not torch.is_tensor(elem):
-                if isinstance(elem, numbers.Integral):
-                    if elem == 151644 and count_im_start < 2:
-                        template_end = i
-                        count_im_start += 1
-
-        if out.shape[1] > (template_end + 3):
-            if tok_pairs[template_end + 1][0] == 872:
-                if tok_pairs[template_end + 2][0] == 198:
-                    template_end += 3
+        if template_end == -1:
+            for i, v in enumerate(tok_pairs):
+                elem = v[0]
+                if not torch.is_tensor(elem):
+                    if isinstance(elem, numbers.Integral):
+                        if elem == 151644 and count_im_start < 2:
+                            template_end = i
+                            count_im_start += 1
+
+            if out.shape[1] > (template_end + 3):
+                if tok_pairs[template_end + 1][0] == 872:
+                    if tok_pairs[template_end + 2][0] == 198:
+                        template_end += 3
 
         out = out[:, template_end:]
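
A worked example of the fallback scan above, simplified (no tensor or sequence-length guards) and using a hand-made token list. 151644 is Qwen's <|im_start|> id; 872 and 198 are the ids the diff checks right after it, presumably the "user" and newline tokens of the default template; every other id below is invented for illustration:

tok_pairs = [
    (151644, 1.0), (1001, 1.0), (198, 1.0),    # <|im_start|> system \n   (system turn of the template)
    (1002, 1.0), (151645, 1.0), (198, 1.0),    # system text, <|im_end|>, \n  (ids invented)
    (151644, 1.0), (872, 1.0), (198, 1.0),     # <|im_start|> user \n
    (1003, 1.0), (151645, 1.0),                # the actual prompt token(s), <|im_end|>
]

template_end = -1
count_im_start = 0
for i, (tok, _weight) in enumerate(tok_pairs):
    if isinstance(tok, int) and tok == 151644 and count_im_start < 2:
        template_end = i                       # ends up at index 6, the second <|im_start|>
        count_im_start += 1

if tok_pairs[template_end + 1][0] == 872 and tok_pairs[template_end + 2][0] == 198:
    template_end += 3                          # also skip "user" and "\n", so template_end == 9

print(template_end)                            # 9: slicing out[:, 9:] keeps only the prompt tokens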
