
Commit b0cb2d8

Merge branch 'gempoll' into gempoll-docker
2 parents: 919c8a7 + cf946b6


57 files changed: +1657 -699 lines

.ci/nightly/update_windows/update_comfyui_and_python_dependencies.bat

Lines changed: 0 additions & 3 deletions
This file was deleted.

.ci/nightly/windows_base_files/run_nvidia_gpu.bat

Lines changed: 0 additions & 2 deletions
This file was deleted.

.github/workflows/test-ui.yaml

Lines changed: 1 addition & 1 deletion
@@ -22,5 +22,5 @@ jobs:
         run: |
           npm ci
           npm run test:generate
-          npm test
+          npm test -- --verbose
         working-directory: ./tests-ui

.github/workflows/windows_release_nightly_pytorch.yml

Lines changed: 26 additions & 7 deletions
@@ -2,6 +2,24 @@ name: "Windows Release Nightly pytorch"
 
 on:
   workflow_dispatch:
+    inputs:
+      cu:
+        description: 'cuda version'
+        required: true
+        type: string
+        default: "121"
+
+      python_minor:
+        description: 'python minor version'
+        required: true
+        type: string
+        default: "12"
+
+      python_patch:
+        description: 'python patch version'
+        required: true
+        type: string
+        default: "1"
 #  push:
 #    branches:
 #      - master
@@ -20,21 +38,21 @@ jobs:
         persist-credentials: false
     - uses: actions/setup-python@v4
       with:
-        python-version: '3.11.6'
+        python-version: 3.${{ inputs.python_minor }}.${{ inputs.python_patch }}
     - shell: bash
       run: |
        cd ..
        cp -r ComfyUI ComfyUI_copy
-        curl https://www.python.org/ftp/python/3.11.6/python-3.11.6-embed-amd64.zip -o python_embeded.zip
+        curl https://www.python.org/ftp/python/3.${{ inputs.python_minor }}.${{ inputs.python_patch }}/python-3.${{ inputs.python_minor }}.${{ inputs.python_patch }}-embed-amd64.zip -o python_embeded.zip
        unzip python_embeded.zip -d python_embeded
        cd python_embeded
-        echo 'import site' >> ./python311._pth
+        echo 'import site' >> ./python3${{ inputs.python_minor }}._pth
        curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
        ./python.exe get-pip.py
-        python -m pip wheel torch torchvision torchaudio aiohttp==3.8.5 --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu121 -r ../ComfyUI/requirements.txt pygit2 -w ../temp_wheel_dir
+        python -m pip wheel torch torchvision torchaudio --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2 -w ../temp_wheel_dir
        ls ../temp_wheel_dir
        ./python.exe -s -m pip install --pre ../temp_wheel_dir/*
-        sed -i '1i../ComfyUI' ./python311._pth
+        sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth
        cd ..
 
        git clone https://github.com/comfyanonymous/taesd
@@ -49,9 +67,10 @@ jobs:
        mkdir update
        cp -r ComfyUI/.ci/update_windows/* ./update/
        cp -r ComfyUI/.ci/windows_base_files/* ./
-        cp -r ComfyUI/.ci/nightly/update_windows/* ./update/
-        cp -r ComfyUI/.ci/nightly/windows_base_files/* ./
 
+        echo "..\python_embeded\python.exe .\update.py ..\ComfyUI\\
+        ..\python_embeded\python.exe -s -m pip install --upgrade --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2
+        pause" > ./update/update_comfyui_and_python_dependencies.bat
        cd ..
 
        "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma -mx=8 -mfb=64 -md=32m -ms=on -mf=BCJ2 ComfyUI_windows_portable_nightly_pytorch.7z ComfyUI_windows_portable_nightly_pytorch

README.md

Lines changed: 7 additions & 3 deletions
@@ -93,23 +93,27 @@ Put your SD checkpoints (the huge ckpt/safetensors files) in: models/checkpoints
 
 Put your VAE in: models/vae
 
-Note: pytorch does not support python 3.12 yet so make sure your python version is 3.11 or earlier.
+Note: pytorch stable does not support python 3.12 yet. If you have python 3.12 you will have to use the nightly version of pytorch. If you run into issues you should try python 3.11 instead.
 
 ### AMD GPUs (Linux only)
 AMD users can install rocm and pytorch with pip if you don't have it already installed, this is the command to install the stable version:
 
 ```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.6```
 
-This is the command to install the nightly with ROCm 5.7 that might have some performance improvements:
+This is the command to install the nightly with ROCm 5.7 which has a python 3.12 package and might have some performance improvements:
 
 ```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm5.7```
 
 ### NVIDIA
 
-Nvidia users should install pytorch using this command:
+Nvidia users should install stable pytorch using this command:
 
 ```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu121```
 
+This is the command to install pytorch nightly instead which has a python 3.12 package and might have performance improvements:
+
+```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121```
+
 #### Troubleshooting
 
 If you get the "Torch not compiled with CUDA enabled" error, uninstall torch with:
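The README's troubleshooting section starts from the "Torch not compiled with CUDA enabled" error; a quick way to check which build the install commands above actually produced (plain torch API, not part of this commit):

```python
import torch

# Sanity check after running one of the pip commands above (not from this commit).
print(torch.__version__)          # e.g. "2.x.y+cu121" or a nightly "dev" tag
print(torch.version.cuda)         # None indicates a CPU-only build
print(torch.cuda.is_available())  # False is what triggers the troubleshooting path
```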

comfy/cldm/cldm.py

Lines changed: 15 additions & 15 deletions
@@ -53,7 +53,7 @@ def __init__(
         transformer_depth_middle=None,
         transformer_depth_output=None,
         device=None,
-        operations=comfy.ops,
+        operations=comfy.ops.disable_weight_init,
         **kwargs,
     ):
         super().__init__()
@@ -141,24 +141,24 @@ def __init__(
                 )
             ]
         )
-        self.zero_convs = nn.ModuleList([self.make_zero_conv(model_channels, operations=operations)])
+        self.zero_convs = nn.ModuleList([self.make_zero_conv(model_channels, operations=operations, dtype=self.dtype, device=device)])
 
         self.input_hint_block = TimestepEmbedSequential(
-            operations.conv_nd(dims, hint_channels, 16, 3, padding=1),
+            operations.conv_nd(dims, hint_channels, 16, 3, padding=1, dtype=self.dtype, device=device),
             nn.SiLU(),
-            operations.conv_nd(dims, 16, 16, 3, padding=1),
+            operations.conv_nd(dims, 16, 16, 3, padding=1, dtype=self.dtype, device=device),
             nn.SiLU(),
-            operations.conv_nd(dims, 16, 32, 3, padding=1, stride=2),
+            operations.conv_nd(dims, 16, 32, 3, padding=1, stride=2, dtype=self.dtype, device=device),
             nn.SiLU(),
-            operations.conv_nd(dims, 32, 32, 3, padding=1),
+            operations.conv_nd(dims, 32, 32, 3, padding=1, dtype=self.dtype, device=device),
             nn.SiLU(),
-            operations.conv_nd(dims, 32, 96, 3, padding=1, stride=2),
+            operations.conv_nd(dims, 32, 96, 3, padding=1, stride=2, dtype=self.dtype, device=device),
             nn.SiLU(),
-            operations.conv_nd(dims, 96, 96, 3, padding=1),
+            operations.conv_nd(dims, 96, 96, 3, padding=1, dtype=self.dtype, device=device),
             nn.SiLU(),
-            operations.conv_nd(dims, 96, 256, 3, padding=1, stride=2),
+            operations.conv_nd(dims, 96, 256, 3, padding=1, stride=2, dtype=self.dtype, device=device),
             nn.SiLU(),
-            zero_module(operations.conv_nd(dims, 256, model_channels, 3, padding=1))
+            operations.conv_nd(dims, 256, model_channels, 3, padding=1, dtype=self.dtype, device=device)
         )
 
         self._feature_size = model_channels
@@ -206,7 +206,7 @@ def __init__(
                     )
                 )
                 self.input_blocks.append(TimestepEmbedSequential(*layers))
-                self.zero_convs.append(self.make_zero_conv(ch, operations=operations))
+                self.zero_convs.append(self.make_zero_conv(ch, operations=operations, dtype=self.dtype, device=device))
                 self._feature_size += ch
                 input_block_chans.append(ch)
             if level != len(channel_mult) - 1:
@@ -234,7 +234,7 @@ def __init__(
                 )
                 ch = out_ch
                 input_block_chans.append(ch)
-                self.zero_convs.append(self.make_zero_conv(ch, operations=operations))
+                self.zero_convs.append(self.make_zero_conv(ch, operations=operations, dtype=self.dtype, device=device))
                 ds *= 2
                 self._feature_size += ch
 
@@ -276,11 +276,11 @@ def __init__(
             operations=operations
         )]
         self.middle_block = TimestepEmbedSequential(*mid_block)
-        self.middle_block_out = self.make_zero_conv(ch, operations=operations)
+        self.middle_block_out = self.make_zero_conv(ch, operations=operations, dtype=self.dtype, device=device)
         self._feature_size += ch
 
-    def make_zero_conv(self, channels, operations=None):
-        return TimestepEmbedSequential(zero_module(operations.conv_nd(self.dims, channels, channels, 1, padding=0)))
+    def make_zero_conv(self, channels, operations=None, dtype=None, device=None):
+        return TimestepEmbedSequential(operations.conv_nd(self.dims, channels, channels, 1, padding=0, dtype=dtype, device=device))
 
     def forward(self, x, hint, timesteps, context, y=None, **kwargs):
         t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False).to(x.dtype)
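The cldm.py change swaps the default operations to comfy.ops.disable_weight_init and threads dtype/device through every conv, so the 1x1 "zero conv" layers are built directly with the target dtype and device instead of being zero-initialized first. A torch-only sketch of what the updated make_zero_conv constructs for dims=2, assuming operations.conv_nd resolves to a plain Conv2d here (an assumption about comfy.ops, not shown in this diff):

```python
import torch

def make_zero_conv_sketch(channels, dtype=None, device=None):
    # Mirrors the updated make_zero_conv: a 1x1 conv created with an explicit
    # dtype/device. The old zero_module() wrapper is gone; a hedged reading is
    # that the weights are overwritten by the checkpoint at load time anyway.
    return torch.nn.Conv2d(channels, channels, kernel_size=1, padding=0,
                           dtype=dtype, device=device)

conv = make_zero_conv_sketch(320, dtype=torch.float16, device="cpu")
print(conv.weight.dtype, conv.weight.shape)  # torch.float16 torch.Size([320, 320, 1, 1])
```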

comfy/cli_args.py

Lines changed: 2 additions & 1 deletion
@@ -57,6 +57,7 @@ def __call__(self, parser, namespace, values, option_string=None):
 
 fpunet_group = parser.add_mutually_exclusive_group()
 fpunet_group.add_argument("--bf16-unet", action="store_true", help="Run the UNET in bf16. This should only be used for testing stuff.")
+fpunet_group.add_argument("--fp16-unet", action="store_true", help="Store unet weights in fp16.")
 fpunet_group.add_argument("--fp8_e4m3fn-unet", action="store_true", help="Store unet weights in fp8_e4m3fn.")
 fpunet_group.add_argument("--fp8_e5m2-unet", action="store_true", help="Store unet weights in fp8_e5m2.")
 
@@ -101,7 +102,7 @@ class LatentPreviewMethod(enum.Enum):
 
 
 parser.add_argument("--disable-smart-memory", action="store_true", help="Force ComfyUI to agressively offload to regular ram instead of keeping models in vram when it can.")
-
+parser.add_argument("--deterministic", action="store_true", help="Make pytorch use slower deterministic algorithms when it can. Note that this might not make images deterministic in all cases.")
 
 parser.add_argument("--dont-print-server", action="store_true", help="Don't print server output.")
 parser.add_argument("--quick-test-for-ci", action="store_true", help="Quick test for CI.")

comfy/clip_model.py

Lines changed: 64 additions & 6 deletions
@@ -57,12 +57,7 @@ def __init__(self, num_layers, embed_dim, heads, intermediate_size, intermediate
         self.layers = torch.nn.ModuleList([CLIPLayer(embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations) for i in range(num_layers)])
 
     def forward(self, x, mask=None, intermediate_output=None):
-        optimized_attention = optimized_attention_for_device(x.device, mask=True)
-        causal_mask = torch.empty(x.shape[1], x.shape[1], dtype=x.dtype, device=x.device).fill_(float("-inf")).triu_(1)
-        if mask is not None:
-            mask += causal_mask
-        else:
-            mask = causal_mask
+        optimized_attention = optimized_attention_for_device(x.device, mask=mask is not None)
 
         if intermediate_output is not None:
             if intermediate_output < 0:
@@ -105,6 +100,12 @@ def forward(self, input_tokens, attention_mask=None, intermediate_output=None, f
             mask = 1.0 - attention_mask.to(x.dtype).unsqueeze(1).unsqueeze(1).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
             mask = mask.masked_fill(mask.to(torch.bool), float("-inf"))
 
+        causal_mask = torch.empty(x.shape[1], x.shape[1], dtype=x.dtype, device=x.device).fill_(float("-inf")).triu_(1)
+        if mask is not None:
+            mask += causal_mask
+        else:
+            mask = causal_mask
+
         x, i = self.encoder(x, mask=mask, intermediate_output=intermediate_output)
         x = self.final_layer_norm(x)
         if i is not None and final_layer_norm_intermediate:
@@ -128,3 +129,60 @@ def set_input_embeddings(self, embeddings):
 
     def forward(self, *args, **kwargs):
         return self.text_model(*args, **kwargs)
+
+class CLIPVisionEmbeddings(torch.nn.Module):
+    def __init__(self, embed_dim, num_channels=3, patch_size=14, image_size=224, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.class_embedding = torch.nn.Parameter(torch.empty(embed_dim, dtype=dtype, device=device))
+
+        self.patch_embedding = operations.Conv2d(
+            in_channels=num_channels,
+            out_channels=embed_dim,
+            kernel_size=patch_size,
+            stride=patch_size,
+            bias=False,
+            dtype=dtype,
+            device=device
+        )
+
+        num_patches = (image_size // patch_size) ** 2
+        num_positions = num_patches + 1
+        self.position_embedding = torch.nn.Embedding(num_positions, embed_dim, dtype=dtype, device=device)
+
+    def forward(self, pixel_values):
+        embeds = self.patch_embedding(pixel_values).flatten(2).transpose(1, 2)
+        return torch.cat([self.class_embedding.expand(pixel_values.shape[0], 1, -1), embeds], dim=1) + self.position_embedding.weight
+
+
+class CLIPVision(torch.nn.Module):
+    def __init__(self, config_dict, dtype, device, operations):
+        super().__init__()
+        num_layers = config_dict["num_hidden_layers"]
+        embed_dim = config_dict["hidden_size"]
+        heads = config_dict["num_attention_heads"]
+        intermediate_size = config_dict["intermediate_size"]
+        intermediate_activation = config_dict["hidden_act"]
+
+        self.embeddings = CLIPVisionEmbeddings(embed_dim, config_dict["num_channels"], config_dict["patch_size"], config_dict["image_size"], dtype=torch.float32, device=device, operations=operations)
+        self.pre_layrnorm = operations.LayerNorm(embed_dim)
+        self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations)
+        self.post_layernorm = operations.LayerNorm(embed_dim)
+
+    def forward(self, pixel_values, attention_mask=None, intermediate_output=None):
+        x = self.embeddings(pixel_values)
+        x = self.pre_layrnorm(x)
+        #TODO: attention_mask?
+        x, i = self.encoder(x, mask=None, intermediate_output=intermediate_output)
+        pooled_output = self.post_layernorm(x[:, 0, :])
+        return x, i, pooled_output
+
+class CLIPVisionModelProjection(torch.nn.Module):
+    def __init__(self, config_dict, dtype, device, operations):
+        super().__init__()
+        self.vision_model = CLIPVision(config_dict, dtype, device, operations)
+        self.visual_projection = operations.Linear(config_dict["hidden_size"], config_dict["projection_dim"], bias=False)
+
+    def forward(self, *args, **kwargs):
+        x = self.vision_model(*args, **kwargs)
+        out = self.visual_projection(x[2])
+        return (x[0], x[1], out)
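Two details of this clip_model.py change are easy to miss: the causal mask moves from CLIPEncoder.forward into the text model, presumably so the new CLIPVision can reuse the same encoder without causal masking, and with the default patch_size=14 and image_size=224 the vision embeddings produce (224 // 14)**2 = 256 patch tokens plus one class token, hence 257 positions. A torch-only look at the mask the text path still builds:

```python
import torch

# The causal mask now built in the text model's forward (taken verbatim from
# the diff above), shown here for a tiny sequence length of 4.
n = 4
causal_mask = torch.empty(n, n).fill_(float("-inf")).triu_(1)
print(causal_mask)
# tensor([[0., -inf, -inf, -inf],
#         [0., 0., -inf, -inf],
#         [0., 0., 0., -inf],
#         [0., 0., 0., 0.]])
```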

comfy/clip_vision.py

Lines changed: 27 additions & 33 deletions
@@ -1,64 +1,58 @@
-from transformers import CLIPVisionModelWithProjection, CLIPVisionConfig, modeling_utils
 from .utils import load_torch_file, transformers_convert, common_upscale
 import os
 import torch
 import contextlib
+import json
 
 import comfy.ops
 import comfy.model_patcher
 import comfy.model_management
 import comfy.utils
+import comfy.clip_model
+
+class Output:
+    def __getitem__(self, key):
+        return getattr(self, key)
+    def __setitem__(self, key, item):
+        setattr(self, key, item)
 
 def clip_preprocess(image, size=224):
     mean = torch.tensor([ 0.48145466,0.4578275,0.40821073], device=image.device, dtype=image.dtype)
     std = torch.tensor([0.26862954,0.26130258,0.27577711], device=image.device, dtype=image.dtype)
-    scale = (size / min(image.shape[1], image.shape[2]))
-    image = torch.nn.functional.interpolate(image.movedim(-1, 1), size=(round(scale * image.shape[1]), round(scale * image.shape[2])), mode="bicubic", antialias=True)
-    h = (image.shape[2] - size)//2
-    w = (image.shape[3] - size)//2
-    image = image[:,:,h:h+size,w:w+size]
+    image = image.movedim(-1, 1)
+    if not (image.shape[2] == size and image.shape[3] == size):
+        scale = (size / min(image.shape[2], image.shape[3]))
+        image = torch.nn.functional.interpolate(image, size=(round(scale * image.shape[2]), round(scale * image.shape[3])), mode="bicubic", antialias=True)
+        h = (image.shape[2] - size)//2
+        w = (image.shape[3] - size)//2
+        image = image[:,:,h:h+size,w:w+size]
     image = torch.clip((255. * image), 0, 255).round() / 255.0
     return (image - mean.view([3,1,1])) / std.view([3,1,1])
 
 class ClipVisionModel():
     def __init__(self, json_config):
-        config = CLIPVisionConfig.from_json_file(json_config)
+        with open(json_config) as f:
+            config = json.load(f)
+
         self.load_device = comfy.model_management.text_encoder_device()
         offload_device = comfy.model_management.text_encoder_offload_device()
-        self.dtype = torch.float32
-        if comfy.model_management.should_use_fp16(self.load_device, prioritize_performance=False):
-            self.dtype = torch.float16
-
-        with comfy.ops.use_comfy_ops(offload_device, self.dtype):
-            with modeling_utils.no_init_weights():
-                self.model = CLIPVisionModelWithProjection(config)
-        self.model.to(self.dtype)
+        self.dtype = comfy.model_management.text_encoder_dtype(self.load_device)
+        self.model = comfy.clip_model.CLIPVisionModelProjection(config, self.dtype, offload_device, comfy.ops.manual_cast)
+        self.model.eval()
 
         self.patcher = comfy.model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
     def load_sd(self, sd):
         return self.model.load_state_dict(sd, strict=False)
 
     def encode_image(self, image):
        comfy.model_management.load_model_gpu(self.patcher)
-        pixel_values = clip_preprocess(image.to(self.load_device))
-
-        if self.dtype != torch.float32:
-            precision_scope = torch.autocast
-        else:
-            precision_scope = lambda a, b: contextlib.nullcontext(a)
-
-        with precision_scope(comfy.model_management.get_autocast_device(self.load_device), torch.float32):
-            outputs = self.model(pixel_values=pixel_values, output_hidden_states=True)
-
-        for k in outputs:
-            t = outputs[k]
-            if t is not None:
-                if k == 'hidden_states':
-                    outputs["penultimate_hidden_states"] = t[-2].to(comfy.model_management.intermediate_device())
-                    outputs["hidden_states"] = None
-                else:
-                    outputs[k] = t.to(comfy.model_management.intermediate_device())
+        pixel_values = clip_preprocess(image.to(self.load_device)).float()
+        out = self.model(pixel_values=pixel_values, intermediate_output=-2)
 
+        outputs = Output()
+        outputs["last_hidden_state"] = out[0].to(comfy.model_management.intermediate_device())
+        outputs["image_embeds"] = out[2].to(comfy.model_management.intermediate_device())
+        outputs["penultimate_hidden_states"] = out[1].to(comfy.model_management.intermediate_device())
        return outputs
 
 def convert_to_transformers(sd, prefix):
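encode_image now returns the small Output container instead of a transformers output object; attributes are stored directly but dict-style indexing still works, so callers using either outputs.image_embeds or outputs["image_embeds"] are unaffected. A minimal sketch (the class is verbatim from the diff; the 768-wide tensor is just a hypothetical placeholder):

```python
import torch

class Output:
    def __getitem__(self, key):
        return getattr(self, key)
    def __setitem__(self, key, item):
        setattr(self, key, item)

out = Output()
out["image_embeds"] = torch.zeros(1, 768)  # hypothetical embedding size
print(out.image_embeds.shape)              # torch.Size([1, 768]) via attribute access
print(out["image_embeds"].shape)           # same tensor via dict-style access
```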
