GempollAI
diff --git a/.ci/windows_nightly_base_files/run_nvidia_gpu_fast.bat b/.ci/windows_nightly_base_files/run_nvidia_gpu_fast.bat
Lines changed: 2 additions & 0 deletions
diff --git a/.github/workflows/windows_release_nightly_pytorch.yml b/.github/workflows/windows_release_nightly_pytorch.yml
Lines changed: 1 addition & 0 deletions
diff --git a/README.md b/README.md
Lines changed: 1 addition & 1 deletion
diff --git a/app/frontend_management.py b/app/frontend_management.py
Lines changed: 19 additions & 12 deletions
diff --git a/comfy/controlnet.py b/comfy/controlnet.py
Lines changed: 9 additions & 7 deletions
diff --git a/comfy/float.py b/comfy/float.py
Lines changed: 26 additions & 23 deletions
diff --git a/comfy/ldm/flux/layers.py b/comfy/ldm/flux/layers.py
Lines changed: 1 addition & 3 deletions
@@ -0,0 +1,2 @@
+.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --fast
+pause
@@ -67,6 +67,7 @@ jobs:
             mkdir update
             cp -r ComfyUI/.ci/update_windows/* ./update/
             cp -r ComfyUI/.ci/windows_base_files/* ./
+            cp -r ComfyUI/.ci/windows_nightly_base_files/* ./
 
             echo "call update_comfyui.bat nopause
             ..\python_embeded\python.exe -s -m pip install --upgrade --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2
 
@@ -230,7 +230,7 @@ To use a textual inversion concepts/embeddings in a text prompt put them in the
 
 Use ```--preview-method auto``` to enable previews.
 
-The default installation includes a fast latent preview method that's low-resolution. To enable higher-quality previews with [TAESD](https://github.com/madebyollin/taesd), download the [taesd_decoder.pth](https://github.com/madebyollin/taesd/raw/main/taesd_decoder.pth) (for SD1.x and SD2.x) and [taesdxl_decoder.pth](https://github.com/madebyollin/taesd/raw/main/taesdxl_decoder.pth) (for SDXL) models and place them in the `models/vae_approx` folder. Once they're installed, restart ComfyUI to enable high-quality previews.
+The default installation includes a fast latent preview method that's low-resolution. To enable higher-quality previews with [TAESD](https://github.com/madebyollin/taesd), download the [taesd_decoder.pth, taesdxl_decoder.pth, taesd3_decoder.pth and taef1_decoder.pth](https://github.com/madebyollin/taesd/) and place them in the `models/vae_approx` folder. Once they're installed, restart ComfyUI and launch it with `--preview-method taesd` to enable high-quality previews.
 
 ## How to use TLS/SSL?
 Generate a self-signed certificate (not appropriate for shared/production use) and key by running the command: `openssl req -x509 -newkey rsa:4096 -keyout key.pem -out cert.pem -sha256 -days 3650 -nodes -subj "/C=XX/ST=StateName/L=CityName/O=CompanyName/OU=CompanySectionName/CN=CommonNameOrHostname"`
 
@@ -8,7 +8,7 @@
 from dataclasses import dataclass
 from functools import cached_property
 from pathlib import Path
-from typing import TypedDict
+from typing import TypedDict, Optional
 
 import requests
 from typing_extensions import NotRequired
@@ -132,12 +132,13 @@ def parse_version_string(cls, value: str) -> tuple[str, str, str]:
         return match_result.group(1), match_result.group(2), match_result.group(3)
 
     @classmethod
-    def init_frontend_unsafe(cls, version_string: str) -> str:
+    def init_frontend_unsafe(cls, version_string: str, provider: Optional[FrontEndProvider] = None) -> str:
         """
         Initializes the frontend for the specified version.
 
         Args:
             version_string (str): The version string.
+            provider (FrontEndProvider, optional): The provider to use. Defaults to None.
 
         Returns:
             str: The path to the initialized frontend.
@@ -150,23 +151,29 @@ def init_frontend_unsafe(cls, version_string: str) -> str:
             return cls.DEFAULT_FRONTEND_PATH
 
         repo_owner, repo_name, version = cls.parse_version_string(version_string)
-        provider = FrontEndProvider(repo_owner, repo_name)
+        provider = provider or FrontEndProvider(repo_owner, repo_name)
         release = provider.get_release(version)
 
         semantic_version = release["tag_name"].lstrip("v")
         web_root = str(
             Path(cls.CUSTOM_FRONTENDS_ROOT) / provider.folder_name / semantic_version
         )
         if not os.path.exists(web_root):
-            os.makedirs(web_root, exist_ok=True)
-            logging.info(
-                "Downloading frontend(%s) version(%s) to (%s)",
-                provider.folder_name,
-                semantic_version,
-                web_root,
-            )
-            logging.debug(release)
-            download_release_asset_zip(release, destination_path=web_root)
+            try:
+                os.makedirs(web_root, exist_ok=True)
+                logging.info(
+                    "Downloading frontend(%s) version(%s) to (%s)",
+                    provider.folder_name,
+                    semantic_version,
+                    web_root,
+                )
+                logging.debug(release)
+                download_release_asset_zip(release, destination_path=web_root)
+            finally:
+                # Clean up the directory if it is empty, i.e. the download failed
+                if not os.listdir(web_root):
+                    os.rmdir(web_root)
+
         return web_root
 
     @classmethod
 
@@ -391,7 +391,8 @@ def controlnet_config(sd):
     else:
         operations = comfy.ops.disable_weight_init
 
-    return model_config, operations, load_device, unet_dtype, manual_cast_dtype
+    offload_device = comfy.model_management.unet_offload_device()
+    return model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device
 
 def controlnet_load_state_dict(control_model, sd):
     missing, unexpected = control_model.load_state_dict(sd, strict=False)
@@ -405,12 +406,12 @@ def controlnet_load_state_dict(control_model, sd):
 
 def load_controlnet_mmdit(sd):
     new_sd = comfy.model_detection.convert_diffusers_mmdit(sd, "")
-    model_config, operations, load_device, unet_dtype, manual_cast_dtype = controlnet_config(new_sd)
+    model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(new_sd)
     num_blocks = comfy.model_detection.count_blocks(new_sd, 'joint_blocks.{}.')
     for k in sd:
         new_sd[k] = sd[k]
 
-    control_model = comfy.cldm.mmdit.ControlNet(num_blocks=num_blocks, operations=operations, device=load_device, dtype=unet_dtype, **model_config.unet_config)
+    control_model = comfy.cldm.mmdit.ControlNet(num_blocks=num_blocks, operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config)
     control_model = controlnet_load_state_dict(control_model, new_sd)
 
     latent_format = comfy.latent_formats.SD3()
@@ -420,9 +421,9 @@ def load_controlnet_mmdit(sd):
 
 
 def load_controlnet_hunyuandit(controlnet_data):
-    model_config, operations, load_device, unet_dtype, manual_cast_dtype = controlnet_config(controlnet_data)
+    model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(controlnet_data)
 
-    control_model = comfy.ldm.hydit.controlnet.HunYuanControlNet(operations=operations, device=load_device, dtype=unet_dtype)
+    control_model = comfy.ldm.hydit.controlnet.HunYuanControlNet(operations=operations, device=offload_device, dtype=unet_dtype)
     control_model = controlnet_load_state_dict(control_model, controlnet_data)
 
     latent_format = comfy.latent_formats.SDXL()
@@ -431,8 +432,8 @@ def load_controlnet_hunyuandit(controlnet_data):
     return control
 
 def load_controlnet_flux_xlabs(sd):
-    model_config, operations, load_device, unet_dtype, manual_cast_dtype = controlnet_config(sd)
-    control_model = comfy.ldm.flux.controlnet_xlabs.ControlNetFlux(operations=operations, device=load_device, dtype=unet_dtype, **model_config.unet_config)
+    model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(sd)
+    control_model = comfy.ldm.flux.controlnet_xlabs.ControlNetFlux(operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config)
     control_model = controlnet_load_state_dict(control_model, sd)
     extra_conds = ['y', 'guidance']
     control = ControlNet(control_model, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds)
@@ -536,6 +537,7 @@ def load_controlnet(ckpt_path, model=None):
     if manual_cast_dtype is not None:
         controlnet_config["operations"] = comfy.ops.manual_cast
     controlnet_config["dtype"] = unet_dtype
+    controlnet_config["device"] = comfy.model_management.unet_offload_device()
     controlnet_config.pop("out_channels")
     controlnet_config["hint_channels"] = controlnet_data["{}input_hint_block.0.weight".format(prefix)].shape[1]
     control_model = comfy.cldm.cldm.ControlNet(**controlnet_config)
 
@@ -1,59 +1,62 @@
 import torch
+import math
+
+def calc_mantissa(abs_x, exponent, normal_mask, MANTISSA_BITS, EXPONENT_BIAS, generator=None):
+    mantissa_scaled = torch.where(
+        normal_mask,
+        (abs_x / (2.0 ** (exponent - EXPONENT_BIAS)) - 1.0) * (2**MANTISSA_BITS),
+        (abs_x / (2.0 ** (-EXPONENT_BIAS + 1 - MANTISSA_BITS)))
+    )
+
+    mantissa_scaled += torch.rand(mantissa_scaled.size(), dtype=mantissa_scaled.dtype, layout=mantissa_scaled.layout, device=mantissa_scaled.device, generator=generator)
+    return mantissa_scaled.floor() / (2**MANTISSA_BITS)
 
 #Not 100% sure about this
-def manual_stochastic_round_to_float8(x, dtype):
+def manual_stochastic_round_to_float8(x, dtype, generator=None):
     if dtype == torch.float8_e4m3fn:
         EXPONENT_BITS, MANTISSA_BITS, EXPONENT_BIAS = 4, 3, 7
     elif dtype == torch.float8_e5m2:
         EXPONENT_BITS, MANTISSA_BITS, EXPONENT_BIAS = 5, 2, 15
     else:
         raise ValueError("Unsupported dtype")
 
+    x = x.half()
     sign = torch.sign(x)
     abs_x = x.abs()
+    sign = torch.where(abs_x == 0, 0, sign)
 
     # Combine exponent calculation and clamping
     exponent = torch.clamp(
-        torch.floor(torch.log2(abs_x)).to(torch.int32) + EXPONENT_BIAS,
+        torch.floor(torch.log2(abs_x)) + EXPONENT_BIAS,
         0, 2**EXPONENT_BITS - 1
     )
 
     # Combine mantissa calculation and rounding
-    # min_normal = 2.0 ** (-EXPONENT_BIAS + 1)
-    # zero_mask = (abs_x == 0)
-    # subnormal_mask = (exponent == 0) & (abs_x != 0)
     normal_mask = ~(exponent == 0)
 
-    mantissa_scaled = torch.where(
-        normal_mask,
-        (abs_x / (2.0 ** (exponent - EXPONENT_BIAS)) - 1.0) * (2**MANTISSA_BITS),
-        (abs_x / (2.0 ** (-EXPONENT_BIAS + 1 - MANTISSA_BITS)))
-    )
-    mantissa_floor = mantissa_scaled.floor()
-    mantissa = torch.where(
-        torch.rand_like(mantissa_scaled) < (mantissa_scaled - mantissa_floor),
-        (mantissa_floor + 1) / (2**MANTISSA_BITS),
-        mantissa_floor / (2**MANTISSA_BITS)
-    )
-    result = torch.where(
+    abs_x[:] = calc_mantissa(abs_x, exponent, normal_mask, MANTISSA_BITS, EXPONENT_BIAS, generator=generator)
+
+    sign *= torch.where(
         normal_mask,
-        sign * (2.0 ** (exponent - EXPONENT_BIAS)) * (1.0 + mantissa),
-        sign * (2.0 ** (-EXPONENT_BIAS + 1)) * mantissa
+        (2.0 ** (exponent - EXPONENT_BIAS)) * (1.0 + abs_x),
+        (2.0 ** (-EXPONENT_BIAS + 1)) * abs_x
     )
+    del abs_x
 
-    result = torch.where(abs_x == 0, 0, result)
-    return result.to(dtype=dtype)
+    return sign.to(dtype=dtype)
 
 
 
-def stochastic_rounding(value, dtype):
+def stochastic_rounding(value, dtype, seed=0):
     if dtype == torch.float32:
         return value.to(dtype=torch.float32)
     if dtype == torch.float16:
         return value.to(dtype=torch.float16)
     if dtype == torch.bfloat16:
         return value.to(dtype=torch.bfloat16)
     if dtype == torch.float8_e4m3fn or dtype == torch.float8_e5m2:
-        return manual_stochastic_round_to_float8(value, dtype)
+        generator = torch.Generator(device=value.device)
+        generator.manual_seed(seed)
+        return manual_stochastic_round_to_float8(value, dtype, generator=generator)
 
     return value.to(dtype=dtype)
@@ -63,10 +63,8 @@ def __init__(self, dim: int, dtype=None, device=None, operations=None):
         self.scale = nn.Parameter(torch.empty((dim), dtype=dtype, device=device))
 
     def forward(self, x: Tensor):
-        x_dtype = x.dtype
-        x = x.float()
         rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6)
-        return (x * rrms).to(dtype=x_dtype) * comfy.ops.cast_to(self.scale, dtype=x_dtype, device=x.device)
+        return (x * rrms) * comfy.ops.cast_to(self.scale, dtype=x.dtype, device=x.device)
 
 
 class QKNorm(torch.nn.Module):
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --fast`
	`2`	`+pause`