
Commit 5191fb2

Merge branch 'master' into yousef-higgsv2
2 parents 6412422 + 103a12c

5 files changed, +51 -13 lines changed


comfy/cli_args.py

Lines changed: 1 addition & 1 deletion
@@ -145,7 +145,7 @@ class PerformanceFeature(enum.Enum):
     CublasOps = "cublas_ops"
     AutoTune = "autotune"

-parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: fp16_accumulation fp8_matrix_mult cublas_ops")
+parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: {}".format(" ".join(map(lambda c: c.value, PerformanceFeature))))

 parser.add_argument("--mmap-torch-files", action="store_true", help="Use mmap when loading ckpt/pt files.")
 parser.add_argument("--disable-mmap", action="store_true", help="Don't use mmap when loading safetensors.")

comfy/controlnet.py

Lines changed: 13 additions & 3 deletions
@@ -253,7 +253,10 @@ def get_control(self, x_noisy, t, cond, batched_number, transformer_options):
             to_concat = []
             for c in self.extra_concat_orig:
                 c = c.to(self.cond_hint.device)
-                c = comfy.utils.common_upscale(c, self.cond_hint.shape[3], self.cond_hint.shape[2], self.upscale_algorithm, "center")
+                c = comfy.utils.common_upscale(c, self.cond_hint.shape[-1], self.cond_hint.shape[-2], self.upscale_algorithm, "center")
+                if c.ndim < self.cond_hint.ndim:
+                    c = c.unsqueeze(2)
+                    c = comfy.utils.repeat_to_batch_size(c, self.cond_hint.shape[2], dim=2)
                 to_concat.append(comfy.utils.repeat_to_batch_size(c, self.cond_hint.shape[0]))
             self.cond_hint = torch.cat([self.cond_hint] + to_concat, dim=1)
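Note: reading the spatial size from shape[-1]/shape[-2] and adding the ndim check presumably lets extra_concat work when self.cond_hint is a 5D video-style latent (batch, channels, frames, height, width) instead of a 4D image: a 4D extra tensor gets a frame axis inserted and repeated to the hint's frame count. A rough standalone sketch of that shape handling, with a simplified stand-in for comfy.utils.repeat_to_batch_size (the real helper handles more cases):

import torch

def repeat_to_length(t: torch.Tensor, length: int, dim: int) -> torch.Tensor:
    # Tile along `dim` and trim; a rough stand-in for comfy.utils.repeat_to_batch_size.
    if t.shape[dim] == length:
        return t
    reps = [1] * t.ndim
    reps[dim] = -(-length // t.shape[dim])  # ceil division
    return t.repeat(*reps).narrow(dim, 0, length)

cond_hint = torch.zeros(1, 4, 16, 64, 64)  # 5D video-style hint: (N, C, T, H, W)
c = torch.zeros(1, 4, 64, 64)              # 4D image-style extra tensor: (N, C, H, W)

if c.ndim < cond_hint.ndim:
    c = c.unsqueeze(2)                                  # add a frame axis -> (1, 4, 1, 64, 64)
    c = repeat_to_length(c, cond_hint.shape[2], dim=2)  # repeat to 16 frames

print(c.shape)  # torch.Size([1, 4, 16, 64, 64])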

@@ -585,11 +588,18 @@ def load_controlnet_flux_instantx(sd, model_options={}):

 def load_controlnet_qwen_instantx(sd, model_options={}):
     model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(sd, model_options=model_options)
-    control_model = comfy.ldm.qwen_image.controlnet.QwenImageControlNetModel(operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config)
+    control_latent_channels = sd.get("controlnet_x_embedder.weight").shape[1]
+
+    extra_condition_channels = 0
+    concat_mask = False
+    if control_latent_channels == 68: #inpaint controlnet
+        extra_condition_channels = control_latent_channels - 64
+        concat_mask = True
+    control_model = comfy.ldm.qwen_image.controlnet.QwenImageControlNetModel(extra_condition_channels=extra_condition_channels, operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config)
     control_model = controlnet_load_state_dict(control_model, sd)
     latent_format = comfy.latent_formats.Wan21()
     extra_conds = []
-    control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds)
+    control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, concat_mask=concat_mask, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds)
     return control

 def convert_mistoline(sd):
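Note: the inpaint variant is detected from the checkpoint itself: the controlnet_x_embedder input projection takes 68 input channels instead of 64, and the extra 4 channels plus concat_mask=True suggest a concatenated inpaint mask. A hedged sketch of the same weight-shape probing, using dummy state dicts in place of real checkpoints (tensor shapes are illustrative):

import torch

def detect_qwen_inpaint_variant(sd):
    # The second dim of the input projection weight is the number of latent
    # channels the controlnet expects to be fed.
    control_latent_channels = sd["controlnet_x_embedder.weight"].shape[1]
    extra_condition_channels = 0
    concat_mask = False
    if control_latent_channels == 68:  # inpaint controlnet
        extra_condition_channels = control_latent_channels - 64
        concat_mask = True
    return extra_condition_channels, concat_mask

plain_sd = {"controlnet_x_embedder.weight": torch.zeros(3072, 64)}
inpaint_sd = {"controlnet_x_embedder.weight": torch.zeros(3072, 68)}
print(detect_qwen_inpaint_variant(plain_sd))    # (0, False)
print(detect_qwen_inpaint_variant(inpaint_sd))  # (4, True)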

comfy/model_management.py

Lines changed: 29 additions & 7 deletions
@@ -22,6 +22,7 @@
 from comfy.cli_args import args, PerformanceFeature
 import torch
 import sys
+import importlib
 import platform
 import weakref
 import gc
@@ -289,6 +290,24 @@ def is_amd():
         return True
     return False

+def amd_min_version(device=None, min_rdna_version=0):
+    if not is_amd():
+        return False
+
+    if is_device_cpu(device):
+        return False
+
+    arch = torch.cuda.get_device_properties(device).gcnArchName
+    if arch.startswith('gfx') and len(arch) == 7:
+        try:
+            cmp_rdna_version = int(arch[4]) + 2
+        except:
+            cmp_rdna_version = 0
+        if cmp_rdna_version >= min_rdna_version:
+            return True
+
+    return False
+
 MIN_WEIGHT_MEMORY_RATIO = 0.4
 if is_nvidia():
     MIN_WEIGHT_MEMORY_RATIO = 0.0
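Note: amd_min_version infers an approximate RDNA generation from torch.cuda.get_device_properties(device).gcnArchName. A standalone sketch of just the string heuristic (the real function also checks is_amd() and is_device_cpu() first, and uses a bare except):

def rdna_version_from_arch(arch: str) -> int:
    # 7-character names like "gfx1100" / "gfx1201" map their second digit to an
    # RDNA generation (gfx11xx -> RDNA3, gfx12xx -> RDNA4); anything else yields 0,
    # which also filters out CDNA parts such as gfx942 / gfx90a (6 characters).
    if arch.startswith("gfx") and len(arch) == 7:
        try:
            return int(arch[4]) + 2
        except ValueError:
            return 0
    return 0

for name in ["gfx1100", "gfx1151", "gfx1201", "gfx942", "gfx90a"]:
    print(name, rdna_version_from_arch(name))  # 3, 3, 4, 0, 0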
@@ -321,12 +340,13 @@ def is_amd():
         logging.info("AMD arch: {}".format(arch))
         logging.info("ROCm version: {}".format(rocm_version))
         if args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
-            if torch_version_numeric >= (2, 7): # works on 2.6 but doesn't actually seem to improve much
-                if any((a in arch) for a in ["gfx90a", "gfx942", "gfx1100", "gfx1101", "gfx1151"]): # TODO: more arches, TODO: gfx950
-                    ENABLE_PYTORCH_ATTENTION = True
-            # if torch_version_numeric >= (2, 8):
-            #     if any((a in arch) for a in ["gfx1201"]):
-            #         ENABLE_PYTORCH_ATTENTION = True
+            if importlib.util.find_spec('triton') is not None: # AMD efficient attention implementation depends on triton. TODO: better way of detecting if it's compiled in or not.
+                if torch_version_numeric >= (2, 7): # works on 2.6 but doesn't actually seem to improve much
+                    if any((a in arch) for a in ["gfx90a", "gfx942", "gfx1100", "gfx1101", "gfx1151"]): # TODO: more arches, TODO: gfx950
+                        ENABLE_PYTORCH_ATTENTION = True
+                # if torch_version_numeric >= (2, 8):
+                #     if any((a in arch) for a in ["gfx1201"]):
+                #         ENABLE_PYTORCH_ATTENTION = True
         if torch_version_numeric >= (2, 7) and rocm_version >= (6, 4):
             if any((a in arch) for a in ["gfx1201", "gfx942", "gfx950"]): # TODO: more arches
                 SUPPORT_FP8_OPS = True
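Note: pytorch attention is now only auto-enabled on these arches when the triton package can be found, since the AMD efficient-attention path depends on it. importlib.util.find_spec locates a module without importing it, so it is a cheap availability probe; a small sketch of the same guard pattern:

import importlib.util

def has_module(name: str) -> bool:
    # find_spec returns None when the module cannot be located;
    # unlike a plain import it does not execute the module.
    return importlib.util.find_spec(name) is not None

enable_pytorch_attention = False
if has_module("triton"):
    # ...the arch and torch-version checks from model_management.py would go here...
    enable_pytorch_attention = True

print("triton available:", has_module("triton"))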
@@ -905,7 +925,9 @@ def vae_dtype(device=None, allowed_dtypes=[]):

         # NOTE: bfloat16 seems to work on AMD for the VAE but is extremely slow in some cases compared to fp32
         # slowness still a problem on pytorch nightly 2.9.0.dev20250720+rocm6.4 tested on RDNA3
-        if d == torch.bfloat16 and (not is_amd()) and should_use_bf16(device):
+        # also a problem on RDNA4 except fp32 is also slow there.
+        # This is due to large bf16 convolutions being extremely slow.
+        if d == torch.bfloat16 and ((not is_amd()) or amd_min_version(device, min_rdna_version=4)) and should_use_bf16(device):
             return d

     return torch.float32
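Note: the VAE dtype gate now reads: use bf16 on non-AMD GPUs, or on AMD only from RDNA4 up, because large bf16 convolutions are extremely slow on older AMD parts (and fp32 is also slow on RDNA4, so bf16 is the lesser evil there). An illustrative condensation of that condition as a pure function, not the real vae_dtype signature:

import torch

def pick_vae_dtype(is_amd_gpu: bool, rdna_version: int, bf16_supported: bool) -> torch.dtype:
    # bf16 on non-AMD, or on AMD RDNA4 and newer; otherwise fall back to fp32.
    if bf16_supported and ((not is_amd_gpu) or rdna_version >= 4):
        return torch.bfloat16
    return torch.float32

print(pick_vae_dtype(is_amd_gpu=True, rdna_version=3, bf16_supported=True))  # torch.float32
print(pick_vae_dtype(is_amd_gpu=True, rdna_version=4, bf16_supported=True))  # torch.bfloat16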

comfy/text_encoders/llama.py

Lines changed: 2 additions & 1 deletion
@@ -140,11 +140,12 @@ def precompute_freqs_cis(head_dim, position_ids, theta, rope_dims=None, device=N


 def apply_rope(xq, xk, freqs_cis):
+    org_dtype = xq.dtype
     cos = freqs_cis[0]
     sin = freqs_cis[1]
     q_embed = (xq * cos) + (rotate_half(xq) * sin)
     k_embed = (xk * cos) + (rotate_half(xk) * sin)
-    return q_embed, k_embed, sin, cos
+    return q_embed.to(org_dtype), k_embed.to(org_dtype), sin, cos


 class LlamaRoPE(nn.Module):
     def __init__(self, config, device = None, dtype = None):
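Note on the cast: the cos/sin tables in freqs_cis can be stored at a higher precision than the query/key tensors, and multiplying promotes q_embed/k_embed to that dtype; recording org_dtype and casting the outputs back keeps them in the original precision. A small sketch of the promotion and the fix (shapes and dtypes are illustrative):

import torch

xq = torch.randn(1, 8, 16, 64, dtype=torch.float16)    # (batch, heads, seq, head_dim)
cos = torch.randn(1, 1, 16, 64, dtype=torch.float32)   # RoPE table kept in fp32

promoted = xq * cos
print(promoted.dtype)                 # torch.float32 -- type promotion upcast the product

restored = (xq * cos).to(xq.dtype)    # what the org_dtype bookkeeping restores
print(restored.dtype)                 # torch.float16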

comfy_extras/nodes_easycache.py

Lines changed: 6 additions & 1 deletion
@@ -162,7 +162,12 @@ def easycache_sample_wrapper(executor, *args, **kwargs):
             logging.info(f"{easycache.name} [verbose] - output_change_rates {len(output_change_rates)}: {output_change_rates}")
             logging.info(f"{easycache.name} [verbose] - approx_output_change_rates {len(approx_output_change_rates)}: {approx_output_change_rates}")
         total_steps = len(args[3])-1
-        logging.info(f"{easycache.name} - skipped {easycache.total_steps_skipped}/{total_steps} steps ({total_steps/(total_steps-easycache.total_steps_skipped):.2f}x speedup).")
+        # catch division by zero for log statement; sucks to crash after all sampling is done
+        try:
+            speedup = total_steps/(total_steps-easycache.total_steps_skipped)
+        except ZeroDivisionError:
+            speedup = 1.0
+        logging.info(f"{easycache.name} - skipped {easycache.total_steps_skipped}/{total_steps} steps ({speedup:.2f}x speedup).")
         easycache.reset()
         guider.model_options = orig_model_options
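Note: the division fails exactly when every step was skipped (total_steps == total_steps_skipped), which previously crashed after sampling had already finished; the patch falls back to 1.0 just so the summary line still prints. A trivial sketch of the failure mode and the guard:

total_steps = 20
total_steps_skipped = 20  # every step reused the cache

try:
    speedup = total_steps / (total_steps - total_steps_skipped)
except ZeroDivisionError:
    speedup = 1.0  # placeholder so the log line can still be formatted

print(f"skipped {total_steps_skipped}/{total_steps} steps ({speedup:.2f}x speedup).")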
