Support Flux 2 edit mode with reference images

Acly · Acly · commit 99f04fbd2184 · 2026-01-16T21:38:02.000+01:00
diff --git a/ai_diffusion/api.py b/ai_diffusion/api.py
@@ -111,6 +111,7 @@ class ConditioningInput:
     control: list[ControlInput] = field(default_factory=list)
     regions: list[RegionInput] = field(default_factory=list)
     language: str = ""
+    edit_reference: bool = False  # use input image as conditioning reference
 
 
 class InpaintMode(Enum):
diff --git a/ai_diffusion/control.py b/ai_diffusion/control.py
@@ -141,6 +141,7 @@ def _update_is_supported(self):
         is_supported = True
         if client := root.connection.client_if_connected:
             models = client.models.for_arch(self._model.arch)
+
             if self.mode.is_ip_adapter and models.arch in [Arch.illu, Arch.illu_v]:
                 resid = resource_id(ResourceKind.clip_vision, Arch.illu, "ip_adapter")
                 has_clip_vision = client.models.resources.get(resid, None) is not None
@@ -151,7 +152,7 @@ def _update_is_supported(self):
                     self.error_text = _("The server is missing the ClipVision model") + f" {search}"
                     is_supported = False
 
-            if self.mode.is_ip_adapter and models.arch.is_edit:
+            if self.mode.is_ip_adapter and models.arch.supports_edit:
                 is_supported = True  # Reference images are merged into the conditioning context
             elif self.mode.is_ip_adapter and models.ip_adapter.find(self.mode) is None:
                 search_path = resources.search_path(ResourceKind.ip_adapter, models.arch, self.mode)
diff --git a/ai_diffusion/model.py b/ai_diffusion/model.py
@@ -193,12 +193,11 @@ def _generate(self, queue_mode: QueueMode):
 
     def _prepare_workflow(self, dryrun=False):
         arch = self.arch
-        is_edit = arch.is_edit
         workflow_kind = WorkflowKind.generate
         strength = self.strength
         if arch is Arch.qwen_l:
             strength = 1.0
-        if strength < 1.0 or is_edit:
+        if strength < 1.0 or self.is_editing:
             workflow_kind = WorkflowKind.refine
         client = self._connection.client
         image = None
@@ -409,7 +408,7 @@ def generate_live(self):
     def _prepare_live_workflow(self):
         strength = self.live.strength
         workflow_kind = WorkflowKind.generate
-        if strength < 1.0 or self.arch.is_edit:
+        if strength < 1.0 or self.is_editing:
             workflow_kind = WorkflowKind.refine
         client = self._connection.client
         min_mask_size = 512 if self.arch is Arch.sd15 else 800
@@ -845,11 +844,11 @@ def set_style(self, style: Style):
             self._style_connection = style.changed.connect(self._handle_style_changed)
             self.style_changed.emit(style)
             self.modified.emit(self, "style")
-            self.edit_mode = self.edit_mode and self.edit_style is not None
+            self.edit_mode = self.edit_mode and self.can_edit
 
     def _handle_style_changed(self):
         self.style_changed.emit(self.style)
-        self.edit_mode = self.edit_mode and self.edit_style is not None
+        self.edit_mode = self.edit_mode and self.can_edit
 
     def generate_seed(self):
         self.seed = workflow.generate_seed()
@@ -879,6 +878,8 @@ def add_refs(control: list[ControlInput], layer_names: list[str]):
         for region, r_layers in zip(cond.regions, region_layers):
             add_refs(region.control, r_layers)
 
+        cond.edit_reference = self.is_editing
+
     def _performance_settings(self, client: Client):
         result = client.performance_settings
         if self.resolution_multiplier != 1.0:
@@ -950,14 +951,22 @@ def name(self):
     @property
     def edit_style(self) -> Style | None:
         style_arch = resolve_arch(self.style, self._connection.client_if_connected)
-        if style_arch.is_edit:
+        if style_arch.supports_edit:
             return self.style
         if style_id := self.style.linked_edit_style:
             if style := Styles.list().find(style_id):
                 if is_style_supported(style, self._connection.client_if_connected):
                     return style
         return None
 
+    @property
+    def can_edit(self):
+        return self.edit_style is not None
+
+    @property
+    def is_editing(self):
+        return self.arch.is_edit or (self.can_edit and self.edit_mode)
+
 
 class CustomInpaint(QObject, ObservableProperties):
     mode = Property(InpaintMode.automatic, persist=True)
@@ -1323,7 +1332,7 @@ def _prepare_input(self, canvas: Image | Extent, seed: int, time: int):
         m = self._model
 
         kind = WorkflowKind.generate
-        if m.strength < 1.0 or m.arch.is_edit:
+        if m.strength < 1.0 or m.is_editing:
             kind = WorkflowKind.refine
         bounds = Bounds(0, 0, *m.document.extent)
         conditioning, _ = process_regions(m.regions, bounds, self._model.layers.root, time=time)
diff --git a/ai_diffusion/presets/samplers.json b/ai_diffusion/presets/samplers.json
@@ -96,7 +96,7 @@
         "minimum_steps": 3,
         "cfg": 1.0
     },
-    "Flux2 - Euler": {
+    "Flux 2 - Euler": {
         "sampler": "euler",
         "scheduler": "flux2",
         "steps": 20,
diff --git a/ai_diffusion/resources.py b/ai_diffusion/resources.py
@@ -80,7 +80,7 @@ class Arch(Enum):
     sd3 = "SD 3"
     flux = "Flux"
     flux_k = "Flux Kontext"
-    flux2 = "Flux 2"
+    flux2 = "Flux 2 Klein 4B"
     illu = "Illustrious"
     illu_v = "Illustrious v-prediction"
     chroma = "Chroma"
@@ -108,7 +108,7 @@ def from_string(string: str, model_type: str = "eps", filename: str | None = Non
             return Arch.flux_k
         if string == "flux" or string == "flux-schnell":
             return Arch.flux
-        if string == "flux2":
+        if string == "flux2" and model_type == "klein-4b":
             return Arch.flux2
         if string == "illu":
             return Arch.illu
@@ -188,6 +188,10 @@ def supports_cfg(self):
     def is_edit(self):  # edit models make changes to input images
         return self in [Arch.flux_k, Arch.qwen_e, Arch.qwen_e_p, Arch.qwen_l]
 
+    @property
+    def supports_edit(self):  # includes text-to-image models that can also edit
+        return self.is_edit or self is Arch.flux2
+
     @property
     def is_sdxl_like(self):
         # illustrious technically uses sdxl architecture, but has a separate ecosystem
@@ -749,7 +753,7 @@ def is_required(kind: ResourceKind, arch: Arch, identifier: ControlMode | Upscal
     resource_id(ResourceKind.text_encoder, Arch.all, "clip_g"): ["clip_g"],
     resource_id(ResourceKind.text_encoder, Arch.all, "t5"): ["t5xxl_fp16", "t5xxl_fp8_e4m3fn", "t5xxl_fp8_e4m3fn_scaled", "t5-v1_1-xxl", "t5"],
     resource_id(ResourceKind.text_encoder, Arch.all, "qwen"): ["qwen_2.5_vl_7b", "qwen_2", "qwen-2", "qwen"],
-    resource_id(ResourceKind.text_encoder, Arch.all, "qwen_3"): ["qwen_3_4b", "qwen_3", "qwen-3"],
+    resource_id(ResourceKind.text_encoder, Arch.all, "qwen_3"): ["qwen_3_4b", "qwen3-4b", "qwen_3", "qwen-3"],
     resource_id(ResourceKind.vae, Arch.sd15, "default"): ["vae-ft-mse-840000-ema"],
     resource_id(ResourceKind.vae, Arch.sdxl, "default"): ["sdxl_vae"],
     resource_id(ResourceKind.vae, Arch.illu, "default"): ["sdxl_vae"],
diff --git a/ai_diffusion/styles/flux2-klein.json b/ai_diffusion/styles/flux2-klein.json
@@ -16,10 +16,10 @@
     "v_prediction_zsnr": false,
     "self_attention_guidance": false,
     "preferred_resolution": 0,
-    "sampler": "Flux2 - Euler",
+    "sampler": "Flux 2 - Euler",
     "sampler_steps": 4,
     "cfg_scale": 1.0,
-    "live_sampler": "Flux2 - Euler",
+    "live_sampler": "Flux 2 - Euler",
     "live_sampler_steps": 4,
     "live_cfg_scale": 1.0
 }
diff --git a/ai_diffusion/ui/generation.py b/ai_diffusion/ui/generation.py
@@ -900,18 +900,18 @@ def show_inpaint_menu(self):
                 menu = self.generate_region_menu
             elif self.model.document.selection_bounds:
                 menu = self.inpaint_menu
-                menu.actions()[-2].setEnabled(self.model.edit_style is not None)
+                menu.actions()[-2].setEnabled(self.model.can_edit)
             else:
                 menu = self.generate_menu
         else:
             if self.model.region_only:
                 menu = self.refine_region_menu
             elif self.model.document.selection_bounds:
                 menu = self.refine_selection_menu
-                menu.actions()[1].setEnabled(self.model.edit_style is not None)
+                menu.actions()[1].setEnabled(self.model.can_edit)
             else:
                 menu = self.refine_menu
-                menu.actions()[1].setEnabled(self.model.edit_style is not None)
+                menu.actions()[1].setEnabled(self.model.can_edit)
 
         menu.setFixedWidth(width)
         menu.exec_(self.generate_button.mapToGlobal(pos))
@@ -943,10 +943,8 @@ def update_generate_options(self):
         has_regions = len(regions) > 0
         has_active_region = regions.is_linked(self.model.layers.active)
         is_region_only = has_regions and has_active_region and self.model.region_only
-        is_edit = arch.is_edit
-        can_switch_edit = (
-            self.model.style.linked_edit_style != "" and self.model.edit_style is not None
-        )
+        is_edit = self.model.is_editing
+        can_switch_edit = self.model.can_edit and not arch.is_edit
         self.region_mask_button.setVisible(has_regions)
         self.region_mask_button.setEnabled(has_active_region)
         self.region_mask_button.setIcon(_region_mask_button_icons[is_region_only])
diff --git a/ai_diffusion/workflow.py b/ai_diffusion/workflow.py
@@ -397,6 +397,7 @@ class Conditioning:
     control: list[Control] = field(default_factory=list)
     regions: list[Region] = field(default_factory=list)
     style_prompt: str = ""
+    edit_reference: bool = False
 
     @staticmethod
     def from_input(i: ConditioningInput):
@@ -406,6 +407,7 @@ def from_input(i: ConditioningInput):
             [Control.from_input(c) for c in i.control],
             [Region.from_input(r, idx, i.language) for idx, r in enumerate(i.regions)],
             i.style,
+            i.edit_reference,
         )
 
     def copy(self):
@@ -415,6 +417,7 @@ def copy(self):
             [copy(c) for c in self.control],
             [r.copy() for r in self.regions],
             self.style_prompt,
+            self.edit_reference,
         )
 
     def downscale(self, original: Extent, target: Extent):
@@ -614,8 +617,8 @@ def apply_ip_adapter(
     models: ModelDict,
     mask: Output | None = None,
 ):
-    if models.arch.is_flux_like or models.arch.is_qwen_like:
-        return model  # No IP-adapter for Flux or Qwen, using Style model instead
+    if not (models.arch is Arch.sd15 or models.arch.is_sdxl_like):
+        return model
 
     models = models.ip_adapter
 
@@ -682,35 +685,39 @@ def apply_regional_ip_adapter(
     return model
 
 
-def apply_edit_conditioning(
+def apply_reference_conditioning(
     w: ComfyWorkflow,
-    cond: Output,
-    input_image: Output,
-    input_latent: Output,
-    control_layers: list[Control],
+    positive: Output,
+    input_image: Output | None,
+    input_latent: Output | None,
+    cond: Conditioning,
     vae: Output,
     arch: Arch,
     tiled_vae: bool,
 ):
-    if not arch.is_edit:
-        return cond
-
-    extra_input = [c.image for c in control_layers if c.mode.is_ip_adapter]
-    if len(extra_input) == 0:
-        return w.reference_latent(cond, input_latent)
-
-    if arch == Arch.qwen_e_p:
-        extra_images = [i.load(w) for i in extra_input]
-        cond = w.reference_latent(cond, input_latent)
-        for extra_image in extra_images:
-            latent = vae_encode(w, vae, extra_image, tiled_vae)
-            cond = w.reference_latent(cond, latent)
-        return cond
-    else:
-        input = w.image_stitch([input_image] + [i.load(w) for i in extra_input])
-        latent = vae_encode(w, vae, input, tiled_vae)
-        cond = w.reference_latent(cond, latent)
-        return cond
+    if not arch.supports_edit:
+        return positive
+
+    extra_input = (c.image for c in cond.all_control if c.mode.is_ip_adapter)
+    extra_images = [i.load(w) for i in extra_input]
+    match arch:
+        case Arch.flux2 | Arch.qwen_e_p:
+            if cond.edit_reference and input_latent:
+                positive = w.reference_latent(positive, input_latent)
+            for extra_image in extra_images:
+                latent = vae_encode(w, vae, extra_image, tiled_vae)
+                positive = w.reference_latent(positive, latent)
+        case Arch.flux_k | Arch.qwen_e:
+            if len(extra_images) > 0:
+                if cond.edit_reference and input_image:
+                    extra_images.insert(0, input_image)
+                input = w.image_stitch(extra_images)
+                latent = vae_encode(w, vae, input, tiled_vae)
+                positive = w.reference_latent(positive, latent)
+            elif cond.edit_reference and input_latent:
+                positive = w.reference_latent(positive, input_latent)
+
+    return positive
 
 
 def scale(
@@ -796,7 +803,9 @@ def scale_refine_and_decode(
     model, positive, negative = apply_control(
         w, model, positive, negative, cond.all_control, extent.desired, vae, models
     )
-    positive = apply_edit_conditioning(w, positive, upscale, latent, [], vae, arch, tiled_vae)
+    positive = apply_reference_conditioning(
+        w, positive, upscale, latent, cond, vae, arch, tiled_vae
+    )
     result = w.sampler_custom_advanced(model, positive, negative, latent, arch, **params)
     image = vae_decode(w, vae, result, tiled_vae)
     return image
@@ -834,6 +843,9 @@ def generate(
     model, positive, negative = apply_control(
         w, model, positive, negative, cond.all_control, extent.initial, vae, models
     )
+    positive = apply_reference_conditioning(
+        w, positive, None, None, cond, vae, models.arch, checkpoint.tiled_vae
+    )
     sample_params = _sampler_params(sampling, extent.initial)
     out_latent = w.sampler_custom_advanced(
         model, positive, negative, latent, models.arch, **sample_params
@@ -1092,8 +1104,8 @@ def refine(
     model, positive, negative = apply_control(
         w, model, positive, negative, cond.all_control, extent.desired, vae, models
     )
-    positive = apply_edit_conditioning(
-        w, positive, in_image, latent, cond.all_control, vae, models.arch, checkpoint.tiled_vae
+    positive = apply_reference_conditioning(
+        w, positive, in_image, latent, cond, vae, models.arch, checkpoint.tiled_vae
     )
     sampler_params = _sampler_params(sampling, extent.desired)
     sampler = w.sampler_custom_advanced(
@@ -1147,8 +1159,8 @@ def refine_region(
         inpaint_model = w.apply_fooocus_inpaint(model, inpaint_patch, latent_inpaint)
     else:
         latent = vae_encode(w, vae, in_image, checkpoint.tiled_vae)
-        positive = apply_edit_conditioning(
-            w, positive, in_image, latent, cond.all_control, vae, models.arch, checkpoint.tiled_vae
+        positive = apply_reference_conditioning(
+            w, positive, in_image, latent, cond, vae, models.arch, checkpoint.tiled_vae
         )
         latent = w.set_latent_noise_mask(latent, initial_mask)
         inpaint_model = model
@@ -1321,8 +1333,8 @@ def tiled_region(region: Region, index: int, tile_bounds: Bounds):
 
         latent = vae_encode(w, vae, tile_image, checkpoint.tiled_vae)
         latent = w.set_latent_noise_mask(latent, tile_mask)
-        positive = apply_edit_conditioning(
-            w, positive, tile_image, latent, control, vae, models.arch, checkpoint.tiled_vae
+        positive = apply_reference_conditioning(
+            w, positive, tile_image, latent, tile_cond, vae, models.arch, checkpoint.tiled_vae
         )
         sampler_params = _sampler_params(sampling, layout.bounds(i).extent)
         sampler = w.sampler_custom_advanced(
@@ -1443,7 +1455,7 @@ def prepare_prompts(
         "negative_prompt": cond.negative,
     }
     models = style.get_models([])
-    layer_replace = "Picture {}" if arch is Arch.qwen_e_p else ""
+    layer_replace = "Picture {}" if arch in (Arch.qwen_e_p, Arch.flux2) else ""
 
     cond.style = style.style_prompt
     cond.positive = strip_prompt_comments(cond.positive)
diff --git a/tests/config.py b/tests/config.py
@@ -20,5 +20,6 @@
     Arch.sdxl: "RealVisXL_V5.0_fp16.safetensors",
     Arch.flux: "svdq-int4_r32-flux.1-krea-dev.safetensors",
     Arch.flux_k: "svdq-int4_r32-flux.1-kontext-dev.safetensors",
+    Arch.flux2: "flux-2-klein-4b.safetensors",
     Arch.zimage: "z_image_turbo_bf16.safetensors",
 }
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -34,6 +34,8 @@ def order(item: pytest.Item):
             return 11
         elif "cloud" in item.name:
             return 10
+        elif "flux2" in item.name:
+            return 4
         elif "flux" in item.name:
             return 3
         elif "sdxl" in item.name:
diff --git a/tests/test_workflow.py b/tests/test_workflow.py

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| `@@ -20,5 +20,6 @@` | | |
| `20` | `20` | `Arch.sdxl: "RealVisXL_V5.0_fp16.safetensors",` |
| `21` | `21` | `Arch.flux: "svdq-int4_r32-flux.1-krea-dev.safetensors",` |
| `22` | `22` | `Arch.flux_k: "svdq-int4_r32-flux.1-kontext-dev.safetensors",` |
| | `23` | `+ Arch.flux2: "flux-2-klein-4b.safetensors",` |
| `23` | `24` | `Arch.zimage: "z_image_turbo_bf16.safetensors",` |
| `24` | `25` | `}` |