This provides BLAS acceleration using the ROCm cores of your AMD GPU. Make sure you have the ROCm toolkit installed and replace the `$GFX_NAME` value with your GPU architecture (for example, `gfx1030` for consumer RDNA2 cards).
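A minimal sketch of such an install command, assuming (as with the other backends) that `CMAKE_ARGS` is forwarded to CMake when the package is built through pip:

```bash
# Replace $GFX_NAME with your GPU architecture (e.g. gfx1030 for consumer RDNA2 cards).
CMAKE_ARGS="-DSD_HIPBLAS=ON -DAMDGPU_TARGETS=$GFX_NAME" pip install stable-diffusion-cpp-python
```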
Windows users should refer to [docs/hipBLAS_on_Windows.md](docs/hipBLAS_on_Windows.md) for a comprehensive guide and troubleshooting tips.
To upgrade and rebuild `stable-diffusion-cpp-python`, add the `--upgrade --force-reinstall --no-cache-dir` flags to the `pip install` command to ensure the package is rebuilt from source.
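For example (keep the same `CMAKE_ARGS` set for your backend when re-running it):

```bash
pip install stable-diffusion-cpp-python --upgrade --force-reinstall --no-cache-dir
```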
### Using Flash Attention
Enabling flash attention for the diffusion model reduces memory usage by an amount that varies with the model and resolution, for example:

- **Flux 768x768**: ~600 MB
- **SD2 768x768**: ~1400 MB

On most backends it slows generation down, but on CUDA it generally speeds it up as well. At the moment, it is only supported for some models and some backends (like `cpu`, `cuda/rocm` and `metal`).
Enable it by passing `diffusion_flash_attn=True` to the `StableDiffusion` class and watch the log for:
```log
[INFO] stable-diffusion.cpp:312 - Using flash attention in the diffusion model
```
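A minimal usage sketch — the module name, `model_path` value, and `txt_to_img` call below are illustrative assumptions and may need adjusting for your setup:

```python
from stable_diffusion_cpp import StableDiffusion

# Placeholder model path; point this at your own checkpoint.
sd = StableDiffusion(
    model_path="models/v1-5-pruned-emaonly.safetensors",
    diffusion_flash_attn=True,  # enable flash attention in the diffusion model
)

# Generate an image as usual; the INFO line above should appear in the log output.
images = sd.txt_to_img(prompt="a lighthouse at dusk, oil painting")
```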
For the hipBLAS build, what differs from the regular CPU build are the flags `-G "Ninja"`, `-DCMAKE_C_COMPILER=clang`, `-DCMAKE_CXX_COMPILER=clang++`, `-DSD_HIPBLAS=ON`, `-DGPU_TARGETS=gfx1100`, `-DAMDGPU_TARGETS=gfx1100`, `-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON`, and `-DCMAKE_POSITION_INDEPENDENT_CODE=ON`.
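As a sketch of how those flags might be combined (again assuming `CMAKE_ARGS` is forwarded to CMake by pip):

```bash
# Hypothetical invocation combining the flags listed above; adjust gfx1100 to your GPU target.
CMAKE_ARGS="-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ \
  -DSD_HIPBLAS=ON -DGPU_TARGETS=gfx1100 -DAMDGPU_TARGETS=gfx1100 \
  -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON" \
  pip install stable-diffusion-cpp-python
```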