fix: performance fix with CMAKE_BUILD_TYPE set to Release

william-murray1204 · william-murray1204 · commit 6753a4ecec31 · 2024-10-30T02:23:39.000+11:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -5,9 +5,14 @@ project(stable_diffusion_cpp)
 option(STABLE_DIFFUSION_BUILD "Build stable-diffusion.cpp shared library and install alongside python package" ON)
 
 if (STABLE_DIFFUSION_BUILD)
+    # Ensure we build shared libraries (dlls)
     set(BUILD_SHARED_LIBS "ON")
     option(SD_BUILD_SHARED_LIBS "" "ON")
 
+    # Explicitly set the build type to Release (default `Debug` is slow)
+    set(CMAKE_BUILD_TYPE "Release")
+    option(CMAKE_BUILD_TYPE "" "Release")
+
     add_subdirectory(vendor/stable-diffusion.cpp)
     install(
         TARGETS stable-diffusion
diff --git a/README.md b/README.md
@@ -93,8 +93,8 @@ CMAKE_ARGS="-DSD_CUBLAS=ON" pip install stable-diffusion-cpp-python
 <details>
 <summary>Using HIPBLAS (ROCm)</summary>
 
-This provides BLAS acceleration using the ROCm cores of your AMD GPU. Make sure you have the ROCm toolkit installed.
-Windows users refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide.
+This provides BLAS acceleration using the ROCm cores of your AMD GPU. Make sure you have the ROCm toolkit installed and that you replace the `-DAMDGPU_TARGETS=` value with that of your GPU architecture.
+Windows users should refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide and troubleshooting tips.
 
 ```bash
 CMAKE_ARGS="-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=gfx1101" pip install stable-diffusion-cpp-python
diff --git a/docs/hipBLAS_on_Windows.md b/docs/hipBLAS_on_Windows.md
@@ -50,6 +50,15 @@ set ninja=C:\Program Files\ninja\ninja.exe
 The thing different from the regular CPU build is `-DSD_HIPBLAS=ON` ,
 `-G "Ninja"`, `-DCMAKE_C_COMPILER=clang`, `-DCMAKE_CXX_COMPILER=clang++`, `-DAMDGPU_TARGETS=gfx1100`
 
+Note:
+If you encounter an error such as the following:
+
+```Commandline
+lld-link: error: undefined symbol
+```
+
+Include the `-DGGML_OPENMP=OFF` argument in the CMake options to disable OpenMP, which, despite being marked as supported, is broken for ROCm on Windows. OpenMP may work with newer ROCm versions, but disabling it generally has no impact unless you are partially offloading a model or using NKVO.
+
 > **Notice**: check the `clang` and `clang++` information:
 
 ```Commandline
@@ -75,15 +84,12 @@ InstalledDir: C:\Program Files\AMD\ROCm\5.5\bin
 
 > **Notice** that the `gfx1100` is the GPU architecture of my GPU, you can change it to your GPU architecture. Click here to see your architecture [LLVM Target](https://rocm.docs.amd.com/en/latest/release/windows_support.html#windows-supported-gpus)
 
-My GPU is AMD Radeon™ RX 7900 XTX Graphics, so I set it to `gfx1100`.
+As an example, if you have an AMD Radeon™ RX 7900 XTX graphics card, you would set it to `gfx1100`.
 
-option:
+You can find your GPU's architecture in the [Accelerator and GPU hardware specifications](https://rocm.docs.amd.com/en/latest/reference/gpu-arch-specs.html) page of the ROCm documentation.
 
-```commandline
-mkdir build
-cd build
-cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=gfx1100
-cmake --build . --config Release
-```
+## Running stable-diffusion.cpp
+
+If your system has multiple GPUs (for example, an integrated GPU), you may also need to specify your device ID by setting `$env:HIP_VISIBLE_DEVICES=1` (replace "1" with the appropriate device ID for your setup).
 
-If everything went OK, `build\bin\sd.exe` file should appear.
+In addition, it is recommended to set the `HSA_OVERRIDE_GFX_VERSION` environment variable to the version matching your GPU architecture, for example: `$env:HSA_OVERRIDE_GFX_VERSION=11.0.1`
diff --git a/stable_diffusion_cpp/__init__.py b/stable_diffusion_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .stable_diffusion_cpp import *
 from .stable_diffusion import *
 
-__version__ = "0.1.9"
+__version__ = "0.2.0"
diff --git a/stable_diffusion_cpp/_internals.py b/stable_diffusion_cpp/_internals.py
@@ -64,6 +64,7 @@ def __init__(
         self.keep_control_net_cpu = keep_control_net_cpu
         self.keep_vae_on_cpu = keep_vae_on_cpu
         self.verbose = verbose
+
         self._exit_stack = ExitStack()
 
         self.model = None
diff --git a/stable_diffusion_cpp/stable_diffusion.py b/stable_diffusion_cpp/stable_diffusion.py
@@ -32,12 +32,12 @@ def __init__(
         lora_model_dir: str = "",
         embed_dir: str = "",
         stacked_id_embed_dir: str = "",
-        vae_decode_only: bool = False,
+        vae_decode_only: bool = True,
         vae_tiling: bool = False,
         free_params_immediately: bool = False,
         n_threads: int = -1,
         wtype: Union[str, GGMLType, int, float, None] = "default",
-        rng_type: Union[str, RNGType, int, float, None] = "default",
+        rng_type: Union[str, RNGType, int, float, None] = "cuda",
         schedule: Union[str, Schedule, int, float, None] = "default",
         keep_clip_on_cpu: bool = False,
         keep_control_net_cpu: bool = False,
@@ -71,7 +71,7 @@ def __init__(
             vae_decode_only: Process vae in decode only mode.
             vae_tiling: Process vae in tiles to reduce memory usage.
             free_params_immediately: Free parameters immediately after use.
-            n_threads: Number of threads to use for generation.
+            n_threads: Number of threads to use for generation (default: half the number of CPUs).
             wtype: The weight type (default: automatically determines the weight type of the model file).
             rng_type: Random number generator.
             schedule: Denoiser sigma schedule.
@@ -101,7 +101,7 @@ def __init__(
         self.vae_decode_only = vae_decode_only
         self.vae_tiling = vae_tiling
         self.free_params_immediately = free_params_immediately
-        self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1)  # Default to half the number of CPUs
+        self.n_threads = n_threads
         self.wtype = wtype
         self.rng_type = rng_type
         self.schedule = schedule
@@ -110,6 +110,10 @@ def __init__(
         self.keep_vae_on_cpu = keep_vae_on_cpu
         self._stack = contextlib.ExitStack()
 
+        # Default to half the number of CPUs
+        if n_threads <= 0:
+            self.n_threads = max(multiprocessing.cpu_count() // 2, 1)
+
         # =========== Logging ===========
 
         self.verbose = verbose
diff --git a/stable_diffusion_cpp/stable_diffusion_cpp.py b/stable_diffusion_cpp/stable_diffusion_cpp.py
@@ -301,7 +301,7 @@ class GGMLType(IntEnum):
     [
         ctypes.c_char_p,  # model_path
         ctypes.c_char_p,  # clip_l_path
-        ctypes.c_char_p, # clip_g_path
+        ctypes.c_char_p,  # clip_g_path
         ctypes.c_char_p,  # t5xxl_path
         ctypes.c_char_p,  # diffusion_model_path
         ctypes.c_char_p,  # vae_path

Original file line number	Diff line number	Diff line change
`@@ -301,7 +301,7 @@ class GGMLType(IntEnum):`
`301`	`301`	`[`
`302`	`302`	`ctypes.c_char_p, # model_path`
`303`	`303`	`ctypes.c_char_p, # clip_l_path`
`304`		`- ctypes.c_char_p, # clip_g_path`
	`304`	`+ ctypes.c_char_p, # clip_g_path`
`305`	`305`	`ctypes.c_char_p, # t5xxl_path`
`306`	`306`	`ctypes.c_char_p, # diffusion_model_path`
`307`	`307`	`ctypes.c_char_p, # vae_path`