Skip to content

Commit 6753a4e

Browse files
fix: performance fix with CMAKE_BUILD_TYPE set to Release
1 parent bfa7a22 commit 6753a4e

File tree

7 files changed

+33
-17
lines changed

7 files changed

+33
-17
lines changed

CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,14 @@ project(stable_diffusion_cpp)
55
option(STABLE_DIFFUSION_BUILD "Build stable-diffusion.cpp shared library and install alongside python package" ON)
66

77
if (STABLE_DIFFUSION_BUILD)
8+
# Ensure we build shared libraries (dlls)
89
set(BUILD_SHARED_LIBS "ON")
910
option(SD_BUILD_SHARED_LIBS "" "ON")
1011

12+
# Explicitly set the build type to Release (default `Debug` is slow)
13+
set(CMAKE_BUILD_TYPE "Release")
14+
option(CMAKE_BUILD_TYPE "" "Release")
15+
1116
add_subdirectory(vendor/stable-diffusion.cpp)
1217
install(
1318
TARGETS stable-diffusion

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,8 @@ CMAKE_ARGS="-DSD_CUBLAS=ON" pip install stable-diffusion-cpp-python
9393
<details>
9494
<summary>Using HIPBLAS (ROCm)</summary>
9595

96-
This provides BLAS acceleration using the ROCm cores of your AMD GPU. Make sure you have the ROCm toolkit installed.
97-
Windows users refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide.
96+
This provides BLAS acceleration using the ROCm cores of your AMD GPU. Make sure you have the ROCm toolkit installed and that you replace the `-DAMDGPU_TARGETS=` value with that of your GPU architecture.
97+
Windows users refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide and troubleshooting tips.
9898

9999
```bash
100100
CMAKE_ARGS="-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=gfx1101" pip install stable-diffusion-cpp-python

docs/hipBLAS_on_Windows.md

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,15 @@ set ninja=C:\Program Files\ninja\ninja.exe
5050
The differences from the regular CPU build are `-DSD_HIPBLAS=ON`,
5151
`-G "Ninja"`, `-DCMAKE_C_COMPILER=clang`, `-DCMAKE_CXX_COMPILER=clang++`, `-DAMDGPU_TARGETS=gfx1100`
5252

53+
Note:
54+
If you encounter an error such as the following:
55+
56+
```Commandline
57+
lld-link: error: undefined symbol
58+
```
59+
60+
Include the `-DGGML_OPENMP=OFF` argument in the CMake options to disable OpenMP, which, despite being marked as supported, is broken for ROCm on Windows. OpenMP may work with newer ROCm versions, but disabling it generally has no impact unless you are partially offloading a model or using NKVO.
61+
5362
> **Notice**: check the `clang` and `clang++` information:
5463
5564
```Commandline
@@ -75,15 +84,12 @@ InstalledDir: C:\Program Files\AMD\ROCm\5.5\bin
7584

7685
> **Notice**: `gfx1100` is the architecture of my GPU; change it to match your own GPU architecture. To find your architecture, see [LLVM Target](https://rocm.docs.amd.com/en/latest/release/windows_support.html#windows-supported-gpus)
7786
78-
My GPU is AMD Radeon™ RX 7900 XTX Graphics, so I set it to `gfx1100`.
87+
As an example, if you have an AMD Radeon™ RX 7900 XTX Graphics Card you would set it to `gfx1100`.
7988

80-
option:
89+
You can find the GPU architecture of your GPU in the [Accelerator and GPU hardware specifications](https://rocm.docs.amd.com/en/latest/reference/gpu-arch-specs.html) in the ROCm documentation.
8190

82-
```commandline
83-
mkdir build
84-
cd build
85-
cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=gfx1100
86-
cmake --build . --config Release
87-
```
91+
## Running stable-diffusion.cpp
92+
93+
You may also need to specify your device ID if your system has multiple GPUs (such as an integrated GPU) by setting: `$env:HIP_VISIBLE_DEVICES=1` (Replace "1" with the appropriate device ID for your setup).
8894

89-
If everything went OK, `build\bin\sd.exe` file should appear.
95+
In addition, it is recommended to set the `HSA_OVERRIDE_GFX_VERSION` environment variable to the version that matches your GPU architecture, for example (for `gfx1101`): `$env:HSA_OVERRIDE_GFX_VERSION=11.0.1`

stable_diffusion_cpp/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
from .stable_diffusion_cpp import *
22
from .stable_diffusion import *
33

4-
__version__ = "0.1.9"
4+
__version__ = "0.2.0"

stable_diffusion_cpp/_internals.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ def __init__(
6464
self.keep_control_net_cpu = keep_control_net_cpu
6565
self.keep_vae_on_cpu = keep_vae_on_cpu
6666
self.verbose = verbose
67+
6768
self._exit_stack = ExitStack()
6869

6970
self.model = None

stable_diffusion_cpp/stable_diffusion.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,12 @@ def __init__(
3232
lora_model_dir: str = "",
3333
embed_dir: str = "",
3434
stacked_id_embed_dir: str = "",
35-
vae_decode_only: bool = False,
35+
vae_decode_only: bool = True,
3636
vae_tiling: bool = False,
3737
free_params_immediately: bool = False,
3838
n_threads: int = -1,
3939
wtype: Union[str, GGMLType, int, float, None] = "default",
40-
rng_type: Union[str, RNGType, int, float, None] = "default",
40+
rng_type: Union[str, RNGType, int, float, None] = "cuda",
4141
schedule: Union[str, Schedule, int, float, None] = "default",
4242
keep_clip_on_cpu: bool = False,
4343
keep_control_net_cpu: bool = False,
@@ -71,7 +71,7 @@ def __init__(
7171
vae_decode_only: Process vae in decode only mode.
7272
vae_tiling: Process vae in tiles to reduce memory usage.
7373
free_params_immediately: Free parameters immediately after use.
74-
n_threads: Number of threads to use for generation.
74+
n_threads: Number of threads to use for generation (default: half the number of CPUs).
7575
wtype: The weight type (default: automatically determines the weight type of the model file).
7676
rng_type: Random number generator.
7777
schedule: Denoiser sigma schedule.
@@ -101,7 +101,7 @@ def __init__(
101101
self.vae_decode_only = vae_decode_only
102102
self.vae_tiling = vae_tiling
103103
self.free_params_immediately = free_params_immediately
104-
self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1) # Default to half the number of CPUs
104+
self.n_threads = n_threads
105105
self.wtype = wtype
106106
self.rng_type = rng_type
107107
self.schedule = schedule
@@ -110,6 +110,10 @@ def __init__(
110110
self.keep_vae_on_cpu = keep_vae_on_cpu
111111
self._stack = contextlib.ExitStack()
112112

113+
# Default to half the number of CPUs
114+
if n_threads <= 0:
115+
self.n_threads = max(multiprocessing.cpu_count() // 2, 1)
116+
113117
# =========== Logging ===========
114118

115119
self.verbose = verbose

stable_diffusion_cpp/stable_diffusion_cpp.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -301,7 +301,7 @@ class GGMLType(IntEnum):
301301
[
302302
ctypes.c_char_p, # model_path
303303
ctypes.c_char_p, # clip_l_path
304-
ctypes.c_char_p, # clip_g_path
304+
ctypes.c_char_p, # clip_g_path
305305
ctypes.c_char_p, # t5xxl_path
306306
ctypes.c_char_p, # diffusion_model_path
307307
ctypes.c_char_p, # vae_path

0 commit comments

Comments
 (0)