Commit 4fa6350
Merge 'master-323-2e9242e' into apg_merge_master
2 parents: 057abde + 2e9242e

39 files changed: +144158 −749 lines

.github/workflows/build.yml
Lines changed: 3 additions & 3 deletions

```diff
@@ -149,7 +149,7 @@ jobs:
     runs-on: windows-2025

     env:
-      VULKAN_VERSION: 1.3.261.1
+      VULKAN_VERSION: 1.4.328.1

     strategy:
       matrix:
@@ -199,9 +199,9 @@ jobs:
           version: 1.11.1
       - name: Install Vulkan SDK
         id: get_vulkan
-        if: ${{ matrix.build == 'vulkan' }}
+        if: ${{ matrix.build == 'vulkan' }} # https://sdk.lunarg.com/sdk/download/1.4.328.1/windows/vulkansdk-windows-X64-1.4.328.1.exe
         run: |
-          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
+          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
```
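The LunarG installer artifact was renamed from `VulkanSDK-<version>-Installer.exe` to `vulkansdk-windows-X64-<version>.exe`, so the old URL pattern no longer resolves for newer SDK versions. If this step breaks again, a quick sanity check of the current artifact name (a hypothetical diagnostic, not part of the workflow):

```
# Prints the HTTP status line; a 2xx/3xx response means the artifact name is still valid.
curl -sI "https://sdk.lunarg.com/sdk/download/1.4.328.1/windows/vulkansdk-windows-X64-1.4.328.1.exe" | head -n 1
```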

Dockerfile
Lines changed: 9 additions & 4 deletions

```diff
@@ -1,16 +1,21 @@
 ARG UBUNTU_VERSION=22.04

-FROM ubuntu:$UBUNTU_VERSION as build
+FROM ubuntu:$UBUNTU_VERSION AS build

-RUN apt-get update && apt-get install -y build-essential git cmake
+RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake

 WORKDIR /sd.cpp

 COPY . .

-RUN mkdir build && cd build && cmake .. && cmake --build . --config Release
+RUN cmake . -B ./build
+RUN cmake --build ./build --config Release --parallel

-FROM ubuntu:$UBUNTU_VERSION as runtime
+FROM ubuntu:$UBUNTU_VERSION AS runtime
+
+RUN apt-get update && \
+    apt-get install --yes --no-install-recommends libgomp1 && \
+    apt-get clean

 COPY --from=build /sd.cpp/build/bin/sd /sd
```
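For context, the runtime stage now installs `libgomp1`, presumably because the built binary links against OpenMP, which the bare Ubuntu base image does not ship. A minimal sketch of building and running the resulting image (the tag and mount paths are illustrative, and it assumes the container's entrypoint is the `/sd` binary):

```
# Build the image from the repository root; compilation happens in the `build` stage.
docker build -t sd.cpp .

# Run the runtime image with models and outputs mounted from the host.
docker run --rm -v "$PWD/models:/models" -v "$PWD/output:/output" sd.cpp [sd arguments...]
```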

README.md
Lines changed: 13 additions & 6 deletions

```diff
@@ -21,8 +21,10 @@ API and command-line option may change frequently.***
 - [SD3/SD3.5](./docs/sd3.md)
 - [Flux-dev/Flux-schnell](./docs/flux.md)
 - [Chroma](./docs/chroma.md)
+- [Qwen Image](./docs/qwen_image.md)
 - Image Edit Models
     - [FLUX.1-Kontext-dev](./docs/kontext.md)
+    - [Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md)
 - Video Models
     - [Wan2.1/Wan2.2](./docs/wan.md)
 - [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
```
````diff
@@ -125,13 +127,14 @@ cmake --build . --config Release

 ##### Using HipBLAS
 This provides BLAS acceleration using the ROCm cores of your AMD GPU. Make sure to have the ROCm toolkit installed.
+To build for a GPU architecture other than the one installed in your system, set `$GFX_NAME` manually to the desired architecture (replacing the first command below). This is also necessary if your GPU is not officially supported by ROCm; for example, set `$GFX_NAME` to `gfx1030` for consumer RDNA2 cards.

 Windows users: refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide.

 ```
-export GFX_NAME=$(rocminfo | grep -m 1 -E "gfx[^0]{1}" | sed -e 's/ *Name: *//' | awk '{$1=$1; print}' || echo "rocminfo missing")
-echo $GFX_NAME
-cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
+if command -v rocminfo; then export GFX_NAME=$(rocminfo | awk '/ *Name: +gfx[1-9]/ {print $2; exit}'); else echo "rocminfo missing!"; fi
+if [ -z "${GFX_NAME}" ]; then echo "Error: Couldn't detect GPU!"; else echo "Building for GPU: ${GFX_NAME}"; fi
+cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DAMDGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
 cmake --build . --config Release
 ```
````
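Concretely, the manual override mentioned in the new paragraph replaces only the detection line; for instance, for an RDNA2 consumer card (an illustrative snippet, not part of the README):

```
# Skip autodetection and target RDNA2 consumer GPUs explicitly.
export GFX_NAME=gfx1030
echo "Building for GPU: ${GFX_NAME}"
```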
```diff
@@ -284,7 +287,7 @@ usage: ./bin/sd [arguments]

 arguments:
   -h, --help                         show this help message and exit
-  -M, --mode [MODE]                  run mode, one of: [img_gen, vid_gen, convert], default: img_gen
+  -M, --mode [MODE]                  run mode, one of: [img_gen, vid_gen, upscale, convert], default: img_gen
   -t, --threads N                    number of threads to use during computation (default: -1)
                                      If threads <= 0, then threads will be set to the number of CPU physical cores
   --offload-to-cpu                   place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
```
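The new `upscale` mode makes the ESRGAN step runnable on its own rather than only as a post-step of `img_gen`. A hypothetical invocation, combining it with the `--upscale-model` flag documented below (the input-image flag and file names are assumptions):

```
./bin/sd -M upscale --upscale-model RealESRGAN_x4plus_anime_6B.pth -i input.png -o upscaled.png
```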
```diff
@@ -295,11 +298,13 @@ arguments:
   --clip_g                           path to the clip-g text encoder
   --clip_vision                      path to the clip-vision encoder
   --t5xxl                            path to the t5xxl text encoder
+  --qwen2vl                          path to the qwen2vl text encoder
+  --qwen2vl_vision                   path to the qwen2vl vit
   --vae [VAE]                        path to vae
   --taesd [TAESD_PATH]               path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
   --control-net [CONTROL_PATH]       path to control net model
   --embd-dir [EMBEDDING_PATH]        path to embeddings
-  --upscale-model [ESRGAN_PATH]      path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now
+  --upscale-model [ESRGAN_PATH]      path to esrgan model. For img_gen mode, upscale images after generation; only RealESRGAN_x4plus_anime_6B is supported for now
   --upscale-repeats                  Run the ESRGAN upscaler this many times (default 1)
   --type [TYPE]                      weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)
                                      If not specified, the default is the type of the weight file
```
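For a sense of how the new Qwen flags compose, a sketch of an image-generation call (file names are placeholders; see docs/qwen_image.md, linked above, for the authoritative command):

```
./bin/sd --diffusion-model qwen-image-q4_0.gguf \
    --qwen2vl qwen2.5-vl-7b.safetensors \
    --vae qwen-image-vae.safetensors \
    -p "a lovely cat" -o output.png
```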
```diff
@@ -448,6 +453,7 @@ These projects use `stable-diffusion.cpp` as a backend for their image generatio
 - [Local Diffusion](https://github.com/rmatif/Local-Diffusion)
 - [sd.cpp-webui](https://github.com/daniandtheweb/sd.cpp-webui)
 - [LocalAI](https://github.com/mudler/LocalAI)
+- [Neural-Pixel](https://github.com/Luiz-Alcantara/Neural-Pixel)

 ## Contributors
```
```diff
@@ -462,6 +468,7 @@ Thank you to all the people who have already contributed to stable-diffusion.cpp
 ## References

 - [ggml](https://github.com/ggerganov/ggml)
+- [diffusers](https://github.com/huggingface/diffusers)
 - [stable-diffusion](https://github.com/CompVis/stable-diffusion)
 - [sd3-ref](https://github.com/Stability-AI/sd3-ref)
 - [stable-diffusion-stability-ai](https://github.com/Stability-AI/stablediffusion)
```
```diff
@@ -472,4 +479,4 @@ Thank you to all the people who have already contributed to stable-diffusion.cpp
 - [generative-models](https://github.com/Stability-AI/generative-models/)
 - [PhotoMaker](https://github.com/TencentARC/PhotoMaker)
 - [Wan2.1](https://github.com/Wan-Video/Wan2.1)
-- [Wan2.2](https://github.com/Wan-Video/Wan2.2)
\ No newline at end of file
+- [Wan2.2](https://github.com/Wan-Video/Wan2.2)
```

assets/qwen/example.png
Binary file added (1.35 MB)

assets/qwen/qwen_image_edit.png
Binary file changed (457 KB → 415 KB)

clip.hpp
Lines changed: 7 additions & 7 deletions

```diff
@@ -6,7 +6,7 @@

 /*================================================== CLIPTokenizer ===================================================*/

-std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remove_lora(std::string text) {
+__STATIC_INLINE__ std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remove_lora(std::string text) {
     std::regex re("<lora:([^:]+):([^>]+)>");
     std::smatch matches;
     std::unordered_map<std::string, float> filename2multiplier;
@@ -31,7 +31,7 @@ std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remov
     return std::make_pair(filename2multiplier, text);
 }

-std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
+__STATIC_INLINE__ std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
     std::vector<std::pair<int, std::u32string>> byte_unicode_pairs;
     std::set<int> byte_set;
     for (int b = static_cast<int>('!'); b <= static_cast<int>('~'); ++b) {
```
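Marking these free functions `__STATIC_INLINE__` matters because clip.hpp is included from multiple translation units; without internal linkage, each inclusion would emit a duplicate symbol and break linking. The macro itself is presumably the usual wrapper, along these lines (a sketch; the real definition lives elsewhere in the codebase):

```cpp
// Sketch of the assumed macro: give header-defined functions internal
// linkage so each .cpp that includes the header gets its own copy.
#ifndef __STATIC_INLINE__
#define __STATIC_INLINE__ static inline
#endif
```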
```diff
@@ -553,12 +553,12 @@ class CLIPEmbeddings : public GGMLBlock {
     void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
         enum ggml_type token_wtype = GGML_TYPE_F32;
         if (!force_clip_f32) {
-            auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
-            if (tensor_type != tensor_types.end())
-                token_wtype = tensor_type->second;
+            token_wtype = get_type(prefix + "token_embedding.weight", tensor_types, GGML_TYPE_F32);
+            if (!support_get_rows(token_wtype)) {
+                token_wtype = GGML_TYPE_F32;
+            }
         }
-        enum ggml_type position_wtype = GGML_TYPE_F32;
-
+        enum ggml_type position_wtype = GGML_TYPE_F32;
         params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
         params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions);
     }
```
common.hpp
Lines changed: 39 additions & 5 deletions

```diff
@@ -177,7 +177,7 @@ class ResBlock : public GGMLBlock {
     }
 };

-class GEGLU : public GGMLBlock {
+class GEGLU : public UnaryBlock {
 protected:
     int64_t dim_in;
     int64_t dim_out;
@@ -216,23 +216,57 @@ class GEGLU : public GGMLBlock {
     }
 };

+class GELU : public UnaryBlock {
+public:
+    GELU(int64_t dim_in, int64_t dim_out, bool bias = true) {
+        blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out, bias));
+    }
+
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+        // x: [ne3, ne2, ne1, dim_in]
+        // return: [ne3, ne2, ne1, dim_out]
+        auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
+
+        x = proj->forward(ctx, x);
+        x = ggml_gelu_inplace(ctx, x);
+        return x;
+    }
+};
+
 class FeedForward : public GGMLBlock {
 public:
+    enum class Activation {
+        GEGLU,
+        GELU
+    };
     FeedForward(int64_t dim,
                 int64_t dim_out,
-                int64_t mult = 4) {
+                int64_t mult          = 4,
+                Activation activation = Activation::GEGLU,
+                bool precision_fix    = false) {
         int64_t inner_dim = dim * mult;
+        if (activation == Activation::GELU) {
+            blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GELU(dim, inner_dim));
+        } else {
+            blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GEGLU(dim, inner_dim));
+        }

-        blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GEGLU(dim, inner_dim));
         // net_1 is nn.Dropout(), skip for inference
-        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out));
+        float scale = 1.f;
+        if (precision_fix) {
+            scale = 1.f / 128.f;
+        }
+        // The purpose of the scale here is to prevent NaN issues in certain situations.
+        // For example, when using Vulkan without enabling force_prec_f32,
+        // or when using CUDA but the weights are k-quants.
+        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, false, scale));
     }

     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
         // x: [ne3, ne2, ne1, dim]
         // return: [ne3, ne2, ne1, dim_out]

-        auto net_0 = std::dynamic_pointer_cast<GEGLU>(blocks["net.0"]);
+        auto net_0 = std::dynamic_pointer_cast<UnaryBlock>(blocks["net.0"]);
         auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);

         x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim]
```