Commit 4fa6350
Merge 'master-323-2e9242e' into apg_merge_master
2 parents: 057abde + 2e9242e

39 files changed: +144158 −749 lines

.github/workflows/build.yml
Lines changed: 3 additions & 3 deletions

```diff
@@ -149,7 +149,7 @@ jobs:
     runs-on: windows-2025

     env:
-      VULKAN_VERSION: 1.3.261.1
+      VULKAN_VERSION: 1.4.328.1

     strategy:
       matrix:
@@ -199,9 +199,9 @@ jobs:
           version: 1.11.1
       - name: Install Vulkan SDK
         id: get_vulkan
-        if: ${{ matrix.build == 'vulkan' }}
+        if: ${{ matrix.build == 'vulkan' }} # https://sdk.lunarg.com/sdk/download/1.4.328.1/windows/vulkansdk-windows-X64-1.4.328.1.exe
         run: |
-          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
+          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
```
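The LunarG installer artifact was renamed from `VulkanSDK-<version>-Installer.exe` to `vulkansdk-windows-X64-<version>.exe`, so the old URL pattern no longer resolves for newer SDK versions. If this step breaks again, a quick sanity check of the current artifact name (a hypothetical diagnostic, not part of the workflow):

```
# Prints the HTTP status line; a 2xx/3xx response means the artifact name is still valid.
curl -sI "https://sdk.lunarg.com/sdk/download/1.4.328.1/windows/vulkansdk-windows-X64-1.4.328.1.exe" | head -n 1
```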

Dockerfile
Lines changed: 9 additions & 4 deletions

```diff
@@ -1,16 +1,21 @@
 ARG UBUNTU_VERSION=22.04

-FROM ubuntu:$UBUNTU_VERSION as build
+FROM ubuntu:$UBUNTU_VERSION AS build

-RUN apt-get update && apt-get install -y build-essential git cmake
+RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake

 WORKDIR /sd.cpp

 COPY . .

-RUN mkdir build && cd build && cmake .. && cmake --build . --config Release
+RUN cmake . -B ./build
+RUN cmake --build ./build --config Release --parallel

-FROM ubuntu:$UBUNTU_VERSION as runtime
+FROM ubuntu:$UBUNTU_VERSION AS runtime
+
+RUN apt-get update && \
+    apt-get install --yes --no-install-recommends libgomp1 && \
+    apt-get clean

 COPY --from=build /sd.cpp/build/bin/sd /sd
```
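For context, the runtime stage now installs `libgomp1`, presumably because the built binary links against OpenMP, which the bare Ubuntu base image does not ship. A minimal sketch of building and running the resulting image (the tag and mount paths are illustrative, and it assumes the container's entrypoint is the `/sd` binary):

```
# Build the image from the repository root; compilation happens in the `build` stage.
docker build -t sd.cpp .

# Run the runtime image with models and outputs mounted from the host.
docker run --rm -v "$PWD/models:/models" -v "$PWD/output:/output" sd.cpp [sd arguments...]
```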

README.md
Lines changed: 13 additions & 6 deletions

```diff
@@ -21,8 +21,10 @@ API and command-line option may change frequently.***
 - [SD3/SD3.5](./docs/sd3.md)
 - [Flux-dev/Flux-schnell](./docs/flux.md)
 - [Chroma](./docs/chroma.md)
+- [Qwen Image](./docs/qwen_image.md)
 - Image Edit Models
     - [FLUX.1-Kontext-dev](./docs/kontext.md)
+    - [Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md)
 - Video Models
     - [Wan2.1/Wan2.2](./docs/wan.md)
 - [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
```
````diff
@@ -125,13 +127,14 @@ cmake --build . --config Release

 ##### Using HipBLAS
 This provides BLAS acceleration using the ROCm cores of your AMD GPU. Make sure to have the ROCm toolkit installed.
+To build for a GPU architecture other than the one installed in your system, set `$GFX_NAME` manually to the desired architecture (replacing the first command below). This is also necessary if your GPU is not officially supported by ROCm; for example, set `$GFX_NAME` to `gfx1030` for consumer RDNA2 cards.

 Windows users: refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide.

 ```
-export GFX_NAME=$(rocminfo | grep -m 1 -E "gfx[^0]{1}" | sed -e 's/ *Name: *//' | awk '{$1=$1; print}' || echo "rocminfo missing")
-echo $GFX_NAME
-cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
+if command -v rocminfo; then export GFX_NAME=$(rocminfo | awk '/ *Name: +gfx[1-9]/ {print $2; exit}'); else echo "rocminfo missing!"; fi
+if [ -z "${GFX_NAME}" ]; then echo "Error: Couldn't detect GPU!"; else echo "Building for GPU: ${GFX_NAME}"; fi
+cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DAMDGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
 cmake --build . --config Release
 ```
````
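Concretely, the manual override mentioned in the new paragraph replaces only the detection line; for instance, for an RDNA2 consumer card (an illustrative snippet, not part of the README):

```
# Skip autodetection and target RDNA2 consumer GPUs explicitly.
export GFX_NAME=gfx1030
echo "Building for GPU: ${GFX_NAME}"
```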
```diff
@@ -284,7 +287,7 @@ usage: ./bin/sd [arguments]

 arguments:
   -h, --help                         show this help message and exit
-  -M, --mode [MODE]                  run mode, one of: [img_gen, vid_gen, convert], default: img_gen
+  -M, --mode [MODE]                  run mode, one of: [img_gen, vid_gen, upscale, convert], default: img_gen
   -t, --threads N                    number of threads to use during computation (default: -1)
                                      If threads <= 0, then threads will be set to the number of CPU physical cores
   --offload-to-cpu                   place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
```
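The new `upscale` mode makes the ESRGAN step runnable on its own rather than only as a post-step of `img_gen`. A hypothetical invocation, combining it with the `--upscale-model` flag documented below (the input-image flag and file names are assumptions):

```
./bin/sd -M upscale --upscale-model RealESRGAN_x4plus_anime_6B.pth -i input.png -o upscaled.png
```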
```diff
@@ -295,11 +298,13 @@ arguments:
   --clip_g                           path to the clip-g text encoder
   --clip_vision                      path to the clip-vision encoder
   --t5xxl                            path to the t5xxl text encoder
+  --qwen2vl                          path to the qwen2vl text encoder
+  --qwen2vl_vision                   path to the qwen2vl vit
   --vae [VAE]                        path to vae
   --taesd [TAESD_PATH]               path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
   --control-net [CONTROL_PATH]       path to control net model
   --embd-dir [EMBEDDING_PATH]        path to embeddings
-  --upscale-model [ESRGAN_PATH]      path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now
+  --upscale-model [ESRGAN_PATH]      path to esrgan model. For img_gen mode, upscale images after generation; only RealESRGAN_x4plus_anime_6B is supported for now
   --upscale-repeats                  Run the ESRGAN upscaler this many times (default 1)
   --type [TYPE]                      weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)
                                      If not specified, the default is the type of the weight file
```
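For a sense of how the new Qwen flags compose, a sketch of an image-generation call (file names are placeholders; see docs/qwen_image.md, linked above, for the authoritative command):

```
./bin/sd --diffusion-model qwen-image-q4_0.gguf \
    --qwen2vl qwen2.5-vl-7b.safetensors \
    --vae qwen-image-vae.safetensors \
    -p "a lovely cat" -o output.png
```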
```diff
@@ -448,6 +453,7 @@ These projects use `stable-diffusion.cpp` as a backend for their image generatio
 - [Local Diffusion](https://github.com/rmatif/Local-Diffusion)
 - [sd.cpp-webui](https://github.com/daniandtheweb/sd.cpp-webui)
 - [LocalAI](https://github.com/mudler/LocalAI)
+- [Neural-Pixel](https://github.com/Luiz-Alcantara/Neural-Pixel)

 ## Contributors
```
```diff
@@ -462,6 +468,7 @@ Thank you to all the people who have already contributed to stable-diffusion.cpp
 ## References

 - [ggml](https://github.com/ggerganov/ggml)
+- [diffusers](https://github.com/huggingface/diffusers)
 - [stable-diffusion](https://github.com/CompVis/stable-diffusion)
 - [sd3-ref](https://github.com/Stability-AI/sd3-ref)
 - [stable-diffusion-stability-ai](https://github.com/Stability-AI/stablediffusion)
```
```diff
@@ -472,4 +479,4 @@ Thank you to all the people who have already contributed to stable-diffusion.cpp
 - [generative-models](https://github.com/Stability-AI/generative-models/)
 - [PhotoMaker](https://github.com/TencentARC/PhotoMaker)
 - [Wan2.1](https://github.com/Wan-Video/Wan2.1)
-- [Wan2.2](https://github.com/Wan-Video/Wan2.2)
\ No newline at end of file
+- [Wan2.2](https://github.com/Wan-Video/Wan2.2)
```

assets/qwen/example.png
Binary file added (1.35 MB)

assets/qwen/qwen_image_edit.png
Binary file changed (457 KB → 415 KB)

clip.hpp
Lines changed: 7 additions & 7 deletions

```diff
@@ -6,7 +6,7 @@

 /*================================================== CLIPTokenizer ===================================================*/

-std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remove_lora(std::string text) {
+__STATIC_INLINE__ std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remove_lora(std::string text) {
     std::regex re("<lora:([^:]+):([^>]+)>");
     std::smatch matches;
     std::unordered_map<std::string, float> filename2multiplier;
@@ -31,7 +31,7 @@ std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remov
     return std::make_pair(filename2multiplier, text);
 }

-std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
+__STATIC_INLINE__ std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
     std::vector<std::pair<int, std::u32string>> byte_unicode_pairs;
     std::set<int> byte_set;
     for (int b = static_cast<int>('!'); b <= static_cast<int>('~'); ++b) {
```
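Marking these free functions `__STATIC_INLINE__` matters because clip.hpp is included from multiple translation units; without internal linkage, each inclusion would emit a duplicate symbol and break linking. The macro itself is presumably the usual wrapper, along these lines (a sketch; the real definition lives elsewhere in the codebase):

```cpp
// Sketch of the assumed macro: give header-defined functions internal
// linkage so each .cpp that includes the header gets its own copy.
#ifndef __STATIC_INLINE__
#define __STATIC_INLINE__ static inline
#endif
```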
```diff
@@ -553,12 +553,12 @@ class CLIPEmbeddings : public GGMLBlock {
     void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
         enum ggml_type token_wtype = GGML_TYPE_F32;
         if (!force_clip_f32) {
-            auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
-            if (tensor_type != tensor_types.end())
-                token_wtype = tensor_type->second;
+            token_wtype = get_type(prefix + "token_embedding.weight", tensor_types, GGML_TYPE_F32);
+            if (!support_get_rows(token_wtype)) {
+                token_wtype = GGML_TYPE_F32;
+            }
         }
-        enum ggml_type position_wtype = GGML_TYPE_F32;
-
+        enum ggml_type position_wtype = GGML_TYPE_F32;
         params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
         params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions);
     }
```
common.hpp
Lines changed: 39 additions & 5 deletions

```diff
@@ -177,7 +177,7 @@ class ResBlock : public GGMLBlock {
     }
 };

-class GEGLU : public GGMLBlock {
+class GEGLU : public UnaryBlock {
 protected:
     int64_t dim_in;
     int64_t dim_out;
@@ -216,23 +216,57 @@ class GEGLU : public GGMLBlock {
     }
 };

+class GELU : public UnaryBlock {
+public:
+    GELU(int64_t dim_in, int64_t dim_out, bool bias = true) {
+        blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out, bias));
+    }
+
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+        // x: [ne3, ne2, ne1, dim_in]
+        // return: [ne3, ne2, ne1, dim_out]
+        auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
+
+        x = proj->forward(ctx, x);
+        x = ggml_gelu_inplace(ctx, x);
+        return x;
+    }
+};
+
 class FeedForward : public GGMLBlock {
 public:
+    enum class Activation {
+        GEGLU,
+        GELU
+    };
     FeedForward(int64_t dim,
                 int64_t dim_out,
-                int64_t mult = 4) {
+                int64_t mult          = 4,
+                Activation activation = Activation::GEGLU,
+                bool precision_fix    = false) {
         int64_t inner_dim = dim * mult;
+        if (activation == Activation::GELU) {
+            blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GELU(dim, inner_dim));
+        } else {
+            blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GEGLU(dim, inner_dim));
+        }

-        blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GEGLU(dim, inner_dim));
         // net_1 is nn.Dropout(), skip for inference
-        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out));
+        float scale = 1.f;
+        if (precision_fix) {
+            scale = 1.f / 128.f;
+        }
+        // The purpose of the scale here is to prevent NaN issues in certain situations.
+        // For example, when using Vulkan without enabling force_prec_f32,
+        // or when using CUDA but the weights are k-quants.
+        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, false, scale));
     }

     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
         // x: [ne3, ne2, ne1, dim]
         // return: [ne3, ne2, ne1, dim_out]

-        auto net_0 = std::dynamic_pointer_cast<GEGLU>(blocks["net.0"]);
+        auto net_0 = std::dynamic_pointer_cast<UnaryBlock>(blocks["net.0"]);
         auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);

         x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim]
```