2424
2525#include " ggml-alloc.h"
2626#include " ggml-backend.h"
27- #include " ggml-cpu.h"
2827#include " ggml.h"
28+ #include " ggml_extend_backend.hpp"
2929
3030#include " model.h"
3131#include " tensor.hpp"
3232
33- #ifdef SD_USE_CUDA
34- #include " ggml-cuda.h"
35- #endif
36-
37- #ifdef SD_USE_METAL
38- #include " ggml-metal.h"
39- #endif
40-
41- #ifdef SD_USE_VULKAN
42- #include " ggml-vulkan.h"
43- #endif
44-
45- #ifdef SD_USE_OPENCL
46- #include " ggml-opencl.h"
47- #endif
48-
49- #ifdef SD_USE_SYCL
50- #include " ggml-sycl.h"
51- #endif
52-
5333#include " rng.hpp"
5434#include " tensor_ggml.hpp"
5535#include " util.h"
@@ -91,6 +71,45 @@ __STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const cha
9171 }
9272}
9373
74+ __STATIC_INLINE__ bool backend_name_exists (std::string name) {
75+ ggml_backend_load_all_once ();
76+ const int device_count = ggml_backend_dev_count ();
77+ for (int i = 0 ; i < device_count; i++) {
78+ if (name == ggml_backend_dev_name (ggml_backend_dev_get (i))) {
79+ return true ;
80+ }
81+ }
82+ return false ;
83+ }
84+
85+ __STATIC_INLINE__ std::string sanitize_backend_name (std::string name) {
86+ if (name == " " || backend_name_exists (name)) {
87+ return name;
88+ } else {
89+ LOG_WARN (" Backend %s not found, using default backend" , name.c_str ());
90+ return " " ;
91+ }
92+ }
93+
94+ __STATIC_INLINE__ std::string get_default_backend_name () {
95+ ggml_backend_load_all_once ();
96+ // should pick the same backend as ggml_backend_init_best
97+ ggml_backend_dev_t dev = ggml_backend_dev_by_type (GGML_BACKEND_DEVICE_TYPE_GPU);
98+ dev = dev ? dev : ggml_backend_dev_by_type (GGML_BACKEND_DEVICE_TYPE_IGPU);
99+ dev = dev ? dev : ggml_backend_dev_by_type (GGML_BACKEND_DEVICE_TYPE_CPU);
100+ return ggml_backend_dev_name (dev);
101+ }
102+
103+ __STATIC_INLINE__ ggml_backend_t init_named_backend (std::string name = " " ) {
104+ ggml_backend_load_all_once ();
105+ LOG_DEBUG (" Initializing backend: %s" , name.c_str ());
106+ if (name.empty ()) {
107+ return ggml_backend_init_best ();
108+ } else {
109+ return ggml_backend_init_by_name (name.c_str (), nullptr );
110+ }
111+ }
112+
94113static_assert (GGML_MAX_NAME >= 128 , " GGML_MAX_NAME must be at least 128" );
95114
96115// n-mode tensor-matrix product
@@ -1286,25 +1305,25 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_ones_like(ggml_context* ctx,
12861305 return ggml_ext_ones (ctx, x->ne [0 ], x->ne [1 ], x->ne [2 ], x->ne [3 ]);
12871306}
12881307
1289- __STATIC_INLINE__ ggml_tensor* ggml_ext_cast_f32 (ggml_context* ctx, ggml_tensor* a) {
1290- #ifdef SD_USE_VULKAN
1291- auto zero_index = ggml_get_tensor (ctx, " ggml_runner_build_in_tensor:zero_int" );
1292- auto out = ggml_reshape_1d (ctx, a, ggml_nelements (a));
1293- out = ggml_get_rows (ctx, out, zero_index);
1294- out = ggml_reshape (ctx, out, a);
1295- // auto out = ggml_cast(ctx, a, GGML_TYPE_F32);
1296- return out;
1297- #else
1298- auto out = ggml_reshape_2d (ctx, a, 1 , ggml_nelements (a));
1299- ggml_tensor* one = ggml_ext_ones (ctx, 1 , 1 , 1 , 1 ); // [1,]
1300- if (ggml_is_transposed (out)) {
1301- out = ggml_mul_mat (ctx, one, out);
1308+ __STATIC_INLINE__ ggml_tensor* ggml_ext_cast_f32 (ggml_context* ctx, ggml_backend_t backend, ggml_tensor* a) {
1309+ if (sd_backend_is (backend, " Vulkan" )) {
1310+ auto zero_index = ggml_get_tensor (ctx, " ggml_runner_build_in_tensor:zero_int" );
1311+ auto out = ggml_reshape_1d (ctx, a, ggml_nelements (a));
1312+ out = ggml_get_rows (ctx, out, zero_index);
1313+ out = ggml_reshape (ctx, out, a);
1314+ // auto out = ggml_cast(ctx, a, GGML_TYPE_F32);
1315+ return out;
13021316 } else {
1303- out = ggml_mul_mat (ctx, out, one);
1317+ auto out = ggml_reshape_2d (ctx, a, 1 , ggml_nelements (a));
1318+ ggml_tensor* one = ggml_ext_ones (ctx, 1 , 1 , 1 , 1 ); // [1,]
1319+ if (ggml_is_transposed (out)) {
1320+ out = ggml_mul_mat (ctx, one, out);
1321+ } else {
1322+ out = ggml_mul_mat (ctx, out, one);
1323+ }
1324+ out = ggml_reshape (ctx, out, a);
1325+ return out;
13041326 }
1305- out = ggml_reshape (ctx, out, a);
1306- #endif
1307- return out;
13081327}
13091328
13101329// q: [N, L_q, C(n_head*d_head)] or [N*n_head, L_q, d_head]
@@ -1496,16 +1515,14 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_group_norm(ggml_context* ctx,
14961515}
14971516
14981517__STATIC_INLINE__ void ggml_ext_backend_tensor_get_and_sync (ggml_backend_t backend, const ggml_tensor* tensor, void * data, size_t offset, size_t size) {
1499- # if defined(SD_USE_CUDA ) || defined(SD_USE_SYCL)
1500- if ( !ggml_backend_is_cpu (backend)) {
1518+ if (( sd_backend_is (backend, " ROCm " ) || sd_backend_is (backend, " CUDA " ) || sd_backend_is (backend, " SYCL " )) &&
1519+ !ggml_backend_is_cpu (backend)) {
15011520 ggml_backend_tensor_get_async (backend, tensor, data, offset, size);
15021521 ggml_backend_synchronize (backend);
1503- } else {
1504- ggml_backend_tensor_get (tensor, data, offset, size);
1522+ return ;
15051523 }
1506- # else
1524+
15071525 ggml_backend_tensor_get (tensor, data, offset, size);
1508- #endif
15091526}
15101527
15111528__STATIC_INLINE__ float ggml_ext_backend_tensor_get_f32 (ggml_tensor* tensor) {
@@ -1664,14 +1681,15 @@ struct WeightAdapter {
16641681 float scale = 1 .f;
16651682 } conv2d;
16661683 };
1667- virtual ggml_tensor* patch_weight (ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name) = 0;
1684+ virtual ggml_tensor* patch_weight (ggml_context* ctx, ggml_backend_t backend, ggml_tensor* weight, const std::string& weight_name) = 0;
16681685 virtual ggml_tensor* forward_with_lora (ggml_context* ctx,
1686+ ggml_backend_t backend,
16691687 ggml_tensor* x,
16701688 ggml_tensor* w,
16711689 ggml_tensor* b,
16721690 const std::string& prefix,
1673- ForwardParams forward_params) = 0;
1674- virtual size_t get_extra_graph_size () = 0;
1691+ ForwardParams forward_params) = 0;
1692+ virtual size_t get_extra_graph_size () = 0;
16751693};
16761694
16771695struct GGMLRunnerContext {
@@ -2192,6 +2210,14 @@ struct GGMLRunner {
21922210 void set_weight_adapter (const std::shared_ptr<WeightAdapter>& adapter) {
21932211 weight_adapter = adapter;
21942212 }
2213+
    // Accessor for the backend handle this runner computes graphs on.
    ggml_backend_t get_runtime_backend() {
        return runtime_backend;
    }
2217+
    // Accessor for the backend handle that holds this runner's parameters
    // (may differ from the runtime backend — confirm against runner setup).
    ggml_backend_t get_params_backend() {
        return params_backend;
    }
21952221};
21962222
21972223class GGMLBlock {
@@ -2336,6 +2362,14 @@ class Linear : public UnaryBlock {
23362362 force_prec_f32(force_prec_f32),
23372363 scale(scale) {}
23382364
    // Override the output scale applied in forward().
    void set_scale(float scale_) {
        scale = scale_;
    }
2368+
    // Force (or stop forcing) f32 precision for this layer's matmul in forward().
    void set_force_prec_f32(bool force_prec_f32_) {
        force_prec_f32 = force_prec_f32_;
    }
2372+
23392373 ggml_tensor* forward (GGMLRunnerContext* ctx, ggml_tensor* x) {
23402374 ggml_tensor* w = params[" weight" ];
23412375 ggml_tensor* b = nullptr ;
@@ -2347,7 +2381,7 @@ class Linear : public UnaryBlock {
23472381 forward_params.op_type = WeightAdapter::ForwardParams::op_type_t ::OP_LINEAR;
23482382 forward_params.linear .force_prec_f32 = force_prec_f32;
23492383 forward_params.linear .scale = scale;
2350- return ctx->weight_adapter ->forward_with_lora (ctx->ggml_ctx , x, w, b, prefix, forward_params);
2384+ return ctx->weight_adapter ->forward_with_lora (ctx->ggml_ctx , ctx-> backend , x, w, b, prefix, forward_params);
23512385 }
23522386 return ggml_ext_linear (ctx->ggml_ctx , x, w, b, force_prec_f32, scale);
23532387 }
@@ -2463,7 +2497,7 @@ class Conv2d : public UnaryBlock {
24632497 forward_params.conv2d .circular_x = ctx->circular_x_enabled ;
24642498 forward_params.conv2d .circular_y = ctx->circular_y_enabled ;
24652499 forward_params.conv2d .scale = scale;
2466- return ctx->weight_adapter ->forward_with_lora (ctx->ggml_ctx , x, w, b, prefix, forward_params);
2500+ return ctx->weight_adapter ->forward_with_lora (ctx->ggml_ctx , ctx-> backend , x, w, b, prefix, forward_params);
24672501 }
24682502 return ggml_ext_conv_2d (ctx->ggml_ctx ,
24692503 x,
@@ -2527,15 +2561,15 @@ class Conv3d : public UnaryBlock {
25272561 ggml_tensor* w = params[" weight" ];
25282562 ggml_tensor* b = nullptr ;
25292563 if (ctx->weight_adapter ) {
2530- w = ctx->weight_adapter ->patch_weight (ctx->ggml_ctx , w, prefix + " weight" );
2564+ w = ctx->weight_adapter ->patch_weight (ctx->ggml_ctx , ctx-> backend , w, prefix + " weight" );
25312565 if (w->type != GGML_TYPE_F16) {
25322566 w = ggml_cast (ctx->ggml_ctx , w, GGML_TYPE_F16);
25332567 }
25342568 }
25352569 if (bias) {
25362570 b = params[" bias" ];
25372571 if (ctx->weight_adapter ) {
2538- b = ctx->weight_adapter ->patch_weight (ctx->ggml_ctx , b, prefix + " bias" );
2572+ b = ctx->weight_adapter ->patch_weight (ctx->ggml_ctx , ctx-> backend , b, prefix + " bias" );
25392573 }
25402574 }
25412575 return ggml_ext_conv_3d (ctx->ggml_ctx , x, w, b, in_channels,
@@ -2582,12 +2616,12 @@ class LayerNorm : public UnaryBlock {
25822616 if (elementwise_affine) {
25832617 w = params[" weight" ];
25842618 if (ctx->weight_adapter ) {
2585- w = ctx->weight_adapter ->patch_weight (ctx->ggml_ctx , w, prefix + " weight" );
2619+ w = ctx->weight_adapter ->patch_weight (ctx->ggml_ctx , ctx-> backend , w, prefix + " weight" );
25862620 }
25872621 if (bias) {
25882622 b = params[" bias" ];
25892623 if (ctx->weight_adapter ) {
2590- b = ctx->weight_adapter ->patch_weight (ctx->ggml_ctx , b, prefix + " bias" );
2624+ b = ctx->weight_adapter ->patch_weight (ctx->ggml_ctx , ctx-> backend , b, prefix + " bias" );
25912625 }
25922626 }
25932627 }
@@ -2630,8 +2664,8 @@ class GroupNorm : public GGMLBlock {
26302664 w = params[" weight" ];
26312665 b = params[" bias" ];
26322666 if (ctx->weight_adapter ) {
2633- w = ctx->weight_adapter ->patch_weight (ctx->ggml_ctx , w, prefix + " weight" );
2634- b = ctx->weight_adapter ->patch_weight (ctx->ggml_ctx , b, prefix + " bias" );
2667+ w = ctx->weight_adapter ->patch_weight (ctx->ggml_ctx , ctx-> backend , w, prefix + " weight" );
2668+ b = ctx->weight_adapter ->patch_weight (ctx->ggml_ctx , ctx-> backend , b, prefix + " bias" );
26352669 }
26362670 }
26372671 return ggml_ext_group_norm (ctx->ggml_ctx , x, w, b, num_groups);
@@ -2665,7 +2699,7 @@ class RMSNorm : public UnaryBlock {
26652699 ggml_tensor* forward (GGMLRunnerContext* ctx, ggml_tensor* x) {
26662700 ggml_tensor* w = params[" weight" ];
26672701 if (ctx->weight_adapter ) {
2668- w = ctx->weight_adapter ->patch_weight (ctx->ggml_ctx , w, prefix + " weight" );
2702+ w = ctx->weight_adapter ->patch_weight (ctx->ggml_ctx , ctx-> backend , w, prefix + " weight" );
26692703 }
26702704 x = ggml_rms_norm (ctx->ggml_ctx , x, eps);
26712705 x = ggml_mul_inplace (ctx->ggml_ctx , x, w);
@@ -2748,6 +2782,7 @@ class MultiheadAttention : public GGMLBlock {
27482782
27492783__STATIC_INLINE__ ggml_tensor* ggml_ext_lokr_forward (
27502784 ggml_context* ctx,
2785+ ggml_backend_t backend,
27512786 ggml_tensor* h, // Input: [q, batch] or [W, H, q, batch]
27522787 ggml_tensor* w1, // Outer C (Full rank)
27532788 ggml_tensor* w1a, // Outer A (Low rank part 1)
@@ -2778,29 +2813,29 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_lokr_forward(
27782813 int merge_batch_uq = batch;
27792814 int merge_batch_vp = batch;
27802815
2781- #if SD_USE_VULKAN
2782- if (batch > 1 ) {
2783- // no access to backend here, worst case is slightly worse perfs for other backends when built alongside Vulkan backend
2784- int max_batch = 65535 ;
2785- int max_batch_uq = max_batch / uq;
2786- merge_batch_uq = 1 ;
2787- for (int i = max_batch_uq; i > 0 ; i--) {
2788- if (batch % i == 0 ) {
2789- merge_batch_uq = i;
2790- break ;
2816+ if (sd_backend_is (backend, " Vulkan" )) {
2817+ if (batch > 1 ) {
2818+ // no access to backend here, worst case is slightly worse perfs for other backends when built alongside Vulkan backend
2819+ int max_batch = 65535 ;
2820+ int max_batch_uq = max_batch / uq;
2821+ merge_batch_uq = 1 ;
2822+ for (int i = max_batch_uq; i > 0 ; i--) {
2823+ if (batch % i == 0 ) {
2824+ merge_batch_uq = i;
2825+ break ;
2826+ }
27912827 }
2792- }
27932828
2794- int max_batch_vp = max_batch / vp;
2795- merge_batch_vp = 1 ;
2796- for (int i = max_batch_vp; i > 0 ; i--) {
2797- if (batch % i == 0 ) {
2798- merge_batch_vp = i;
2799- break ;
2829+ int max_batch_vp = max_batch / vp;
2830+ merge_batch_vp = 1 ;
2831+ for (int i = max_batch_vp; i > 0 ; i--) {
2832+ if (batch % i == 0 ) {
2833+ merge_batch_vp = i;
2834+ break ;
2835+ }
28002836 }
28012837 }
28022838 }
2803- #endif
28042839
28052840 ggml_tensor* h_split = ggml_reshape_3d (ctx, h, vq, uq * merge_batch_uq, batch / merge_batch_uq);
28062841 if (w2 != NULL ) {
0 commit comments