support non-square tiles

wbruna · wbruna · commit ff6127c4cb12 · 2025-06-17T18:09:03.000-03:00
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
@@ -635,7 +635,10 @@ sd_tiling_calc_tiles(int &num_tiles_dim, float& tile_overlap_factor_dim, int sma
 }
 
 // Tiling
-__STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const int scale, const int tile_size, const float tile_overlap_factor, on_tile_process on_processing) {
+__STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input, ggml_tensor* output, const int scale,
+                                            const int p_tile_size_x, const int p_tile_size_y,
+                                            const float tile_overlap_factor, on_tile_process on_processing) {
+
     output = ggml_set_f32(output, 0);
 
     int input_width   = (int)input->ne[0];
@@ -658,25 +661,25 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const
 
     int num_tiles_x;
     float tile_overlap_factor_x;
-    sd_tiling_calc_tiles(num_tiles_x, tile_overlap_factor_x, small_width, tile_size, tile_overlap_factor);
+    sd_tiling_calc_tiles(num_tiles_x, tile_overlap_factor_x, small_width, p_tile_size_x, tile_overlap_factor);
 
     int num_tiles_y;
     float tile_overlap_factor_y;
-    sd_tiling_calc_tiles(num_tiles_y, tile_overlap_factor_y, small_height, tile_size, tile_overlap_factor);
+    sd_tiling_calc_tiles(num_tiles_y, tile_overlap_factor_y, small_height, p_tile_size_y, tile_overlap_factor);
 
     LOG_DEBUG("num tiles : %d, %d ", num_tiles_x, num_tiles_y);
     LOG_DEBUG("optimal overlap : %f, %f (targeting %f)", tile_overlap_factor_x, tile_overlap_factor_y, tile_overlap_factor);
 
     GGML_ASSERT(input_width % 2 == 0 && input_height % 2 == 0 && output_width % 2 == 0 && output_height % 2 == 0);  // should be multiple of 2
 
-    int tile_overlap_x     = (int32_t)(tile_size * tile_overlap_factor_x);
-    int non_tile_overlap_x = tile_size - tile_overlap_x;
+    int tile_overlap_x     = (int32_t)(p_tile_size_x * tile_overlap_factor_x);
+    int non_tile_overlap_x = p_tile_size_x - tile_overlap_x;
 
-    int tile_overlap_y     = (int32_t)(tile_size * tile_overlap_factor_y);
-    int non_tile_overlap_y = tile_size - tile_overlap_y;
+    int tile_overlap_y     = (int32_t)(p_tile_size_y * tile_overlap_factor_y);
+    int non_tile_overlap_y = p_tile_size_y - tile_overlap_y;
 
-    int tile_size_x = tile_size < small_width ? tile_size : small_width;
-    int tile_size_y = tile_size < small_height ? tile_size : small_height;
+    int tile_size_x = p_tile_size_x < small_width ? p_tile_size_x : small_width;
+    int tile_size_y = p_tile_size_y < small_height ? p_tile_size_y : small_height;
 
     int input_tile_size_x  = tile_size_x;
     int input_tile_size_y  = tile_size_y;
@@ -765,6 +768,11 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const
     ggml_free(tiles_ctx);
 }
 
+__STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const int scale,  // square-tile entry point; keeps the pre-existing signature
+    const int tile_size, const float tile_overlap_factor, on_tile_process on_processing) {  // tile_size is used for both the x and y tile dimensions
+    sd_tiling_non_square(input, output, scale, tile_size, tile_size, tile_overlap_factor, on_processing);  // forward all arguments, passing tile_size twice (x, y)
+}
+
 __STATIC_INLINE__ struct ggml_tensor* ggml_group_norm_32(struct ggml_context* ctx,
                                                          struct ggml_tensor* a) {
     const float eps = 1e-6f;  // default eps parameter
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
@@ -1427,13 +1427,62 @@ class StableDiffusionGGML {
                                                  x->ne[3]);  // channels
         int64_t t0          = ggml_time_ms();
 
-        int tile_size = 32;
+        int tile_size_x = 32;
+        int tile_size_y = 32;
         // TODO: arg instead of env?
         const char* SD_TILE_SIZE = getenv("SD_TILE_SIZE");
         if (SD_TILE_SIZE != nullptr) {
+            // format is AxB, or just A (equivalent to AxA)
+            // A and B can be integers (tile size) or floating point
+            // floating point <= 1 means fraction of the latent dimension
+            // floating point > 1 means number of tiles in that dimension
+            // a single number gets applied to both dimensions
             std::string sd_tile_size_str = SD_TILE_SIZE;
+            size_t x_pos = sd_tile_size_str.find('x');
             try {
-                tile_size = std::stoi(sd_tile_size_str);
+                int tmp_x = tile_size_x, tmp_y = tile_size_y;
+                if (x_pos != std::string::npos) {
+                    std::string tile_x_str = sd_tile_size_str.substr(0, x_pos);
+                    std::string tile_y_str = sd_tile_size_str.substr(x_pos + 1);
+                    if (tile_x_str.find('.') != std::string::npos) {
+                        float tile_factor = std::stof(tile_x_str);
+                        if (tile_factor > 0.0) {
+                            if (tile_factor > 1.0)
+                                tile_factor = 1.0 / tile_factor;
+                            tmp_x = (W / (decode ? 1 : 8)) * tile_factor;
+                        }
+                    }
+                    else {
+                        tmp_x = std::stoi(tile_x_str);
+                    }
+                    if (tile_y_str.find('.') != std::string::npos) {
+                        float tile_factor = std::stof(tile_y_str);
+                        if (tile_factor > 0.0) {
+                            if (tile_factor > 1.0)
+                                tile_factor = 1.0 / tile_factor;
+                            tmp_y = (H / (decode ? 1 : 8)) * tile_factor;
+                        }
+                    }
+                    else {
+                        tmp_y = std::stoi(tile_y_str);
+                    }
+                }
+                else {
+                    if (sd_tile_size_str.find('.') != std::string::npos) {
+                        float tile_factor = std::stof(sd_tile_size_str);
+                        if (tile_factor > 0) {
+                            if (tile_factor > 1.0)
+                                tile_factor = 1.0 / tile_factor;
+                            tmp_x = (W / (decode ? 1 : 8)) * tile_factor;
+                            tmp_y = (H / (decode ? 1 : 8)) * tile_factor;
+                        }
+                    }
+                    else {
+                        tmp_x = tmp_y = std::stoi(sd_tile_size_str);
+                    }
+                }
+                tile_size_x = tmp_x;
+                tile_size_y = tmp_y;
             } catch (const std::invalid_argument&) {
                 LOG_WARN("Invalid");
             } catch (const std::out_of_range&) {
@@ -1443,7 +1492,8 @@ class StableDiffusionGGML {
         if(!decode){
             // TODO: also use and arg for this one?
             // to keep the compute buffer size consistent
-            tile_size*=1.30539;
+            tile_size_x*=1.30539;
+            tile_size_y*=1.30539;
         }
         if (!use_tiny_autoencoder) {
             if (decode) {
@@ -1452,11 +1502,14 @@ class StableDiffusionGGML {
                 ggml_tensor_scale_input(x);
             }
             if (vae_tiling) {
+                if (SD_TILE_SIZE != nullptr) {
+                    LOG_INFO("VAE Tile size: %dx%d", tile_size_x, tile_size_y);
+                }
                 // split latent in 32x32 tiles and compute in several steps
                 auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
                     first_stage_model->compute(n_threads, in, decode, &out);
                 };
-                sd_tiling(x, result, 8, tile_size, 0.5f, on_tiling);
+                sd_tiling_non_square(x, result, 8, tile_size_x, tile_size_y, 0.5f, on_tiling);
             } else {
                 first_stage_model->compute(n_threads, x, decode, &result);
             }