Skip to content

Commit ff6127c

Browse files
committed
support non-square tiles
1 parent a27ff2e commit ff6127c

File tree

2 files changed

+74
-13
lines changed

2 files changed

+74
-13
lines changed

ggml_extend.hpp

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -635,7 +635,10 @@ sd_tiling_calc_tiles(int &num_tiles_dim, float& tile_overlap_factor_dim, int sma
635635
}
636636

637637
// Tiling
638-
__STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const int scale, const int tile_size, const float tile_overlap_factor, on_tile_process on_processing) {
638+
__STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input, ggml_tensor* output, const int scale,
639+
const int p_tile_size_x, const int p_tile_size_y,
640+
const float tile_overlap_factor, on_tile_process on_processing) {
641+
639642
output = ggml_set_f32(output, 0);
640643

641644
int input_width = (int)input->ne[0];
@@ -658,25 +661,25 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const
658661

659662
int num_tiles_x;
660663
float tile_overlap_factor_x;
661-
sd_tiling_calc_tiles(num_tiles_x, tile_overlap_factor_x, small_width, tile_size, tile_overlap_factor);
664+
sd_tiling_calc_tiles(num_tiles_x, tile_overlap_factor_x, small_width, p_tile_size_x, tile_overlap_factor);
662665

663666
int num_tiles_y;
664667
float tile_overlap_factor_y;
665-
sd_tiling_calc_tiles(num_tiles_y, tile_overlap_factor_y, small_height, tile_size, tile_overlap_factor);
668+
sd_tiling_calc_tiles(num_tiles_y, tile_overlap_factor_y, small_height, p_tile_size_y, tile_overlap_factor);
666669

667670
LOG_DEBUG("num tiles : %d, %d ", num_tiles_x, num_tiles_y);
668671
LOG_DEBUG("optimal overlap : %f, %f (targeting %f)", tile_overlap_factor_x, tile_overlap_factor_y, tile_overlap_factor);
669672

670673
GGML_ASSERT(input_width % 2 == 0 && input_height % 2 == 0 && output_width % 2 == 0 && output_height % 2 == 0); // should be multiple of 2
671674

672-
int tile_overlap_x = (int32_t)(tile_size * tile_overlap_factor_x);
673-
int non_tile_overlap_x = tile_size - tile_overlap_x;
675+
int tile_overlap_x = (int32_t)(p_tile_size_x * tile_overlap_factor_x);
676+
int non_tile_overlap_x = p_tile_size_x - tile_overlap_x;
674677

675-
int tile_overlap_y = (int32_t)(tile_size * tile_overlap_factor_y);
676-
int non_tile_overlap_y = tile_size - tile_overlap_y;
678+
int tile_overlap_y = (int32_t)(p_tile_size_y * tile_overlap_factor_y);
679+
int non_tile_overlap_y = p_tile_size_y - tile_overlap_y;
677680

678-
int tile_size_x = tile_size < small_width ? tile_size : small_width;
679-
int tile_size_y = tile_size < small_height ? tile_size : small_height;
681+
int tile_size_x = p_tile_size_x < small_width ? p_tile_size_x : small_width;
682+
int tile_size_y = p_tile_size_y < small_height ? p_tile_size_y : small_height;
680683

681684
int input_tile_size_x = tile_size_x;
682685
int input_tile_size_y = tile_size_y;
@@ -765,6 +768,11 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const
765768
ggml_free(tiles_ctx);
766769
}
767770

771+
// Square-tile convenience wrapper kept for existing callers: forwards every argument to
// sd_tiling_non_square, passing tile_size as both the x and the y tile size.
__STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const int scale,
772+
const int tile_size, const float tile_overlap_factor, on_tile_process on_processing) {
773+
sd_tiling_non_square(input, output, scale, tile_size, tile_size, tile_overlap_factor, on_processing);
774+
}
775+
768776
__STATIC_INLINE__ struct ggml_tensor* ggml_group_norm_32(struct ggml_context* ctx,
769777
struct ggml_tensor* a) {
770778
const float eps = 1e-6f; // default eps parameter

stable-diffusion.cpp

Lines changed: 57 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1427,13 +1427,62 @@ class StableDiffusionGGML {
14271427
x->ne[3]); // channels
14281428
int64_t t0 = ggml_time_ms();
14291429

1430-
int tile_size = 32;
1430+
int tile_size_x = 32;
1431+
int tile_size_y = 32;
14311432
// TODO: arg instead of env?
14321433
const char* SD_TILE_SIZE = getenv("SD_TILE_SIZE");
14331434
if (SD_TILE_SIZE != nullptr) {
1435+
// format is AxB, or just A (equivalent to AxA)
1436+
// A and B can be integers (tile size) or floating point (any value containing a '.')
1437+
// floating point <= 1 means fraction of the latent dimension
1438+
// floating point > 1 means number of tiles in that dimension
1439+
// a single number gets applied to both dimensions
14341440
std::string sd_tile_size_str = SD_TILE_SIZE;
1441+
size_t x_pos = sd_tile_size_str.find('x');
14351442
try {
1436-
tile_size = std::stoi(sd_tile_size_str);
1443+
int tmp_x = tile_size_x, tmp_y = tile_size_y;
1444+
if (x_pos != std::string::npos) {
1445+
std::string tile_x_str = sd_tile_size_str.substr(0, x_pos);
1446+
std::string tile_y_str = sd_tile_size_str.substr(x_pos + 1);
1447+
if (tile_x_str.find('.') != std::string::npos) {
1448+
float tile_factor = std::stof(tile_x_str);
1449+
if (tile_factor > 0.0) {
1450+
if (tile_factor > 1.0)
1451+
tile_factor = 1.0 / tile_factor;
1452+
tmp_x = (W / (decode ? 1 : 8)) * tile_factor;
1453+
}
1454+
}
1455+
else {
1456+
tmp_x = std::stoi(tile_x_str);
1457+
}
1458+
if (tile_y_str.find('.') != std::string::npos) {
1459+
float tile_factor = std::stof(tile_y_str);
1460+
if (tile_factor > 0.0) {
1461+
if (tile_factor > 1.0)
1462+
tile_factor = 1.0 / tile_factor;
1463+
tmp_y = (H / (decode ? 1 : 8)) * tile_factor;
1464+
}
1465+
}
1466+
else {
1467+
tmp_y = std::stoi(tile_y_str);
1468+
}
1469+
}
1470+
else {
1471+
if (sd_tile_size_str.find('.') != std::string::npos) {
1472+
float tile_factor = std::stof(sd_tile_size_str);
1473+
if (tile_factor > 0) {
1474+
if (tile_factor > 1.0)
1475+
tile_factor = 1.0 / tile_factor;
1476+
tmp_x = (W / (decode ? 1 : 8)) * tile_factor;
1477+
tmp_y = (H / (decode ? 1 : 8)) * tile_factor;
1478+
}
1479+
}
1480+
else {
1481+
tmp_x = tmp_y = std::stoi(sd_tile_size_str);
1482+
}
1483+
}
1484+
tile_size_x = tmp_x;
1485+
tile_size_y = tmp_y;
14371486
} catch (const std::invalid_argument&) {
14381487
LOG_WARN("Invalid");
14391488
} catch (const std::out_of_range&) {
@@ -1443,7 +1492,8 @@ class StableDiffusionGGML {
14431492
if(!decode){
14441493
// TODO: also use an arg for this one?
14451494
// to keep the compute buffer size consistent
1446-
tile_size*=1.30539;
1495+
tile_size_x*=1.30539;
1496+
tile_size_y*=1.30539;
14471497
}
14481498
if (!use_tiny_autoencoder) {
14491499
if (decode) {
@@ -1452,11 +1502,14 @@ class StableDiffusionGGML {
14521502
ggml_tensor_scale_input(x);
14531503
}
14541504
if (vae_tiling) {
1505+
if (SD_TILE_SIZE != nullptr) {
1506+
LOG_INFO("VAE Tile size: %dx%d", tile_size_x, tile_size_y);
1507+
}
14551508
// split x into tiles of tile_size_x by tile_size_y and compute in several steps
14561509
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
14571510
first_stage_model->compute(n_threads, in, decode, &out);
14581511
};
1459-
sd_tiling(x, result, 8, tile_size, 0.5f, on_tiling);
1512+
sd_tiling_non_square(x, result, 8, tile_size_x, tile_size_y, 0.5f, on_tiling);
14601513
} else {
14611514
first_stage_model->compute(n_threads, x, decode, &result);
14621515
}

0 commit comments

Comments
 (0)