@@ -482,7 +482,10 @@ __STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
482482 struct ggml_tensor * output,
483483 int x,
484484 int y,
485- int overlap) {
485+ int overlap_x,
486+ int overlap_y,
487+ int x_skip = 0 ,
488+ int y_skip = 0 ) {
486489 int64_t width = input->ne [0 ];
487490 int64_t height = input->ne [1 ];
488491 int64_t channels = input->ne [2 ];
@@ -491,17 +494,17 @@ __STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
491494 int64_t img_height = output->ne [1 ];
492495
493496 GGML_ASSERT (input->type == GGML_TYPE_F32 && output->type == GGML_TYPE_F32);
494- for (int iy = 0 ; iy < height; iy++) {
495- for (int ix = 0 ; ix < width; ix++) {
497+ for (int iy = y_skip ; iy < height; iy++) {
498+ for (int ix = x_skip ; ix < width; ix++) {
496499 for (int k = 0 ; k < channels; k++) {
497500 float new_value = ggml_tensor_get_f32 (input, ix, iy, k);
498- if (overlap > 0 ) { // blend colors in overlapped area
501+ if (overlap_x > 0 || overlap_y > 0 ) { // blend colors in overlapped area
499502 float old_value = ggml_tensor_get_f32 (output, x + ix, y + iy, k);
500503
501- const float x_f_0 = (x > 0 ) ? ix / float (overlap ) : 1 ;
502- const float x_f_1 = (x < (img_width - width)) ? (width - ix) / float (overlap ) : 1 ;
503- const float y_f_0 = (y > 0 ) ? iy / float (overlap ) : 1 ;
504- const float y_f_1 = (y < (img_height - height)) ? (height - iy) / float (overlap ) : 1 ;
504+ const float x_f_0 = (overlap_x > 0 && x > 0 ) ? ( ix - x_skip) / float (overlap_x ) : 1 ;
505+ const float x_f_1 = (overlap_x > 0 && x < (img_width - width)) ? (width - ix) / float (overlap_x ) : 1 ;
506+ const float y_f_0 = (overlap_y > 0 && y > 0 ) ? ( iy - y_skip) / float (overlap_y ) : 1 ;
507+ const float y_f_1 = (overlap_y > 0 && y < (img_height - height)) ? (height - iy) / float (overlap_y ) : 1 ;
505508
506509 const float x_f = std::min (std::min (x_f_0, x_f_1), 1 .f );
507510 const float y_f = std::min (std::min (y_f_0, y_f_1), 1 .f );
@@ -733,22 +736,96 @@ __STATIC_INLINE__ std::vector<struct ggml_tensor*> ggml_chunk(struct ggml_contex
733736
734737typedef std::function<void (ggml_tensor*, ggml_tensor*, bool )> on_tile_process;
735738
/**
 * Compute, for a single dimension, how many tiles to use and which overlap factor makes
 * that many tiles of `tile_size` span `small_dim` exactly.
 *
 * @param num_tiles_dim            [out] number of tiles along this dimension (always >= 1).
 * @param tile_overlap_factor_dim  [out] overlap between neighbouring tiles as a fraction of
 *                                 `tile_size`; 0 when a single tile is enough.
 * @param small_dim                extent of the dimension being tiled.
 * @param tile_size                edge length of one tile along this dimension.
 * @param tile_overlap_factor      requested overlap as a fraction of `tile_size`; the
 *                                 returned factor is derived from it but usually differs.
 *
 * Degenerate inputs (`tile_size <= 0`, `small_dim <= tile_size`) yield one tile with no
 * overlap instead of dividing by zero. A requested overlap that would consume the whole
 * tile (factor >= 1) is clamped so that consecutive tiles still advance by one element.
 */
__STATIC_INLINE__ void sd_tiling_calc_tiles(int& num_tiles_dim,
                                            float& tile_overlap_factor_dim,
                                            int small_dim,
                                            int tile_size,
                                            const float tile_overlap_factor) {
    // One tile covers everything (or the inputs are degenerate): nothing to compute.
    // Returning here also keeps `small_dim == 0` away from the modulo below.
    if (tile_size <= 0 || small_dim <= tile_size) {
        num_tiles_dim           = 1;
        tile_overlap_factor_dim = 0.0f;
        return;
    }

    int tile_overlap = static_cast<int>(tile_size * tile_overlap_factor);
    if (tile_overlap >= tile_size) {
        // The stride between tiles must stay positive, otherwise the division below
        // divides by zero (or by a negative stride).
        tile_overlap = tile_size - 1;
    }
    const int non_tile_overlap = tile_size - tile_overlap;

    int       num_tiles = (small_dim - tile_overlap) / non_tile_overlap;
    const int overshoot = ((num_tiles + 1) * non_tile_overlap + tile_overlap) % small_dim;

    if (overshoot != non_tile_overlap && overshoot <= num_tiles * (tile_size / 2 - tile_overlap)) {
        // if tiles don't fit perfectly using the desired overlap
        // and there is enough room to squeeze an extra tile without overlap becoming >0.5
        ++num_tiles;
    }

    if (num_tiles <= 2) {
        // small_dim > tile_size here, so at least two tiles are required; with exactly two
        // tiles the overlap is whatever is left over from 2 * tile_size.
        num_tiles_dim           = 2;
        tile_overlap_factor_dim = (2 * tile_size - small_dim) / static_cast<float>(tile_size);
        return;
    }

    // Solve n * tile_size - (n - 1) * overlap == small_dim for overlap / tile_size.
    // n >= 3 on this path, so the denominator cannot be zero.
    num_tiles_dim           = num_tiles;
    tile_overlap_factor_dim = static_cast<float>(tile_size * num_tiles - small_dim) /
                              static_cast<float>(tile_size * (num_tiles - 1));
}
765+
736766// Tiling
737- __STATIC_INLINE__ void sd_tiling (ggml_tensor* input, ggml_tensor* output, const int scale, const int tile_size, const float tile_overlap_factor, on_tile_process on_processing) {
767+ __STATIC_INLINE__ void sd_tiling_non_square (ggml_tensor* input, ggml_tensor* output, const int scale,
768+ const int p_tile_size_x, const int p_tile_size_y,
769+ const float tile_overlap_factor, on_tile_process on_processing) {
770+
738771 output = ggml_set_f32 (output, 0 );
739772
740773 int input_width = (int )input->ne [0 ];
741774 int input_height = (int )input->ne [1 ];
742775 int output_width = (int )output->ne [0 ];
743776 int output_height = (int )output->ne [1 ];
777+
778+ GGML_ASSERT (input_width / output_width == input_height / output_height && output_width / input_width == output_height / input_height);
779+ GGML_ASSERT (input_width / output_width == scale || output_width / input_width == scale);
780+
781+ int small_width = output_width;
782+ int small_height = output_height;
783+
784+ bool big_out = output_width > input_width;
785+ if (big_out) {
786+ // Ex: decode
787+ small_width = input_width;
788+ small_height = input_height;
789+ }
790+
791+ int num_tiles_x;
792+ float tile_overlap_factor_x;
793+ sd_tiling_calc_tiles (num_tiles_x, tile_overlap_factor_x, small_width, p_tile_size_x, tile_overlap_factor);
794+
795+ int num_tiles_y;
796+ float tile_overlap_factor_y;
797+ sd_tiling_calc_tiles (num_tiles_y, tile_overlap_factor_y, small_height, p_tile_size_y, tile_overlap_factor);
798+
799+ LOG_DEBUG (" num tiles : %d, %d " , num_tiles_x, num_tiles_y);
800+ LOG_DEBUG (" optimal overlap : %f, %f (targeting %f)" , tile_overlap_factor_x, tile_overlap_factor_y, tile_overlap_factor);
801+
744802 GGML_ASSERT (input_width % 2 == 0 && input_height % 2 == 0 && output_width % 2 == 0 && output_height % 2 == 0 ); // should be multiple of 2
745803
746- int tile_overlap = (int32_t )(tile_size * tile_overlap_factor);
747- int non_tile_overlap = tile_size - tile_overlap;
804+ int tile_overlap_x = (int32_t )(p_tile_size_x * tile_overlap_factor_x);
805+ int non_tile_overlap_x = p_tile_size_x - tile_overlap_x;
806+
807+ int tile_overlap_y = (int32_t )(p_tile_size_y * tile_overlap_factor_y);
808+ int non_tile_overlap_y = p_tile_size_y - tile_overlap_y;
809+
810+ int tile_size_x = p_tile_size_x < small_width ? p_tile_size_x : small_width;
811+ int tile_size_y = p_tile_size_y < small_height ? p_tile_size_y : small_height;
812+
813+ int input_tile_size_x = tile_size_x;
814+ int input_tile_size_y = tile_size_y;
815+ int output_tile_size_x = tile_size_x;
816+ int output_tile_size_y = tile_size_y;
817+
818+ if (big_out) {
819+ output_tile_size_x *= scale;
820+ output_tile_size_y *= scale;
821+ } else {
822+ input_tile_size_x *= scale;
823+ input_tile_size_y *= scale;
824+ }
748825
749826 struct ggml_init_params params = {};
750- params.mem_size += tile_size * tile_size * input->ne [2 ] * sizeof (float ); // input chunk
751- params.mem_size += (tile_size * scale) * (tile_size * scale) * output->ne [2 ] * sizeof (float ); // output chunk
827+ params.mem_size += input_tile_size_x * input_tile_size_y * input->ne [2 ] * sizeof (float ); // input chunk
828+ params.mem_size += output_tile_size_x * output_tile_size_y * output->ne [2 ] * sizeof (float ); // output chunk
752829 params.mem_size += 3 * ggml_tensor_overhead ();
753830 params.mem_buffer = NULL ;
754831 params.no_alloc = false ;
@@ -763,29 +840,50 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const
763840 }
764841
765842 // tiling
766- ggml_tensor* input_tile = ggml_new_tensor_4d (tiles_ctx, GGML_TYPE_F32, tile_size, tile_size, input->ne [2 ], 1 );
767- ggml_tensor* output_tile = ggml_new_tensor_4d (tiles_ctx, GGML_TYPE_F32, tile_size * scale, tile_size * scale, output->ne [2 ], 1 );
768- on_processing (input_tile, NULL , true );
769- int num_tiles = ceil ((float )input_width / non_tile_overlap) * ceil ((float )input_height / non_tile_overlap);
843+ ggml_tensor* input_tile = ggml_new_tensor_4d (tiles_ctx, GGML_TYPE_F32, input_tile_size_x, input_tile_size_y, input->ne [2 ], 1 );
844+ ggml_tensor* output_tile = ggml_new_tensor_4d (tiles_ctx, GGML_TYPE_F32, output_tile_size_x, output_tile_size_y, output->ne [2 ], 1 );
845+ int num_tiles = num_tiles_x * num_tiles_y;
770846 LOG_INFO (" processing %i tiles" , num_tiles);
771- pretty_progress (1 , num_tiles, 0 .0f );
847+ pretty_progress (0 , num_tiles, 0 .0f );
772848 int tile_count = 1 ;
773849 bool last_y = false , last_x = false ;
774850 float last_time = 0 .0f ;
775- for (int y = 0 ; y < input_height && !last_y; y += non_tile_overlap) {
776- if (y + tile_size >= input_height) {
777- y = input_height - tile_size;
851+ for (int y = 0 ; y < small_height && !last_y; y += non_tile_overlap_y) {
852+ int dy = 0 ;
853+ if (y + tile_size_y >= small_height) {
854+ int _y = y;
855+ y = small_height - tile_size_y;
856+ dy = _y - y;
857+ if (big_out) {
858+ dy *= scale;
859+ }
778860 last_y = true ;
779861 }
780- for (int x = 0 ; x < input_width && !last_x; x += non_tile_overlap) {
781- if (x + tile_size >= input_width) {
782- x = input_width - tile_size;
862+ for (int x = 0 ; x < small_width && !last_x; x += non_tile_overlap_x) {
863+ int dx = 0 ;
864+ if (x + tile_size_x >= small_width) {
865+ int _x = x;
866+ x = small_width - tile_size_x;
867+ dx = _x - x;
868+ if (big_out) {
869+ dx *= scale;
870+ }
783871 last_x = true ;
784872 }
873+
874+ int x_in = big_out ? x : scale * x;
875+ int y_in = big_out ? y : scale * y;
876+ int x_out = big_out ? x * scale : x;
877+ int y_out = big_out ? y * scale : y;
878+
879+ int overlap_x_out = big_out ? tile_overlap_x * scale : tile_overlap_x;
880+ int overlap_y_out = big_out ? tile_overlap_y * scale : tile_overlap_y;
881+
785882 int64_t t1 = ggml_time_ms ();
786- ggml_split_tensor_2d (input, input_tile, x, y );
883+ ggml_split_tensor_2d (input, input_tile, x_in, y_in );
787884 on_processing (input_tile, output_tile, false );
788- ggml_merge_tensor_2d (output_tile, output, x * scale, y * scale, tile_overlap * scale);
885+ ggml_merge_tensor_2d (output_tile, output, x_out, y_out, overlap_x_out, overlap_y_out, dx, dy);
886+
789887 int64_t t2 = ggml_time_ms ();
790888 last_time = (t2 - t1) / 1000 .0f ;
791889 pretty_progress (tile_count, num_tiles, last_time);
@@ -799,6 +897,11 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const
799897 ggml_free (tiles_ctx);
800898}
801899
900+ __STATIC_INLINE__ void sd_tiling (ggml_tensor* input, ggml_tensor* output, const int scale,
901+ const int tile_size, const float tile_overlap_factor, on_tile_process on_processing) {
902+ sd_tiling_non_square (input, output, scale, tile_size, tile_size, tile_overlap_factor, on_processing);
903+ }
904+
802905__STATIC_INLINE__ struct ggml_tensor * ggml_group_norm_32 (struct ggml_context * ctx,
803906 struct ggml_tensor * a) {
804907 const float eps = 1e-6f ; // default eps parameter
0 commit comments