@@ -574,9 +574,18 @@ void submit_sliding_window1d(const PaddedSpan<const T, SizeT> &a,
574574 }
575575
576576 auto *const out_ptr = out.begin ();
577- auto *const out_end = out.end ();
578- results.store (&out_ptr[glid],
579- [out_end](auto &&ptr) { return ptr < out_end; });
577+ // auto *const out_end = out.end();
578+
579+ auto y_start = glid;
580+ auto y_stop = std::min (y_start + WorkPI*results.size_x (), out.size ());
581+ int32_t i = 0 ;
582+ for (uint32_t y = y_start; y < y_stop; y+=results.size_x ())
583+ {
584+ out_ptr[y] = results[i++];
585+ }
586+ // NOTE: the results.store() call below is disabled: under excessive optimizations it results in memory corruption
587+ // results.store(&out_ptr[glid],
588+ // [out_end](auto &&ptr) { return ptr < out_end; });
580589 });
581590}
582591
@@ -635,9 +644,18 @@ void submit_sliding_window1d_small_kernel(const PaddedSpan<const T, SizeT> &a,
635644 red);
636645
637646 auto *const out_ptr = out.begin ();
638- auto *const out_end = out.end ();
639- results.store (&out_ptr[glid],
640- [out_end](auto &&ptr) { return ptr < out_end; });
647+ // auto *const out_end = out.end();
648+
649+ auto y_start = glid;
650+ auto y_stop = std::min (y_start + WorkPI*results.size_x (), out.size ());
651+ int32_t i = 0 ;
652+ for (uint32_t y = y_start; y < y_stop; y+=results.size_x ())
653+ {
654+ out_ptr[y] = results[i++];
655+ }
656+ // NOTE: the results.store() call below is disabled: under excessive optimizations it results in memory corruption
657+ // results.store(&out_ptr[glid],
658+ // [out_end](auto &&ptr) { return ptr < out_end; });
641659 });
642660}
643661
0 commit comments