From 47407e967f9357d421fe124b8bdae8388b891c70 Mon Sep 17 00:00:00 2001
From: Steve <61917452+awildergoose@users.noreply.github.com>
Date: Sun, 19 Oct 2025 17:28:43 +0000
Subject: [PATCH 1/6] vk: move parallel aggregator to cpu-side; cmake: build
 with -O3

---
 CMakeLists.txt                 |   6 ++
 rpcs3/Emu/RSX/VK/VKCompute.cpp | 181 +++++++++++++++++++++++++++++++--
 2 files changed, 179 insertions(+), 8 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0492d13184e8..91cd7c9b15b3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -132,6 +132,12 @@ if(MSVC)
     add_compile_options(/MP)
 endif()
 
+if (CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
+  add_compile_options(-msse4.1 -mavx2 -mfma -march=native -O3)
+elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
+  add_compile_options(/arch:AVX2)
+endif()
+
 if(NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
     message( FATAL_ERROR "RPCS3 can only be compiled on 64-bit platforms." )
 endif()
diff --git a/rpcs3/Emu/RSX/VK/VKCompute.cpp b/rpcs3/Emu/RSX/VK/VKCompute.cpp
index 9e9e2a474640..e1649ef0cc72 100644
--- a/rpcs3/Emu/RSX/VK/VKCompute.cpp
+++ b/rpcs3/Emu/RSX/VK/VKCompute.cpp
@@ -3,11 +3,122 @@
 #include "VKRenderPass.h"
 #include "vkutils/buffer_object.h"
 #include "VKPipelineCompiler.h"
+#include <thread>
+#include <vector>
+#include <atomic>
+#include <cstdint>
+#include <cstring>
+#include <algorithm>
+#include <stdexcept>
+#include <cmath>
 
 #define VK_MAX_COMPUTE_TASKS 8192   // Max number of jobs per frame
 
 namespace vk
 {
+namespace cpu_compute {
+
+template <typename Worker>
+inline void parallel_for(size_t n, size_t numThreads, Worker&& worker)
+{
+	if (n == 0) return;
+	numThreads = std::max<size_t>(1, std::min(numThreads, n));
+	std::vector<std::thread> threads;
+	threads.reserve(numThreads);
+	size_t chunk = n / numThreads;
+	size_t rem = n % numThreads;
+	size_t start = 0;
+	for (size_t t = 0; t < numThreads; ++t)
+	{
+		size_t sz = chunk + (t < rem ? 1 : 0);
+		size_t s = start;
+		size_t e = s + sz;
+		threads.emplace_back([s,e, &worker](){ for (size_t i = s; i < e; ++i) worker(i); });
+		start = e;
+	}
+	for (auto &th : threads) th.join();
+}
+
+inline void scatter_d24x8_cpu(uint32_t* base_words,
+                              size_t total_words,
+                              size_t block_length_words,
+                              size_t z_offset_words,
+                              size_t s_offset_words,
+                              size_t numThreads = std::thread::hardware_concurrency())
+{
+    if (!base_words) throw std::runtime_error("scatter_d24x8_cpu: null buffer");
+    if (block_length_words == 0) return;
+
+    if ((block_length_words + z_offset_words) > total_words) {
+        throw std::runtime_error("scatter_d24x8_cpu: z-offset out of range");
+    }
+
+    size_t stencil_word_count = (block_length_words + 3) / 4;
+    if ((stencil_word_count + s_offset_words) > total_words) {
+        throw std::runtime_error("scatter_d24x8_cpu: stencil offset out of range");
+    }
+
+    cpu_compute::parallel_for(block_length_words, numThreads, [&](size_t index) {
+        uint32_t value = base_words[index];
+        base_words[index + z_offset_words] = (value >> 8);
+
+        size_t stencil_offset = index >> 2;
+        uint32_t stencil_shift = static_cast<uint32_t>((index & 3) << 3);
+        uint32_t stencil_mask = (value & 0xFFu) << stencil_shift;
+
+        uint32_t* stencil_word_ptr = base_words + (stencil_offset + s_offset_words);
+
+#if defined(__cpp_lib_atomic_ref)
+        std::atomic_ref<uint32_t> ref(*stencil_word_ptr);
+        ref.fetch_or(stencil_mask, std::memory_order_relaxed);
+#else
+        std::atomic<uint32_t>* atomic_ptr = reinterpret_cast<std::atomic<uint32_t>*>(stencil_word_ptr);
+        uint32_t oldv = atomic_ptr->load(std::memory_order_relaxed);
+        uint32_t newv;
+        do {
+            newv = oldv | stencil_mask;
+        } while (!atomic_ptr->compare_exchange_weak(oldv, newv, std::memory_order_relaxed));
+#endif
+    });
+}
+
+// parallel sum for u32 source; returns 64-bit sum
+inline uint64_t parallel_sum_u32(const uint32_t* src, size_t count, size_t numThreads = std::thread::hardware_concurrency())
+{
+    if (count == 0) return 0;
+    numThreads = std::max<size_t>(1, std::min(numThreads, count));
+    std::vector<uint64_t> partials(numThreads, 0);
+
+    size_t chunk = count / numThreads;
+    size_t rem = count % numThreads;
+    size_t start = 0;
+    std::vector<std::thread> threads;
+    threads.reserve(numThreads);
+
+	for (size_t t = 0; t < numThreads; ++t)
+	{
+		size_t sz = chunk + (t < rem ? 1 : 0);
+		size_t s = start;
+		size_t e = s + sz;
+		threads.emplace_back([=, &partials]() {
+			uint64_t acc = 0;
+			const uint32_t* p = src + s;
+			size_t len = e - s;
+			// plain loop — compiler will usually vectorize
+			for (size_t i = 0; i < len; ++i) acc += p[i];
+			partials[t] = acc;
+		});
+		start = e;
+	}
+    for (auto &th : threads) th.join();
+
+    uint64_t total = 0;
+    for (auto v : partials) total += v;
+    return total;
+}
+
+} // namespace cpu_compute
+
 	std::vector<glsl::program_input> compute_task::get_inputs()
 	{
 		std::vector<glsl::program_input> result;
@@ -68,7 +179,7 @@ namespace vk
 				// Warps are multiples of 32. Increasing kernel depth seems to hurt performance (Nier, Big Duck sample)
 				unroll_loops = true;
 				optimal_kernel_size = 1;
-				optimal_group_size = 32;
+				optimal_group_size = 128; //32;
 				break;
 			case vk::driver_vendor::AMD:
 			case vk::driver_vendor::RADV:
@@ -299,13 +410,51 @@ namespace vk
 		m_program->bind_uniform({ *m_data, m_data_offset, m_ssbo_length }, 0, 0);
 	}
 
-	void cs_interleave_task::run(const vk::command_buffer& cmd, const vk::buffer* data, u32 data_offset, u32 data_length, u32 zeta_offset, u32 stencil_offset)
+	void cs_interleave_task::run(const vk::command_buffer& /*cmd*/,
+								const vk::buffer* data,
+								u32 data_offset,
+								u32 data_length,
+								u32 zeta_offset,
+								u32 stencil_offset)
 	{
-		m_params = { data_length, zeta_offset - data_offset, stencil_offset - data_offset, 0 };
+		size_t hw = std::max<size_t>(1, std::thread::hardware_concurrency());
+		size_t numThreads = std::min<size_t>(hw, 8);
 
+		m_params = { data_length, zeta_offset - data_offset, stencil_offset - data_offset, 0 };
 		ensure(stencil_offset > data_offset);
-		m_ssbo_length = stencil_offset + (data_length / 4) - data_offset;
-		cs_shuffle_base::run(cmd, data, data_length, data_offset);
+
+		void* mapped = const_cast<vk::buffer*>(data)->map(data_offset, data_length);
+		if (!mapped) throw std::runtime_error("cs_interleave_task::run: failed to map buffer");
+
+		uint8_t* base_bytes = reinterpret_cast<uint8_t*>(mapped);
+		uint8_t* region_start = base_bytes + data_offset;
+		uint32_t* words = reinterpret_cast<uint32_t*>(region_start);
+
+		const size_t region_total_words = (data->size() - data_offset) / 4;
+		const size_t block_length_words = data_length / 4;
+		const size_t z_offset_words = (zeta_offset - data_offset) / 4;
+		const size_t s_offset_words = (stencil_offset - data_offset) / 4;
+
+		if ((block_length_words + z_offset_words) > region_total_words) {
+			const_cast<vk::buffer*>(data)->unmap();
+			throw std::runtime_error("cs_interleave_task::run: z_offset out of bounds");
+		}
+		size_t stencil_word_count = (block_length_words + 3) / 4;
+		if ((stencil_word_count + s_offset_words) > region_total_words) {
+			const_cast<vk::buffer*>(data)->unmap();
+			throw std::runtime_error("cs_interleave_task::run: stencil offset out of bounds");
+		}
+
+		cpu_compute::scatter_d24x8_cpu(
+			/*base_words=*/ words,
+			/*total_words=*/ region_total_words,
+			/*block_length_words=*/ block_length_words,
+			/*z_offset_words=*/ z_offset_words,
+			/*s_offset_words=*/ s_offset_words,
+			/*numThreads=*/ numThreads
+		);
+
+		const_cast<vk::buffer*>(data)->unmap();
 	}
 
 	cs_scatter_d24x8::cs_scatter_d24x8()
@@ -359,14 +508,30 @@ namespace vk
 		m_program->bind_uniform({ *dst, 0, 4 }, 0, 1);
 	}
 
-	void cs_aggregator::run(const vk::command_buffer& cmd, const vk::buffer* dst, const vk::buffer* src, u32 num_words)
+	void cs_aggregator::run(const vk::command_buffer& /*cmd*/, const vk::buffer* dst, const vk::buffer* src, u32 num_words)
 	{
 		this->dst = dst;
 		this->src = src;
 		word_count = num_words;
 		block_length = num_words * 4;
 
-		const u32 linear_invocations = utils::aligned_div(word_count, optimal_group_size);
-		compute_task::run(cmd, linear_invocations);
+		size_t hw = std::max<size_t>(1, std::thread::hardware_concurrency());
+		size_t numThreads = std::min<size_t>(hw, 8);
+
+		void* src_map = const_cast<vk::buffer*>(src)->map(0, num_words * 4);
+		if (!src_map) throw std::runtime_error("cs_aggregator::run: failed to map src buffer");
+
+		const uint32_t* src_words = reinterpret_cast<const uint32_t*>(src_map);
+		uint64_t sum = cpu_compute::parallel_sum_u32(src_words, static_cast<size_t>(num_words), numThreads);
+
+		const_cast<vk::buffer*>(src)->unmap();
+
+		void* dst_map = const_cast<vk::buffer*>(dst)->map(0, 4);
+		if (!dst_map) throw std::runtime_error("cs_aggregator::run: failed to map dst buffer");
+
+		uint32_t result32 = static_cast<uint32_t>(sum);
+		std::memcpy(dst_map, &result32, sizeof(result32));
+
+		const_cast<vk::buffer*>(dst)->unmap();
 	}
 }

From fcc295ffac47ed12d3575700b25b6257bde7b180 Mon Sep 17 00:00:00 2001
From: Steve <61917452+awildergoose@users.noreply.github.com>
Date: Sun, 19 Oct 2025 17:46:03 +0000
Subject: [PATCH 2/6] chore: don't throw!

---
 rpcs3/Emu/RSX/VK/VKCompute.cpp | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/rpcs3/Emu/RSX/VK/VKCompute.cpp b/rpcs3/Emu/RSX/VK/VKCompute.cpp
index e1649ef0cc72..913874060f2b 100644
--- a/rpcs3/Emu/RSX/VK/VKCompute.cpp
+++ b/rpcs3/Emu/RSX/VK/VKCompute.cpp
@@ -46,16 +46,21 @@ inline void scatter_d24x8_cpu(uint32_t* base_words,
                               size_t s_offset_words,
                               size_t numThreads = std::thread::hardware_concurrency())
 {
-    if (!base_words) throw std::runtime_error("scatter_d24x8_cpu: null buffer");
+    if (!base_words) {
+		rsx_log.error("scatter_d24x8_cpu: null buffer");
+		return;
+	}
     if (block_length_words == 0) return;
 
     if ((block_length_words + z_offset_words) > total_words) {
-        throw std::runtime_error("scatter_d24x8_cpu: z-offset out of range");
+        rsx_log.error("scatter_d24x8_cpu: z-offset out of range");
+		return;
     }
 
     size_t stencil_word_count = (block_length_words + 3) / 4;
     if ((stencil_word_count + s_offset_words) > total_words) {
-        throw std::runtime_error("scatter_d24x8_cpu: stencil offset out of range");
+        rsx_log.error("scatter_d24x8_cpu: stencil offset out of range");
+		return;
     }
 
     cpu_compute::parallel_for(block_length_words, numThreads, [&](size_t index) {
@@ -424,7 +429,10 @@ inline uint64_t parallel_sum_u32(const uint32_t* src, size_t count, size_t numTh
 		ensure(stencil_offset > data_offset);
 
 		void* mapped = const_cast<vk::buffer*>(data)->map(data_offset, data_length);
-		if (!mapped) throw std::runtime_error("cs_interleave_task::run: failed to map buffer");
+		if (!mapped) {
+			rsx_log.error("cs_interleave_task::run: failed to map buffer");
+			return;
+		}
 
 		uint8_t* base_bytes = reinterpret_cast<uint8_t*>(mapped);
 		uint8_t* region_start = base_bytes + data_offset;
@@ -437,12 +445,14 @@ inline uint64_t parallel_sum_u32(const uint32_t* src, size_t count, size_t numTh
 
 		if ((block_length_words + z_offset_words) > region_total_words) {
 			const_cast<vk::buffer*>(data)->unmap();
-			throw std::runtime_error("cs_interleave_task::run: z_offset out of bounds");
+			rsx_log.error("cs_interleave_task::run: z_offset out of bounds");
+			return;
 		}
 		size_t stencil_word_count = (block_length_words + 3) / 4;
 		if ((stencil_word_count + s_offset_words) > region_total_words) {
 			const_cast<vk::buffer*>(data)->unmap();
-			throw std::runtime_error("cs_interleave_task::run: stencil offset out of bounds");
+			rsx_log.error("cs_interleave_task::run: stencil offset out of bounds");
+			return;
 		}
 
 		cpu_compute::scatter_d24x8_cpu(
@@ -519,7 +529,10 @@ inline uint64_t parallel_sum_u32(const uint32_t* src, size_t count, size_t numTh
 		size_t numThreads = std::min<size_t>(hw, 8);
 
 		void* src_map = const_cast<vk::buffer*>(src)->map(0, num_words * 4);
-		if (!src_map) throw std::runtime_error("cs_aggregator::run: failed to map src buffer");
+		if (!src_map) {
+			rsx_log.error("cs_aggregator::run: failed to map src buffer");
+			return;
+		}
 
 		const uint32_t* src_words = reinterpret_cast<const uint32_t*>(src_map);
 		uint64_t sum = cpu_compute::parallel_sum_u32(src_words, static_cast<size_t>(num_words), numThreads);
@@ -527,7 +540,10 @@ inline uint64_t parallel_sum_u32(const uint32_t* src, size_t count, size_t numTh
 		const_cast<vk::buffer*>(src)->unmap();
 
 		void* dst_map = const_cast<vk::buffer*>(dst)->map(0, 4);
-		if (!dst_map) throw std::runtime_error("cs_aggregator::run: failed to map dst buffer");
+		if (!dst_map) {
+			rsx_log.error("cs_aggregator::run: failed to map dst buffer");
+			return;
+		}
 
 		uint32_t result32 = static_cast<uint32_t>(sum);
 		std::memcpy(dst_map, &result32, sizeof(result32));

From 68644efb57a8bc5b3e95f2baa489905b81498aa0 Mon Sep 17 00:00:00 2001
From: Steve <61917452+awildergoose@users.noreply.github.com>
Date: Sun, 19 Oct 2025 18:48:50 +0000
Subject: [PATCH 3/6] vk: revert gpu/cpu compute changes; cmake: don't use sse
 and avx and other flags on arm64/aarch64

---
 CMakeLists.txt                 |   8 +-
 rpcs3/Emu/RSX/VK/VKCompute.cpp | 199 ++-------------------------------
 2 files changed, 15 insertions(+), 192 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 91cd7c9b15b3..1c5fcf763c83 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -133,9 +133,13 @@ if(MSVC)
 endif()
 
 if (CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
-  add_compile_options(-msse4.1 -mavx2 -mfma -march=native -O3)
+  if (NOT CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64") # x86-only ISA flags; Windows hosts report "ARM64" in upper case
+    add_compile_options(-msse4.1 -mavx2 -mfma -march=native "$<$<NOT:$<CONFIG:Debug>>:-O3>") # NOTE(review): -march=native targets the build host's CPU; confirm this is wanted for distributed builds
+  else()
+    add_compile_options("$<$<NOT:$<CONFIG:Debug>>:-O3>") # leave Debug at the optimisation level its own flags select
+  endif()
 elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
-  add_compile_options(/arch:AVX2)
+  add_compile_options(/arch:AVX2 "$<$<NOT:$<CONFIG:Debug>>:/O2>") # /O2 cannot be combined with the /RTC1 that the default Debug flags carry
 endif()
 
 if(NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
diff --git a/rpcs3/Emu/RSX/VK/VKCompute.cpp b/rpcs3/Emu/RSX/VK/VKCompute.cpp
index 913874060f2b..ac056cddb962 100644
--- a/rpcs3/Emu/RSX/VK/VKCompute.cpp
+++ b/rpcs3/Emu/RSX/VK/VKCompute.cpp
@@ -3,127 +3,11 @@
 #include "VKRenderPass.h"
 #include "vkutils/buffer_object.h"
 #include "VKPipelineCompiler.h"
-#include <thread>
-#include <vector>
-#include <atomic>
-#include <cstdint>
-#include <cstring>
-#include <algorithm>
-#include <stdexcept>
-#include <cmath>
 
 #define VK_MAX_COMPUTE_TASKS 8192   // Max number of jobs per frame
 
 namespace vk
 {
-namespace cpu_compute {
-
-template <typename Worker>
-inline void parallel_for(size_t n, size_t numThreads, Worker&& worker)
-{
-	if (n == 0) return;
-	numThreads = std::max<size_t>(1, std::min(numThreads, n));
-	std::vector<std::thread> threads;
-	threads.reserve(numThreads);
-	size_t chunk = n / numThreads;
-	size_t rem = n % numThreads;
-	size_t start = 0;
-	for (size_t t = 0; t < numThreads; ++t)
-	{
-		size_t sz = chunk + (t < rem ? 1 : 0);
-		size_t s = start;
-		size_t e = s + sz;
-		threads.emplace_back([s,e, &worker](){ for (size_t i = s; i < e; ++i) worker(i); });
-		start = e;
-	}
-	for (auto &th : threads) th.join();
-}
-
-inline void scatter_d24x8_cpu(uint32_t* base_words,
-                              size_t total_words,
-                              size_t block_length_words,
-                              size_t z_offset_words,
-                              size_t s_offset_words,
-                              size_t numThreads = std::thread::hardware_concurrency())
-{
-    if (!base_words) {
-		rsx_log.error("scatter_d24x8_cpu: null buffer");
-		return;
-	}
-    if (block_length_words == 0) return;
-
-    if ((block_length_words + z_offset_words) > total_words) {
-        rsx_log.error("scatter_d24x8_cpu: z-offset out of range");
-		return;
-    }
-
-    size_t stencil_word_count = (block_length_words + 3) / 4;
-    if ((stencil_word_count + s_offset_words) > total_words) {
-        rsx_log.error("scatter_d24x8_cpu: stencil offset out of range");
-		return;
-    }
-
-    cpu_compute::parallel_for(block_length_words, numThreads, [&](size_t index) {
-        uint32_t value = base_words[index];
-        base_words[index + z_offset_words] = (value >> 8);
-
-        size_t stencil_offset = index >> 2;
-        uint32_t stencil_shift = static_cast<uint32_t>((index & 3) << 3);
-        uint32_t stencil_mask = (value & 0xFFu) << stencil_shift;
-
-        uint32_t* stencil_word_ptr = base_words + (stencil_offset + s_offset_words);
-
-#if defined(__cpp_lib_atomic_ref)
-        std::atomic_ref<uint32_t> ref(*stencil_word_ptr);
-        ref.fetch_or(stencil_mask, std::memory_order_relaxed);
-#else
-        std::atomic<uint32_t>* atomic_ptr = reinterpret_cast<std::atomic<uint32_t>*>(stencil_word_ptr);
-        uint32_t oldv = atomic_ptr->load(std::memory_order_relaxed);
-        uint32_t newv;
-        do {
-            newv = oldv | stencil_mask;
-        } while (!atomic_ptr->compare_exchange_weak(oldv, newv, std::memory_order_relaxed));
-#endif
-    });
-}
-
-// parallel sum for u32 source; returns 64-bit sum
-inline uint64_t parallel_sum_u32(const uint32_t* src, size_t count, size_t numThreads = std::thread::hardware_concurrency())
-{
-    if (count == 0) return 0;
-    numThreads = std::max<size_t>(1, std::min(numThreads, count));
-    std::vector<uint64_t> partials(numThreads, 0);
-
-    size_t chunk = count / numThreads;
-    size_t rem = count % numThreads;
-    size_t start = 0;
-    std::vector<std::thread> threads;
-    threads.reserve(numThreads);
-
-	for (size_t t = 0; t < numThreads; ++t)
-	{
-		size_t sz = chunk + (t < rem ? 1 : 0);
-		size_t s = start;
-		size_t e = s + sz;
-		threads.emplace_back([=, &partials]() {
-			uint64_t acc = 0;
-			const uint32_t* p = src + s;
-			size_t len = e - s;
-			// plain loop — compiler will usually vectorize
-			for (size_t i = 0; i < len; ++i) acc += p[i];
-			partials[t] = acc;
-		});
-		start = e;
-	}
-    for (auto &th : threads) th.join();
-
-    uint64_t total = 0;
-    for (auto v : partials) total += v;
-    return total;
-}
-
-} // namespace cpu_compute
-
 	std::vector<glsl::program_input> compute_task::get_inputs()
 	{
 		std::vector<glsl::program_input> result;
@@ -184,7 +68,7 @@ inline uint64_t parallel_sum_u32(const uint32_t* src, size_t count, size_t numTh
 				// Warps are multiples of 32. Increasing kernel depth seems to hurt performance (Nier, Big Duck sample)
 				unroll_loops = true;
 				optimal_kernel_size = 1;
-				optimal_group_size = 128; //32;
+				optimal_group_size = 32;
 				break;
 			case vk::driver_vendor::AMD:
 			case vk::driver_vendor::RADV:
@@ -415,56 +299,13 @@ inline uint64_t parallel_sum_u32(const uint32_t* src, size_t count, size_t numTh
 		m_program->bind_uniform({ *m_data, m_data_offset, m_ssbo_length }, 0, 0);
 	}
 
-	void cs_interleave_task::run(const vk::command_buffer& /*cmd*/,
-								const vk::buffer* data,
-								u32 data_offset,
-								u32 data_length,
-								u32 zeta_offset,
-								u32 stencil_offset)
+	void cs_interleave_task::run(const vk::command_buffer& cmd, const vk::buffer* data, u32 data_offset, u32 data_length, u32 zeta_offset, u32 stencil_offset)
 	{
-		size_t hw = std::max<size_t>(1, std::thread::hardware_concurrency());
-		size_t numThreads = std::min<size_t>(hw, 8);
-
 		m_params = { data_length, zeta_offset - data_offset, stencil_offset - data_offset, 0 };
-		ensure(stencil_offset > data_offset);
 
-		void* mapped = const_cast<vk::buffer*>(data)->map(data_offset, data_length);
-		if (!mapped) {
-			rsx_log.error("cs_interleave_task::run: failed to map buffer");
-			return;
-		}
-
-		uint8_t* base_bytes = reinterpret_cast<uint8_t*>(mapped);
-		uint8_t* region_start = base_bytes + data_offset;
-		uint32_t* words = reinterpret_cast<uint32_t*>(region_start);
-
-		const size_t region_total_words = (data->size() - data_offset) / 4;
-		const size_t block_length_words = data_length / 4;
-		const size_t z_offset_words = (zeta_offset - data_offset) / 4;
-		const size_t s_offset_words = (stencil_offset - data_offset) / 4;
-
-		if ((block_length_words + z_offset_words) > region_total_words) {
-			const_cast<vk::buffer*>(data)->unmap();
-			rsx_log.error("cs_interleave_task::run: z_offset out of bounds");
-			return;
-		}
-		size_t stencil_word_count = (block_length_words + 3) / 4;
-		if ((stencil_word_count + s_offset_words) > region_total_words) {
-			const_cast<vk::buffer*>(data)->unmap();
-			rsx_log.error("cs_interleave_task::run: stencil offset out of bounds");
-			return;
-		}
-
-		cpu_compute::scatter_d24x8_cpu(
-			/*base_words=*/ words,
-			/*total_words=*/ region_total_words,
-			/*block_length_words=*/ block_length_words,
-			/*z_offset_words=*/ z_offset_words,
-			/*s_offset_words=*/ s_offset_words,
-			/*numThreads=*/ numThreads
-		);
-
-		const_cast<vk::buffer*>(data)->unmap();
+		ensure(stencil_offset > data_offset);
+		m_ssbo_length = stencil_offset + (data_length / 4) - data_offset;
+		cs_shuffle_base::run(cmd, data, data_length, data_offset);
 	}
 
 	cs_scatter_d24x8::cs_scatter_d24x8()
@@ -518,36 +359,14 @@ inline uint64_t parallel_sum_u32(const uint32_t* src, size_t count, size_t numTh
 		m_program->bind_uniform({ *dst, 0, 4 }, 0, 1);
 	}
 
-	void cs_aggregator::run(const vk::command_buffer& /*cmd*/, const vk::buffer* dst, const vk::buffer* src, u32 num_words)
+	void cs_aggregator::run(const vk::command_buffer& cmd, const vk::buffer* dst, const vk::buffer* src, u32 num_words)
 	{
 		this->dst = dst;
 		this->src = src;
 		word_count = num_words;
 		block_length = num_words * 4;
 
-		size_t hw = std::max<size_t>(1, std::thread::hardware_concurrency());
-		size_t numThreads = std::min<size_t>(hw, 8);
-
-		void* src_map = const_cast<vk::buffer*>(src)->map(0, num_words * 4);
-		if (!src_map) {
-			rsx_log.error("cs_aggregator::run: failed to map src buffer");
-			return;
-		}
-
-		const uint32_t* src_words = reinterpret_cast<const uint32_t*>(src_map);
-		uint64_t sum = cpu_compute::parallel_sum_u32(src_words, static_cast<size_t>(num_words), numThreads);
-
-		const_cast<vk::buffer*>(src)->unmap();
-
-		void* dst_map = const_cast<vk::buffer*>(dst)->map(0, 4);
-		if (!dst_map) {
-			rsx_log.error("cs_aggregator::run: failed to map dst buffer");
-			return;
-		}
-
-		uint32_t result32 = static_cast<uint32_t>(sum);
-		std::memcpy(dst_map, &result32, sizeof(result32));
-
-		const_cast<vk::buffer*>(dst)->unmap();
+		const u32 linear_invocations = utils::aligned_div(word_count, optimal_group_size);
+		compute_task::run(cmd, linear_invocations);
 	}
-}
+}
\ No newline at end of file

From f932632d9a2aab6c78376f29ac67cd7cb2c100c7 Mon Sep 17 00:00:00 2001
From: Steve <61917452+awildergoose@users.noreply.github.com>
Date: Sun, 19 Oct 2025 22:44:31 +0300
Subject: [PATCH 4/6] chore: fix clang build fail

---
 3rdparty/OpenAL/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/3rdparty/OpenAL/CMakeLists.txt b/3rdparty/OpenAL/CMakeLists.txt
index b9fee23ce5fc..368b80c54fa0 100644
--- a/3rdparty/OpenAL/CMakeLists.txt
+++ b/3rdparty/OpenAL/CMakeLists.txt
@@ -13,6 +13,7 @@ else()
     option(ALSOFT_EXAMPLES "Build example programs" OFF)
     set(LIBTYPE "STATIC")
     add_subdirectory(openal-soft EXCLUDE_FROM_ALL)
+    target_compile_options(alsoft.fmt PRIVATE -include cstdlib)
     add_library(3rdparty_openal INTERFACE)
     target_link_libraries(3rdparty_openal INTERFACE OpenAL::OpenAL)
 endif()

From e1b4d963afd4b5a463f8a9d590d2861bf9fa53f7 Mon Sep 17 00:00:00 2001
From: Steve <61917452+awildergoose@users.noreply.github.com>
Date: Sun, 19 Oct 2025 23:00:01 +0300
Subject: [PATCH 5/6] fix win clang

---
 3rdparty/OpenAL/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/3rdparty/OpenAL/CMakeLists.txt b/3rdparty/OpenAL/CMakeLists.txt
index 368b80c54fa0..b9fee23ce5fc 100644
--- a/3rdparty/OpenAL/CMakeLists.txt
+++ b/3rdparty/OpenAL/CMakeLists.txt
@@ -13,7 +13,6 @@ else()
     option(ALSOFT_EXAMPLES "Build example programs" OFF)
     set(LIBTYPE "STATIC")
     add_subdirectory(openal-soft EXCLUDE_FROM_ALL)
-    target_compile_options(alsoft.fmt PRIVATE -include cstdlib)
     add_library(3rdparty_openal INTERFACE)
     target_link_libraries(3rdparty_openal INTERFACE OpenAL::OpenAL)
 endif()

From 4cc1a3952b10300e54ee664c73f31f8b607f15a6 Mon Sep 17 00:00:00 2001
From: Steve <61917452+awildergoose@users.noreply.github.com>
Date: Sun, 19 Oct 2025 23:04:54 +0300
Subject: [PATCH 6/6] Add compiler options for OpenAL based on compiler

---
 3rdparty/CMakeLists.txt | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt
index 46307746a319..2e6207e96be6 100644
--- a/3rdparty/CMakeLists.txt
+++ b/3rdparty/CMakeLists.txt
@@ -244,6 +244,12 @@ if(USE_SDL)
 endif()
 
 # OpenAL
+if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU")
+  add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:SHELL:-include cstdlib>") # C++ sources only; SHELL: keeps "-include cstdlib" together through option de-duplication
+elseif(MSVC)
+  add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/FIcstdlib>") # C++ sources only; one token so the header name cannot be separated from /FI
+endif()
+
 if (NOT ANDROID)
 	add_subdirectory(OpenAL EXCLUDE_FROM_ALL)
 else()