@@ -175,7 +175,7 @@ void main() {
175175
176176 bool is_bc1 = push_constants.srcFormat >= VK_FORMAT_BC1_RGB_UNORM_BLOCK && push_constants.srcFormat <= VK_FORMAT_BC1_RGBA_SRGB_BLOCK;
177177 bool is_bc4 = push_constants.srcFormat == VK_FORMAT_BC4_UNORM_BLOCK || push_constants.srcFormat == VK_FORMAT_BC4_SNORM_BLOCK;
178- bool is_8bpp = is_bc1 || is_bc4;
178+ bool use_2_quads = is_bc1 || is_bc4;
179179
180180 ivec2 pixel_coord = ivec2(gl_WorkGroupID.xy) * 8 + ivec2(gl_LocalInvocationID.xy);
181181 ivec2 final_dst_coord = ivec2(push_constants.offsetx, push_constants.offsety) + pixel_coord;
@@ -223,40 +223,41 @@ void main() {
223223 return;
224224 }
225225
226- ivec2 local_coords = ivec2(gl_LocalInvocationID.xy); // on 8x8 warp grid
227- // The first worker in the 4x4 squad is the leader who's responsible for fetching data
228- bool is_leader = (local_coords.x % 4 == 0) && (local_coords.y % 4 == 0);
229- // 0 1
230- // 2 3
231- int shared_idx = (local_coords.y / 4) * 2 + (local_coords.x / 4);
232-
233- // Load the 128-bit block data into shared memory if this invocation is a leader
234- ivec2 block_coord = pixel_coord / 4;
235- uint block_stride = push_constants.srcRowLength > 0 ? push_constants.srcRowLength / 4 : push_constants.dst_width;
236- uint block_index = block_coord.y * block_stride + block_coord.x;
237- uint buffer_offset = is_8bpp ? block_index * 2 : block_index * 4;
238-
239- if (is_leader) {
240- if (is_8bpp) {
241- // 2 quads per 4x4 block
242- shared_block_data[shared_idx] = uvec4(
243- srcBuffer.data[buffer_offset + 0], srcBuffer.data[buffer_offset + 1],
244- // Do not use zw
245- 0, 0
246- );
226+ // Source row stride measured in 4x4 blocks (rounded up), and this invocation's linear index within the 8x8 workgroup
227+ uint block_stride = (push_constants.srcRowLength + 3u) / 4;
228+ uint local_thread_id = gl_LocalInvocationID.y * 8u + gl_LocalInvocationID.x;
229+ if (local_thread_id < 4u) {
230+ // Map the first four invocations onto the workgroup's 2x2 grid of source blocks (one block per loader)
231+ uvec2 local_block_coord = uvec2(local_thread_id % 2u, local_thread_id / 2u);
232+ uvec2 src_block_coord = uvec2(gl_WorkGroupID.xy) * 2 + local_block_coord;
233+
234+ // Check if source block is within compressed texture bounds
235+ // Each block is a 4x4 patch of texels
236+ uint src_blocks_width = (push_constants.dst_width + 3u) / 4u; // Blocks, not pixels
237+ uint src_blocks_height = (push_constants.dst_height + 3u) / 4u;
238+ if (src_block_coord.x < src_blocks_width && src_block_coord.y < src_blocks_height) {
239+ uint src_block_index = uint(src_block_coord.y) * block_stride + uint(src_block_coord.x);
240+ uint data_offset = src_block_index * (use_2_quads ? 2u : 4u);
241+ uint data0 = srcBuffer.data[data_offset];
242+ uint data1 = srcBuffer.data[data_offset + 1u];
243+ if (use_2_quads) {
244+ shared_block_data[local_thread_id] = uvec4(data0, data1, 0u, 0u);
245+ } else {
246+ uint data2 = srcBuffer.data[data_offset + 2u];
247+ uint data3 = srcBuffer.data[data_offset + 3u];
248+ shared_block_data[local_thread_id] = uvec4(data0, data1, data2, data3);
249+ }
247250 } else {
248- // 4 quads per 4x4 block
249- shared_block_data[shared_idx] = uvec4(
250- srcBuffer.data[buffer_offset + 0], srcBuffer.data[buffer_offset + 1],
251- srcBuffer.data[buffer_offset + 2], srcBuffer.data[buffer_offset + 3]
252- );
251+ shared_block_data[local_thread_id] = uvec4(0);
253252 }
254253 }
255-
256- // Synchronize to ensure all leaders have written their data before any invocation reads it
254+
255+ // Synchronize to ensure all loaders have written their data before any invocation reads it
257256 barrier();
258- // All invocations read the shared memory (only leaders fetch from main memory)
259- uvec4 payload = shared_block_data[shared_idx];
257+
258+ uvec2 block_coord = gl_LocalInvocationID.xy / 4;
259+ uint block_index = block_coord.y * 2u + block_coord.x;
260+ uvec4 payload = shared_block_data[block_index];
260261
261262 // Find the specific pixel this thread is responsible for within its 4x4 block
262263 ivec2 tile_coord = pixel_coord % 4;
0 commit comments