Skip to content

Commit a5f7b6e

Browse files
authored
Merge pull request #120 from leegao/agm_debug
Gate aggressive DCE behind a flag, and add bounds check for src buffer in shaders
2 parents 4cf56bd + 397cc07 commit a5f7b6e

File tree

4 files changed

+95
-83
lines changed

4 files changed

+95
-83
lines changed

src/vulkan/wrapper/bc6.comp

Lines changed: 29 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -701,29 +701,37 @@ void main() {
701701
return;
702702
}
703703

704-
ivec2 local_coords = ivec2(gl_LocalInvocationID.xy); // on 8x8 warp grid
705-
// The first worker in the 4x4 squad is the leader who's responsible for fetching data
706-
bool is_leader = (local_coords.x % 4 == 0) && (local_coords.y % 4 == 0);
707-
int shared_idx = (local_coords.y / 4) * 2 + (local_coords.x / 4);
708-
709-
// Load the 128-bit block data into shared memory if this invocation is a leader
710-
ivec2 block_coord = coord / 4;
711-
uint block_stride = push_constants.srcRowLength > 0 ? push_constants.srcRowLength / 4 : push_constants.dst_width;
712-
uint block_index = block_coord.y * block_stride + block_coord.x;
713-
uint buffer_offset = block_index * 4;
714-
715-
// Load the 128-bit block data into shared memory if this invocation is a leader
716-
if (is_leader) {
717-
shared_block_data[shared_idx] = uvec4(
718-
srcBuffer.data[buffer_offset + 0], srcBuffer.data[buffer_offset + 1],
719-
srcBuffer.data[buffer_offset + 2], srcBuffer.data[buffer_offset + 3]
720-
);
704+
// Calculate the block coordinate
705+
uint block_stride = (push_constants.srcRowLength + 3) / 4;
706+
uint local_thread_id = gl_LocalInvocationID.y * 8 + gl_LocalInvocationID.x;
707+
if (local_thread_id < 4) {
708+
// Calculate source block coordinates on the 2x2 = 4 grid
709+
uvec2 local_block_coord = uvec2(local_thread_id % 2, local_thread_id / 2);
710+
uvec2 src_block_coord = uvec2(gl_WorkGroupID.xy) * 2 + local_block_coord;
711+
712+
// Check if source block is within compressed texture bounds
713+
// Each block is a 4x4 patch of texels
714+
uint src_blocks_width = (push_constants.dst_width + 3) / 4; // Blocks, not pixels
715+
uint src_blocks_height = (push_constants.dst_height + 3) / 4;
716+
if (src_block_coord.x < src_blocks_width && src_block_coord.y < src_blocks_height) {
717+
uint src_block_index = uint(src_block_coord.y) * block_stride + uint(src_block_coord.x);
718+
uint data_offset = src_block_index * 4;
719+
uint data0 = srcBuffer.data[data_offset];
720+
uint data1 = srcBuffer.data[data_offset + 1];
721+
uint data2 = srcBuffer.data[data_offset + 2];
722+
uint data3 = srcBuffer.data[data_offset + 3];
723+
shared_block_data[local_thread_id] = uvec4(data0, data1, data2, data3);
724+
} else {
725+
shared_block_data[local_thread_id] = uvec4(0);
726+
}
721727
}
722-
723-
// Synchronize to ensure all leaders have written their data before any invocation reads it
728+
729+
// Synchronize to ensure all loaders have written their data before any invocation reads it
724730
barrier();
725-
// All invocations read the shared memory (only leaders fetch from main memory)
726-
uvec4 payload = shared_block_data[shared_idx];
731+
732+
uvec2 block_coord = gl_LocalInvocationID.xy / 4;
733+
uint block_index = block_coord.y * 2 + block_coord.x;
734+
uvec4 payload = shared_block_data[block_index];
727735

728736
// Decode the local pixel this thread is responsible for in the 4x4 grid
729737
ivec2 tile_coord = coord % 4;

src/vulkan/wrapper/bc7.comp

Lines changed: 29 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -427,29 +427,37 @@ void main() {
427427
return;
428428
}
429429

430-
ivec2 local_coords = ivec2(gl_LocalInvocationID.xy); // on 8x8 warp grid
431-
// The first worker in the 4x4 squad is the leader who's responsible for fetching data
432-
bool is_leader = (local_coords.x % 4 == 0) && (local_coords.y % 4 == 0);
433-
int shared_idx = (local_coords.y / 4) * 2 + (local_coords.x / 4);
434-
435-
// Load the 128-bit block data into shared memory if this invocation is a leader
436-
ivec2 block_coord = coord / 4;
437-
uint block_stride = push_constants.srcRowLength > 0 ? push_constants.srcRowLength / 4 : push_constants.dst_width;
438-
uint block_index = block_coord.y * block_stride + block_coord.x;
439-
uint buffer_offset = block_index * 4;
440-
441-
// Load the 128-bit block data into shared memory if this invocation is a leader
442-
if (is_leader) {
443-
shared_block_data[shared_idx] = uvec4(
444-
srcBuffer.data[buffer_offset + 0], srcBuffer.data[buffer_offset + 1],
445-
srcBuffer.data[buffer_offset + 2], srcBuffer.data[buffer_offset + 3]
446-
);
430+
// Calculate the block coordinate
431+
uint block_stride = (push_constants.srcRowLength + 3) / 4;
432+
uint local_thread_id = gl_LocalInvocationID.y * 8 + gl_LocalInvocationID.x;
433+
if (local_thread_id < 4) {
434+
// Calculate source block coordinates on the 2x2 = 4 grid
435+
uvec2 local_block_coord = uvec2(local_thread_id % 2, local_thread_id / 2);
436+
uvec2 src_block_coord = uvec2(gl_WorkGroupID.xy) * 2 + local_block_coord;
437+
438+
// Check if source block is within compressed texture bounds
439+
// Each block is a 4x4 patch of texels
440+
uint src_blocks_width = (push_constants.dst_width + 3) / 4; // Blocks, not pixels
441+
uint src_blocks_height = (push_constants.dst_height + 3) / 4;
442+
if (src_block_coord.x < src_blocks_width && src_block_coord.y < src_blocks_height) {
443+
uint src_block_index = uint(src_block_coord.y) * block_stride + uint(src_block_coord.x);
444+
uint data_offset = src_block_index * 4;
445+
uint data0 = srcBuffer.data[data_offset];
446+
uint data1 = srcBuffer.data[data_offset + 1];
447+
uint data2 = srcBuffer.data[data_offset + 2];
448+
uint data3 = srcBuffer.data[data_offset + 3];
449+
shared_block_data[local_thread_id] = uvec4(data0, data1, data2, data3);
450+
} else {
451+
shared_block_data[local_thread_id] = uvec4(0);
452+
}
447453
}
448-
449-
// Synchronize to ensure all leaders have written their data before any invocation reads it
454+
455+
// Synchronize to ensure all loaders have written their data before any invocation reads it
450456
barrier();
451-
// All invocations read the shared memory (only leaders fetch from main memory)
452-
uvec4 payload = shared_block_data[shared_idx];
457+
458+
uvec2 block_coord = gl_LocalInvocationID.xy / 4;
459+
uint block_index = block_coord.y * 2 + block_coord.x;
460+
uvec4 payload = shared_block_data[block_index];
453461

454462
// Find the specific pixel this thread is responsible for within its 4x4 block
455463
ivec2 tile_coord = coord & 3;

src/vulkan/wrapper/s3tc.comp

Lines changed: 32 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ void main() {
175175

176176
bool is_bc1 = push_constants.srcFormat >= VK_FORMAT_BC1_RGB_UNORM_BLOCK && push_constants.srcFormat <= VK_FORMAT_BC1_RGBA_SRGB_BLOCK;
177177
bool is_bc4 = push_constants.srcFormat == VK_FORMAT_BC4_UNORM_BLOCK || push_constants.srcFormat == VK_FORMAT_BC4_SNORM_BLOCK;
178-
bool is_8bpp = is_bc1 || is_bc4;
178+
bool use_2_quads = is_bc1 || is_bc4;
179179

180180
ivec2 pixel_coord = ivec2(gl_WorkGroupID.xy) * 8 + ivec2(gl_LocalInvocationID.xy);
181181
ivec2 final_dst_coord = ivec2(push_constants.offsetx, push_constants.offsety) + pixel_coord;
@@ -223,40 +223,41 @@ void main() {
223223
return;
224224
}
225225

226-
ivec2 local_coords = ivec2(gl_LocalInvocationID.xy); // on 8x8 warp grid
227-
// The first worker in the 4x4 squad is the leader who's responsible for fetching data
228-
bool is_leader = (local_coords.x % 4 == 0) && (local_coords.y % 4 == 0);
229-
// 0 1
230-
// 2 3
231-
int shared_idx = (local_coords.y / 4) * 2 + (local_coords.x / 4);
232-
233-
// Load the 128-bit block data into shared memory if this invocation is a leader
234-
ivec2 block_coord = pixel_coord / 4;
235-
uint block_stride = push_constants.srcRowLength > 0 ? push_constants.srcRowLength / 4 : push_constants.dst_width;
236-
uint block_index = block_coord.y * block_stride + block_coord.x;
237-
uint buffer_offset = is_8bpp ? block_index * 2 : block_index * 4;
238-
239-
if (is_leader) {
240-
if (is_8bpp) {
241-
// 2 quads per 4x4 block
242-
shared_block_data[shared_idx] = uvec4(
243-
srcBuffer.data[buffer_offset + 0], srcBuffer.data[buffer_offset + 1],
244-
// Do not use zw
245-
0, 0
246-
);
226+
// Calculate the block coordinate
227+
uint block_stride = (push_constants.srcRowLength + 3u) / 4;
228+
uint local_thread_id = gl_LocalInvocationID.y * 8u + gl_LocalInvocationID.x;
229+
if (local_thread_id < 4u) {
230+
// Calculate source block coordinates on the 2x2 = 4 grid
231+
uvec2 local_block_coord = uvec2(local_thread_id % 2u, local_thread_id / 2u);
232+
uvec2 src_block_coord = uvec2(gl_WorkGroupID.xy) * 2 + local_block_coord;
233+
234+
// Check if source block is within compressed texture bounds
235+
// Each block is a 4x4 patch of texels
236+
uint src_blocks_width = (push_constants.dst_width + 3u) / 4u; // Blocks, not pixels
237+
uint src_blocks_height = (push_constants.dst_height + 3u) / 4u;
238+
if (src_block_coord.x < src_blocks_width && src_block_coord.y < src_blocks_height) {
239+
uint src_block_index = uint(src_block_coord.y) * block_stride + uint(src_block_coord.x);
240+
uint data_offset = src_block_index * (use_2_quads ? 2u : 4u);
241+
uint data0 = srcBuffer.data[data_offset];
242+
uint data1 = srcBuffer.data[data_offset + 1u];
243+
if (use_2_quads) {
244+
shared_block_data[local_thread_id] = uvec4(data0, data1, 0u, 0u);
245+
} else {
246+
uint data2 = srcBuffer.data[data_offset + 2u];
247+
uint data3 = srcBuffer.data[data_offset + 3u];
248+
shared_block_data[local_thread_id] = uvec4(data0, data1, data2, data3);
249+
}
247250
} else {
248-
// 4 quads per 4x4 block
249-
shared_block_data[shared_idx] = uvec4(
250-
srcBuffer.data[buffer_offset + 0], srcBuffer.data[buffer_offset + 1],
251-
srcBuffer.data[buffer_offset + 2], srcBuffer.data[buffer_offset + 3]
252-
);
251+
shared_block_data[local_thread_id] = uvec4(0);
253252
}
254253
}
255-
256-
// Synchronize to ensure all leaders have written their data before any invocation reads it
254+
255+
// Synchronize to ensure all loaders have written their data before any invocation reads it
257256
barrier();
258-
// All invocations read the shared memory (only leaders fetch from main memory)
259-
uvec4 payload = shared_block_data[shared_idx];
257+
258+
uvec2 block_coord = gl_LocalInvocationID.xy / 4;
259+
uint block_index = block_coord.y * 2u + block_coord.x;
260+
uvec4 payload = shared_block_data[block_index];
260261

261262
// Find the specific pixel this thread is responsible for within its 4x4 block
262263
ivec2 tile_coord = pixel_coord % 4;

src/vulkan/wrapper/spirv_edit.cpp

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -125,18 +125,12 @@ int lower_eliminate_clip_distance(const uint32_t* spirv_binary, size_t spirv_wor
125125
OptimizerMessageConsumer(level, source, position, message, id);
126126
});
127127

128-
// Canonicalization passes
129-
// optimizer.RegisterPass(spvtools::CreateMergeReturnPass());
130-
// optimizer.RegisterPass(spvtools::CreateInlineExhaustivePass());
131-
// optimizer.RegisterPass(spvtools::CreateEliminateDeadFunctionsPass());
132-
133128
// Mali specific pass
134129
optimizer.RegisterPass(spvtools::CreateRemoveClipCullDistPass());
135130

136-
// optimizer.RegisterPerformancePasses(); // For -O
137-
138-
optimizer.RegisterPass(spvtools::CreateAggressiveDCEPass());
139-
// optimizer.RegisterPass(spvtools::CreateCompactIdsPass());
131+
if (CHECK_FLAG("SPIRV_AGGRESSIVE_DCE")) {
132+
optimizer.RegisterPass(spvtools::CreateAggressiveDCEPass());
133+
}
140134

141135
WLOGD("Original SPIR-V Word Count %d (id=%d)", spirv_word_count, id);
142136

@@ -151,7 +145,8 @@ int lower_eliminate_clip_distance(const uint32_t* spirv_binary, size_t spirv_wor
151145
WLOGD("Lowered SPIR-V Word Count %d (id=%d)", optimized_binary.size(), id);
152146

153147
if (optimized_binary.size() != spirv_word_count) {
154-
LogDisassembly("Original", {spirv_binary, spirv_binary+spirv_word_count}, id);
148+
if (CHECK_FLAG("LOG_DISASSEMBLY"))
149+
LogDisassembly("Original", {spirv_binary, spirv_binary+spirv_word_count}, id);
155150
LogDisassembly("Lowered", optimized_binary, id);
156151
}
157152

0 commit comments

Comments
 (0)