-
-
Notifications
You must be signed in to change notification settings - Fork 50
Description
Describe the bug
If you create a compute shader with workgroup buffers that need more memory than the allowed limit then writing to one buffer will cause corruption in other buffers.
To Reproduce
Code attached at bottom.
Observed behaviour
If I write to one shared buffer and then write to another shared buffer, the second shared buffer has results overlapping with the previous buffer when read back.
Screenshots
NA
Your environment
██ system:
platform: Windows-10-10.0.26100-SP0
python_implementation: CPython
python: 3.10.16
██ versions:
wgpu: 0.22.2
cffi: 1.17.1
numpy: 2.2.6
██ wgpu_native_info:
expected_version: 24.0.3.1
lib_version: 24.0.3.1
lib_path: .\resources\wgpu_native-release.dll
██ object_counts:
count resource_mem
Adapter: 1
BindGroup: 0
BindGroupLayout: 0
Buffer: 0 0
CanvasContext: 0
CommandBuffer: 0
CommandEncoder: 0
ComputePassEncoder: 0
ComputePipeline: 0
Device: 1
PipelineLayout: 0
QuerySet: 0
Queue: 1
RenderBundle: 0
RenderBundleEncoder: 0
RenderPassEncoder: 0
RenderPipeline: 0
Sampler: 0
ShaderModule: 0
Texture: 0
TextureView: 0
total: 3 0
██ wgpu_native_counts:
count mem hub a k r el_size
Adapter: 1 8 hub: 1 1 0 8
BindGroup: 0 0 hub: 0 0 1 16
BindGroupLayout: 0 0 hub: 0 0 1 16
Buffer: 0 0 hub: 0 0 7 16
CanvasContext: 0 0 0 0 0 8
CommandBuffer: 0 0 hub: 0 0 1 8
ComputePipeline: 0 0 hub: 0 0 1 16
Device: 1 8 hub: 1 1 0 8
PipelineCache: 0 0 hub: 0 0 0 16
PipelineLayout: 0 0 hub: 0 0 1 16
QuerySet: 0 0 hub: 0 0 0 16
Queue: 1 8 hub: 1 1 0 8
RenderBundle: 0 0 hub: 0 0 0 16
RenderPipeline: 0 0 hub: 0 0 0 16
Sampler: 0 0 hub: 0 0 0 16
ShaderModule: 2 32 hub: 2 2 1 16
Texture: 0 0 hub: 0 0 0 16
TextureView: 0 0 hub: 0 0 0 16
total: 5 56
* The a, k, r are allocated, kept, and released, respectively.
* Reported memory does not include buffer/texture data.
Steps to reproduce
// ---- Compile-time configuration -----------------------------------------
const RANDOM_FLOAT_ARRAY_LENGTH:u32 = 256;
const OBJECTIVE_FUNCTION_OUTPUT_LENGTH:u32 = 63;
const OBJECTIVE_FUNCTION_INPUT_LENGTH:u32 = 195;
const SVD_EPSILON:f32 = 1e-10;
const WORKGROUP_SIZE_X:u32 = 8;
const WORKGROUP_SIZE_Y :u32 = 8;
// Per-workgroup scalar state shared by all invocations (see var<workgroup> state).
struct State{
random_vector_norm:f32
}
// Total invocations per workgroup: 8 * 8 = 64.
const WORKGROUP_LENGTH = WORKGROUP_SIZE_X * WORKGROUP_SIZE_Y;
// Flattened jacobian element count: 63 * 195 = 12285.
const OBJECTIVE_FUNCTION_JACOBIAN_SIZE = OBJECTIVE_FUNCTION_OUTPUT_LENGTH * OBJECTIVE_FUNCTION_INPUT_LENGTH;
// ---- Bindings ------------------------------------------------------------
// Input jacobian, accessed row-major as OUTPUT_LENGTH rows x INPUT_LENGTH cols
// (see row_major_access_jac).
@group(0) @binding(1)
var<storage, read> jacobian: array<f32>;
// Output buffer; receives one_dim_svd_b in populate_u_with_next_singular_vector.
@group(0) @binding(2)
var<storage, read_write> U: array<f32>;
// ---- Workgroup-shared scratch -------------------------------------------
// NOTE(review): the workgroup arrays below declare
// 63 + 195 + 12285 + 12285 + 3969 + 63 + 195 = 29055 f32 values (~113 KiB),
// far above the default WebGPU maxComputeWorkgroupStorageSize of 16384 bytes.
// This over-allocation is the condition under which this issue reports
// silent cross-buffer corruption instead of a compile-time error.
var<workgroup> workgroup_shared_obj_out_len_buf: array<f32, OBJECTIVE_FUNCTION_OUTPUT_LENGTH>;
var<workgroup> workgroup_shared_obj_inp_len_buf: array<f32, OBJECTIVE_FUNCTION_INPUT_LENGTH>;
// Two full copies of the jacobian (12285 f32 each).
var<workgroup> jacobian_workgroup_copy: array<
f32,
OBJECTIVE_FUNCTION_INPUT_LENGTH * OBJECTIVE_FUNCTION_OUTPUT_LENGTH
>;
var<workgroup> decomposed_jacobian: array<
f32,
OBJECTIVE_FUNCTION_INPUT_LENGTH * OBJECTIVE_FUNCTION_OUTPUT_LENGTH
>;
// B = J * J^T, OUTPUT_LENGTH x OUTPUT_LENGTH (3969 f32).
var<workgroup> one_dim_svd_b: array<
f32,
OBJECTIVE_FUNCTION_OUTPUT_LENGTH * OBJECTIVE_FUNCTION_OUTPUT_LENGTH
>;
var<workgroup> current_u_buffer: array<f32, OBJECTIVE_FUNCTION_OUTPUT_LENGTH>;
var<workgroup> current_v_buffer: array<f32, OBJECTIVE_FUNCTION_INPUT_LENGTH>;
var<workgroup> state: State;
// Flattened index of element (r, c) when the jacobian is stored row-major,
// i.e. each row holds OBJECTIVE_FUNCTION_INPUT_LENGTH consecutive values.
fn row_major_access_jac(r: u32, c:u32)->u32{
    return c + OBJECTIVE_FUNCTION_INPUT_LENGTH * r;
}
// Flattened index of element (r, c) when the jacobian is stored column-major,
// i.e. each column holds OBJECTIVE_FUNCTION_OUTPUT_LENGTH consecutive values.
fn col_major_access_jac(r: u32, c:u32)->u32{
    return r + OBJECTIVE_FUNCTION_OUTPUT_LENGTH * c;
}
// Number of strided passes a workgroup of `workgroup_size` threads needs to
// cover `length` items: ceil(length / workgroup_size), computed with integer
// division and an explicit remainder check.
fn sweep_calculator(length: u32, workgroup_size: u32) -> u32 {
    let whole = length / workgroup_size;
    let partial = length % workgroup_size;
    return select(whole, whole + 1u, partial != 0u);
}
// Stage the storage-buffer jacobian into both workgroup-memory copies.
// Each invocation copies every WORKGROUP_LENGTH-th element starting at its
// own local index, so the whole workgroup covers all elements exactly once.
// Ends with a barrier so every copy is visible before the buffers are read.
fn copy_jacobian_into_local_buffers(
    local_invocation_index: u32,
){
    for (var i = local_invocation_index; i < OBJECTIVE_FUNCTION_JACOBIAN_SIZE; i += WORKGROUP_LENGTH) {
        let value = jacobian[i];
        jacobian_workgroup_copy[i] = value;
        decomposed_jacobian[i] = value;
    }
    workgroupBarrier();
}
// Fill workgroup_shared_obj_out_len_buf and current_u_buffer (both of length
// OBJECTIVE_FUNCTION_OUTPUT_LENGTH) with the placeholder value 9.9.
// Each invocation walks its own stride of WORKGROUP_LENGTH; a final barrier
// publishes the writes to the whole workgroup.
// NOTE(review): despite the name, no random values are produced here —
// every slot is set to the constant 9.9.
fn fill_u_buffer_and_shared_output_buffer_with_random_normal_vector(
    local_invocation_id : vec3<u32>,
    global_invocation_id : vec3<u32>,
    local_invocation_index: u32,
){
    var slot = local_invocation_index;
    while (slot < OBJECTIVE_FUNCTION_OUTPUT_LENGTH) {
        workgroup_shared_obj_out_len_buf[slot] = 9.9;
        current_u_buffer[slot] = 9.9;
        slot += WORKGROUP_LENGTH;
    }
    workgroupBarrier();
}
// Compute B = J_d * J_d^T from the decomposed_jacobian workgroup copy:
//   B[row][col] = sum_k J_d[row][k] * J_d[col][k]
// where B is OUTPUT_LENGTH x OUTPUT_LENGTH, stored row-major in one_dim_svd_b.
// Work is distributed by flat element index across the workgroup; a final
// barrier publishes the result.
fn fill_one_dim_svd_b_buffer(
    local_invocation_id : vec3<u32>,
    local_invocation_index: u32,
){
    let total = OBJECTIVE_FUNCTION_OUTPUT_LENGTH * OBJECTIVE_FUNCTION_OUTPUT_LENGTH;
    let sweeps = sweep_calculator(total, WORKGROUP_LENGTH);
    for (var sweep = 0u; sweep < sweeps; sweep++) {
        let index = sweep * WORKGROUP_LENGTH + local_invocation_index;
        if (index < total) {
            let row = index / OBJECTIVE_FUNCTION_OUTPUT_LENGTH;
            let col = index % OBJECTIVE_FUNCTION_OUTPUT_LENGTH;
            var acc = 0.0;
            for (var k: u32 = 0u; k < OBJECTIVE_FUNCTION_INPUT_LENGTH; k++) {
                acc += decomposed_jacobian[row_major_access_jac(row, k)]
                     * decomposed_jacobian[row_major_access_jac(col, k)];
            }
            one_dim_svd_b[index] = acc;
        }
    }
    workgroupBarrier();
}
// Reproduction driver: seed the shared buffers with placeholder values,
// build B = J_d * J_d^T into one_dim_svd_b, then copy B out to the U
// storage buffer so the host can inspect it for corruption.
fn populate_u_with_next_singular_vector(
    local_invocation_id : vec3<u32>,
    global_invocation_id : vec3<u32>,
    local_invocation_index: u32,
){
    fill_u_buffer_and_shared_output_buffer_with_random_normal_vector(local_invocation_id, global_invocation_id, local_invocation_index);
    workgroupBarrier();
    fill_one_dim_svd_b_buffer(local_invocation_id, local_invocation_index);
    workgroupBarrier();
    // Strided copy of the full OUTPUT_LENGTH^2 matrix into storage.
    let total = OBJECTIVE_FUNCTION_OUTPUT_LENGTH * OBJECTIVE_FUNCTION_OUTPUT_LENGTH;
    var index = local_invocation_index;
    while (index < total) {
        U[index] = one_dim_svd_b[index];
        index += WORKGROUP_LENGTH;
    }
    workgroupBarrier();
}
// Kernel entry point, dispatched with an 8x8 (= 64 invocation) workgroup.
// Stages the jacobian into workgroup memory, then computes B = J*J^T and
// writes it to the U storage buffer.
@compute @workgroup_size(WORKGROUP_SIZE_X, WORKGROUP_SIZE_Y)
fn main(
@builtin(local_invocation_id) local_invocation_id : vec3<u32>,
@builtin(global_invocation_id) global_invocation_id : vec3<u32>,
@builtin(local_invocation_index) local_invocation_index: u32,
) {
copy_jacobian_into_local_buffers(local_invocation_index);
populate_u_with_next_singular_vector(local_invocation_id, global_invocation_id, local_invocation_index);
}
In the above code, if you comment out the write to workgroup_shared_obj_out_len_buf, or remove the redundant workgroup shared buffers, the code works and one_dim_svd_b and U hold the intended values. Otherwise one_dim_svd_b gets corrupted and U holds incorrect values. Ideally, the compiler should raise a compile-time error if the shader is configured to use more workgroup shared memory than the device allows.