Overallocation of workgroup buffer inside compute shader causes memory corruption

### Describe the bug
If you create a compute shader with workgroup buffers that need more memory than the allowed limit then writing to one buffer will cause corruption in other buffers.

### To Reproduce
Code attached at bottom.

### Observed behaviour
If I write to shared buffer and then write to another shared buffer, the second shared buffor has results overlapping with the previous buffer in read time.

### Screenshots
NA

### Your environment

██ system:

             platform:  Windows-10-10.0.26100-SP0
python_implementation:  CPython
               python:  3.10.16

██ versions:

 wgpu:  0.22.2
 cffi:  1.17.1
numpy:  2.2.6

██ wgpu_native_info:

expected_version:  24.0.3.1
     lib_version:  24.0.3.1
        lib_path:  .\resources\wgpu_native-release.dll

██ object_counts:

                      count  resource_mem

            Adapter:      1
          BindGroup:      0
    BindGroupLayout:      0
             Buffer:      0             0
      CanvasContext:      0
      CommandBuffer:      0
     CommandEncoder:      0
 ComputePassEncoder:      0
    ComputePipeline:      0
             Device:      1
     PipelineLayout:      0
           QuerySet:      0
              Queue:      1
       RenderBundle:      0
RenderBundleEncoder:      0
  RenderPassEncoder:      0
     RenderPipeline:      0
            Sampler:      0
       ShaderModule:      0
            Texture:      0
        TextureView:      0

              total:      3             0

██ wgpu_native_counts:

                  count  mem  hub   a  k  r  el_size

        Adapter:      1    8  hub:  1  1  0        8
      BindGroup:      0    0  hub:  0  0  1       16
BindGroupLayout:      0    0  hub:  0  0  1       16
         Buffer:      0    0  hub:  0  0  7       16
  CanvasContext:      0    0        0  0  0        8
  CommandBuffer:      0    0  hub:  0  0  1        8
ComputePipeline:      0    0  hub:  0  0  1       16
         Device:      1    8  hub:  1  1  0        8
  PipelineCache:      0    0  hub:  0  0  0       16
 PipelineLayout:      0    0  hub:  0  0  1       16
       QuerySet:      0    0  hub:  0  0  0       16
          Queue:      1    8  hub:  1  1  0        8
   RenderBundle:      0    0  hub:  0  0  0       16
 RenderPipeline:      0    0  hub:  0  0  0       16
        Sampler:      0    0  hub:  0  0  0       16
   ShaderModule:      2   32  hub:  2  2  1       16
        Texture:      0    0  hub:  0  0  0       16
    TextureView:      0    0  hub:  0  0  0       16

          total:      5   56

    * The a, k, r are allocated, kept, and released, respectively.
    * Reported memory does not include buffer/texture data.


## Steps to reproduce 
```
const RANDOM_FLOAT_ARRAY_LENGTH:u32 = 256;
const OBJECTIVE_FUNCTION_OUTPUT_LENGTH:u32 = 63;
const OBJECTIVE_FUNCTION_INPUT_LENGTH:u32 = 195;
const SVD_EPSILON:f32 = 1e-10;
const WORKGROUP_SIZE_X:u32 = 8;
const WORKGROUP_SIZE_Y :u32 = 8;

struct State{
    random_vector_norm:f32
}
const WORKGROUP_LENGTH = WORKGROUP_SIZE_X * WORKGROUP_SIZE_Y;
const OBJECTIVE_FUNCTION_JACOBIAN_SIZE = OBJECTIVE_FUNCTION_OUTPUT_LENGTH * OBJECTIVE_FUNCTION_INPUT_LENGTH;

@group(0) @binding(1)
var<storage, read> jacobian: array<f32>;


@group(0) @binding(2)
var<storage, read_write> U: array<f32>;


var<workgroup> workgroup_shared_obj_out_len_buf: array<f32, OBJECTIVE_FUNCTION_OUTPUT_LENGTH>;
var<workgroup> workgroup_shared_obj_inp_len_buf: array<f32, OBJECTIVE_FUNCTION_INPUT_LENGTH>;
var<workgroup> jacobian_workgroup_copy: array<
    f32, 
    OBJECTIVE_FUNCTION_INPUT_LENGTH * OBJECTIVE_FUNCTION_OUTPUT_LENGTH
>;
var<workgroup> decomposed_jacobian: array<
    f32, 
    OBJECTIVE_FUNCTION_INPUT_LENGTH * OBJECTIVE_FUNCTION_OUTPUT_LENGTH
>;
var<workgroup> one_dim_svd_b: array<
    f32, 
    OBJECTIVE_FUNCTION_OUTPUT_LENGTH * OBJECTIVE_FUNCTION_OUTPUT_LENGTH
>;
var<workgroup> current_u_buffer: array<f32, OBJECTIVE_FUNCTION_OUTPUT_LENGTH>;
var<workgroup> current_v_buffer: array<f32, OBJECTIVE_FUNCTION_INPUT_LENGTH>;
var<workgroup> state: State;

fn row_major_access_jac(r: u32, c:u32)->u32{
    return r * OBJECTIVE_FUNCTION_INPUT_LENGTH + c;
}

fn col_major_access_jac(r: u32, c:u32)->u32{
    return c * OBJECTIVE_FUNCTION_OUTPUT_LENGTH + r;
}

fn sweep_calculator(length: u32, workgroup_size: u32) -> u32 {
    return (length + workgroup_size - 1u) / workgroup_size; // ceiling division
}


fn copy_jacobian_into_local_buffers(
    local_invocation_index: u32,
){
    let total_elements = OBJECTIVE_FUNCTION_JACOBIAN_SIZE;
    let workgroup_size = WORKGROUP_LENGTH; // e.g., 64
    var index = local_invocation_index;
    while (index < total_elements) {
        jacobian_workgroup_copy[index] = jacobian[index];
        decomposed_jacobian[index] = jacobian[index];
        index += workgroup_size;
    }
    workgroupBarrier();
}


fn fill_u_buffer_and_shared_output_buffer_with_random_normal_vector(
    local_invocation_id : vec3<u32>,
    global_invocation_id : vec3<u32>,
    local_invocation_index: u32,
){
    let sweeps = sweep_calculator(OBJECTIVE_FUNCTION_OUTPUT_LENGTH, WORKGROUP_LENGTH);
    for(var iter = 0u ; iter < sweeps ; iter++){
        let index = WORKGROUP_LENGTH * iter + local_invocation_index;
        if(index < OBJECTIVE_FUNCTION_OUTPUT_LENGTH){
            workgroup_shared_obj_out_len_buf[index] = 9.9;
            current_u_buffer[index] = 9.9;
        }
    }
    workgroupBarrier();

}

fn fill_one_dim_svd_b_buffer(
    local_invocation_id : vec3<u32>,
    local_invocation_index: u32,
){
    let sweeps = sweep_calculator(OBJECTIVE_FUNCTION_OUTPUT_LENGTH* OBJECTIVE_FUNCTION_OUTPUT_LENGTH, WORKGROUP_LENGTH);
    for(var iter = 0u ; iter < sweeps ; iter++){
        let index = iter * WORKGROUP_LENGTH + local_invocation_index;
        if(index >= OBJECTIVE_FUNCTION_OUTPUT_LENGTH * OBJECTIVE_FUNCTION_OUTPUT_LENGTH){continue;}
        let row = index / OBJECTIVE_FUNCTION_OUTPUT_LENGTH; 
        let col = index % OBJECTIVE_FUNCTION_OUTPUT_LENGTH; 
        var b_val = 0.0;
        for(var z_index:u32 = 0u ; z_index < OBJECTIVE_FUNCTION_INPUT_LENGTH ; z_index++ ){
            let left_matrix_index = row_major_access_jac(row, z_index);
            let right_matrix_index = row_major_access_jac(col, z_index);
            b_val += decomposed_jacobian[left_matrix_index] * decomposed_jacobian[right_matrix_index]; 
        }
        one_dim_svd_b[index] = b_val;
    }
    workgroupBarrier();
}

fn populate_u_with_next_singular_vector(
    local_invocation_id : vec3<u32>,
    global_invocation_id : vec3<u32>,
    local_invocation_index: u32,
){
    fill_u_buffer_and_shared_output_buffer_with_random_normal_vector(local_invocation_id, global_invocation_id, local_invocation_index);
    workgroupBarrier();
    fill_one_dim_svd_b_buffer(local_invocation_id, local_invocation_index);
    workgroupBarrier();
    let sweeps = sweep_calculator(OBJECTIVE_FUNCTION_OUTPUT_LENGTH*OBJECTIVE_FUNCTION_OUTPUT_LENGTH, WORKGROUP_LENGTH);
    for(var iter = 0u ; iter < sweeps ; iter++){
        let index = WORKGROUP_LENGTH * iter + local_invocation_index;
        if(index < OBJECTIVE_FUNCTION_OUTPUT_LENGTH*OBJECTIVE_FUNCTION_OUTPUT_LENGTH){
            U[index] = one_dim_svd_b[index];
        }
    }
    workgroupBarrier();
}

@compute @workgroup_size(WORKGROUP_SIZE_X, WORKGROUP_SIZE_Y)
fn main(
    @builtin(local_invocation_id) local_invocation_id : vec3<u32>,
    @builtin(global_invocation_id) global_invocation_id : vec3<u32>,
    @builtin(local_invocation_index) local_invocation_index: u32,
) {
    copy_jacobian_into_local_buffers(local_invocation_index);
    populate_u_with_next_singular_vector(local_invocation_id, global_invocation_id, local_invocation_index);
}
```

In the above code, if you comment out the write to workgroup_shared_obj_out_len_buf or remove redundant workgroup shared buffers, the code works and one_dim_svd_b and U will have the intended values. Otherwise one_dim_svd_b  will get corrupted and U will have incorrect values. Ideally the compiler should error on compile time if configures the shader to use more shared memory than available. 

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

Overallocation of workgroup buffer inside compute shader causes memory corruption #724

Describe the bug

To Reproduce

Observed behaviour

Screenshots

Your environment

Steps to reproduce

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Uh oh!

Overallocation of workgroup buffer inside compute shader causes memory corruption #724

Description

Describe the bug

To Reproduce

Observed behaviour

Screenshots

Your environment

Steps to reproduce

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions