Skip to content

Overallocation of workgroup buffer inside compute shader causes memory corruption #724

@professorcode1

Description

@professorcode1

Describe the bug

If you create a compute shader whose workgroup buffers need more memory than the allowed limit, then writing to one buffer causes corruption in the other buffers.

To Reproduce

Code attached at bottom.

Observed behaviour

If I write to one shared buffer and then write to another shared buffer, the second shared buffer's contents overlap with the first buffer's when read back.

Screenshots

NA

Your environment

██ system:

         platform:  Windows-10-10.0.26100-SP0

python_implementation: CPython
python: 3.10.16

██ versions:

wgpu: 0.22.2
cffi: 1.17.1
numpy: 2.2.6

██ wgpu_native_info:

expected_version: 24.0.3.1
lib_version: 24.0.3.1
lib_path: .\resources\wgpu_native-release.dll

██ object_counts:

                  count  resource_mem

        Adapter:      1
      BindGroup:      0
BindGroupLayout:      0
         Buffer:      0             0
  CanvasContext:      0
  CommandBuffer:      0
 CommandEncoder:      0

ComputePassEncoder: 0
ComputePipeline: 0
Device: 1
PipelineLayout: 0
QuerySet: 0
Queue: 1
RenderBundle: 0
RenderBundleEncoder: 0
RenderPassEncoder: 0
RenderPipeline: 0
Sampler: 0
ShaderModule: 0
Texture: 0
TextureView: 0

          total:      3             0

██ wgpu_native_counts:

              count  mem  hub   a  k  r  el_size

    Adapter:      1    8  hub:  1  1  0        8
  BindGroup:      0    0  hub:  0  0  1       16

BindGroupLayout: 0 0 hub: 0 0 1 16
Buffer: 0 0 hub: 0 0 7 16
CanvasContext: 0 0 0 0 0 8
CommandBuffer: 0 0 hub: 0 0 1 8
ComputePipeline: 0 0 hub: 0 0 1 16
Device: 1 8 hub: 1 1 0 8
PipelineCache: 0 0 hub: 0 0 0 16
PipelineLayout: 0 0 hub: 0 0 1 16
QuerySet: 0 0 hub: 0 0 0 16
Queue: 1 8 hub: 1 1 0 8
RenderBundle: 0 0 hub: 0 0 0 16
RenderPipeline: 0 0 hub: 0 0 0 16
Sampler: 0 0 hub: 0 0 0 16
ShaderModule: 2 32 hub: 2 2 1 16
Texture: 0 0 hub: 0 0 0 16
TextureView: 0 0 hub: 0 0 0 16

      total:      5   56

* The a, k, r are allocated, kept, and released, respectively.
* Reported memory does not include buffer/texture data.

Steps to reproduce

// ── Shader-wide tunables ────────────────────────────────────────────────────
const RANDOM_FLOAT_ARRAY_LENGTH:u32 = 256;
// Jacobian dimensions: OUTPUT_LENGTH rows x INPUT_LENGTH columns.
const OBJECTIVE_FUNCTION_OUTPUT_LENGTH:u32 = 63;
const OBJECTIVE_FUNCTION_INPUT_LENGTH:u32 = 195;
const SVD_EPSILON:f32 = 1e-10;
// 8 x 8 = 64 invocations per workgroup (see WORKGROUP_LENGTH below).
const WORKGROUP_SIZE_X:u32 = 8;
const WORKGROUP_SIZE_Y :u32 = 8;

// Per-workgroup scalar state shared by all invocations.
struct State{
    random_vector_norm:f32
}
// Total invocations in one workgroup (64).
const WORKGROUP_LENGTH = WORKGROUP_SIZE_X * WORKGROUP_SIZE_Y;
// Flat element count of the jacobian: 63 * 195 = 12285.
const OBJECTIVE_FUNCTION_JACOBIAN_SIZE = OBJECTIVE_FUNCTION_OUTPUT_LENGTH * OBJECTIVE_FUNCTION_INPUT_LENGTH;

// Read-only jacobian, row-major (see row_major_access_jac).
@group(0) @binding(1)
var<storage, read> jacobian: array<f32>;


// Output buffer; populate_u_with_next_singular_vector writes B = J*J^T here.
@group(0) @binding(2)
var<storage, read_write> U: array<f32>;


// ── Workgroup-shared buffers ────────────────────────────────────────────────
// NOTE(review): these declarations total roughly 29,056 f32 (~113 KiB) of
// workgroup storage — far above WebGPU's default 16 KiB
// maxComputeWorkgroupStorageSize limit. This overallocation is the subject of
// this issue: the implementation should reject the shader at pipeline-creation
// time instead of silently corrupting memory across buffers.
var<workgroup> workgroup_shared_obj_out_len_buf: array<f32, OBJECTIVE_FUNCTION_OUTPUT_LENGTH>;
var<workgroup> workgroup_shared_obj_inp_len_buf: array<f32, OBJECTIVE_FUNCTION_INPUT_LENGTH>;
var<workgroup> jacobian_workgroup_copy: array<
    f32, 
    OBJECTIVE_FUNCTION_INPUT_LENGTH * OBJECTIVE_FUNCTION_OUTPUT_LENGTH
>;
var<workgroup> decomposed_jacobian: array<
    f32, 
    OBJECTIVE_FUNCTION_INPUT_LENGTH * OBJECTIVE_FUNCTION_OUTPUT_LENGTH
>;
var<workgroup> one_dim_svd_b: array<
    f32, 
    OBJECTIVE_FUNCTION_OUTPUT_LENGTH * OBJECTIVE_FUNCTION_OUTPUT_LENGTH
>;
var<workgroup> current_u_buffer: array<f32, OBJECTIVE_FUNCTION_OUTPUT_LENGTH>;
var<workgroup> current_v_buffer: array<f32, OBJECTIVE_FUNCTION_INPUT_LENGTH>;
var<workgroup> state: State;

// Flat index of element (r, c) in a row-major matrix with
// OBJECTIVE_FUNCTION_INPUT_LENGTH columns per row.
fn row_major_access_jac(r: u32, c:u32)->u32{
    let row_start = r * OBJECTIVE_FUNCTION_INPUT_LENGTH;
    return row_start + c;
}

// Flat index of element (r, c) in a column-major matrix with
// OBJECTIVE_FUNCTION_OUTPUT_LENGTH rows per column.
fn col_major_access_jac(r: u32, c:u32)->u32{
    let col_start = c * OBJECTIVE_FUNCTION_OUTPUT_LENGTH;
    return col_start + r;
}

// Number of strided passes a workgroup of `workgroup_size` invocations needs
// to cover `length` elements, i.e. ceil(length / workgroup_size).
fn sweep_calculator(length: u32, workgroup_size: u32) -> u32 {
    let rounded_up = length + (workgroup_size - 1u);
    return rounded_up / workgroup_size;
}


// Stage the bound `jacobian` storage buffer into both workgroup-shared copies
// (`jacobian_workgroup_copy` and `decomposed_jacobian`). Each invocation
// copies a strided subset of elements; the trailing barrier makes all writes
// visible to the whole workgroup before the copies are read.
fn copy_jacobian_into_local_buffers(
    local_invocation_index: u32,
){
    let element_count = OBJECTIVE_FUNCTION_JACOBIAN_SIZE;
    let stride = WORKGROUP_LENGTH; // one slot per invocation per pass
    for (var i = local_invocation_index; i < element_count; i += stride) {
        // Read the (read-only) storage buffer once per element.
        let value = jacobian[i];
        jacobian_workgroup_copy[i] = value;
        decomposed_jacobian[i] = value;
    }
    workgroupBarrier();
}


// Fill `workgroup_shared_obj_out_len_buf` and `current_u_buffer` (both of
// length OBJECTIVE_FUNCTION_OUTPUT_LENGTH) with the placeholder value 9.9.
// `local_invocation_id` and `global_invocation_id` are currently unused.
fn fill_u_buffer_and_shared_output_buffer_with_random_normal_vector(
    local_invocation_id : vec3<u32>,
    global_invocation_id : vec3<u32>,
    local_invocation_index: u32,
){
    let pass_count = sweep_calculator(OBJECTIVE_FUNCTION_OUTPUT_LENGTH, WORKGROUP_LENGTH);
    for (var pass = 0u; pass < pass_count; pass++) {
        let slot = WORKGROUP_LENGTH * pass + local_invocation_index;
        if (slot < OBJECTIVE_FUNCTION_OUTPUT_LENGTH) {
            workgroup_shared_obj_out_len_buf[slot] = 9.9;
            current_u_buffer[slot] = 9.9;
        }
    }
    workgroupBarrier();

}

// Compute B = J * J^T into `one_dim_svd_b`, where J is `decomposed_jacobian`
// (OUTPUT_LENGTH x INPUT_LENGTH, row-major). Entry (row, col) of B is the dot
// product of jacobian rows `row` and `col`. Entries of B are distributed
// across the workgroup in strided passes; `local_invocation_id` is unused.
fn fill_one_dim_svd_b_buffer(
    local_invocation_id : vec3<u32>,
    local_invocation_index: u32,
){
    let b_element_count = OBJECTIVE_FUNCTION_OUTPUT_LENGTH * OBJECTIVE_FUNCTION_OUTPUT_LENGTH;
    let pass_count = sweep_calculator(b_element_count, WORKGROUP_LENGTH);
    for (var pass = 0u; pass < pass_count; pass++) {
        let flat = pass * WORKGROUP_LENGTH + local_invocation_index;
        if (flat < b_element_count) {
            let row = flat / OBJECTIVE_FUNCTION_OUTPUT_LENGTH;
            let col = flat % OBJECTIVE_FUNCTION_OUTPUT_LENGTH;
            var dot_product = 0.0;
            for (var k: u32 = 0u; k < OBJECTIVE_FUNCTION_INPUT_LENGTH; k++) {
                dot_product += decomposed_jacobian[row_major_access_jac(row, k)]
                             * decomposed_jacobian[row_major_access_jac(col, k)];
            }
            one_dim_svd_b[flat] = dot_product;
        }
    }
    workgroupBarrier();
}

// Seed the shared output-length buffers, build B = J * J^T in `one_dim_svd_b`,
// then copy B out to the `U` storage buffer in strided passes.
fn populate_u_with_next_singular_vector(
    local_invocation_id : vec3<u32>,
    global_invocation_id : vec3<u32>,
    local_invocation_index: u32,
){
    fill_u_buffer_and_shared_output_buffer_with_random_normal_vector(
        local_invocation_id,
        global_invocation_id,
        local_invocation_index,
    );
    workgroupBarrier();
    fill_one_dim_svd_b_buffer(local_invocation_id, local_invocation_index);
    workgroupBarrier();
    // Write every element of B back to U; each invocation covers a strided subset.
    let b_element_count = OBJECTIVE_FUNCTION_OUTPUT_LENGTH * OBJECTIVE_FUNCTION_OUTPUT_LENGTH;
    let pass_count = sweep_calculator(b_element_count, WORKGROUP_LENGTH);
    for (var pass = 0u; pass < pass_count; pass++) {
        let flat = WORKGROUP_LENGTH * pass + local_invocation_index;
        if (flat < b_element_count) {
            U[flat] = one_dim_svd_b[flat];
        }
    }
    workgroupBarrier();
}

// Entry point: stage the jacobian into workgroup memory, then compute
// B = J * J^T and write it to the U storage buffer. This is the minimal
// reproduction for the workgroup-memory overallocation corruption.
@compute @workgroup_size(WORKGROUP_SIZE_X, WORKGROUP_SIZE_Y)
fn main(
    @builtin(local_invocation_id) local_invocation_id : vec3<u32>,
    @builtin(global_invocation_id) global_invocation_id : vec3<u32>,
    @builtin(local_invocation_index) local_invocation_index: u32,
) {
    copy_jacobian_into_local_buffers(local_invocation_index);
    populate_u_with_next_singular_vector(local_invocation_id, global_invocation_id, local_invocation_index);
}

In the above code, if you comment out the write to workgroup_shared_obj_out_len_buf or remove the redundant workgroup shared buffers, the code works and one_dim_svd_b and U will have the intended values. Otherwise one_dim_svd_b gets corrupted and U has incorrect values. Ideally the compiler should raise an error at compile time if the shader is configured to use more shared memory than is available.

Metadata

Metadata

Assignees

No one assigned

    Labels

    bugSomething isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions