diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 29001e3208515..8f44afa405938 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -439,15 +439,15 @@ TARGET_BUILTIN(__builtin_amdgcn_s_sleep_var, "vUi", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_permlane16_var, "UiUiUiUiIbIb", "nc", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_permlanex16_var, "UiUiUiUiIbIb", "nc", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal, "vIi", "n", "gfx12-insts") -TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_var, "vi", "n", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_var, "vv*i", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_barrier_wait, "vIs", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_isfirst, "bIi", "n", "gfx12-insts") -TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_isfirst_var, "bi", "n", "gfx12-insts") -TARGET_BUILTIN(__builtin_amdgcn_s_barrier_init, "vii", "n", "gfx12-insts") -TARGET_BUILTIN(__builtin_amdgcn_s_barrier_join, "vi", "n", "gfx12-insts") -TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts") -TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_s_barrier_init, "vv*i", "n", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_s_barrier_join, "vv*", "n", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vv*", "n", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "vIs", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_s_get_named_barrier_state, "Uiv*", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_prefetch_data, "vvC*Ui", "nc", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_buffer_prefetch_data, "vQbIiUi", "nc", "gfx12-insts") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl index 5d86a9b369429..1a5043328895a 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl @@ -23,6 +23,13 @@ kernel void builtins_amdgcn_s_barrier_signal_isfirst_err(global int* in, global *out = *in; } +kernel void builtins_amdgcn_s_barrier_leave_err(global int* in, global int* out, int barrier) { + + __builtin_amdgcn_s_barrier_signal(-1); + __builtin_amdgcn_s_barrier_leave(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_leave' must be a constant integer}} + *out = *in; +} + void test_s_buffer_prefetch_data(__amdgpu_buffer_rsrc_t rsrc, unsigned int off) { __builtin_amdgcn_s_buffer_prefetch_data(rsrc, off, 31); // expected-error {{'__builtin_amdgcn_s_buffer_prefetch_data' must be a constant integer}} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl index 9bfedac003296..b1866a8e492c8 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl @@ -87,16 +87,21 @@ void test_s_barrier_signal() // CHECK-LABEL: @test_s_barrier_signal_var( // CHECK-NEXT: entry: +// CHECK-NEXT: [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr // CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast 
ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 -// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3) +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) [[TMP1]], i32 [[TMP2]]) // CHECK-NEXT: ret void // -void test_s_barrier_signal_var(int a) +void test_s_barrier_signal_var(void *bar, int a) { - __builtin_amdgcn_s_barrier_signal_var(a); + __builtin_amdgcn_s_barrier_signal_var(bar, a); } // CHECK-LABEL: @test_s_barrier_signal_isfirst( @@ -134,110 +139,63 @@ void test_s_barrier_signal_isfirst(int* a, int* b, int *c) __builtin_amdgcn_s_barrier_wait(1); } -// CHECK-LABEL: @test_s_barrier_isfirst_var( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -// CHECK-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -// CHECK-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr -// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr -// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr -// CHECK-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr -// CHECK-NEXT: store ptr [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store ptr [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store ptr [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store i32 [[D:%.*]], ptr [[D_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[D_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst.var(i32 [[TMP0]]) -// CHECK-NEXT: br i1 [[TMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] -// CHECK: if.then: -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store ptr [[TMP2]], ptr [[A_ADDR_ASCAST]], align 8 -// CHECK-NEXT: br label [[IF_END:%.*]] -// CHECK: if.else: -// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store ptr [[TMP3]], ptr [[A_ADDR_ASCAST]], align 8 -// CHECK-NEXT: br label [[IF_END]] -// CHECK: if.end: -// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) -// CHECK-NEXT: ret void -// -void test_s_barrier_isfirst_var(int* a, int* b, int *c, int d) -{ - if ( __builtin_amdgcn_s_barrier_signal_isfirst_var(d)) - a = b; - else - a = c; - - __builtin_amdgcn_s_barrier_wait(1); - -} - // CHECK-LABEL: @test_s_barrier_init( // CHECK-NEXT: entry: +// CHECK-NEXT: [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr // CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 -// CHECK-NEXT: call void 
@llvm.amdgcn.s.barrier.init(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3) +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) [[TMP1]], i32 [[TMP2]]) // CHECK-NEXT: ret void // -void test_s_barrier_init(int a) +void test_s_barrier_init(void *bar, int a) { - __builtin_amdgcn_s_barrier_init(1, a); + __builtin_amdgcn_s_barrier_init(bar, a); } // CHECK-LABEL: @test_s_barrier_join( // CHECK-NEXT: entry: -// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(i32 1) +// CHECK-NEXT: [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr +// CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3) +// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) [[TMP1]]) // CHECK-NEXT: ret void // -void test_s_barrier_join() +void test_s_barrier_join(void *bar) { - __builtin_amdgcn_s_barrier_join(1); + __builtin_amdgcn_s_barrier_join(bar); } // CHECK-LABEL: @test_s_wakeup_barrier( // CHECK-NEXT: entry: -// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(i32 1) +// CHECK-NEXT: [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr +// CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3) +// CHECK-NEXT: call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) [[TMP1]]) // CHECK-NEXT: ret void // -void test_s_wakeup_barrier() +void test_s_wakeup_barrier(void *bar) { - __builtin_amdgcn_s_barrier_join(1); + __builtin_amdgcn_s_wakeup_barrier(bar); } // CHECK-LABEL: @test_s_barrier_leave( // CHECK-NEXT: entry: -// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -// CHECK-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr -// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr -// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr -// CHECK-NEXT: store ptr [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store ptr [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store ptr [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = call i1 @llvm.amdgcn.s.barrier.leave() -// CHECK-NEXT: br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] -// CHECK: if.then: -// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store ptr [[TMP1]], ptr [[A_ADDR_ASCAST]], align 8 -// CHECK-NEXT: br label [[IF_END:%.*]] -// CHECK: if.else: -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store ptr [[TMP2]], ptr [[A_ADDR_ASCAST]], align 8 -// CHECK-NEXT: br label [[IF_END]] -// CHECK: if.end: +// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.leave(i16 1) // CHECK-NEXT: ret void // -void test_s_barrier_leave(int* a, int* b, int *c) +void 
test_s_barrier_leave(int* a, int* b, int *c)
+void test_s_barrier_leave()
 {
-  if (__builtin_amdgcn_s_barrier_leave())
-    a = b;
-  else
-    a = c;
+  __builtin_amdgcn_s_barrier_leave(1);
 }
 
 // CHECK-LABEL: @test_s_get_barrier_state(
@@ -261,6 +219,28 @@ unsigned test_s_get_barrier_state(int a)
   return State;
 }
 
+// CHECK-LABEL: @test_s_get_named_barrier_state(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[STATE:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
+// CHECK-NEXT:    [[STATE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[STATE]] to ptr
+// CHECK-NEXT:    store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) [[TMP1]])
+// CHECK-NEXT:    store i32 [[TMP2]], ptr [[STATE_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[STATE_ASCAST]], align 4
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
+unsigned test_s_get_named_barrier_state(void *bar)
+{
+  unsigned State = __builtin_amdgcn_s_get_named_barrier_state(bar);
+  return State;
+}
+
 // CHECK-LABEL: @test_s_ttracedata(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    call void @llvm.amdgcn.s.ttracedata(i32 1)
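Taken together, the reworked builtins operate on a pointer to the barrier object instead of a raw integer ID. A minimal OpenCL-style sketch of the intended flow (illustrative only, not part of the patch; like the tests above it receives the barrier object as a plain void* and uses an arbitrary member count):

void demo_named_barrier(void *bar, global unsigned *out) {
  __builtin_amdgcn_s_barrier_init(bar, 16);        // bind the barrier, member count 16 (both uniform)
  __builtin_amdgcn_s_barrier_signal_var(bar, 16);  // signal through the barrier pointer
  __builtin_amdgcn_s_barrier_wait(1);              // wait still takes a constant barrier type
  *out = __builtin_amdgcn_s_get_named_barrier_state(bar);
}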
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 143b538b361c9..d6375ab77cfb3 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 def global_ptr_ty : LLVMQualPointerType<1>;
+def local_ptr_ty : LLVMQualPointerType<3>;
 
 // The amdgpu-no-* attributes (ex amdgpu-no-workitem-id-z) typically inferred
 // by the backend cause whole-program undefined behavior when violated, such as
@@ -247,48 +248,70 @@ def int_amdgcn_s_sendmsghalt : ClangBuiltin<"__builtin_amdgcn_s_sendmsghalt">,
 def int_amdgcn_s_sendmsg_rtn : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty],
   [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
 
+// Vanilla workgroup sync-barrier
 def int_amdgcn_s_barrier : ClangBuiltin<"__builtin_amdgcn_s_barrier">,
   Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
                      IntrNoCallback, IntrNoFree]>;
 
+// Lower-level split-barrier intrinsics
+
+// void @llvm.amdgcn.s.barrier.signal(i32 %barrierType)
+// Only for non-named barriers.
 def int_amdgcn_s_barrier_signal : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal">,
   Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                 IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
+// void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %barrier, i32 %memberCnt)
+// The %barrier and %memberCnt arguments must be uniform, otherwise behavior is undefined.
 def int_amdgcn_s_barrier_signal_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_var">,
-  Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+  Intrinsic<[], [local_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
                                 IntrNoCallback, IntrNoFree]>;
 
+// bool @llvm.amdgcn.s.barrier.signal.isfirst(i32 %barrierType)
+// Only for non-named barriers.
 def int_amdgcn_s_barrier_signal_isfirst : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst">,
   Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                           IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
-def int_amdgcn_s_barrier_signal_isfirst_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst_var">,
-  Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
-                                          IntrNoCallback, IntrNoFree]>;
-
+// void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) %barrier, i32 %memberCnt)
+// The %barrier and %memberCnt arguments must be uniform, otherwise behavior is undefined.
 def int_amdgcn_s_barrier_init : ClangBuiltin<"__builtin_amdgcn_s_barrier_init">,
-  Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent,
+  Intrinsic<[], [local_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent,
                 IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
+// void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %barrier)
+// The %barrier argument must be uniform, otherwise behavior is undefined.
 def int_amdgcn_s_barrier_join : ClangBuiltin<"__builtin_amdgcn_s_barrier_join">,
-  Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+  Intrinsic<[], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
                 IntrNoCallback, IntrNoFree]>;
 
+// void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) %barrier)
+// The %barrier argument must be uniform, otherwise behavior is undefined.
 def int_amdgcn_s_wakeup_barrier : ClangBuiltin<"__builtin_amdgcn_s_wakeup_barrier">,
-  Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+  Intrinsic<[], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
                 IntrNoCallback, IntrNoFree]>;
 
+// void @llvm.amdgcn.s.barrier.wait(i16 %barrierType)
 def int_amdgcn_s_barrier_wait : ClangBuiltin<"__builtin_amdgcn_s_barrier_wait">,
   Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                 IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
+// void @llvm.amdgcn.s.barrier.leave(i16 %barrierType)
 def int_amdgcn_s_barrier_leave : ClangBuiltin<"__builtin_amdgcn_s_barrier_leave">,
-  Intrinsic<[llvm_i1_ty], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+  Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
+                                IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
+// uint32_t @llvm.amdgcn.s.get.barrier.state(i32 %barrierId)
+// The %barrierId argument must be uniform, otherwise behavior is undefined.
 def int_amdgcn_s_get_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_barrier_state">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
                                            IntrNoCallback, IntrNoFree]>;
 
+// uint32_t @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) %barrier)
+// The %barrier argument must be uniform, otherwise behavior is undefined.
+def int_amdgcn_s_get_named_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_named_barrier_state">,
+  Intrinsic<[llvm_i32_ty], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+                                            IntrNoCallback, IntrNoFree]>;
+
 def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">,
   Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
                      IntrNoCallback, IntrNoFree]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e4b54c7d72b08..8c640ec18e1a4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -16,6 +16,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUInstrInfo.h"
 #include "AMDGPUMachineFunction.h"
+#include "AMDGPUMemoryUtils.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
@@ -1508,7 +1509,8 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
   if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
       G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
     if (!MFI->isModuleEntryFunction() &&
-        GV->getName() != "llvm.amdgcn.module.lds") {
+        GV->getName() != "llvm.amdgcn.module.lds" &&
+        !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) {
       SDLoc DL(Op);
       const Function &Fn = DAG.getMachineFunction().getFunction();
       DiagnosticInfoUnsupported BadLDSDecl(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 800bdbe04cf70..1873251ea358b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2181,15 +2181,16 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
   case Intrinsic::amdgcn_ds_bvh_stack_rtn:
     return selectDSBvhStackIntrinsic(I);
   case Intrinsic::amdgcn_s_barrier_init:
+  case Intrinsic::amdgcn_s_barrier_signal_var:
+    return selectNamedBarrierInit(I, IntrinsicID);
   case Intrinsic::amdgcn_s_barrier_join:
   case Intrinsic::amdgcn_s_wakeup_barrier:
-  case Intrinsic::amdgcn_s_get_barrier_state:
+  case Intrinsic::amdgcn_s_get_named_barrier_state:
     return selectNamedBarrierInst(I, IntrinsicID);
+  case Intrinsic::amdgcn_s_get_barrier_state:
+    return selectSGetBarrierState(I, IntrinsicID);
   case Intrinsic::amdgcn_s_barrier_signal_isfirst:
-  case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
    return selectSBarrierSignalIsfirst(I, IntrinsicID);
-  case Intrinsic::amdgcn_s_barrier_leave:
-    return selectSBarrierLeave(I);
  }
  return selectImpl(I, *CoverageInfo);
 }
@@ -5437,18 +5438,8 @@ bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
   const DebugLoc &DL = I.getDebugLoc();
   Register CCReg = I.getOperand(0).getReg();
 
-  bool HasM0 = IntrID == Intrinsic::amdgcn_s_barrier_signal_isfirst_var;
-
-  if (HasM0) {
-    auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
-                       .addReg(I.getOperand(2).getReg());
-    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0));
-    if (!constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI))
-      return false;
-  } else {
-    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
-        .addImm(I.getOperand(2).getImm());
-  }
+  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
+      .addImm(I.getOperand(2).getImm());
 
   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
 
@@ -5457,80 +5448,143 @@ bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
                                       *MRI);
 }
 
+bool AMDGPUInstructionSelector::selectSGetBarrierState(
+    MachineInstr &I, Intrinsic::ID IntrID) const {
+  MachineBasicBlock *MBB = I.getParent();
+  const DebugLoc &DL = I.getDebugLoc();
+  MachineOperand BarOp = I.getOperand(2);
+  std::optional<int64_t> BarValImm =
+      getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
+
+  if (!BarValImm) {
+    auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+                       .addReg(BarOp.getReg());
+    constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
+  }
+  MachineInstrBuilder MIB;
+  unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
+                           : AMDGPU::S_GET_BARRIER_STATE_M0;
+  MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
+
+  auto DstReg = I.getOperand(0).getReg();
+  const TargetRegisterClass *DstRC =
+      TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
+  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
+    return false;
+  MIB.addDef(DstReg);
+  if (BarValImm) {
+    MIB.addImm(*BarValImm);
+  }
+  I.eraseFromParent();
+  return true;
+}
+
 unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
   if (HasInlineConst) {
     switch (IntrID) {
     default:
      llvm_unreachable("not a named barrier op");
-    case Intrinsic::amdgcn_s_barrier_init:
-      return AMDGPU::S_BARRIER_INIT_IMM;
     case Intrinsic::amdgcn_s_barrier_join:
       return AMDGPU::S_BARRIER_JOIN_IMM;
     case Intrinsic::amdgcn_s_wakeup_barrier:
       return AMDGPU::S_WAKEUP_BARRIER_IMM;
-    case Intrinsic::amdgcn_s_get_barrier_state:
+    case Intrinsic::amdgcn_s_get_named_barrier_state:
       return AMDGPU::S_GET_BARRIER_STATE_IMM;
     };
   } else {
     switch (IntrID) {
    default:
       llvm_unreachable("not a named barrier op");
-    case Intrinsic::amdgcn_s_barrier_init:
-      return AMDGPU::S_BARRIER_INIT_M0;
     case Intrinsic::amdgcn_s_barrier_join:
       return AMDGPU::S_BARRIER_JOIN_M0;
     case Intrinsic::amdgcn_s_wakeup_barrier:
       return AMDGPU::S_WAKEUP_BARRIER_M0;
-    case Intrinsic::amdgcn_s_get_barrier_state:
+    case Intrinsic::amdgcn_s_get_named_barrier_state:
       return AMDGPU::S_GET_BARRIER_STATE_M0;
     };
   }
 }
 
+bool AMDGPUInstructionSelector::selectNamedBarrierInit(
+    MachineInstr &I, Intrinsic::ID IntrID) const {
+  MachineBasicBlock *MBB = I.getParent();
+  const DebugLoc &DL = I.getDebugLoc();
+  MachineOperand BarOp = I.getOperand(1);
+  MachineOperand CntOp = I.getOperand(2);
+
+  // BarID = (BarOp >> 4) & 0x3F
+  Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
+      .add(BarOp)
+      .addImm(4u)
+      .setOperandDead(3); // Dead scc
+
+  Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
+      .addReg(TmpReg0)
+      .addImm(0x3F)
+      .setOperandDead(3); // Dead scc
+
+  // M0 = ((CntOp & 0x3F) << ShAmt) | BarID
+  Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
+      .add(CntOp)
+      .addImm(0x3F)
+      .setOperandDead(3); // Dead scc
+
+  Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+  constexpr unsigned ShAmt = 16;
+  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
+      .addReg(TmpReg2)
+      .addImm(ShAmt)
+      .setOperandDead(3); // Dead scc
+
+  Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
+      .addReg(TmpReg1)
+      .addReg(TmpReg3)
+      .setOperandDead(3); // Dead scc
+
+  auto CopyMIB =
+      BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
+  constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
+
+  unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
+                     ? AMDGPU::S_BARRIER_INIT_M0
+                     : AMDGPU::S_BARRIER_SIGNAL_M0;
+  MachineInstrBuilder MIB;
+  MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
+
+  I.eraseFromParent();
+  return true;
+}
+
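Both selectNamedBarrierInit above and selectNamedBarrierInst below derive M0 from the same packed layout. Restated as a standalone C helper (hypothetical, for illustration; not code from the patch):

/* M0 layout used by S_BARRIER_INIT_M0 / S_BARRIER_SIGNAL_M0:
 *   M0[5:0]   = barrier ID   = (barrier LDS address >> 4) & 0x3F
 *   M0[21:16] = member count = count & 0x3F
 */
static unsigned packNamedBarrierM0(unsigned BarAddr, unsigned MemberCnt) {
  unsigned BarId = (BarAddr >> 4) & 0x3F;
  return ((MemberCnt & 0x3F) << 16) | BarId;
}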
 bool AMDGPUInstructionSelector::selectNamedBarrierInst(
     MachineInstr &I, Intrinsic::ID IntrID) const {
   MachineBasicBlock *MBB = I.getParent();
   const DebugLoc &DL = I.getDebugLoc();
-  MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_barrier_state
+  MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
                              ? I.getOperand(2)
                              : I.getOperand(1);
   std::optional<int64_t> BarValImm =
       getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
-  Register M0Val;
-  Register TmpReg0;
-
-  // For S_BARRIER_INIT, member count will always be read from M0[16:22]
-  if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
-    Register MemberCount = I.getOperand(2).getReg();
-    TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
-    // TODO: This should be expanded during legalization so that the the S_LSHL
-    // and S_OR can be constant-folded
-    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
-        .addImm(16)
-        .addReg(MemberCount);
-    M0Val = TmpReg0;
-  }
-
-  // If not inlinable, get reference to barrier depending on the instruction
+
   if (!BarValImm) {
-    if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
-      // If reference to barrier id is not an inlinable constant then it must be
-      // referenced with M0[4:0]. Perform an OR with the member count to include
-      // it in M0 for S_BARRIER_INIT.
-      Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
-      BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg1)
-          .addReg(BarOp.getReg())
-          .addReg(TmpReg0);
-      M0Val = TmpReg1;
-    } else {
-      M0Val = BarOp.getReg();
-    }
-  }
+    // BarID = (BarOp >> 4) & 0x3F
+    Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
+        .addReg(BarOp.getReg())
+        .addImm(4u)
+        .setOperandDead(3); // Dead scc
+
+    Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
+        .addReg(TmpReg0)
+        .addImm(0x3F)
+        .setOperandDead(3); // Dead scc
 
-  // Build copy to M0 if needed. For S_BARRIER_INIT, M0 is always required.
-  if (M0Val) {
-    auto CopyMIB =
-        BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(M0Val);
+    auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+                       .addReg(TmpReg1);
     constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
   }
 
@@ -5538,29 +5592,24 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInst(
   unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
   MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
 
-  if (IntrID == Intrinsic::amdgcn_s_get_barrier_state)
-    MIB.addDef(I.getOperand(0).getReg());
+  if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
+    auto DstReg = I.getOperand(0).getReg();
+    const TargetRegisterClass *DstRC =
+        TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
+    if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
+      return false;
+    MIB.addDef(DstReg);
+  }
 
-  if (BarValImm)
-    MIB.addImm(*BarValImm);
+  if (BarValImm) {
+    auto BarId = ((*BarValImm) >> 4) & 0x3F;
+    MIB.addImm(BarId);
+  }
 
   I.eraseFromParent();
   return true;
 }
 
-bool AMDGPUInstructionSelector::selectSBarrierLeave(MachineInstr &I) const {
-  MachineBasicBlock *BB = I.getParent();
-  const DebugLoc &DL = I.getDebugLoc();
-  Register CCReg = I.getOperand(0).getReg();
-
-  BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_BARRIER_LEAVE));
-  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
-
-  I.eraseFromParent();
-  return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
-                                      *MRI);
-}
-
 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
                                                  const MachineInstr &MI,
                                                  int OpIdx) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index df39ecbd61bce..42343104812b6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -147,8 +147,10 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   bool selectSMFMACIntrin(MachineInstr &I) const;
   bool selectWaveAddress(MachineInstr &I) const;
   bool selectStackRestore(MachineInstr &MI) const;
+  bool selectNamedBarrierInit(MachineInstr &I, Intrinsic::ID IID) const;
   bool selectNamedBarrierInst(MachineInstr &I, Intrinsic::ID IID) const;
   bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const;
+  bool selectSGetBarrierState(MachineInstr &I, Intrinsic::ID IID) const;
   bool selectSBarrierLeave(MachineInstr &I) const;
 
   std::pair<Register, unsigned> selectVOP3ModsImpl(Register Src,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index d7126132356d2..ab6b09b008714 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -16,6 +16,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUGlobalISelUtils.h"
 #include "AMDGPUInstrInfo.h"
+#include "AMDGPUMemoryUtils.h"
 #include "AMDGPUTargetMachine.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIInstrInfo.h"
@@ -2976,7 +2977,8 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
     if (!MFI->isModuleEntryFunction() &&
-        GV->getName() != "llvm.amdgcn.module.lds") {
+        GV->getName() != "llvm.amdgcn.module.lds" &&
+        !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) {
       const Function &Fn = MF.getFunction();
       DiagnosticInfoUnsupported BadLDSDecl(
           Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 5791daed00651..a76d92ee91ee5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -920,6 +920,124 @@ class AMDGPULowerModuleLDS {
     return KernelToCreatedDynamicLDS;
   }
 
+  static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
+                                             Function *KF) {
+    bool NeedsReplacement = false;
+    for (Use &U : GV->uses()) {
+      if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+        Function *F = I->getFunction();
+        if (isKernelLDS(F) && F != KF) {
+          NeedsReplacement = true;
+          break;
+        }
+      }
+    }
+    if (!NeedsReplacement)
+      return GV;
+    // Create a new GV used only by this kernel and its function
+    GlobalVariable *NewGV = new GlobalVariable(
+        M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
+        GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
+        GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
+    NewGV->copyAttributesFrom(GV);
+    for (Use &U : make_early_inc_range(GV->uses())) {
+      if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+        Function *F = I->getFunction();
+        if (!isKernelLDS(F) || F == KF) {
+          U.getUser()->replaceUsesOfWith(GV, NewGV);
+        }
+      }
+    }
+    return NewGV;
+  }
+
+  bool lowerSpecialLDSVariables(
+      Module &M, LDSUsesInfoTy &LDSUsesInfo,
+      VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
+    bool Changed = false;
+    // The 1st round: give module-absolute assignments
+    int NumAbsolutes = 0;
+    std::vector<GlobalVariable *> OrderedGVs;
+    for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
+      GlobalVariable *GV = K.first;
+      if (!isNamedBarrier(*GV))
+        continue;
+      // Give a module-absolute assignment if the GV is indirectly accessed by
+      // multiple kernels. This is not precise, but we don't want to duplicate
+      // a function when it is called by multiple kernels.
+      if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
+        OrderedGVs.push_back(GV);
+      } else {
+        // Leave it to the 2nd round, which will give a kernel-relative
+        // assignment if it is only indirectly accessed by one kernel
+        LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
+      }
+      LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
+    }
+    OrderedGVs = sortByName(std::move(OrderedGVs));
+    for (GlobalVariable *GV : OrderedGVs) {
+      int BarId = ++NumAbsolutes;
+      unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+      // 4 bits for alignment, 5 bits for the barrier num,
+      // 3 bits for the barrier scope
+      unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+      recordLDSAbsoluteAddress(&M, GV, Offset);
+    }
+    OrderedGVs.clear();
+
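The "absolute address" recorded for a named barrier is a packed descriptor rather than a real LDS byte offset. The encoding used in both rounds, restated as a hypothetical C helper (not code from the patch):

/* Marker bits 0x802000, the barrier scope in bits [11:9], the barrier
 * number in bits [8:4]; bits [3:0] stay zero for alignment, so the
 * backend can recover the ID as (addr >> 4) & 0x3F. */
static unsigned encodeNamedBarrierAddress(unsigned Scope, unsigned BarId) {
  return 0x802000u | (Scope << 9) | (BarId << 4);
}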
+    // The 2nd round: give a kernel-relative assignment to each GV that is
+    // either only indirectly accessed by a single kernel or only directly
+    // accessed by multiple kernels.
+    std::vector<Function *> OrderedKernels;
+    for (auto &K : LDSUsesInfo.direct_access) {
+      Function *F = K.first;
+      assert(isKernelLDS(F));
+      OrderedKernels.push_back(F);
+    }
+    OrderedKernels = sortByName(std::move(OrderedKernels));
+
+    llvm::DenseMap<Function *, int> Kernel2BarId;
+    for (Function *F : OrderedKernels) {
+      for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
+        if (!isNamedBarrier(*GV))
+          continue;
+
+        LDSUsesInfo.direct_access[F].erase(GV);
+        if (GV->isAbsoluteSymbolRef()) {
+          // already assigned
+          continue;
+        }
+        OrderedGVs.push_back(GV);
+      }
+      OrderedGVs = sortByName(std::move(OrderedGVs));
+      for (GlobalVariable *GV : OrderedGVs) {
+        // GV could also be used directly by other kernels. If so, we need to
+        // create a new GV used only by this kernel and its function.
+        auto NewGV = uniquifyGVPerKernel(M, GV, F);
+        Changed |= (NewGV != GV);
+        int BarId = (NumAbsolutes + 1);
+        if (Kernel2BarId.find(F) != Kernel2BarId.end()) {
+          BarId = (Kernel2BarId[F] + 1);
+        }
+        Kernel2BarId[F] = BarId;
+        unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+        unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+        recordLDSAbsoluteAddress(&M, NewGV, Offset);
+      }
+      OrderedGVs.clear();
+    }
+    // Also erase those special LDS variables from indirect_access.
+    for (auto &K : LDSUsesInfo.indirect_access) {
+      Function *F = K.first;
+      assert(isKernelLDS(F));
+      for (GlobalVariable *GV : K.second) {
+        if (isNamedBarrier(*GV))
+          K.second.erase(GV);
+      }
+    }
+    return Changed;
+  }
+
   bool runOnModule(Module &M) {
     CallGraph CG = CallGraph(M);
     bool Changed = superAlignLDSGlobals(M);
@@ -942,6 +1060,12 @@ class AMDGPULowerModuleLDS {
       }
     }
 
+    if (LDSUsesInfo.HasSpecialGVs) {
+      // Special LDS variables need special address assignment
+      Changed |= lowerSpecialLDSVariables(
+          M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
+    }
+
     // Partition variables accessed indirectly into the different strategies
     DenseSet<GlobalVariable *> ModuleScopeVariables;
     DenseSet<GlobalVariable *> TableLookupVariables;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 1d83d0c4c9337..f52476464e05e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -8,6 +8,7 @@
 
 #include "AMDGPUMachineFunction.h"
 #include "AMDGPU.h"
+#include "AMDGPUMemoryUtils.h"
 #include "AMDGPUPerfHintAnalysis.h"
 #include "AMDGPUSubtarget.h"
 #include "Utils/AMDGPUBaseInfo.h"
@@ -102,6 +103,13 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
 
   unsigned Offset;
   if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+    if (TargetExtType *TTy = AMDGPU::isNamedBarrier(GV)) {
+      std::optional<uint32_t> BarAddr = getLDSAbsoluteAddress(GV);
+      if (!BarAddr)
+        llvm_unreachable("named barrier should have an assigned address");
+      Entry.first->second = BarAddr.value();
+      return BarAddr.value();
+    }
 
     std::optional<uint32_t> MaybeAbs = getLDSAbsoluteAddress(GV);
     if (MaybeAbs) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
index c82b8d83704b7..b337e2762b13a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
@@ -32,6 +32,28 @@ Align getAlign(const DataLayout &DL, const GlobalVariable *GV) {
                                    GV->getValueType());
 }
 
+TargetExtType *isNamedBarrier(const GlobalVariable &GV) {
+  // TODO: Allow arrays and structs, if all members are barriers
+  // in the same scope.
+  // TODO: Disallow other uses of target("amdgcn.named.barrier") including:
+  // - Structs containing barriers in different scope.
+  // - Structs containing a mixture of barriers and other data.
+  // - Globals in other address spaces.
+  // - Allocas.
+  Type *Ty = GV.getValueType();
+  while (true) {
+    if (auto *TTy = dyn_cast<TargetExtType>(Ty))
+      return TTy->getName() == "amdgcn.named.barrier" ? TTy : nullptr;
+    if (auto *STy = dyn_cast<StructType>(Ty)) {
+      if (STy->getNumElements() == 0)
+        return nullptr;
+      Ty = STy->getElementType(0);
+      continue;
+    }
+    return nullptr;
+  }
+}
+
 bool isDynamicLDS(const GlobalVariable &GV) {
   // external zero size addrspace(3) without initializer is dynlds.
   const Module *M = GV.getParent();
@@ -211,6 +233,7 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
   //   so we don't have anything to do.
   // - No variables are absolute.
   std::optional<bool> HasAbsoluteGVs;
+  bool HasSpecialGVs = false;
   for (auto &Map : {DirectMapKernel, IndirectMapKernel}) {
     for (auto &[Fn, GVs] : Map) {
       for (auto *GV : GVs) {
@@ -219,6 +242,10 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
             AMDGPU::isDynamicLDS(*GV) && DirectMapKernel.contains(Fn);
         if (IsDirectMapDynLDSGV)
           continue;
+        if (isNamedBarrier(*GV)) {
+          HasSpecialGVs = true;
+          continue;
+        }
         if (HasAbsoluteGVs.has_value()) {
           if (*HasAbsoluteGVs != IsAbsolute) {
             report_fatal_error(
@@ -233,9 +260,10 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
   // If we only had absolute GVs, we have nothing to do, return an empty
   // result.
   if (HasAbsoluteGVs && *HasAbsoluteGVs)
-    return {FunctionVariableMap(), FunctionVariableMap()};
+    return {FunctionVariableMap(), FunctionVariableMap(), false};
 
-  return {std::move(DirectMapKernel), std::move(IndirectMapKernel)};
+  return {std::move(DirectMapKernel), std::move(IndirectMapKernel),
+          HasSpecialGVs};
 }
 
 void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot,
@@ -294,7 +322,6 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
     case Intrinsic::amdgcn_s_barrier_signal:
     case Intrinsic::amdgcn_s_barrier_signal_var:
     case Intrinsic::amdgcn_s_barrier_signal_isfirst:
-    case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
    case Intrinsic::amdgcn_s_barrier_init:
     case Intrinsic::amdgcn_s_barrier_join:
     case Intrinsic::amdgcn_s_barrier_wait:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h
index e1cd4d03052b3..058e74452573c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h
@@ -26,6 +26,7 @@ class Value;
 class Function;
 class CallGraph;
 class Module;
+class TargetExtType;
 
 namespace AMDGPU {
 
@@ -34,12 +35,16 @@ using VariableFunctionMap = DenseMap<GlobalVariable *, DenseSet<Function *>>;
 
 Align getAlign(const DataLayout &DL, const GlobalVariable *GV);
 
+// If GV is a named barrier, return its type. Otherwise return nullptr.
+TargetExtType *isNamedBarrier(const GlobalVariable &GV);
+
 bool isDynamicLDS(const GlobalVariable &GV);
 bool isLDSVariableToLower(const GlobalVariable &GV);
 
 struct LDSUsesInfoTy {
   FunctionVariableMap direct_access;
   FunctionVariableMap indirect_access;
+  bool HasSpecialGVs = false;
 };
 
 bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 32dfbc98df581..415c068367074 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3276,19 +3276,17 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
       assert(OpdMapper.getVRegs(1).empty());
       constrainOpWithReadfirstlane(B, MI, 1);
       return;
-    case Intrinsic::amdgcn_s_barrier_signal_var:
     case Intrinsic::amdgcn_s_barrier_join:
     case Intrinsic::amdgcn_s_wakeup_barrier:
       constrainOpWithReadfirstlane(B, MI, 1);
       return;
-    case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
-      constrainOpWithReadfirstlane(B, MI, 2);
-      return;
     case Intrinsic::amdgcn_s_barrier_init:
+    case Intrinsic::amdgcn_s_barrier_signal_var:
       constrainOpWithReadfirstlane(B, MI, 1);
       constrainOpWithReadfirstlane(B, MI, 2);
       return;
-    case Intrinsic::amdgcn_s_get_barrier_state: {
+    case Intrinsic::amdgcn_s_get_barrier_state:
+    case Intrinsic::amdgcn_s_get_named_barrier_state: {
      constrainOpWithReadfirstlane(B, MI, 2);
       return;
     }
@@ -5134,30 +5132,23 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_s_sleep_var:
       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
       break;
-    case Intrinsic::amdgcn_s_barrier_signal_var:
     case Intrinsic::amdgcn_s_barrier_join:
     case Intrinsic::amdgcn_s_wakeup_barrier:
       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
       break;
     case Intrinsic::amdgcn_s_barrier_init:
+    case Intrinsic::amdgcn_s_barrier_signal_var:
       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
       break;
-    case Intrinsic::amdgcn_s_barrier_signal_isfirst_var: {
-      const unsigned ResultSize = 1;
-      OpdsMapping[0] =
-          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
-      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
-      break;
-    }
-    case Intrinsic::amdgcn_s_barrier_signal_isfirst:
-    case Intrinsic::amdgcn_s_barrier_leave: {
+    case Intrinsic::amdgcn_s_barrier_signal_isfirst: {
       const unsigned ResultSize = 1;
       OpdsMapping[0] =
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
       break;
     }
-    case Intrinsic::amdgcn_s_get_barrier_state: {
+    case Intrinsic::amdgcn_s_get_barrier_state:
+    case Intrinsic::amdgcn_s_get_named_barrier_state: {
       OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
       break;
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 07c80bd2575f0..8f297726a0df8 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -1062,7 +1062,13 @@ enum Register_Flag : uint8_t {
 
 namespace AMDGPU {
 namespace Barrier {
+
 enum Type { TRAP = -2, WORKGROUP = -1 };
+
+enum {
+  BARRIER_SCOPE_WORKGROUP = 0,
+};
+
 } // namespace Barrier
 } // namespace AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d66610ae0a160..e0362b0568cf8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9386,27 +9386,33 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
     return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
                          Ops, M->getMemOperand());
   }
-  case Intrinsic::amdgcn_s_get_barrier_state: {
+  case Intrinsic::amdgcn_s_get_barrier_state:
+  case Intrinsic::amdgcn_s_get_named_barrier_state: {
     SDValue Chain = Op->getOperand(0);
     SmallVector<SDValue> Ops;
     unsigned Opc;
-    bool IsInlinableBarID = false;
-    int64_t BarID;
 
     if (isa<ConstantSDNode>(Op->getOperand(2))) {
-      BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
-      IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarID);
-    }
-
-    if (IsInlinableBarID) {
+      uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
+      if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
+        BarID = (BarID >> 4) & 0x3F;
       Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
       SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
       Ops.push_back(K);
       Ops.push_back(Chain);
     } else {
       Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
-      SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(2));
-      Ops.push_back(M0Val.getValue(0));
+      if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
+        SDValue M0Val;
+        M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
+                            DAG.getShiftAmountConstant(4, MVT::i32, DL));
+        M0Val = SDValue(
+            DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
+                               DAG.getTargetConstant(0x3F, DL, MVT::i32)),
+            0);
+        Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
+      } else
+        Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
     }
 
     auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
@@ -9946,27 +9952,55 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                                       Op->getOperand(2), Chain),
                    0);
   case Intrinsic::amdgcn_s_barrier_init:
+  case Intrinsic::amdgcn_s_barrier_signal_var: {
+    // These two intrinsics have two operands: barrier pointer and member count
+    SDValue Chain = Op->getOperand(0);
+    SmallVector<SDValue> Ops;
+    SDValue BarOp = Op->getOperand(2);
+    SDValue CntOp = Op->getOperand(3);
+    SDValue M0Val;
+    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
+                       ? AMDGPU::S_BARRIER_INIT_M0
+                       : AMDGPU::S_BARRIER_SIGNAL_M0;
+    // Extract the BarrierID from bits 4-9 of BarOp
+    SDValue BarID;
+    BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
+                        DAG.getShiftAmountConstant(4, MVT::i32, DL));
+    BarID =
+        SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
+                                   DAG.getTargetConstant(0x3F, DL, MVT::i32)),
+                0);
+    // Member count should be put into M0[ShAmt+5:ShAmt]
+    // Barrier ID should be put into M0[5:0]
+    M0Val =
+        SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
+                                   DAG.getTargetConstant(0x3F, DL, MVT::i32)),
+                0);
+    constexpr unsigned ShAmt = 16;
+    M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, M0Val,
+                        DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
+
+    M0Val = SDValue(
+        DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
+
+    Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
+
+    auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
+    return SDValue(NewMI, 0);
+  }
   case Intrinsic::amdgcn_s_barrier_join:
   case Intrinsic::amdgcn_s_wakeup_barrier: {
+    // These two intrinsics have one operand: barrier pointer
     SDValue Chain = Op->getOperand(0);
     SmallVector<SDValue> Ops;
     SDValue BarOp = Op->getOperand(2);
     unsigned Opc;
-    bool IsInlinableBarID = false;
-    int64_t BarVal;
 
     if (isa<ConstantSDNode>(BarOp)) {
-      BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
-      IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarVal);
-    }
-
-    if (IsInlinableBarID) {
+      uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
       switch (IntrinsicID) {
       default:
         return SDValue();
-      case Intrinsic::amdgcn_s_barrier_init:
-        Opc = AMDGPU::S_BARRIER_INIT_IMM;
-        break;
       case Intrinsic::amdgcn_s_barrier_join:
         Opc = AMDGPU::S_BARRIER_JOIN_IMM;
         break;
@@ -9974,16 +10008,15 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
         Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
         break;
       }
-
-      SDValue K = DAG.getTargetConstant(BarVal, DL, MVT::i32);
+      // Extract the BarrierID from bits 4-9 of the immediate
+      unsigned BarID = (BarVal >> 4) & 0x3F;
+      SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
       Ops.push_back(K);
+      Ops.push_back(Chain);
     } else {
       switch (IntrinsicID) {
       default:
         return SDValue();
-      case Intrinsic::amdgcn_s_barrier_init:
-        Opc = AMDGPU::S_BARRIER_INIT_M0;
-        break;
       case Intrinsic::amdgcn_s_barrier_join:
         Opc = AMDGPU::S_BARRIER_JOIN_M0;
         break;
@@ -9991,25 +10024,15 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
         Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
         break;
       }
-    }
-
-    if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
+      // Extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
       SDValue M0Val;
-      // Member count will be read from M0[16:22]
-      M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Op.getOperand(3),
-                          DAG.getShiftAmountConstant(16, MVT::i32, DL));
-
-      if (!IsInlinableBarID) {
-        // If reference to barrier id is not an inline constant then it must be
-        // referenced with M0[4:0]. Perform an OR with the member count to
-        // include it in M0.
-        M0Val = DAG.getNode(ISD::OR, DL, MVT::i32, Op.getOperand(2), M0Val);
-      }
+      M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
+                          DAG.getShiftAmountConstant(4, MVT::i32, DL));
+      M0Val =
+          SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
+                                     DAG.getTargetConstant(0x3F, DL, MVT::i32)),
+                  0);
       Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
-    } else if (IsInlinableBarID) {
-      Ops.push_back(Chain);
-    } else {
-      Ops.push_back(copyToM0(DAG, Chain, DL, BarOp).getValue(0));
     }
 
     auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 7041b59964645..1f7fff76d1521 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -946,8 +946,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
            Opcode == AMDGPU::S_BARRIER_INIT_IMM ||
            Opcode == AMDGPU::S_BARRIER_JOIN_IMM ||
            Opcode == AMDGPU::S_BARRIER_LEAVE ||
-           Opcode == AMDGPU::DS_GWS_INIT ||
-           Opcode == AMDGPU::DS_GWS_BARRIER;
+           Opcode == AMDGPU::S_BARRIER_LEAVE_IMM ||
+           Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER;
   }
 
   static bool isF16PseudoScalarTrans(unsigned Opcode) {
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 9da27a7c7ee7d..c0697c80b23f9 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -458,13 +458,13 @@ let hasSideEffects = 1 in {
 let has_sdst = 0 in {
 let Uses = [M0] in {
 def S_BARRIER_SIGNAL_M0 : SOP1_Pseudo <"s_barrier_signal m0", (outs), (ins),
-  "", [(int_amdgcn_s_barrier_signal_var M0)]>{
+  "", []>{
   let SchedRW = [WriteBarrier];
   let isConvergent = 1;
 }
 
 def S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_Pseudo <"s_barrier_signal_isfirst m0", (outs), (ins),
-  "", [(set SCC, (int_amdgcn_s_barrier_signal_isfirst_var M0))]>{
+  "", []>{
   let Defs = [SCC];
   let SchedRW = [WriteBarrier];
   let isConvergent = 1;
@@ -1604,8 +1604,7 @@ def S_BARRIER_WAIT : SOPP_Pseudo <"s_barrier_wait", (ins i16imm:$simm16), "$simm
   let isConvergent = 1;
 }
 
-def S_BARRIER_LEAVE : SOPP_Pseudo <"s_barrier_leave", (ins), "",
-  [(set SCC, (int_amdgcn_s_barrier_leave))]> {
+def S_BARRIER_LEAVE : SOPP_Pseudo <"s_barrier_leave", (ins)> {
   let SchedRW = [WriteBarrier];
   let simm16 = 0;
   let fixed_imm = 1;
@@ -1613,6 +1612,9 @@ def S_BARRIER_LEAVE : SOPP_Pseudo <"s_barrier_leave", (ins), "",
   let Defs = [SCC];
 }
 
+def S_BARRIER_LEAVE_IMM : SOPP_Pseudo <"s_barrier_leave",
+  (ins i16imm:$simm16), "$simm16", [(int_amdgcn_s_barrier_leave timm:$simm16)]>;
+
 def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > {
   let SubtargetPredicate = isGFX8Plus;
   let simm16 = 0;
diff --git a/llvm/test/Assembler/target-type-param-errors.ll b/llvm/test/Assembler/target-type-param-errors.ll
index 03180811c7549..b9eb9150e6e16 100644
--- a/llvm/test/Assembler/target-type-param-errors.ll
+++ b/llvm/test/Assembler/target-type-param-errors.ll
@@ -1,6 +1,7 @@
 ; RUN: split-file %s %t
 ; RUN: not llvm-as < %t/aarch64-svcount.ll -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-AARCH64-SVCOUNT %s
 ; RUN: not llvm-as < %t/riscv-vector-tuple.ll -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-RISCV-VECTOR-TUPLE %s
+; RUN: not llvm-as < %t/amdgcn-named-barrier.ll -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-AMDGCN-NAMEDBARRIER %s
 
 ; Check target extension type properties are verified in the assembler.
;--- aarch64-svcount.ll @@ -10,3 +11,7 @@ declare target("aarch64.svcount", i32) @aarch64_svcount() ;--- riscv-vector-tuple.ll declare target("riscv.vector.tuple", 99) @riscv_vector_tuple() ; CHECK-RISCV-VECTOR-TUPLE: target extension type riscv.vector.tuple should have one type parameter and one integer parameter + +;--- amdgcn-named-barrier.ll +declare target("amdgcn.named.barrier", i32) @amdgcn_named_barrier() +; CHECK-AMDGCN-NAMEDBARRIER: target extension type amdgcn.named.barrier should have no type parameters and one integer parameter diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll deleted file mode 100644 index 61baca24fbdc0..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll +++ /dev/null @@ -1,1373 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s - -define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 { -; GFX12-SDAG-LABEL: test1_s_barrier_signal: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: s_barrier_signal -1 -; GFX12-SDAG-NEXT: s_barrier_wait -1 -; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: test1_s_barrier_signal: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: s_barrier_signal -1 -; GFX12-GISEL-NEXT: s_barrier_wait -1 -; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-GISEL-NEXT: s_endpgm -entry: - %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp - store i32 0, ptr addrspace(1) %tmp1 - call void @llvm.amdgcn.s.barrier.signal(i32 -1) - call void @llvm.amdgcn.s.barrier.wait(i16 -1) - %tmp3 = mul i32 %tmp, %tmp - %tmp4 = sub i32 %tmp3, %tmp - store i32 %tmp4, ptr addrspace(1) %tmp1 - ret void -} - -define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 { -; GFX12-SDAG-LABEL: test2_s_barrier_signal: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 
v1, v0, v0 -; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: s_barrier_signal 1 -; GFX12-SDAG-NEXT: s_barrier_wait 1 -; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: test2_s_barrier_signal: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: s_barrier_signal 1 -; GFX12-GISEL-NEXT: s_barrier_wait 1 -; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-GISEL-NEXT: s_endpgm -entry: - %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp - store i32 0, ptr addrspace(1) %tmp1 - call void @llvm.amdgcn.s.barrier.signal(i32 1) - call void @llvm.amdgcn.s.barrier.wait(i16 1) - %tmp3 = mul i32 %tmp, %tmp - %tmp4 = sub i32 %tmp3, %tmp - store i32 %tmp4, ptr addrspace(1) %tmp1 - ret void -} - -define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 { -; GFX12-SDAG-LABEL: test3_s_barrier_signal: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: s_barrier_signal 0 -; GFX12-SDAG-NEXT: s_barrier_wait 0 -; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: test3_s_barrier_signal: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: s_barrier_signal 0 -; GFX12-GISEL-NEXT: s_barrier_wait 0 -; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-GISEL-NEXT: s_endpgm -entry: - %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp - store i32 0, ptr addrspace(1) %tmp1 - call void @llvm.amdgcn.s.barrier.signal(i32 0) - call void @llvm.amdgcn.s.barrier.wait(i16 0) - %tmp3 = mul i32 %tmp, %tmp - %tmp4 = sub i32 %tmp3, %tmp - store i32 %tmp4, ptr addrspace(1) %tmp1 - ret void -} - -define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0 { -; GFX12-SDAG-LABEL: test1_s_barrier_signal_var: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_mov_b32 m0, 1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v2, v0, v0 -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v3, 2, v0 -; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v2, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1] -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: s_barrier_signal m0 -; GFX12-SDAG-NEXT: s_barrier_wait 1 -; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: test1_s_barrier_signal_var: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: s_mov_b32 m0, 1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v0 -; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: s_barrier_signal m0 -; GFX12-GISEL-NEXT: s_barrier_wait 1 -; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-GISEL-NEXT: s_endpgm -entry: - %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp - store i32 0, ptr addrspace(1) %tmp1 - call void @llvm.amdgcn.s.barrier.signal.var(i32 1) - call void @llvm.amdgcn.s.barrier.wait(i16 1) - %tmp3 = mul i32 %tmp, %tmp - %tmp4 = sub i32 %tmp3, %tmp - store i32 %tmp4, ptr addrspace(1) %tmp1 - ret void -} - -define void @test2_s_barrier_signal_var(i32 %arg) { -; GFX12-SDAG-LABEL: test2_s_barrier_signal_var: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: s_mov_b32 m0, s0 -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: s_barrier_signal m0 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: test2_s_barrier_signal_var: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 m0, v0 -; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: s_barrier_signal m0 -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - call void @llvm.amdgcn.s.barrier.signal.var(i32 %arg) - ret void -} - -define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { -; GFX12-SDAG-LABEL: test1_s_barrier_signal_isfirst: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] -; GFX12-SDAG-NEXT: 
s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1 -; GFX12-SDAG-NEXT: s_cselect_b32 s3, s3, s5 -; GFX12-SDAG-NEXT: s_cselect_b32 s2, s2, s4 -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: global_load_b32 v2, v1, s[0:1] -; GFX12-SDAG-NEXT: global_load_b32 v1, v1, s[2:3] -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: v_mul_lo_u32 v1, v1, v2 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: test1_s_barrier_signal_isfirst: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1 -; GFX12-GISEL-NEXT: s_cselect_b32 s8, 1, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: s_and_b32 s8, s8, 1 -; GFX12-GISEL-NEXT: s_cmp_lg_u32 s8, 0 -; GFX12-GISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: global_load_b32 v2, v1, s[0:1] -; GFX12-GISEL-NEXT: global_load_b32 v1, v1, s[2:3] -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v1, v2 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GFX12-GISEL-NEXT: s_endpgm -entry: - %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp - store i32 0, ptr addrspace(1) %tmp1 - %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1) - %0 = load i32, ptr addrspace(1) %a, align 4 - %b.c = select i1 %isfirst, ptr addrspace(1) %b, ptr addrspace(1) %c - %1 = load i32, ptr addrspace(1) %b.c, align 4 - %mul1 = mul nsw i32 %1, %0 - store i32 %mul1, ptr addrspace(1) %tmp1 - ret void -} - -define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { -; GFX12-SDAG-LABEL: test2_s_barrier_signal_isfirst: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: s_barrier_signal_isfirst 1 -; GFX12-SDAG-NEXT: s_cselect_b32 s3, s3, s5 -; GFX12-SDAG-NEXT: s_cselect_b32 s2, s2, s4 -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: global_load_b32 v2, v1, s[0:1] -; GFX12-SDAG-NEXT: global_load_b32 v1, v1, s[2:3] -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: v_mul_lo_u32 v1, v1, v2 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: test2_s_barrier_signal_isfirst: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: s_barrier_signal_isfirst 1 -; 
GFX12-GISEL-NEXT: s_cselect_b32 s8, 1, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: s_and_b32 s8, s8, 1 -; GFX12-GISEL-NEXT: s_cmp_lg_u32 s8, 0 -; GFX12-GISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: global_load_b32 v2, v1, s[0:1] -; GFX12-GISEL-NEXT: global_load_b32 v1, v1, s[2:3] -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v1, v2 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GFX12-GISEL-NEXT: s_endpgm -entry: - %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp - store i32 0, ptr addrspace(1) %tmp1 - %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 1) - %0 = load i32, ptr addrspace(1) %a, align 4 - %b.c = select i1 %isfirst, ptr addrspace(1) %b, ptr addrspace(1) %c - %1 = load i32, ptr addrspace(1) %b.c, align 4 - %mul1 = mul nsw i32 %1, %0 - store i32 %mul1, ptr addrspace(1) %tmp1 - ret void -} - -define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { -; GFX12-SDAG-LABEL: test3_s_barrier_signal_isfirst: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: s_barrier_signal_isfirst 1 -; GFX12-SDAG-NEXT: s_cselect_b32 s3, s3, s5 -; GFX12-SDAG-NEXT: s_cselect_b32 s2, s2, s4 -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: global_load_b32 v2, v1, s[0:1] -; GFX12-SDAG-NEXT: global_load_b32 v1, v1, s[2:3] -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: v_mul_lo_u32 v1, v1, v2 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: test3_s_barrier_signal_isfirst: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: s_barrier_signal_isfirst 1 -; GFX12-GISEL-NEXT: s_cselect_b32 s8, 1, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: s_and_b32 s8, s8, 1 -; GFX12-GISEL-NEXT: s_cmp_lg_u32 s8, 0 -; GFX12-GISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: global_load_b32 v2, v1, s[0:1] -; GFX12-GISEL-NEXT: global_load_b32 v1, v1, s[2:3] -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v1, v2 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GFX12-GISEL-NEXT: s_endpgm -entry: - %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp - store i32 0, ptr addrspace(1) %tmp1 - %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 1) - %0 = load i32, ptr addrspace(1) %a, align 4 - %b.c = select i1 %isfirst, ptr addrspace(1) %b, ptr addrspace(1) %c - %1 = load i32, ptr addrspace(1) %b.c, align 4 - 
%mul1 = mul nsw i32 %1, %0 - store i32 %mul1, ptr addrspace(1) %tmp1 - ret void -} - -define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { -; GFX12-SDAG-LABEL: test1_s_barrier_signal_isfirst_var: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_mov_b32 m0, 1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: s_barrier_signal_isfirst m0 -; GFX12-SDAG-NEXT: s_cselect_b32 s3, s3, s5 -; GFX12-SDAG-NEXT: s_cselect_b32 s2, s2, s4 -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: global_load_b32 v2, v1, s[0:1] -; GFX12-SDAG-NEXT: global_load_b32 v1, v1, s[2:3] -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: v_mul_lo_u32 v1, v1, v2 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: test1_s_barrier_signal_isfirst_var: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_mov_b32 m0, 1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: s_barrier_signal_isfirst m0 -; GFX12-GISEL-NEXT: s_cselect_b32 s8, 1, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: s_and_b32 s8, s8, 1 -; GFX12-GISEL-NEXT: s_cmp_lg_u32 s8, 0 -; GFX12-GISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: global_load_b32 v2, v1, s[0:1] -; GFX12-GISEL-NEXT: global_load_b32 v1, v1, s[2:3] -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v1, v2 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GFX12-GISEL-NEXT: s_endpgm -entry: - %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp - store i32 0, ptr addrspace(1) %tmp1 - %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst.var(i32 1) - %0 = load i32, ptr addrspace(1) %a, align 4 - %b.c = select i1 %isfirst, ptr addrspace(1) %b, ptr addrspace(1) %c - %1 = load i32, ptr addrspace(1) %b.c, align 4 - %mul1 = mul nsw i32 %1, %0 - store i32 %mul1, ptr addrspace(1) %tmp1 - ret void -} - -define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, i32 %arg, ptr addrspace(1) %out) { -; GFX12-SDAG-LABEL: test2_s_barrier_signal_isfirst_var: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v31 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v9, 2, v9 -; GFX12-SDAG-NEXT: s_mov_b32 m0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX12-SDAG-NEXT: v_add_co_u32 v7, vcc_lo, v7, v9 -; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo -; GFX12-SDAG-NEXT: global_store_b32 v[7:8], v10, off -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: s_barrier_signal_isfirst m0 -; GFX12-SDAG-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3 -; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off -; GFX12-SDAG-NEXT: global_load_b32 v1, v[2:3], off -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: v_mul_lo_u32 v0, v1, v0 -; GFX12-SDAG-NEXT: global_store_b32 v[7:8], v0, off -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: test2_s_barrier_signal_isfirst_var: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_and_b32_e32 v9, 0x3ff, v31 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 m0, v6 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v9, 2, v9 -; GFX12-GISEL-NEXT: v_add_co_u32 v7, vcc_lo, v7, v9 -; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo -; GFX12-GISEL-NEXT: v_mov_b32_e32 v9, 0 -; GFX12-GISEL-NEXT: global_store_b32 v[7:8], v9, off -; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: s_barrier_signal_isfirst m0 -; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe -; GFX12-GISEL-NEXT: s_and_b32 s0, 1, s0 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe -; GFX12-GISEL-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3 -; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off -; GFX12-GISEL-NEXT: global_load_b32 v1, v[2:3], off -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: v_mul_lo_u32 v0, v1, v0 -; GFX12-GISEL-NEXT: global_store_b32 v[7:8], v0, off -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp - store i32 0, ptr addrspace(1) %tmp1 - %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst.var(i32 %arg) - %1 = load i32, ptr addrspace(1) %a, align 4 - %b.c = select i1 %isfirst, ptr addrspace(1) %b, ptr addrspace(1) %c - %2 = load i32, ptr addrspace(1) %b.c, align 4 - %mul1 = mul nsw i32 %2, %1 - store i32 %mul1, ptr addrspace(1) %tmp1 - ret void -} - -define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { -; GFX12-SDAG-LABEL: test1_s_barrier_init: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_lshl_b32 s2, s2, 16 -; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-SDAG-NEXT: s_mov_b32 m0, s2 -; GFX12-SDAG-NEXT: s_barrier_init -1 -; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: test1_s_barrier_init: -; GFX12-GISEL: ; 
%bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_lshl_b32 m0, 16, s2 -; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-GISEL-NEXT: s_barrier_init -1 -; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-GISEL-NEXT: s_endpgm -entry: - %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp - store i32 0, ptr addrspace(1) %tmp1 - call void @llvm.amdgcn.s.barrier.init(i32 -1, i32 %mbrCnt) - %tmp2 = mul i32 %tmp, %tmp - %tmp3 = sub i32 %tmp2, %tmp - store i32 %tmp3, ptr addrspace(1) %tmp1 - ret void -} - -define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { -; GFX12-SDAG-LABEL: test2_s_barrier_init: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_lshl_b32 s2, s2, 16 -; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-SDAG-NEXT: s_mov_b32 m0, s2 -; GFX12-SDAG-NEXT: s_barrier_init 1 -; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: test2_s_barrier_init: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_lshl_b32 m0, 16, s2 -; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-GISEL-NEXT: s_barrier_init 1 -; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-GISEL-NEXT: s_endpgm -entry: - %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp - store i32 0, ptr addrspace(1) %tmp1 - call void @llvm.amdgcn.s.barrier.init(i32 1, i32 %mbrCnt) - %tmp3 = mul i32 %tmp, %tmp - %tmp4 = sub i32 %tmp3, %tmp - store i32 %tmp4, ptr addrspace(1) %tmp1 - ret void -} - -define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { -; GFX12-SDAG-LABEL: test3_s_barrier_init: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_lshl_b32 s2, s2, 16 -; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-SDAG-NEXT: s_mov_b32 m0, s2 -; GFX12-SDAG-NEXT: s_barrier_init 0 -; 
GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: test3_s_barrier_init: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_lshl_b32 m0, 16, s2 -; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-GISEL-NEXT: s_barrier_init 0 -; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-GISEL-NEXT: s_endpgm -entry: - %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp - store i32 0, ptr addrspace(1) %tmp1 - call void @llvm.amdgcn.s.barrier.init(i32 0, i32 %mbrCnt) - %tmp3 = mul i32 %tmp, %tmp - %tmp4 = sub i32 %tmp3, %tmp - store i32 %tmp4, ptr addrspace(1) %tmp1 - ret void -} - -define amdgpu_kernel void @test4_s_barrier_init(ptr addrspace(1) %out, i32 %bar, i32 %mbrCnt) #0 { -; GFX12-SDAG-LABEL: test4_s_barrier_init: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_lshl_b32 s3, s3, 16 -; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-SDAG-NEXT: s_or_b32 s2, s2, s3 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: s_mov_b32 m0, s2 -; GFX12-SDAG-NEXT: s_barrier_init m0 -; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: test4_s_barrier_init: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_lshl_b32 s3, 16, s3 -; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-GISEL-NEXT: s_or_b32 m0, s2, s3 -; GFX12-GISEL-NEXT: s_barrier_init m0 -; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-GISEL-NEXT: s_endpgm -entry: - %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp - store i32 0, ptr addrspace(1) %tmp1 - call void @llvm.amdgcn.s.barrier.init(i32 %bar, i32 %mbrCnt) - %tmp3 = mul i32 %tmp, %tmp - %tmp4 = sub i32 %tmp3, %tmp - store i32 %tmp4, ptr addrspace(1) %tmp1 - ret void -} - -define void @test5_s_barrier_init_m0(i32 %arg1 ,i32 %arg2) { -; GFX12-SDAG-LABEL: test5_s_barrier_init_m0: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0 -; GFX12-SDAG-NEXT: s_mov_b32 m0, s0 -; GFX12-SDAG-NEXT: s_barrier_init m0 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: test5_s_barrier_init_m0: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v1 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: s_lshl_b32 s0, 16, s0 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe -; GFX12-GISEL-NEXT: s_or_b32 m0, s1, s0 -; GFX12-GISEL-NEXT: s_barrier_init m0 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - call void @llvm.amdgcn.s.barrier.init(i32 %arg1, i32 %arg2) - ret void -} - -define amdgpu_kernel void @test1_s_barrier_join(ptr addrspace(1) %out) #0 { -; -; GFX12-SDAG-LABEL: test1_s_barrier_join: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-SDAG-NEXT: s_barrier_join -1 -; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: test1_s_barrier_join: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-GISEL-NEXT: s_barrier_join -1 -; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-GISEL-NEXT: s_endpgm -entry: - %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp - store i32 0, ptr addrspace(1) %tmp1 - call void @llvm.amdgcn.s.barrier.join(i32 -1) - %tmp3 = mul i32 %tmp, %tmp - %tmp4 = sub i32 %tmp3, %tmp - store i32 %tmp4, ptr addrspace(1) %tmp1 - ret void -} - -define amdgpu_kernel void @test2_s_barrier_join(ptr addrspace(1) %out) #0 { -; -; GFX12-SDAG-LABEL: test2_s_barrier_join: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-SDAG-NEXT: s_barrier_join 1 -; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: test2_s_barrier_join: -; GFX12-GISEL: ; %bb.0: ; %entry -; 
GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-GISEL-NEXT: s_barrier_join 1 -; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-GISEL-NEXT: s_endpgm -entry: - %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp - store i32 0, ptr addrspace(1) %tmp1 - call void @llvm.amdgcn.s.barrier.join(i32 1) - %tmp3 = mul i32 %tmp, %tmp - %tmp4 = sub i32 %tmp3, %tmp - store i32 %tmp4, ptr addrspace(1) %tmp1 - ret void -} - -define amdgpu_kernel void @test3_s_barrier_join(ptr addrspace(1) %out) #0 { -; -; GFX12-SDAG-LABEL: test3_s_barrier_join: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-SDAG-NEXT: s_barrier_join 0 -; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: test3_s_barrier_join: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-GISEL-NEXT: s_barrier_join 0 -; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-GISEL-NEXT: s_endpgm -entry: - %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp - store i32 0, ptr addrspace(1) %tmp1 - call void @llvm.amdgcn.s.barrier.join(i32 0) - %tmp3 = mul i32 %tmp, %tmp - %tmp4 = sub i32 %tmp3, %tmp - store i32 %tmp4, ptr addrspace(1) %tmp1 - ret void -} - -define amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %bar) #0 { -; GFX12-SDAG-LABEL: test4_s_barrier_join_m0: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v2, v0, v0 -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v3, 2, v0 -; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v2, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_mov_b32 m0, s2 -; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1] -; GFX12-SDAG-NEXT: s_barrier_join m0 -; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: test4_s_barrier_join_m0: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_mov_b32 m0, s2 -; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-GISEL-NEXT: s_barrier_join m0 -; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-GISEL-NEXT: s_endpgm -entry: - %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp - store i32 0, ptr addrspace(1) %tmp1 - call void @llvm.amdgcn.s.barrier.join(i32 %bar) - %tmp3 = mul i32 %tmp, %tmp - %tmp4 = sub i32 %tmp3, %tmp - store i32 %tmp4, ptr addrspace(1) %tmp1 - ret void -} - -define void @test5_s_barrier_join_m0(i32 %arg) { -; GFX12-SDAG-LABEL: test5_s_barrier_join_m0: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: s_mov_b32 m0, s0 -; GFX12-SDAG-NEXT: s_barrier_join m0 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: test5_s_barrier_join_m0: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 m0, v0 -; GFX12-GISEL-NEXT: s_barrier_join m0 -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - call void @llvm.amdgcn.s.barrier.join(i32 %arg) - ret void -} - -define void @test6_s_barrier_join_0() { -; GFX12-LABEL: test6_s_barrier_join_0: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_barrier_join 0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - call void @llvm.amdgcn.s.barrier.join(i32 0) - ret void -} - -define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { -; GFX12-SDAG-LABEL: test1_s_barrier_leave: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] -; GFX12-SDAG-NEXT: s_barrier_leave -; GFX12-SDAG-NEXT: s_cselect_b32 s3, s3, s5 -; GFX12-SDAG-NEXT: s_cselect_b32 s2, s2, s4 -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: global_load_b32 v2, v1, s[0:1] -; GFX12-SDAG-NEXT: global_load_b32 v1, v1, s[2:3] -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: v_mul_lo_u32 v1, v1, v2 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: test1_s_barrier_leave: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; 
GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GFX12-GISEL-NEXT: s_barrier_leave -; GFX12-GISEL-NEXT: s_cselect_b32 s8, 1, 0 -; GFX12-GISEL-NEXT: s_and_b32 s8, s8, 1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: s_cmp_lg_u32 s8, 0 -; GFX12-GISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: global_load_b32 v2, v1, s[0:1] -; GFX12-GISEL-NEXT: global_load_b32 v1, v1, s[2:3] -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v1, v2 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GFX12-GISEL-NEXT: s_endpgm -entry: - %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp - store i32 0, ptr addrspace(1) %tmp1 - %0 = call i1 @llvm.amdgcn.s.barrier.leave() - %1 = load i32, ptr addrspace(1) %a, align 4 - %b.c = select i1 %0, ptr addrspace(1) %b, ptr addrspace(1) %c - %2 = load i32, ptr addrspace(1) %b.c, align 4 - %mul1 = mul nsw i32 %2, %1 - store i32 %mul1, ptr addrspace(1) %tmp1 - ret void -} - -define amdgpu_kernel void @test1_s_wakeup_barrier(ptr addrspace(1) %out) #0 { -; -; GFX12-SDAG-LABEL: test1_s_wakeup_barrier: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-SDAG-NEXT: s_wakeup_barrier -1 -; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: test1_s_wakeup_barrier: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-GISEL-NEXT: s_wakeup_barrier -1 -; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-GISEL-NEXT: s_endpgm -entry: - %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp - store i32 0, ptr addrspace(1) %tmp1 - call void @llvm.amdgcn.s.wakeup.barrier(i32 -1) - %tmp3 = mul i32 %tmp, %tmp - %tmp4 = sub i32 %tmp3, %tmp - store i32 %tmp4, ptr addrspace(1) %tmp1 - ret void -} - -define amdgpu_kernel void @test2_s_wakeup_barrier(ptr addrspace(1) %out) #0 { -; -; GFX12-SDAG-LABEL: test2_s_wakeup_barrier: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-SDAG-NEXT: s_wakeup_barrier 1 -; GFX12-SDAG-NEXT: 
global_store_b32 v3, v0, s[0:1] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: test2_s_wakeup_barrier: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-GISEL-NEXT: s_wakeup_barrier 1 -; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-GISEL-NEXT: s_endpgm -entry: - %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp - store i32 0, ptr addrspace(1) %tmp1 - call void @llvm.amdgcn.s.wakeup.barrier(i32 1) - %tmp3 = mul i32 %tmp, %tmp - %tmp4 = sub i32 %tmp3, %tmp - store i32 %tmp4, ptr addrspace(1) %tmp1 - ret void -} - -define amdgpu_kernel void @test3_s_wakeup_barrier(ptr addrspace(1) %out) #0 { -; -; GFX12-SDAG-LABEL: test3_s_wakeup_barrier: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-SDAG-NEXT: s_wakeup_barrier 0 -; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: test3_s_wakeup_barrier: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-GISEL-NEXT: s_wakeup_barrier 0 -; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-GISEL-NEXT: s_endpgm -entry: - %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp - store i32 0, ptr addrspace(1) %tmp1 - call void @llvm.amdgcn.s.wakeup.barrier(i32 0) - %tmp3 = mul i32 %tmp, %tmp - %tmp4 = sub i32 %tmp3, %tmp - store i32 %tmp4, ptr addrspace(1) %tmp1 - ret void -} - -define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32 %bar) #0 { -; GFX12-SDAG-LABEL: test4_s_wakeup_barrier_m0: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v2, v0, v0 -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v3, 2, v0 -; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v2, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_mov_b32 m0, s2 -; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1] -; GFX12-SDAG-NEXT: s_wakeup_barrier m0 -; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-SDAG-NEXT: s_endpgm -; -; 
GFX12-GISEL-LABEL: test4_s_wakeup_barrier_m0: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_mov_b32 m0, s2 -; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-GISEL-NEXT: s_wakeup_barrier m0 -; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-GISEL-NEXT: s_endpgm -entry: - %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp - store i32 0, ptr addrspace(1) %tmp1 - call void @llvm.amdgcn.s.wakeup.barrier(i32 %bar) - %tmp3 = mul i32 %tmp, %tmp - %tmp4 = sub i32 %tmp3, %tmp - store i32 %tmp4, ptr addrspace(1) %tmp1 - ret void -} - -define void @test5_s_wakeup_barrier_m0(i32 %arg) { -; GFX12-SDAG-LABEL: test5_s_wakeup_barrier_m0: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: s_mov_b32 m0, s0 -; GFX12-SDAG-NEXT: s_wakeup_barrier m0 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: test5_s_wakeup_barrier_m0: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 m0, v0 -; GFX12-GISEL-NEXT: s_wakeup_barrier m0 -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - call void @llvm.amdgcn.s.wakeup.barrier(i32 %arg) - ret void -} - -define amdgpu_kernel void @test1_s_get_barrier_state(ptr addrspace(1) %out) #0 { -; GFX12-LABEL: test1_s_get_barrier_state: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_get_barrier_state s2, -1 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_endpgm -entry: - %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp - store i32 0, ptr addrspace(1) %tmp1 - %state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 -1) - store i32 %state, ptr addrspace(1) %tmp1 - ret void -} - -define amdgpu_kernel void @test2_s_get_barrier_state(ptr addrspace(1) %out) #0 { -; GFX12-LABEL: test2_s_get_barrier_state: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_get_barrier_state s2, 1 -; GFX12-NEXT: 
s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_endpgm -entry: - %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp - store i32 0, ptr addrspace(1) %tmp1 - %state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 1) - store i32 %state, ptr addrspace(1) %tmp1 - ret void -} - -define amdgpu_kernel void @test3_s_get_barrier_state(ptr addrspace(1) %out) #0 { -; GFX12-LABEL: test3_s_get_barrier_state: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_get_barrier_state s2, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_endpgm -entry: - %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp - store i32 0, ptr addrspace(1) %tmp1 - %state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 0) - store i32 %state, ptr addrspace(1) %tmp1 - ret void -} - -define amdgpu_kernel void @test4_s_get_barrier_state_m0(ptr addrspace(1) %out, i32 %bar) #0 { -; GFX12-LABEL: test4_s_get_barrier_state_m0: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 m0, s2 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_get_barrier_state s2, m0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) -; GFX12-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_endpgm -entry: - %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp - store i32 0, ptr addrspace(1) %tmp1 - %state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 %bar) - store i32 %state, ptr addrspace(1) %tmp1 - ret void -} - -define i32 @test5_s_get_barrier_state_m0(i32 %arg) { -; GFX12-SDAG-LABEL: test5_s_get_barrier_state_m0: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: s_mov_b32 m0, s0 -; GFX12-SDAG-NEXT: s_get_barrier_state s0, m0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: test5_s_get_barrier_state_m0: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 m0, v0 -; GFX12-GISEL-NEXT: s_get_barrier_state s0, m0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; 
GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 %arg) - ret i32 %state -} - -define i32 @test6_s_get_barrier_state_0() { -; GFX12-LABEL: test6_s_get_barrier_state_0: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_get_barrier_state s0, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 0) - ret i32 %state -} - -define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 { -; GFX12-SDAG-LABEL: test_barrier_convert: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: s_barrier_signal -1 -; GFX12-SDAG-NEXT: s_barrier_wait -1 -; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: test_barrier_convert: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: s_barrier_signal -1 -; GFX12-GISEL-NEXT: s_barrier_wait -1 -; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX12-GISEL-NEXT: s_endpgm -entry: - %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp - store i32 0, ptr addrspace(1) %tmp1 - call void @llvm.amdgcn.s.barrier() - %tmp3 = mul i32 %tmp, %tmp - %tmp4 = sub i32 %tmp3, %tmp - store i32 %tmp4, ptr addrspace(1) %tmp1 - ret void -} -declare void @llvm.amdgcn.s.barrier() #1 -declare void @llvm.amdgcn.s.barrier.wait(i16) #1 -declare void @llvm.amdgcn.s.barrier.signal(i32) #1 -declare void @llvm.amdgcn.s.barrier.signal.var(i32) #1 -declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1 -declare i1 @llvm.amdgcn.s.barrier.signal.isfirst.var(i32) #1 -declare void @llvm.amdgcn.s.barrier.init(i32, i32) #1 -declare void @llvm.amdgcn.s.barrier.join(i32) #1 -declare i1 @llvm.amdgcn.s.barrier.leave() #1 -declare void @llvm.amdgcn.s.wakeup.barrier(i32) #1 -declare i32 @llvm.amdgcn.s.get.barrier.state(i32) #1 -declare i32 @llvm.amdgcn.s.get.barrier.state.var(i32) #1 -declare i32 @llvm.amdgcn.workitem.id.x() #2 - -attributes #0 = { nounwind } -attributes #1 = { convergent nounwind } -attributes #2 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll new file mode 100644 index 
0000000000000..7cf8883082458 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll @@ -0,0 +1,66 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s + +@bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison +@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison +@bar1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison + +; CHECK: @bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !0 +; CHECK-NEXT: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !1 +; CHECK-NEXT: @bar1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !2 +; CHECK-NEXT: @bar1.kernel1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !2 + +define void @func1() { + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + ret void +} + +define void @func2() { + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + ret void +} + +define amdgpu_kernel void @kernel1() #0 { +; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11) + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) @bar1) + %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier() + call void @func1() + call void @func2() + ret void +} + +define amdgpu_kernel void @kernel2() #0 { +; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + + call void @func2() + ret void +} + +declare void @llvm.amdgcn.s.barrier() #1 +declare void @llvm.amdgcn.s.barrier.wait(i16) #1 +declare void @llvm.amdgcn.s.barrier.signal(i32) #1 +declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1 +declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1 +declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1 +declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1 +declare void @llvm.amdgcn.s.barrier.leave(i16) #1 +declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1 +declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1 + +attributes #0 = { nounwind } +attributes #1 = { convergent nounwind } +attributes #2 = { nounwind readnone } + +; CHECK: !0 = !{i32 8396816, i32 8396817} +; CHECK-NEXT: !1 = !{i32 8396848, i32 8396849} +; CHECK-NEXT: !2 = !{i32 8396832, i32 8396833} diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-barrier.ll new file mode 100644 index 0000000000000..0fe0c57df9d3a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/s-barrier.ll @@ -0,0 +1,299 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | 
FileCheck -check-prefixes=GFX12-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-GISEL %s + +@bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison +@bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison +@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison + +define void @func1() { +; GFX12-SDAG-LABEL: func1: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70003 +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_barrier_signal m0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, 3 +; GFX12-SDAG-NEXT: s_barrier_join m0 +; GFX12-SDAG-NEXT: s_barrier_wait 1 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: func1: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_mov_b32 m0, 0x70003 +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_barrier_signal m0 +; GFX12-GISEL-NEXT: s_barrier_join 3 +; GFX12-GISEL-NEXT: s_barrier_wait 1 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + ret void +} + +define void @func2() { +; GFX12-SDAG-LABEL: func2: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70001 +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_barrier_signal m0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, 1 +; GFX12-SDAG-NEXT: s_barrier_join m0 +; GFX12-SDAG-NEXT: s_barrier_wait 1 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: func2: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_mov_b32 m0, 0x70001 +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_barrier_signal m0 +; GFX12-GISEL-NEXT: s_barrier_join 1 +; GFX12-GISEL-NEXT: s_barrier_wait 1 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + ret void +} + +define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { +; GFX12-SDAG-LABEL: kernel1: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX12-SDAG-NEXT: s_mov_b32 m0, 0xc0002 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v31, v0 +; GFX12-SDAG-NEXT: s_barrier_init m0 +; GFX12-SDAG-NEXT: s_add_nc_u64 s[8:9], s[2:3], 48 +; GFX12-SDAG-NEXT: 
+; GFX12-SDAG-NEXT:    s_mov_b32 s32, 0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    s_lshr_b32 s0, s0, 4
+; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT:    s_and_b32 s0, s0, 63
+; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT:    s_or_b32 s1, 0x90000, s0
+; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT:    s_mov_b32 m0, s1
+; GFX12-SDAG-NEXT:    s_barrier_init m0
+; GFX12-SDAG-NEXT:    s_mov_b32 m0, 0xc0002
+; GFX12-SDAG-NEXT:    s_barrier_signal m0
+; GFX12-SDAG-NEXT:    s_mov_b32 m0, s1
+; GFX12-SDAG-NEXT:    s_barrier_signal m0
+; GFX12-SDAG-NEXT:    s_mov_b32 m0, s0
+; GFX12-SDAG-NEXT:    s_barrier_signal -1
+; GFX12-SDAG-NEXT:    s_barrier_signal_isfirst -1
+; GFX12-SDAG-NEXT:    s_barrier_join m0
+; GFX12-SDAG-NEXT:    s_mov_b32 m0, 2
+; GFX12-SDAG-NEXT:    s_barrier_wait 1
+; GFX12-SDAG-NEXT:    s_barrier_leave
+; GFX12-SDAG-NEXT:    s_wakeup_barrier m0
+; GFX12-SDAG-NEXT:    s_mov_b32 m0, s0
+; GFX12-SDAG-NEXT:    s_wakeup_barrier m0
+; GFX12-SDAG-NEXT:    s_mov_b32 m0, 2
+; GFX12-SDAG-NEXT:    s_get_barrier_state s1, m0
+; GFX12-SDAG-NEXT:    s_mov_b32 m0, s0
+; GFX12-SDAG-NEXT:    s_get_barrier_state s0, m0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    s_getpc_b64 s[0:1]
+; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT:    s_sext_i32_i16 s1, s1
+; GFX12-SDAG-NEXT:    s_add_co_u32 s0, s0, func1@gotpcrel32@lo+12
+; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT:    s_add_co_ci_u32 s1, s1, func1@gotpcrel32@hi+24
+; GFX12-SDAG-NEXT:    s_barrier_signal -1
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-SDAG-NEXT:    s_barrier_wait -1
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX12-SDAG-NEXT:    s_getpc_b64 s[0:1]
+; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT:    s_sext_i32_i16 s1, s1
+; GFX12-SDAG-NEXT:    s_add_co_u32 s0, s0, func2@gotpcrel32@lo+12
+; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT:    s_add_co_ci_u32 s1, s1, func2@gotpcrel32@hi+24
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX12-SDAG-NEXT:    s_get_barrier_state s0, -1
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: kernel1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX12-GISEL-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX12-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x2c
+; GFX12-GISEL-NEXT:    s_mov_b32 m0, 0xc0002
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v31, v0
+; GFX12-GISEL-NEXT:    s_barrier_init m0
+; GFX12-GISEL-NEXT:    s_mov_b32 s32, 0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    s_lshr_b32 s0, s0, 4
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    s_and_b32 s0, s0, 63
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    s_or_b32 s1, s0, 0x90000
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    s_mov_b32 m0, s1
+; GFX12-GISEL-NEXT:    s_barrier_init m0
+; GFX12-GISEL-NEXT:    s_mov_b32 m0, 0xc0002
+; GFX12-GISEL-NEXT:    s_barrier_signal m0
+; GFX12-GISEL-NEXT:    s_mov_b32 m0, s1
+; GFX12-GISEL-NEXT:    s_barrier_signal m0
+; GFX12-GISEL-NEXT:    s_barrier_signal -1
+; GFX12-GISEL-NEXT:    s_barrier_signal_isfirst -1
+; GFX12-GISEL-NEXT:    s_mov_b32 m0, s0
+; GFX12-GISEL-NEXT:    s_add_co_u32 s8, s2, 48
+; GFX12-GISEL-NEXT:    s_barrier_join m0
+; GFX12-GISEL-NEXT:    s_barrier_wait 1
+; GFX12-GISEL-NEXT:    s_barrier_leave
+; GFX12-GISEL-NEXT:    s_wakeup_barrier 2
+; GFX12-GISEL-NEXT:    s_wakeup_barrier m0
+; GFX12-GISEL-NEXT:    s_get_barrier_state s0, 2
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    s_get_barrier_state s0, m0
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s9, s3, 0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    s_getpc_b64 s[0:1]
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    s_sext_i32_i16 s1, s1
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, func1@gotpcrel32@lo+12
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, func1@gotpcrel32@hi+24
+; GFX12-GISEL-NEXT:    s_barrier_signal -1
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-GISEL-NEXT:    s_barrier_wait -1
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX12-GISEL-NEXT:    s_add_co_u32 s8, s2, 48
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s9, s3, 0
+; GFX12-GISEL-NEXT:    s_getpc_b64 s[0:1]
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    s_sext_i32_i16 s1, s1
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, func2@gotpcrel32@lo+12
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, func2@gotpcrel32@hi+24
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX12-GISEL-NEXT:    s_get_barrier_state s0, -1
+; GFX12-GISEL-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) @bar, i32 12)
+  call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) %in, i32 9)
+  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 12)
+  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %in, i32 9)
+  call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+  %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %in)
+  call void @llvm.amdgcn.s.barrier.wait(i16 1)
+  call void @llvm.amdgcn.s.barrier.leave(i16 1)
+  call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) @bar)
+  call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) %in)
+  %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar)
+  %state2 = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) %in)
+  call void @llvm.amdgcn.s.barrier()
+  call void @func1()
+  call void @func2()
+  %state3 = call i32 @llvm.amdgcn.s.get.barrier.state(i32 -1)
+  ret void
+}
+
+define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
+; GFX12-SDAG-LABEL: kernel2:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX12-SDAG-NEXT:    s_getpc_b64 s[4:5]
+; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT:    s_sext_i32_i16 s5, s5
+; GFX12-SDAG-NEXT:    s_add_co_u32 s4, s4, func2@gotpcrel32@lo+12
+; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT:    s_add_co_ci_u32 s5, s5, func2@gotpcrel32@hi+24
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v31, v0
+; GFX12-SDAG-NEXT:    s_load_b64 s[6:7], s[4:5], 0x0
+; GFX12-SDAG-NEXT:    s_mov_b32 m0, 0x70002
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[8:9], s[2:3], 48
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    s_barrier_signal m0
+; GFX12-SDAG-NEXT:    s_mov_b32 m0, 2
+; GFX12-SDAG-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX12-SDAG-NEXT:    s_mov_b32 s32, 0
+; GFX12-SDAG-NEXT:    s_barrier_join m0
+; GFX12-SDAG-NEXT:    s_barrier_wait 1
+; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: kernel2:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_add_co_u32 s8, s2, 48
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s9, s3, 0
+; GFX12-GISEL-NEXT:    s_getpc_b64 s[2:3]
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    s_sext_i32_i16 s3, s3
+; GFX12-GISEL-NEXT:    s_add_co_u32 s2, s2, func2@gotpcrel32@lo+12
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s3, s3, func2@gotpcrel32@hi+24
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v31, v0
+; GFX12-GISEL-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-GISEL-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX12-GISEL-NEXT:    s_mov_b32 m0, 0x70002
+; GFX12-GISEL-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX12-GISEL-NEXT:    s_mov_b32 s32, 0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    s_barrier_signal m0
+; GFX12-GISEL-NEXT:    s_barrier_join 2
+; GFX12-GISEL-NEXT:    s_barrier_wait 1
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX12-GISEL-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 7)
+  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar)
+  call void @llvm.amdgcn.s.barrier.wait(i16 1)
+
+  call void @func2()
+  ret void
+}
+
+declare void @llvm.amdgcn.s.barrier() #1
+declare void @llvm.amdgcn.s.barrier.wait(i16) #1
+declare void @llvm.amdgcn.s.barrier.signal(i32) #1
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
+declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
+declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.barrier.leave(i16) #1
+declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
+declare i32 @llvm.amdgcn.s.get.barrier.state(i32) #1
+declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent nounwind }
+attributes #2 = { nounwind readnone }
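Taken together, the check lines suggest a simple operand encoding for the named-barrier intrinsics, though this is inferred from the tests rather than stated by them: s.barrier.init and s.barrier.signal.var materialize m0 as (member count << 16) | barrier ID (0x70003 for @bar3 with count 7, 0xc0002 for @bar with count 12), join/wakeup/get-state take just the ID, and for a dynamic pointer the ID is recovered as (LDS offset >> 4) & 63 (the s_lshr_b32/s_and_b32 sequence in kernel1). A minimal sketch of the IR shape these intrinsics expect, using the signatures declared in the test above; the @flag name and the member count of 4 are illustrative values, not from the patch:

; Hypothetical example, not part of the patch: one named barrier,
; initialized for 4 members, then signalled, joined and waited on.
@flag = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison

define amdgpu_kernel void @use_flag() {
  ; init/signal.var take (barrier, member count); wait takes the i16 barrier type.
  call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) @flag, i32 4)
  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @flag, i32 4)
  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @flag)
  call void @llvm.amdgcn.s.barrier.wait(i16 1)
  ret void
}

declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32)
declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32)
declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3))
declare void @llvm.amdgcn.s.barrier.wait(i16)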