diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index 35c9f8ae48c80..ad012d98635ff 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -171,16 +171,6 @@ static Value *emitFPIntBuiltin(CodeGenFunction &CGF, return CGF.Builder.CreateCall(F, {Src0, Src1}); } -static Value *emitRangedBuiltin(CodeGenFunction &CGF, unsigned IntrinsicID, - int low, int high) { - Function *F = CGF.CGM.getIntrinsic(IntrinsicID, {}); - llvm::CallInst *Call = CGF.Builder.CreateCall(F); - llvm::ConstantRange CR(APInt(32, low), APInt(32, high)); - Call->addRangeRetAttr(CR); - Call->addRetAttr(llvm::Attribute::AttrKind::NoUndef); - return Call; -} - // For processing memory ordering and memory scope arguments of various // amdgcn builtins. // \p Order takes a C++11 comptabile memory-ordering specifier and converts @@ -934,15 +924,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, Function *F = CGM.getIntrinsic(BuiltinWMMAOp, ArgTypes); return Builder.CreateCall(F, Args); } - - // amdgcn workitem - case AMDGPU::BI__builtin_amdgcn_workitem_id_x: - return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_x, 0, 1024); - case AMDGPU::BI__builtin_amdgcn_workitem_id_y: - return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_y, 0, 1024); - case AMDGPU::BI__builtin_amdgcn_workitem_id_z: - return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_z, 0, 1024); - // amdgcn workgroup size case AMDGPU::BI__builtin_amdgcn_workgroup_size_x: return EmitAMDGPUWorkGroupSize(*this, 0); @@ -964,12 +945,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_r600_recipsqrt_ieeef: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::r600_recipsqrt_ieee); - case AMDGPU::BI__builtin_r600_read_tidig_x: - return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_x, 0, 1024); - case AMDGPU::BI__builtin_r600_read_tidig_y: - return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_y, 0, 1024); - case AMDGPU::BI__builtin_r600_read_tidig_z: - return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_z, 0, 1024); case AMDGPU::BI__builtin_amdgcn_alignbit: { llvm::Value *Src0 = EmitScalarExpr(E->getArg(0)); llvm::Value *Src1 = EmitScalarExpr(E->getArg(1)); diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index ded5f6b5ac4fd..bf022bc6eb446 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -605,9 +605,9 @@ void test_s_getreg(volatile global uint *out) } // CHECK-LABEL: @test_get_local_id( -// CHECK: tail call noundef range(i32 0, 1024){{.*}} i32 @llvm.amdgcn.workitem.id.x() -// CHECK: tail call noundef range(i32 0, 1024){{.*}} i32 @llvm.amdgcn.workitem.id.y() -// CHECK: tail call noundef range(i32 0, 1024){{.*}} i32 @llvm.amdgcn.workitem.id.z() +// CHECK: tail call{{.*}} i32 @llvm.amdgcn.workitem.id.x() +// CHECK: tail call{{.*}} i32 @llvm.amdgcn.workitem.id.y() +// CHECK: tail call{{.*}} i32 @llvm.amdgcn.workitem.id.z() void test_get_local_id(int d, global int *out) { switch (d) { @@ -618,6 +618,10 @@ void test_get_local_id(int d, global int *out) } } +// CHECK: declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() +// CHECK: declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y() +// CHECK: declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z() + // CHECK-LABEL: @test_get_workgroup_size( // CHECK: {{.*}}call align 8 dereferenceable(256){{.*}} ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() // CHECK: getelementptr inbounds nuw i8, ptr addrspace(4) %{{.*}}, i64 12 diff --git a/clang/test/CodeGenOpenCL/builtins-r600.cl b/clang/test/CodeGenOpenCL/builtins-r600.cl index a82c4fb90ec50..5fe130f585688 100644 --- a/clang/test/CodeGenOpenCL/builtins-r600.cl +++ b/clang/test/CodeGenOpenCL/builtins-r600.cl @@ -39,9 +39,9 @@ void test_get_group_id(int d, global int *out) } // CHECK-LABEL: @test_get_local_id( -// CHECK: tail call noundef range(i32 0, 1024) i32 @llvm.r600.read.tidig.x() -// CHECK: tail call noundef range(i32 0, 1024) i32 @llvm.r600.read.tidig.y() -// CHECK: tail call noundef range(i32 0, 1024) i32 @llvm.r600.read.tidig.z() +// CHECK: tail call i32 @llvm.r600.read.tidig.x() +// CHECK: tail call i32 @llvm.r600.read.tidig.y() +// CHECK: tail call i32 @llvm.r600.read.tidig.z() void test_get_local_id(int d, global int *out) { switch (d) { @@ -52,3 +52,6 @@ void test_get_local_id(int d, global int *out) } } +// CHECK: declare noundef range(i32 0, 1024) i32 @llvm.r600.read.tidig.x() +// CHECK: declare noundef range(i32 0, 1024) i32 @llvm.r600.read.tidig.y() +// CHECK: declare noundef range(i32 0, 1024) i32 @llvm.r600.read.tidig.z() diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c index 9a15ce277ba87..f7dfb86ac4652 100644 --- a/clang/test/Headers/gpuintrin.c +++ b/clang/test/Headers/gpuintrin.c @@ -291,7 +291,7 @@ __gpu_kernel void foo() { // AMDGPU-NEXT: [[ENTRY:.*:]] // AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) // AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// AMDGPU-NEXT: [[TMP0:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() +// AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() // AMDGPU-NEXT: ret i32 [[TMP0]] // // @@ -300,7 +300,7 @@ __gpu_kernel void foo() { // AMDGPU-NEXT: [[ENTRY:.*:]] // AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) // AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// AMDGPU-NEXT: [[TMP0:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y() +// AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() // AMDGPU-NEXT: ret i32 [[TMP0]] // // @@ -309,7 +309,7 @@ __gpu_kernel void foo() { // AMDGPU-NEXT: [[ENTRY:.*:]] // AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) // AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// AMDGPU-NEXT: [[TMP0:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z() +// AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() // AMDGPU-NEXT: ret i32 [[TMP0]] // // diff --git a/clang/test/Headers/gpuintrin_lang.c b/clang/test/Headers/gpuintrin_lang.c index fa04849f8094d..ab660ac5c8a49 100644 --- a/clang/test/Headers/gpuintrin_lang.c +++ b/clang/test/Headers/gpuintrin_lang.c @@ -36,7 +36,7 @@ __device__ int foo() { return __gpu_thread_id_x(); } // CUDA-LABEL: define dso_local i32 @foo( // CUDA-SAME: ) #[[ATTR0:[0-9]+]] { // CUDA-NEXT: [[ENTRY:.*:]] -// CUDA-NEXT: [[TMP0:%.*]] = call {{.*}}i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CUDA-NEXT: [[TMP0:%.*]] = call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() // CUDA-NEXT: ret i32 [[TMP0]] // // HIP-LABEL: define dso_local i32 @foo( @@ -46,29 +46,29 @@ __device__ int foo() { return __gpu_thread_id_x(); } // HIP-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) // HIP-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr // HIP-NEXT: [[RETVAL_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I]] to ptr -// HIP-NEXT: [[TMP0:%.*]] = call noundef {{.*}}i32 @llvm.amdgcn.workitem.id.x() +// HIP-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() // HIP-NEXT: ret i32 [[TMP0]] // // OPENCL-LABEL: define dso_local i32 @foo( // OPENCL-SAME: ) #[[ATTR0:[0-9]+]] { // OPENCL-NEXT: [[ENTRY:.*:]] -// OPENCL-NEXT: [[TMP0:%.*]] = call noundef {{.*}}i32 @llvm.amdgcn.workitem.id.x() +// OPENCL-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() // OPENCL-NEXT: ret i32 [[TMP0]] // // OPENMP-LABEL: define hidden i32 @foo( // OPENMP-SAME: ) #[[ATTR0:[0-9]+]] { // OPENMP-NEXT: [[ENTRY:.*:]] -// OPENMP-NEXT: [[TMP0:%.*]] = call noundef {{.*}}i32 @llvm.amdgcn.workitem.id.x() +// OPENMP-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() // OPENMP-NEXT: ret i32 [[TMP0]] // // C89-LABEL: define dso_local i32 @foo( -// C89-SAME: ) #[[ATTR2:[0-9]+]] { +// C89-SAME: ) #[[ATTR0:[0-9]+]] { // C89-NEXT: [[ENTRY:.*:]] // C89-NEXT: [[RETVAL_I:%.*]] = alloca i32, align 4, addrspace(5) // C89-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) // C89-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr // C89-NEXT: [[RETVAL_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I]] to ptr -// C89-NEXT: [[TMP0:%.*]] = call noundef {{.*}}i32 @llvm.amdgcn.workitem.id.x() +// C89-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() // C89-NEXT: ret i32 [[TMP0]] // int foo() { return __gpu_thread_id_x(); } diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 217e43fcce4fd..75068717d9a5f 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -20,11 +20,16 @@ def local_ptr_ty : LLVMQualPointerType<3>; // some preloaded register from a function that is known to not need it is a violation // of the calling convention and also program-level UB. Outside of such IR-level UB, // these preloaded registers are always set to a well-defined value and are thus `noundef`. -class AMDGPUReadPreloadRegisterIntrinsic - : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef, IntrNoMem, IntrSpeculatable]>; +class AMDGPUReadPreloadRegisterIntrinsic< + list ExtraAttrs = []> + : DefaultAttrsIntrinsic<[llvm_i32_ty], [], + !listconcat([NoUndef, IntrNoMem, + IntrSpeculatable], + ExtraAttrs)>; -class AMDGPUReadPreloadRegisterIntrinsicNamed - : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef, IntrNoMem, IntrSpeculatable]>, ClangBuiltin; +class AMDGPUReadPreloadRegisterIntrinsicNamed< + string name, list ExtraAttrs = []> + : AMDGPUReadPreloadRegisterIntrinsic, ClangBuiltin; // Used to tag image and resource intrinsics with information used to generate // mem operands. @@ -35,17 +40,22 @@ class AMDGPURsrcIntrinsic { let TargetPrefix = "r600" in { -multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz { - def _x : AMDGPUReadPreloadRegisterIntrinsic; - def _y : AMDGPUReadPreloadRegisterIntrinsic; - def _z : AMDGPUReadPreloadRegisterIntrinsic; -} + multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz< + list ExtraAttrs = []> { + def _x : AMDGPUReadPreloadRegisterIntrinsic; + def _y : AMDGPUReadPreloadRegisterIntrinsic; + def _z : AMDGPUReadPreloadRegisterIntrinsic; + } -multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz_named { - def _x : AMDGPUReadPreloadRegisterIntrinsicNamed; - def _y : AMDGPUReadPreloadRegisterIntrinsicNamed; - def _z : AMDGPUReadPreloadRegisterIntrinsicNamed; -} + multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz_named< + string prefix, list ExtraAttrs = []> { + def _x : AMDGPUReadPreloadRegisterIntrinsicNamed; + def _y : AMDGPUReadPreloadRegisterIntrinsicNamed; + def _z : AMDGPUReadPreloadRegisterIntrinsicNamed; + } defm int_r600_read_global_size : AMDGPUReadPreloadRegisterIntrinsic_xyz_named <"__builtin_r600_read_global_size">; @@ -55,7 +65,9 @@ defm int_r600_read_tgid : AMDGPUReadPreloadRegisterIntrinsic_xyz_named <"__builtin_r600_read_tgid">; defm int_r600_read_local_size : AMDGPUReadPreloadRegisterIntrinsic_xyz; -defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz; +defm int_r600_read_tidig + : AMDGPUReadPreloadRegisterIntrinsic_xyz_named< + "__builtin_r600_read_tidig", [Range]>; def int_r600_group_barrier : ClangBuiltin<"__builtin_r600_group_barrier">, Intrinsic<[], [], [IntrConvergent, IntrWillReturn]>; @@ -146,7 +158,10 @@ let TargetPrefix = "amdgcn" in { // ABI Special Intrinsics //===----------------------------------------------------------------------===// -defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz; +defm int_amdgcn_workitem_id + : AMDGPUReadPreloadRegisterIntrinsic_xyz_named< + "__builtin_amdgcn_workitem_id", [Range]>; + defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named <"__builtin_amdgcn_workgroup_id">; diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll index 2ac5c78d8cdb5..b563e03b6080f 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll @@ -61,7 +61,7 @@ entry: ; GCN: define amdgpu_kernel void @test_inliner( ; GCN-INL1: %c1 = tail call coldcc float @foo( -; GCN-INLDEF: %cmp.i = fcmp ogt float %tmp2, 0.000000e+00 +; GCN-INLDEF: %cmp.i = fcmp ogt float %{{.+}}, 0.000000e+00 ; GCN-MAXBBDEF: %div.i{{[0-9]*}} = fdiv float 1.000000e+00, %c ; GCN-MAXBBDEF: %div.i{{[0-9]*}} = fdiv float 2.000000e+00, %tmp1.i ; GCN-MAXBB1: call coldcc void @foo_private_ptr diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll index 7819da8b97e55..9cf9d81773037 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -258,46 +258,41 @@ define amdgpu_kernel void @add_x_shl_max_offset() #1 { define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_alt() #1 { ; CI-LABEL: add_x_shl_neg_to_sub_max_offset_alt: ; CI: ; %bb.0: -; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; CI-NEXT: v_xor_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_mul_i32_i24_e32 v0, -4, v0 ; CI-NEXT: v_mov_b32_e32 v1, 13 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: ds_write_b8 v0, v1 +; CI-NEXT: ds_write_b8 v0, v1 offset:65535 ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_alt: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_xor_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mul_i32_i24_e32 v0, -4, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 13 -; GFX9-NEXT: ds_write_b8 v0, v1 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:65535 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_alt: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mul_i32_i24_e32 v0, -4, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 13 -; GFX10-NEXT: v_xor_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: ds_write_b8 v0, v1 +; GFX10-NEXT: ds_write_b8 v0, v1 offset:65535 ; GFX10-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: add_x_shl_neg_to_sub_max_offset_alt: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-TRUE16-NEXT: v_xor_b32_e32 v1, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mul_i32_i24_e32 v1, -4, v0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 13 -; GFX11-TRUE16-NEXT: ds_store_b8 v1, v0 +; GFX11-TRUE16-NEXT: ds_store_b8 v1, v0 offset:65535 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: add_x_shl_neg_to_sub_max_offset_alt: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-FAKE16-NEXT: v_xor_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: ds_store_b8 v0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mul_i32_i24_e32 v0, -4, v0 +; GFX11-FAKE16-NEXT: ds_store_b8 v0, v1 offset:65535 ; GFX11-FAKE16-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() %.neg = mul i32 %x.i, -4 @@ -447,9 +442,9 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use() #1 { ; ; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:123 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:456 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll index 96d0e383761d1..690e5cc68747f 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -142,7 +142,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v2 +; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, v2 ; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 1, v0 ; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX942-SDAG-NEXT: scratch_store_byte v2, v1, off sc0 sc1 @@ -160,7 +160,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX942-GISEL: ; %bb.0: ; %bb ; GFX942-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX942-GISEL-NEXT: v_mul_u32_u24_e32 v0, 2, v0 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s0, v0 @@ -181,13 +181,14 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_mul_u32_u24_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 @@ -201,8 +202,9 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mul_u32_u24_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -223,7 +225,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v0, 2, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 @@ -239,7 +241,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 2, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS @@ -273,7 +275,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, v2 ; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 1, v0 ; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX942-SDAG-NEXT: scratch_store_byte v2, v1, off sc0 sc1 @@ -291,7 +293,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX942-GISEL: ; %bb.0: ; %bb ; GFX942-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX942-GISEL-NEXT: v_mul_u32_u24_e32 v0, 4, v0 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s0, v0 @@ -312,13 +314,14 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_mul_u32_u24_e32 v0, 4, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 @@ -332,8 +335,9 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mul_u32_u24_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -354,7 +358,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v0, 4, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 @@ -370,7 +374,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS @@ -537,7 +541,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v2 +; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, v2 ; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1 ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX942-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 @@ -554,7 +558,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX942-GISEL: ; %bb.0: ; %bb ; GFX942-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX942-GISEL-NEXT: v_mul_u32_u24_e32 v0, 2, v0 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: s_lshl_b32 s0, s0, 1 @@ -576,15 +580,16 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_mul_u32_u24_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 4, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v2, off dlc @@ -597,8 +602,9 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mul_u32_u24_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -621,7 +627,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v0, 2, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS @@ -638,7 +644,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 2, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -675,7 +681,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, v2 ; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1 ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX942-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 @@ -692,7 +698,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX942-GISEL: ; %bb.0: ; %bb ; GFX942-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX942-GISEL-NEXT: v_mul_u32_u24_e32 v0, 4, v0 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: s_lshl_b32 s0, s0, 1 @@ -714,15 +720,16 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_mul_u32_u24_e32 v0, 4, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 4, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v2, off dlc @@ -735,8 +742,9 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mul_u32_u24_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -759,7 +767,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v0, 4, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS @@ -776,7 +784,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -945,7 +953,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; GFX942-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v2 +; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, v2 ; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1 ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX942-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 @@ -962,7 +970,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX942-GISEL: ; %bb.0: ; %bb ; GFX942-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX942-GISEL-NEXT: v_mul_u32_u24_e32 v0, 2, v0 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: s_lshl_b32 s0, s0, 2 @@ -984,15 +992,16 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_mul_u32_u24_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 4, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v2, off dlc @@ -1005,8 +1014,9 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mul_u32_u24_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -1029,7 +1039,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v0, 2, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS @@ -1046,7 +1056,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 2, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -1084,7 +1094,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s0 -; GFX942-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v3 +; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, v3 ; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1 ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX942-SDAG-NEXT: scratch_store_byte v0, v2, off offset:2 sc0 sc1 @@ -1099,7 +1109,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX942-GISEL: ; %bb.0: ; %bb ; GFX942-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX942-GISEL-NEXT: v_mul_u32_u24_e32 v0, 4, v0 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: s_lshl_b32 s0, s0, 2 @@ -1123,7 +1133,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: v_mul_u32_u24_e32 v0, 4, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -1142,8 +1152,9 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mul_u32_u24_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -1166,7 +1177,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v0, 4, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS @@ -1183,7 +1194,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index b25d9b245f5f6..b5e579b78a59c 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -714,10 +714,10 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) { ; GFX11-LABEL: store_load_vindex_kernel: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s0, s0, 7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -732,9 +732,9 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) { ; GFX12-LABEL: store_load_vindex_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b32 s0, s0, 7 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -769,8 +769,8 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) { ; GFX942-LABEL: store_load_vindex_kernel: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX942-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX942-NEXT: v_mov_b32_e32 v1, 15 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_lshl_b32 s0, s0, 7 @@ -809,10 +809,10 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) { ; GFX11-PAL-LABEL: store_load_vindex_kernel: ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 7 ; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -827,9 +827,9 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) { ; GFX12-PAL-LABEL: store_load_vindex_kernel: ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 7 ; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -1958,10 +1958,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) { ; GFX11-LABEL: store_load_vindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: scratch_load_b32 v3, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:384 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1976,10 +1976,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) { ; GFX12-LABEL: store_load_vindex_small_offset_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:384 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -2021,8 +2021,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) { ; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX942-NEXT: scratch_load_dword v1, off, off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX942-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX942-NEXT: v_mov_b32_e32 v1, 15 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_lshl_b32 s0, s0, 7 @@ -2092,10 +2092,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) { ; GFX11-PAL-LABEL: store_load_vindex_small_offset_kernel: ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:384 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) @@ -2110,10 +2110,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) { ; GFX12-PAL-LABEL: store_load_vindex_small_offset_kernel: ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 -; GFX12-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:384 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 @@ -3254,10 +3254,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) { ; GFX11-LABEL: store_load_vindex_large_offset_kernel: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s0, s0, 7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -3274,10 +3274,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) { ; GFX12-LABEL: store_load_vindex_large_offset_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16512 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -3319,8 +3319,8 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) { ; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX942-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX942-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX942-NEXT: v_mov_b32_e32 v1, 15 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_lshl_b32 s0, s0, 7 @@ -3391,10 +3391,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) { ; GFX11-PAL-LABEL: store_load_vindex_large_offset_kernel: ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 7 ; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -3411,10 +3411,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) { ; GFX12-PAL-LABEL: store_load_vindex_large_offset_kernel: ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 -; GFX12-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:16512 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index 08c0d15432915..565ad295ebbb3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -15,8 +15,8 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ; GCN-LABEL: test_iglp_opt_mfma_gemm: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: ; iglp_opt mask(0x00000000) ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -153,8 +153,8 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias ; GCN-LABEL: test_iglp_opt_rev_mfma_gemm: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -289,8 +289,8 @@ define amdgpu_kernel void @test_iglp_opt_asm_sideeffect(ptr addrspace(3) noalias ; GCN-LABEL: test_iglp_opt_asm_sideeffect: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GCN-NEXT: ; iglp_opt mask(0x00000000) ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_u32_e32 v1, s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll index 46359f7e99059..6507976872410 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll @@ -6,9 +6,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr ad ; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_and_b32_e32 v40, 0x7fe0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v40, 5, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_nc_u32_e32 v32, s0, v40 ; GCN-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40 @@ -74,9 +74,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr ad ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry ; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_and_b32_e32 v40, 0x7fe0, v0 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v40, 5, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v32, s0, v40 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40 @@ -178,9 +178,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_and_b32_e32 v16, 0x7fe0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 5, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_nc_u32_e32 v17, s0, v16 ; GCN-NEXT: v_add_nc_u32_e32 v16, s1, v16 @@ -260,9 +260,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave: ; EXACTCUTOFF: ; %bb.0: ; %entry ; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_and_b32_e32 v16, 0x7fe0, v0 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v16, 5, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v17, s0, v16 ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v16, s1, v16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll index dcc3e0df0c744..02e80b62fed6e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll @@ -8,10 +8,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr ; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_mov_b32_e32 v48, 0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_and_b32_e32 v28, 0x3ff0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 4, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v28 ; GCN-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28 @@ -60,10 +60,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry ; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v48, 0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_and_b32_e32 v28, 0x3ff0, v0 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v28, 4, v0 ; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0 ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v0, s0, v28 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll index 0764cd5d34d75..371b4f070094d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll @@ -7,8 +7,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MINREG-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; GCN-MINREG: ; %bb.0: ; %entry ; GCN-MINREG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-MINREG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-MINREG-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-MINREG-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-MINREG-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-MINREG-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) @@ -140,8 +140,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MAXOCC-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; GCN-MAXOCC: ; %bb.0: ; %entry ; GCN-MAXOCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-MAXOCC-NEXT: v_and_b32_e32 v1, 0x1ff80, v0 +; GCN-MAXOCC-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-MAXOCC-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) @@ -274,8 +274,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-ILP-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; GCN-ILP: ; %bb.0: ; %entry ; GCN-ILP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-ILP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-ILP-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-ILP-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-ILP-NEXT: v_mov_b32_e32 v1, 1.0 ; GCN-ILP-NEXT: v_mov_b32_e32 v2, 2.0 ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) @@ -469,8 +469,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MINREG-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave_split_region: ; GCN-MINREG: ; %bb.0: ; %entry ; GCN-MINREG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-MINREG-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-MINREG-NEXT: v_and_b32_e32 v2, 0x1ff80, v0 +; GCN-MINREG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-MINREG-NEXT: v_lshlrev_b32_e32 v2, 7, v0 ; GCN-MINREG-NEXT: v_mov_b32_e32 v1, 1.0 ; GCN-MINREG-NEXT: v_mov_b32_e32 v0, 2.0 ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) @@ -604,8 +604,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MAXOCC-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave_split_region: ; GCN-MAXOCC: ; %bb.0: ; %entry ; GCN-MAXOCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-MAXOCC-NEXT: v_and_b32_e32 v3, 0x1ff80, v0 +; GCN-MAXOCC-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v3, 7, v0 ; GCN-MAXOCC-NEXT: v_mov_b32_e32 v1, 1.0 ; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, 2.0 ; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) @@ -739,8 +739,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-ILP-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave_split_region: ; GCN-ILP: ; %bb.0: ; %entry ; GCN-ILP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-ILP-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-ILP-NEXT: v_and_b32_e32 v2, 0x1ff80, v0 +; GCN-ILP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-ILP-NEXT: v_lshlrev_b32_e32 v2, 7, v0 ; GCN-ILP-NEXT: v_mov_b32_e32 v0, 1.0 ; GCN-ILP-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index 37f335561a52c..73586b1243376 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -621,8 +621,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_u32_e32 v1, s0, v0 ; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:112 @@ -728,8 +728,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry ; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 ; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v1 offset:112 @@ -871,8 +871,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1005,8 +1005,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; EXACTCUTOFF: ; %bb.0: ; %entry ; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 1.0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 2.0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) @@ -1202,7 +1202,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v7, 0x32a5705f -; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v4, s0, v3 ; GCN-NEXT: v_rndne_f32_e32 v5, v4 @@ -1212,7 +1212,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: v_add_f32_e32 v4, v6, v4 ; GCN-NEXT: v_exp_f32_e32 v4, v4 ; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: v_add_u32_e32 v1, s6, v0 ; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:112 ; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:96 @@ -1387,7 +1387,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b ; EXACTCUTOFF-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v7, 0x32a5705f -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s0, v3 ; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v5, v4 @@ -1397,7 +1397,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: v_add_f32_e32 v4, v6, v4 ; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v4 ; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v5, v5 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s6, v0 ; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:112 ; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:96 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id-unsupported-calling-convention.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id-unsupported-calling-convention.ll index 6ad98a1cbb3ed..37cfb6cc10180 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id-unsupported-calling-convention.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id-unsupported-calling-convention.ll @@ -1,15 +1,24 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -o - %s | FileCheck %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -o - %s | FileCheck %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -o - %s | FileCheck -check-prefixes=CHECK,SDAG %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -o - %s | FileCheck -check-prefixes=CHECK,GISEL %s declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.workitem.id.y() declare i32 @llvm.amdgcn.workitem.id.z() +; FIXME: It's not worth adding AssertZext to the intrinsic calls, and +; we don't fold out assertzext undef ->undef define amdgpu_ps void @undefined_workitems(ptr addrspace(1) %p, ptr addrspace(1) %q, ptr addrspace(1) %r) { -; CHECK-LABEL: undefined_workitems: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_endpgm +; SDAG-LABEL: undefined_workitems: +; SDAG: ; %bb.0: +; SDAG-NEXT: global_store_dword v[0:1], v0, off +; SDAG-NEXT: global_store_dword v[2:3], v0, off +; SDAG-NEXT: global_store_dword v[4:5], v0, off +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: undefined_workitems: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_endpgm %id.x = call i32 @llvm.amdgcn.workitem.id.x() store i32 %id.x, ptr addrspace(1) %p %id.y = call i32 @llvm.amdgcn.workitem.id.y() @@ -18,3 +27,5 @@ define amdgpu_ps void @undefined_workitems(ptr addrspace(1) %p, ptr addrspace(1) store i32 %id.z, ptr addrspace(1) %r ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll index 4e1d2a754fa61..2bda61ab950f7 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -146,8 +146,8 @@ define void @mubuf_clause(ptr addrspace(5) noalias nocapture readonly %arg, ptr ; GCN-LABEL: mubuf_clause: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 4, v31 -; GCN-NEXT: v_and_b32_e32 v2, 0x3ff0, v2 +; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GCN-NEXT: v_add_u32_e32 v0, v0, v2 ; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:12 ; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:8 @@ -205,8 +205,8 @@ define void @mubuf_clause(ptr addrspace(5) noalias nocapture readonly %arg, ptr ; GCN-SCRATCH-LABEL: mubuf_clause: ; GCN-SCRATCH: ; %bb.0: ; %bb ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v2, 4, v31 -; GCN-SCRATCH-NEXT: v_and_b32_e32 v18, 0x3ff0, v2 +; GCN-SCRATCH-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v18, 4, v2 ; GCN-SCRATCH-NEXT: v_add_nc_u32_e32 v0, v0, v18 ; GCN-SCRATCH-NEXT: s_clause 0x3 ; GCN-SCRATCH-NEXT: scratch_load_dwordx4 v[2:5], v0, off diff --git a/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll b/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll index 9ace249e6af4a..2e52e515a2a97 100644 --- a/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll +++ b/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll @@ -3,10 +3,10 @@ define amdgpu_kernel void @sdwa_test() local_unnamed_addr #0 { ; GFX9-LABEL: sdwa_test: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: v_add_u32_e32 v1, 10, v0 -; GFX9-NEXT: v_add_u32_e32 v0, 20, v0 -; GFX9-NEXT: v_add_co_u32_sdwa v0, vcc, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_addc_co_u32_e64 v1, s[0:1], 0, 0, vcc +; GFX9-NEXT: v_add_u32_e32 v1, 20, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX9-NEXT: v_add3_u32 v0, v0, v1, 10 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_endpgm bb: diff --git a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll index a742d1e59dc72..4f770b91c07fe 100644 --- a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll +++ b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll @@ -34,6 +34,8 @@ for.body: ; preds = %for.body, %entry br i1 %exitcond, label %for.cond.cleanup, label %for.body } +declare i32 @func() + ; Check that we unroll inner loop but not outer ; CHECK-LABEL: @invariant_ind ; CHECK: %[[exitcond:[^ ]+]] = icmp eq i32 %{{.*}}, 32 @@ -43,7 +45,7 @@ for.body: ; preds = %for.body, %entry define amdgpu_kernel void @invariant_ind(ptr addrspace(1) nocapture %a, i32 %x) { entry: %arr = alloca [64 x i32], align 4, addrspace(5) - %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = tail call i32 @func() br label %for.cond2.preheader for.cond2.preheader: ; preds = %for.cond.cleanup5, %entry