Patch summary: make AMDGPURawBufferLoad return llvm_any_ty unconditionally
(the class no longer takes a data_ty parameter), so that
llvm.amdgcn.raw.buffer.load.format can be overloaded on <4 x i32> as well as
the float vector types. Adds v4i32 variants of the existing GlobalISel and
SelectionDAG raw.buffer.load.format tests.

NOTE(review): this patch text was recovered from a copy whose line breaks,
indentation and some TableGen template arguments had been stripped. The
template arguments on the removed class/def lines, ImmArg<ArgIndex<3>>, and the
AMDGPURawAtomicBufferLoad context line were reconstructed; context-line
indentation is conventional. Confirm against the target tree, or apply with
`git apply --ignore-whitespace`.

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index d6375ab77cfb3..4829453ee57cd 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1148,8 +1148,8 @@ def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic <
 // Note that in the cachepolicy for all these intrinsics, bit 31 is not preserved
 // through to final assembly selection and is used to signal that the buffer
 // operation is volatile.
-class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
-  [data_ty],
+class AMDGPURawBufferLoad : DefaultAttrsIntrinsic <
+  [llvm_any_ty],
   [llvm_v4i32_ty,     // rsrc(SGPR)
    llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
@@ -1162,7 +1162,7 @@ class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsi
                       //           all:    volatile op (bit 31, stripped at lowering)
   [IntrReadMem, ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<0>;
-def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad<llvm_anyfloat_ty>;
+def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad;
 def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad;
 
 class AMDGPURawAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll
index 23efaa4d2bd91..baeb5909f04e8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll
@@ -169,6 +169,62 @@ define amdgpu_ps <4 x float> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voff
   ret <4 x float> %val
 }
 
+define amdgpu_ps <4 x i32> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_v4i32(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+  ; GFX8-LABEL: name: raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_v4i32
+  ; GFX8: bb.1 (%ir-block.0):
+  ; GFX8-NEXT:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+  ; GFX8-NEXT: {{ $}}
+  ; GFX8-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX8-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX8-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX8-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX8-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX8-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX8-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+  ; GFX8-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0
+  ; GFX8-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1
+  ; GFX8-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2
+  ; GFX8-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub3
+  ; GFX8-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+  ; GFX8-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX8-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+  ; GFX8-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; GFX8-NEXT:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+  ; GFX8-NEXT:   $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
+  ; GFX8-NEXT:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+  ; GFX8-NEXT:   $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]]
+  ; GFX8-NEXT:   SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3
+  ;
+  ; GFX12-LABEL: name: raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_v4i32
+  ; GFX12: bb.1 (%ir-block.0):
+  ; GFX12-NEXT:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+  ; GFX12-NEXT: {{ $}}
+  ; GFX12-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX12-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX12-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX12-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX12-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX12-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX12-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX12-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+  ; GFX12-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub0
+  ; GFX12-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub1
+  ; GFX12-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub2
+  ; GFX12-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub3
+  ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+  ; GFX12-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+  ; GFX12-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+  ; GFX12-NEXT:   $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
+  ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+  ; GFX12-NEXT:   $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]]
+  ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3
+  %val = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+  ret <4 x i32> %val
+}
+
 ; Waterfall for rsrc and soffset, copy for voffset
 define amdgpu_ps float @raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(<4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) {
   ; GFX8-LABEL: name: raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset
@@ -325,9 +381,68 @@ define amdgpu_ps <4 x float> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voff
   ret <4 x float> %val
 }
 
+define amdgpu_ps <4 x i32> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add_4095_v4i32(<4 x i32> inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) {
+  ; GFX8-LABEL: name: raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add_4095_v4i32
+  ; GFX8: bb.1 (%ir-block.0):
+  ; GFX8-NEXT:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+  ; GFX8-NEXT: {{ $}}
+  ; GFX8-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX8-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX8-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX8-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX8-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX8-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX8-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+  ; GFX8-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0
+  ; GFX8-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1
+  ; GFX8-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2
+  ; GFX8-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub3
+  ; GFX8-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+  ; GFX8-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX8-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+  ; GFX8-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; GFX8-NEXT:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+  ; GFX8-NEXT:   $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
+  ; GFX8-NEXT:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+  ; GFX8-NEXT:   $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]]
+  ; GFX8-NEXT:   SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3
+  ;
+  ; GFX12-LABEL: name: raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add_4095_v4i32
+  ; GFX12: bb.1 (%ir-block.0):
+  ; GFX12-NEXT:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+  ; GFX12-NEXT: {{ $}}
+  ; GFX12-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX12-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX12-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX12-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX12-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX12-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX12-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX12-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+  ; GFX12-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub0
+  ; GFX12-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub1
+  ; GFX12-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub2
+  ; GFX12-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub3
+  ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+  ; GFX12-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+  ; GFX12-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+  ; GFX12-NEXT:   $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
+  ; GFX12-NEXT:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+  ; GFX12-NEXT:   $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]]
+  ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3
+  %voffset = add i32 %voffset.base, 4095
+  %val = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+  ret <4 x i32> %val
+}
+
+
 declare float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32>, i32, i32, i32 immarg) #0
 declare <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32 immarg) #0
 declare <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32>, i32, i32, i32 immarg) #0
 declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32 immarg) #0
+declare <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32>, i32, i32, i32 immarg) #0
 
 attributes #0 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.ll
index 60c6268e448cb..29efdddac39b2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.ll
@@ -17,6 +17,25 @@ main_body:
   ret {<4 x float>, <4 x float>, <4 x float>} %r2
 }
 
+;CHECK-LABEL: {{^}}buffer_load_v4i32:
+;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0
+;CHECK: buffer_load_format_xyzw v[4:7], off, s[0:3], 0 glc
+;CHECK: buffer_load_format_xyzw v[8:11], off, s[0:3], 0 slc
+;CHECK: s_waitcnt
+define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load_v4i32(<4 x i32> inreg) {
+main_body:
+  %data = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0)
+  %data_glc = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %0, i32 0, i32 0, i32 1)
+  %data_slc = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %0, i32 0, i32 0, i32 2)
+  %fdata = bitcast <4 x i32> %data to <4 x float>
+  %fdata_glc = bitcast <4 x i32> %data_glc to <4 x float>
+  %fdata_slc = bitcast <4 x i32> %data_slc to <4 x float>
+  %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %fdata, 0
+  %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %fdata_glc, 1
+  %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %fdata_slc, 2
+  ret {<4 x float>, <4 x float>, <4 x float>} %r2
+}
+
 ;CHECK-LABEL: {{^}}buffer_load_immoffs:
 ;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:42
 ;CHECK: s_waitcnt
@@ -26,6 +45,16 @@ main_body:
   ret <4 x float> %data
 }
 
+;CHECK-LABEL: {{^}}buffer_load_immoffs_v4i32:
+;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:42
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_immoffs_v4i32(<4 x i32> inreg) {
+main_body:
+  %data = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %0, i32 42, i32 0, i32 0)
+  %fdata = bitcast <4 x i32> %data to <4 x float>
+  ret <4 x float> %fdata
+}
+
 ;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
 ;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 60 offset:4092
 ;CHECK-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7ffc
@@ -43,6 +72,26 @@ main_body:
   ret <4 x float> %data
 }
 
+;CHECK-LABEL: {{^}}buffer_load_immoffs_large_v4i32:
+;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 60 offset:4092
+;CHECK-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7ffc
+;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS1]] offset:4092
+;CHECK-DAG: s_mov_b32 [[OFS2:s[0-9]+]], 0x8ffc
+;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS2]] offset:4
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_immoffs_large_v4i32(<4 x i32> inreg) {
+main_body:
+  %d.0 = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %0, i32 4092, i32 60, i32 0)
+  %d.1 = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %0, i32 4092, i32 32764, i32 0)
+  %d.2 = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %0, i32 4, i32 36860, i32 0)
+  %fd.0 = bitcast <4 x i32> %d.0 to <4 x float>
+  %fd.1 = bitcast <4 x i32> %d.1 to <4 x float>
+  %fd.2 = bitcast <4 x i32> %d.2 to <4 x float>
+  %d.3 = fadd <4 x float> %fd.0, %fd.1
+  %data = fadd <4 x float> %fd.2, %d.3
+  ret <4 x float> %data
+}
+
 ;CHECK-LABEL: {{^}}buffer_load_ofs:
 ;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen
 ;CHECK: s_waitcnt
@@ -52,6 +101,16 @@ main_body:
   ret <4 x float> %data
 }
 
+;CHECK-LABEL: {{^}}buffer_load_ofs_v4i32:
+;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_ofs_v4i32(<4 x i32> inreg, i32) {
+main_body:
+  %data = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %0, i32 %1, i32 0, i32 0)
+  %fdata = bitcast <4 x i32> %data to <4 x float>
+  ret <4 x float> %fdata
+}
+
 ;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
 ;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:60
 ;CHECK: s_waitcnt
@@ -62,6 +121,17 @@ main_body:
   ret <4 x float> %data
 }
 
+;CHECK-LABEL: {{^}}buffer_load_ofs_imm_v4i32:
+;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:60
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_ofs_imm_v4i32(<4 x i32> inreg, i32) {
+main_body:
+  %ofs = add i32 %1, 60
+  %data = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %0, i32 %ofs, i32 0, i32 0)
+  %fdata = bitcast <4 x i32> %data to <4 x float>
+  ret <4 x float> %fdata
+}
+
 ;CHECK-LABEL: {{^}}buffer_load_x:
 ;CHECK: buffer_load_format_x v0, off, s[0:3], 0
 ;CHECK: s_waitcnt
@@ -83,5 +153,6 @@ main_body:
 declare float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32>, i32, i32, i32) #0
 declare <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32) #0
 declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32) #0
+declare <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32>, i32, i32, i32) #0
 
 attributes #0 = { nounwind readonly }