@@ -8770,6 +8770,85 @@ define void @v_permlane16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %s
87708770 ret void
87718771}
87728772
; Test: pass an i8 value through llvm.amdgcn.permlane16.i8 and store the result
; to %out. %src0 is supplied for both of the first two operands and both
; trailing i1 flags are false.
; The checks below (prefixes GFX10, GFX11 and GFX12) expect v3 and v4
; (presumably %src1 and %src2) to be copied into scalar registers with
; v_readfirstlane_b32, then a single v_permlane16_b32 on v2, then a byte store
; of v2 with no extra masking instruction in between.
; NOTE(review): the check lines look tool-generated; regenerate them rather
; than editing by hand - confirm against the file header.
8773+ define void @v_permlane16_i8 (ptr addrspace (1 ) %out , i8 %src0 , i32 %src1 , i32 %src2 ) {
8774+ ; GFX10-LABEL: v_permlane16_i8:
8775+ ; GFX10: ; %bb.0:
8776+ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8777+ ; GFX10-NEXT: v_readfirstlane_b32 s4, v3
8778+ ; GFX10-NEXT: v_readfirstlane_b32 s5, v4
8779+ ; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5
8780+ ; GFX10-NEXT: global_store_byte v[0:1], v2, off
8781+ ; GFX10-NEXT: s_setpc_b64 s[30:31]
8782+ ;
8783+ ; GFX11-LABEL: v_permlane16_i8:
8784+ ; GFX11: ; %bb.0:
8785+ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8786+ ; GFX11-NEXT: v_readfirstlane_b32 s0, v3
8787+ ; GFX11-NEXT: v_readfirstlane_b32 s1, v4
8788+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
8789+ ; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1
8790+ ; GFX11-NEXT: global_store_b8 v[0:1], v2, off
8791+ ; GFX11-NEXT: s_setpc_b64 s[30:31]
8792+ ;
8793+ ; GFX12-LABEL: v_permlane16_i8:
8794+ ; GFX12: ; %bb.0:
8795+ ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
8796+ ; GFX12-NEXT: s_wait_expcnt 0x0
8797+ ; GFX12-NEXT: s_wait_samplecnt 0x0
8798+ ; GFX12-NEXT: s_wait_bvhcnt 0x0
8799+ ; GFX12-NEXT: s_wait_kmcnt 0x0
8800+ ; GFX12-NEXT: v_readfirstlane_b32 s0, v3
8801+ ; GFX12-NEXT: v_readfirstlane_b32 s1, v4
8802+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
8803+ ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1
8804+ ; GFX12-NEXT: global_store_b8 v[0:1], v2, off
8805+ ; GFX12-NEXT: s_setpc_b64 s[30:31]
8806+ %v = call i8 @llvm.amdgcn.permlane16.i8 (i8 %src0 , i8 %src0 , i32 %src1 , i32 %src2 , i1 false , i1 false )
8807+ store i8 %v , ptr addrspace (1 ) %out
8808+ ret void
8809+ }
8810+
; Test: pass an i1 value through llvm.amdgcn.permlane16.i1 and store the result
; to %out. %src0 is supplied for both of the first two operands and both
; trailing i1 flags are false.
; The checks below (prefixes GFX10, GFX11 and GFX12) expect v3 and v4
; (presumably %src1 and %src2) to be copied into scalar registers with
; v_readfirstlane_b32, then a single v_permlane16_b32 on v2. Unlike the i8
; variant, the result is then masked with v_and_b32 against 1 before the byte
; store.
; NOTE(review): the check lines look tool-generated; regenerate them rather
; than editing by hand - confirm against the file header.
8811+ define void @v_permlane16_i1 (ptr addrspace (1 ) %out , i1 %src0 , i32 %src1 , i32 %src2 ) {
8812+ ; GFX10-LABEL: v_permlane16_i1:
8813+ ; GFX10: ; %bb.0:
8814+ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8815+ ; GFX10-NEXT: v_readfirstlane_b32 s4, v3
8816+ ; GFX10-NEXT: v_readfirstlane_b32 s5, v4
8817+ ; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5
8818+ ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
8819+ ; GFX10-NEXT: global_store_byte v[0:1], v2, off
8820+ ; GFX10-NEXT: s_setpc_b64 s[30:31]
8821+ ;
8822+ ; GFX11-LABEL: v_permlane16_i1:
8823+ ; GFX11: ; %bb.0:
8824+ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8825+ ; GFX11-NEXT: v_readfirstlane_b32 s0, v3
8826+ ; GFX11-NEXT: v_readfirstlane_b32 s1, v4
8827+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8828+ ; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1
8829+ ; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
8830+ ; GFX11-NEXT: global_store_b8 v[0:1], v2, off
8831+ ; GFX11-NEXT: s_setpc_b64 s[30:31]
8832+ ;
8833+ ; GFX12-LABEL: v_permlane16_i1:
8834+ ; GFX12: ; %bb.0:
8835+ ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
8836+ ; GFX12-NEXT: s_wait_expcnt 0x0
8837+ ; GFX12-NEXT: s_wait_samplecnt 0x0
8838+ ; GFX12-NEXT: s_wait_bvhcnt 0x0
8839+ ; GFX12-NEXT: s_wait_kmcnt 0x0
8840+ ; GFX12-NEXT: v_readfirstlane_b32 s0, v3
8841+ ; GFX12-NEXT: v_readfirstlane_b32 s1, v4
8842+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8843+ ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1
8844+ ; GFX12-NEXT: v_and_b32_e32 v2, 1, v2
8845+ ; GFX12-NEXT: global_store_b8 v[0:1], v2, off
8846+ ; GFX12-NEXT: s_setpc_b64 s[30:31]
8847+ %v = call i1 @llvm.amdgcn.permlane16.i1 (i1 %src0 , i1 %src0 , i32 %src1 , i32 %src2 , i1 false , i1 false )
8848+ store i1 %v , ptr addrspace (1 ) %out
8849+ ret void
8850+ }
8851+
87738852define void @v_permlanex16_v2f32 (ptr addrspace (1 ) %out , <2 x float > %src0 , i32 %src1 , i32 %src2 ) {
87748853; GFX10-SDAG-LABEL: v_permlanex16_v2f32:
87758854; GFX10-SDAG: ; %bb.0:
@@ -9258,3 +9337,82 @@ define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %sr
92589337 store <8 x i16 > %v , ptr addrspace (1 ) %out
92599338 ret void
92609339}
9340+
; Test: pass an i8 value through llvm.amdgcn.permlanex16.i8 and store the
; result to %out. %src0 is supplied for both of the first two operands and both
; trailing i1 flags are false.
; The checks below (prefixes GFX10, GFX11 and GFX12) expect v3 and v4
; (presumably %src1 and %src2) to be copied into scalar registers with
; v_readfirstlane_b32, then a single v_permlanex16_b32 on v2, then a byte store
; of v2 with no extra masking instruction in between.
; NOTE(review): the check lines look tool-generated; regenerate them rather
; than editing by hand - confirm against the file header.
9341+ define void @v_permlanex16_i8 (ptr addrspace (1 ) %out , i8 %src0 , i32 %src1 , i32 %src2 ) {
9342+ ; GFX10-LABEL: v_permlanex16_i8:
9343+ ; GFX10: ; %bb.0:
9344+ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9345+ ; GFX10-NEXT: v_readfirstlane_b32 s4, v3
9346+ ; GFX10-NEXT: v_readfirstlane_b32 s5, v4
9347+ ; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5
9348+ ; GFX10-NEXT: global_store_byte v[0:1], v2, off
9349+ ; GFX10-NEXT: s_setpc_b64 s[30:31]
9350+ ;
9351+ ; GFX11-LABEL: v_permlanex16_i8:
9352+ ; GFX11: ; %bb.0:
9353+ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9354+ ; GFX11-NEXT: v_readfirstlane_b32 s0, v3
9355+ ; GFX11-NEXT: v_readfirstlane_b32 s1, v4
9356+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
9357+ ; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1
9358+ ; GFX11-NEXT: global_store_b8 v[0:1], v2, off
9359+ ; GFX11-NEXT: s_setpc_b64 s[30:31]
9360+ ;
9361+ ; GFX12-LABEL: v_permlanex16_i8:
9362+ ; GFX12: ; %bb.0:
9363+ ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
9364+ ; GFX12-NEXT: s_wait_expcnt 0x0
9365+ ; GFX12-NEXT: s_wait_samplecnt 0x0
9366+ ; GFX12-NEXT: s_wait_bvhcnt 0x0
9367+ ; GFX12-NEXT: s_wait_kmcnt 0x0
9368+ ; GFX12-NEXT: v_readfirstlane_b32 s0, v3
9369+ ; GFX12-NEXT: v_readfirstlane_b32 s1, v4
9370+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
9371+ ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1
9372+ ; GFX12-NEXT: global_store_b8 v[0:1], v2, off
9373+ ; GFX12-NEXT: s_setpc_b64 s[30:31]
9374+ %v = call i8 @llvm.amdgcn.permlanex16.i8 (i8 %src0 , i8 %src0 , i32 %src1 , i32 %src2 , i1 false , i1 false )
9375+ store i8 %v , ptr addrspace (1 ) %out
9376+ ret void
9377+ }
9378+
; Test: pass an i1 value through llvm.amdgcn.permlanex16.i1 and store the
; result to %out. %src0 is supplied for both of the first two operands and both
; trailing i1 flags are false.
; The checks below (prefixes GFX10, GFX11 and GFX12) expect v3 and v4
; (presumably %src1 and %src2) to be copied into scalar registers with
; v_readfirstlane_b32, then a single v_permlanex16_b32 on v2. Unlike the i8
; variant, the result is then masked with v_and_b32 against 1 before the byte
; store.
; NOTE(review): the check lines look tool-generated; regenerate them rather
; than editing by hand - confirm against the file header.
9379+ define void @v_permlanex16_i1 (ptr addrspace (1 ) %out , i1 %src0 , i32 %src1 , i32 %src2 ) {
9380+ ; GFX10-LABEL: v_permlanex16_i1:
9381+ ; GFX10: ; %bb.0:
9382+ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9383+ ; GFX10-NEXT: v_readfirstlane_b32 s4, v3
9384+ ; GFX10-NEXT: v_readfirstlane_b32 s5, v4
9385+ ; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5
9386+ ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
9387+ ; GFX10-NEXT: global_store_byte v[0:1], v2, off
9388+ ; GFX10-NEXT: s_setpc_b64 s[30:31]
9389+ ;
9390+ ; GFX11-LABEL: v_permlanex16_i1:
9391+ ; GFX11: ; %bb.0:
9392+ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9393+ ; GFX11-NEXT: v_readfirstlane_b32 s0, v3
9394+ ; GFX11-NEXT: v_readfirstlane_b32 s1, v4
9395+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9396+ ; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1
9397+ ; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
9398+ ; GFX11-NEXT: global_store_b8 v[0:1], v2, off
9399+ ; GFX11-NEXT: s_setpc_b64 s[30:31]
9400+ ;
9401+ ; GFX12-LABEL: v_permlanex16_i1:
9402+ ; GFX12: ; %bb.0:
9403+ ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
9404+ ; GFX12-NEXT: s_wait_expcnt 0x0
9405+ ; GFX12-NEXT: s_wait_samplecnt 0x0
9406+ ; GFX12-NEXT: s_wait_bvhcnt 0x0
9407+ ; GFX12-NEXT: s_wait_kmcnt 0x0
9408+ ; GFX12-NEXT: v_readfirstlane_b32 s0, v3
9409+ ; GFX12-NEXT: v_readfirstlane_b32 s1, v4
9410+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9411+ ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1
9412+ ; GFX12-NEXT: v_and_b32_e32 v2, 1, v2
9413+ ; GFX12-NEXT: global_store_b8 v[0:1], v2, off
9414+ ; GFX12-NEXT: s_setpc_b64 s[30:31]
9415+ %v = call i1 @llvm.amdgcn.permlanex16.i1 (i1 %src0 , i1 %src0 , i32 %src1 , i32 %src2 , i1 false , i1 false )
9416+ store i1 %v , ptr addrspace (1 ) %out
9417+ ret void
9418+ }
0 commit comments