@@ -8932,6 +8932,85 @@ define void @v_permlane16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %s
89328932 ret void
89338933}
89348934
8935+ define void @v_permlane16_i8 (ptr addrspace (1 ) %out , i8 %src0 , i32 %src1 , i32 %src2 ) { ; Codegen test for llvm.amdgcn.permlane16 with an i8 value type; the result is stored through %out.
8936+ ; GFX10-LABEL: v_permlane16_i8:
8937+ ; GFX10: ; %bb.0:
8938+ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8939+ ; GFX10-NEXT: v_readfirstlane_b32 s4, v3
8940+ ; GFX10-NEXT: v_readfirstlane_b32 s5, v4
8941+ ; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5
8942+ ; GFX10-NEXT: global_store_byte v[0:1], v2, off
8943+ ; GFX10-NEXT: s_setpc_b64 s[30:31]
8944+ ;
8945+ ; GFX11-LABEL: v_permlane16_i8:
8946+ ; GFX11: ; %bb.0:
8947+ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8948+ ; GFX11-NEXT: v_readfirstlane_b32 s0, v3
8949+ ; GFX11-NEXT: v_readfirstlane_b32 s1, v4
8950+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
8951+ ; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1
8952+ ; GFX11-NEXT: global_store_b8 v[0:1], v2, off
8953+ ; GFX11-NEXT: s_setpc_b64 s[30:31]
8954+ ;
8955+ ; GFX12-LABEL: v_permlane16_i8:
8956+ ; GFX12: ; %bb.0:
8957+ ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
8958+ ; GFX12-NEXT: s_wait_expcnt 0x0
8959+ ; GFX12-NEXT: s_wait_samplecnt 0x0
8960+ ; GFX12-NEXT: s_wait_bvhcnt 0x0
8961+ ; GFX12-NEXT: s_wait_kmcnt 0x0
8962+ ; GFX12-NEXT: v_readfirstlane_b32 s0, v3
8963+ ; GFX12-NEXT: v_readfirstlane_b32 s1, v4
8964+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
8965+ ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1
8966+ ; GFX12-NEXT: global_store_b8 v[0:1], v2, off
8967+ ; GFX12-NEXT: s_setpc_b64 s[30:31]
8968+ %v = call i8 @llvm.amdgcn.permlane16.i8 (i8 %src0 , i8 %src0 , i32 %src1 , i32 %src2 , i1 false , i1 false ) ; %src0 is passed as both of the first two operands and both i1 flags are false. Every checked target expects two v_readfirstlane_b32 copies into SGPRs feeding a single v_permlane16_b32, with no extra masking of the i8 result.
8969+ store i8 %v , ptr addrspace (1 ) %out ; expected as a byte store (global_store_byte in the GFX10 checks, global_store_b8 in the GFX11 and GFX12 checks)
8970+ ret void
8971+ }
8972+
8973+ define void @v_permlane16_i1 (ptr addrspace (1 ) %out , i1 %src0 , i32 %src1 , i32 %src2 ) { ; Codegen test for llvm.amdgcn.permlane16 with an i1 value type; the result is stored through %out.
8974+ ; GFX10-LABEL: v_permlane16_i1:
8975+ ; GFX10: ; %bb.0:
8976+ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8977+ ; GFX10-NEXT: v_readfirstlane_b32 s4, v3
8978+ ; GFX10-NEXT: v_readfirstlane_b32 s5, v4
8979+ ; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5
8980+ ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
8981+ ; GFX10-NEXT: global_store_byte v[0:1], v2, off
8982+ ; GFX10-NEXT: s_setpc_b64 s[30:31]
8983+ ;
8984+ ; GFX11-LABEL: v_permlane16_i1:
8985+ ; GFX11: ; %bb.0:
8986+ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8987+ ; GFX11-NEXT: v_readfirstlane_b32 s0, v3
8988+ ; GFX11-NEXT: v_readfirstlane_b32 s1, v4
8989+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8990+ ; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1
8991+ ; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
8992+ ; GFX11-NEXT: global_store_b8 v[0:1], v2, off
8993+ ; GFX11-NEXT: s_setpc_b64 s[30:31]
8994+ ;
8995+ ; GFX12-LABEL: v_permlane16_i1:
8996+ ; GFX12: ; %bb.0:
8997+ ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
8998+ ; GFX12-NEXT: s_wait_expcnt 0x0
8999+ ; GFX12-NEXT: s_wait_samplecnt 0x0
9000+ ; GFX12-NEXT: s_wait_bvhcnt 0x0
9001+ ; GFX12-NEXT: s_wait_kmcnt 0x0
9002+ ; GFX12-NEXT: v_readfirstlane_b32 s0, v3
9003+ ; GFX12-NEXT: v_readfirstlane_b32 s1, v4
9004+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9005+ ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1
9006+ ; GFX12-NEXT: v_and_b32_e32 v2, 1, v2
9007+ ; GFX12-NEXT: global_store_b8 v[0:1], v2, off
9008+ ; GFX12-NEXT: s_setpc_b64 s[30:31]
9009+ %v = call i1 @llvm.amdgcn.permlane16.i1 (i1 %src0 , i1 %src0 , i32 %src1 , i32 %src2 , i1 false , i1 false ) ; %src0 is passed as both of the first two operands and both i1 flags are false. Every checked target expects two v_readfirstlane_b32 copies into SGPRs feeding a single v_permlane16_b32.
9010+ store i1 %v , ptr addrspace (1 ) %out ; unlike the i8 variant, the checks expect a v_and_b32 with 1 on the permlane result before the byte store
9011+ ret void
9012+ }
9013+
89359014define void @v_permlanex16_v2f32 (ptr addrspace (1 ) %out , <2 x float > %src0 , i32 %src1 , i32 %src2 ) {
89369015; GFX10-SDAG-LABEL: v_permlanex16_v2f32:
89379016; GFX10-SDAG: ; %bb.0:
@@ -9430,3 +9509,82 @@ define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %sr
94309509 store <8 x i16 > %v , ptr addrspace (1 ) %out
94319510 ret void
94329511}
9512+
9513+ define void @v_permlanex16_i8 (ptr addrspace (1 ) %out , i8 %src0 , i32 %src1 , i32 %src2 ) { ; Codegen test for llvm.amdgcn.permlanex16 with an i8 value type; the result is stored through %out.
9514+ ; GFX10-LABEL: v_permlanex16_i8:
9515+ ; GFX10: ; %bb.0:
9516+ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9517+ ; GFX10-NEXT: v_readfirstlane_b32 s4, v3
9518+ ; GFX10-NEXT: v_readfirstlane_b32 s5, v4
9519+ ; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5
9520+ ; GFX10-NEXT: global_store_byte v[0:1], v2, off
9521+ ; GFX10-NEXT: s_setpc_b64 s[30:31]
9522+ ;
9523+ ; GFX11-LABEL: v_permlanex16_i8:
9524+ ; GFX11: ; %bb.0:
9525+ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9526+ ; GFX11-NEXT: v_readfirstlane_b32 s0, v3
9527+ ; GFX11-NEXT: v_readfirstlane_b32 s1, v4
9528+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
9529+ ; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1
9530+ ; GFX11-NEXT: global_store_b8 v[0:1], v2, off
9531+ ; GFX11-NEXT: s_setpc_b64 s[30:31]
9532+ ;
9533+ ; GFX12-LABEL: v_permlanex16_i8:
9534+ ; GFX12: ; %bb.0:
9535+ ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
9536+ ; GFX12-NEXT: s_wait_expcnt 0x0
9537+ ; GFX12-NEXT: s_wait_samplecnt 0x0
9538+ ; GFX12-NEXT: s_wait_bvhcnt 0x0
9539+ ; GFX12-NEXT: s_wait_kmcnt 0x0
9540+ ; GFX12-NEXT: v_readfirstlane_b32 s0, v3
9541+ ; GFX12-NEXT: v_readfirstlane_b32 s1, v4
9542+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
9543+ ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1
9544+ ; GFX12-NEXT: global_store_b8 v[0:1], v2, off
9545+ ; GFX12-NEXT: s_setpc_b64 s[30:31]
9546+ %v = call i8 @llvm.amdgcn.permlanex16.i8 (i8 %src0 , i8 %src0 , i32 %src1 , i32 %src2 , i1 false , i1 false ) ; %src0 is passed as both of the first two operands and both i1 flags are false. Every checked target expects two v_readfirstlane_b32 copies into SGPRs feeding a single v_permlanex16_b32, with no extra masking of the i8 result.
9547+ store i8 %v , ptr addrspace (1 ) %out ; expected as a byte store (global_store_byte in the GFX10 checks, global_store_b8 in the GFX11 and GFX12 checks)
9548+ ret void
9549+ }
9550+
9551+ define void @v_permlanex16_i1 (ptr addrspace (1 ) %out , i1 %src0 , i32 %src1 , i32 %src2 ) { ; Codegen test for llvm.amdgcn.permlanex16 with an i1 value type; the result is stored through %out.
9552+ ; GFX10-LABEL: v_permlanex16_i1:
9553+ ; GFX10: ; %bb.0:
9554+ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9555+ ; GFX10-NEXT: v_readfirstlane_b32 s4, v3
9556+ ; GFX10-NEXT: v_readfirstlane_b32 s5, v4
9557+ ; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5
9558+ ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
9559+ ; GFX10-NEXT: global_store_byte v[0:1], v2, off
9560+ ; GFX10-NEXT: s_setpc_b64 s[30:31]
9561+ ;
9562+ ; GFX11-LABEL: v_permlanex16_i1:
9563+ ; GFX11: ; %bb.0:
9564+ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9565+ ; GFX11-NEXT: v_readfirstlane_b32 s0, v3
9566+ ; GFX11-NEXT: v_readfirstlane_b32 s1, v4
9567+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9568+ ; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1
9569+ ; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
9570+ ; GFX11-NEXT: global_store_b8 v[0:1], v2, off
9571+ ; GFX11-NEXT: s_setpc_b64 s[30:31]
9572+ ;
9573+ ; GFX12-LABEL: v_permlanex16_i1:
9574+ ; GFX12: ; %bb.0:
9575+ ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
9576+ ; GFX12-NEXT: s_wait_expcnt 0x0
9577+ ; GFX12-NEXT: s_wait_samplecnt 0x0
9578+ ; GFX12-NEXT: s_wait_bvhcnt 0x0
9579+ ; GFX12-NEXT: s_wait_kmcnt 0x0
9580+ ; GFX12-NEXT: v_readfirstlane_b32 s0, v3
9581+ ; GFX12-NEXT: v_readfirstlane_b32 s1, v4
9582+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9583+ ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1
9584+ ; GFX12-NEXT: v_and_b32_e32 v2, 1, v2
9585+ ; GFX12-NEXT: global_store_b8 v[0:1], v2, off
9586+ ; GFX12-NEXT: s_setpc_b64 s[30:31]
9587+ %v = call i1 @llvm.amdgcn.permlanex16.i1 (i1 %src0 , i1 %src0 , i32 %src1 , i32 %src2 , i1 false , i1 false ) ; %src0 is passed as both of the first two operands and both i1 flags are false. Every checked target expects two v_readfirstlane_b32 copies into SGPRs feeding a single v_permlanex16_b32.
9588+ store i1 %v , ptr addrspace (1 ) %out ; unlike the i8 variant, the checks expect a v_and_b32 with 1 on the permlane result before the byte store
9589+ ret void
9590+ }
0 commit comments