Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

typedef unsigned int uint;
typedef unsigned int __attribute__((ext_vector_type(2))) uint2;
typedef half __attribute__((ext_vector_type(2))) half2;

// CHECK-LABEL: @test_setprio_inc_wg(
Expand Down Expand Up @@ -368,6 +369,52 @@ void test_cvt_pk_f16_bf8(global half2* out, short a)
out[0] = __builtin_amdgcn_cvt_pk_f16_bf8(a);
}

// CHECK-LABEL: @test_permlane16_swap(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false)
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
// CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0
// CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false)
// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0
// CHECK-NEXT: [[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1
// CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0
// CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1
// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8
// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true)
// CHECK-NEXT: [[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0
// CHECK-NEXT: [[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1
// CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0
// CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1
// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8
// CHECK-NEXT: ret void
//
void test_permlane16_swap(global uint2* out, uint old, uint src) {
*out = __builtin_amdgcn_permlane16_swap(old, src, false, false);
*out = __builtin_amdgcn_permlane16_swap(old, src, true, false);
*out = __builtin_amdgcn_permlane16_swap(old, src, false, true);
}

// CHECK-LABEL: @test_cvt_f32_fp8_e5m3(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/VOP1Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1080,6 +1080,13 @@ multiclass VOP1_Real_FULL_t16_and_fake16_gfx1250<
VOP1_Real_FULL_with_name<GFX1250Gen, op, opName#"_fake16", asmName>;
}

multiclass VOP1_Real_OpSelIsDPP_gfx1250<bits<9> op> : VOP1_Real_e32<GFX1250Gen, op> {
defvar ps = !cast<VOP_Pseudo>(NAME#"_e64");
def _e64_gfx1250 :
VOP3_Real_Gen<ps, GFX1250Gen>,
VOP3OpSelIsDPP_gfx12<{0, 1, 1, op{6-0}}, ps.Pfl>;
}

defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX12Not12_50Gen, 0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">;
defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX1250Gen, 0x06c, "V_CVT_F32_FP8_gfx1250", "v_cvt_f32_fp8">;

Expand Down Expand Up @@ -1147,6 +1154,7 @@ defm V_MOV_B64 : VOP1_Real_FULL <GFX1250Gen, 0x1d>;

defm V_TANH_F32 : VOP1_Real_FULL<GFX1250Gen, 0x01e>;
defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x01f>;
defm V_PERMLANE16_SWAP_B32 : VOP1_Real_OpSelIsDPP_gfx1250<0x049>;
defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>;
defm V_PRNG_B32 : VOP1_Real_FULL<GFX1250Gen, 0x04b>;
defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">;
Expand Down
11 changes: 10 additions & 1 deletion llvm/lib/Target/AMDGPU/VOPInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -331,10 +331,19 @@ class VOP3OpSel_gfx9 <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {

// Special case for v_permlane16_swap_b32/v_permlane32_swap_b32
// op_sel[0]/op_sel[1] are treated as bound_ctrl and fi dpp operands.
class VOP3OpSelIsDPP_gfx9 <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
class VOP3OpSelIsDPP_base {
bits<1> fi;
bits<1> bound_ctrl;
}

class VOP3OpSelIsDPP_gfx9 <bits<10> op, VOPProfile P> : VOP3OpSelIsDPP_base, VOP3e_vi <op, P> {
// OPSEL[0] specifies FI
let Inst{11} = fi;
// OPSEL[1] specifies BOUND_CTRL
let Inst{12} = bound_ctrl;
}

class VOP3OpSelIsDPP_gfx12 <bits<10> op, VOPProfile P> : VOP3OpSelIsDPP_base, VOP3e_gfx11_gfx12 <op, P> {
// OPSEL[0] specifies FI
let Inst{11} = fi;
// OPSEL[1] specifies BOUND_CTRL
Expand Down
153 changes: 151 additions & 2 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s

; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR-SDAG %s
; RUN: not llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR-GISEL %s
Expand All @@ -17,6 +19,18 @@ define { i32, i32 } @v_permlane16_swap_b32_vv(i32 %vdst_old, i32 %src0_old) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
; GFX950-LABEL: v_permlane16_swap_b32_vv:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_permlane16_swap_b32_vv:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
ret { i32, i32 } %v
}
Expand All @@ -29,6 +43,22 @@ define { i32, i32 } @v_permlane16_swap_b32_vi(i32 %vdst_old) {
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
; GFX950-LABEL: v_permlane16_swap_b32_vi:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v1, 1
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_permlane16_swap_b32_vi:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 1, i1 false, i1 false)
ret { i32, i32 } %v
}
Expand All @@ -41,6 +71,22 @@ define { i32, i32 } @v_permlane16_swap_b32_vl(i32 %vdst_old) {
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
; GFX950-LABEL: v_permlane16_swap_b32_vl:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v1, 0xc1d1
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_permlane16_swap_b32_vl:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 0xc1d1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 49617, i1 false, i1 false)
ret { i32, i32 } %v
}
Expand All @@ -54,6 +100,23 @@ define { i32, i32 } @v_permlane16_swap_b32_iv(i32 %src0_old) {
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
; GFX950-LABEL: v_permlane16_swap_b32_iv:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: v_mov_b32_e32 v0, 1
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_permlane16_swap_b32_iv:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 1, i32 %src0_old, i1 false, i1 false)
ret { i32, i32 } %v
}
Expand All @@ -67,6 +130,23 @@ define { i32, i32 } @v_permlane16_swap_b32_ss(i32 inreg %vdst_old, i32 inreg %sr
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
; GFX950-LABEL: v_permlane16_swap_b32_ss:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_permlane16_swap_b32_ss:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
ret { i32, i32 } %v
}
Expand All @@ -80,6 +160,23 @@ define { i32, i32 } @v_permlane16_swap_b32_sv(i32 inreg %vdst_old, i32 %src0_old
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
; GFX950-LABEL: v_permlane16_swap_b32_sv:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_permlane16_swap_b32_sv:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
ret { i32, i32 } %v
}
Expand All @@ -92,6 +189,22 @@ define { i32, i32 } @v_permlane16_swap_b32_vs(i32 %vdst_old, i32 inreg %src0_old
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
; GFX950-LABEL: v_permlane16_swap_b32_vs:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v1, s0
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_permlane16_swap_b32_vs:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, s0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
ret { i32, i32 } %v
}
Expand All @@ -102,6 +215,18 @@ define { i32, i32 } @v_permlane16_swap_b32_vv_fi(i32 %vdst_old, i32 %src0_old) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_permlane16_swap_b32_e64 v0, v1 fi:1
; GCN-NEXT: s_setpc_b64 s[30:31]
; GFX950-LABEL: v_permlane16_swap_b32_vv_fi:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_permlane16_swap_b32_e64 v0, v1 fi:1
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_permlane16_swap_b32_vv_fi:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_permlane16_swap_b32_e64 v0, v1 fi:1
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 true, i1 false)
ret { i32, i32 } %v
}
Expand All @@ -112,6 +237,18 @@ define { i32, i32 } @v_permlane16_swap_b32_vv_bc(i32 %vdst_old, i32 %src0_old) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1
; GCN-NEXT: s_setpc_b64 s[30:31]
; GFX950-LABEL: v_permlane16_swap_b32_vv_bc:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_permlane16_swap_b32_vv_bc:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 true)
ret { i32, i32 } %v
}
Expand All @@ -122,6 +259,18 @@ define { i32, i32 } @v_permlane16_swap_b32_vv_fi_bc(i32 %vdst_old, i32 %src0_old
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1 fi:1
; GCN-NEXT: s_setpc_b64 s[30:31]
; GFX950-LABEL: v_permlane16_swap_b32_vv_fi_bc:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1 fi:1
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_permlane16_swap_b32_vv_fi_bc:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1 fi:1
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 true, i1 true)
ret { i32, i32 } %v
}
6 changes: 6 additions & 0 deletions llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s
Original file line number Diff line number Diff line change
Expand Up @@ -627,3 +627,9 @@ v_cvt_f32_fp8_e32 v1, 3

v_cvt_f32_fp8_e32 v1, v3
// GFX1250: v_cvt_f32_fp8_e32 v1, v3 ; encoding: [0x03,0xd9,0x02,0x7e]

v_permlane16_swap_b32 v1, v2
// GFX1250: v_permlane16_swap_b32_e32 v1, v2 ; encoding: [0x02,0x93,0x02,0x7e]

v_permlane16_swap_b32_e32 v1, v2
// GFX1250: v_permlane16_swap_b32_e32 v1, v2 ; encoding: [0x02,0x93,0x02,0x7e]
6 changes: 6 additions & 0 deletions llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s
Original file line number Diff line number Diff line change
Expand Up @@ -663,3 +663,9 @@ v_cvt_f32_fp8_e32 v1, 3

v_cvt_f32_fp8_e32 v1, v3
// GFX1250: v_cvt_f32_fp8_e32 v1, v3 ; encoding: [0x03,0xd9,0x02,0x7e]

v_permlane16_swap_b32 v1, v2
// GFX1250: v_permlane16_swap_b32_e32 v1, v2 ; encoding: [0x02,0x93,0x02,0x7e]

v_permlane16_swap_b32_e32 v1, v2
// GFX1250: v_permlane16_swap_b32_e32 v1, v2 ; encoding: [0x02,0x93,0x02,0x7e]
21 changes: 21 additions & 0 deletions llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s
Original file line number Diff line number Diff line change
Expand Up @@ -720,3 +720,24 @@ v_cvt_pk_f16_fp8 v1, v150 op_sel:[1]

v_cvt_pk_f16_fp8 v1, s2 op_sel:[1]
// GFX1250: v_cvt_pk_f16_fp8 v1, s2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0x02,0x00,0x00,0x00]

v_permlane16_swap_b32_e64 v1, v2
// GFX1250: v_permlane16_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0xc9,0xd5,0x02,0x01,0x00,0x00]

v_permlane16_swap_b32 v1, v2 bound_ctrl:0
// GFX1250: v_permlane16_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0xc9,0xd5,0x02,0x01,0x00,0x00]

v_permlane16_swap_b32 v1, v2 fi:0
// GFX1250: v_permlane16_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0xc9,0xd5,0x02,0x01,0x00,0x00]

v_permlane16_swap_b32 v1, v2 bound_ctrl:1
// GFX1250: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 ; encoding: [0x01,0x10,0xc9,0xd5,0x02,0x01,0x00,0x00]

v_permlane16_swap_b32 v1, v2 fi:1
// GFX1250: v_permlane16_swap_b32_e64 v1, v2 fi:1 ; encoding: [0x01,0x08,0xc9,0xd5,0x02,0x01,0x00,0x00]

v_permlane16_swap_b32 v1, v2 bound_ctrl:1 fi:1
// GFX1250: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 fi:1 ; encoding: [0x01,0x18,0xc9,0xd5,0x02,0x01,0x00,0x00]

v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 fi:1
// GFX1250: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 fi:1 ; encoding: [0x01,0x18,0xc9,0xd5,0x02,0x01,0x00,0x00]
Loading
Loading