; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -stop-after=amdgpu-isel -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11
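; Checks SelectionDAG (-global-isel=0) instruction selection of llvm.fma.v2f16
; on gfx1100 with +real-true16, once with a divergent f16 operand and once with
; a uniform one.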

@const_half = internal constant half 1.0

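; Divergent case: the f16 operand is loaded at a workitem.id.x-indexed address,
; so the fabs/fneg/select is selected to the true16 16-bit VGPR forms
; (GLOBAL_LOAD_SHORT_D16_SADDR_t16, V_AND_B16_t16, V_XOR_B16_t16,
; V_CNDMASK_B16_t16) before the value is packed for V_PK_FMA_F16.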
define amdgpu_kernel void @fma_v2f16_divergent(
  ; GFX11-LABEL: name: fma_v2f16_divergent
  ; GFX11: bb.0 (%ir-block.0):
  ; GFX11-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
  ; GFX11-NEXT: {{ $}}
  ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
  ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
  ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s256) from %ir.r.kernarg.offset, align 4, addrspace 4)
  ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 68, 0 :: (dereferenceable invariant load (s32) from %ir.d.kernarg.offset, addrspace 4)
  ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub1
  ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub0
  ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
  ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
  ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub3
  ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub2
  ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1
  ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub5
  ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub4
  ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
  ; GFX11-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub7
  ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub6
  ; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1
  ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1023
  ; GFX11-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]](s32), killed [[S_MOV_B32_]], implicit $exec
  ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
  ; GFX11-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_AND_B32_e64_]], implicit $exec
  ; GFX11-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 killed [[REG_SEQUENCE1]], killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s16) from %ir.f.gep, addrspace 1)
  ; GFX11-NEXT: [[V_AND_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_AND_B16_t16_e64 0, 32767, 0, [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], 0, implicit $exec
  ; GFX11-NEXT: [[V_XOR_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_XOR_B16_t16_e64 0, -32768, 0, [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], 0, implicit $exec
  ; GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
  ; GFX11-NEXT: S_CMP_LG_U32 killed [[S_LOAD_DWORD_IMM]], killed [[S_MOV_B32_2]], implicit-def $scc
  ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
  ; GFX11-NEXT: [[V_CNDMASK_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CNDMASK_B16_t16_e64 0, killed [[V_XOR_B16_t16_e64_]], 0, killed [[V_AND_B16_t16_e64_]], killed [[COPY10]], 0, implicit $exec
  ; GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
  ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
  ; GFX11-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_CNDMASK_B16_t16_e64_]], %subreg.lo16, killed [[DEF]], %subreg.hi16
  ; GFX11-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[REG_SEQUENCE2]], 0, 0 :: ("amdgpu-noclobber" load (s32) from %ir.4, addrspace 1)
  ; GFX11-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[REG_SEQUENCE3]], 0, 0 :: ("amdgpu-noclobber" load (s32) from %ir.5, addrspace 1)
  ; GFX11-NEXT: [[V_PK_FMA_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_FMA_F16 0, killed [[REG_SEQUENCE4]], 8, killed [[S_LOAD_DWORD_IMM1]], 8, killed [[S_LOAD_DWORD_IMM2]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
  ; GFX11-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[V_PK_FMA_F16_]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s32) into %ir.2, addrspace 1)
  ; GFX11-NEXT: S_ENDPGM 0
    ptr addrspace(1) %r,
    ptr addrspace(1) %fptr,
    ptr addrspace(1) %b,
    ptr addrspace(1) %c,
    i32 %d) {

  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.gep = getelementptr half, ptr addrspace(1) %fptr, i32 %idx
  %f = load half, ptr addrspace(1) %f.gep
  %f.abs = call half @llvm.fabs.f16(half %f)
  %f.neg = fneg half %f
  %setcc = icmp ne i32 %d, 0
  %select = select i1 %setcc, half %f.abs, half %f.neg
  %vec = insertelement <2 x half> poison, half %select, i32 0
  %a.val = insertelement <2 x half> %vec, half %select, i32 1
  %b.v = load i32, ptr addrspace(1) %b
  %b.val = bitcast i32 %b.v to <2 x half>
  %c.v = load i32, ptr addrspace(1) %c
  %c.val = bitcast i32 %c.v to <2 x half>
  %r.val = call <2 x half> @llvm.fma.v2f16(<2 x half> %a.val, <2 x half> %b.val, <2 x half> %c.val)
  store <2 x half> %r.val, ptr addrspace(1) %r
  ret void
}

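; Uniform case: every operand comes from a uniform address, but the scalar f16
; load is still selected as GLOBAL_LOAD_SHORT_D16_SADDR_t16 into a VGPR_16 and
; then copied over to an SGPR before feeding V_PK_FMA_F16.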
define amdgpu_kernel void @fma_v2f16_uniform(
  ; GFX11-LABEL: name: fma_v2f16_uniform
  ; GFX11: bb.0 (%ir-block.0):
  ; GFX11-NEXT: liveins: $sgpr4_sgpr5
  ; GFX11-NEXT: {{ $}}
  ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
  ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s256) from %ir.r.kernarg.offset, align 4, addrspace 4)
  ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub1
  ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub0
  ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
  ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
  ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub3
  ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub2
  ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1
  ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub5
  ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub4
  ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
  ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub7
  ; GFX11-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub6
  ; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1
  ; GFX11-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 killed [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s16) from %ir.3, addrspace 1)
  ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]]
  ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[COPY9]]
  ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[REG_SEQUENCE2]], 0, 0 :: ("amdgpu-noclobber" load (s32) from %ir.4, addrspace 1)
  ; GFX11-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[REG_SEQUENCE3]], 0, 0 :: ("amdgpu-noclobber" load (s32) from %ir.5, addrspace 1)
  ; GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed [[S_LOAD_DWORD_IMM1]]
  ; GFX11-NEXT: [[V_PK_FMA_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_FMA_F16 0, killed [[S_MOV_B32_]], 8, killed [[S_LOAD_DWORD_IMM]], 8, [[COPY10]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
  ; GFX11-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[V_PK_FMA_F16_]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s32) into %ir.2, addrspace 1)
  ; GFX11-NEXT: S_ENDPGM 0
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b,
    ptr addrspace(1) %c) {
  %a.half = load half, ptr addrspace(1) %a
  %vec = insertelement <2 x half> poison, half %a.half, i32 0
  %a.val = insertelement <2 x half> %vec, half %a.half, i32 1
  %b.v = load i32, ptr addrspace(1) %b
  %b.val = bitcast i32 %b.v to <2 x half>
  %c.v = load i32, ptr addrspace(1) %c
  %c.val = bitcast i32 %c.v to <2 x half>
  %r.val = call <2 x half> @llvm.fma.v2f16(<2 x half> %a.val, <2 x half> %b.val, <2 x half> %c.val)
  store <2 x half> %r.val, ptr addrspace(1) %r
  ret void
}