@@ -1,17 +1,48 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 --stop-after=si-fix-sgpr-copies < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s
 
 ; iglp.opt should not be flagged as clobbering the memory operand for the global_load, and we should be able to
 ; lower into the scalar version (i.e. should not need to lower into vector version with waterfall loop)
-; CHECK-NOT: WATERFALL
 
-define amdgpu_kernel void @_attn_forward_fp8e5_128x32x64_BW128(ptr addrspace(1) %in, ptr addrspace(3) %out) {
+define amdgpu_kernel void @func(ptr addrspace(1) %in, ptr addrspace(3) %out) {
+; CHECK-LABEL: func:
+; CHECK: ; %bb.0: ; %.lr.ph
+; CHECK-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; CHECK-NEXT: s_mov_b64 s[8:9], 0
+; CHECK-NEXT: s_mov_b64 s[10:11], 0
+; CHECK-NEXT: s_mov_b32 s3, 32
+; CHECK-NEXT: s_mov_b32 s2, 0
+; CHECK-NEXT: s_mov_b64 s[12:13], 0
+; CHECK-NEXT: .LBB0_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_add_u32 s10, s6, s12
+; CHECK-NEXT: s_addc_u32 s11, s7, s13
+; CHECK-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0
+; CHECK-NEXT: s_add_i32 s3, s3, -1
+; CHECK-NEXT: s_cmp_lg_u32 s3, 0
+; CHECK-NEXT: ; iglp_opt mask(0x00000000)
+; CHECK-NEXT: s_cbranch_scc1 .LBB0_1
+; CHECK-NEXT: ; %bb.2: ; %end
+; CHECK-NEXT: s_and_b32 s1, s1, 0xffff
+; CHECK-NEXT: s_mov_b32 s3, s2
+; CHECK-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: v_and_b32_e32 v0, 0xff, v0
+; CHECK-NEXT: ds_write_b64 v2, v[0:1]
+; CHECK-NEXT: s_endpgm
 .lr.ph:
-  br label %1
+  br label %loop
 
-1: ; preds = %1, %.lr.ph
-  %addr = phi ptr addrspace(1) [ null, %.lr.ph ], [ %gep, %1 ]
-  %offset = phi i64 [ 0, %.lr.ph ], [ %nextOff, %1 ]
-  %inc = phi i32 [ 0, %.lr.ph ], [ %incCond, %1 ]
+loop: ; preds = %loop, %.lr.ph
+  %addr = phi ptr addrspace(1) [ null, %.lr.ph ], [ %gep, %loop ]
+  %offset = phi i64 [ 0, %.lr.ph ], [ %nextOff, %loop ]
+  %inc = phi i32 [ 0, %.lr.ph ], [ %incCond, %loop ]
   %rsrc = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %addr, i16 0, i32 0, i32 0)
   %load = tail call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
   %load.bc = bitcast <2 x i32> %load to <8 x i8>
@@ -25,15 +56,13 @@ define amdgpu_kernel void @_attn_forward_fp8e5_128x32x64_BW128(ptr addrspace(1)
   %nextOff = extractelement <1 x i64> %unmaskedload49, i64 0
   %incCond = add i32 %inc, 1
   %cond = icmp eq i32 %incCond, 32
-  br i1 %cond, label %2, label %1
+  br i1 %cond, label %end, label %loop
 
-2:
+end:
   store <4 x half> %shuff, ptr addrspace(3) %out, align 8
   ret void
 }
 
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
 declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) readnone, i16, i32, i32) #0
 
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read)
 declare <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) nocapture readonly, i32, i32, i32 immarg) #1
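
For reference, the loop body elided by the hunk above contains the iglp.opt call that the generated checks match as "; iglp_opt mask(0x00000000)". Below is a minimal standalone sketch, not part of this commit (@sketch is a hypothetical kernel name; the buffer intrinsic signatures follow the declarations above), of the pattern the test guards: an iglp.opt scheduling hint placed alongside a buffer load whose resource is built from a uniform pointer, which should still lower to the scalar load path rather than a waterfall loop.

; Sketch only; @sketch is a hypothetical name.
define amdgpu_kernel void @sketch(ptr addrspace(1) %p) {
  ; Build a buffer resource from a uniform 64-bit pointer.
  %rsrc = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %p, i16 0, i32 0, i32 0)
  ; Load through the resource; with a uniform %rsrc this can select the
  ; scalar/buffer form directly instead of a per-lane waterfall loop.
  %v = tail call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
  ; Scheduling hint only: iglp.opt must not be modeled as clobbering memory.
  ; Mask 0 selects one of the predefined instruction-group strategies.
  call void @llvm.amdgcn.iglp.opt(i32 0)
  ret void
}

declare void @llvm.amdgcn.iglp.opt(i32 immarg)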