+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-; GCN-LABEL: {{^}}private_load_maybe_divergent:
-; GCN: buffer_load_dword
-; GCN-NOT: s_load_dword s
-; GCN: flat_load_dword
-; GCN-NOT: s_load_dword s
 define amdgpu_kernel void @private_load_maybe_divergent(ptr addrspace(4) %k, ptr %flat) {
+; GCN-LABEL: private_load_maybe_divergent:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_add_i32 s12, s12, s17
+; GCN-NEXT:    s_mov_b64 s[22:23], s[2:3]
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT:    s_mov_b64 s[20:21], s[0:1]
+; GCN-NEXT:    s_add_u32 s20, s20, s17
+; GCN-NEXT:    s_addc_u32 s21, s21, 0
+; GCN-NEXT:    buffer_load_dword v0, v0, s[20:23], 0 offen glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GCN-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; GCN-NEXT:    flat_load_dword v0, v[0:1]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    flat_store_dword v[0:1], v0
+; GCN-NEXT:    s_endpgm
   %load = load volatile i32, ptr addrspace(5) poison, align 4
   %gep = getelementptr inbounds i32, ptr addrspace(4) %k, i32 %load
   %maybe.not.uniform.load = load i32, ptr addrspace(4) %gep, align 4
   store i32 %maybe.not.uniform.load, ptr addrspace(1) poison
   ret void
 }
 
-; GCN-LABEL: {{^}}flat_load_maybe_divergent:
-; GCN: s_load_dwordx4
-; GCN-NOT: s_load
-; GCN: flat_load_dword
-; GCN-NOT: s_load
-; GCN: flat_load_dword
-; GCN-NOT: s_load
-; GCN: flat_store_dword
 define amdgpu_kernel void @flat_load_maybe_divergent(ptr addrspace(4) %k, ptr %flat) {
+; GCN-LABEL: flat_load_maybe_divergent:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT:    s_add_i32 s12, s12, s17
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    flat_load_dword v0, v[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GCN-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; GCN-NEXT:    flat_load_dword v0, v[0:1]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    flat_store_dword v[0:1], v0
+; GCN-NEXT:    s_endpgm
   %load = load i32, ptr %flat, align 4
   %gep = getelementptr inbounds i32, ptr addrspace(4) %k, i32 %load
   %maybe.not.uniform.load = load i32, ptr addrspace(4) %gep, align 4
@@ -34,12 +64,33 @@ define amdgpu_kernel void @flat_load_maybe_divergent(ptr addrspace(4) %k, ptr %f
 ; last values are divergent due to the carry in glue (such that
 ; divergence needs to propagate through glue if there are any non-void
 ; outputs)
-; GCN-LABEL: {{^}}wide_carry_divergence_error:
-; GCN: v_sub_u32_e32
-; GCN: v_subb_u32_e32
-; GCN: v_subb_u32_e32
-; GCN: v_subb_u32_e32
 define <2 x i128> @wide_carry_divergence_error(i128 %arg) {
+; GCN-LABEL: wide_carry_divergence_error:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_ffbh_u32_e32 v0, v0
+; GCN-NEXT:    v_ffbh_u32_e32 v4, v2
+; GCN-NEXT:    v_add_u32_e64 v0, s[4:5], v0, 32 clamp
+; GCN-NEXT:    v_ffbh_u32_e32 v1, v1
+; GCN-NEXT:    v_add_u32_e32 v4, vcc, 32, v4
+; GCN-NEXT:    v_min3_u32 v0, v0, v1, 64
+; GCN-NEXT:    v_add_u32_e32 v0, vcc, 64, v0
+; GCN-NEXT:    v_ffbh_u32_e32 v5, v3
+; GCN-NEXT:    v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
+; GCN-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GCN-NEXT:    v_min_u32_e32 v4, v4, v5
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GCN-NEXT:    v_sub_u32_e32 v0, vcc, 0, v0
+; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-NEXT:    v_subb_u32_e32 v2, vcc, 0, v3, vcc
+; GCN-NEXT:    v_subb_u32_e32 v3, vcc, 0, v3, vcc
+; GCN-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-NEXT:    v_mov_b32_e32 v5, 0
+; GCN-NEXT:    v_mov_b32_e32 v6, 0
+; GCN-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = call i128 @llvm.ctlz.i128(i128 %arg, i1 false)
   %i1 = sub i128 0, %i
   %i2 = insertelement <2 x i128> zeroinitializer, i128 %i1, i64 0
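
; ---
; Aside (not part of this commit): a minimal sketch of the carry-chain
; pattern the last test exercises, with the ctlz stripped out; the function
; name is hypothetical. An i128 negate legalizes into a borrow chain (on
; this target, one v_sub_u32_e32 followed by three v_subb_u32_e32), and each
; v_subb consumes the previous borrow through glue, which is why divergence
; needs to propagate through glued nodes that have non-void outputs.
define i128 @carry_chain_sketch(i128 %x) {
  %neg = sub i128 0, %x
  ret i128 %neg
}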