|
| 1 | +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| 2 | +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefix=GCN %s |
| 3 | + |
; Shared-memory (LDS, addrspace(3)) scratch storage modeled after CUB's
; BlockRadixSort TempStorage: a single Uninitialized wrapper holding
; 26 x ulonglong2 (pairs of i64), 16-byte aligned. The kernel below indexes
; into it with unrelated array types, so only its size/alignment matter here.
%"struct.cub::BlockRadixSort<unsigned char, 32, 1, cub::NullType, 1, true, cub::BLOCK_SCAN_RAKING>::TempStorage" = type { %"struct.cub::Uninitialized" }
%"struct.cub::Uninitialized" = type { [26 x %struct.ulonglong2.0] }
%struct.ulonglong2.0 = type { i64, i64 }

@tmp_storage = external dso_local local_unnamed_addr addrspace(3) global %"struct.cub::BlockRadixSort<unsigned char, 32, 1, cub::NullType, 1, true, cub::BLOCK_SCAN_RAKING>::TempStorage", align 16
| 9 | + |
| 10 | +declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() |
| 11 | + |
| 12 | +declare void @llvm.amdgcn.s.barrier() |
| 13 | + |
; Kernel: each workitem repeatedly extracts up to one bit of %a (starting at a
; moving bit position), increments a per-thread i16 counter in LDS selected by
; that bit, and loops until the counter value is no longer greater than %b.
; The GCN check lines are autogenerated (update_llc_test_checks.py); do not
; edit them by hand.
define amdgpu_kernel void @Kernel_func(i8 %a, i32 %b, i32 %c, i32 %end_bit) {
; GCN-LABEL: Kernel_func:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: s_load_dword s6, s[4:5], 0x30
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GCN-NEXT: s_mov_b64 s[2:3], 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s0, s0, 0xff
; GCN-NEXT: v_mov_b32_e32 v3, s1
; GCN-NEXT: v_lshl_or_b32 v2, v0, 2, 2
; GCN-NEXT: .LBB0_1: ; %while.cond
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_sub_u32_e32 v4, s6, v3
; GCN-NEXT: v_lshrrev_b32_e64 v3, v3, s0
; GCN-NEXT: v_min_i32_e32 v4, 1, v4
; GCN-NEXT: v_bfe_u32 v3, v3, 0, v4
; GCN-NEXT: v_lshlrev_b32_e32 v3, 1, v3
; GCN-NEXT: v_sub_u32_e32 v4, v2, v3
; GCN-NEXT: ds_read_u16 v3, v4
; GCN-NEXT: ds_write_b32 v1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(1)
; GCN-NEXT: v_add_u16_e32 v3, 1, v3
; GCN-NEXT: v_cmp_ge_i32_e32 vcc, s1, v3
; GCN-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN-NEXT: ds_write_b16 v4, v3
; GCN-NEXT: s_barrier
; GCN-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN-NEXT: s_cbranch_execnz .LBB0_1
; GCN-NEXT: ; %bb.2: ; %end
; GCN-NEXT: s_endpgm
entry:
  ; Per-lane id; %idx points at an i32 slot of the LDS scratch, reinterpreted
  ; as [2 x [32 x [2 x i16]]] rather than the global's declared TempStorage
  ; type (size/alignment compatible; only the address computation matters).
  %tid = tail call noundef i32 @llvm.amdgcn.workitem.id.x()
  %idx = getelementptr inbounds [2 x [32 x [2 x i16]]], ptr addrspace(3) @tmp_storage, i32 0, i32 0, i32 %tid
  br label %while.cond

while.cond: ; preds = %while.cond, %entry
  ; %begin_bit starts at %b, then follows the zero-extended counter (%conv).
  %begin_bit = phi i32 [ %b, %entry ], [ %conv, %while.cond ]
  ; Extract at most one bit: width = min(end_bit - begin_bit, 1).
  %diff = sub nsw i32 %end_bit, %begin_bit
  %smin = tail call i32 @llvm.smin.i32(i32 %diff, i32 1)
  ; i32 store into the same LDS region the i16 RMW below touches, with a
  ; distinct TBAA tag (!0 "int" vs !4 "short").
  ; NOTE(review): presumably intentional to exercise LDS waitcnt/aliasing in
  ; the generated code — confirm against the test's purpose.
  store i32 %tid, ptr addrspace(3) %idx, align 4, !tbaa !0
  ; %not = mask of the low %smin bits (~(-1 << smin)).
  %notmask = shl nsw i32 -1, %smin
  %not = xor i32 %notmask, -1
  ; Extracted field: ((zext %a) >> begin_bit) & mask.
  %conv.iii = zext i8 %a to i32
  %shr.iii = lshr i32 %conv.iii, %begin_bit
  %and.iii = and i32 %shr.iii, %not
  ; Bucket select: 1 - field (field is 0 or 1 since the mask is <= 1 bit).
  %sub = sub nsw i32 1, %and.iii
  ; Increment the 16-bit counter tmp_storage[%tid][%sub].
  %arrayidx = getelementptr inbounds [32 x [2 x i16]], ptr addrspace(3) @tmp_storage, i32 0, i32 %tid, i32 %sub
  %value = load i16, ptr addrspace(3) %arrayidx, align 2, !tbaa !4
  %add = add i16 %value, 1
  store i16 %add, ptr addrspace(3) %arrayidx, align 2, !tbaa !4
  ; Workgroup barrier inside the loop (s_barrier in the checked output).
  tail call void @llvm.amdgcn.s.barrier()
  ; Loop while the new counter value exceeds %b.
  %conv = zext i16 %add to i32
  %cmp = icmp sgt i32 %conv, %b
  br i1 %cmp, label %while.cond, label %end

end: ; preds = %while.cond
  ret void
}
| 74 | + |
| 75 | +declare i32 @llvm.smin.i32(i32, i32) |
| 76 | + |
; TBAA access tags: !0 ("int") marks the i32 store to %idx, !4 ("short") marks
; the i16 load/store through %arrayidx; both descend from the common
; "omnipotent char" node under the Simple C++ TBAA root, so the two access
; types are considered distinct.
!0 = !{!1, !1, i64 0}
!1 = !{!"int", !2, i64 0}
!2 = !{!"omnipotent char", !3, i64 0}
!3 = !{!"Simple C++ TBAA"}
!4 = !{!5, !5, i64 0}
!5 = !{!"short", !2, i64 0}