[AA] Make alias check more accurately. #137747

fengfeng09 · 2025-04-29T03:48:24Z

TBAA could not check the alias between same memory location access in different type, so this will mislead the AAResults::alias to return a NoAlias which will make a necessary dep missing between the two location ld/st.

llvmbot · 2025-04-29T03:48:57Z

@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-llvm-analysis

Author: fengfeng (fengfeng09)

Changes

TBAA could not check the alias between same memory location access in different type, so this will mislead the AAResults::alias to return a NoAlias which will make a necessary dep missing between the two location ld/st.

Full diff: https://github.com/llvm/llvm-project/pull/137747.diff

2 Files Affected:

(modified) llvm/lib/Analysis/AliasAnalysis.cpp (+1-1)
(added) llvm/test/CodeGen/AMDGPU/tbaa-load-store.ll (+77)

diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp
index efabf69b06047..3833b84ffdddb 100644
--- a/llvm/lib/Analysis/AliasAnalysis.cpp
+++ b/llvm/lib/Analysis/AliasAnalysis.cpp
@@ -122,7 +122,7 @@ AliasResult AAResults::alias(const MemoryLocation &LocA,
   AAQI.Depth++;
   for (const auto &AA : AAs) {
     Result = AA->alias(LocA, LocB, AAQI, CtxI);
-    if (Result != AliasResult::MayAlias)
+    if (Result != AliasResult::NoAlias)
       break;
   }
   AAQI.Depth--;
diff --git a/llvm/test/CodeGen/AMDGPU/tbaa-load-store.ll b/llvm/test/CodeGen/AMDGPU/tbaa-load-store.ll
new file mode 100644
index 0000000000000..bc8248e8876c2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/tbaa-load-store.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+
+%"struct.cub::BlockRadixSort<unsigned char, 32, 1, cub::NullType, 1, true, cub::BLOCK_SCAN_RAKING>::TempStorage" = type { %"struct.cub::Uninitialized" }
+%"struct.cub::Uninitialized" = type { [26 x %struct.ulonglong2.0] }
+%struct.ulonglong2.0 = type { i64, i64 }
+
+$Kernel_func = comdat any
+@tmp_storage = external dso_local local_unnamed_addr addrspace(3) global %"struct.cub::BlockRadixSort<unsigned char, 32, 1, cub::NullType, 1, true, cub::BLOCK_SCAN_RAKING>::TempStorage", align 16
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare void @llvm.amdgcn.s.barrier()
+
+define amdgpu_kernel void @Kernel_func(i8 %a, i32 %b, i32 %c, i32 %end_bit) {
+; GCN-LABEL: Kernel_func:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT:    s_load_dword s6, s[4:5], 0x30
+; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    s_mov_b64 s[2:3], 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_and_b32 s0, s0, 0xff
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_lshl_or_b32 v2, v0, 2, 2
+; GCN-NEXT:  .LBB0_1: ; %while.cond
+; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    v_sub_u32_e32 v4, s6, v3
+; GCN-NEXT:    v_lshrrev_b32_e64 v3, v3, s0
+; GCN-NEXT:    v_min_i32_e32 v4, 1, v4
+; GCN-NEXT:    v_bfe_u32 v3, v3, 0, v4
+; GCN-NEXT:    v_lshlrev_b32_e32 v3, 1, v3
+; GCN-NEXT:    ds_write_b32 v1, v0
+; GCN-NEXT:    v_sub_u32_e32 v4, v2, v3
+; GCN-NEXT:    ds_read_u16 v3, v4
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_add_u16_e32 v3, 1, v3
+; GCN-NEXT:    v_cmp_ge_i32_e32 vcc, s1, v3
+; GCN-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GCN-NEXT:    ds_write_b16 v4, v3
+; GCN-NEXT:    s_barrier
+; GCN-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GCN-NEXT:    s_cbranch_execnz .LBB0_1
+; GCN-NEXT:  ; %bb.2: ; %end
+; GCN-NEXT:    s_endpgm
+entry:
+  %0 = tail call noundef i32 @llvm.amdgcn.workitem.id.x()
+  %arrayidx3 = getelementptr inbounds [2 x [32 x [2 x i16]]], ptr addrspace(3) @tmp_storage, i32 0, i32 0, i32 %0
+  br label %while.cond
+while.cond:
+  %begin_bit = phi i32 [ %b, %entry ], [ %conv, %while.cond ]
+  %sub.i.i.i = sub nsw i32 %end_bit, %begin_bit
+  %cond.i.i.i = tail call i32 @llvm.smin.i32(i32 %sub.i.i.i, i32 1)
+  store i32 %0, ptr addrspace(3) %arrayidx3, align 4, !tbaa !10
+  %notmask.ii = shl nsw i32 -1, %cond.i.i.i
+  %sub.iii = xor i32 %notmask.ii, -1
+  %conv.iii = zext i8 %a to i32
+  %shr.iii = lshr i32 %conv.iii, %begin_bit
+  %and.iii = and i32 %shr.iii, %sub.iii
+  %sub = sub nsw i32 1, %and.iii
+  %arrayidx8.ii = getelementptr inbounds [32 x [2 x i16]], ptr addrspace(3) @tmp_storage, i32 0, i32 %0, i32 %sub
+  %3 = load i16, ptr addrspace(3) %arrayidx8.ii, align 2, !tbaa !12
+  %add = add i16 %3, 1
+  store i16 %add, ptr addrspace(3) %arrayidx8.ii, align 2, !tbaa !12
+  tail call void @llvm.amdgcn.s.barrier()
+  %conv = zext i16 %add to i32
+  %cmp7 = icmp sgt i32 %conv, %b
+  br i1 %cmp7, label %while.cond, label %end
+end:
+  ret void
+}
+
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C++ TBAA"}
+!10 = !{!11, !11, i64 0}
+!11 = !{!"int", !6, i64 0}
+!12 = !{!13, !13, i64 0}
+!13 = !{!"short", !6, i64 0}

arsenm · 2025-04-29T07:00:11Z