
Conversation

@phoebewang
Contributor

Found this problem when investigating #91207

@phoebewang phoebewang marked this pull request as ready for review May 16, 2024 11:22
@llvmbot
Member

llvmbot commented May 16, 2024

@llvm/pr-subscribers-backend-x86

Author: Phoebe Wang (phoebewang)

Changes

Found this problem when investigating #91207
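In short: when the operand of an @llvm.x86.cast.vector.to.tile cast is an all-zero vector, the cast can be folded directly into a tilezero intrinsic, so the backend no longer has to materialize the zero vector on the stack and reload it as a tile. Roughly, in IR (a sketch mirroring the comment in the patch; %row and %col stand for the shape values taken from the consuming AMX intrinsic):

  %t = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> zeroinitializer)
  ; becomes
  %t = call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col)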


Full diff: https://github.com/llvm/llvm-project/pull/92384.diff

2 Files Affected:

  • (modified) llvm/lib/Target/X86/X86LowerAMXType.cpp (+31)
  • (modified) llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll (+18-68)
diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp
index b69058787a4e2..539a9b2e4b6d2 100644
--- a/llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -709,6 +709,7 @@ class X86LowerAMXCast {
   X86LowerAMXCast(Function &F) : Func(F), DT(nullptr) {}
   bool combineCastStore(IntrinsicInst *Cast, StoreInst *ST);
   bool combineLoadCast(IntrinsicInst *Cast, LoadInst *LD);
+  bool combineTilezero(IntrinsicInst *Cast);
   bool combineLdSt(SmallVectorImpl<Instruction *> &Casts);
   bool combineAMXcast(TargetLibraryInfo *TLI);
   bool transformAMXCast(IntrinsicInst *AMXCast);
@@ -988,6 +989,27 @@ bool X86LowerAMXCast::combineLoadCast(IntrinsicInst *Cast, LoadInst *LD) {
   return EraseLoad;
 }
 
+// %19 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> zeroinitializer)
+// -->
+// %19 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col)
+bool X86LowerAMXCast::combineTilezero(IntrinsicInst *Cast) {
+  Value *Row = nullptr, *Col = nullptr;
+  Use &U = *(Cast->use_begin());
+  unsigned OpNo = U.getOperandNo();
+  auto *II = cast<IntrinsicInst>(U.getUser());
+  if (!isAMXIntrinsic(II))
+    return false;
+
+  std::tie(Row, Col) = getShape(II, OpNo);
+  std::array<Value *, 2> Args = {Row, Col};
+
+  IRBuilder<> Builder(Cast);
+  Value *NewInst = Builder.CreateIntrinsic(Intrinsic::x86_tilezero_internal,
+                                           std::nullopt, Args);
+  Cast->replaceAllUsesWith(NewInst);
+  return true;
+}
+
 bool X86LowerAMXCast::combineLdSt(SmallVectorImpl<Instruction *> &Casts) {
   bool Change = false;
   for (auto *Cast : Casts) {
@@ -1011,6 +1033,14 @@ bool X86LowerAMXCast::combineLdSt(SmallVectorImpl<Instruction *> &Casts) {
       for (auto *Store : DeadStores)
         Store->eraseFromParent();
     } else { // x86_cast_vector_to_tile
+      //  %19 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> zeroinitializer)
+      //  -->
+      //  %19 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col)
+      if (dyn_cast<ConstantAggregateZero>(Cast->getOperand(0))) {
+        Change |= combineTilezero(cast<IntrinsicInst>(Cast));
+        continue;
+      }
+
       SmallVector<Instruction *, 2> DeadLoads;
       auto *Load = dyn_cast<LoadInst>(Cast->getOperand(0));
       if (!Load || !Load->hasOneUse())
@@ -1024,6 +1054,7 @@ bool X86LowerAMXCast::combineLdSt(SmallVectorImpl<Instruction *> &Casts) {
         // Set the operand is null so that load instruction can be erased.
         Cast->setOperand(0, nullptr);
         Load->eraseFromParent();
+        Change = true;
       }
     }
   }
diff --git a/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll b/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
index 7511e5953dac1..5efdba81e76be 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
@@ -52,26 +52,13 @@ declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_
 declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)
 
-define void @PR90954(ptr %0, ptr %1, i32 %2) {
+define void @PR90954(ptr %0, ptr %1, i32 %2) nounwind {
 ; CHECK-LABEL: PR90954:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    pushq %rbp
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset %rbp, -16
-; CHECK-NEXT:    movq %rsp, %rbp
-; CHECK-NEXT:    .cfi_def_cfa_register %rbp
-; CHECK-NEXT:    pushq %r15
 ; CHECK-NEXT:    pushq %r14
-; CHECK-NEXT:    pushq %r13
-; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    andq $-1024, %rsp # imm = 0xFC00
-; CHECK-NEXT:    subq $5120, %rsp # imm = 0x1400
-; CHECK-NEXT:    .cfi_offset %rbx, -56
-; CHECK-NEXT:    .cfi_offset %r12, -48
-; CHECK-NEXT:    .cfi_offset %r13, -40
-; CHECK-NEXT:    .cfi_offset %r14, -32
-; CHECK-NEXT:    .cfi_offset %r15, -24
+; CHECK-NEXT:    subq $2912, %rsp # imm = 0xB60
 ; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
@@ -87,29 +74,26 @@ define void @PR90954(ptr %0, ptr %1, i32 %2) {
 ; CHECK-NEXT:    movw $64, %cx
 ; CHECK-NEXT:    movw $16, %di
 ; CHECK-NEXT:    movb $1, %r8b
-; CHECK-NEXT:    movl $64, %r9d
-; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %r10
-; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %r11
-; CHECK-NEXT:    xorl %ebx, %ebx
-; CHECK-NEXT:    xorl %r14d, %r14d
+; CHECK-NEXT:    xorl %r9d, %r9d
+; CHECK-NEXT:    xorl %r10d, %r10d
 ; CHECK-NEXT:    jmp .LBB1_1
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB1_5: # in Loop: Header=BB1_1 Depth=1
-; CHECK-NEXT:    incq %r14
-; CHECK-NEXT:    addl %edx, %ebx
+; CHECK-NEXT:    incq %r10
+; CHECK-NEXT:    addl %edx, %r9d
 ; CHECK-NEXT:  .LBB1_1: # =>This Loop Header: Depth=1
 ; CHECK-NEXT:    # Child Loop BB1_2 Depth 2
-; CHECK-NEXT:    movslq %ebx, %r15
-; CHECK-NEXT:    leaq (%rsi,%r15,4), %r15
-; CHECK-NEXT:    xorl %r12d, %r12d
-; CHECK-NEXT:    xorl %r13d, %r13d
+; CHECK-NEXT:    movslq %r9d, %r11
+; CHECK-NEXT:    leaq (%rsi,%r11,4), %r11
+; CHECK-NEXT:    xorl %ebx, %ebx
+; CHECK-NEXT:    xorl %r14d, %r14d
 ; CHECK-NEXT:    jmp .LBB1_2
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB1_4: # in Loop: Header=BB1_2 Depth=2
-; CHECK-NEXT:    tilestored %tmm1, (%r15,%rax)
-; CHECK-NEXT:    incq %r13
-; CHECK-NEXT:    addq $64, %r15
-; CHECK-NEXT:    decq %r12
+; CHECK-NEXT:    tilestored %tmm1, (%r11,%rax)
+; CHECK-NEXT:    incq %r14
+; CHECK-NEXT:    addq $64, %r11
+; CHECK-NEXT:    decq %rbx
 ; CHECK-NEXT:    je .LBB1_5
 ; CHECK-NEXT:  .LBB1_2: # Parent Loop BB1_1 Depth=1
 ; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
@@ -118,46 +102,12 @@ define void @PR90954(ptr %0, ptr %1, i32 %2) {
 ; CHECK-NEXT:    testb %r8b, %r8b
 ; CHECK-NEXT:    jne .LBB1_4
 ; CHECK-NEXT:  # %bb.3: # in Loop: Header=BB1_2 Depth=2
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    tileloadd (%r10,%r9), %tmm1
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    tileloadd (%r11,%r9), %tmm2
+; CHECK-NEXT:    tilezero %tmm1
+; CHECK-NEXT:    tilezero %tmm2
 ; CHECK-NEXT:    tdpbf16ps %tmm2, %tmm1, %tmm0
-; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    movabsq $64, %rax
-; CHECK-NEXT:    tilestored %tmm0, 3072(%rsp,%rax) # 1024-byte Folded Spill
+; CHECK-NEXT:    movabsq $64, %rbp
+; CHECK-NEXT:    tilestored %tmm0, 896(%rsp,%rbp) # 1024-byte Folded Spill
 ; CHECK-NEXT:    tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm1 # 1024-byte Folded Reload
-; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; CHECK-NEXT:    jmp .LBB1_4
   %4 = shl i32 %2, 4
   %5 = icmp eq i64 0, 0

@github-actions

github-actions bot commented May 16, 2024

⚠️ C/C++ code formatter, clang-format found issues in your code. ⚠️

You can test this locally with the following command:
git-clang-format --diff origin/main HEAD --extensions cpp -- llvm/lib/Target/X86/X86LowerAMXType.cpp

⚠️ The reproduction instructions above might return results for more than one PR in a stack if you are using a stacked PR workflow. You can limit the results by changing origin/main to the base branch/commit you want to compare against.
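To apply the fixes in place rather than just view them, the same tool can be run without --diff (a sketch, assuming you are on the PR branch and origin/main is the base you want to format against; this rewrites the touched lines in the working tree):
git-clang-format --extensions cpp origin/main -- llvm/lib/Target/X86/X86LowerAMXType.cpp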

View the diff from clang-format here.
diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp
index 0ba71ada8..342582535 100644
--- a/llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -1176,7 +1176,8 @@ bool X86LowerAMXCast::combineLoadCast(IntrinsicInst *Cast, LoadInst *LD) {
   return EraseLoad;
 }
 
-// %19 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> zeroinitializer)
+// %19 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>
+// zeroinitializer)
 // -->
 // %19 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col)
 bool X86LowerAMXCast::combineTilezero(IntrinsicInst *Cast) {
@@ -1219,9 +1220,11 @@ bool X86LowerAMXCast::combineLdSt(SmallVectorImpl<Instruction *> &Casts) {
       for (auto *Store : DeadStores)
         Store->eraseFromParent();
     } else { // x86_cast_vector_to_tile
-      //  %19 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> zeroinitializer)
+      //  %19 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x
+      //  i32> zeroinitializer)
       //  -->
-      //  %19 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col)
+      //  %19 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16
+      //  %col)
       if (isa<ConstantAggregateZero>(Cast->getOperand(0))) {
         Change |= combineTilezero(cast<IntrinsicInst>(Cast));
         continue;

@RKSimon
Collaborator

RKSimon commented Oct 2, 2025

@phoebewang what happened with this?

@phoebewang
Contributor Author

phoebewang commented Oct 2, 2025

@phoebewang what happened with this?

Seems it's still working.

@RKSimon RKSimon left a comment (Collaborator)

LGTM with one minor

// %19 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> zeroinitializer)
// -->
// %19 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col)
if (dyn_cast<ConstantAggregateZero>(Cast->getOperand(0))) {

dyn_cast<> -> isa<>
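That is, the guard in combineLdSt becomes the following (the same one-liner already visible in the clang-format diff above); isa<> is the idiomatic choice when the result of the cast is not used:

      if (isa<ConstantAggregateZero>(Cast->getOperand(0))) {
        Change |= combineTilezero(cast<IntrinsicInst>(Cast));
        continue;
      }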

@phoebewang phoebewang enabled auto-merge (squash) October 3, 2025 00:47
@phoebewang phoebewang merged commit 487cdf1 into llvm:main Oct 3, 2025
6 of 8 checks passed
mahesh-attarde pushed a commit to mahesh-attarde/llvm-project that referenced this pull request Oct 3, 2025
MixedMatched pushed a commit to MixedMatched/llvm-project that referenced this pull request Oct 3, 2025