-
Notifications
You must be signed in to change notification settings - Fork 15.3k
Open
Labels
Description
When a zeroed AMX tile has multiple uses, it is copied using a tilestore + tileload pair (a spill followed by a reload) instead of being rematerialized with another tilezero instruction. Example:
; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Reproducer: a single tilezero result (%zero_tile) is used four times -- as the
; accumulator input of two tdpbf16ps calls and as an incoming value of two phis.
;   %0, %1 : pointers the input tiles are loaded from
;   %2     : pointer the two result tiles are stored to
;   %3     : when true, run the .comp block; when false, go straight to .exit
; NOTE(review): the leading (i16 16, i16 64) arguments are presumably the tile
; shape (rows, bytes per row) and the trailing i64 128 the row stride -- confirm
; against the AMX intrinsic documentation.
define void @amx_phi(ptr %0, ptr %1, ptr %2, i1 %3) {
.entry:
; The zero tile is created once, before the branch, so it is live on both paths.
%zero_tile = tail call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64)
br i1 %3, label %.comp, label %.exit
.comp:
; Load two LHS tiles (from %0 and %0 + 1024 bytes) and one RHS tile (from %1).
%lhs_1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr %0, i64 128)
%ptr_1 = getelementptr i8, ptr %0, i64 1024
%lhs_2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr %ptr_1, i64 128)
%rhs_1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr %1, i64 128)
; Both tdpbf16ps calls take the same %zero_tile as their tile input, so two
; independent copies of the zero tile are needed here.
%acc_1 = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 16, i16 64, i16 64, x86_amx %zero_tile, x86_amx %lhs_1, x86_amx %rhs_1)
%acc_2 = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 16, i16 64, i16 64, x86_amx %zero_tile, x86_amx %lhs_2, x86_amx %rhs_1)
br label %.exit
.exit:
; Each result is the zero tile when coming from .entry, or the corresponding
; tdpbf16ps result when coming from .comp.
%res_1 = phi x86_amx [ %zero_tile, %.entry ], [ %acc_1, %.comp ]
%res_2 = phi x86_amx [ %zero_tile, %.entry ], [ %acc_2, %.comp ]
; Store the two results to %2 and %2 + 1024 bytes.
%ptr_out_1 = getelementptr i8, ptr %2, i64 0
tail call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %ptr_out_1, i64 128, x86_amx %res_1)
%ptr_out_2 = getelementptr i8, ptr %2, i64 1024
tail call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %ptr_out_2, i64 128, x86_amx %res_2)
ret void
}
; Function Attrs: nounwind
declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #0
; Function Attrs: nounwind
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) #0
; Function Attrs: nounwind
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #0
; Function Attrs: nounwind
declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #0
attributes #0 = { nounwind }
!llvm.module.flags = !{!0}
!0 = !{i32 2, !"Debug Info Version", i32 3}
Generated asm using llc test.ll -mcpu=sapphirerapids -O3:
tilezero %tmm0
...
tilestored %tmm0, 896(%rsp,%rbp) # 1024-byte Folded Spill
tileloadd 896(%rsp), %tmm1 # 1024-byte Folded Reload
...
tilestored %tmm0, 1920(%rsp,%rbp) # 1024-byte Folded Spill
tileloadd 1920(%rsp), %tmm1 # 1024-byte Folded Reload
In both cases, tilezero %tmm1 could be used instead, because %tmm0 is known to be defined by a tilezero instruction.