Skip to content

[X86] [AMX] Misoptimized copy of a zero tile #112763

@ienkovich

Description

@ienkovich

When a zero AMX tile has multiple uses, it is copied using a tilestored + tileloadd pair (a spill to the stack followed by a reload) instead of another tilezero instruction. Example:

; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Reproducer: one zeroed AMX tile (%zero_tile) is consumed by two tdpbf16ps
; calls in .comp and by two phi nodes in .exit, i.e. it has four uses.
; %0 and %1 are the tile load sources, %2 is the tile store destination, and
; %3 selects whether the .comp block runs.
define void @amx_phi(ptr %0, ptr %1, ptr %2, i1 %3) {
.entry:
  ; The single zero tile that every later use refers to.
  %zero_tile = tail call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64)
  ; %3 true -> .comp; %3 false -> straight to .exit, where both phis pick %zero_tile.
  br i1 %3, label %.comp, label %.exit

.comp:
  ; Two tiles are loaded from %0 (byte offsets 0 and 1024) and one from %1.
  %lhs_1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr %0, i64 128)
  %ptr_1 = getelementptr i8, ptr %0, i64 1024
  %lhs_2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr %ptr_1, i64 128)
  %rhs_1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr %1, i64 128)
  ; Both calls pass %zero_tile as their first x86_amx operand and share %rhs_1;
  ; they differ only in the %lhs_1 / %lhs_2 operand.
  %acc_1 = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 16, i16 64, i16 64, x86_amx %zero_tile, x86_amx %lhs_1, x86_amx %rhs_1)
  %acc_2 = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 16, i16 64, i16 64, x86_amx %zero_tile, x86_amx %lhs_2, x86_amx %rhs_1)
  br label %.exit

.exit:
  ; Each result is %zero_tile when arriving from .entry, or the matching
  ; tdpbf16ps result when arriving from .comp.
  %res_1 = phi x86_amx [ %zero_tile, %.entry ], [ %acc_1, %.comp ]
  %res_2 = phi x86_amx [ %zero_tile, %.entry ], [ %acc_2, %.comp ]
  ; Store the two results to %2 at byte offsets 0 and 1024.
  %ptr_out_1 = getelementptr i8, ptr %2, i64 0
  tail call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %ptr_out_1, i64 128, x86_amx %res_1)
  %ptr_out_2 = getelementptr i8, ptr %2, i64 1024
  tail call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %ptr_out_2, i64 128, x86_amx %res_2)

  ret void
}

; Function Attrs: nounwind
declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #0
; Function Attrs: nounwind
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) #0
; Function Attrs: nounwind
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #0
; Function Attrs: nounwind
declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #0

attributes #0 = { nounwind }

!llvm.module.flags = !{!0}

!0 = !{i32 2, !"Debug Info Version", i32 3}

Generated asm using llc test.ll -mcpu=sapphirerapids -O3:

        tilezero        %tmm0
...
        tilestored      %tmm0, 896(%rsp,%rbp)   # 1024-byte Folded Spill
        tileloadd       896(%rsp), %tmm1        # 1024-byte Folded Reload
...
        tilestored      %tmm0, 1920(%rsp,%rbp)  # 1024-byte Folded Spill
        tileloadd       1920(%rsp), %tmm1       # 1024-byte Folded Reload

In both cases, tilezero %tmm1 could be used instead, because %tmm0 is known to be defined by a tilezero instruction.

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions