From 49264cc2139f6c1c52d2417d465e5b1a9c9952a4 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Sat, 25 Oct 2025 18:59:42 +0000 Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20change?= =?UTF-8?q?s=20to=20main=20this=20commit=20is=20based=20on?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Created using spr 1.3.7 [skip ci] --- llvm/lib/Target/X86/X86.h | 15 ++- llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp | 1 + llvm/lib/Target/X86/X86LowerAMXType.cpp | 96 +++++++++++-------- llvm/lib/Target/X86/X86PassRegistry.def | 7 +- llvm/lib/Target/X86/X86TargetMachine.cpp | 2 +- .../test/CodeGen/X86/AMX/amx-combine-undef.ll | 3 +- llvm/test/CodeGen/X86/AMX/amx-combine.ll | 3 +- .../CodeGen/X86/AMX/amx-configO2toO0-lower.ll | 3 +- llvm/test/CodeGen/X86/AMX/amx-type.ll | 3 +- .../X86/AMX/lat-combine-amx-bitcast.ll | 3 +- .../X86/AMX/lat-transform-amx-bitcast.ll | 3 +- .../X86/amx_tile_pair_lower_type_O0.ll | 3 +- .../X86/amx_tile_pair_lower_type_O2.ll | 3 +- 13 files changed, 95 insertions(+), 50 deletions(-) diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 706ab2b62bc1b..51b540a7a51d0 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -14,7 +14,10 @@ #ifndef LLVM_LIB_TARGET_X86_X86_H #define LLVM_LIB_TARGET_X86_X86_H +#include "llvm/IR/Analysis.h" +#include "llvm/IR/PassManager.h" #include "llvm/Support/CodeGen.h" +#include "llvm/Target/TargetMachine.h" namespace llvm { @@ -162,7 +165,17 @@ FunctionPass *createX86WinEHUnwindV2Pass(); /// The pass transforms load/store <256 x i32> to AMX load/store intrinsics /// or split the data to two <128 x i32>. 
-FunctionPass *createX86LowerAMXTypePass(); +class X86LowerAMXTypePass : public PassInfoMixin<X86LowerAMXTypePass> { +private: + const TargetMachine *TM; + +public: + X86LowerAMXTypePass(const TargetMachine *TM) : TM(TM) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); + static bool isRequired() { return true; } +}; + +FunctionPass *createX86LowerAMXTypeLegacyPass(); /// The pass transforms amx intrinsics to scalar operation if the function has /// optnone attribute or it is O0. diff --git a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp index d979517e12af6..2c0443da673a8 100644 --- a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp +++ b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp @@ -10,6 +10,7 @@ /// TODO: Port CodeGen passes to new pass manager. //===----------------------------------------------------------------------===// +#include "X86.h" #include "X86ISelDAGToDAG.h" #include "X86TargetMachine.h" diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp index 0ba71ada8638e..8ffd454f4f73e 100644 --- a/llvm/lib/Target/X86/X86LowerAMXType.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -46,12 +46,14 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Analysis.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsX86.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -64,7 +66,7 @@ using namespace llvm; using namespace PatternMatch; -#define DEBUG_TYPE "lower-amx-type" +#define DEBUG_TYPE "x86-lower-amx-type" static bool isAMXCast(Instruction *II) { return match(II, @@ -137,7 +139,7 @@ static Instruction *getFirstNonAllocaInTheEntryBlock(Function &F) { class ShapeCalculator { 
private: - TargetMachine *TM = nullptr; + const TargetMachine *TM = nullptr; // In AMX intrinsics we let Shape = {Row, Col}, but the // RealCol = Col / ElementSize. We may use the RealCol @@ -145,7 +147,7 @@ class ShapeCalculator { std::map<Value *, Value *> Col2Row, Row2Col; public: - ShapeCalculator(TargetMachine *TargetM) : TM(TargetM) {} + ShapeCalculator(const TargetMachine *TargetM) : TM(TargetM) {} std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo); std::pair<Value *, Value *> getShape(PHINode *Phi); Value *getRowFromCol(Instruction *II, Value *V, unsigned Granularity); @@ -1432,8 +1434,58 @@ bool X86LowerAMXCast::transformAllAMXCast() { return Change; } +bool lowerAmxType(Function &F, const TargetMachine *TM, + TargetLibraryInfo *TLI) { + // Performance optimization: most code doesn't use AMX, so return early if + // there are no instructions that produce AMX values. This is sufficient, as + // AMX arguments and constants are not allowed -- so any producer of an AMX + // value must be an instruction. + // TODO: find a cheaper way for this, without looking at all instructions. + if (!containsAMXCode(F)) + return false; + + bool C = false; + ShapeCalculator SC(TM); + X86LowerAMXCast LAC(F, &SC); + C |= LAC.combineAMXcast(TLI); + // There might be remaining AMXcast after combineAMXcast and they should be + // handled elegantly. + C |= LAC.transformAllAMXCast(); + + X86LowerAMXType LAT(F, &SC); + C |= LAT.visit(); + + // Prepare for fast register allocation at O0. + // Todo: May better check the volatile model of AMX code, not just + // by checking Attribute::OptimizeNone and CodeGenOptLevel::None. + if (TM->getOptLevel() == CodeGenOptLevel::None) { + // If Front End not use O0 but the Mid/Back end use O0, (e.g. + // "Clang -O2 -S -emit-llvm t.c" + "llc t.ll") we should make + // sure the amx data is volatile, that is necessary for AMX fast + // register allocation. 
+ if (!F.hasFnAttribute(Attribute::OptimizeNone)) { + X86VolatileTileData VTD(F); + C = VTD.volatileTileData() || C; + } + } + + return C; +} + } // anonymous namespace +PreservedAnalyses X86LowerAMXTypePass::run(Function &F, + FunctionAnalysisManager &FAM) { + TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(F); + bool Changed = lowerAmxType(F, TM, &TLI); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = PreservedAnalyses::none(); + PA.preserveSet<CFGAnalyses>(); + return PA; +} + namespace { class X86LowerAMXTypeLegacyPass : public FunctionPass { @@ -1443,44 +1495,10 @@ class X86LowerAMXTypeLegacyPass : public FunctionPass { X86LowerAMXTypeLegacyPass() : FunctionPass(ID) {} bool runOnFunction(Function &F) override { - // Performance optimization: most code doesn't use AMX, so return early if - // there are no instructions that produce AMX values. This is sufficient, as - // AMX arguments and constants are not allowed -- so any producer of an AMX - // value must be an instruction. - // TODO: find a cheaper way for this, without looking at all instructions. - if (!containsAMXCode(F)) - return false; - - bool C = false; TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); - - ShapeCalculator SC(TM); - X86LowerAMXCast LAC(F, &SC); - C |= LAC.combineAMXcast(TLI); - // There might be remaining AMXcast after combineAMXcast and they should be - // handled elegantly. - C |= LAC.transformAllAMXCast(); - - X86LowerAMXType LAT(F, &SC); - C |= LAT.visit(); - - // Prepare for fast register allocation at O0. - // Todo: May better check the volatile model of AMX code, not just - // by checking Attribute::OptimizeNone and CodeGenOptLevel::None. - if (TM->getOptLevel() == CodeGenOptLevel::None) { - // If Front End not use O0 but the Mid/Back end use O0, (e.g. - // "Clang -O2 -S -emit-llvm t.c" + "llc t.ll") we should make - // sure the amx data is volatile, that is nessary for AMX fast - // register allocation. 
- if (!F.hasFnAttribute(Attribute::OptimizeNone)) { - X86VolatileTileData VTD(F); - C = VTD.volatileTileData() || C; - } - } - - return C; + return lowerAmxType(F, TM, TLI); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -1501,6 +1519,6 @@ INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false, false) -FunctionPass *llvm::createX86LowerAMXTypePass() { +FunctionPass *llvm::createX86LowerAMXTypeLegacyPass() { return new X86LowerAMXTypeLegacyPass(); } diff --git a/llvm/lib/Target/X86/X86PassRegistry.def b/llvm/lib/Target/X86/X86PassRegistry.def index 3f2a4331c41f2..fc25d55d3059a 100644 --- a/llvm/lib/Target/X86/X86PassRegistry.def +++ b/llvm/lib/Target/X86/X86PassRegistry.def @@ -12,11 +12,16 @@ // NOTE: NO INCLUDE GUARD DESIRED! +#ifndef FUNCTION_PASS +#define FUNCTION_PASS(NAME, CREATE_PASS) +#endif +FUNCTION_PASS("x86-lower-amx-type", X86LowerAMXTypePass(this)) +#undef FUNCTION_PASS + #ifndef DUMMY_FUNCTION_PASS #define DUMMY_FUNCTION_PASS(NAME, CREATE_PASS) #endif DUMMY_FUNCTION_PASS("lower-amx-intrinsics", X86LowerAMXIntrinsics(*this)) -DUMMY_FUNCTION_PASS("lower-amx-type", X86LowerAMXTypePass(*this)) DUMMY_FUNCTION_PASS("x86-partial-reduction", X86PartialReduction()) DUMMY_FUNCTION_PASS("x86-winehstate", WinEHStatePass()) #undef DUMMY_FUNCTION_PASS diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 8dd6f3d97ccea..9a76abcd351bf 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -423,7 +423,7 @@ void X86PassConfig::addIRPasses() { // We add both pass anyway and when these two passes run, we skip the pass // based on the option level and option attribute. 
addPass(createX86LowerAMXIntrinsicsPass()); - addPass(createX86LowerAMXTypePass()); + addPass(createX86LowerAMXTypeLegacyPass()); TargetPassConfig::addIRPasses(); diff --git a/llvm/test/CodeGen/X86/AMX/amx-combine-undef.ll b/llvm/test/CodeGen/X86/AMX/amx-combine-undef.ll index faa119cd037f1..5f0682abbea12 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-combine-undef.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-combine-undef.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s +; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s +; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s define void @undef_2phi(ptr%buf) { ; CHECK-LABEL: @undef_2phi( diff --git a/llvm/test/CodeGen/X86/AMX/amx-combine.ll b/llvm/test/CodeGen/X86/AMX/amx-combine.ll index 07f489c633c55..72e072dd15761 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-combine.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-combine.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s +; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s +; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s define void @combine_store(ptr%p) { ; CHECK-LABEL: @combine_store( diff --git a/llvm/test/CodeGen/X86/AMX/amx-configO2toO0-lower.ll b/llvm/test/CodeGen/X86/AMX/amx-configO2toO0-lower.ll index 6c536f11d4bb1..4ac406c1603ee 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-configO2toO0-lower.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-configO2toO0-lower.ll @@ -1,4 +1,5 @@ -; RUN: opt < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -lower-amx-type -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 
-mattr=+avx512f -x86-lower-amx-type -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -passes=x86-lower-amx-type -S | FileCheck %s @buf = dso_local global [1024 x i8] zeroinitializer, align 16 @buf2 = dso_local global [1024 x i8] zeroinitializer, align 16 diff --git a/llvm/test/CodeGen/X86/AMX/amx-type.ll b/llvm/test/CodeGen/X86/AMX/amx-type.ll index 1d9af2b13cdfd..294195a6541bf 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-type.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-type.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s +; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s +; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s %struct.__tile_str = type { i16, i16, <256 x i32> } diff --git a/llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll b/llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll index b70668f7a3dea..cdce783d0a237 100644 --- a/llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll +++ b/llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s +; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s +; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s define void @combine_amx_cast_inside_bb() { ; CHECK-LABEL: @combine_amx_cast_inside_bb( diff --git a/llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll b/llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll index 3a5b424540ff1..0b419bb8573d5 100644 --- a/llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll +++ b/llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll @@ -1,5 +1,6 @@ ; NOTE: 
Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s +; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s +; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s %struct.__tile_str = type { i16, i16, <256 x i32> } diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll index 52641c65c90e9..3549875e858a9 100644 --- a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll +++ b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py - ; RUN: opt --codegen-opt-level=0 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s + ; RUN: opt --codegen-opt-level=0 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s + ; RUN: opt --codegen-opt-level=0 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s @buf = dso_local global [2048 x i8] zeroinitializer, align 16 diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll index 346d46b6b16c2..96966264e0515 100644 --- a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll +++ b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s +; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s +; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s @buf = dso_local global [2048 x i8] zeroinitializer, align 16 @buf2 = dso_local global [2048 x i8] zeroinitializer, align 16