Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion llvm/lib/Target/X86/X86.h
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,18 @@ FunctionPass *createX86LowerAMXTypeLegacyPass();

/// The pass transforms amx intrinsics to scalar operation if the function has
/// optnone attribute or it is O0.
FunctionPass *createX86LowerAMXIntrinsicsPass();
class X86LowerAMXIntrinsicsPass
: public PassInfoMixin<X86LowerAMXIntrinsicsPass> {
private:
const TargetMachine *TM;

public:
X86LowerAMXIntrinsicsPass(const TargetMachine *TM) : TM(TM) {}
PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
static bool isRequired() { return true; }
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't really understand the circumstances when this pass is supposed to run. It took all possible strategies for controlling a pass and used all of them. This "only run for different flavors of -O0, except not by default" seems very unsound

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is for -O0 or none O0 but has optnone attributes. It solves the redundant stack load/stores which eliminated in other cases.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure we should be using a backend pass enabled at O0 to remove redundant stack load/stores if those are supposed to be cleaned up in the middle end.

Either way, this patch just intends to port the pass, not fix any latent issues or clean up any latent tech debt of this sort.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds like a solution to a problem that shouldn't be solved?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. The comment at the top of the file:

/// To decouple the dependency of the shape, we transform amx intrinsics
/// to scalar operation, so that compiling doesn't fail. In long term, we
/// should improve fast register allocation to allocate amx register.

makes it seem like this might require some effort though.

};

FunctionPass *createX86LowerAMXIntrinsicsLegacyPass();

InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM,
const X86Subtarget &,
Expand Down
48 changes: 38 additions & 10 deletions llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,15 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Analysis.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
Expand All @@ -40,7 +43,7 @@
using namespace llvm;
using namespace PatternMatch;

#define DEBUG_TYPE "lower-amx-intrinsics"
#define DEBUG_TYPE "x86-lower-amx-intrinsics"

#ifndef NDEBUG
static bool isV256I32Ty(Type *Ty) {
Expand Down Expand Up @@ -626,6 +629,37 @@ bool X86LowerAMXIntrinsics::visit() {
return C;
}

namespace {
bool shouldRunLowerAMXIntrinsics(const Function &F, const TargetMachine *TM) {
return X86ScalarizeAMX && (F.hasFnAttribute(Attribute::OptimizeNone) ||
TM->getOptLevel() == CodeGenOptLevel::None);
}

bool runLowerAMXIntrinsics(Function &F, DominatorTree *DT, LoopInfo *LI) {
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);

X86LowerAMXIntrinsics LAT(F, DTU, LI);
return LAT.visit();
}
} // namespace

PreservedAnalyses X86LowerAMXIntrinsicsPass::run(Function &F,
FunctionAnalysisManager &FAM) {
if (!shouldRunLowerAMXIntrinsics(F, TM))
return PreservedAnalyses::all();

DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
bool Changed = runLowerAMXIntrinsics(F, &DT, &LI);
if (!Changed)
return PreservedAnalyses::all();

PreservedAnalyses PA = PreservedAnalyses::none();
PA.preserve<DominatorTreeAnalysis>();
PA.preserve<LoopAnalysis>();
return PA;
}

namespace {
class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass {
public:
Expand All @@ -634,21 +668,15 @@ class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass {
X86LowerAMXIntrinsicsLegacyPass() : FunctionPass(ID) {}

bool runOnFunction(Function &F) override {
if (!X86ScalarizeAMX)
return false;
TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
if (!F.hasFnAttribute(Attribute::OptimizeNone) &&
TM->getOptLevel() != CodeGenOptLevel::None)
if (!shouldRunLowerAMXIntrinsics(F, TM))
return false;

auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);

X86LowerAMXIntrinsics LAT(F, DTU, LI);
return LAT.visit();
return runLowerAMXIntrinsics(F, DT, LI);
}
StringRef getPassName() const override { return "Lower AMX intrinsics"; }

Expand All @@ -668,6 +696,6 @@ INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(X86LowerAMXIntrinsicsLegacyPass, DEBUG_TYPE, PassName,
false, false)

FunctionPass *llvm::createX86LowerAMXIntrinsicsPass() {
FunctionPass *llvm::createX86LowerAMXIntrinsicsLegacyPass() {
return new X86LowerAMXIntrinsicsLegacyPass();
}
2 changes: 1 addition & 1 deletion llvm/lib/Target/X86/X86PassRegistry.def
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,13 @@
#ifndef FUNCTION_PASS
#define FUNCTION_PASS(NAME, CREATE_PASS)
#endif
FUNCTION_PASS("x86-lower-amx-intrinsics", X86LowerAMXIntrinsicsPass(this))
FUNCTION_PASS("x86-lower-amx-type", X86LowerAMXTypePass(this))
#undef FUNCTION_PASS

#ifndef DUMMY_FUNCTION_PASS
#define DUMMY_FUNCTION_PASS(NAME, CREATE_PASS)
#endif
DUMMY_FUNCTION_PASS("lower-amx-intrinsics", X86LowerAMXIntrinsics(*this))
DUMMY_FUNCTION_PASS("x86-partial-reduction", X86PartialReduction())
DUMMY_FUNCTION_PASS("x86-winehstate", WinEHStatePass())
#undef DUMMY_FUNCTION_PASS
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/X86/X86TargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -422,7 +422,7 @@ void X86PassConfig::addIRPasses() {

// We add both pass anyway and when these two passes run, we skip the pass
// based on the option level and option attribute.
addPass(createX86LowerAMXIntrinsicsPass());
addPass(createX86LowerAMXIntrinsicsLegacyPass());
addPass(createX86LowerAMXTypeLegacyPass());

TargetPassConfig::addIRPasses();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
; RUN: opt -mtriple=x86_64 -x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
; RUN: opt -mtriple=x86_64 -passes=x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s

define dso_local void @test_no_bitcast(ptr %A_mem, ptr %B_mem, ptr %C_mem) local_unnamed_addr #0 {
; CHECK-LABEL: @test_no_bitcast(
Expand Down
3 changes: 2 additions & 1 deletion llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
; RUN: opt -mtriple=x86_64 -x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
; RUN: opt -mtriple=x86_64 -passes=x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s

define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, ptr%ptr, i64 %stride, ptr %vptr) {
; CHECK-LABEL: @test_amx_load_non_O0(
Expand Down
Loading