Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 89 additions & 21 deletions llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,11 @@ static cl::opt<bool> UseLIRCodeSizeHeurs(
"with -Os/-Oz"),
cl::init(true), cl::Hidden);

// Hidden command-line switch, off by default. When set, pattern stores are
// considered even if the memset_pattern16 library function is unavailable,
// and processLoopStridedStore emits a call to the memset_pattern intrinsic
// instead of a call to memset_pattern16.
static cl::opt<bool> EnableMemsetPatternIntrinsic(
"loop-idiom-enable-memset-pattern-intrinsic",
cl::desc("Enable use of the memset_pattern intrinsic."), cl::init(false),
cl::Hidden);

namespace {

class LoopIdiomRecognize {
Expand Down Expand Up @@ -300,7 +305,8 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) {
HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
HasMemcpy = TLI->has(LibFunc_memcpy);

if (HasMemset || HasMemsetPattern || HasMemcpy)
if (HasMemset || HasMemsetPattern || EnableMemsetPatternIntrinsic ||
HasMemcpy)
if (SE->hasLoopInvariantBackedgeTakenCount(L))
return runOnCountableLoop();

Expand Down Expand Up @@ -457,7 +463,8 @@ LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
// It looks like we can use SplatValue.
return LegalStoreKind::Memset;
}
if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
if (!UnorderedAtomic && (HasMemsetPattern || EnableMemsetPatternIntrinsic) &&
!DisableLIRP::Memset &&
// Don't create memset_pattern16s with address spaces.
StorePtr->getType()->getPointerAddressSpace() == 0 &&
getMemSetPatternValue(StoredVal, DL)) {
Expand Down Expand Up @@ -993,6 +1000,46 @@ static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
SCEV::FlagNUW);
}

/// Convert the constant describing a memset pattern into a single integer
/// constant that can be passed as the pattern operand of the memset_pattern
/// intrinsic.
///
/// Accepted inputs:
///  * a ConstantInt, which is returned unchanged
///    (NOTE(review): presumably already 128 bits wide here; confirm against
///    the code that produces the pattern value);
///  * a ConstantFP whose bit pattern is exactly 128 bits wide, which is
///    converted via its raw bits;
///  * a ConstantDataArray of integer or floating-point elements totalling at
///    most 128 bits, whose elements are concatenated into one 128-bit value.
///
/// Any other input, or an array wider than 128 bits, is a fatal error.
///
/// \param Context            context in which the resulting constant is created.
/// \param MemSetPatternValue the pattern constant to convert.
/// \returns the pattern as an integer constant.
static ConstantInt *
memSetPatternValueToI128ConstantInt(LLVMContext &Context,
                                    Value *MemSetPatternValue) {
  if (auto *CI = dyn_cast<ConstantInt>(MemSetPatternValue))
    return CI;

  // A 128-bit scalar floating-point constant is neither a ConstantInt nor a
  // ConstantDataArray; reinterpret its bits instead of aborting on it.
  if (auto *CFP = dyn_cast<ConstantFP>(MemSetPatternValue)) {
    APInt Bits = CFP->getValueAPF().bitcastToAPInt();
    if (Bits.getBitWidth() == 128)
      return ConstantInt::get(Context, Bits);
  }

  if (auto *Array = dyn_cast<ConstantDataArray>(MemSetPatternValue)) {
    Type *ElementType = Array->getElementType();
    const unsigned ElementSize = Array->getElementByteSize() * 8;

    APInt Result(128, 0);
    unsigned TotalBits = 0;

    // Elements are appended in index order, so element 0 ends up in the most
    // significant bits of the result.
    // NOTE(review): this is only independent of target byte order when all
    // elements are identical; confirm the producer guarantees that.
    for (unsigned I = 0; I < Array->getNumElements(); ++I) {
      if (TotalBits + ElementSize > 128)
        report_fatal_error("Pattern value unexpectedly greater than 128 bits");

      APInt ElementBits;
      if (ElementType->isIntegerTy())
        ElementBits = Array->getElementAsAPInt(I);
      else if (ElementType->isFloatingPointTy())
        ElementBits = Array->getElementAsAPFloat(I).bitcastToAPInt();
      else
        llvm_unreachable("Unexpected element type");

      // Make room for the new element in the low bits, then OR it in.
      Result = (Result << ElementSize) | ElementBits.zextOrTrunc(128);
      TotalBits += ElementSize;
    }

    return ConstantInt::get(Context, Result);
  }

  report_fatal_error("Encountered unrecognised type");
}

/// processLoopStridedStore - We see a strided store of some value. If we can
/// transform this into a memset or memset_pattern in the loop preheader, do so.
bool LoopIdiomRecognize::processLoopStridedStore(
Expand Down Expand Up @@ -1070,7 +1117,8 @@ bool LoopIdiomRecognize::processLoopStridedStore(
Value *NumBytes =
Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());

if (!SplatValue && !isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16))
if (!SplatValue && !(isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16) ||
EnableMemsetPatternIntrinsic))
return Changed;

AAMDNodes AATags = TheStore->getAAMetadata();
Expand All @@ -1087,24 +1135,44 @@ bool LoopIdiomRecognize::processLoopStridedStore(
BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment),
/*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
} else {
assert (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16));
// Everything is emitted in default address space
Type *Int8PtrTy = DestInt8PtrTy;

StringRef FuncName = "memset_pattern16";
FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16,
Builder.getVoidTy(), Int8PtrTy, Int8PtrTy, IntIdxTy);
inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI);

// Otherwise we should form a memset_pattern16. PatternValue is known to be
// an constant array of 16-bytes. Plop the value into a mergable global.
GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
GlobalValue::PrivateLinkage,
PatternValue, ".memset_pattern");
GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
GV->setAlignment(Align(16));
Value *PatternPtr = GV;
NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
assert(isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16) ||
EnableMemsetPatternIntrinsic);
if (EnableMemsetPatternIntrinsic) {
// Everything is emitted in default address space

// Get or insert the intrinsic declaration
Function *MemsetPatternIntrinsic = Intrinsic::getDeclaration(
M, Intrinsic::memset_pattern,
{DestInt8PtrTy, Builder.getInt128Ty(), Builder.getInt64Ty()});

// Create the call to the intrinsic
NewCall = Builder.CreateCall(
MemsetPatternIntrinsic,
{BasePtr,
memSetPatternValueToI128ConstantInt(M->getContext(), PatternValue),
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Two high level suggestions to simplify this code:

  • Change getMemSetPatternValue to directly return the i128 result. There's no reason it should return an array and then be converted again. The global variable can just be an i128 type, and the pointer can be passed. The array is (should be?) an irrelevant implementation detail.
  • Move the libcall creation to lowering for the intrinsic, and always generate the intrinsic. The enable becomes not whether to use the intrinsic, but when to generate the intrinsic (i.e. are we limiting ourselves to the case where we know we have a libcall to expand to.)

NumBytes, ConstantInt::getFalse(M->getContext())});
} else {
// Everything is emitted in default address space
Type *Int8PtrTy = DestInt8PtrTy;

StringRef FuncName = "memset_pattern16";
FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16,
Builder.getVoidTy(), Int8PtrTy,
Int8PtrTy, IntIdxTy);
inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI);

// Otherwise we should form a memset_pattern16. PatternValue is known to
// be an constant array of 16-bytes. Plop the value into a mergable
// global.
GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
GlobalValue::PrivateLinkage,
PatternValue, ".memset_pattern");
GV->setUnnamedAddr(
GlobalValue::UnnamedAddr::Global); // Ok to merge these.
GV->setAlignment(Align(16));
Value *PatternPtr = GV;
NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
}

// Set the TBAA info if present.
if (AATags.TBAA)
Expand Down
141 changes: 141 additions & 0 deletions llvm/test/Transforms/LoopIdiom/memset-pattern-intrinsic.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes="loop-idiom" -loop-idiom-enable-memset-pattern-intrinsic < %s -S | FileCheck %s

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"

target triple = "x86_64-apple-darwin10.0.0"


; Stores the double constant 3.14159 to 16 consecutive elements of %p (the
; loop exits when %inc == 16), with scalar TBAA metadata on the store.
; The assertions expect one 128-byte llvm.memset_pattern call in the entry
; block that carries !tbaa metadata, and no store left in for.body.
define dso_local void @double_memset(ptr nocapture %p) {
; CHECK-LABEL: @double_memset(
; CHECK-NEXT: entry:
; CHECK-NEXT: call void @llvm.memset_pattern.p0.i128.i64(ptr [[P:%.*]], i128 85118011523600494056561698149391631982, i64 128, i1 false), !tbaa [[TBAA0:![0-9]+]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK: for.body:
; CHECK-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 [[I_07]]
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
;
entry:
br label %for.body

for.cond.cleanup:
ret void

for.body:
%i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%ptr1 = getelementptr inbounds double, ptr %p, i64 %i.07
store double 3.14159e+00, ptr %ptr1, align 1, !tbaa !5
%inc = add nuw nsw i64 %i.07, 1
%exitcond.not = icmp eq i64 %inc, 16
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}


; Same 16-iteration loop of double stores as a constant-trip-count memset
; pattern, but the store is tagged with the TBAA access node !10 instead of
; the scalar tag. The assertions expect one 128-byte llvm.memset_pattern call
; in the entry block that carries !tbaa metadata.
define dso_local void @struct_memset(ptr nocapture %p) {
; CHECK-LABEL: @struct_memset(
; CHECK-NEXT: entry:
; CHECK-NEXT: call void @llvm.memset_pattern.p0.i128.i64(ptr [[P:%.*]], i128 85118011523600494056561698149391631982, i64 128, i1 false), !tbaa [[TBAA4:![0-9]+]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK: for.body:
; CHECK-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 [[I_07]]
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
;
entry:
br label %for.body

for.cond.cleanup:
ret void

for.body:
%i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%ptr1 = getelementptr inbounds double, ptr %p, i64 %i.07
store double 3.14159e+00, ptr %ptr1, align 1, !tbaa !10
%inc = add nuw nsw i64 %i.07, 1
%exitcond.not = icmp eq i64 %inc, 16
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

; Variable trip count: the loop stores the double constant 3.14159 until %inc
; equals %len. The assertions expect the byte length to be computed in the
; entry block as %len shifted left by 3 and passed to llvm.memset_pattern.
; The expected call line does not mention !tbaa, although the store is tagged
; with !10.
define dso_local void @var_memset(ptr nocapture %p, i64 %len) {
; CHECK-LABEL: @var_memset(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = shl nuw i64 [[LEN:%.*]], 3
; CHECK-NEXT: call void @llvm.memset_pattern.p0.i128.i64(ptr [[P:%.*]], i128 85118011523600494056561698149391631982, i64 [[TMP0]], i1 false)
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK: for.body:
; CHECK-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 [[I_07]]
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[LEN]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
;
entry:
br label %for.body

for.cond.cleanup:
ret void

for.body:
%i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%ptr1 = getelementptr inbounds double, ptr %p, i64 %i.07
store double 3.14159e+00, ptr %ptr1, align 1, !tbaa !10
%inc = add nuw nsw i64 %i.07, 1
%exitcond.not = icmp eq i64 %inc, %len
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

; Integer pattern: stores i32 1 to 10000 consecutive elements of %P. The
; assertions expect one llvm.memset_pattern call of 40000 bytes whose i128
; pattern operand is the i32 value 1 repeated four times
; (2^96 + 2^64 + 2^32 + 1).
define void @test11_pattern(ptr nocapture %P) nounwind ssp {
; CHECK-LABEL: @test11_pattern(
; CHECK-NEXT: entry:
; CHECK-NEXT: call void @llvm.memset_pattern.p0.i128.i64(ptr [[P:%.*]], i128 79228162532711081671548469249, i64 40000, i1 false)
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i32, ptr [[P]], i64 [[INDVAR]]
; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], 10000
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
br label %for.body

for.body: ; preds = %entry, %for.body
%indvar = phi i64 [ 0, %entry ], [ %indvar.next, %for.body ]
%arrayidx = getelementptr i32, ptr %P, i64 %indvar
store i32 1, ptr %arrayidx, align 4
%indvar.next = add i64 %indvar, 1
%exitcond = icmp eq i64 %indvar.next, 10000
br i1 %exitcond, label %for.end, label %for.body

for.end: ; preds = %for.body
ret void
}

!5 = !{!6, !6, i64 0}
!6 = !{!"double", !7, i64 0}
!7 = !{!"omnipotent char", !8, i64 0}
!8 = !{!"Simple C++ TBAA"}

!15 = !{!8, i64 0, !"omnipotent char"}
!17 = !{!15, i64 8, !"double"}
!9 = !{!15, i64 32, !"_ZTS1A", !17, i64 0, i64 8, !17, i64 8, i64 8, !17, i64 16, i64 8, !17, i64 24, i64 8}
!10 = !{!9, !17, i64 0, i64 1}

!18 = !{!19, !20, i64 0}
!19 = !{!"A", !20, i64 0, !22, i64 8}
!20 = !{!"any pointer", !7, i64 0}
!21 = !{!22, !20, i64 0}
!22 = !{!"B", !20, i64 0}