Skip to content

Commit 78b6393

Browse files
committed
Merge branch 'main' into sg_distr_minor_fixes
2 parents b32bd82 + fae8df2 commit 78b6393

File tree

14 files changed

+120
-37
lines changed

14 files changed

+120
-37
lines changed

.github/new-prs-labeler.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -718,6 +718,7 @@ mlgo:
718718
- llvm/lib/Analysis/models/**
719719
- llvm/test/Analysis/IR2Vec/**
720720
- llvm/tools/llvm-ir2vec/**
721+
- llvm/docs/CommandGuide/llvm-ir2vec.rst
721722

722723
tools:llvm-exegesis:
723724
- llvm/tools/llvm-exegesis/**

lldb/test/Shell/Settings/TestChildDepthTruncation.test

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# when target.max-children-depth wasn't explicitly set.
33

44
# RUN: split-file %s %t
5-
# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out
5+
# RUN: %clang_host -g %t/main.cpp -o %t.out
66
# RUN: %lldb -x -b -s %t/dwim-commands.input %t.out -o exit 2>&1 \
77
# RUN: | FileCheck %s --check-prefix=DWIM
88
#

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2947,13 +2947,6 @@ multiclass VFLAT_Real_AllAddr_gfx12<bits<8> op,
29472947
defm _SADDR : VFLAT_Real_gfx12<op, name>;
29482948
}
29492949

2950-
multiclass VGLOBAL_Real_AllAddr_gfx12<bits<8> op,
2951-
string name = get_FLAT_ps<NAME>.Mnemonic,
2952-
string alias = name> :
2953-
VFLAT_Real_Base_gfx12<op, name, alias> {
2954-
defm _SADDR : VFLAT_Real_gfx12<op, name>;
2955-
}
2956-
29572950
multiclass VGLOBAL_Real_AllAddr_gfx1200<bits<8> op> {
29582951
let AssemblerPredicate = isGFX12Not12_50 in {
29592952
defm "" : VFLAT_Real_gfx12<op>;

llvm/lib/Target/DirectX/DXILFlattenArrays.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,16 @@ bool DXILFlattenArraysVisitor::visitGetElementPtrInst(GetElementPtrInst &GEP) {
343343
Info.RootFlattenedArrayType, Info.RootPointerOperand,
344344
{ZeroIndex, FlattenedIndex}, GEP.getName(), GEP.getNoWrapFlags());
345345

346+
// If the pointer operand is a global variable and all indices are 0,
347+
// IRBuilder::CreateGEP will return the global variable instead of creating
348+
// a GEP instruction or GEP ConstantExpr. In this case we have to create and
349+
// insert our own GEP instruction.
350+
if (!isa<GEPOperator>(NewGEP))
351+
NewGEP = GetElementPtrInst::Create(
352+
Info.RootFlattenedArrayType, Info.RootPointerOperand,
353+
{ZeroIndex, FlattenedIndex}, GEP.getNoWrapFlags(), GEP.getName(),
354+
Builder.GetInsertPoint());
355+
346356
// Replace the current GEP with the new GEP. Store GEPInfo into the map
347357
// for later use in case this GEP was not the end of the chain
348358
GEPChainInfoMap.insert({cast<GEPOperator>(NewGEP), std::move(Info)});

llvm/lib/Target/DirectX/DXILLegalizePass.cpp

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -563,7 +563,7 @@ legalizeGetHighLowi64Bytes(Instruction &I,
563563
}
564564

565565
static void
566-
legalizeLoadStoreOnArrayAllocas(Instruction &I,
566+
legalizeScalarLoadStoreOnArrays(Instruction &I,
567567
SmallVectorImpl<Instruction *> &ToRemove,
568568
DenseMap<Value *, Value *> &) {
569569

@@ -581,23 +581,31 @@ legalizeLoadStoreOnArrayAllocas(Instruction &I,
581581
} else
582582
return;
583583

584-
assert(LoadStoreTy->isSingleValueType() &&
585-
"Expected load/store type to be a single-valued type");
584+
// If the load/store type is not a single-value type (i.e., not a scalar or
585+
// vector), we do not modify it. A vector is not expected here either, because
586+
// the dxil-data-scalarization pass should run before this one, but it is not
587+
// incorrect to apply this transformation to vector loads/stores.
588+
if (!LoadStoreTy->isSingleValueType())
589+
return;
586590

587-
auto *AllocaPtrOp = dyn_cast<AllocaInst>(PtrOp);
588-
if (!AllocaPtrOp)
591+
Type *ArrayTy;
592+
if (auto *GlobalVarPtrOp = dyn_cast<GlobalVariable>(PtrOp))
593+
ArrayTy = GlobalVarPtrOp->getValueType();
594+
else if (auto *AllocaPtrOp = dyn_cast<AllocaInst>(PtrOp))
595+
ArrayTy = AllocaPtrOp->getAllocatedType();
596+
else
589597
return;
590598

591-
Type *Ty = AllocaPtrOp->getAllocatedType();
592-
if (!isa<ArrayType>(Ty))
599+
if (!isa<ArrayType>(ArrayTy))
593600
return;
594-
assert(!isa<ArrayType>(Ty->getArrayElementType()) &&
595-
"Expected allocated type of AllocaInst to be a flat ArrayType");
596601

597-
IRBuilder<> Builder(&I);
598-
Value *Zero = Builder.getInt32(0);
599-
Value *GEP = Builder.CreateGEP(Ty, AllocaPtrOp, {Zero, Zero}, "",
600-
GEPNoWrapFlags::all());
602+
assert(ArrayTy->getArrayElementType() == LoadStoreTy &&
603+
"Expected array element type to be the same as to the scalar load or "
604+
"store type");
605+
606+
Value *Zero = ConstantInt::get(Type::getInt32Ty(I.getContext()), 0);
607+
Value *GEP = GetElementPtrInst::Create(
608+
ArrayTy, PtrOp, {Zero, Zero}, GEPNoWrapFlags::all(), "", I.getIterator());
601609
I.setOperand(PtrOpIndex, GEP);
602610
}
603611

@@ -651,7 +659,7 @@ class DXILLegalizationPipeline {
651659
// downcastI64toI32InsertExtractElements needs to handle.
652660
LegalizationPipeline[Stage2].push_back(
653661
downcastI64toI32InsertExtractElements);
654-
LegalizationPipeline[Stage2].push_back(legalizeLoadStoreOnArrayAllocas);
662+
LegalizationPipeline[Stage2].push_back(legalizeScalarLoadStoreOnArrays);
655663
}
656664
};
657665

llvm/test/CodeGen/DirectX/flatten-array.ll

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,28 @@ define void @two_index_gep_const() {
218218
ret void
219219
}
220220

221+
define void @zero_index_global() {
222+
; CHECK-LABEL: define void @zero_index_global(
223+
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw [4 x float], ptr addrspace(3) @g.1dim, i32 0, i32 0
224+
; CHECK-NEXT: load float, ptr addrspace(3) [[GEP]], align 4
225+
; CHECK-NEXT: ret void
226+
%1 = getelementptr inbounds nuw [2 x [2 x float]], ptr addrspace(3) @g, i32 0, i32 0, i32 0
227+
%2 = load float, ptr addrspace(3) %1, align 4
228+
ret void
229+
}
230+
231+
; Note: A ConstantExpr GEP with all 0 indices is equivalent to the pointer
232+
; operand of the GEP. Therefore visitLoadInst will not see the pointer operand
233+
; as a ConstantExpr GEP and will not create a GEP instruction to be visited.
234+
; The later dxil-legalize pass will insert a GEP in this instance.
235+
define void @zero_index_global_const() {
236+
; CHECK-LABEL: define void @zero_index_global_const(
237+
; CHECK-NEXT: load float, ptr addrspace(3) @g.1dim, align 4
238+
; CHECK-NEXT: ret void
239+
%1 = load float, ptr addrspace(3) getelementptr inbounds nuw ([2 x [2 x float]], ptr addrspace(3) @g, i32 0, i32 0, i32 0), align 4
240+
ret void
241+
}
242+
221243
define void @gep_4d_index_test() {
222244
; CHECK-LABEL: gep_4d_index_test
223245
; CHECK: [[a:%.*]] = alloca [16 x i32], align 4

llvm/test/CodeGen/DirectX/legalize-load-store-array-alloca.ll

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,21 @@ define void @store() {
2121
store i32 0, ptr %a, align 4
2222
ret void
2323
}
24+
25+
@g = local_unnamed_addr addrspace(3) global [4 x i32] zeroinitializer, align 4
26+
define void @load_whole_global () {
27+
; CHECK-LABEL: define void @load_whole_global
28+
; CHECK-NEXT: load [4 x i32], ptr addrspace(3) @g, align 4
29+
; CHECK-NEXT: ret void
30+
%l = load [4 x i32], ptr addrspace(3) @g, align 4
31+
ret void
32+
}
33+
34+
define void @load_global_index0 () {
35+
; CHECK-LABEL: define void @load_global_index0
36+
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw [4 x i32], ptr addrspace(3) @g, i32 0, i32 0
37+
; CHECK-NEXT: load i32, ptr addrspace(3) [[GEP]], align 4
38+
; CHECK-NEXT: ret void
39+
%l = load i32, ptr addrspace(3) @g, align 4
40+
ret void
41+
}

llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@
2424
define <4 x i32> @load_array_vec_test() #0 {
2525
; CHECK-LABEL: define <4 x i32> @load_array_vec_test(
2626
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
27-
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(3) @arrayofVecData.scalarized.1dim, align 4
27+
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw [8 x i32], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 0
28+
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(3) [[GEP]], align 4
2829
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(3) getelementptr ([8 x i32], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 1), align 4
2930
; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(3) getelementptr ([8 x i32], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 2), align 4
3031
; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(3) getelementptr ([8 x i32], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 3), align 4
@@ -52,7 +53,8 @@ define <4 x i32> @load_array_vec_test() #0 {
5253
define <4 x i32> @load_vec_test() #0 {
5354
; CHECK-LABEL: define <4 x i32> @load_vec_test(
5455
; CHECK-SAME: ) #[[ATTR0]] {
55-
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) @vecData.scalarized, align 4
56+
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw [4 x i32], ptr addrspace(3) @vecData.scalarized, i32 0, i32 0
57+
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[GEP]], align 4
5658
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(3) getelementptr ([4 x i32], ptr addrspace(3) @vecData.scalarized, i32 0, i32 1), align 4
5759
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) getelementptr ([4 x i32], ptr addrspace(3) @vecData.scalarized, i32 0, i32 2), align 4
5860
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(3) getelementptr ([4 x i32], ptr addrspace(3) @vecData.scalarized, i32 0, i32 3), align 4
@@ -203,7 +205,8 @@ define <4 x i32> @load_static_array_of_vec_from_i8_gep_test(i32 %index) #0 {
203205
define <4 x i32> @multid_load_test() #0 {
204206
; CHECK-LABEL: define <4 x i32> @multid_load_test(
205207
; CHECK-SAME: ) #[[ATTR0]] {
206-
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) @groupshared2dArrayofVectors.scalarized.1dim, align 4
208+
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw [36 x i32], ptr addrspace(3) @groupshared2dArrayofVectors.scalarized.1dim, i32 0, i32 0
209+
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[GEP]], align 4
207210
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(3) getelementptr ([36 x i32], ptr addrspace(3) @groupshared2dArrayofVectors.scalarized.1dim, i32 0, i32 1), align 4
208211
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) getelementptr ([36 x i32], ptr addrspace(3) @groupshared2dArrayofVectors.scalarized.1dim, i32 0, i32 2), align 4
209212
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(3) getelementptr ([36 x i32], ptr addrspace(3) @groupshared2dArrayofVectors.scalarized.1dim, i32 0, i32 3), align 4

llvm/test/CodeGen/DirectX/scalar-store.ll

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@
1414

1515
; CHECK-LABEL: store_array_vec_test
1616
define void @store_array_vec_test () local_unnamed_addr #0 {
17-
; CHECK-NEXT: store float 1.000000e+00, ptr addrspace(3) @arrayofVecData.scalarized.1dim, align 16
17+
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw [6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 0
18+
; CHECK-NEXT: store float 1.000000e+00, ptr addrspace(3) [[GEP]], align 16
1819
; CHECK-NEXT: store float 2.000000e+00, ptr addrspace(3) getelementptr ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 1), align 4
1920
; CHECK-NEXT: store float 3.000000e+00, ptr addrspace(3) getelementptr ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 2), align 8
2021
; CHECK-NEXT: store float 2.000000e+00, ptr addrspace(3) getelementptr inbounds ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 3), align 16
@@ -30,7 +31,8 @@ define void @store_array_vec_test () local_unnamed_addr #0 {
3031
; CHECK-LABEL: store_vec_test
3132
define void @store_vec_test(<4 x i32> %inputVec) #0 {
3233
; CHECK-NEXT: [[INPUTVEC_I01:%.*]] = extractelement <4 x i32> %inputVec, i32 0
33-
; CHECK-NEXT: store i32 [[INPUTVEC_I01]], ptr addrspace(3) @vecData.scalarized, align 4
34+
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw [4 x i32], ptr addrspace(3) @vecData.scalarized, i32 0, i32 0
35+
; CHECK-NEXT: store i32 [[INPUTVEC_I01]], ptr addrspace(3) [[GEP]], align 4
3436
; CHECK-NEXT: [[INPUTVEC_I12:%.*]] = extractelement <4 x i32> %inputVec, i32 1
3537
; CHECK-NEXT: store i32 [[INPUTVEC_I12]], ptr addrspace(3) getelementptr ([4 x i32], ptr addrspace(3) @vecData.scalarized, i32 0, i32 1), align 4
3638
; CHECK-NEXT: [[INPUTVEC_I23:%.*]] = extractelement <4 x i32> %inputVec, i32 2

mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ def AMDGPU_ScaledExtPackedOp
127127
let summary = "Extend a vector of packed floating point values";
128128

129129
let description = [{
130-
Extend and scale two packed floats in `source[index]` to two floats and
130+
Extend and scale two packed floats in `source[index]` to two floats and
131131
return them.
132132

133133
This rather unusual signature arises from the fact that AMD GPUs cannot
@@ -861,7 +861,7 @@ def AMDGPU_WMMAOp :
861861
}
862862

863863
def AMDGPU_GatherToLDSOp :
864-
AMDGPU_Op<"gather_to_lds", [SameVariadicOperandSize]>,
864+
AMDGPU_Op<"gather_to_lds", [AttrSizedOperandSegments]>,
865865
Arguments<(ins
866866
Arg<AnyMemRef, "buffer to gather from", [MemRead]>:$src,
867867
Variadic<Index>:$srcIndices,
@@ -966,13 +966,13 @@ def AMDGPU_ScaledMFMAOp :
966966
order (that is, v[0] will go to arg[7:0], v[1] to arg[15:8] and so on).
967967

968968
This wrapper takes inspiration from `amdgpu.mfma`, but has some key differences:
969-
- `amdgpu.scaled_mfma` operates on fp4 (f4E2M1FN), fp6 (f6E2M3FN and f6E3M2FN) and
970-
fp8 (f8E4M3FN and f8E5M2) types using either M=N=16, K=128 or M=N=32, K=64 as their tile
971-
size.
972-
- `amdgpu.scaled_mfma` does not support broadcasting. So, `cbsz`, `abid`, and `blgp`
969+
- `amdgpu.scaled_mfma` operates on fp4 (f4E2M1FN), fp6 (f6E2M3FN and f6E3M2FN) and
970+
fp8 (f8E4M3FN and f8E5M2) types using either M=N=16, K=128 or M=N=32, K=64 as their tile
971+
size.
972+
- `amdgpu.scaled_mfma` does not support broadcasting. So, `cbsz`, `abid`, and `blgp`
973973
are omitted from this wrapper.
974-
- The `negateA`, `negateB`, and `negateC` flags in `amdgpu.mfma` are only supported for
975-
double-precision operations on gfx94x and so are not included here.
974+
- The `negateA`, `negateB`, and `negateC` flags in `amdgpu.mfma` are only supported for
975+
double-precision operations on gfx94x and so are not included here.
976976
}];
977977
let assemblyFormat = [{
978978
`(` $scalesA `[` $scalesIdxA `]` `*` $sourceA `)` `*` `(` $scalesB `[` $scalesIdxB `]` `*` $sourceB `)` `+` $destC

0 commit comments

Comments
 (0)