Skip to content

Commit be91577

Browse files
authored
Merge branch 'main' into remove_no_op_bitcast_DXIL
2 parents 6bb09c1 + 9ab4c16 commit be91577

File tree

9 files changed

+165
-75
lines changed

9 files changed

+165
-75
lines changed

.github/workflows/docs.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -205,7 +205,7 @@ jobs:
205205
steps.docs-changed-subprojects.outputs.workflow_any_changed == 'true'
206206
run: |
207207
cmake -B flang-build -GNinja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS="clang;mlir;flang" -DLLVM_ENABLE_SPHINX=ON ./llvm
208-
TZ=UTC ninja -C flang-build docs-flang-html
208+
TZ=UTC ninja -C flang-build docs-flang-html docs-flang-man
209209
mkdir built-docs/flang
210210
cp -r flang-build/docs/* built-docs/flang/
211211
- name: Upload docs

libc/src/__support/GPU/allocator.cpp

Lines changed: 22 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -229,24 +229,34 @@ struct Slab {
229229

230230
// The uniform mask represents which lanes contain a uniform target pointer.
231231
// We attempt to place these next to each other.
232-
// TODO: We should coalesce these bits and use the result of `fetch_or` to
233-
// search for free bits in parallel.
234232
void *result = nullptr;
235233
for (uint64_t mask = lane_mask; mask;
236234
mask = gpu::ballot(lane_mask, !result)) {
237-
uint32_t id = impl::lane_count(uniform & mask);
238-
uint32_t index =
239-
(gpu::broadcast_value(lane_mask, impl::xorshift32(state)) + id) %
240-
usable_bits(chunk_size);
235+
if (result)
236+
continue;
237+
238+
uint32_t start = gpu::broadcast_value(lane_mask, impl::xorshift32(state));
241239

240+
uint32_t id = impl::lane_count(uniform & mask);
241+
uint32_t index = (start + id) % usable_bits(chunk_size);
242242
uint32_t slot = index / BITS_IN_WORD;
243243
uint32_t bit = index % BITS_IN_WORD;
244-
if (!result) {
245-
uint32_t before = cpp::AtomicRef(get_bitfield()[slot])
246-
.fetch_or(1u << bit, cpp::MemoryOrder::RELAXED);
247-
if (~before & (1 << bit))
248-
result = ptr_from_index(index, chunk_size);
249-
}
244+
245+
// Get the mask of bits destined for the same slot and coalesce it.
246+
uint64_t match = uniform & gpu::match_any(mask, slot);
247+
uint32_t length = cpp::popcount(match);
248+
uint32_t bitmask = static_cast<uint32_t>((uint64_t(1) << length) - 1)
249+
<< bit;
250+
251+
uint32_t before = 0;
252+
if (gpu::get_lane_id() == static_cast<uint32_t>(cpp::countr_zero(match)))
253+
before = cpp::AtomicRef(get_bitfield()[slot])
254+
.fetch_or(bitmask, cpp::MemoryOrder::RELAXED);
255+
before = gpu::shuffle(mask, cpp::countr_zero(match), before);
256+
if (~before & (1 << bit))
257+
result = ptr_from_index(index, chunk_size);
258+
else
259+
sleep_briefly();
250260
}
251261

252262
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);

llvm/lib/Target/DirectX/DXILFlattenArrays.cpp

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -272,8 +272,9 @@ bool DXILFlattenArraysVisitor::visitGetElementPtrInstInGEPChainBase(
272272

273273
ArrayType *FlattenedArrayType = GEPInfo.ParentArrayType;
274274
Value *FlatGEP =
275-
Builder.CreateGEP(FlattenedArrayType, GEPInfo.ParendOperand, FlatIndex,
276-
GEP.getName() + ".flat", GEP.isInBounds());
275+
Builder.CreateGEP(FlattenedArrayType, GEPInfo.ParendOperand,
276+
{Builder.getInt32(0), FlatIndex},
277+
GEP.getName() + ".flat", GEP.getNoWrapFlags());
277278

278279
GEP.replaceAllUsesWith(FlatGEP);
279280
GEP.eraseFromParent();

llvm/lib/Target/DirectX/DXILLegalizePass.cpp

Lines changed: 84 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,8 @@
88

99
#include "DXILLegalizePass.h"
1010
#include "DirectX.h"
11+
#include "llvm/ADT/APInt.h"
12+
#include "llvm/IR/Constants.h"
1113
#include "llvm/IR/Function.h"
1214
#include "llvm/IR/IRBuilder.h"
1315
#include "llvm/IR/InstIterator.h"
@@ -510,40 +512,105 @@ static void updateFnegToFsub(Instruction &I,
510512
ToRemove.push_back(&I);
511513
}
512514

515+
static void
516+
legalizeGetHighLowi64Bytes(Instruction &I,
517+
SmallVectorImpl<Instruction *> &ToRemove,
518+
DenseMap<Value *, Value *> &ReplacedValues) {
519+
if (auto *BitCast = dyn_cast<BitCastInst>(&I)) {
520+
if (BitCast->getDestTy() ==
521+
FixedVectorType::get(Type::getInt32Ty(I.getContext()), 2) &&
522+
BitCast->getSrcTy()->isIntegerTy(64)) {
523+
ToRemove.push_back(BitCast);
524+
ReplacedValues[BitCast] = BitCast->getOperand(0);
525+
return;
526+
}
527+
}
528+
529+
if (auto *Extract = dyn_cast<ExtractElementInst>(&I)) {
530+
if (!dyn_cast<BitCastInst>(Extract->getVectorOperand()))
531+
return;
532+
auto *VecTy = dyn_cast<FixedVectorType>(Extract->getVectorOperandType());
533+
if (VecTy && VecTy->getElementType()->isIntegerTy(32) &&
534+
VecTy->getNumElements() == 2) {
535+
if (auto *Index = dyn_cast<ConstantInt>(Extract->getIndexOperand())) {
536+
unsigned Idx = Index->getZExtValue();
537+
IRBuilder<> Builder(&I);
538+
539+
auto *Replacement = ReplacedValues[Extract->getVectorOperand()];
540+
assert(Replacement && "The BitCast replacement should have been set "
541+
"before working on ExtractElementInst.");
542+
if (Idx == 0) {
543+
Value *LowBytes = Builder.CreateTrunc(
544+
Replacement, Type::getInt32Ty(I.getContext()));
545+
ReplacedValues[Extract] = LowBytes;
546+
} else {
547+
assert(Idx == 1);
548+
Value *LogicalShiftRight = Builder.CreateLShr(
549+
Replacement,
550+
ConstantInt::get(
551+
Replacement->getType(),
552+
APInt(Replacement->getType()->getIntegerBitWidth(), 32)));
553+
Value *HighBytes = Builder.CreateTrunc(
554+
LogicalShiftRight, Type::getInt32Ty(I.getContext()));
555+
ReplacedValues[Extract] = HighBytes;
556+
}
557+
ToRemove.push_back(Extract);
558+
Extract->replaceAllUsesWith(ReplacedValues[Extract]);
559+
}
560+
}
561+
}
562+
}
563+
513564
namespace {
514565
class DXILLegalizationPipeline {
515566

516567
public:
517568
DXILLegalizationPipeline() { initializeLegalizationPipeline(); }
518569

519570
bool runLegalizationPipeline(Function &F) {
571+
bool MadeChange = false;
520572
SmallVector<Instruction *> ToRemove;
521573
DenseMap<Value *, Value *> ReplacedValues;
522-
for (auto &I : instructions(F)) {
523-
for (auto &LegalizationFn : LegalizationPipeline)
524-
LegalizationFn(I, ToRemove, ReplacedValues);
525-
}
574+
for (int Stage = 0; Stage < NumStages; ++Stage) {
575+
ToRemove.clear();
576+
ReplacedValues.clear();
577+
for (auto &I : instructions(F)) {
578+
for (auto &LegalizationFn : LegalizationPipeline[Stage])
579+
LegalizationFn(I, ToRemove, ReplacedValues);
580+
}
526581

527-
for (auto *Inst : reverse(ToRemove))
528-
Inst->eraseFromParent();
582+
for (auto *Inst : reverse(ToRemove))
583+
Inst->eraseFromParent();
529584

530-
return !ToRemove.empty();
585+
MadeChange |= !ToRemove.empty();
586+
}
587+
return MadeChange;
531588
}
532589

533590
private:
534-
SmallVector<
591+
enum LegalizationStage { Stage1 = 0, Stage2 = 1, NumStages };
592+
593+
using LegalizationFnTy =
535594
std::function<void(Instruction &, SmallVectorImpl<Instruction *> &,
536-
DenseMap<Value *, Value *> &)>>
537-
LegalizationPipeline;
595+
DenseMap<Value *, Value *> &)>;
596+
597+
SmallVector<LegalizationFnTy> LegalizationPipeline[NumStages];
538598

539599
void initializeLegalizationPipeline() {
540-
LegalizationPipeline.push_back(upcastI8AllocasAndUses);
541-
LegalizationPipeline.push_back(fixI8UseChain);
542-
LegalizationPipeline.push_back(downcastI64toI32InsertExtractElements);
543-
LegalizationPipeline.push_back(legalizeFreeze);
544-
LegalizationPipeline.push_back(legalizeMemCpy);
545-
LegalizationPipeline.push_back(removeMemSet);
546-
LegalizationPipeline.push_back(updateFnegToFsub);
600+
LegalizationPipeline[Stage1].push_back(upcastI8AllocasAndUses);
601+
LegalizationPipeline[Stage1].push_back(fixI8UseChain);
602+
LegalizationPipeline[Stage1].push_back(legalizeGetHighLowi64Bytes);
603+
LegalizationPipeline[Stage1].push_back(legalizeFreeze);
604+
LegalizationPipeline[Stage1].push_back(legalizeMemCpy);
605+
LegalizationPipeline[Stage1].push_back(removeMemSet);
606+
LegalizationPipeline[Stage1].push_back(updateFnegToFsub);
607+
// Note: legalizeGetHighLowi64Bytes and
608+
// downcastI64toI32InsertExtractElements both modify extractelement, so they
609+
// must run in separate stages. legalizeGetHighLowi64Bytes runs first because it
610+
// removes extractelements, reducing the number that
611+
// downcastI64toI32InsertExtractElements needs to handle.
612+
LegalizationPipeline[Stage2].push_back(
613+
downcastI64toI32InsertExtractElements);
547614
}
548615
};
549616

llvm/test/CodeGen/DirectX/flatten-array.ll

Lines changed: 7 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,7 @@ define void @alloca_4d_test () {
3131
; CHECK-LABEL: gep_2d_test
3232
define void @gep_2d_test () {
3333
; CHECK: [[a:%.*]] = alloca [9 x i32], align 4
34-
; CHECK-COUNT-9: getelementptr inbounds [9 x i32], ptr [[a]], i32 {{[0-8]}}
34+
; CHECK-COUNT-9: getelementptr inbounds [9 x i32], ptr [[a]], i32 0, i32 {{[0-8]}}
3535
; CHECK-NEXT: ret void
3636
%1 = alloca [3 x [3 x i32]], align 4
3737
%g2d0 = getelementptr inbounds [3 x [3 x i32]], [3 x [3 x i32]]* %1, i32 0, i32 0
@@ -53,7 +53,7 @@ define void @gep_2d_test () {
5353
; CHECK-LABEL: gep_3d_test
5454
define void @gep_3d_test () {
5555
; CHECK: [[a:%.*]] = alloca [8 x i32], align 4
56-
; CHECK-COUNT-8: getelementptr inbounds [8 x i32], ptr [[a]], i32 {{[0-7]}}
56+
; CHECK-COUNT-8: getelementptr inbounds [8 x i32], ptr [[a]], i32 0, i32 {{[0-7]}}
5757
; CHECK-NEXT: ret void
5858
%1 = alloca [2 x[2 x [2 x i32]]], align 4
5959
%g3d0 = getelementptr inbounds [2 x[2 x [2 x i32]]], [2 x[2 x [2 x i32]]]* %1, i32 0, i32 0
@@ -76,7 +76,7 @@ define void @gep_3d_test () {
7676
; CHECK-LABEL: gep_4d_test
7777
define void @gep_4d_test () {
7878
; CHECK: [[a:%.*]] = alloca [16 x i32], align 4
79-
; CHECK-COUNT-16: getelementptr inbounds [16 x i32], ptr [[a]], i32 {{[0-9]|1[0-5]}}
79+
; CHECK-COUNT-16: getelementptr inbounds [16 x i32], ptr [[a]], i32 0, i32 {{[0-9]|1[0-5]}}
8080
; CHECK-NEXT: ret void
8181
%1 = alloca [2x[2 x[2 x [2 x i32]]]], align 4
8282
%g4d0 = getelementptr inbounds [2x[2 x[2 x [2 x i32]]]], [2x[2 x[2 x [2 x i32]]]]* %1, i32 0, i32 0
@@ -123,8 +123,7 @@ define void @gep_4d_test () {
123123
@b = internal global [2 x [3 x [4 x i32]]] zeroinitializer, align 16
124124

125125
define void @global_gep_load() {
126-
; CHECK: [[GEP_PTR:%.*]] = getelementptr inbounds [24 x i32], ptr @a.1dim, i32 6
127-
; CHECK: load i32, ptr [[GEP_PTR]], align 4
126+
; CHECK: load i32, ptr getelementptr inbounds ([24 x i32], ptr @a.1dim, i32 0, i32 6), align 4
128127
; CHECK-NEXT: ret void
129128
%1 = getelementptr inbounds [2 x [3 x [4 x i32]]], [2 x [3 x [4 x i32]]]* @a, i32 0, i32 0
130129
%2 = getelementptr inbounds [3 x [4 x i32]], [3 x [4 x i32]]* %1, i32 0, i32 1
@@ -142,7 +141,7 @@ define void @global_gep_load_index(i32 %row, i32 %col, i32 %timeIndex) {
142141
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], [[TMP3]]
143142
; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[ROW]], 12
144143
; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP4]], [[TMP5]]
145-
; CHECK-NEXT: [[DOTFLAT:%.*]] = getelementptr inbounds [24 x i32], ptr @a.1dim, i32 [[TMP6]]
144+
; CHECK-NEXT: [[DOTFLAT:%.*]] = getelementptr inbounds [24 x i32], ptr @a.1dim, i32 0, i32 [[TMP6]]
146145
; CHECK-NOT: getelementptr inbounds [2 x [3 x [4 x i32]]]{{.*}}
147146
; CHECK-NOT: getelementptr inbounds [3 x [4 x i32]]{{.*}}
148147
; CHECK-NOT: getelementptr inbounds [4 x i32]{{.*}}
@@ -163,7 +162,7 @@ define void @global_incomplete_gep_chain(i32 %row, i32 %col) {
163162
; CHECK-NEXT: [[TMP2:%.*]] = add i32 0, [[TMP1]]
164163
; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[ROW]], 3
165164
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], [[TMP3]]
166-
; CHECK-NEXT: [[DOTFLAT:%.*]] = getelementptr inbounds [24 x i32], ptr @a.1dim, i32 [[TMP4]]
165+
; CHECK-NEXT: [[DOTFLAT:%.*]] = getelementptr inbounds [24 x i32], ptr @a.1dim, i32 0, i32 [[TMP4]]
167166
; CHECK-NOT: getelementptr inbounds [2 x [3 x [4 x i32]]]{{.*}}
168167
; CHECK-NOT: getelementptr inbounds [3 x [4 x i32]]{{.*}}
169168
; CHECK-NOT: getelementptr inbounds [4 x i32]{{.*}}
@@ -177,8 +176,7 @@ define void @global_incomplete_gep_chain(i32 %row, i32 %col) {
177176
}
178177

179178
define void @global_gep_store() {
180-
; CHECK: [[GEP_PTR:%.*]] = getelementptr inbounds [24 x i32], ptr @b.1dim, i32 13
181-
; CHECK: store i32 1, ptr [[GEP_PTR]], align 4
179+
; CHECK: store i32 1, ptr getelementptr inbounds ([24 x i32], ptr @b.1dim, i32 0, i32 13), align 4
182180
; CHECK-NEXT: ret void
183181
%1 = getelementptr inbounds [2 x [3 x [4 x i32]]], [2 x [3 x [4 x i32]]]* @b, i32 0, i32 1
184182
%2 = getelementptr inbounds [3 x [4 x i32]], [3 x [4 x i32]]* %1, i32 0, i32 0

llvm/test/CodeGen/DirectX/flatten-bug-117273.ll

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -8,9 +8,9 @@
88
define internal void @main() {
99
; CHECK-LABEL: define internal void @main() {
1010
; CHECK-NEXT: [[ENTRY:.*:]]
11-
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [24 x float], ptr @ZerroInitArr.1dim, i32 1
11+
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [24 x float], ptr @ZerroInitArr.1dim, i32 0, i32 1
1212
; CHECK-NEXT: [[DOTI0:%.*]] = load float, ptr [[TMP0]], align 16
13-
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [24 x float], ptr @ZerroInitArr.1dim, i32 2
13+
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [24 x float], ptr @ZerroInitArr.1dim, i32 0, i32 2
1414
; CHECK-NEXT: [[DOTI03:%.*]] = load float, ptr [[TMP1]], align 16
1515
; CHECK-NEXT: ret void
1616
;
Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,18 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt -S -passes='dxil-legalize' -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
3+
4+
define void @split_via_extract(i64 noundef %a) {
5+
; CHECK-LABEL: define void @split_via_extract(
6+
; CHECK-SAME: i64 noundef [[A:%.*]]) {
7+
; CHECK-NEXT: [[ENTRY:.*:]]
8+
; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[A]] to i32
9+
; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[A]], 32
10+
; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
11+
; CHECK-NEXT: ret void
12+
;
13+
entry:
14+
%vecA = bitcast i64 %a to <2 x i32>
15+
%low = extractelement <2 x i32> %vecA, i32 0 ; low 32 bits
16+
%high = extractelement <2 x i32> %vecA, i32 1 ; high 32 bits
17+
ret void
18+
}

0 commit comments

Comments
 (0)