Merge branch 'main' into remove_no_op_bitcast_DXIL

bob80905 · web-flow · commit be915772c2f8 · 2025-06-04T18:53:35.000-07:00
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -205,7 +205,7 @@ jobs:
           steps.docs-changed-subprojects.outputs.workflow_any_changed == 'true'
         run: |
           cmake -B flang-build -GNinja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS="clang;mlir;flang" -DLLVM_ENABLE_SPHINX=ON ./llvm
-          TZ=UTC ninja -C flang-build docs-flang-html
+          TZ=UTC ninja -C flang-build docs-flang-html docs-flang-man
           mkdir built-docs/flang
           cp -r flang-build/docs/* built-docs/flang/
       - name: Upload docs
diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
@@ -229,24 +229,34 @@ struct Slab {
 
     // The uniform mask represents which lanes contain a uniform target pointer.
     // We attempt to place these next to each other.
-    // TODO: We should coalesce these bits and use the result of `fetch_or` to
-    //       search for free bits in parallel.
     void *result = nullptr;
     for (uint64_t mask = lane_mask; mask;
          mask = gpu::ballot(lane_mask, !result)) {
-      uint32_t id = impl::lane_count(uniform & mask);
-      uint32_t index =
-          (gpu::broadcast_value(lane_mask, impl::xorshift32(state)) + id) %
-          usable_bits(chunk_size);
+      if (result)
+        continue;
+
+      uint32_t start = gpu::broadcast_value(lane_mask, impl::xorshift32(state));
 
+      uint32_t id = impl::lane_count(uniform & mask);
+      uint32_t index = (start + id) % usable_bits(chunk_size);
       uint32_t slot = index / BITS_IN_WORD;
       uint32_t bit = index % BITS_IN_WORD;
-      if (!result) {
-        uint32_t before = cpp::AtomicRef(get_bitfield()[slot])
-                              .fetch_or(1u << bit, cpp::MemoryOrder::RELAXED);
-        if (~before & (1 << bit))
-          result = ptr_from_index(index, chunk_size);
-      }
+
+      // Get the mask of bits destined for the same slot and coalesce it.
+      uint64_t match = uniform & gpu::match_any(mask, slot);
+      uint32_t length = cpp::popcount(match);
+      uint32_t bitmask = static_cast<uint32_t>((uint64_t(1) << length) - 1)
+                         << bit;
+
+      uint32_t before = 0;
+      if (gpu::get_lane_id() == static_cast<uint32_t>(cpp::countr_zero(match)))
+        before = cpp::AtomicRef(get_bitfield()[slot])
+                     .fetch_or(bitmask, cpp::MemoryOrder::RELAXED);
+      before = gpu::shuffle(mask, cpp::countr_zero(match), before);
+      if (~before & (1 << bit))
+        result = ptr_from_index(index, chunk_size);
+      else
+        sleep_briefly();
     }
 
     cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
diff --git a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
@@ -272,8 +272,9 @@ bool DXILFlattenArraysVisitor::visitGetElementPtrInstInGEPChainBase(
 
   ArrayType *FlattenedArrayType = GEPInfo.ParentArrayType;
   Value *FlatGEP =
-      Builder.CreateGEP(FlattenedArrayType, GEPInfo.ParendOperand, FlatIndex,
-                        GEP.getName() + ".flat", GEP.isInBounds());
+      Builder.CreateGEP(FlattenedArrayType, GEPInfo.ParendOperand,
+                        {Builder.getInt32(0), FlatIndex},
+                        GEP.getName() + ".flat", GEP.getNoWrapFlags());
 
   GEP.replaceAllUsesWith(FlatGEP);
   GEP.eraseFromParent();
diff --git a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
@@ -8,6 +8,8 @@
 
 #include "DXILLegalizePass.h"
 #include "DirectX.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstIterator.h"
@@ -510,40 +512,105 @@ static void updateFnegToFsub(Instruction &I,
   ToRemove.push_back(&I);
 }
 
+// Legalize the `bitcast i64 to <2 x i32>` + `extractelement` idiom used to
+// read the halves of a 64-bit integer. A matching bitcast is queued for
+// removal and mapped to its i64 operand in ReplacedValues; each extract of it
+// with a constant index is then replaced by trunc(x) for element 0 or
+// trunc(lshr(x, 32)) for element 1, and queued for removal as well.
+static void
+legalizeGetHighLowi64Bytes(Instruction &I,
+                           SmallVectorImpl<Instruction *> &ToRemove,
+                           DenseMap<Value *, Value *> &ReplacedValues) {
+  Type *I32Ty = Type::getInt32Ty(I.getContext());
+  if (auto *BitCast = dyn_cast<BitCastInst>(&I)) {
+    // Remember the i64 source so later extracts of this bitcast can use it.
+    if (BitCast->getDestTy() == FixedVectorType::get(I32Ty, 2) &&
+        BitCast->getSrcTy()->isIntegerTy(64)) {
+      ToRemove.push_back(BitCast);
+      ReplacedValues[BitCast] = BitCast->getOperand(0);
+    }
+    return;
+  }
+
+  auto *Extract = dyn_cast<ExtractElementInst>(&I);
+  if (!Extract || !isa<BitCastInst>(Extract->getVectorOperand()))
+    return;
+  auto *VecTy = dyn_cast<FixedVectorType>(Extract->getVectorOperandType());
+  if (!VecTy || !VecTy->getElementType()->isIntegerTy(32) ||
+      VecTy->getNumElements() != 2)
+    return;
+  auto *Index = dyn_cast<ConstantInt>(Extract->getIndexOperand());
+  if (!Index)
+    return;
+
+  // Only bitcasts recorded above (i64 sources) have a replacement; leave any
+  // other <2 x i32> bitcast, e.g. one from double, untouched. lookup() is used
+  // because operator[] would insert a null entry for the unknown key.
+  Value *Replacement = ReplacedValues.lookup(Extract->getVectorOperand());
+  if (!Replacement)
+    return;
+
+  IRBuilder<> Builder(&I);
+  uint64_t Idx = Index->getZExtValue();
+  assert(Idx < 2 && "Only elements 0 and 1 exist in a <2 x i32> vector.");
+  // Element 0 holds the low 32 bits, element 1 the high 32 bits.
+  Value *Half = Idx == 0 ? Replacement : Builder.CreateLShr(Replacement, 32);
+  Value *Result = Builder.CreateTrunc(Half, I32Ty);
+  ReplacedValues[Extract] = Result;
+  ToRemove.push_back(Extract);
+  Extract->replaceAllUsesWith(Result);
+}
+
 namespace {
 class DXILLegalizationPipeline {
 
 public:
   DXILLegalizationPipeline() { initializeLegalizationPipeline(); }
 
   bool runLegalizationPipeline(Function &F) {
+    bool MadeChange = false;
     SmallVector<Instruction *> ToRemove;
     DenseMap<Value *, Value *> ReplacedValues;
-    for (auto &I : instructions(F)) {
-      for (auto &LegalizationFn : LegalizationPipeline)
-        LegalizationFn(I, ToRemove, ReplacedValues);
-    }
+    for (int Stage = 0; Stage < NumStages; ++Stage) {
+      ToRemove.clear();
+      ReplacedValues.clear();
+      for (auto &I : instructions(F)) {
+        for (auto &LegalizationFn : LegalizationPipeline[Stage])
+          LegalizationFn(I, ToRemove, ReplacedValues);
+      }
 
-    for (auto *Inst : reverse(ToRemove))
-      Inst->eraseFromParent();
+      for (auto *Inst : reverse(ToRemove))
+        Inst->eraseFromParent();
 
-    return !ToRemove.empty();
+      MadeChange |= !ToRemove.empty();
+    }
+    return MadeChange;
   }
 
 private:
-  SmallVector<
+  enum LegalizationStage { Stage1 = 0, Stage2 = 1, NumStages };
+
+  using LegalizationFnTy =
       std::function<void(Instruction &, SmallVectorImpl<Instruction *> &,
-                         DenseMap<Value *, Value *> &)>>
-      LegalizationPipeline;
+                         DenseMap<Value *, Value *> &)>;
+
+  SmallVector<LegalizationFnTy> LegalizationPipeline[NumStages];
 
   void initializeLegalizationPipeline() {
-    LegalizationPipeline.push_back(upcastI8AllocasAndUses);
-    LegalizationPipeline.push_back(fixI8UseChain);
-    LegalizationPipeline.push_back(downcastI64toI32InsertExtractElements);
-    LegalizationPipeline.push_back(legalizeFreeze);
-    LegalizationPipeline.push_back(legalizeMemCpy);
-    LegalizationPipeline.push_back(removeMemSet);
-    LegalizationPipeline.push_back(updateFnegToFsub);
+    LegalizationPipeline[Stage1].push_back(upcastI8AllocasAndUses);
+    LegalizationPipeline[Stage1].push_back(fixI8UseChain);
+    LegalizationPipeline[Stage1].push_back(legalizeGetHighLowi64Bytes);
+    LegalizationPipeline[Stage1].push_back(legalizeFreeze);
+    LegalizationPipeline[Stage1].push_back(legalizeMemCpy);
+    LegalizationPipeline[Stage1].push_back(removeMemSet);
+    LegalizationPipeline[Stage1].push_back(updateFnegToFsub);
+    // Note: legalizeGetHighLowi64Bytes and
+    // downcastI64toI32InsertExtractElements both modify extractelement, so
+    // they must run in separate stages. legalizeGetHighLowi64Bytes runs first
+    // because it removes extractelements, reducing the number that
+    // downcastI64toI32InsertExtractElements needs to handle.
+    LegalizationPipeline[Stage2].push_back(
+        downcastI64toI32InsertExtractElements);
   }
 };
 
diff --git a/llvm/test/CodeGen/DirectX/flatten-array.ll b/llvm/test/CodeGen/DirectX/flatten-array.ll
@@ -31,7 +31,7 @@ define void @alloca_4d_test ()  {
 ; CHECK-LABEL: gep_2d_test
 define void @gep_2d_test ()  {
     ; CHECK: [[a:%.*]] = alloca [9 x i32], align 4
-    ; CHECK-COUNT-9: getelementptr inbounds [9 x i32], ptr [[a]], i32 {{[0-8]}}
+    ; CHECK-COUNT-9: getelementptr inbounds [9 x i32], ptr [[a]], i32 0, i32 {{[0-8]}}
     ; CHECK-NEXT:    ret void
     %1 = alloca [3 x [3 x i32]], align 4
     %g2d0 = getelementptr inbounds [3 x [3 x i32]], [3 x [3 x i32]]* %1, i32 0, i32 0
@@ -53,7 +53,7 @@ define void @gep_2d_test ()  {
 ; CHECK-LABEL: gep_3d_test
 define void @gep_3d_test ()  {
     ; CHECK: [[a:%.*]] = alloca [8 x i32], align 4
-    ; CHECK-COUNT-8: getelementptr inbounds [8 x i32], ptr [[a]], i32 {{[0-7]}}
+    ; CHECK-COUNT-8: getelementptr inbounds [8 x i32], ptr [[a]], i32 0, i32 {{[0-7]}}
     ; CHECK-NEXT:    ret void
     %1 = alloca [2 x[2 x [2 x i32]]], align 4
     %g3d0 = getelementptr inbounds [2 x[2 x [2 x i32]]], [2 x[2 x [2 x i32]]]* %1, i32 0, i32 0
@@ -76,7 +76,7 @@ define void @gep_3d_test ()  {
 ; CHECK-LABEL: gep_4d_test
 define void @gep_4d_test ()  {
     ; CHECK: [[a:%.*]] = alloca [16 x i32], align 4
-    ; CHECK-COUNT-16: getelementptr inbounds [16 x i32], ptr [[a]], i32 {{[0-9]|1[0-5]}}
+    ; CHECK-COUNT-16: getelementptr inbounds [16 x i32], ptr [[a]], i32 0, i32 {{[0-9]|1[0-5]}}
     ; CHECK-NEXT:    ret void
     %1 = alloca [2x[2 x[2 x [2 x i32]]]], align 4
     %g4d0 = getelementptr inbounds [2x[2 x[2 x [2 x i32]]]], [2x[2 x[2 x [2 x i32]]]]* %1, i32 0, i32 0
@@ -123,8 +123,7 @@ define void @gep_4d_test ()  {
 @b = internal global [2 x [3 x [4 x i32]]] zeroinitializer, align 16
 
 define void @global_gep_load() {
-  ; CHECK: [[GEP_PTR:%.*]] = getelementptr inbounds [24 x i32], ptr @a.1dim, i32 6
-  ; CHECK: load i32, ptr [[GEP_PTR]], align 4
+  ; CHECK: load i32, ptr getelementptr inbounds ([24 x i32], ptr @a.1dim, i32 0, i32 6), align 4
   ; CHECK-NEXT:    ret void
   %1 = getelementptr inbounds [2 x [3 x [4 x i32]]], [2 x [3 x [4 x i32]]]* @a, i32 0, i32 0
   %2 = getelementptr inbounds [3 x [4 x i32]], [3 x [4 x i32]]* %1, i32 0, i32 1
@@ -142,7 +141,7 @@ define void @global_gep_load_index(i32 %row, i32 %col, i32 %timeIndex) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[ROW]], 12
 ; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[DOTFLAT:%.*]] = getelementptr inbounds [24 x i32], ptr @a.1dim, i32 [[TMP6]]
+; CHECK-NEXT:    [[DOTFLAT:%.*]] = getelementptr inbounds [24 x i32], ptr @a.1dim, i32 0, i32 [[TMP6]]
 ; CHECK-NOT: getelementptr inbounds [2 x [3 x [4 x i32]]]{{.*}}
 ; CHECK-NOT: getelementptr inbounds [3 x [4 x i32]]{{.*}}
 ; CHECK-NOT: getelementptr inbounds [4 x i32]{{.*}}
@@ -163,7 +162,7 @@ define void @global_incomplete_gep_chain(i32 %row, i32 %col) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i32 0, [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[ROW]], 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[DOTFLAT:%.*]] = getelementptr inbounds [24 x i32], ptr @a.1dim, i32 [[TMP4]]
+; CHECK-NEXT:    [[DOTFLAT:%.*]] = getelementptr inbounds [24 x i32], ptr @a.1dim, i32 0, i32 [[TMP4]]
 ; CHECK-NOT: getelementptr inbounds [2 x [3 x [4 x i32]]]{{.*}}
 ; CHECK-NOT: getelementptr inbounds [3 x [4 x i32]]{{.*}}
 ; CHECK-NOT: getelementptr inbounds [4 x i32]{{.*}}
@@ -177,8 +176,7 @@ define void @global_incomplete_gep_chain(i32 %row, i32 %col) {
 }
 
 define void @global_gep_store() {
-  ; CHECK: [[GEP_PTR:%.*]] = getelementptr inbounds [24 x i32], ptr @b.1dim, i32 13
-  ; CHECK:  store i32 1, ptr [[GEP_PTR]], align 4
+  ; CHECK: store i32 1, ptr getelementptr inbounds ([24 x i32], ptr @b.1dim, i32 0, i32 13), align 4
   ; CHECK-NEXT:    ret void
   %1 = getelementptr inbounds [2 x [3 x [4 x i32]]], [2 x [3 x [4 x i32]]]* @b, i32 0, i32 1
   %2 = getelementptr inbounds [3 x [4 x i32]], [3 x [4 x i32]]* %1, i32 0, i32 0
diff --git a/llvm/test/CodeGen/DirectX/flatten-bug-117273.ll b/llvm/test/CodeGen/DirectX/flatten-bug-117273.ll
@@ -8,9 +8,9 @@
 define internal void @main() {
 ; CHECK-LABEL: define internal void @main() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr [24 x float], ptr @ZerroInitArr.1dim, i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr [24 x float], ptr @ZerroInitArr.1dim, i32 0, i32 1
 ; CHECK-NEXT:    [[DOTI0:%.*]] = load float, ptr [[TMP0]], align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [24 x float], ptr @ZerroInitArr.1dim, i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [24 x float], ptr @ZerroInitArr.1dim, i32 0, i32 2
 ; CHECK-NEXT:    [[DOTI03:%.*]] = load float, ptr [[TMP1]], align 16
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/CodeGen/DirectX/legalize-i64-high-low-vec-split.ll b/llvm/test/CodeGen/DirectX/legalize-i64-high-low-vec-split.ll
@@ -0,0 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes='dxil-legalize' -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+
+define void @split_via_extract(i64 noundef %a) {
+; CHECK-LABEL: define void @split_via_extract(
+; CHECK-SAME: i64 noundef [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[A]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[A]], 32
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
+; CHECK-NEXT:    ret void
+;
+entry:
+  %vecA = bitcast i64 %a to <2 x i32>
+  %low = extractelement <2 x i32> %vecA, i32 0 ; low 32 bits
+  %high = extractelement <2 x i32> %vecA, i32 1 ; high 32 bits
+  ret void
+}
diff --git a/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll b/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll
diff --git a/llvm/test/CodeGen/DirectX/scalar-bug-117273.ll b/llvm/test/CodeGen/DirectX/scalar-bug-117273.ll