Merge branch 'llvm:main' into r586519_bazel_fix

DeanSturtevant1 · web-flow · commit 6780385e06cf · 2025-08-14T17:55:12.000-04:00
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
@@ -653,29 +653,29 @@ attributes(device) real(8) function sinpi(x) bind(c,name='__nv_sinpi')
     end function
   end interface
 
-  interface __double2ull_rn
-    attributes(device) integer(8) function __double2ull_rn(r) bind(c)
+  interface __double2ull_rd
+    attributes(device) integer(8) function __double2ull_rd(r) bind(c, name='__nv_double2ull_rd')
       !dir$ ignore_tkr (d) r
       double precision, value :: r
     end function
   end interface
 
-  interface __double2ull_rz
-    attributes(device) integer(8) function __double2ull_rz(r) bind(c)
+  interface __double2ull_rn
+    attributes(device) integer(8) function __double2ull_rn(r) bind(c, name='__nv_double2ull_rn')
       !dir$ ignore_tkr (d) r
       double precision, value :: r
     end function
   end interface
 
   interface __double2ull_ru
-    attributes(device) integer(8) function __double2ull_ru(r) bind(c)
+    attributes(device) integer(8) function __double2ull_ru(r) bind(c, name='__nv_double2ull_ru')
       !dir$ ignore_tkr (d) r
       double precision, value :: r
     end function
   end interface
 
-  interface __double2ull_rd
-    attributes(device) integer(8) function __double2ull_rd(r) bind(c)
+  interface __double2ull_rz
+    attributes(device) integer(8) function __double2ull_rz(r) bind(c, name='__nv_double2ull_rz')
       !dir$ ignore_tkr (d) r
       double precision, value :: r
     end function
@@ -779,6 +779,34 @@ attributes(device) real(8) function sinpi(x) bind(c,name='__nv_sinpi')
     end function
   end interface
 
+  interface __drcp_rd
+    attributes(device) double precision function __drcp_rd(x) bind(c, name='__nv_drcp_rd')
+      !dir$ ignore_tkr (d) x
+      double precision, value :: x
+    end function
+  end interface
+
+  interface __drcp_rn
+    attributes(device) double precision function __drcp_rn(x) bind(c, name='__nv_drcp_rn')
+      !dir$ ignore_tkr (d) x
+      double precision, value :: x
+    end function
+  end interface
+
+  interface __drcp_ru
+    attributes(device) double precision function __drcp_ru(x) bind(c, name='__nv_drcp_ru')
+      !dir$ ignore_tkr (d) x
+      double precision, value :: x
+    end function
+  end interface
+
+  interface __drcp_rz
+    attributes(device) double precision function __drcp_rz(x) bind(c, name='__nv_drcp_rz')
+      !dir$ ignore_tkr (d) x
+      double precision, value :: x
+    end function
+  end interface
+
   interface __dsqrt_rd
     attributes(device) double precision function __dsqrt_rd(x) bind(c, name='__nv_dsqrt_rd')
       !dir$ ignore_tkr (d) x
diff --git a/flang/test/Lower/CUDA/cuda-libdevice.cuf b/flang/test/Lower/CUDA/cuda-libdevice.cuf
@@ -115,3 +115,35 @@ end subroutine
 ! CHECK: %{{.*}} = fir.call @__nv_double2ll_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
 ! CHECK: %{{.*}} = fir.call @__nv_double2ll_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
 ! CHECK: %{{.*}} = fir.call @__nv_double2ll_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
+
+
+attributes(global) subroutine test_drcp_rX()
+  double precision :: res
+  double precision :: r
+  res = __drcp_rd(r)
+  res = __drcp_rn(r)
+  res = __drcp_ru(r)
+  res = __drcp_rz(r)
+end subroutine
+
+! CHECK-LABEL: _QPtest_drcp_rx
+! CHECK: %{{.*}} = fir.call @__nv_drcp_rd(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f64
+! CHECK: %{{.*}} = fir.call @__nv_drcp_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f64
+! CHECK: %{{.*}} = fir.call @__nv_drcp_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f64
+! CHECK: %{{.*}} = fir.call @__nv_drcp_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f64
+
+attributes(global) subroutine test_double2ull_rX()
+  integer(8) :: res
+  double precision :: r
+  res = __double2ull_rd(r)
+  res = __double2ull_rn(r)
+  res = __double2ull_ru(r)
+  res = __double2ull_rz(r)
+end subroutine
+
+! CHECK-LABEL: _QPtest_double2ull_rx
+! CHECK: %{{.*}} = fir.call @__nv_double2ull_rd(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
+! CHECK: %{{.*}} = fir.call @__nv_double2ull_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
+! CHECK: %{{.*}} = fir.call @__nv_double2ull_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
+! CHECK: %{{.*}} = fir.call @__nv_double2ull_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
+
diff --git a/llvm/lib/Target/RISCV/RISCVMoveMerger.cpp b/llvm/lib/Target/RISCV/RISCVMoveMerger.cpp
@@ -137,6 +137,11 @@ RISCVMoveMerge::mergePairedInsns(MachineBasicBlock::iterator I,
     NextI = next_nodbg(NextI, E);
   DebugLoc DL = I->getDebugLoc();
 
+  // Make a copy so we can update the kill flag in the MoveFromAToS case. The
+  // copy must be declared outside the `if` below because Sreg1/Sreg2 may end
+  // up pointing at it.
+  MachineOperand PairedSource = *PairedRegs.Source;
+
   // The order of S-reg depends on which instruction holds A0, instead of
   // the order of register pair.
   // e,g.
@@ -147,8 +152,15 @@ RISCVMoveMerge::mergePairedInsns(MachineBasicBlock::iterator I,
   //   mv a1, s1    =>  cm.mva01s s2,s1
   bool StartWithX10 = ARegInFirstPair == RISCV::X10;
   if (isMoveFromAToS(Opcode)) {
-    Sreg1 = StartWithX10 ? FirstPair.Source : PairedRegs.Source;
-    Sreg2 = StartWithX10 ? PairedRegs.Source : FirstPair.Source;
+    // We are moving one of the copies earlier so its kill flag may become
+    // invalid. Clear the kill flag on our copy of the operand if the register
+    // is read anywhere between the new location and the old location.
+    for (auto It = std::next(I); It != Paired && PairedSource.isKill(); ++It)
+      if (It->readsRegister(PairedSource.getReg(), TRI))
+        PairedSource.setIsKill(false);
+
+    Sreg1 = StartWithX10 ? FirstPair.Source : &PairedSource;
+    Sreg2 = StartWithX10 ? &PairedSource : FirstPair.Source;
   } else {
     Sreg1 = StartWithX10 ? FirstPair.Destination : PairedRegs.Destination;
     Sreg2 = StartWithX10 ? PairedRegs.Destination : FirstPair.Destination;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -10564,41 +10564,35 @@ class InstructionsCompatibilityAnalysis {
   unsigned MainOpcode = 0;
   Instruction *MainOp = nullptr;
 
-  /// Checks if the opcode is supported as the main opcode for copyable
-  /// elements.
-  static bool isSupportedOpcode(const unsigned Opcode) {
-    return Opcode == Instruction::Add || Opcode == Instruction::LShr;
-  }
-
   /// Identifies the best candidate value, which represents main opcode
   /// operation.
   /// Currently the best candidate is the Add instruction with the parent
   /// block with the highest DFS incoming number (block, that dominates other).
   void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
     BasicBlock *Parent = nullptr;
     // Checks if the instruction has supported opcode.
-    auto IsSupportedInstruction = [&](Instruction *I) {
-      return I && isSupportedOpcode(I->getOpcode()) &&
+    auto IsSupportedOpcode = [&](Instruction *I) {
+      return I && I->getOpcode() == Instruction::Add &&
              (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
     };
     // Exclude operands instructions immediately to improve compile time, it
     // will be unable to schedule anyway.
     SmallDenseSet<Value *, 8> Operands;
-    SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
     for (Value *V : VL) {
       auto *I = dyn_cast<Instruction>(V);
       if (!I)
         continue;
       if (!DT.isReachableFromEntry(I->getParent()))
         continue;
-      if (Candidates.empty()) {
-        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
+      if (!MainOp) {
+        MainOp = I;
         Parent = I->getParent();
         Operands.insert(I->op_begin(), I->op_end());
         continue;
       }
       if (Parent == I->getParent()) {
-        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
+        if (!IsSupportedOpcode(MainOp) && !Operands.contains(I))
+          MainOp = I;
         Operands.insert(I->op_begin(), I->op_end());
         continue;
       }
@@ -10610,35 +10604,24 @@ class InstructionsCompatibilityAnalysis {
                  (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
              "Different nodes should have different DFS numbers");
       if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
-        Candidates.clear();
-        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
+        MainOp = I;
         Parent = I->getParent();
         Operands.clear();
         Operands.insert(I->op_begin(), I->op_end());
       }
     }
-    unsigned BestOpcodeNum = 0;
-    MainOp = nullptr;
-    for (const auto &P : Candidates) {
-      if (P.second.size() < BestOpcodeNum)
-        continue;
-      for (Instruction *I : P.second) {
-        if (IsSupportedInstruction(I) && !Operands.contains(I)) {
-          MainOp = I;
-          BestOpcodeNum = P.second.size();
-          break;
-        }
-      }
+    if (!IsSupportedOpcode(MainOp) || Operands.contains(MainOp)) {
+      MainOp = nullptr;
+      return;
     }
-    if (MainOp)
-      MainOpcode = MainOp->getOpcode();
+    MainOpcode = MainOp->getOpcode();
   }
 
   /// Returns the idempotent value for the \p MainOp with the detected \p
   /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
   /// the operand itself, since V or V == V.
   Value *selectBestIdempotentValue() const {
-    assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
+    assert(MainOpcode == Instruction::Add && "Unsupported opcode");
     return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
                                           !MainOp->isCommutative());
   }
@@ -10651,8 +10634,13 @@ class InstructionsCompatibilityAnalysis {
       return {V, V};
     if (!S.isCopyableElement(V))
       return convertTo(cast<Instruction>(V), S).second;
-    assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
-    return {V, selectBestIdempotentValue()};
+    switch (MainOpcode) {
+    case Instruction::Add:
+      return {V, selectBestIdempotentValue()};
+    default:
+      break;
+    }
+    llvm_unreachable("Unsupported opcode");
   }
 
   /// Builds operands for the original instructions.
@@ -10865,21 +10853,6 @@ class InstructionsCompatibilityAnalysis {
       }
       if (!Res)
         return InstructionsState::invalid();
-      constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
-      InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
-      InstructionCost VectorCost;
-      FixedVectorType *VecTy =
-          getWidenedType(S.getMainOp()->getType(), VL.size());
-      switch (MainOpcode) {
-      case Instruction::Add:
-      case Instruction::LShr:
-        VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
-        break;
-      default:
-        llvm_unreachable("Unexpected instruction.");
-      }
-      if (VectorCost > ScalarCost)
-        return InstructionsState::invalid();
       return S;
     }
     assert(Operands.size() == 2 && "Unexpected number of operands!");
@@ -21117,7 +21090,6 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
         ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
         const auto *It = find(Op, CD->getInst());
         assert(It != Op.end() && "Lane not set");
-        SmallPtrSet<Instruction *, 4> Visited;
         do {
           int Lane = std::distance(Op.begin(), It);
           assert(Lane >= 0 && "Lane not set");
@@ -21139,15 +21111,13 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
                   (InsertInReadyList && UseSD->isReady()))
                 WorkList.push_back(UseSD);
             }
-          } else if (Visited.insert(In).second) {
-            if (ScheduleData *UseSD = getScheduleData(In)) {
-              CD->incDependencies();
-              if (!UseSD->isScheduled())
-                CD->incrementUnscheduledDeps(1);
-              if (!UseSD->hasValidDependencies() ||
-                  (InsertInReadyList && UseSD->isReady()))
-                WorkList.push_back(UseSD);
-            }
+          } else if (ScheduleData *UseSD = getScheduleData(In)) {
+            CD->incDependencies();
+            if (!UseSD->isScheduled())
+              CD->incrementUnscheduledDeps(1);
+            if (!UseSD->hasValidDependencies() ||
+                (InsertInReadyList && UseSD->isReady()))
+              WorkList.push_back(UseSD);
           }
           It = find(make_range(std::next(It), Op.end()), CD->getInst());
         } while (It != Op.end());
@@ -21905,11 +21875,9 @@ bool BoUpSLP::collectValuesToDemote(
       return all_of(E.Scalars, [&](Value *V) {
         if (isa<PoisonValue>(V))
           return true;
-        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
-        if (E.isCopyableElement(V))
-          return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
         auto *I = cast<Instruction>(V);
         KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
+        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
         return AmtKnownBits.getMaxValue().ult(BitWidth) &&
                MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                  SimplifyQuery(*DL));
diff --git a/llvm/test/CodeGen/RISCV/pr153598.mir b/llvm/test/CodeGen/RISCV/pr153598.mir
@@ -0,0 +1,23 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=riscv32 -mattr=+zcmp -run-pass=riscv-move-merge -verify-machineinstrs %s -o - | FileCheck %s
+---
+name: mov-merge
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x8, $x9
+    ; CHECK-LABEL: name: mov-merge
+    ; CHECK: liveins: $x8, $x9
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $x12 = ADDI $x0, -3
+    ; CHECK-NEXT: SW renamable $x9, $x2, 56
+    ; CHECK-NEXT: CM_MVA01S killed renamable $x9, renamable $x8, implicit-def $x10, implicit-def $x11
+    ; CHECK-NEXT: SW renamable $x8, $x2, 60
+    ; CHECK-NEXT: PseudoRET
+    $x12 = ADDI $x0, -3
+    SW renamable $x9, $x2, 56
+    $x10 = ADDI killed renamable $x9, 0
+    SW renamable $x8, $x2, 60
+    $x11 = ADDI killed renamable $x8, 0
+    PseudoRET
+...
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll
@@ -8,8 +8,11 @@ define i32 @test(ptr %c) {
 ; CHECK-NEXT:    [[BITLEN:%.*]] = getelementptr i8, ptr [[C]], i64 136
 ; CHECK-NEXT:    [[INCDEC_PTR_3_1:%.*]] = getelementptr i8, ptr [[C]], i64 115
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[BITLEN]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 0, i32 0, i32 0>
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr <8 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 0, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr <6 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 0, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <6 x i64> [[TMP2]], <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 4, i32 5>
 ; CHECK-NEXT:    [[TMP6:%.*]] = trunc <8 x i64> [[TMP5]] to <8 x i8>
 ; CHECK-NEXT:    store <8 x i8> [[TMP6]], ptr [[INCDEC_PTR_3_1]], align 1
 ; CHECK-NEXT:    ret i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
@@ -101,8 +101,10 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture readonly dereferenceable(16)
 define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) {
 ; CHECK-LABEL: @PR16739_byval(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = lshr <4 x i64> [[TMP2]], <i64 0, i64 32, i64 0, i64 0>
+; CHECK-NEXT:    [[T1:%.*]] = load i64, ptr [[X]], align 16
+; CHECK-NEXT:    [[T8:%.*]] = lshr i64 [[T1]], 32
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
 ; CHECK-NEXT:    ret <4 x float> [[TMP5]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
@@ -101,8 +101,10 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture readonly dereferenceable(16)
 define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) {
 ; CHECK-LABEL: @PR16739_byval(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = lshr <4 x i64> [[TMP2]], <i64 0, i64 32, i64 0, i64 0>
+; CHECK-NEXT:    [[T1:%.*]] = load i64, ptr [[X]], align 16
+; CHECK-NEXT:    [[T8:%.*]] = lshr i64 [[T1]], 32
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
 ; CHECK-NEXT:    ret <4 x float> [[TMP5]]