Merge branch 'main' into users/rovka/remove-dvgpr-target-features

rovka · web-flow · commit b8375c5824dc · 2025-09-26T10:03:13.000+02:00
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -2730,7 +2730,7 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
           HasVMemStore = true;
       }
       for (const MachineOperand &Op : MI.all_uses()) {
-        if (!TRI->isVectorRegister(*MRI, Op.getReg()))
+        if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg()))
           continue;
         RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, Op);
         // Vgpr use
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -5573,7 +5573,7 @@ static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
   llvm_unreachable("Unknown VFP cmp argument!");
 }
 
-/// OptimizeVFPBrcond - With nnan, it's legal to optimize some
+/// OptimizeVFPBrcond - With nnan and without daz, it's legal to optimize some
 /// f32 and even f64 comparisons to integer ones.
 SDValue
 ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
@@ -5729,9 +5729,9 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
   }
 
   SDNodeFlags Flags = Op->getFlags();
-  if ((getTargetMachine().Options.UnsafeFPMath || Flags.hasNoNaNs()) &&
-      (DAG.getDenormalMode(MVT::f32) == DenormalMode::getIEEE() &&
-       DAG.getDenormalMode(MVT::f64) == DenormalMode::getIEEE()) &&
+  if (Flags.hasNoNaNs() &&
+      DAG.getDenormalMode(MVT::f32) == DenormalMode::getIEEE() &&
+      DAG.getDenormalMode(MVT::f64) == DenormalMode::getIEEE() &&
       (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETNE ||
        CC == ISD::SETUNE)) {
     if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -222,8 +222,7 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
   // NEON f32 ops are non-IEEE 754 compliant. Darwin is ok with it by default.
   const FeatureBitset &Bits = getFeatureBits();
   if ((Bits[ARM::ProcA5] || Bits[ARM::ProcA8]) && // Where this matters
-      (Options.UnsafeFPMath || isTargetDarwin() ||
-       DM == DenormalMode::getPreserveSign()))
+      (isTargetDarwin() || DM == DenormalMode::getPreserveSign()))
     HasNEONForFP = true;
 
   if (isRWPI())
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -1603,7 +1603,7 @@ static SDValue lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(const SDLoc &DL,
 /// value is necessary in order to fit the above form.
 static SDValue
 lowerVECTOR_SHUFFLE_VREPLVEI(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
-                             SDValue V1, SDValue V2, SelectionDAG &DAG,
+                             SDValue V1, SelectionDAG &DAG,
                              const LoongArchSubtarget &Subtarget) {
   int SplatIndex = -1;
   for (const auto &M : Mask) {
@@ -1996,8 +1996,8 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
   SDValue Result;
   // TODO: Add more comparison patterns.
   if (V2.isUndef()) {
-    if ((Result = lowerVECTOR_SHUFFLE_VREPLVEI(DL, Mask, VT, V1, V2, DAG,
-                                               Subtarget)))
+    if ((Result =
+             lowerVECTOR_SHUFFLE_VREPLVEI(DL, Mask, VT, V1, DAG, Subtarget)))
       return Result;
     if ((Result =
              lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG, Subtarget)))
@@ -2053,7 +2053,7 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
 /// value is necessary in order to fit the above form.
 static SDValue
 lowerVECTOR_SHUFFLE_XVREPLVEI(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
-                              SDValue V1, SDValue V2, SelectionDAG &DAG,
+                              SDValue V1, SelectionDAG &DAG,
                               const LoongArchSubtarget &Subtarget) {
   int SplatIndex = -1;
   for (const auto &M : Mask) {
@@ -2096,10 +2096,29 @@ lowerVECTOR_SHUFFLE_XVSHUF4I(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
   return lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG, Subtarget);
 }
 
+/// Lower VECTOR_SHUFFLE of V1 into XVPERMI; returns SDValue() if not matched.
+static SDValue
+lowerVECTOR_SHUFFLE_XVPERMI(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+                            SDValue V1, SelectionDAG &DAG,
+                            const LoongArchSubtarget &Subtarget) {
+  // Only XVPERMI_D is handled: Mask must have 4 entries and VT be v4i64/v4f64.
+  if (Mask.size() != 4 || (VT != MVT::v4i64 && VT != MVT::v4f64))
+    return SDValue();
+  // Build the immediate: Mask[i] is OR'ed in at bit 2*i; -1 entries stay 0.
+  unsigned MaskImm = 0;
+  for (unsigned i = 0; i < Mask.size(); ++i) {
+    if (Mask[i] == -1)
+      continue;
+    MaskImm |= Mask[i] << (i * 2);
+  }
+  // NOTE(review): loop assumes non -1 entries are in [0, 3] -- confirm caller.
+  return DAG.getNode(LoongArchISD::XVPERMI, DL, VT, V1,
+                     DAG.getConstant(MaskImm, DL, Subtarget.getGRLenVT()));
+}
+
 /// Lower VECTOR_SHUFFLE into XVPERM (if possible).
 static SDValue lowerVECTOR_SHUFFLE_XVPERM(const SDLoc &DL, ArrayRef<int> Mask,
-                                          MVT VT, SDValue V1, SDValue V2,
-                                          SelectionDAG &DAG,
+                                          MVT VT, SDValue V1, SelectionDAG &DAG,
                                           const LoongArchSubtarget &Subtarget) {
   // LoongArch LASX only have XVPERM_W.
   if (Mask.size() != 8 || (VT != MVT::v8i32 && VT != MVT::v8f32))
@@ -2540,14 +2559,16 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
   SDValue Result;
   // TODO: Add more comparison patterns.
   if (V2.isUndef()) {
-    if ((Result = lowerVECTOR_SHUFFLE_XVREPLVEI(DL, Mask, VT, V1, V2, DAG,
-                                                Subtarget)))
+    if ((Result =
+             lowerVECTOR_SHUFFLE_XVREPLVEI(DL, Mask, VT, V1, DAG, Subtarget)))
       return Result;
     if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, Mask, VT, V1, V2, DAG,
                                                Subtarget)))
       return Result;
     if ((Result =
-             lowerVECTOR_SHUFFLE_XVPERM(DL, Mask, VT, V1, V2, DAG, Subtarget)))
+             lowerVECTOR_SHUFFLE_XVPERMI(DL, Mask, VT, V1, DAG, Subtarget)))
+      return Result;
+    if ((Result = lowerVECTOR_SHUFFLE_XVPERM(DL, Mask, VT, V1, DAG, Subtarget)))
       return Result;
 
     // TODO: This comment may be enabled in the future to better match the
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
@@ -1,4 +1,5 @@
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s -debugify-and-strip-all-safe | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX10 %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX12 %s
 
diff --git a/llvm/test/CodeGen/ARM/fnegs.ll b/llvm/test/CodeGen/ARM/fnegs.ll
@@ -10,11 +10,11 @@
 ; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - \
 ; RUN:  | FileCheck %s -check-prefix=CORTEXA8
 
-; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 --enable-unsafe-fp-math %s -o - \
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 --denormal-fp-math=preserve-sign %s -o - \
 ; RUN:  | FileCheck %s -check-prefix=CORTEXA8U
 
 ; RUN: llc -mtriple=arm-darwin -mcpu=cortex-a8 %s -o - \
-; RUN:  | FileCheck %s -check-prefix=CORTEXA8U
+; RUN:  | FileCheck %s -check-prefix=CORTEXA8U-DARWIN
 
 ; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - \
 ; RUN:  | FileCheck %s -check-prefix=CORTEXA9
@@ -41,7 +41,10 @@ entry:
 ; CORTEXA8: 	vneg.f32	s{{.*}}, s{{.*}}
 
 ; CORTEXA8U-LABEL: test1:
-; CORTEXA8U: 	vneg.f32	d{{.*}}, d{{.*}}
+; CORTEXA8U: 	vsub.f32	d{{.*}}, d{{.*}}, d{{.*}}
+
+; CORTEXA8U-DARWIN-LABEL: test1:
+; CORTEXA8U-DARWIN: 	vneg.f32	d{{.*}}, d{{.*}}
 
 ; CORTEXA9-LABEL: test1:
 ; CORTEXA9: 	vneg.f32	s{{.*}}, s{{.*}}
@@ -110,9 +113,13 @@ define <2 x float> @fneg_bitcast(i64 %i) {
 ; CORTEXA8-NOT:         vneg.f32
 
 ; CORTEXA8U-LABEL: fneg_bitcast:
-; CORTEXA8U-DAG: eor r0, r0, #-2147483648
-; CORTEXA8U-DAG: eor r1, r1, #-2147483648
-; CORTEXA8U-NOT:        vneg.f32
+; CORTEXA8U-DAG: vmov.i32	d{{.*}}, #0x80000000
+; CORTEXA8U-DAG: vsub.f32	d{{.*}}, d{{.*}}, d{{.*}}
+
+; CORTEXA8U-DARWIN-LABEL: fneg_bitcast:
+; CORTEXA8U-DARWIN-DAG: eor r0, r0, #-2147483648
+; CORTEXA8U-DARWIN-DAG: eor r1, r1, #-2147483648
+; CORTEXA8U-DARWIN-NOT:        vneg.f32
 
 ; CORTEXA9-LABEL: fneg_bitcast:
 ; CORTEXA9-DAG: eor r0, r0, #-2147483648
diff --git a/llvm/test/CodeGen/ARM/fnmscs.ll b/llvm/test/CodeGen/ARM/fnmscs.ll
@@ -13,11 +13,11 @@
 ; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 -regalloc=basic %s -o - \
 ; RUN:  | FileCheck %s -check-prefix=A8
 
-; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 --enable-unsafe-fp-math %s -o - \
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 --denormal-fp-math=preserve-sign %s -o - \
 ; RUN:  | FileCheck %s -check-prefix=A8U
 
 ; RUN: llc -mtriple=arm-darwin -mcpu=cortex-a8 %s -o - \
-; RUN:  | FileCheck %s -check-prefix=A8U
+; RUN:  | FileCheck %s -check-prefix=A8U-DARWIN
 
 define float @t1(float %acc, float %a, float %b) nounwind {
 entry:
@@ -31,15 +31,20 @@ entry:
 ; NEON: vnmla.f32
 
 ; A8U-LABEL: t1:
-; A8U: vnmul.f32 s{{[0-9]}}, s{{[0-9]}}, s{{[0-9]}}
-; A8U: vsub.f32 d{{[0-9]}}, d{{[0-9]}}, d{{[0-9]}}
+; A8U: vmov.i32	d{{[0-9]+}}, #0x80000000
+; A8U: vsub.f32	d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+; A8U: vsub.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+
+; A8U-DARWIN-LABEL: t1:
+; A8U-DARWIN: vnmul.f32 s{{[0-9]}}, s{{[0-9]}}, s{{[0-9]}}
+; A8U-DARWIN: vsub.f32 d{{[0-9]}}, d{{[0-9]}}, d{{[0-9]}}
 
 ; A8-LABEL: t1:
 ; A8: vnmul.f32 s{{[0-9]}}, s{{[0-9]}}, s{{[0-9]}}
 ; A8: vsub.f32 s{{[0-9]}}, s{{[0-9]}}, s{{[0-9]}}
 	%0 = fmul float %a, %b
 	%1 = fsub float -0.0, %0
-        %2 = fsub float %1, %acc
+	%2 = fsub float %1, %acc
 	ret float %2
 }
 
@@ -55,8 +60,13 @@ entry:
 ; NEON: vnmla.f32
 
 ; A8U-LABEL: t2:
-; A8U: vnmul.f32 s{{[01234]}}, s{{[01234]}}, s{{[01234]}}
-; A8U: vsub.f32 d{{[0-9]}}, d{{[0-9]}}, d{{[0-9]}}
+; A8U: vmov.i32	d{{[0-9]+}}, #0x80000000
+; A8U: vsub.f32	d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+; A8U: vsub.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+
+; A8U-DARWIN-LABEL: t2:
+; A8U-DARWIN: vnmul.f32 s{{[01234]}}, s{{[01234]}}, s{{[01234]}}
+; A8U-DARWIN: vsub.f32 d{{[0-9]}}, d{{[0-9]}}, d{{[0-9]}}
 
 ; A8-LABEL: t2:
 ; A8: vnmul.f32 s{{[01234]}}, s{{[01234]}}, s{{[01234]}}
@@ -79,8 +89,12 @@ entry:
 ; NEON: vnmla.f64
 
 ; A8U-LABEL: t3:
-; A8U: vnmul.f64 d
 ; A8U: vsub.f64 d
+; A8U: vsub.f64 d
+
+; A8U-DARWIN-LABEL: t3:
+; A8U-DARWIN: vnmul.f64 d
+; A8U-DARWIN: vsub.f64 d
 
 ; A8-LABEL: t3:
 ; A8: vnmul.f64 d
@@ -103,8 +117,12 @@ entry:
 ; NEON: vnmla.f64
 
 ; A8U-LABEL: t4:
-; A8U: vnmul.f64 d
 ; A8U: vsub.f64 d
+; A8U: vsub.f64 d
+
+; A8U-DARWIN-LABEL: t4:
+; A8U-DARWIN: vnmul.f64 d
+; A8U-DARWIN: vsub.f64 d
 
 ; A8-LABEL: t4:
 ; A8: vnmul.f64 d
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
@@ -7,13 +7,12 @@
 define <4 x double> @shufflevector_v4f64(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: shufflevector_v4f64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xvpickve.d $xr2, $xr1, 3
-; CHECK-NEXT:    xvpermi.d $xr3, $xr0, 238
-; CHECK-NEXT:    xvrepl128vei.d $xr3, $xr3, 1
-; CHECK-NEXT:    vextrins.d $vr3, $vr2, 16
+; CHECK-NEXT:    xvpermi.d $xr2, $xr0, 3
+; CHECK-NEXT:    xvpickve.d $xr3, $xr1, 3
+; CHECK-NEXT:    vextrins.d $vr2, $vr3, 16
 ; CHECK-NEXT:    xvpickve.d $xr1, $xr1, 2
 ; CHECK-NEXT:    vextrins.d $vr0, $vr1, 16
-; CHECK-NEXT:    xvpermi.q $xr0, $xr3, 2
+; CHECK-NEXT:    xvpermi.q $xr0, $xr2, 2
 ; CHECK-NEXT:    ret
 entry:
   %c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 6, i32 3, i32 7>
diff --git a/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll b/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll
@@ -6,11 +6,8 @@ define <32 x i8> @shuffle_v32i8(<32 x i8> %a) {
 ; CHECK-LABEL: shuffle_v32i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI0_0)
-; CHECK-NEXT:    xvld $xr2, $a0, %pc_lo12(.LCPI0_0)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI0_1)
-; CHECK-NEXT:    xvld $xr1, $a0, %pc_lo12(.LCPI0_1)
-; CHECK-NEXT:    xvpermi.d $xr3, $xr0, 78
-; CHECK-NEXT:    xvshuf.d $xr2, $xr0, $xr3
+; CHECK-NEXT:    xvld $xr1, $a0, %pc_lo12(.LCPI0_0)
+; CHECK-NEXT:    xvpermi.d $xr2, $xr0, 78
 ; CHECK-NEXT:    xvshuf.h $xr1, $xr2, $xr0
 ; CHECK-NEXT:    xvori.b $xr0, $xr1, 0
 ; CHECK-NEXT:    ret
@@ -34,11 +31,8 @@ define <16 x i16> @shuffle_v16i16(<16 x i16> %a) {
 ; CHECK-LABEL: shuffle_v16i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI2_0)
-; CHECK-NEXT:    xvld $xr2, $a0, %pc_lo12(.LCPI2_0)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI2_1)
-; CHECK-NEXT:    xvld $xr1, $a0, %pc_lo12(.LCPI2_1)
-; CHECK-NEXT:    xvpermi.d $xr3, $xr0, 78
-; CHECK-NEXT:    xvshuf.d $xr2, $xr0, $xr3
+; CHECK-NEXT:    xvld $xr1, $a0, %pc_lo12(.LCPI2_0)
+; CHECK-NEXT:    xvpermi.d $xr2, $xr0, 78
 ; CHECK-NEXT:    xvshuf.w $xr1, $xr2, $xr0
 ; CHECK-NEXT:    xvori.b $xr0, $xr1, 0
 ; CHECK-NEXT:    ret
@@ -72,10 +66,7 @@ define <8 x i32> @shuffle_v8i32(<8 x i32> %a) {
 define <8 x i32> @shuffle_v8i32_same_lane(<8 x i32> %a) {
 ; CHECK-LABEL: shuffle_v8i32_same_lane:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI5_0)
-; CHECK-NEXT:    xvld $xr1, $a0, %pc_lo12(.LCPI5_0)
-; CHECK-NEXT:    xvshuf.d $xr1, $xr0, $xr0
-; CHECK-NEXT:    xvori.b $xr0, $xr1, 0
+; CHECK-NEXT:    xvpermi.d $xr0, $xr0, 225
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> poison, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i32> %shuffle
@@ -84,14 +75,7 @@ define <8 x i32> @shuffle_v8i32_same_lane(<8 x i32> %a) {
 define <4 x i64> @shuffle_v4i64(<4 x i64> %a) {
 ; CHECK-LABEL: shuffle_v4i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI6_0)
-; CHECK-NEXT:    xvld $xr2, $a0, %pc_lo12(.LCPI6_0)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI6_1)
-; CHECK-NEXT:    xvld $xr1, $a0, %pc_lo12(.LCPI6_1)
-; CHECK-NEXT:    xvpermi.d $xr3, $xr0, 78
-; CHECK-NEXT:    xvshuf.d $xr2, $xr0, $xr3
-; CHECK-NEXT:    xvshuf.d $xr1, $xr2, $xr0
-; CHECK-NEXT:    xvori.b $xr0, $xr1, 0
+; CHECK-NEXT:    xvpermi.d $xr0, $xr0, 39
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> poison, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
   ret <4 x i64> %shuffle
@@ -100,10 +84,7 @@ define <4 x i64> @shuffle_v4i64(<4 x i64> %a) {
 define <4 x i64> @shuffle_v4i64_same_lane(<4 x i64> %a) {
 ; CHECK-LABEL: shuffle_v4i64_same_lane:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI7_0)
-; CHECK-NEXT:    xvld $xr1, $a0, %pc_lo12(.LCPI7_0)
-; CHECK-NEXT:    xvshuf.d $xr1, $xr0, $xr0
-; CHECK-NEXT:    xvori.b $xr0, $xr1, 0
+; CHECK-NEXT:    xvpermi.d $xr0, $xr0, 225
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> poison, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
   ret <4 x i64> %shuffle
@@ -136,14 +117,7 @@ define <8 x float> @shuffle_v8f32_same_lane(<8 x float> %a) {
 define <4 x double> @shuffle_v4f64(<4 x double> %a) {
 ; CHECK-LABEL: shuffle_v4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI10_0)
-; CHECK-NEXT:    xvld $xr2, $a0, %pc_lo12(.LCPI10_0)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI10_1)
-; CHECK-NEXT:    xvld $xr1, $a0, %pc_lo12(.LCPI10_1)
-; CHECK-NEXT:    xvpermi.d $xr3, $xr0, 78
-; CHECK-NEXT:    xvshuf.d $xr2, $xr0, $xr3
-; CHECK-NEXT:    xvshuf.d $xr1, $xr2, $xr0
-; CHECK-NEXT:    xvori.b $xr0, $xr1, 0
+; CHECK-NEXT:    xvpermi.d $xr0, $xr0, 39
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
   ret <4 x double> %shuffle
@@ -152,11 +126,7 @@ define <4 x double> @shuffle_v4f64(<4 x double> %a) {
 define <4 x double> @shuffle_v4f64_same_lane(<4 x double> %a) {
 ; CHECK-LABEL: shuffle_v4f64_same_lane:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI11_0)
-; CHECK-NEXT:    xvld $xr1, $a0, %pc_lo12(.LCPI11_0)
-; CHECK-NEXT:    xvpermi.d $xr0, $xr0, 78
-; CHECK-NEXT:    xvshuf.d $xr1, $xr0, $xr0
-; CHECK-NEXT:    xvori.b $xr0, $xr1, 0
+; CHECK-NEXT:    xvpermi.d $xr0, $xr0, 75
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
   ret <4 x double> %shuffle
diff --git a/llvm/test/CodeGen/LoongArch/lasx/vec-shuffle-byte-rotate.ll b/llvm/test/CodeGen/LoongArch/lasx/vec-shuffle-byte-rotate.ll
@@ -127,9 +127,7 @@ define <4 x i64> @byte_rotate_v4i64_2(<4 x i64> %a, <4 x i64> %b) nounwind {
 define <4 x i64> @byte_rotate_v4i64_3(<4 x i64> %a) nounwind {
 ; CHECK-LABEL: byte_rotate_v4i64_3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvbsrl.v $xr1, $xr0, 8
-; CHECK-NEXT:    xvbsll.v $xr0, $xr0, 8
-; CHECK-NEXT:    xvor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvpermi.d $xr0, $xr0, 177
 ; CHECK-NEXT:    ret
     %shuffle = shufflevector <4 x i64> %a, <4 x i64> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
     ret <4 x i64> %shuffle

Original file line number	Diff line number	Diff line change
`@@ -2730,7 +2730,7 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,`
`2730`	`2730`	`HasVMemStore = true;`
`2731`	`2731`	`}`
`2732`	`2732`	`for (const MachineOperand &Op : MI.all_uses()) {`
`2733`		`- if (!TRI->isVectorRegister(*MRI, Op.getReg()))`
	`2733`	`+ if (Op.isDebug() \|\| !TRI->isVectorRegister(*MRI, Op.getReg()))`
`2734`	`2734`	`continue;`
`2735`	`2735`	`RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, Op);`
`2736`	`2736`	`// Vgpr use`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s \| FileCheck -check-prefix=GFX9 %s`
	`2`	`+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s -debugify-and-strip-all-safe \| FileCheck -check-prefix=GFX9 %s`
`2`	`3`	`# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s \| FileCheck -check-prefix=GFX10 %s`
`3`	`4`	`# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s \| FileCheck -check-prefix=GFX12 %s`
`4`	`5`