llvm
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
Lines changed: 8 additions & 6 deletions
@@ -42365,21 +42365,23 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
     // If we're permuting the upper 256-bits subvectors of a concatenation, then
     // see if we can peek through and access the subvector directly.
     if (VT.is512BitVector()) {
-      // 512-bit mask uses 4 x i2 indices - if the msb is always set then only the
-      // upper subvector is used.
+      // 512-bit mask uses 4 x i2 indices - if the msb is always set then only
+      // the upper subvector is used.
       SDValue LHS = N->getOperand(0);
       SDValue RHS = N->getOperand(1);
       uint64_t Mask = N->getConstantOperandVal(2);
       SmallVector<SDValue> LHSOps, RHSOps;
       SDValue NewLHS, NewRHS;
-      if ((Mask & 0x0A) == 0x0A &&
-          collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) {
+      if (collectConcatOps(peekThroughBitcasts(LHS).getNode(), LHSOps, DAG) &&
+          LHSOps.size() == 2 && (Mask & 0x0A) == 0x0A) {
         NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512);
+        NewLHS = DAG.getBitcast(VT, NewLHS);
         Mask &= ~0x0A;
       }
-      if ((Mask & 0xA0) == 0xA0 &&
-          collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) {
+      if (collectConcatOps(peekThroughBitcasts(RHS).getNode(), RHSOps, DAG) &&
+          RHSOps.size() == 2 && (Mask & 0xA0) == 0xA0) {
         NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512);
+        NewRHS = DAG.getBitcast(VT, NewRHS);
         Mask &= ~0xA0;
       }
       if (NewLHS || NewRHS)