Skip to content

Commit 74389a0

Browse files
Merge branch 'main' into setrounding
2 parents e479ba7 + 6ff97d0 commit 74389a0

36 files changed

+3978
-7679
lines changed

clang/lib/Headers/cpuid.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,7 @@ static __inline int __get_cpuid_count (unsigned int __leaf,
348348
// In some cases, offloading will set the host as the aux triple and define the
349349
// builtin. Given __has_builtin does not detect builtins on aux triples, we need
350350
// to explicitly check for some offloading cases.
351-
#ifndef __NVPTX__
351+
#if !defined(__NVPTX__) && !defined(__AMDGPU__) && !defined(__SPIRV__)
352352
static __inline void __cpuidex(int __cpu_info[4], int __leaf, int __subleaf) {
353353
__cpuid_count(__leaf, __subleaf, __cpu_info[0], __cpu_info[1], __cpu_info[2],
354354
__cpu_info[3]);

clang/test/CodeGen/union-tbaa1.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,16 +19,16 @@ void bar(vect32 p[][2]);
1919
// CHECK-NEXT: [[MUL:%.*]] = mul i32 [[TMP1]], [[NUM]]
2020
// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x %union.vect32], ptr [[TMP]], i32 [[TMP0]]
2121
// CHECK-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA6:![0-9]+]]
22-
// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [2 x i32], ptr [[ARR]], i32 [[TMP0]], i32 1
22+
// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i32 4
2323
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4, !tbaa [[TBAA2]]
2424
// CHECK-NEXT: [[MUL6:%.*]] = mul i32 [[TMP2]], [[NUM]]
25-
// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [2 x %union.vect32], ptr [[TMP]], i32 [[TMP0]], i32 1
25+
// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i32 4
2626
// CHECK-NEXT: store i32 [[MUL6]], ptr [[ARRAYIDX8]], align 4, !tbaa [[TBAA6]]
2727
// CHECK-NEXT: [[TMP3:%.*]] = lshr i32 [[MUL]], 16
2828
// CHECK-NEXT: store i32 [[TMP3]], ptr [[VEC]], align 4, !tbaa [[TBAA2]]
2929
// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[INDEX]], align 4, !tbaa [[TBAA2]]
30-
// CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [2 x %union.vect32], ptr [[TMP]], i32 [[TMP4]], i32 1
31-
// CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX14]], i32 2
30+
// CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [2 x %union.vect32], ptr [[TMP]], i32 [[TMP4]]
31+
// CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX13]], i32 6
3232
// CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2, !tbaa [[TBAA6]]
3333
// CHECK-NEXT: [[CONV16:%.*]] = zext i16 [[TMP5]] to i32
3434
// CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i32 4

clang/test/Headers/__cpuidex_conflict.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66
// Ensure that we do not run into conflicts when offloading.
77
// RUN: %clang_cc1 %s -DIS_STATIC=static -ffreestanding -fopenmp -fopenmp-is-target-device -aux-triple x86_64-unknown-linux-gnu
88
// RUN: %clang_cc1 -DIS_STATIC="" -triple nvptx64-nvidia-cuda -aux-triple x86_64-unknown-linux-gnu -aux-target-cpu x86-64 -fcuda-is-device -x cuda %s -o -
9+
// RUN: %clang_cc1 -DIS_STATIC="" -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -aux-target-cpu x86-64 -fcuda-is-device -x cuda %s -o -
10+
// RUN: %clang_cc1 -DIS_STATIC="" -triple spirv64 -aux-triple x86_64-unknown-linux-gnu -aux-target-cpu x86-64 -fcuda-is-device -x cuda %s -o -
11+
// RUN: %clang_cc1 -DIS_STATIC="" -triple spirv64 -aux-triple x86_64-unknown-linux-gnu -aux-target-cpu x86-64 -fsycl-is-device %s -o -
912

1013
typedef __SIZE_TYPE__ size_t;
1114

llvm/lib/Analysis/ConstantFolding.cpp

100644100755
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1652,6 +1652,13 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
16521652
case Intrinsic::amdgcn_perm:
16531653
case Intrinsic::amdgcn_wave_reduce_umin:
16541654
case Intrinsic::amdgcn_wave_reduce_umax:
1655+
case Intrinsic::amdgcn_wave_reduce_max:
1656+
case Intrinsic::amdgcn_wave_reduce_min:
1657+
case Intrinsic::amdgcn_wave_reduce_add:
1658+
case Intrinsic::amdgcn_wave_reduce_sub:
1659+
case Intrinsic::amdgcn_wave_reduce_and:
1660+
case Intrinsic::amdgcn_wave_reduce_or:
1661+
case Intrinsic::amdgcn_wave_reduce_xor:
16551662
case Intrinsic::amdgcn_s_wqm:
16561663
case Intrinsic::amdgcn_s_quadmask:
16571664
case Intrinsic::amdgcn_s_bitreplicate:
@@ -3672,6 +3679,13 @@ static Constant *ConstantFoldIntrinsicCall2(Intrinsic::ID IntrinsicID, Type *Ty,
36723679
return ConstantInt::get(Ty, C0->abs());
36733680
case Intrinsic::amdgcn_wave_reduce_umin:
36743681
case Intrinsic::amdgcn_wave_reduce_umax:
3682+
case Intrinsic::amdgcn_wave_reduce_max:
3683+
case Intrinsic::amdgcn_wave_reduce_min:
3684+
case Intrinsic::amdgcn_wave_reduce_add:
3685+
case Intrinsic::amdgcn_wave_reduce_sub:
3686+
case Intrinsic::amdgcn_wave_reduce_and:
3687+
case Intrinsic::amdgcn_wave_reduce_or:
3688+
case Intrinsic::amdgcn_wave_reduce_xor:
36753689
return dyn_cast<Constant>(Operands[0]);
36763690
}
36773691

llvm/lib/Transforms/InstCombine/InstructionCombining.cpp

Lines changed: 39 additions & 110 deletions
Original file line numberDiff line numberDiff line change
@@ -2744,125 +2744,53 @@ Instruction *InstCombinerImpl::visitGEPOfGEP(GetElementPtrInst &GEP,
27442744
if (auto *I = combineConstantOffsets(GEP, *this))
27452745
return I;
27462746

2747-
// For constant GEPs, use a more general offset-based folding approach.
2748-
Type *PtrTy = Src->getType()->getScalarType();
2749-
if (GEP.hasAllConstantIndices() &&
2750-
(Src->hasOneUse() || Src->hasAllConstantIndices())) {
2751-
// Split Src into a variable part and a constant suffix.
2752-
gep_type_iterator GTI = gep_type_begin(*Src);
2753-
Type *BaseType = GTI.getIndexedType();
2754-
bool IsFirstType = true;
2755-
unsigned NumVarIndices = 0;
2756-
for (auto Pair : enumerate(Src->indices())) {
2757-
if (!isa<ConstantInt>(Pair.value())) {
2758-
BaseType = GTI.getIndexedType();
2759-
IsFirstType = false;
2760-
NumVarIndices = Pair.index() + 1;
2761-
}
2762-
++GTI;
2763-
}
2764-
2765-
// Determine the offset for the constant suffix of Src.
2766-
APInt Offset(DL.getIndexTypeSizeInBits(PtrTy), 0);
2767-
if (NumVarIndices != Src->getNumIndices()) {
2768-
// FIXME: getIndexedOffsetInType() does not handled scalable vectors.
2769-
if (BaseType->isScalableTy())
2770-
return nullptr;
2771-
2772-
SmallVector<Value *> ConstantIndices;
2773-
if (!IsFirstType)
2774-
ConstantIndices.push_back(
2775-
Constant::getNullValue(Type::getInt32Ty(GEP.getContext())));
2776-
append_range(ConstantIndices, drop_begin(Src->indices(), NumVarIndices));
2777-
Offset += DL.getIndexedOffsetInType(BaseType, ConstantIndices);
2778-
}
2779-
2780-
// Add the offset for GEP (which is fully constant).
2781-
if (!GEP.accumulateConstantOffset(DL, Offset))
2782-
return nullptr;
2783-
2784-
// Convert the total offset back into indices.
2785-
SmallVector<APInt> ConstIndices =
2786-
DL.getGEPIndicesForOffset(BaseType, Offset);
2787-
if (!Offset.isZero() || (!IsFirstType && !ConstIndices[0].isZero()))
2788-
return nullptr;
2789-
2790-
GEPNoWrapFlags NW = getMergedGEPNoWrapFlags(*Src, *cast<GEPOperator>(&GEP));
2791-
SmallVector<Value *> Indices(
2792-
drop_end(Src->indices(), Src->getNumIndices() - NumVarIndices));
2793-
for (const APInt &Idx : drop_begin(ConstIndices, !IsFirstType)) {
2794-
Indices.push_back(ConstantInt::get(GEP.getContext(), Idx));
2795-
// Even if the total offset is inbounds, we may end up representing it
2796-
// by first performing a larger negative offset, and then a smaller
2797-
// positive one. The large negative offset might go out of bounds. Only
2798-
// preserve inbounds if all signs are the same.
2799-
if (Idx.isNonNegative() != ConstIndices[0].isNonNegative())
2800-
NW = NW.withoutNoUnsignedSignedWrap();
2801-
if (!Idx.isNonNegative())
2802-
NW = NW.withoutNoUnsignedWrap();
2803-
}
2804-
2805-
return replaceInstUsesWith(
2806-
GEP, Builder.CreateGEP(Src->getSourceElementType(), Src->getOperand(0),
2807-
Indices, "", NW));
2808-
}
2809-
28102747
if (Src->getResultElementType() != GEP.getSourceElementType())
28112748
return nullptr;
28122749

2813-
SmallVector<Value*, 8> Indices;
2814-
28152750
// Find out whether the last index in the source GEP is a sequential idx.
28162751
bool EndsWithSequential = false;
28172752
for (gep_type_iterator I = gep_type_begin(*Src), E = gep_type_end(*Src);
28182753
I != E; ++I)
28192754
EndsWithSequential = I.isSequential();
2755+
if (!EndsWithSequential)
2756+
return nullptr;
28202757

2821-
// Can we combine the two pointer arithmetics offsets?
2822-
if (EndsWithSequential) {
2823-
// Replace: gep (gep %P, long B), long A, ...
2824-
// With: T = long A+B; gep %P, T, ...
2825-
Value *SO1 = Src->getOperand(Src->getNumOperands()-1);
2826-
Value *GO1 = GEP.getOperand(1);
2827-
2828-
// If they aren't the same type, then the input hasn't been processed
2829-
// by the loop above yet (which canonicalizes sequential index types to
2830-
// intptr_t). Just avoid transforming this until the input has been
2831-
// normalized.
2832-
if (SO1->getType() != GO1->getType())
2833-
return nullptr;
2758+
// Replace: gep (gep %P, long B), long A, ...
2759+
// With: T = long A+B; gep %P, T, ...
2760+
Value *SO1 = Src->getOperand(Src->getNumOperands() - 1);
2761+
Value *GO1 = GEP.getOperand(1);
28342762

2835-
Value *Sum =
2836-
simplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP));
2837-
// Only do the combine when we are sure the cost after the
2838-
// merge is never more than that before the merge.
2839-
if (Sum == nullptr)
2840-
return nullptr;
2763+
// If they aren't the same type, then the input hasn't been processed
2764+
// by the loop above yet (which canonicalizes sequential index types to
2765+
// intptr_t). Just avoid transforming this until the input has been
2766+
// normalized.
2767+
if (SO1->getType() != GO1->getType())
2768+
return nullptr;
28412769

2842-
Indices.append(Src->op_begin()+1, Src->op_end()-1);
2843-
Indices.push_back(Sum);
2844-
Indices.append(GEP.op_begin()+2, GEP.op_end());
2845-
} else if (isa<Constant>(*GEP.idx_begin()) &&
2846-
cast<Constant>(*GEP.idx_begin())->isNullValue() &&
2847-
Src->getNumOperands() != 1) {
2848-
// Otherwise we can do the fold if the first index of the GEP is a zero
2849-
Indices.append(Src->op_begin()+1, Src->op_end());
2850-
Indices.append(GEP.idx_begin()+1, GEP.idx_end());
2851-
}
2852-
2853-
// Don't create GEPs with more than one variable index.
2854-
unsigned NumVarIndices =
2855-
count_if(Indices, [](Value *Idx) { return !isa<Constant>(Idx); });
2856-
if (NumVarIndices > 1)
2770+
Value *Sum =
2771+
simplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP));
2772+
// Only do the combine when we are sure the cost after the
2773+
// merge is never more than that before the merge.
2774+
if (Sum == nullptr)
28572775
return nullptr;
28582776

2859-
if (!Indices.empty())
2860-
return replaceInstUsesWith(
2861-
GEP, Builder.CreateGEP(
2862-
Src->getSourceElementType(), Src->getOperand(0), Indices, "",
2863-
getMergedGEPNoWrapFlags(*Src, *cast<GEPOperator>(&GEP))));
2777+
SmallVector<Value *, 8> Indices;
2778+
Indices.append(Src->op_begin() + 1, Src->op_end() - 1);
2779+
Indices.push_back(Sum);
2780+
Indices.append(GEP.op_begin() + 2, GEP.op_end());
28642781

2865-
return nullptr;
2782+
// Don't create GEPs with more than one non-zero index.
2783+
unsigned NumNonZeroIndices = count_if(Indices, [](Value *Idx) {
2784+
auto *C = dyn_cast<Constant>(Idx);
2785+
return !C || !C->isNullValue();
2786+
});
2787+
if (NumNonZeroIndices > 1)
2788+
return nullptr;
2789+
2790+
return replaceInstUsesWith(
2791+
GEP, Builder.CreateGEP(
2792+
Src->getSourceElementType(), Src->getOperand(0), Indices, "",
2793+
getMergedGEPNoWrapFlags(*Src, *cast<GEPOperator>(&GEP))));
28662794
}
28672795

28682796
Value *InstCombiner::getFreelyInvertedImpl(Value *V, bool WillInvertAllUses,
@@ -3334,17 +3262,18 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
33343262
return replaceInstUsesWith(GEP, Res);
33353263
}
33363264

3337-
bool SeenVarIndex = false;
3265+
bool SeenNonZeroIndex = false;
33383266
for (auto [IdxNum, Idx] : enumerate(Indices)) {
3339-
if (isa<Constant>(Idx))
3267+
auto *C = dyn_cast<Constant>(Idx);
3268+
if (C && C->isNullValue())
33403269
continue;
33413270

3342-
if (!SeenVarIndex) {
3343-
SeenVarIndex = true;
3271+
if (!SeenNonZeroIndex) {
3272+
SeenNonZeroIndex = true;
33443273
continue;
33453274
}
33463275

3347-
// GEP has multiple variable indices: Split it.
3276+
// GEP has multiple non-zero indices: Split it.
33483277
ArrayRef<Value *> FrontIndices = ArrayRef(Indices).take_front(IdxNum);
33493278
Value *FrontGEP =
33503279
Builder.CreateGEP(GEPEltType, PtrOp, FrontIndices,

llvm/test/Analysis/BasicAA/featuretest.ll

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,12 +102,14 @@ define i32 @gep_distance_test(ptr %A) {
102102
; cannot alias, even if there is a variable offset between them...
103103
define i32 @gep_distance_test2(ptr %A, i64 %distance) {
104104
; NO_ASSUME-LABEL: @gep_distance_test2(
105-
; NO_ASSUME-NEXT: [[B:%.*]] = getelementptr { i32, i32 }, ptr [[A:%.*]], i64 [[DISTANCE:%.*]], i32 1
105+
; NO_ASSUME-NEXT: [[B_SPLIT:%.*]] = getelementptr { i32, i32 }, ptr [[A:%.*]], i64 [[DISTANCE:%.*]]
106+
; NO_ASSUME-NEXT: [[B:%.*]] = getelementptr i8, ptr [[B_SPLIT]], i64 4
106107
; NO_ASSUME-NEXT: store i32 7, ptr [[B]], align 4
107108
; NO_ASSUME-NEXT: ret i32 0
108109
;
109110
; USE_ASSUME-LABEL: @gep_distance_test2(
110-
; USE_ASSUME-NEXT: [[B:%.*]] = getelementptr { i32, i32 }, ptr [[A:%.*]], i64 [[DISTANCE:%.*]], i32 1
111+
; USE_ASSUME-NEXT: [[B_SPLIT:%.*]] = getelementptr { i32, i32 }, ptr [[A:%.*]], i64 [[DISTANCE:%.*]]
112+
; USE_ASSUME-NEXT: [[B:%.*]] = getelementptr i8, ptr [[B_SPLIT]], i64 4
111113
; USE_ASSUME-NEXT: store i32 7, ptr [[B]], align 4
112114
; USE_ASSUME-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[A]], i64 4), "nonnull"(ptr [[A]]), "align"(ptr [[A]], i64 4) ]
113115
; USE_ASSUME-NEXT: ret i32 0

0 commit comments

Comments (0)