From 374fffe015b48676fb3785167d363454a2f4dd1f Mon Sep 17 00:00:00 2001 From: Alexander Kornienko Date: Wed, 9 Oct 2024 14:15:06 +0200 Subject: [PATCH 001/119] Fix out-of-bounds access to std::unique_ptr (#111581) This manifested as an assertion failure in Clang built against libc++ with hardening enabled (e.g. -D_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_DEBUG): `libcxx/include/__memory/unique_ptr.h:596: assertion __checker_.__in_bounds(std::__to_address(__ptr_), __i) failed: unique_ptr::operator[](index): index out of range` --- clang/lib/Frontend/TextDiagnostic.cpp | 6 +++--- clang/test/Frontend/highlight-text.c | 27 +++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 3 deletions(-) create mode 100644 clang/test/Frontend/highlight-text.c diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp index a264836a54398..4119ce6048d45 100644 --- a/clang/lib/Frontend/TextDiagnostic.cpp +++ b/clang/lib/Frontend/TextDiagnostic.cpp @@ -1252,10 +1252,10 @@ highlightLines(StringRef FileData, unsigned StartLineNumber, for (unsigned I = 0; I <= Spelling.size(); ++I) { // This line is done. if (I == Spelling.size() || isVerticalWhitespace(Spelling[I])) { - SmallVector &LineRanges = - SnippetRanges[L - StartLineNumber]; - if (L >= StartLineNumber) { + SmallVector &LineRanges = + SnippetRanges[L - StartLineNumber]; + if (L == TokenStartLine) // First line appendStyle(LineRanges, T, StartCol, LineLength); else if (L == TokenEndLine) // Last line diff --git a/clang/test/Frontend/highlight-text.c b/clang/test/Frontend/highlight-text.c new file mode 100644 index 0000000000000..a81d26caa4c24 --- /dev/null +++ b/clang/test/Frontend/highlight-text.c @@ -0,0 +1,27 @@ +// RUN: %clang_cc1 -fsyntax-only %s 2> %t +// RUN: FileCheck < %t %s +#define F (1 << 99) + +#define M \ +F | F + +int a = M; +// CHECK: :8:9: warning: shift count >= width of type [-Wshift-count-overflow] +// CHECK-NEXT: 8 | int a = M; +// CHECK-NEXT: | ^ +// CHECK-NEXT: :5:11: note: expanded from macro 'M' +// CHECK-NEXT: 5 | #define M \ +// CHECK-NEXT: | ^ +// CHECK-NEXT: :3:14: note: expanded from macro '\ +// CHECK-NEXT: F' +// CHECK-NEXT: 3 | #define F (1 << 99) +// CHECK-NEXT: | ^ ~~ +// CHECK-NEXT: :8:9: warning: shift count >= width of type [-Wshift-count-overflow] +// CHECK-NEXT: 8 | int a = M; +// CHECK-NEXT: | ^ +// CHECK-NEXT: :6:5: note: expanded from macro 'M' +// CHECK-NEXT: 6 | F | F +// CHECK-NEXT: | ^ +// CHECK-NEXT: :3:14: note: expanded from macro 'F' +// CHECK-NEXT: 3 | #define F (1 << 99) +// CHECK-NEXT: | ^ ~~ From 01cbbc52dc95fe5d9e30ecbfb2ec8c7c1e0199c4 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 9 Oct 2024 13:18:53 +0100 Subject: [PATCH 002/119] [VPlan] Request lane 0 for pointer arg in PtrAdd. After 7f74651, the pointer operand may be replicated of a PtrAdd. Instead of requesting a single scalar, request lane 0, which correctly handles the case when there is a scalar-per-lane. Fixes https://github.com/llvm/llvm-project/issues/111606. 
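
For reference, a reduced form of the reproducer exercised by the new test below (interleave-ptradd-with-replicated-operand.ll, run under `opt -p loop-vectorize -force-vector-width=4`); this is copied from that test, not new material. When the loop is vectorized and interleaved for this target, the pointer induction %ptr.iv is scalarized to one GEP per lane (see the NEXT_GEP chain in the CHECK lines), which is the scalar-per-lane case the PtrAdd lowering must now handle:

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define ptr @test_interleave_ptradd_with_replicated_op(ptr %m) #0 {
entry:
  br label %loop

loop:
  ; pointer induction; scalarized per lane when the loop is interleaved
  %ptr.iv = phi ptr [ %m, %entry ], [ %ptr.iv.next, %loop ]
  %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ]
  %ptr.iv.next = getelementptr i8, ptr %ptr.iv, i64 8
  %p.4 = getelementptr i8, ptr %ptr.iv, i64 4
  %l.1 = load i32, ptr %p.4, align 4
  %p.0 = getelementptr i8, ptr %ptr.iv, i64 0
  %l.2 = load i32, ptr %p.0, align 4
  %add = add i32 %l.1, %l.2
  store i32 %add, ptr %ptr.iv, align 4
  %iv.next = add i32 %iv, 1
  %tobool.not = icmp eq i32 %iv, 100
  br i1 %tobool.not, label %exit, label %loop

exit:
  ret ptr %p.4
}

attributes #0 = { "target-cpu"="znver2" }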
--- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 4 +- ...terleave-ptradd-with-replicated-operand.ll | 172 ++++++++++++++++++ 2 files changed, 174 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 123eb0d8d71ab..2468616be0bd7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -645,8 +645,8 @@ Value *VPInstruction::generate(VPTransformState &State) { case VPInstruction::PtrAdd: { assert(vputils::onlyFirstLaneUsed(this) && "can only generate first lane for PtrAdd"); - Value *Ptr = State.get(getOperand(0), /* IsScalar */ true); - Value *Addend = State.get(getOperand(1), /* IsScalar */ true); + Value *Ptr = State.get(getOperand(0), VPLane(0)); + Value *Addend = State.get(getOperand(1), VPLane(0)); return isInBounds() ? Builder.CreateInBoundsPtrAdd(Ptr, Addend, Name) : Builder.CreatePtrAdd(Ptr, Addend, Name); } diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll new file mode 100644 index 0000000000000..cdc7839bfc0f0 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll @@ -0,0 +1,172 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-vectorize -force-vector-width=4 -S %s | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Test for https://github.com/llvm/llvm-project/issues/111606. 
+define ptr @test_interleave_ptradd_with_replicated_op(ptr %m) #0 { +; CHECK-LABEL: define ptr @test_interleave_ptradd_with_replicated_op( +; CHECK-SAME: ptr [[M:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[M]], i64 768 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 8 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 16 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 24 +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 32 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 40 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 48 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 56 +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 64 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 72 +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 80 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 88 +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 96 +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 104 +; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 112 +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 120 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP0]] +; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP1]] +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP2]] +; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP3]] +; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP4]] +; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP5]] +; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP6]] +; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP7]] +; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP8]] +; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP9]] +; CHECK-NEXT: [[NEXT_GEP11:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP10]] +; CHECK-NEXT: [[NEXT_GEP12:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP11]] +; CHECK-NEXT: [[NEXT_GEP13:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP12]] +; CHECK-NEXT: [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP13]] +; CHECK-NEXT: [[NEXT_GEP15:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP14]] +; CHECK-NEXT: [[NEXT_GEP16:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 4 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 4 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 4 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[NEXT_GEP4]], i64 4 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 4 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[NEXT_GEP6]], i64 4 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[NEXT_GEP7]], i64 4 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[NEXT_GEP8]], i64 4 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[NEXT_GEP9]], i64 4 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[NEXT_GEP10]], i64 4 +; CHECK-NEXT: [[TMP26:%.*]] = 
getelementptr i8, ptr [[NEXT_GEP11]], i64 4 +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[NEXT_GEP12]], i64 4 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[NEXT_GEP13]], i64 4 +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[NEXT_GEP14]], i64 4 +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[NEXT_GEP15]], i64 4 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[NEXT_GEP16]], i64 4 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[TMP16]], i32 -4 +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP20]], i32 -4 +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP24]], i32 -4 +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr [[TMP28]], i32 -4 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP32]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC17:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[WIDE_VEC18:%.*]] = load <8 x i32>, ptr [[TMP33]], align 4 +; CHECK-NEXT: [[STRIDED_VEC19:%.*]] = shufflevector <8 x i32> [[WIDE_VEC18]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC20:%.*]] = shufflevector <8 x i32> [[WIDE_VEC18]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[WIDE_VEC21:%.*]] = load <8 x i32>, ptr [[TMP34]], align 4 +; CHECK-NEXT: [[STRIDED_VEC22:%.*]] = shufflevector <8 x i32> [[WIDE_VEC21]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC23:%.*]] = shufflevector <8 x i32> [[WIDE_VEC21]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[WIDE_VEC24:%.*]] = load <8 x i32>, ptr [[TMP35]], align 4 +; CHECK-NEXT: [[STRIDED_VEC25:%.*]] = shufflevector <8 x i32> [[WIDE_VEC24]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC26:%.*]] = shufflevector <8 x i32> [[WIDE_VEC24]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = add <4 x i32> [[STRIDED_VEC17]], [[STRIDED_VEC]] +; CHECK-NEXT: [[TMP37:%.*]] = add <4 x i32> [[STRIDED_VEC20]], [[STRIDED_VEC19]] +; CHECK-NEXT: [[TMP38:%.*]] = add <4 x i32> [[STRIDED_VEC23]], [[STRIDED_VEC22]] +; CHECK-NEXT: [[TMP39:%.*]] = add <4 x i32> [[STRIDED_VEC26]], [[STRIDED_VEC25]] +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[TMP36]], i32 0 +; CHECK-NEXT: store i32 [[TMP40]], ptr [[NEXT_GEP]], align 4 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i32> [[TMP36]], i32 1 +; CHECK-NEXT: store i32 [[TMP41]], ptr [[NEXT_GEP2]], align 4 +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i32> [[TMP36]], i32 2 +; CHECK-NEXT: store i32 [[TMP42]], ptr [[NEXT_GEP3]], align 4 +; CHECK-NEXT: [[TMP43:%.*]] = extractelement <4 x i32> [[TMP36]], i32 3 +; CHECK-NEXT: store i32 [[TMP43]], ptr [[NEXT_GEP4]], align 4 +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <4 x i32> [[TMP37]], i32 0 +; CHECK-NEXT: store i32 [[TMP44]], ptr [[NEXT_GEP5]], align 4 +; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i32> [[TMP37]], i32 1 +; CHECK-NEXT: store i32 [[TMP45]], ptr [[NEXT_GEP6]], align 4 +; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i32> [[TMP37]], i32 2 +; CHECK-NEXT: store i32 [[TMP46]], ptr [[NEXT_GEP7]], align 4 +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <4 x i32> [[TMP37]], i32 3 +; CHECK-NEXT: store i32 [[TMP47]], ptr [[NEXT_GEP8]], align 4 +; CHECK-NEXT: [[TMP48:%.*]] = extractelement <4 x i32> [[TMP38]], i32 0 +; CHECK-NEXT: store i32 [[TMP48]], ptr [[NEXT_GEP9]], align 4 +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i32> [[TMP38]], i32 1 +; CHECK-NEXT: store i32 [[TMP49]], ptr [[NEXT_GEP10]], align 4 +; CHECK-NEXT: 
[[TMP50:%.*]] = extractelement <4 x i32> [[TMP38]], i32 2 +; CHECK-NEXT: store i32 [[TMP50]], ptr [[NEXT_GEP11]], align 4 +; CHECK-NEXT: [[TMP51:%.*]] = extractelement <4 x i32> [[TMP38]], i32 3 +; CHECK-NEXT: store i32 [[TMP51]], ptr [[NEXT_GEP12]], align 4 +; CHECK-NEXT: [[TMP52:%.*]] = extractelement <4 x i32> [[TMP39]], i32 0 +; CHECK-NEXT: store i32 [[TMP52]], ptr [[NEXT_GEP13]], align 4 +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i32> [[TMP39]], i32 1 +; CHECK-NEXT: store i32 [[TMP53]], ptr [[NEXT_GEP14]], align 4 +; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i32> [[TMP39]], i32 2 +; CHECK-NEXT: store i32 [[TMP54]], ptr [[NEXT_GEP15]], align 4 +; CHECK-NEXT: [[TMP55:%.*]] = extractelement <4 x i32> [[TMP39]], i32 3 +; CHECK-NEXT: store i32 [[TMP55]], ptr [[NEXT_GEP16]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP56:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 +; CHECK-NEXT: br i1 [[TMP56]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[M]], %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 97, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 8 +; CHECK-NEXT: [[P_4:%.*]] = getelementptr i8, ptr [[PTR_IV]], i64 4 +; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[P_4]], align 4 +; CHECK-NEXT: [[P_0:%.*]] = getelementptr i8, ptr [[PTR_IV]], i64 0 +; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[P_0]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[L_1]], [[L_2]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[PTR_IV]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[IV]], 100 +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[P_4_LCSSA:%.*]] = phi ptr [ [[P_4]], %[[LOOP]] ], [ [[TMP31]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret ptr [[P_4_LCSSA]] +; +entry: + br label %loop + +loop: ; preds = %loop, %entry + %ptr.iv = phi ptr [ %m, %entry ], [ %ptr.iv.next, %loop ] + %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ] + %ptr.iv.next = getelementptr i8, ptr %ptr.iv, i64 8 + %p.4 = getelementptr i8, ptr %ptr.iv, i64 4 + %l.1 = load i32, ptr %p.4, align 4 + %p.0 = getelementptr i8, ptr %ptr.iv, i64 0 + %l.2 = load i32, ptr %p.0, align 4 + %add = add i32 %l.1, %l.2 + store i32 %add, ptr %ptr.iv, align 4 + %iv.next = add i32 %iv, 1 + %tobool.not = icmp eq i32 %iv, 100 + br i1 %tobool.not, label %exit, label %loop + +exit: ; preds = %loop + ret ptr %p.4 +} + +attributes #0 = { "target-cpu"="znver2" } +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +;. 
From 25c3ecf28f0a3a404305b5eefac23baf7e4e0754 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 9 Oct 2024 12:22:50 +0100 Subject: [PATCH 003/119] [X86] Add isConstantPowerOf2 helper to replace repeated code. NFC. Prep work for #110875 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 50 ++++++++++++------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index d4ab0491e7d6b..fd8291bfaea7c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -5190,6 +5190,21 @@ static bool getTargetShuffleMaskIndices(SDValue MaskNode, return true; } +static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts, + bool AllowUndefs) { + APInt UndefElts; + SmallVector EltBits; + if (!getTargetConstantBitsFromNode(V, EltSizeInBIts, UndefElts, EltBits, + /*AllowWholeUndefs*/ AllowUndefs, + /*AllowPartialUndefs*/ false)) + return false; + + bool IsPow2OrUndef = true; + for (unsigned I = 0, E = EltBits.size(); I != E; ++I) + IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2(); + return IsPow2OrUndef; +} + // Match not(xor X, -1) -> X. // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1). // Match not(extract_subvector(xor X, -1)) -> extract_subvector(X). @@ -23600,17 +23615,11 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, // Revert part of the simplifySetCCWithAnd combine, to avoid an invert. if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) { SDValue BC0 = peekThroughBitcasts(Op0); - if (BC0.getOpcode() == ISD::AND) { - APInt UndefElts; - SmallVector EltBits; - if (getTargetConstantBitsFromNode( - BC0.getOperand(1), VT.getScalarSizeInBits(), UndefElts, EltBits, - /*AllowWholeUndefs*/ false, /*AllowPartialUndefs*/ false)) { - if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) { - Cond = ISD::SETEQ; - Op1 = DAG.getBitcast(VT, BC0.getOperand(1)); - } - } + if (BC0.getOpcode() == ISD::AND && + isConstantPowerOf2(BC0.getOperand(1), VT.getScalarSizeInBits(), + /*AllowUndefs=*/false)) { + Cond = ISD::SETEQ; + Op1 = DAG.getBitcast(VT, BC0.getOperand(1)); } } @@ -51224,20 +51233,11 @@ static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT, if (Opc == ISD::XOR && N0.getOpcode() == X86ISD::PCMPEQ && N0.getOperand(0).getOpcode() == ISD::AND && ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) && - ISD::isBuildVectorAllOnes(N1.getNode())) { - APInt UndefElts; - SmallVector EltBits; - if (getTargetConstantBitsFromNode(N0.getOperand(0).getOperand(1), - VT.getScalarSizeInBits(), UndefElts, - EltBits)) { - bool IsPow2OrUndef = true; - for (unsigned I = 0, E = EltBits.size(); I != E; ++I) - IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2(); - - if (IsPow2OrUndef) - return DAG.getNode(X86ISD::PCMPEQ, DL, VT, N0.getOperand(0), - N0.getOperand(0).getOperand(1)); - } + ISD::isBuildVectorAllOnes(N1.getNode()) && + isConstantPowerOf2(N0.getOperand(0).getOperand(1), + VT.getScalarSizeInBits(), /*AllowUndefs=*/true)) { + return DAG.getNode(X86ISD::PCMPEQ, DL, VT, N0.getOperand(0), + N0.getOperand(0).getOperand(1)); } return SDValue(); From e17f701f559fc637b41c27ea240568bb33b56d1f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 9 Oct 2024 12:39:25 +0100 Subject: [PATCH 004/119] [X86] vselect-pcmp.ll - regenerate test checks with vpternlog comments --- llvm/test/CodeGen/X86/vselect-pcmp.ll | 28 +++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git 
a/llvm/test/CodeGen/X86/vselect-pcmp.ll b/llvm/test/CodeGen/X86/vselect-pcmp.ll index 84317ad34fb29..545184e03e4f3 100644 --- a/llvm/test/CodeGen/X86/vselect-pcmp.ll +++ b/llvm/test/CodeGen/X86/vselect-pcmp.ll @@ -41,7 +41,7 @@ define <8 x i16> @signbit_sel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2 -; AVX512VL-NEXT: vpternlogq $226, %xmm1, %xmm2, %xmm0 +; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm0 = xmm1 ^ (xmm2 & (xmm0 ^ xmm1)) ; AVX512VL-NEXT: retq ; ; XOP-LABEL: signbit_sel_v8i16: @@ -263,7 +263,7 @@ define <16 x i16> @signbit_sel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> % ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2 -; AVX512VL-NEXT: vpternlogq $226, %ymm1, %ymm2, %ymm0 +; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1)) ; AVX512VL-NEXT: retq ; ; XOP-LABEL: signbit_sel_v16i16: @@ -615,7 +615,7 @@ define <16 x i16> @blend_splat1_mask_cond_v16i16(<16 x i16> %x, <16 x i16> %y, < ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm0 +; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm1 ^ ymm2)) ; AVX512VL-NEXT: retq ; ; XOP-LABEL: blend_splat1_mask_cond_v16i16: @@ -654,7 +654,7 @@ define <16 x i8> @blend_splat1_mask_cond_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $202, %xmm2, %xmm1, %xmm0 +; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm0 = xmm2 ^ (xmm0 & (xmm1 ^ xmm2)) ; AVX512VL-NEXT: retq ; ; XOP-LABEL: blend_splat1_mask_cond_v16i8: @@ -759,7 +759,7 @@ define <8 x i16> @blend_splatmax_mask_cond_v8i16(<8 x i16> %x, <8 x i16> %y, <8 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $202, %xmm2, %xmm1, %xmm0 +; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm0 = xmm2 ^ (xmm0 & (xmm1 ^ xmm2)) ; AVX512VL-NEXT: retq ; ; XOP-LABEL: blend_splatmax_mask_cond_v8i16: @@ -806,7 +806,7 @@ define <32 x i8> @blend_splatmax_mask_cond_v32i8(<32 x i8> %x, <32 x i8> %y, <32 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm0 +; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm1 ^ ymm2)) ; AVX512VL-NEXT: retq ; ; XOP-LABEL: blend_splatmax_mask_cond_v32i8: @@ -944,7 +944,7 @@ define <16 x i16> @blend_splat_mask_cond_v16i16(<16 x i16> %x, <16 x i16> %y, <1 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm0 +; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm1 ^ ymm2)) ; AVX512VL-NEXT: retq ; ; XOP-LABEL: blend_splat_mask_cond_v16i16: @@ -983,7 +983,7 @@ define <16 x i8> @blend_splat_mask_cond_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: 
vpcmpeqb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $202, %xmm2, %xmm1, %xmm0 +; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm0 = xmm2 ^ (xmm0 & (xmm1 ^ xmm2)) ; AVX512VL-NEXT: retq ; ; XOP-LABEL: blend_splat_mask_cond_v16i8: @@ -1107,7 +1107,7 @@ define <8 x i16> @blend_mask_cond_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %z ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $202, %xmm2, %xmm1, %xmm0 +; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm0 = xmm2 ^ (xmm0 & (xmm1 ^ xmm2)) ; AVX512VL-NEXT: retq ; ; XOP-LABEL: blend_mask_cond_v8i16: @@ -1145,7 +1145,7 @@ define <16 x i8> @blend_mask_cond_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %z ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $202, %xmm2, %xmm1, %xmm0 +; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm0 = xmm2 ^ (xmm0 & (xmm1 ^ xmm2)) ; AVX512VL-NEXT: retq ; ; XOP-LABEL: blend_mask_cond_v16i8: @@ -1290,7 +1290,7 @@ define <16 x i16> @blend_mask_cond_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i1 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm0 +; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm1 ^ ymm2)) ; AVX512VL-NEXT: retq ; ; XOP-LABEL: blend_mask_cond_v16i16: @@ -1345,7 +1345,7 @@ define <32 x i8> @blend_mask_cond_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %z ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm0 +; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm1 ^ ymm2)) ; AVX512VL-NEXT: retq ; ; XOP-LABEL: blend_mask_cond_v32i8: @@ -1548,7 +1548,7 @@ define void @store_blend_load_v16i16(ptr %a0, ptr %a1, ptr %a2) { ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-NEXT: vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 ; AVX512VL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm1 -; AVX512VL-NEXT: vpternlogq $202, (%rsi), %ymm0, %ymm1 +; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm1 = mem ^ (ymm1 & (ymm0 ^ mem)) ; AVX512VL-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -1617,7 +1617,7 @@ define void @store_blend_load_v32i8(ptr %a0, ptr %a1, ptr %a2) { ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 ; AVX512VL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 -; AVX512VL-NEXT: vpternlogq $202, (%rsi), %ymm0, %ymm1 +; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm1 = mem ^ (ymm1 & (ymm0 ^ mem)) ; AVX512VL-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq From 4b4078a5cf95ad4a5e18704d9b88129c3a0bcb88 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 9 Oct 2024 13:29:56 +0100 Subject: [PATCH 005/119] [X86] Add test coverage for #110875 --- llvm/test/CodeGen/X86/vselect-pcmp.ll | 125 ++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/llvm/test/CodeGen/X86/vselect-pcmp.ll b/llvm/test/CodeGen/X86/vselect-pcmp.ll index 545184e03e4f3..1cf59ea2ab7ad 100644 --- a/llvm/test/CodeGen/X86/vselect-pcmp.ll +++ b/llvm/test/CodeGen/X86/vselect-pcmp.ll @@ -1698,6 +1698,131 @@ define void @PR46531(ptr %x, ptr %y, ptr %z) { ret void } 
+define <64 x i8> @PR110875(<32 x i8> %a0, <32 x i8> %a1, i64 %a2) { +; AVX1-LABEL: PR110875: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovq %rdi, %xmm2 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,5,5] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,6,7,7] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,0,1,1,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,3,3,4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,1,1,4,4,5,5] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpeqb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqb %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpcmpeqb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20] +; AVX1-NEXT: vandnps %ymm4, %ymm2, %ymm5 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vorps %ymm5, %ymm0, %ymm0 +; AVX1-NEXT: vandnps %ymm4, %ymm3, %ymm2 +; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: PR110875: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq %rdi, %xmm2 +; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23] +; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: PR110875: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-NEXT: vmovq %rdi, %xmm0 +; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23] +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: 
vpternlogd {{.*#+}} zmm0 = mem ^ (zmm0 & (zmm1 ^ mem)) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: PR110875: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512VL-NEXT: vpbroadcastq %rdi, %ymm0 +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23] +; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 = mem ^ (zmm0 & (zmm1 ^ mem)) +; AVX512VL-NEXT: retq +; +; XOP-LABEL: PR110875: +; XOP: # %bb.0: +; XOP-NEXT: vmovq %rdi, %xmm2 +; XOP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; XOP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,5,5] +; XOP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,6,7,7] +; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; XOP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7] +; XOP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,0,1,1,4,5,6,7] +; XOP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,3,3,4,5,6,7] +; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; XOP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,1,1,4,4,5,5] +; XOP-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; XOP-NEXT: vandps %ymm4, %ymm2, %ymm2 +; XOP-NEXT: vandps %ymm4, %ymm3, %ymm3 +; XOP-NEXT: vextractf128 $1, %ymm3, %xmm4 +; XOP-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; XOP-NEXT: vpcomeqb %xmm5, %xmm4, %xmm4 +; XOP-NEXT: vpcomeqb %xmm5, %xmm3, %xmm3 +; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; XOP-NEXT: vextractf128 $1, %ymm2, %xmm4 +; XOP-NEXT: vpcomeqb %xmm5, %xmm4, %xmm4 +; XOP-NEXT: vpcomeqb %xmm5, %xmm2, %xmm2 +; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; XOP-NEXT: vbroadcastss {{.*#+}} ymm4 = [20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20] +; XOP-NEXT: vpcmov %ymm2, %ymm4, %ymm0, %ymm0 +; XOP-NEXT: vpcmov %ymm3, %ymm4, %ymm1, %ymm1 +; XOP-NEXT: retq + %concat = shufflevector <32 x i8> %a0, <32 x i8> %a1, <64 x i32> + %scl = insertelement <1 x i64> poison, i64 %a2, i64 0 + %splat = shufflevector <1 x i64> %scl, <1 x i64> poison, <8 x i32> + %ref = bitcast <8 x i64> %splat to <64 x i8> + %shuf = shufflevector <64 x i8> %ref, <64 x i8> poison, <64 x i32> + %mask = and <64 x i8> %shuf, + %cmp = icmp eq <64 x i8> %mask, zeroinitializer + %res = select <64 x i1> %cmp, <64 x i8> %concat, <64 x i8> + ret <64 x i8> %res +} + attributes #0 = { "no-nans-fp-math"="true" } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; AVX: {{.*}} From 8e2ccdc4deedd463a20237b4d842b4c51f9fe603 Mon Sep 17 00:00:00 2001 From: Tobias Gysi Date: Wed, 9 Oct 2024 14:37:01 +0200 Subject: [PATCH 006/119] [MLIR][LLVM] Use ViewLikeOpInterface (#111663) This commit adds the ViewLikeOpInterface to the GEP and AddrSpaceCast operations. This allows us to simplify the inliner interface. 
At the same time, the change also makes the inliner interface more extensible for downstream users that have custom view-like operations. --- mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h | 1 + mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 8 ++++++-- mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 4 ++++ .../Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp | 10 +++++----- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h index 9341a5a11cd62..d236cae0d8088 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h @@ -29,6 +29,7 @@ #include "mlir/Interfaces/FunctionInterfaces.h" #include "mlir/Interfaces/InferTypeOpInterface.h" #include "mlir/Interfaces/SideEffectInterfaces.h" +#include "mlir/Interfaces/ViewLikeInterface.h" #include "mlir/Support/ThreadLocalCache.h" #include "llvm/ADT/PointerEmbeddedInt.h" #include "llvm/IR/DerivedTypes.h" diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index 88e82ce48959b..825b48fb3d4a3 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -24,6 +24,7 @@ include "mlir/Interfaces/ControlFlowInterfaces.td" include "mlir/Interfaces/InferTypeOpInterface.td" include "mlir/Interfaces/MemorySlotInterfaces.td" include "mlir/Interfaces/SideEffectInterfaces.td" +include "mlir/Interfaces/ViewLikeInterface.td" class LLVM_Builder { string llvmBuilder = builder; @@ -246,7 +247,9 @@ def LLVM_AllocaOp : LLVM_Op<"alloca", def LLVM_GEPOp : LLVM_Op<"getelementptr", [Pure, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods]> { + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ]> { let arguments = (ins LLVM_ScalarOrVectorOf:$base, Variadic>:$dynamicIndices, DenseI32ArrayAttr:$rawConstantIndices, @@ -495,7 +498,8 @@ def LLVM_BitcastOp : LLVM_CastOp<"bitcast", "BitCast", LLVM_AnyNonAggregate, def LLVM_AddrSpaceCastOp : LLVM_CastOp<"addrspacecast", "AddrSpaceCast", LLVM_ScalarOrVectorOf, LLVM_ScalarOrVectorOf, - [DeclareOpInterfaceMethods]> { + [DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods]> { let hasFolder = 1; } def LLVM_IntToPtrOp : LLVM_CastOp<"inttoptr", "IntToPtr", diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index fb7024a14f8d4..9e361848f8c0b 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -3226,6 +3226,8 @@ OpFoldResult LLVM::AddrSpaceCastOp::fold(FoldAdaptor adaptor) { return foldChainableCast(*this, adaptor); } +Value LLVM::AddrSpaceCastOp::getViewSource() { return getArg(); } + //===----------------------------------------------------------------------===// // Folder for LLVM::GEPOp //===----------------------------------------------------------------------===// @@ -3276,6 +3278,8 @@ OpFoldResult LLVM::GEPOp::fold(FoldAdaptor adaptor) { return {}; } +Value LLVM::GEPOp::getViewSource() { return getBase(); } + //===----------------------------------------------------------------------===// // ShlOp //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp index 031930dcfc213..fe002359c2022 100644 --- a/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp +++ 
b/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp @@ -16,6 +16,7 @@ #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/IR/Matchers.h" #include "mlir/Interfaces/DataLayoutInterfaces.h" +#include "mlir/Interfaces/ViewLikeInterface.h" #include "mlir/Transforms/InliningUtils.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/Support/Debug.h" @@ -229,11 +230,10 @@ static FailureOr> getUnderlyingObjectSet(Value pointerValue) { SmallVector result; WalkContinuation walkResult = walkSlice(pointerValue, [&](Value val) { - if (auto gepOp = val.getDefiningOp()) - return WalkContinuation::advanceTo(gepOp.getBase()); - - if (auto addrCast = val.getDefiningOp()) - return WalkContinuation::advanceTo(addrCast.getOperand()); + // Attempt to advance to the source of the underlying view-like operation. + // Examples of view-like operations include GEPOp and AddrSpaceCastOp. + if (auto viewOp = val.getDefiningOp()) + return WalkContinuation::advanceTo(viewOp.getViewSource()); // Attempt to advance to control flow predecessors. std::optional> controlFlowPredecessors = From 3b2bfb48239e674805a9c4e65be3c3a9eeabdf9c Mon Sep 17 00:00:00 2001 From: roderickzzc <32364736+Zhang-Zecheng@users.noreply.github.com> Date: Wed, 9 Oct 2024 05:37:16 -0700 Subject: [PATCH 007/119] [mlir] add missing CMake dependency on ShardingInterface generated headers for LinalgDialect (#111603) This fixes non-deterministic build failures. Fixes https://github.com/llvm/llvm-project/issues/111527 --------- Co-authored-by: zecheng.zhang Co-authored-by: Mehdi Amini --- mlir/lib/Dialect/Linalg/IR/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt b/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt index f1fcb22fb8d54..ce8dc6ccb0fa3 100644 --- a/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt @@ -13,6 +13,7 @@ add_mlir_dialect_library(MLIRLinalgDialect MLIRLinalgOpsEnumsIncGen MLIRLinalgOpsIncGen MLIRLinalgStructuredOpsIncGen + MLIRShardingInterfaceIncGen LINK_LIBS PUBLIC MLIRAffineDialect From 3b7091bcf3b48b63724500d821dc7a0ce8ffa3c9 Mon Sep 17 00:00:00 2001 From: Ariel-Burton Date: Wed, 9 Oct 2024 08:37:40 -0400 Subject: [PATCH 008/119] [APFloat] add predicates to fltSemantics for hasZero and hasSignedRepr (#111451) We add static methods to APFloatBase to allow the hasZero and hasSignedRepr properties of fltSemantics to be obtained. --- llvm/include/llvm/ADT/APFloat.h | 2 ++ llvm/lib/Support/APFloat.cpp | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h index c3bbd9d83a0ec..6f1e24e5da33a 100644 --- a/llvm/include/llvm/ADT/APFloat.h +++ b/llvm/include/llvm/ADT/APFloat.h @@ -309,6 +309,8 @@ struct APFloatBase { static ExponentType semanticsMaxExponent(const fltSemantics &); static unsigned int semanticsSizeInBits(const fltSemantics &); static unsigned int semanticsIntSizeInBits(const fltSemantics&, bool); + static bool semanticsHasZero(const fltSemantics &); + static bool semanticsHasSignedRepr(const fltSemantics &); // Returns true if any number described by \p Src can be precisely represented // by a normal (not subnormal) value in \p Dst. 
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp index d1b3c936dc589..a33b6c4a6ddc6 100644 --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -367,6 +367,14 @@ unsigned int APFloatBase::semanticsIntSizeInBits(const fltSemantics &semantics, return MinBitWidth; } +bool APFloatBase::semanticsHasZero(const fltSemantics &semantics) { + return semantics.hasZero; +} + +bool APFloatBase::semanticsHasSignedRepr(const fltSemantics &semantics) { + return semantics.hasSignedRepr; +} + bool APFloatBase::isRepresentableAsNormalIn(const fltSemantics &Src, const fltSemantics &Dst) { // Exponent range must be larger. From 890e481358d6cb4e81629742eda32f9a1d6785d2 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 9 Oct 2024 16:38:58 +0400 Subject: [PATCH 009/119] AMDGPU: Regenerate test checks --- .../CodeGen/AMDGPU/gep-flags-stack-offsets.ll | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll b/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll index 782894976c711..b5f0b2ff9ef4c 100644 --- a/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll @@ -9,7 +9,7 @@ define void @gep_noflags_alloca(i32 %idx, i32 %val) #0 { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -36,7 +36,7 @@ define void @gep_inbounds_alloca(i32 %idx, i32 %val) #0 { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -63,7 +63,7 @@ define void @gep_nuw_alloca(i32 %idx, i32 %val) #0 { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -90,7 +90,7 @@ define void @gep_nusw_alloca(i32 %idx, i32 %val) #0 { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -117,7 +117,7 @@ define void @gep_inbounds_nuw_alloca(i32 %idx, i32 %val) #0 { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -144,7 +144,7 @@ define void @gep_nusw_nuw_alloca(i32 %idx, i32 %val) #0 { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, 
v0 ; GFX8-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -172,7 +172,7 @@ define void @gep_inbounds_nuw_alloca_nonpow2_scale(i32 %idx, i32 %val) #0 { ; GFX8-NEXT: s_movk_i32 s4, 0x84 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, s4 ; GFX8-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -184,7 +184,7 @@ define void @gep_inbounds_nuw_alloca_nonpow2_scale(i32 %idx, i32 %val) #0 { ; GFX9-NEXT: s_movk_i32 s4, 0x84 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s4 ; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX9-NEXT: v_add_u32_e32 v0, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] From 21da4e7f51c7adfd0b1c5defc8bd0d16ea1ce759 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 9 Oct 2024 08:46:59 -0400 Subject: [PATCH 010/119] [libc++] Fix broken configuration system-libcxxabi on Apple (#110920) On Apple platforms, using system-libcxxabi as an ABI library wouldn't work because we'd try to re-export symbols from libc++abi that the system libc++abi.dylib might not have. Instead, only re-export those symbols when we're using the in-tree libc++abi. This does mean that libc++.dylib won't re-export any libc++abi symbols when building against the system libc++abi, which could be fixed in various ways. However, the best solution really depends on the intended use case, so this patch doesn't try to solve that problem. As a drive-by, also improve the diagnostic message when the user forgets to set the LIBCXX_CXX_ABI_INCLUDE_PATHS variable, which would previously lead to a confusing error. 
Closes #104672 --- libcxx/cmake/Modules/HandleLibCXXABI.cmake | 25 +++++++++++++++++++++- libcxx/src/CMakeLists.txt | 8 ++----- libcxxabi/src/CMakeLists.txt | 2 ++ 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/libcxx/cmake/Modules/HandleLibCXXABI.cmake b/libcxx/cmake/Modules/HandleLibCXXABI.cmake index 34e9a672a960f..52236f473f35d 100644 --- a/libcxx/cmake/Modules/HandleLibCXXABI.cmake +++ b/libcxx/cmake/Modules/HandleLibCXXABI.cmake @@ -83,6 +83,10 @@ endfunction() # Link against a system-provided libstdc++ if ("${LIBCXX_CXX_ABI}" STREQUAL "libstdc++") + if(NOT LIBCXX_CXX_ABI_INCLUDE_PATHS) + message(FATAL_ERROR "LIBCXX_CXX_ABI_INCLUDE_PATHS must be set when selecting libstdc++ as an ABI library") + endif() + add_library(libcxx-abi-headers INTERFACE) import_private_headers(libcxx-abi-headers "${LIBCXX_CXX_ABI_INCLUDE_PATHS}" "cxxabi.h;bits/c++config.h;bits/os_defines.h;bits/cpu_defines.h;bits/cxxabi_tweaks.h;bits/cxxabi_forced.h") @@ -96,6 +100,10 @@ if ("${LIBCXX_CXX_ABI}" STREQUAL "libstdc++") # Link against a system-provided libsupc++ elseif ("${LIBCXX_CXX_ABI}" STREQUAL "libsupc++") + if(NOT LIBCXX_CXX_ABI_INCLUDE_PATHS) + message(FATAL_ERROR "LIBCXX_CXX_ABI_INCLUDE_PATHS must be set when selecting libsupc++ as an ABI library") + endif() + add_library(libcxx-abi-headers INTERFACE) import_private_headers(libcxx-abi-headers "${LIBCXX_CXX_ABI_INCLUDE_PATHS}" "cxxabi.h;bits/c++config.h;bits/os_defines.h;bits/cpu_defines.h;bits/cxxabi_tweaks.h;bits/cxxabi_forced.h") @@ -114,7 +122,18 @@ elseif ("${LIBCXX_CXX_ABI}" STREQUAL "libcxxabi") target_compile_definitions(libcxx-abi-headers INTERFACE "-DLIBCXX_BUILDING_LIBCXXABI") if (TARGET cxxabi_shared) - add_library(libcxx-abi-shared ALIAS cxxabi_shared) + add_library(libcxx-abi-shared INTERFACE) + target_link_libraries(libcxx-abi-shared INTERFACE cxxabi_shared) + + # When using the in-tree libc++abi as an ABI library, libc++ re-exports the + # libc++abi symbols (on platforms where it can) because libc++abi is only an + # implementation detail of libc++. + target_link_libraries(libcxx-abi-shared INTERFACE cxxabi-reexports) + + # Populate the OUTPUT_NAME property of libcxx-abi-shared because that is used when + # generating a linker script. + get_target_property(_output_name cxxabi_shared OUTPUT_NAME) + set_target_properties(libcxx-abi-shared PROPERTIES "OUTPUT_NAME" "${_output_name}") endif() if (TARGET cxxabi_static) @@ -131,6 +150,10 @@ elseif ("${LIBCXX_CXX_ABI}" STREQUAL "libcxxabi") # Link against a system-provided libc++abi elseif ("${LIBCXX_CXX_ABI}" STREQUAL "system-libcxxabi") + if(NOT LIBCXX_CXX_ABI_INCLUDE_PATHS) + message(FATAL_ERROR "LIBCXX_CXX_ABI_INCLUDE_PATHS must be set when selecting system-libcxxabi as an ABI library") + endif() + add_library(libcxx-abi-headers INTERFACE) import_private_headers(libcxx-abi-headers "${LIBCXX_CXX_ABI_INCLUDE_PATHS}" "cxxabi.h;__cxxabi_config.h") target_compile_definitions(libcxx-abi-headers INTERFACE "-DLIBCXX_BUILDING_LIBCXXABI") diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt index 806e341ba3b72..b187677ff2db5 100644 --- a/libcxx/src/CMakeLists.txt +++ b/libcxx/src/CMakeLists.txt @@ -210,14 +210,10 @@ if (LIBCXX_ENABLE_SHARED) target_link_libraries(cxx_shared PUBLIC libcxx-abi-shared) endif() - # Maybe re-export symbols from libc++abi - # In particular, we don't re-export the symbols if libc++abi is merged statically - # into libc++ because in that case there's no dylib to re-export from. + # Maybe force some symbols to be weak, not weak or not exported. 
+ # TODO: This shouldn't depend on the platform, and ideally it should be done in the sources. if (APPLE AND LIBCXX_CXX_ABI MATCHES "libcxxabi$" AND NOT LIBCXX_STATICALLY_LINK_ABI_IN_SHARED_LIBRARY) - target_link_libraries(cxx_shared PRIVATE cxxabi-reexports) - - # TODO: These exports controls should not be tied to whether we re-export libc++abi symbols target_link_libraries(cxx_shared PRIVATE "-Wl,-force_symbols_not_weak_list,${CMAKE_CURRENT_SOURCE_DIR}/../lib/notweak.exp" "-Wl,-force_symbols_weak_list,${CMAKE_CURRENT_SOURCE_DIR}/../lib/weak.exp") diff --git a/libcxxabi/src/CMakeLists.txt b/libcxxabi/src/CMakeLists.txt index 6f16c614212ef..480e528b819bb 100644 --- a/libcxxabi/src/CMakeLists.txt +++ b/libcxxabi/src/CMakeLists.txt @@ -213,6 +213,8 @@ if (LIBCXXABI_ENABLE_SHARED) list(APPEND LIBCXXABI_INSTALL_TARGETS "cxxabi_shared") endif() + # TODO: Move this to libc++'s HandleLibCXXABI.cmake since this is effectively trying to control + # what libc++ re-exports. add_library(cxxabi-reexports INTERFACE) function(export_symbols file) # -exported_symbols_list is only available on Apple platforms From 32db6fbdb9a8173813e67606b87555c31ea557bb Mon Sep 17 00:00:00 2001 From: Kunwar Grover Date: Wed, 9 Oct 2024 13:50:33 +0100 Subject: [PATCH 011/119] [mlir][vector] Implement speculation for vector.transferx ops (#111533) This patch implements speculation for vector.transfer_read/vector.transfer_write ops, allowing these ops to work with LICM. --- .../mlir/Dialect/Vector/IR/VectorOps.td | 2 + mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 12 ++ .../loop-invariant-code-motion.mlir | 107 ++++++++++++++++++ 3 files changed, 121 insertions(+) diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td index 45fd1c6e3f938..b0de7c11b9d43 100644 --- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td +++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td @@ -1240,6 +1240,7 @@ def Vector_TransferReadOp : DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, AttrSizedOperandSegments, DestinationStyleOpInterface ]>, @@ -1487,6 +1488,7 @@ def Vector_TransferWriteOp : DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, AttrSizedOperandSegments, DestinationStyleOpInterface ]>, diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index dc92bea09dc16..1718530b4aa16 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -4245,6 +4245,12 @@ void TransferReadOp::getEffects( SideEffects::DefaultResource::get()); } +Speculation::Speculatability TransferReadOp::getSpeculatability() { + if (hasPureTensorSemantics()) + return Speculation::Speculatable; + return Speculation::NotSpeculatable; +} + namespace { /// Store to load forwarding for transfer operations with permuation maps. 
/// Even if the permutation maps are different we can still propagate the store @@ -4627,6 +4633,12 @@ void TransferWriteOp::getEffects( SideEffects::DefaultResource::get()); } +Speculation::Speculatability TransferWriteOp::getSpeculatability() { + if (hasPureTensorSemantics()) + return Speculation::Speculatable; + return Speculation::NotSpeculatable; +} + namespace { /// Remove dead transfer write from the SSA chain so that it an be eliminated by /// DCE diff --git a/mlir/test/Transforms/loop-invariant-code-motion.mlir b/mlir/test/Transforms/loop-invariant-code-motion.mlir index 57f4ece9c9f2a..e4c423ce7052b 100644 --- a/mlir/test/Transforms/loop-invariant-code-motion.mlir +++ b/mlir/test/Transforms/loop-invariant-code-motion.mlir @@ -1209,3 +1209,110 @@ func.func @hoist_linalg_ops_div_by_zero(%a : tensor<128x128xi32>, func.return %final : tensor } + +// ----- + +// CHECK-LABEL: func @hoist_vector_transfer_ops +// CHECK: vector.transfer_read +// CHECK: scf.for +// CHECK-NOT: vector.transfer_read +// CHECK: arith.addf +// CHECK: scf.yield +func.func @hoist_vector_transfer_ops( + %a : tensor<128x128xf32>, + %lb : index, + %ub : index, + %step : index, + %ida : index, + %idb : index) -> vector<4x4xf32> { + %cst_0 = arith.constant 0.0 : f32 + %cst = arith.constant dense<0.0> : vector<4x4xf32> + %final = + scf.for %i = %lb to %ub step %step iter_args(%acc = %cst) -> vector<4x4xf32> { + %read = vector.transfer_read %a[%ida, %idb], %cst_0 : tensor<128x128xf32>, vector<4x4xf32> + %out = arith.addf %read, %acc : vector<4x4xf32> + scf.yield %out : vector<4x4xf32> + } + func.return %final : vector<4x4xf32> +} + +// ----- + +// CHECK-LABEL: func @hoist_vector_transfer_ops +// CHECK: vector.transfer_write +// CHECK: vector.transfer_read +// CHECK: scf.for +// CHECK-NOT: vector.transfer_write +// CHECK-NOT: vector.transfer_read +// CHECK: arith.addf +// CHECK: scf.yield +func.func @hoist_vector_transfer_ops( + %lb : index, + %ub : index, + %step : index, + %ida : index, + %idb : index) -> vector<4x4xf32> { + %c0 = arith.constant 0 : index + %cst_0 = arith.constant 0.0 : f32 + %cst = arith.constant dense<0.0> : vector<4x4xf32> + %empty = tensor.empty() : tensor<4x4xf32> + %final = + scf.for %i = %lb to %ub step %step iter_args(%acc = %cst) -> vector<4x4xf32> { + %a = vector.transfer_write %cst, %empty[%c0, %c0] : vector<4x4xf32>, tensor<4x4xf32> + %read = vector.transfer_read %a[%c0, %c0], %cst_0 : tensor<4x4xf32>, vector<4x4xf32> + %out = arith.addf %read, %acc : vector<4x4xf32> + scf.yield %out : vector<4x4xf32> + } + func.return %final : vector<4x4xf32> +} + +// ----- + +// CHECK-LABEL: func @do_not_hoist_vector_transfer_ops_loop_dep +// CHECK-NOT: vector.transfer_read +// CHECK: scf.for +// CHECK: vector.transfer_read +// CHECK: arith.addf +// CHECK: scf.yield +func.func @do_not_hoist_vector_transfer_ops_loop_dep( + %a : tensor<128x128xf32>, + %lb : index, + %ub : index, + %step : index, + %ida : index) -> vector<4x4xf32> { + %cst_0 = arith.constant 0.0 : f32 + %cst = arith.constant dense<0.0> : vector<4x4xf32> + %final = + scf.for %i = %lb to %ub step %step iter_args(%acc = %cst) -> vector<4x4xf32> { + %read = vector.transfer_read %a[%ida, %i], %cst_0 : tensor<128x128xf32>, vector<4x4xf32> + %out = arith.addf %read, %acc : vector<4x4xf32> + scf.yield %out : vector<4x4xf32> + } + func.return %final : vector<4x4xf32> +} + +// ----- + +// CHECK-LABEL: func @do_not_hoist_vector_transfer_ops_memref +// CHECK-NOT: vector.transfer_read +// CHECK: scf.for +// CHECK: vector.transfer_read +// CHECK: 
arith.addf +// CHECK: scf.yield +func.func @do_not_hoist_vector_transfer_ops_memref( + %a : memref<128x128xf32>, + %lb : index, + %ub : index, + %step : index, + %ida : index, + %idb : index) -> vector<4x4xf32> { + %cst_0 = arith.constant 0.0 : f32 + %cst = arith.constant dense<0.0> : vector<4x4xf32> + %final = + scf.for %i = %lb to %ub step %step iter_args(%acc = %cst) -> vector<4x4xf32> { + %read = vector.transfer_read %a[%ida, %idb], %cst_0 : memref<128x128xf32>, vector<4x4xf32> + %out = arith.addf %read, %acc : vector<4x4xf32> + scf.yield %out : vector<4x4xf32> + } + func.return %final : vector<4x4xf32> +} From 5b03efb85d63d1f4033ed649a56a177dd4ed62b4 Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Wed, 9 Oct 2024 14:56:43 +0200 Subject: [PATCH 012/119] [Clang][OpenMP] Add permutation clause (#92030) Add the permutation clause for the interchange directive which will be introduced in the upcoming OpenMP 6.0 specification. A preview has been published in [Technical Report12](https://www.openmp.org/wp-content/uploads/openmp-TR12.pdf). --- clang/include/clang/AST/OpenMPClause.h | 99 + clang/include/clang/AST/RecursiveASTVisitor.h | 8 + .../clang/Basic/DiagnosticSemaKinds.td | 4 + clang/include/clang/Parse/Parser.h | 3 + clang/include/clang/Sema/SemaOpenMP.h | 5 + clang/lib/AST/OpenMPClause.cpp | 27 + clang/lib/AST/StmtProfile.cpp | 7 + clang/lib/Basic/OpenMPKinds.cpp | 2 + clang/lib/Parse/ParseOpenMP.cpp | 20 + clang/lib/Sema/SemaOpenMP.cpp | 87 +- clang/lib/Sema/TreeTransform.h | 35 + clang/lib/Serialization/ASTReader.cpp | 11 + clang/lib/Serialization/ASTWriter.cpp | 7 + clang/test/OpenMP/interchange_ast_print.cpp | 98 + clang/test/OpenMP/interchange_codegen.cpp | 3358 ++++++++++++++++- clang/test/OpenMP/interchange_messages.cpp | 154 + clang/tools/libclang/CIndex.cpp | 6 + flang/lib/Lower/OpenMP/Clauses.cpp | 6 + flang/lib/Lower/OpenMP/Clauses.h | 1 + flang/lib/Parser/openmp-parsers.cpp | 2 + flang/lib/Semantics/check-omp-structure.cpp | 1 + llvm/include/llvm/Frontend/OpenMP/ClauseT.h | 14 +- llvm/include/llvm/Frontend/OpenMP/OMP.td | 8 + 23 files changed, 3946 insertions(+), 17 deletions(-) diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h index 3a1d6852d2a70..2e48c1c3c72c8 100644 --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -930,6 +930,105 @@ class OMPSizesClause final } }; +/// This class represents the 'permutation' clause in the +/// '#pragma omp interchange' directive. +/// +/// \code{.c} +/// #pragma omp interchange permutation(2,1) +/// for (int i = 0; i < 64; ++i) +/// for (int j = 0; j < 64; ++j) +/// \endcode +class OMPPermutationClause final + : public OMPClause, + private llvm::TrailingObjects { + friend class OMPClauseReader; + friend class llvm::TrailingObjects; + + /// Location of '('. + SourceLocation LParenLoc; + + /// Number of arguments in the clause, and hence also the number of loops to + /// be permuted. + unsigned NumLoops; + + /// Sets the permutation index expressions. + void setArgRefs(ArrayRef VL) { + assert(VL.size() == NumLoops && "Expecting one expression per loop"); + llvm::copy(VL, static_cast(this) + ->template getTrailingObjects()); + } + + /// Build an empty clause. + explicit OMPPermutationClause(int NumLoops) + : OMPClause(llvm::omp::OMPC_permutation, SourceLocation(), + SourceLocation()), + NumLoops(NumLoops) {} + +public: + /// Build a 'permutation' clause AST node. + /// + /// \param C Context of the AST. 
+ /// \param StartLoc Location of the 'permutation' identifier. + /// \param LParenLoc Location of '('. + /// \param EndLoc Location of ')'. + /// \param Args Content of the clause. + static OMPPermutationClause * + Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc, + SourceLocation EndLoc, ArrayRef Args); + + /// Build an empty 'permutation' AST node for deserialization. + /// + /// \param C Context of the AST. + /// \param NumLoops Number of arguments in the clause. + static OMPPermutationClause *CreateEmpty(const ASTContext &C, + unsigned NumLoops); + + /// Sets the location of '('. + void setLParenLoc(SourceLocation Loc) { LParenLoc = Loc; } + + /// Returns the location of '('. + SourceLocation getLParenLoc() const { return LParenLoc; } + + /// Returns the number of list items. + unsigned getNumLoops() const { return NumLoops; } + + /// Returns the permutation index expressions. + ///@{ + MutableArrayRef getArgsRefs() { + return MutableArrayRef(static_cast(this) + ->template getTrailingObjects(), + NumLoops); + } + ArrayRef getArgsRefs() const { + return ArrayRef(static_cast(this) + ->template getTrailingObjects(), + NumLoops); + } + ///@} + + child_range children() { + MutableArrayRef Args = getArgsRefs(); + return child_range(reinterpret_cast(Args.begin()), + reinterpret_cast(Args.end())); + } + const_child_range children() const { + ArrayRef Args = getArgsRefs(); + return const_child_range(reinterpret_cast(Args.begin()), + reinterpret_cast(Args.end())); + } + + child_range used_children() { + return child_range(child_iterator(), child_iterator()); + } + const_child_range used_children() const { + return const_child_range(const_child_iterator(), const_child_iterator()); + } + + static bool classof(const OMPClause *T) { + return T->getClauseKind() == llvm::omp::OMPC_permutation; + } +}; + /// Representation of the 'full' clause of the '#pragma omp unroll' directive. 
/// /// \code diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index cbbba9e88b7f5..b2dd51319ba9e 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -3348,6 +3348,14 @@ bool RecursiveASTVisitor::VisitOMPSizesClause(OMPSizesClause *C) { return true; } +template +bool RecursiveASTVisitor::VisitOMPPermutationClause( + OMPPermutationClause *C) { + for (Expr *E : C->getArgsRefs()) + TRY_TO(TraverseStmt(E)); + return true; +} + template bool RecursiveASTVisitor::VisitOMPFullClause(OMPFullClause *C) { return true; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 536211a6da335..777ea1f37cea4 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11702,6 +11702,10 @@ def err_omp_dispatch_statement_call " to a target function or an assignment to one">; def err_omp_unroll_full_variable_trip_count : Error< "loop to be fully unrolled must have a constant trip count">; +def err_omp_interchange_permutation_value_range : Error< + "permutation index must be at least 1 and at most %0">; +def err_omp_interchange_permutation_value_repeated : Error< + "index %0 must appear exactly once in the permutation clause">; def note_omp_directive_here : Note<"'%0' directive found here">; def err_omp_instantiation_not_supported : Error<"instantiation of '%0' not supported yet">; diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 93e49d395388a..dbcb545058a02 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -3595,6 +3595,9 @@ class Parser : public CodeCompletionHandler { /// Parses the 'sizes' clause of a '#pragma omp tile' directive. OMPClause *ParseOpenMPSizesClause(); + /// Parses the 'permutation' clause of a '#pragma omp interchange' directive. + OMPClause *ParseOpenMPPermutationClause(); + /// Parses clause without any additional arguments. /// /// \param Kind Kind of current clause. diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h index 53191e7bb4272..80ad30b0f99ef 100644 --- a/clang/include/clang/Sema/SemaOpenMP.h +++ b/clang/include/clang/Sema/SemaOpenMP.h @@ -891,6 +891,11 @@ class SemaOpenMP : public SemaBase { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc); + /// Called on well-form 'permutation' clause after parsing its arguments. + OMPClause *ActOnOpenMPPermutationClause(ArrayRef PermExprs, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); /// Called on well-form 'full' clauses. 
OMPClause *ActOnOpenMPFullClause(SourceLocation StartLoc, SourceLocation EndLoc); diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp index eb15aa8440690..985c844362d95 100644 --- a/clang/lib/AST/OpenMPClause.cpp +++ b/clang/lib/AST/OpenMPClause.cpp @@ -971,6 +971,25 @@ OMPSizesClause *OMPSizesClause::CreateEmpty(const ASTContext &C, return new (Mem) OMPSizesClause(NumSizes); } +OMPPermutationClause *OMPPermutationClause::Create(const ASTContext &C, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc, + ArrayRef Args) { + OMPPermutationClause *Clause = CreateEmpty(C, Args.size()); + Clause->setLocStart(StartLoc); + Clause->setLParenLoc(LParenLoc); + Clause->setLocEnd(EndLoc); + Clause->setArgRefs(Args); + return Clause; +} + +OMPPermutationClause *OMPPermutationClause::CreateEmpty(const ASTContext &C, + unsigned NumLoops) { + void *Mem = C.Allocate(totalSizeToAlloc(NumLoops)); + return new (Mem) OMPPermutationClause(NumLoops); +} + OMPFullClause *OMPFullClause::Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc) { @@ -1841,6 +1860,14 @@ void OMPClausePrinter::VisitOMPSizesClause(OMPSizesClause *Node) { OS << ")"; } +void OMPClausePrinter::VisitOMPPermutationClause(OMPPermutationClause *Node) { + OS << "permutation("; + llvm::interleaveComma(Node->getArgsRefs(), OS, [&](const Expr *E) { + E->printPretty(OS, nullptr, Policy, 0); + }); + OS << ")"; +} + void OMPClausePrinter::VisitOMPFullClause(OMPFullClause *Node) { OS << "full"; } void OMPClausePrinter::VisitOMPPartialClause(OMPPartialClause *Node) { diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp index 299ac005c7fdb..4d177fd6c5968 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -493,6 +493,13 @@ void OMPClauseProfiler::VisitOMPSizesClause(const OMPSizesClause *C) { Profiler->VisitExpr(E); } +void OMPClauseProfiler::VisitOMPPermutationClause( + const OMPPermutationClause *C) { + for (Expr *E : C->getArgsRefs()) + if (E) + Profiler->VisitExpr(E); +} + void OMPClauseProfiler::VisitOMPFullClause(const OMPFullClause *C) {} void OMPClauseProfiler::VisitOMPPartialClause(const OMPPartialClause *C) { diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp index 630a8898aa229..8d2460bc74fa3 100644 --- a/clang/lib/Basic/OpenMPKinds.cpp +++ b/clang/lib/Basic/OpenMPKinds.cpp @@ -188,6 +188,7 @@ unsigned clang::getOpenMPSimpleClauseType(OpenMPClauseKind Kind, StringRef Str, case OMPC_safelen: case OMPC_simdlen: case OMPC_sizes: + case OMPC_permutation: case OMPC_allocator: case OMPC_allocate: case OMPC_collapse: @@ -512,6 +513,7 @@ const char *clang::getOpenMPSimpleClauseTypeName(OpenMPClauseKind Kind, case OMPC_safelen: case OMPC_simdlen: case OMPC_sizes: + case OMPC_permutation: case OMPC_allocator: case OMPC_allocate: case OMPC_collapse: diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index 64dfcd4729699..108b532be4168 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -3080,6 +3080,18 @@ OMPClause *Parser::ParseOpenMPSizesClause() { OpenLoc, CloseLoc); } +OMPClause *Parser::ParseOpenMPPermutationClause() { + SourceLocation ClauseNameLoc, OpenLoc, CloseLoc; + SmallVector ArgExprs; + if (ParseOpenMPExprListClause(OMPC_permutation, ClauseNameLoc, OpenLoc, + CloseLoc, ArgExprs, + /*ReqIntConst=*/true)) + return nullptr; + + return Actions.OpenMP().ActOnOpenMPPermutationClause(ArgExprs, ClauseNameLoc, + OpenLoc, CloseLoc); +} + 
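// Illustrative sketch (not part of this patch): the expression list parsed
// above corresponds to source of the form
//
// \code{.c}
// #pragma omp interchange permutation(3, 1, 2)
// for (int i = 0; i < 64; ++i)
//   for (int j = 0; j < 64; ++j)
//     for (int k = 0; k < 64; ++k)
//       body(i, j, k);
// \endcode
//
// The argument count determines how many loops the directive affects.
// ActOnOpenMPPermutationClause verifies that every argument is an integer
// constant in [1, n]; ActOnOpenMPInterchangeDirective diagnoses repeated
// indices and performs no reordering for the identity permutation (e.g.
// permutation(1)).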
OMPClause *Parser::ParseOpenMPUsesAllocatorClause(OpenMPDirectiveKind DKind) { SourceLocation Loc = Tok.getLocation(); ConsumeAnyToken(); @@ -3377,6 +3389,14 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind, Clause = ParseOpenMPSizesClause(); break; + case OMPC_permutation: + if (!FirstClause) { + Diag(Tok, diag::err_omp_more_one_clause) + << getOpenMPDirectiveName(DKind) << getOpenMPClauseName(CKind) << 0; + ErrorFound = true; + } + Clause = ParseOpenMPPermutationClause(); + break; case OMPC_uses_allocators: Clause = ParseOpenMPUsesAllocatorClause(DKind); break; diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 74ab52106e323..d3e696a79b94f 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -14956,7 +14956,9 @@ StmtResult SemaOpenMP::ActOnOpenMPInterchangeDirective( return StmtError(); // interchange without permutation clause swaps two loops. - constexpr size_t NumLoops = 2; + const OMPPermutationClause *PermutationClause = + OMPExecutableDirective::getSingleClause(Clauses); + size_t NumLoops = PermutationClause ? PermutationClause->getNumLoops() : 2; // Verify and diagnose loop nest. SmallVector LoopHelpers(NumLoops); @@ -14971,6 +14973,12 @@ StmtResult SemaOpenMP::ActOnOpenMPInterchangeDirective( return OMPInterchangeDirective::Create(Context, StartLoc, EndLoc, Clauses, NumLoops, AStmt, nullptr, nullptr); + // An invalid expression in the permutation clause is set to nullptr in + // ActOnOpenMPPermutationClause. + if (PermutationClause && + llvm::is_contained(PermutationClause->getArgsRefs(), nullptr)) + return StmtError(); + assert(LoopHelpers.size() == NumLoops && "Expecting loop iteration space dimensionaly to match number of " "affected loops"); @@ -14979,7 +14987,44 @@ StmtResult SemaOpenMP::ActOnOpenMPInterchangeDirective( "affected loops"); // Decode the permutation clause. - constexpr uint64_t Permutation[] = {1, 0}; + SmallVector Permutation; + if (!PermutationClause) { + Permutation = {1, 0}; + } else { + ArrayRef PermArgs = PermutationClause->getArgsRefs(); + llvm::BitVector Flags(PermArgs.size()); + for (Expr *PermArg : PermArgs) { + std::optional PermCstExpr = + PermArg->getIntegerConstantExpr(Context); + if (!PermCstExpr) + continue; + uint64_t PermInt = PermCstExpr->getZExtValue(); + assert(1 <= PermInt && PermInt <= NumLoops && + "Must be a permutation; diagnostic emitted in " + "ActOnOpenMPPermutationClause"); + if (Flags[PermInt - 1]) { + SourceRange ExprRange(PermArg->getBeginLoc(), PermArg->getEndLoc()); + Diag(PermArg->getExprLoc(), + diag::err_omp_interchange_permutation_value_repeated) + << PermInt << ExprRange; + continue; + } + Flags[PermInt - 1] = true; + + Permutation.push_back(PermInt - 1); + } + + if (Permutation.size() != NumLoops) + return StmtError(); + } + + // Nothing to transform with trivial permutation. + if (NumLoops <= 1 || llvm::all_of(llvm::enumerate(Permutation), [](auto P) { + auto [Idx, Arg] = P; + return Idx == Arg; + })) + return OMPInterchangeDirective::Create(Context, StartLoc, EndLoc, Clauses, + NumLoops, AStmt, AStmt, nullptr); // Find the affected loops. 
SmallVector LoopStmts(NumLoops, nullptr); @@ -16111,6 +16156,44 @@ OMPClause *SemaOpenMP::ActOnOpenMPSizesClause(ArrayRef SizeExprs, SanitizedSizeExprs); } +OMPClause *SemaOpenMP::ActOnOpenMPPermutationClause(ArrayRef PermExprs, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { + size_t NumLoops = PermExprs.size(); + SmallVector SanitizedPermExprs; + llvm::append_range(SanitizedPermExprs, PermExprs); + + for (Expr *&PermExpr : SanitizedPermExprs) { + // Skip if template-dependent or already sanitized, e.g. during a partial + // template instantiation. + if (!PermExpr || PermExpr->isInstantiationDependent()) + continue; + + llvm::APSInt PermVal; + ExprResult PermEvalExpr = SemaRef.VerifyIntegerConstantExpression( + PermExpr, &PermVal, Sema::AllowFold); + bool IsValid = PermEvalExpr.isUsable(); + if (IsValid) + PermExpr = PermEvalExpr.get(); + + if (IsValid && (PermVal < 1 || NumLoops < PermVal)) { + SourceRange ExprRange(PermEvalExpr.get()->getBeginLoc(), + PermEvalExpr.get()->getEndLoc()); + Diag(PermEvalExpr.get()->getExprLoc(), + diag::err_omp_interchange_permutation_value_range) + << NumLoops << ExprRange; + IsValid = false; + } + + if (!PermExpr->isInstantiationDependent() && !IsValid) + PermExpr = nullptr; + } + + return OMPPermutationClause::Create(getASTContext(), StartLoc, LParenLoc, + EndLoc, SanitizedPermExprs); +} + OMPClause *SemaOpenMP::ActOnOpenMPFullClause(SourceLocation StartLoc, SourceLocation EndLoc) { return OMPFullClause::Create(getASTContext(), StartLoc, EndLoc); diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 01c086a602dd5..5753c9eccf6c9 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -1760,6 +1760,15 @@ class TreeTransform { EndLoc); } + /// Build a new OpenMP 'permutation' clause. + OMPClause *RebuildOMPPermutationClause(ArrayRef PermExprs, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { + return getSema().OpenMP().ActOnOpenMPPermutationClause(PermExprs, StartLoc, + LParenLoc, EndLoc); + } + /// Build a new OpenMP 'full' clause. 
OMPClause *RebuildOMPFullClause(SourceLocation StartLoc, SourceLocation EndLoc) { @@ -10279,6 +10288,32 @@ OMPClause *TreeTransform::TransformOMPSizesClause(OMPSizesClause *C) { C->getLParenLoc(), C->getEndLoc()); } +template +OMPClause * +TreeTransform::TransformOMPPermutationClause(OMPPermutationClause *C) { + SmallVector TransformedArgs; + TransformedArgs.reserve(C->getNumLoops()); + bool Changed = false; + for (Expr *E : C->getArgsRefs()) { + if (!E) { + TransformedArgs.push_back(nullptr); + continue; + } + + ExprResult T = getDerived().TransformExpr(E); + if (T.isInvalid()) + return nullptr; + if (E != T.get()) + Changed = true; + TransformedArgs.push_back(T.get()); + } + + if (!Changed && !getDerived().AlwaysRebuild()) + return C; + return RebuildOMPPermutationClause(TransformedArgs, C->getBeginLoc(), + C->getLParenLoc(), C->getEndLoc()); +} + template OMPClause *TreeTransform::TransformOMPFullClause(OMPFullClause *C) { if (!getDerived().AlwaysRebuild()) diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 4a20dfc09cd06..5c4f8d0e9c46c 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -10605,6 +10605,11 @@ OMPClause *OMPClauseReader::readClause() { C = OMPSizesClause::CreateEmpty(Context, NumSizes); break; } + case llvm::omp::OMPC_permutation: { + unsigned NumLoops = Record.readInt(); + C = OMPPermutationClause::CreateEmpty(Context, NumLoops); + break; + } case llvm::omp::OMPC_full: C = OMPFullClause::CreateEmpty(Context); break; @@ -10993,6 +10998,12 @@ void OMPClauseReader::VisitOMPSizesClause(OMPSizesClause *C) { C->setLParenLoc(Record.readSourceLocation()); } +void OMPClauseReader::VisitOMPPermutationClause(OMPPermutationClause *C) { + for (Expr *&E : C->getArgsRefs()) + E = Record.readSubExpr(); + C->setLParenLoc(Record.readSourceLocation()); +} + void OMPClauseReader::VisitOMPFullClause(OMPFullClause *C) {} void OMPClauseReader::VisitOMPPartialClause(OMPPartialClause *C) { diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 836532ca402ff..4976327fc654e 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -7188,6 +7188,13 @@ void OMPClauseWriter::VisitOMPSizesClause(OMPSizesClause *C) { Record.AddSourceLocation(C->getLParenLoc()); } +void OMPClauseWriter::VisitOMPPermutationClause(OMPPermutationClause *C) { + Record.push_back(C->getNumLoops()); + for (Expr *Size : C->getArgsRefs()) + Record.AddStmt(Size); + Record.AddSourceLocation(C->getLParenLoc()); +} + void OMPClauseWriter::VisitOMPFullClause(OMPFullClause *C) {} void OMPClauseWriter::VisitOMPPartialClause(OMPPartialClause *C) { diff --git a/clang/test/OpenMP/interchange_ast_print.cpp b/clang/test/OpenMP/interchange_ast_print.cpp index f8bf075cd300f..4b0818ff6eb7f 100644 --- a/clang/test/OpenMP/interchange_ast_print.cpp +++ b/clang/test/OpenMP/interchange_ast_print.cpp @@ -35,6 +35,35 @@ void foo1() { } +// PRINT-LABEL: void foo2( +// DUMP-LABEL: FunctionDecl {{.*}} foo2 +void foo2(int start1, int start2, int start3, int end1, int end2, int end3) { + // PRINT: #pragma omp interchange permutation(2, 3, 1) + // DUMP: OMPInterchangeDirective + // DUMP-NEXT: OMPPermutationClause + // DUMP-NEXT: ConstantExpr + // DUMP-NEXT: value: Int 2 + // DUMP-NEXT: IntegerLiteral {{.*}} 2 + // DUMP-NEXT: ConstantExpr + // DUMP-NEXT: value: Int 3 + // DUMP-NEXT: IntegerLiteral {{.*}} 3 + // DUMP-NEXT: ConstantExpr + // DUMP-NEXT: value: Int 1 + // DUMP-NEXT: 
IntegerLiteral {{.*}} 1 + #pragma omp interchange permutation(2,3,1) + // PRINT: for (int i = start1; i < end1; i += 1) + // DUMP-NEXT: ForStmt + for (int i = start1; i < end1; i += 1) + // PRINT: for (int j = start2; j < end2; j += 1) + // DUMP: ForStmt + for (int j = start2; j < end2; j += 1) + // PRINT: for (int k = start3; k < end3; k += 1) + // DUMP: ForStmt + for (int k = start3; k < end3; k += 1) + // PRINT: body(i, j, k); + // DUMP: CallExpr + body(i, j, k); +} // PRINT-LABEL: void foo3( @@ -67,6 +96,75 @@ void foo3() { } +// PRINT-LABEL: void foo4( +// DUMP-LABEL: FunctionDecl {{.*}} foo4 +void foo4(int start, int end, int step) { + // PRINT: #pragma omp for collapse(3) + // DUMP: OMPForDirective + // DUMP-NEXT: OMPCollapseClause + // DUMP-NEXT: ConstantExpr + // DUMP-NEXT: value: Int 3 + // DUMP-NEXT: IntegerLiteral {{.*}} 3 + // DUMP-NEXT: CapturedStmt + // DUMP-NEXT: CapturedDecl + #pragma omp for collapse(3) + // PRINT: for (int i = 7; i < 17; i += 1) + // DUMP-NEXT: ForStmt + for (int i = 7; i < 17; i += 1) + // PRINT: #pragma omp interchange permutation(1) + // DUMP: OMPInterchangeDirective + // DUMP-NEXT: OMPPermutationClause + // DUMP-NEXT: ConstantExpr + // DUMP-NEXT: value: Int 1 + // DUMP-NEXT: IntegerLiteral {{.*}} 1 + #pragma omp interchange permutation(1) + // PRINT: for (int j = 7; j < 17; j += 1) + // DUMP-NEXT: ForStmt + for (int j = 7; j < 17; j += 1) + // PRINT: for (int k = 7; k < 17; k += 1) + // DUMP: ForStmt + for (int k = 7; k < 17; k += 1) + // PRINT: body(i, j, k); + // DUMP: CallExpr + body(i, j, k); +} + + +// PRINT-LABEL: void foo5( +// DUMP-LABEL: FunctionTemplateDecl {{.*}} foo5 +template +void foo5(T start, T end) { + // PRINT: #pragma omp for + // DUMP: OMPForDirective + #pragma omp for + // PRINT: #pragma omp interchange permutation(P + 1, 2 - P) + // DUMP: OMPInterchangeDirective + // DUMP-NEXT: OMPPermutationClause + // DUMP-NEXT: BinaryOperator {{.*}} '+' + // DUMP-NEXT: DeclRefExpr {{.*}} 'P' 'T' + // DUMP-NEXT: IntegerLiteral {{.*}} 'int' 1 + // DUMP-NEXT: BinaryOperator {{.*}} '-' + // DUMP-NEXT: IntegerLiteral {{.*}} 'int' 2 + // DUMP-NEXT: DeclRefExpr {{.*}} 'P' 'T' + #pragma omp interchange permutation(P + 1, 2 - P) + // PRINT-NEXT: for (T i = start; i < end; i += 2) + // DUMP-NEXT: ForStmt + for (T i = start; i < end; i += 2) + // PRINT-NEXT: for (T j = start; j < end; j += 2) + // DUMP: ForStmt + for (T j = start; j < end; j += 2) + // PRINT-NEXT: body(i, j); + // DUMP: CallExpr + body(i,j); +} + +// Also test instantiating the template. +void tfoo5() { + foo5(0, 42); + foo5(0, 42); +} + + // PRINT-LABEL: void foo6( // DUMP-LABEL: FunctionTemplateDecl {{.*}} foo6 template diff --git a/clang/test/OpenMP/interchange_codegen.cpp b/clang/test/OpenMP/interchange_codegen.cpp index 9c1782183cf98..8e833c9df324c 100644 --- a/clang/test/OpenMP/interchange_codegen.cpp +++ b/clang/test/OpenMP/interchange_codegen.cpp @@ -17,6 +17,12 @@ extern "C" void body(...) 
{} +extern "C" void foo1(int start, int end, int step) { + int i; +#pragma omp interchange permutation(1) + for (i = start; i < end; i += step) + body(i); +} extern "C" void foo2(int start1, int start2, int end1, int end2, int step1, int step2) { @@ -46,6 +52,16 @@ extern "C" void foo4() { } +extern "C" void foo5() { +#pragma omp for collapse(3) + for (int i = 7; i < 17; i += 3) +#pragma omp interchange permutation(1) + for (int j = 7; j < 17; j += 3) + for (int k = 7; k < 17; k += 3) + body(i, j, k); +} + + extern "C" void foo6() { #pragma omp for collapse(4) for (int i = 7; i < 17; i += 3) @@ -57,6 +73,34 @@ extern "C" void foo6() { } +extern "C" void foo7() { +#pragma omp interchange permutation(2,3,4,1) + for (int i = 7; i < 17; i += 3) + for (int j = 7; j < 17; j += 3) + for (int k = 7; k < 17; k += 3) + for (int l = 7; l < 17; l += 3) + body(i, j, k, l); +} + + +template +void foo8(int start, int end, int step) { + #pragma omp for collapse(4) + for (int i = start; i < end; i += step) + #pragma omp interchange permutation(1) + for (int j = start; j < end; j += step) + #pragma omp tile sizes(TILESIZE) + for (int k = start; k < end; k += step) + body(i, j, k); +} + +// Also test instantiating the template. +extern "C" void tfoo8() { + foo8<32>(0, 42, 1); + foo8<64>(0, 42, 3); +} + + extern "C" void foo9() { double arr[128]; #pragma omp interchange @@ -85,6 +129,38 @@ extern "C" void foo10() { // CHECK1-NEXT: ret void // // +// CHECK1-LABEL: define {{[^@]+}}@foo1 +// CHECK1-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[STEP_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 [[START]], ptr [[START_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[END]], ptr [[END_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[STEP]], ptr [[STEP_ADDR]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP0]], ptr [[I]], align 4 +// CHECK1-NEXT: br label [[FOR_COND:%.*]] +// CHECK1: for.cond: +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[I]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[END_ADDR]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], [[TMP2]] +// CHECK1-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK1: for.body: +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[I]], align 4 +// CHECK1-NEXT: call void (...) 
@body(i32 noundef [[TMP3]]) +// CHECK1-NEXT: br label [[FOR_INC:%.*]] +// CHECK1: for.inc: +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[STEP_ADDR]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP5]], [[TMP4]] +// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4 +// CHECK1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] +// CHECK1: for.end: +// CHECK1-NEXT: ret void +// +// // CHECK1-LABEL: define {{[^@]+}}@foo2 // CHECK1-SAME: (i32 noundef [[START1:%.*]], i32 noundef [[START2:%.*]], i32 noundef [[END1:%.*]], i32 noundef [[END2:%.*]], i32 noundef [[STEP1:%.*]], i32 noundef [[STEP2:%.*]]) #[[ATTR0]] { // CHECK1-NEXT: entry: @@ -186,14 +262,14 @@ extern "C" void foo10() { // CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_I]], align 4 // CHECK1-NEXT: [[INC:%.*]] = add i32 [[TMP28]], 1 // CHECK1-NEXT: store i32 [[INC]], ptr [[DOTPERMUTED_1_IV_I]], align 4 -// CHECK1-NEXT: br label [[FOR_COND16]], !llvm.loop [[LOOP3:![0-9]+]] +// CHECK1-NEXT: br label [[FOR_COND16]], !llvm.loop [[LOOP5:![0-9]+]] // CHECK1: for.end: // CHECK1-NEXT: br label [[FOR_INC22:%.*]] // CHECK1: for.inc22: // CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_J]], align 4 // CHECK1-NEXT: [[INC23:%.*]] = add i32 [[TMP29]], 1 // CHECK1-NEXT: store i32 [[INC23]], ptr [[DOTPERMUTED_0_IV_J]], align 4 -// CHECK1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]] +// CHECK1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP6:![0-9]+]] // CHECK1: for.end24: // CHECK1-NEXT: ret void // @@ -266,7 +342,7 @@ extern "C" void foo10() { // CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_I]], align 4 // CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP12]], 1 // CHECK1-NEXT: store i32 [[INC]], ptr [[DOTPERMUTED_1_IV_I]], align 4 -// CHECK1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP6:![0-9]+]] +// CHECK1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]] // CHECK1: for.end: // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -363,7 +439,7 @@ extern "C" void foo10() { // CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_I]], align 4 // CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP14]], 1 // CHECK1-NEXT: store i32 [[INC]], ptr [[DOTPERMUTED_1_IV_I]], align 4 -// CHECK1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]] +// CHECK1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP8:![0-9]+]] // CHECK1: for.end: // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -381,6 +457,96 @@ extern "C" void foo10() { // CHECK1-NEXT: ret void // // +// CHECK1-LABEL: define {{[^@]+}}@foo5 +// CHECK1-SAME: () #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[_TMP2:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]]) +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 63, ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store 
i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], 63 +// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 63, [[COND_TRUE]] ], [ [[TMP2]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP3:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]] +// CHECK1-NEXT: br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP6]], 16 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 3 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 7, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[DIV4:%.*]] = sdiv i32 [[TMP8]], 16 +// CHECK1-NEXT: [[MUL5:%.*]] = mul nsw i32 [[DIV4]], 16 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], [[MUL5]] +// CHECK1-NEXT: [[DIV6:%.*]] = sdiv i32 [[SUB]], 4 +// CHECK1-NEXT: [[MUL7:%.*]] = mul nsw i32 [[DIV6]], 3 +// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 7, [[MUL7]] +// CHECK1-NEXT: store i32 [[ADD8]], ptr [[J]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[DIV9:%.*]] = sdiv i32 [[TMP10]], 16 +// CHECK1-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 16 +// CHECK1-NEXT: [[SUB11:%.*]] = sub nsw i32 [[TMP9]], [[MUL10]] +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[DIV12:%.*]] = sdiv i32 [[TMP12]], 16 +// CHECK1-NEXT: [[MUL13:%.*]] = mul nsw i32 [[DIV12]], 16 +// CHECK1-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP11]], [[MUL13]] +// CHECK1-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 4 +// CHECK1-NEXT: [[MUL16:%.*]] = mul nsw i32 [[DIV15]], 4 +// CHECK1-NEXT: [[SUB17:%.*]] = sub nsw i32 [[SUB11]], [[MUL16]] +// CHECK1-NEXT: [[MUL18:%.*]] = mul nsw i32 [[SUB17]], 3 +// CHECK1-NEXT: [[ADD19:%.*]] = add nsw i32 7, [[MUL18]] +// CHECK1-NEXT: store i32 [[ADD19]], ptr [[K]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[J]], align 4 +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[K]], align 4 +// CHECK1-NEXT: call void (...) 
@body(i32 noundef [[TMP13]], i32 noundef [[TMP14]], i32 noundef [[TMP15]]) +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[ADD20:%.*]] = add nsw i32 [[TMP16]], 1 +// CHECK1-NEXT: store i32 [[ADD20]], ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]]) +// CHECK1-NEXT: ret void +// +// // CHECK1-LABEL: define {{[^@]+}}@foo6 // CHECK1-SAME: () #[[ATTR0]] { // CHECK1-NEXT: entry: @@ -519,6 +685,1529 @@ extern "C" void foo10() { // CHECK1-NEXT: ret void // // +// CHECK1-LABEL: define {{[^@]+}}@foo7 +// CHECK1-SAME: () #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[L:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTPERMUTED_0_IV_J:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTPERMUTED_1_IV_K:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTPERMUTED_2_IV_L:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTPERMUTED_3_IV_I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 7, ptr [[I]], align 4 +// CHECK1-NEXT: store i32 7, ptr [[J]], align 4 +// CHECK1-NEXT: store i32 7, ptr [[K]], align 4 +// CHECK1-NEXT: store i32 7, ptr [[L]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[DOTPERMUTED_0_IV_J]], align 4 +// CHECK1-NEXT: br label [[FOR_COND:%.*]] +// CHECK1: for.cond: +// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_J]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 4 +// CHECK1-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END24:%.*]] +// CHECK1: for.body: +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_J]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP1]], 3 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 7, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], ptr [[J]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[DOTPERMUTED_1_IV_K]], align 4 +// CHECK1-NEXT: br label [[FOR_COND1:%.*]] +// CHECK1: for.cond1: +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_K]], align 4 +// CHECK1-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP2]], 4 +// CHECK1-NEXT: br i1 [[CMP2]], label [[FOR_BODY3:%.*]], label [[FOR_END21:%.*]] +// CHECK1: for.body3: +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_K]], align 4 +// CHECK1-NEXT: [[MUL4:%.*]] = mul nsw i32 [[TMP3]], 3 +// CHECK1-NEXT: [[ADD5:%.*]] = add nsw i32 7, [[MUL4]] +// CHECK1-NEXT: store i32 [[ADD5]], ptr [[K]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[DOTPERMUTED_2_IV_L]], align 4 +// CHECK1-NEXT: br label [[FOR_COND6:%.*]] +// CHECK1: for.cond6: +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTPERMUTED_2_IV_L]], align 4 +// CHECK1-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP4]], 4 +// CHECK1-NEXT: br i1 [[CMP7]], label [[FOR_BODY8:%.*]], label [[FOR_END18:%.*]] +// CHECK1: for.body8: +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTPERMUTED_2_IV_L]], align 4 +// CHECK1-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP5]], 3 +// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 7, [[MUL9]] +// CHECK1-NEXT: store i32 [[ADD10]], ptr [[L]], align 4 
+// CHECK1-NEXT: store i32 0, ptr [[DOTPERMUTED_3_IV_I]], align 4 +// CHECK1-NEXT: br label [[FOR_COND11:%.*]] +// CHECK1: for.cond11: +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTPERMUTED_3_IV_I]], align 4 +// CHECK1-NEXT: [[CMP12:%.*]] = icmp slt i32 [[TMP6]], 4 +// CHECK1-NEXT: br i1 [[CMP12]], label [[FOR_BODY13:%.*]], label [[FOR_END:%.*]] +// CHECK1: for.body13: +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTPERMUTED_3_IV_I]], align 4 +// CHECK1-NEXT: [[MUL14:%.*]] = mul nsw i32 [[TMP7]], 3 +// CHECK1-NEXT: [[ADD15:%.*]] = add nsw i32 7, [[MUL14]] +// CHECK1-NEXT: store i32 [[ADD15]], ptr [[I]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[I]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[J]], align 4 +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[K]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[L]], align 4 +// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP8]], i32 noundef [[TMP9]], i32 noundef [[TMP10]], i32 noundef [[TMP11]]) +// CHECK1-NEXT: br label [[FOR_INC:%.*]] +// CHECK1: for.inc: +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTPERMUTED_3_IV_I]], align 4 +// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP12]], 1 +// CHECK1-NEXT: store i32 [[INC]], ptr [[DOTPERMUTED_3_IV_I]], align 4 +// CHECK1-NEXT: br label [[FOR_COND11]], !llvm.loop [[LOOP9:![0-9]+]] +// CHECK1: for.end: +// CHECK1-NEXT: br label [[FOR_INC16:%.*]] +// CHECK1: for.inc16: +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTPERMUTED_2_IV_L]], align 4 +// CHECK1-NEXT: [[INC17:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK1-NEXT: store i32 [[INC17]], ptr [[DOTPERMUTED_2_IV_L]], align 4 +// CHECK1-NEXT: br label [[FOR_COND6]], !llvm.loop [[LOOP10:![0-9]+]] +// CHECK1: for.end18: +// CHECK1-NEXT: br label [[FOR_INC19:%.*]] +// CHECK1: for.inc19: +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_K]], align 4 +// CHECK1-NEXT: [[INC20:%.*]] = add nsw i32 [[TMP14]], 1 +// CHECK1-NEXT: store i32 [[INC20]], ptr [[DOTPERMUTED_1_IV_K]], align 4 +// CHECK1-NEXT: br label [[FOR_COND1]], !llvm.loop [[LOOP11:![0-9]+]] +// CHECK1: for.end21: +// CHECK1-NEXT: br label [[FOR_INC22:%.*]] +// CHECK1: for.inc22: +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_J]], align 4 +// CHECK1-NEXT: [[INC23:%.*]] = add nsw i32 [[TMP15]], 1 +// CHECK1-NEXT: store i32 [[INC23]], ptr [[DOTPERMUTED_0_IV_J]], align 4 +// CHECK1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] +// CHECK1: for.end24: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@tfoo8 +// CHECK1-SAME: () #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: call void @_Z4foo8ILi32EEviii(i32 noundef 0, i32 noundef 42, i32 noundef 1) +// CHECK1-NEXT: call void @_Z4foo8ILi64EEviii(i32 noundef 0, i32 noundef 42, i32 noundef 3) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_Z4foo8ILi32EEviii +// CHECK1-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR0]] comdat { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[STEP_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[_TMP2:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[_TMP3:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: 
[[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTNEW_STEP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_5:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTNEW_STEP7:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_8:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_9:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTNEW_STEP10:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_11:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_14:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_16:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_17:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_22:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTFLOOR_0_IV_K:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTILE_0_IV_K:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I49:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[J50:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTFLOOR_0_IV_K51:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTILE_0_IV_K52:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]]) +// CHECK1-NEXT: store i32 [[START]], ptr [[START_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[END]], ptr [[END_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[STEP]], ptr [[STEP_ADDR]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[END_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_4]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[STEP_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[END_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[STEP_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP6]], ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP7]], ptr [[K]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP8]], ptr [[DOTCAPTURE_EXPR_8]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[END_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP9]], ptr [[DOTCAPTURE_EXPR_9]], align 4 +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[STEP_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTNEW_STEP10]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_9]], align 4 +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_8]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub i32 [[TMP11]], [[TMP12]] +// CHECK1-NEXT: [[SUB12:%.*]] = sub i32 [[SUB]], 1 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTNEW_STEP10]], align 
4 +// CHECK1-NEXT: [[ADD:%.*]] = add i32 [[SUB12]], [[TMP13]] +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTNEW_STEP10]], align 4 +// CHECK1-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP14]] +// CHECK1-NEXT: [[SUB13:%.*]] = sub i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB13]], ptr [[DOTCAPTURE_EXPR_11]], align 4 +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_11]], align 4 +// CHECK1-NEXT: [[ADD15:%.*]] = add i32 [[TMP15]], 1 +// CHECK1-NEXT: store i32 [[ADD15]], ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[_TMP2]], align 4 +// CHECK1-NEXT: store i32 [[TMP16]], ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_11]], align 4 +// CHECK1-NEXT: [[ADD18:%.*]] = add i32 [[TMP17]], 1 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[_TMP2]], align 4 +// CHECK1-NEXT: [[ADD19:%.*]] = add i32 [[TMP18]], 32 +// CHECK1-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD18]], [[ADD19]] +// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_11]], align 4 +// CHECK1-NEXT: [[ADD20:%.*]] = add i32 [[TMP19]], 1 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[_TMP2]], align 4 +// CHECK1-NEXT: [[ADD21:%.*]] = add i32 [[TMP20]], 32 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[ADD20]], [[COND_TRUE]] ], [ [[ADD21]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4 +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB23:%.*]] = sub i32 [[TMP21]], [[TMP22]] +// CHECK1-NEXT: [[SUB24:%.*]] = sub i32 [[SUB23]], 1 +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4 +// CHECK1-NEXT: [[ADD25:%.*]] = add i32 [[SUB24]], [[TMP23]] +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4 +// CHECK1-NEXT: [[DIV26:%.*]] = udiv i32 [[ADD25]], [[TMP24]] +// CHECK1-NEXT: [[CONV:%.*]] = zext i32 [[DIV26]] to i64 +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB27:%.*]] = sub i32 [[TMP25]], [[TMP26]] +// CHECK1-NEXT: [[SUB28:%.*]] = sub i32 [[SUB27]], 1 +// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD29:%.*]] = add i32 [[SUB28]], [[TMP27]] +// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV30:%.*]] = udiv i32 [[ADD29]], [[TMP28]] +// CHECK1-NEXT: [[CONV31:%.*]] = zext i32 [[DIV30]] to i64 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV31]] +// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB32:%.*]] = sub i32 [[TMP29]], -31 +// CHECK1-NEXT: [[DIV33:%.*]] = udiv i32 [[SUB32]], 32 +// CHECK1-NEXT: [[CONV34:%.*]] = zext i32 [[DIV33]] to i64 +// CHECK1-NEXT: [[MUL35:%.*]] = mul nsw i64 [[MUL]], [[CONV34]] +// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB36:%.*]] = sub i32 [[TMP30]], [[TMP31]] +// CHECK1-NEXT: [[SUB37:%.*]] = sub i32 [[SUB36]], 1 +// CHECK1-NEXT: [[ADD38:%.*]] = add i32 [[SUB37]], 
1 +// CHECK1-NEXT: [[DIV39:%.*]] = udiv i32 [[ADD38]], 1 +// CHECK1-NEXT: [[CONV40:%.*]] = zext i32 [[DIV39]] to i64 +// CHECK1-NEXT: [[MUL41:%.*]] = mul nsw i64 [[MUL35]], [[CONV40]] +// CHECK1-NEXT: [[SUB42:%.*]] = sub nsw i64 [[MUL41]], 1 +// CHECK1-NEXT: store i64 [[SUB42]], ptr [[DOTCAPTURE_EXPR_22]], align 8 +// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: store i32 [[TMP32]], ptr [[I]], align 4 +// CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: store i32 [[TMP33]], ptr [[J]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[DOTFLOOR_0_IV_K]], align 4 +// CHECK1-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: store i32 [[TMP34]], ptr [[DOTTILE_0_IV_K]], align 4 +// CHECK1-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4 +// CHECK1-NEXT: [[CMP43:%.*]] = icmp slt i32 [[TMP35]], [[TMP36]] +// CHECK1-NEXT: br i1 [[CMP43]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: land.lhs.true: +// CHECK1-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[CMP44:%.*]] = icmp slt i32 [[TMP37]], [[TMP38]] +// CHECK1-NEXT: br i1 [[CMP44]], label [[LAND_LHS_TRUE45:%.*]], label [[OMP_PRECOND_END]] +// CHECK1: land.lhs.true45: +// CHECK1-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[CMP46:%.*]] = icmp ult i32 0, [[TMP39]] +// CHECK1-NEXT: br i1 [[CMP46]], label [[LAND_LHS_TRUE47:%.*]], label [[OMP_PRECOND_END]] +// CHECK1: land.lhs.true47: +// CHECK1-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[CMP48:%.*]] = icmp ult i32 [[TMP40]], [[TMP41]] +// CHECK1-NEXT: br i1 [[CMP48]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i64 0, ptr [[DOTOMP_LB]], align 8 +// CHECK1-NEXT: [[TMP42:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_22]], align 8 +// CHECK1-NEXT: store i64 [[TMP42]], ptr [[DOTOMP_UB]], align 8 +// CHECK1-NEXT: store i64 1, ptr [[DOTOMP_STRIDE]], align 8 +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_8(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i64 1, i64 1) +// CHECK1-NEXT: [[TMP43:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8 +// CHECK1-NEXT: [[TMP44:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_22]], align 8 +// CHECK1-NEXT: [[CMP53:%.*]] = icmp sgt i64 [[TMP43]], [[TMP44]] +// CHECK1-NEXT: br i1 [[CMP53]], label [[COND_TRUE54:%.*]], label [[COND_FALSE55:%.*]] +// CHECK1: cond.true54: +// CHECK1-NEXT: [[TMP45:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_22]], align 8 +// CHECK1-NEXT: br label [[COND_END56:%.*]] +// CHECK1: cond.false55: +// CHECK1-NEXT: [[TMP46:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8 +// CHECK1-NEXT: br label [[COND_END56]] +// CHECK1: cond.end56: +// CHECK1-NEXT: [[COND57:%.*]] = phi i64 [ [[TMP45]], [[COND_TRUE54]] ], [ [[TMP46]], [[COND_FALSE55]] ] +// CHECK1-NEXT: store i64 [[COND57]], ptr [[DOTOMP_UB]], align 8 +// CHECK1-NEXT: [[TMP47:%.*]] = load i64, ptr [[DOTOMP_LB]], align 8 +// CHECK1-NEXT: store i64 [[TMP47]], ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: br label 
[[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP48:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP49:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8 +// CHECK1-NEXT: [[CMP58:%.*]] = icmp sle i64 [[TMP48]], [[TMP49]] +// CHECK1-NEXT: br i1 [[CMP58]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CONV59:%.*]] = sext i32 [[TMP50]] to i64 +// CHECK1-NEXT: [[TMP51:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP52:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP53:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB60:%.*]] = sub i32 [[TMP52]], [[TMP53]] +// CHECK1-NEXT: [[SUB61:%.*]] = sub i32 [[SUB60]], 1 +// CHECK1-NEXT: [[TMP54:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD62:%.*]] = add i32 [[SUB61]], [[TMP54]] +// CHECK1-NEXT: [[TMP55:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV63:%.*]] = udiv i32 [[ADD62]], [[TMP55]] +// CHECK1-NEXT: [[MUL64:%.*]] = mul i32 1, [[DIV63]] +// CHECK1-NEXT: [[TMP56:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB65:%.*]] = sub i32 [[TMP56]], -31 +// CHECK1-NEXT: [[DIV66:%.*]] = udiv i32 [[SUB65]], 32 +// CHECK1-NEXT: [[MUL67:%.*]] = mul i32 [[MUL64]], [[DIV66]] +// CHECK1-NEXT: [[TMP57:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP58:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB68:%.*]] = sub i32 [[TMP57]], [[TMP58]] +// CHECK1-NEXT: [[SUB69:%.*]] = sub i32 [[SUB68]], 1 +// CHECK1-NEXT: [[ADD70:%.*]] = add i32 [[SUB69]], 1 +// CHECK1-NEXT: [[DIV71:%.*]] = udiv i32 [[ADD70]], 1 +// CHECK1-NEXT: [[MUL72:%.*]] = mul i32 [[MUL67]], [[DIV71]] +// CHECK1-NEXT: [[CONV73:%.*]] = zext i32 [[MUL72]] to i64 +// CHECK1-NEXT: [[DIV74:%.*]] = sdiv i64 [[TMP51]], [[CONV73]] +// CHECK1-NEXT: [[TMP59:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4 +// CHECK1-NEXT: [[CONV75:%.*]] = sext i32 [[TMP59]] to i64 +// CHECK1-NEXT: [[MUL76:%.*]] = mul nsw i64 [[DIV74]], [[CONV75]] +// CHECK1-NEXT: [[ADD77:%.*]] = add nsw i64 [[CONV59]], [[MUL76]] +// CHECK1-NEXT: [[CONV78:%.*]] = trunc i64 [[ADD77]] to i32 +// CHECK1-NEXT: store i32 [[CONV78]], ptr [[I49]], align 4 +// CHECK1-NEXT: [[TMP60:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[CONV79:%.*]] = sext i32 [[TMP60]] to i64 +// CHECK1-NEXT: [[TMP61:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP62:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP63:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP64:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB80:%.*]] = sub i32 [[TMP63]], [[TMP64]] +// CHECK1-NEXT: [[SUB81:%.*]] = sub i32 [[SUB80]], 1 +// CHECK1-NEXT: [[TMP65:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD82:%.*]] = add i32 [[SUB81]], [[TMP65]] +// CHECK1-NEXT: [[TMP66:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV83:%.*]] = udiv i32 [[ADD82]], [[TMP66]] +// CHECK1-NEXT: [[MUL84:%.*]] = mul i32 1, [[DIV83]] +// CHECK1-NEXT: [[TMP67:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB85:%.*]] = sub i32 [[TMP67]], -31 +// CHECK1-NEXT: [[DIV86:%.*]] = udiv i32 [[SUB85]], 32 +// CHECK1-NEXT: [[MUL87:%.*]] = mul i32 [[MUL84]], [[DIV86]] +// 
CHECK1-NEXT: [[TMP68:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP69:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB88:%.*]] = sub i32 [[TMP68]], [[TMP69]] +// CHECK1-NEXT: [[SUB89:%.*]] = sub i32 [[SUB88]], 1 +// CHECK1-NEXT: [[ADD90:%.*]] = add i32 [[SUB89]], 1 +// CHECK1-NEXT: [[DIV91:%.*]] = udiv i32 [[ADD90]], 1 +// CHECK1-NEXT: [[MUL92:%.*]] = mul i32 [[MUL87]], [[DIV91]] +// CHECK1-NEXT: [[CONV93:%.*]] = zext i32 [[MUL92]] to i64 +// CHECK1-NEXT: [[DIV94:%.*]] = sdiv i64 [[TMP62]], [[CONV93]] +// CHECK1-NEXT: [[TMP70:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP71:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB95:%.*]] = sub i32 [[TMP70]], [[TMP71]] +// CHECK1-NEXT: [[SUB96:%.*]] = sub i32 [[SUB95]], 1 +// CHECK1-NEXT: [[TMP72:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD97:%.*]] = add i32 [[SUB96]], [[TMP72]] +// CHECK1-NEXT: [[TMP73:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV98:%.*]] = udiv i32 [[ADD97]], [[TMP73]] +// CHECK1-NEXT: [[MUL99:%.*]] = mul i32 1, [[DIV98]] +// CHECK1-NEXT: [[TMP74:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB100:%.*]] = sub i32 [[TMP74]], -31 +// CHECK1-NEXT: [[DIV101:%.*]] = udiv i32 [[SUB100]], 32 +// CHECK1-NEXT: [[MUL102:%.*]] = mul i32 [[MUL99]], [[DIV101]] +// CHECK1-NEXT: [[TMP75:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP76:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB103:%.*]] = sub i32 [[TMP75]], [[TMP76]] +// CHECK1-NEXT: [[SUB104:%.*]] = sub i32 [[SUB103]], 1 +// CHECK1-NEXT: [[ADD105:%.*]] = add i32 [[SUB104]], 1 +// CHECK1-NEXT: [[DIV106:%.*]] = udiv i32 [[ADD105]], 1 +// CHECK1-NEXT: [[MUL107:%.*]] = mul i32 [[MUL102]], [[DIV106]] +// CHECK1-NEXT: [[CONV108:%.*]] = zext i32 [[MUL107]] to i64 +// CHECK1-NEXT: [[MUL109:%.*]] = mul nsw i64 [[DIV94]], [[CONV108]] +// CHECK1-NEXT: [[SUB110:%.*]] = sub nsw i64 [[TMP61]], [[MUL109]] +// CHECK1-NEXT: [[TMP77:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB111:%.*]] = sub i32 [[TMP77]], -31 +// CHECK1-NEXT: [[DIV112:%.*]] = udiv i32 [[SUB111]], 32 +// CHECK1-NEXT: [[MUL113:%.*]] = mul i32 1, [[DIV112]] +// CHECK1-NEXT: [[TMP78:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP79:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB114:%.*]] = sub i32 [[TMP78]], [[TMP79]] +// CHECK1-NEXT: [[SUB115:%.*]] = sub i32 [[SUB114]], 1 +// CHECK1-NEXT: [[ADD116:%.*]] = add i32 [[SUB115]], 1 +// CHECK1-NEXT: [[DIV117:%.*]] = udiv i32 [[ADD116]], 1 +// CHECK1-NEXT: [[MUL118:%.*]] = mul i32 [[MUL113]], [[DIV117]] +// CHECK1-NEXT: [[CONV119:%.*]] = zext i32 [[MUL118]] to i64 +// CHECK1-NEXT: [[DIV120:%.*]] = sdiv i64 [[SUB110]], [[CONV119]] +// CHECK1-NEXT: [[TMP80:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[CONV121:%.*]] = sext i32 [[TMP80]] to i64 +// CHECK1-NEXT: [[MUL122:%.*]] = mul nsw i64 [[DIV120]], [[CONV121]] +// CHECK1-NEXT: [[ADD123:%.*]] = add nsw i64 [[CONV79]], [[MUL122]] +// CHECK1-NEXT: [[CONV124:%.*]] = trunc i64 [[ADD123]] to i32 +// CHECK1-NEXT: store i32 [[CONV124]], ptr [[J50]], align 4 +// CHECK1-NEXT: [[TMP81:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP82:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP83:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP84:%.*]] 
= load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB125:%.*]] = sub i32 [[TMP83]], [[TMP84]] +// CHECK1-NEXT: [[SUB126:%.*]] = sub i32 [[SUB125]], 1 +// CHECK1-NEXT: [[TMP85:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD127:%.*]] = add i32 [[SUB126]], [[TMP85]] +// CHECK1-NEXT: [[TMP86:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV128:%.*]] = udiv i32 [[ADD127]], [[TMP86]] +// CHECK1-NEXT: [[MUL129:%.*]] = mul i32 1, [[DIV128]] +// CHECK1-NEXT: [[TMP87:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB130:%.*]] = sub i32 [[TMP87]], -31 +// CHECK1-NEXT: [[DIV131:%.*]] = udiv i32 [[SUB130]], 32 +// CHECK1-NEXT: [[MUL132:%.*]] = mul i32 [[MUL129]], [[DIV131]] +// CHECK1-NEXT: [[TMP88:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP89:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB133:%.*]] = sub i32 [[TMP88]], [[TMP89]] +// CHECK1-NEXT: [[SUB134:%.*]] = sub i32 [[SUB133]], 1 +// CHECK1-NEXT: [[ADD135:%.*]] = add i32 [[SUB134]], 1 +// CHECK1-NEXT: [[DIV136:%.*]] = udiv i32 [[ADD135]], 1 +// CHECK1-NEXT: [[MUL137:%.*]] = mul i32 [[MUL132]], [[DIV136]] +// CHECK1-NEXT: [[CONV138:%.*]] = zext i32 [[MUL137]] to i64 +// CHECK1-NEXT: [[DIV139:%.*]] = sdiv i64 [[TMP82]], [[CONV138]] +// CHECK1-NEXT: [[TMP90:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP91:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB140:%.*]] = sub i32 [[TMP90]], [[TMP91]] +// CHECK1-NEXT: [[SUB141:%.*]] = sub i32 [[SUB140]], 1 +// CHECK1-NEXT: [[TMP92:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD142:%.*]] = add i32 [[SUB141]], [[TMP92]] +// CHECK1-NEXT: [[TMP93:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV143:%.*]] = udiv i32 [[ADD142]], [[TMP93]] +// CHECK1-NEXT: [[MUL144:%.*]] = mul i32 1, [[DIV143]] +// CHECK1-NEXT: [[TMP94:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB145:%.*]] = sub i32 [[TMP94]], -31 +// CHECK1-NEXT: [[DIV146:%.*]] = udiv i32 [[SUB145]], 32 +// CHECK1-NEXT: [[MUL147:%.*]] = mul i32 [[MUL144]], [[DIV146]] +// CHECK1-NEXT: [[TMP95:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP96:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB148:%.*]] = sub i32 [[TMP95]], [[TMP96]] +// CHECK1-NEXT: [[SUB149:%.*]] = sub i32 [[SUB148]], 1 +// CHECK1-NEXT: [[ADD150:%.*]] = add i32 [[SUB149]], 1 +// CHECK1-NEXT: [[DIV151:%.*]] = udiv i32 [[ADD150]], 1 +// CHECK1-NEXT: [[MUL152:%.*]] = mul i32 [[MUL147]], [[DIV151]] +// CHECK1-NEXT: [[CONV153:%.*]] = zext i32 [[MUL152]] to i64 +// CHECK1-NEXT: [[MUL154:%.*]] = mul nsw i64 [[DIV139]], [[CONV153]] +// CHECK1-NEXT: [[SUB155:%.*]] = sub nsw i64 [[TMP81]], [[MUL154]] +// CHECK1-NEXT: [[TMP97:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP98:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP99:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP100:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB156:%.*]] = sub i32 [[TMP99]], [[TMP100]] +// CHECK1-NEXT: [[SUB157:%.*]] = sub i32 [[SUB156]], 1 +// CHECK1-NEXT: [[TMP101:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD158:%.*]] = add i32 [[SUB157]], [[TMP101]] +// CHECK1-NEXT: [[TMP102:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV159:%.*]] = udiv i32 [[ADD158]], [[TMP102]] 
+// CHECK1-NEXT: [[MUL160:%.*]] = mul i32 1, [[DIV159]] +// CHECK1-NEXT: [[TMP103:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB161:%.*]] = sub i32 [[TMP103]], -31 +// CHECK1-NEXT: [[DIV162:%.*]] = udiv i32 [[SUB161]], 32 +// CHECK1-NEXT: [[MUL163:%.*]] = mul i32 [[MUL160]], [[DIV162]] +// CHECK1-NEXT: [[TMP104:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP105:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB164:%.*]] = sub i32 [[TMP104]], [[TMP105]] +// CHECK1-NEXT: [[SUB165:%.*]] = sub i32 [[SUB164]], 1 +// CHECK1-NEXT: [[ADD166:%.*]] = add i32 [[SUB165]], 1 +// CHECK1-NEXT: [[DIV167:%.*]] = udiv i32 [[ADD166]], 1 +// CHECK1-NEXT: [[MUL168:%.*]] = mul i32 [[MUL163]], [[DIV167]] +// CHECK1-NEXT: [[CONV169:%.*]] = zext i32 [[MUL168]] to i64 +// CHECK1-NEXT: [[DIV170:%.*]] = sdiv i64 [[TMP98]], [[CONV169]] +// CHECK1-NEXT: [[TMP106:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP107:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB171:%.*]] = sub i32 [[TMP106]], [[TMP107]] +// CHECK1-NEXT: [[SUB172:%.*]] = sub i32 [[SUB171]], 1 +// CHECK1-NEXT: [[TMP108:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD173:%.*]] = add i32 [[SUB172]], [[TMP108]] +// CHECK1-NEXT: [[TMP109:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV174:%.*]] = udiv i32 [[ADD173]], [[TMP109]] +// CHECK1-NEXT: [[MUL175:%.*]] = mul i32 1, [[DIV174]] +// CHECK1-NEXT: [[TMP110:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB176:%.*]] = sub i32 [[TMP110]], -31 +// CHECK1-NEXT: [[DIV177:%.*]] = udiv i32 [[SUB176]], 32 +// CHECK1-NEXT: [[MUL178:%.*]] = mul i32 [[MUL175]], [[DIV177]] +// CHECK1-NEXT: [[TMP111:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP112:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB179:%.*]] = sub i32 [[TMP111]], [[TMP112]] +// CHECK1-NEXT: [[SUB180:%.*]] = sub i32 [[SUB179]], 1 +// CHECK1-NEXT: [[ADD181:%.*]] = add i32 [[SUB180]], 1 +// CHECK1-NEXT: [[DIV182:%.*]] = udiv i32 [[ADD181]], 1 +// CHECK1-NEXT: [[MUL183:%.*]] = mul i32 [[MUL178]], [[DIV182]] +// CHECK1-NEXT: [[CONV184:%.*]] = zext i32 [[MUL183]] to i64 +// CHECK1-NEXT: [[MUL185:%.*]] = mul nsw i64 [[DIV170]], [[CONV184]] +// CHECK1-NEXT: [[SUB186:%.*]] = sub nsw i64 [[TMP97]], [[MUL185]] +// CHECK1-NEXT: [[TMP113:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB187:%.*]] = sub i32 [[TMP113]], -31 +// CHECK1-NEXT: [[DIV188:%.*]] = udiv i32 [[SUB187]], 32 +// CHECK1-NEXT: [[MUL189:%.*]] = mul i32 1, [[DIV188]] +// CHECK1-NEXT: [[TMP114:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP115:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB190:%.*]] = sub i32 [[TMP114]], [[TMP115]] +// CHECK1-NEXT: [[SUB191:%.*]] = sub i32 [[SUB190]], 1 +// CHECK1-NEXT: [[ADD192:%.*]] = add i32 [[SUB191]], 1 +// CHECK1-NEXT: [[DIV193:%.*]] = udiv i32 [[ADD192]], 1 +// CHECK1-NEXT: [[MUL194:%.*]] = mul i32 [[MUL189]], [[DIV193]] +// CHECK1-NEXT: [[CONV195:%.*]] = zext i32 [[MUL194]] to i64 +// CHECK1-NEXT: [[DIV196:%.*]] = sdiv i64 [[SUB186]], [[CONV195]] +// CHECK1-NEXT: [[TMP116:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB197:%.*]] = sub i32 [[TMP116]], -31 +// CHECK1-NEXT: [[DIV198:%.*]] = udiv i32 [[SUB197]], 32 +// CHECK1-NEXT: [[MUL199:%.*]] = mul i32 1, [[DIV198]] +// CHECK1-NEXT: 
[[TMP117:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP118:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB200:%.*]] = sub i32 [[TMP117]], [[TMP118]] +// CHECK1-NEXT: [[SUB201:%.*]] = sub i32 [[SUB200]], 1 +// CHECK1-NEXT: [[ADD202:%.*]] = add i32 [[SUB201]], 1 +// CHECK1-NEXT: [[DIV203:%.*]] = udiv i32 [[ADD202]], 1 +// CHECK1-NEXT: [[MUL204:%.*]] = mul i32 [[MUL199]], [[DIV203]] +// CHECK1-NEXT: [[CONV205:%.*]] = zext i32 [[MUL204]] to i64 +// CHECK1-NEXT: [[MUL206:%.*]] = mul nsw i64 [[DIV196]], [[CONV205]] +// CHECK1-NEXT: [[SUB207:%.*]] = sub nsw i64 [[SUB155]], [[MUL206]] +// CHECK1-NEXT: [[TMP119:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP120:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB208:%.*]] = sub i32 [[TMP119]], [[TMP120]] +// CHECK1-NEXT: [[SUB209:%.*]] = sub i32 [[SUB208]], 1 +// CHECK1-NEXT: [[ADD210:%.*]] = add i32 [[SUB209]], 1 +// CHECK1-NEXT: [[DIV211:%.*]] = udiv i32 [[ADD210]], 1 +// CHECK1-NEXT: [[MUL212:%.*]] = mul i32 1, [[DIV211]] +// CHECK1-NEXT: [[CONV213:%.*]] = zext i32 [[MUL212]] to i64 +// CHECK1-NEXT: [[DIV214:%.*]] = sdiv i64 [[SUB207]], [[CONV213]] +// CHECK1-NEXT: [[MUL215:%.*]] = mul nsw i64 [[DIV214]], 32 +// CHECK1-NEXT: [[ADD216:%.*]] = add nsw i64 0, [[MUL215]] +// CHECK1-NEXT: [[CONV217:%.*]] = trunc i64 [[ADD216]] to i32 +// CHECK1-NEXT: store i32 [[CONV217]], ptr [[DOTFLOOR_0_IV_K51]], align 4 +// CHECK1-NEXT: [[TMP121:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[CONV218:%.*]] = zext i32 [[TMP121]] to i64 +// CHECK1-NEXT: [[TMP122:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP123:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP124:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP125:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB219:%.*]] = sub i32 [[TMP124]], [[TMP125]] +// CHECK1-NEXT: [[SUB220:%.*]] = sub i32 [[SUB219]], 1 +// CHECK1-NEXT: [[TMP126:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD221:%.*]] = add i32 [[SUB220]], [[TMP126]] +// CHECK1-NEXT: [[TMP127:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV222:%.*]] = udiv i32 [[ADD221]], [[TMP127]] +// CHECK1-NEXT: [[MUL223:%.*]] = mul i32 1, [[DIV222]] +// CHECK1-NEXT: [[TMP128:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB224:%.*]] = sub i32 [[TMP128]], -31 +// CHECK1-NEXT: [[DIV225:%.*]] = udiv i32 [[SUB224]], 32 +// CHECK1-NEXT: [[MUL226:%.*]] = mul i32 [[MUL223]], [[DIV225]] +// CHECK1-NEXT: [[TMP129:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP130:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB227:%.*]] = sub i32 [[TMP129]], [[TMP130]] +// CHECK1-NEXT: [[SUB228:%.*]] = sub i32 [[SUB227]], 1 +// CHECK1-NEXT: [[ADD229:%.*]] = add i32 [[SUB228]], 1 +// CHECK1-NEXT: [[DIV230:%.*]] = udiv i32 [[ADD229]], 1 +// CHECK1-NEXT: [[MUL231:%.*]] = mul i32 [[MUL226]], [[DIV230]] +// CHECK1-NEXT: [[CONV232:%.*]] = zext i32 [[MUL231]] to i64 +// CHECK1-NEXT: [[DIV233:%.*]] = sdiv i64 [[TMP123]], [[CONV232]] +// CHECK1-NEXT: [[TMP131:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP132:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB234:%.*]] = sub i32 [[TMP131]], [[TMP132]] +// CHECK1-NEXT: [[SUB235:%.*]] = sub i32 [[SUB234]], 1 +// CHECK1-NEXT: [[TMP133:%.*]] = load i32, ptr 
[[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD236:%.*]] = add i32 [[SUB235]], [[TMP133]] +// CHECK1-NEXT: [[TMP134:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV237:%.*]] = udiv i32 [[ADD236]], [[TMP134]] +// CHECK1-NEXT: [[MUL238:%.*]] = mul i32 1, [[DIV237]] +// CHECK1-NEXT: [[TMP135:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB239:%.*]] = sub i32 [[TMP135]], -31 +// CHECK1-NEXT: [[DIV240:%.*]] = udiv i32 [[SUB239]], 32 +// CHECK1-NEXT: [[MUL241:%.*]] = mul i32 [[MUL238]], [[DIV240]] +// CHECK1-NEXT: [[TMP136:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP137:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB242:%.*]] = sub i32 [[TMP136]], [[TMP137]] +// CHECK1-NEXT: [[SUB243:%.*]] = sub i32 [[SUB242]], 1 +// CHECK1-NEXT: [[ADD244:%.*]] = add i32 [[SUB243]], 1 +// CHECK1-NEXT: [[DIV245:%.*]] = udiv i32 [[ADD244]], 1 +// CHECK1-NEXT: [[MUL246:%.*]] = mul i32 [[MUL241]], [[DIV245]] +// CHECK1-NEXT: [[CONV247:%.*]] = zext i32 [[MUL246]] to i64 +// CHECK1-NEXT: [[MUL248:%.*]] = mul nsw i64 [[DIV233]], [[CONV247]] +// CHECK1-NEXT: [[SUB249:%.*]] = sub nsw i64 [[TMP122]], [[MUL248]] +// CHECK1-NEXT: [[TMP138:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP139:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP140:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP141:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB250:%.*]] = sub i32 [[TMP140]], [[TMP141]] +// CHECK1-NEXT: [[SUB251:%.*]] = sub i32 [[SUB250]], 1 +// CHECK1-NEXT: [[TMP142:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD252:%.*]] = add i32 [[SUB251]], [[TMP142]] +// CHECK1-NEXT: [[TMP143:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV253:%.*]] = udiv i32 [[ADD252]], [[TMP143]] +// CHECK1-NEXT: [[MUL254:%.*]] = mul i32 1, [[DIV253]] +// CHECK1-NEXT: [[TMP144:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB255:%.*]] = sub i32 [[TMP144]], -31 +// CHECK1-NEXT: [[DIV256:%.*]] = udiv i32 [[SUB255]], 32 +// CHECK1-NEXT: [[MUL257:%.*]] = mul i32 [[MUL254]], [[DIV256]] +// CHECK1-NEXT: [[TMP145:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP146:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB258:%.*]] = sub i32 [[TMP145]], [[TMP146]] +// CHECK1-NEXT: [[SUB259:%.*]] = sub i32 [[SUB258]], 1 +// CHECK1-NEXT: [[ADD260:%.*]] = add i32 [[SUB259]], 1 +// CHECK1-NEXT: [[DIV261:%.*]] = udiv i32 [[ADD260]], 1 +// CHECK1-NEXT: [[MUL262:%.*]] = mul i32 [[MUL257]], [[DIV261]] +// CHECK1-NEXT: [[CONV263:%.*]] = zext i32 [[MUL262]] to i64 +// CHECK1-NEXT: [[DIV264:%.*]] = sdiv i64 [[TMP139]], [[CONV263]] +// CHECK1-NEXT: [[TMP147:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP148:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB265:%.*]] = sub i32 [[TMP147]], [[TMP148]] +// CHECK1-NEXT: [[SUB266:%.*]] = sub i32 [[SUB265]], 1 +// CHECK1-NEXT: [[TMP149:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD267:%.*]] = add i32 [[SUB266]], [[TMP149]] +// CHECK1-NEXT: [[TMP150:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV268:%.*]] = udiv i32 [[ADD267]], [[TMP150]] +// CHECK1-NEXT: [[MUL269:%.*]] = mul i32 1, [[DIV268]] +// CHECK1-NEXT: [[TMP151:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB270:%.*]] = sub i32 
[[TMP151]], -31 +// CHECK1-NEXT: [[DIV271:%.*]] = udiv i32 [[SUB270]], 32 +// CHECK1-NEXT: [[MUL272:%.*]] = mul i32 [[MUL269]], [[DIV271]] +// CHECK1-NEXT: [[TMP152:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP153:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB273:%.*]] = sub i32 [[TMP152]], [[TMP153]] +// CHECK1-NEXT: [[SUB274:%.*]] = sub i32 [[SUB273]], 1 +// CHECK1-NEXT: [[ADD275:%.*]] = add i32 [[SUB274]], 1 +// CHECK1-NEXT: [[DIV276:%.*]] = udiv i32 [[ADD275]], 1 +// CHECK1-NEXT: [[MUL277:%.*]] = mul i32 [[MUL272]], [[DIV276]] +// CHECK1-NEXT: [[CONV278:%.*]] = zext i32 [[MUL277]] to i64 +// CHECK1-NEXT: [[MUL279:%.*]] = mul nsw i64 [[DIV264]], [[CONV278]] +// CHECK1-NEXT: [[SUB280:%.*]] = sub nsw i64 [[TMP138]], [[MUL279]] +// CHECK1-NEXT: [[TMP154:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB281:%.*]] = sub i32 [[TMP154]], -31 +// CHECK1-NEXT: [[DIV282:%.*]] = udiv i32 [[SUB281]], 32 +// CHECK1-NEXT: [[MUL283:%.*]] = mul i32 1, [[DIV282]] +// CHECK1-NEXT: [[TMP155:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP156:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB284:%.*]] = sub i32 [[TMP155]], [[TMP156]] +// CHECK1-NEXT: [[SUB285:%.*]] = sub i32 [[SUB284]], 1 +// CHECK1-NEXT: [[ADD286:%.*]] = add i32 [[SUB285]], 1 +// CHECK1-NEXT: [[DIV287:%.*]] = udiv i32 [[ADD286]], 1 +// CHECK1-NEXT: [[MUL288:%.*]] = mul i32 [[MUL283]], [[DIV287]] +// CHECK1-NEXT: [[CONV289:%.*]] = zext i32 [[MUL288]] to i64 +// CHECK1-NEXT: [[DIV290:%.*]] = sdiv i64 [[SUB280]], [[CONV289]] +// CHECK1-NEXT: [[TMP157:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB291:%.*]] = sub i32 [[TMP157]], -31 +// CHECK1-NEXT: [[DIV292:%.*]] = udiv i32 [[SUB291]], 32 +// CHECK1-NEXT: [[MUL293:%.*]] = mul i32 1, [[DIV292]] +// CHECK1-NEXT: [[TMP158:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP159:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB294:%.*]] = sub i32 [[TMP158]], [[TMP159]] +// CHECK1-NEXT: [[SUB295:%.*]] = sub i32 [[SUB294]], 1 +// CHECK1-NEXT: [[ADD296:%.*]] = add i32 [[SUB295]], 1 +// CHECK1-NEXT: [[DIV297:%.*]] = udiv i32 [[ADD296]], 1 +// CHECK1-NEXT: [[MUL298:%.*]] = mul i32 [[MUL293]], [[DIV297]] +// CHECK1-NEXT: [[CONV299:%.*]] = zext i32 [[MUL298]] to i64 +// CHECK1-NEXT: [[MUL300:%.*]] = mul nsw i64 [[DIV290]], [[CONV299]] +// CHECK1-NEXT: [[SUB301:%.*]] = sub nsw i64 [[SUB249]], [[MUL300]] +// CHECK1-NEXT: [[TMP160:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP161:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP162:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP163:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB302:%.*]] = sub i32 [[TMP162]], [[TMP163]] +// CHECK1-NEXT: [[SUB303:%.*]] = sub i32 [[SUB302]], 1 +// CHECK1-NEXT: [[TMP164:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD304:%.*]] = add i32 [[SUB303]], [[TMP164]] +// CHECK1-NEXT: [[TMP165:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV305:%.*]] = udiv i32 [[ADD304]], [[TMP165]] +// CHECK1-NEXT: [[MUL306:%.*]] = mul i32 1, [[DIV305]] +// CHECK1-NEXT: [[TMP166:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB307:%.*]] = sub i32 [[TMP166]], -31 +// CHECK1-NEXT: [[DIV308:%.*]] = udiv i32 [[SUB307]], 32 +// CHECK1-NEXT: [[MUL309:%.*]] = mul i32 
[[MUL306]], [[DIV308]] +// CHECK1-NEXT: [[TMP167:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP168:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB310:%.*]] = sub i32 [[TMP167]], [[TMP168]] +// CHECK1-NEXT: [[SUB311:%.*]] = sub i32 [[SUB310]], 1 +// CHECK1-NEXT: [[ADD312:%.*]] = add i32 [[SUB311]], 1 +// CHECK1-NEXT: [[DIV313:%.*]] = udiv i32 [[ADD312]], 1 +// CHECK1-NEXT: [[MUL314:%.*]] = mul i32 [[MUL309]], [[DIV313]] +// CHECK1-NEXT: [[CONV315:%.*]] = zext i32 [[MUL314]] to i64 +// CHECK1-NEXT: [[DIV316:%.*]] = sdiv i64 [[TMP161]], [[CONV315]] +// CHECK1-NEXT: [[TMP169:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP170:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB317:%.*]] = sub i32 [[TMP169]], [[TMP170]] +// CHECK1-NEXT: [[SUB318:%.*]] = sub i32 [[SUB317]], 1 +// CHECK1-NEXT: [[TMP171:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD319:%.*]] = add i32 [[SUB318]], [[TMP171]] +// CHECK1-NEXT: [[TMP172:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV320:%.*]] = udiv i32 [[ADD319]], [[TMP172]] +// CHECK1-NEXT: [[MUL321:%.*]] = mul i32 1, [[DIV320]] +// CHECK1-NEXT: [[TMP173:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB322:%.*]] = sub i32 [[TMP173]], -31 +// CHECK1-NEXT: [[DIV323:%.*]] = udiv i32 [[SUB322]], 32 +// CHECK1-NEXT: [[MUL324:%.*]] = mul i32 [[MUL321]], [[DIV323]] +// CHECK1-NEXT: [[TMP174:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP175:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB325:%.*]] = sub i32 [[TMP174]], [[TMP175]] +// CHECK1-NEXT: [[SUB326:%.*]] = sub i32 [[SUB325]], 1 +// CHECK1-NEXT: [[ADD327:%.*]] = add i32 [[SUB326]], 1 +// CHECK1-NEXT: [[DIV328:%.*]] = udiv i32 [[ADD327]], 1 +// CHECK1-NEXT: [[MUL329:%.*]] = mul i32 [[MUL324]], [[DIV328]] +// CHECK1-NEXT: [[CONV330:%.*]] = zext i32 [[MUL329]] to i64 +// CHECK1-NEXT: [[MUL331:%.*]] = mul nsw i64 [[DIV316]], [[CONV330]] +// CHECK1-NEXT: [[SUB332:%.*]] = sub nsw i64 [[TMP160]], [[MUL331]] +// CHECK1-NEXT: [[TMP176:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP177:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP178:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP179:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB333:%.*]] = sub i32 [[TMP178]], [[TMP179]] +// CHECK1-NEXT: [[SUB334:%.*]] = sub i32 [[SUB333]], 1 +// CHECK1-NEXT: [[TMP180:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD335:%.*]] = add i32 [[SUB334]], [[TMP180]] +// CHECK1-NEXT: [[TMP181:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV336:%.*]] = udiv i32 [[ADD335]], [[TMP181]] +// CHECK1-NEXT: [[MUL337:%.*]] = mul i32 1, [[DIV336]] +// CHECK1-NEXT: [[TMP182:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB338:%.*]] = sub i32 [[TMP182]], -31 +// CHECK1-NEXT: [[DIV339:%.*]] = udiv i32 [[SUB338]], 32 +// CHECK1-NEXT: [[MUL340:%.*]] = mul i32 [[MUL337]], [[DIV339]] +// CHECK1-NEXT: [[TMP183:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP184:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB341:%.*]] = sub i32 [[TMP183]], [[TMP184]] +// CHECK1-NEXT: [[SUB342:%.*]] = sub i32 [[SUB341]], 1 +// CHECK1-NEXT: [[ADD343:%.*]] = add i32 [[SUB342]], 1 +// CHECK1-NEXT: [[DIV344:%.*]] = udiv i32 [[ADD343]], 
1 +// CHECK1-NEXT: [[MUL345:%.*]] = mul i32 [[MUL340]], [[DIV344]] +// CHECK1-NEXT: [[CONV346:%.*]] = zext i32 [[MUL345]] to i64 +// CHECK1-NEXT: [[DIV347:%.*]] = sdiv i64 [[TMP177]], [[CONV346]] +// CHECK1-NEXT: [[TMP185:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP186:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB348:%.*]] = sub i32 [[TMP185]], [[TMP186]] +// CHECK1-NEXT: [[SUB349:%.*]] = sub i32 [[SUB348]], 1 +// CHECK1-NEXT: [[TMP187:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD350:%.*]] = add i32 [[SUB349]], [[TMP187]] +// CHECK1-NEXT: [[TMP188:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV351:%.*]] = udiv i32 [[ADD350]], [[TMP188]] +// CHECK1-NEXT: [[MUL352:%.*]] = mul i32 1, [[DIV351]] +// CHECK1-NEXT: [[TMP189:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB353:%.*]] = sub i32 [[TMP189]], -31 +// CHECK1-NEXT: [[DIV354:%.*]] = udiv i32 [[SUB353]], 32 +// CHECK1-NEXT: [[MUL355:%.*]] = mul i32 [[MUL352]], [[DIV354]] +// CHECK1-NEXT: [[TMP190:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP191:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB356:%.*]] = sub i32 [[TMP190]], [[TMP191]] +// CHECK1-NEXT: [[SUB357:%.*]] = sub i32 [[SUB356]], 1 +// CHECK1-NEXT: [[ADD358:%.*]] = add i32 [[SUB357]], 1 +// CHECK1-NEXT: [[DIV359:%.*]] = udiv i32 [[ADD358]], 1 +// CHECK1-NEXT: [[MUL360:%.*]] = mul i32 [[MUL355]], [[DIV359]] +// CHECK1-NEXT: [[CONV361:%.*]] = zext i32 [[MUL360]] to i64 +// CHECK1-NEXT: [[MUL362:%.*]] = mul nsw i64 [[DIV347]], [[CONV361]] +// CHECK1-NEXT: [[SUB363:%.*]] = sub nsw i64 [[TMP176]], [[MUL362]] +// CHECK1-NEXT: [[TMP192:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB364:%.*]] = sub i32 [[TMP192]], -31 +// CHECK1-NEXT: [[DIV365:%.*]] = udiv i32 [[SUB364]], 32 +// CHECK1-NEXT: [[MUL366:%.*]] = mul i32 1, [[DIV365]] +// CHECK1-NEXT: [[TMP193:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP194:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB367:%.*]] = sub i32 [[TMP193]], [[TMP194]] +// CHECK1-NEXT: [[SUB368:%.*]] = sub i32 [[SUB367]], 1 +// CHECK1-NEXT: [[ADD369:%.*]] = add i32 [[SUB368]], 1 +// CHECK1-NEXT: [[DIV370:%.*]] = udiv i32 [[ADD369]], 1 +// CHECK1-NEXT: [[MUL371:%.*]] = mul i32 [[MUL366]], [[DIV370]] +// CHECK1-NEXT: [[CONV372:%.*]] = zext i32 [[MUL371]] to i64 +// CHECK1-NEXT: [[DIV373:%.*]] = sdiv i64 [[SUB363]], [[CONV372]] +// CHECK1-NEXT: [[TMP195:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB374:%.*]] = sub i32 [[TMP195]], -31 +// CHECK1-NEXT: [[DIV375:%.*]] = udiv i32 [[SUB374]], 32 +// CHECK1-NEXT: [[MUL376:%.*]] = mul i32 1, [[DIV375]] +// CHECK1-NEXT: [[TMP196:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP197:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB377:%.*]] = sub i32 [[TMP196]], [[TMP197]] +// CHECK1-NEXT: [[SUB378:%.*]] = sub i32 [[SUB377]], 1 +// CHECK1-NEXT: [[ADD379:%.*]] = add i32 [[SUB378]], 1 +// CHECK1-NEXT: [[DIV380:%.*]] = udiv i32 [[ADD379]], 1 +// CHECK1-NEXT: [[MUL381:%.*]] = mul i32 [[MUL376]], [[DIV380]] +// CHECK1-NEXT: [[CONV382:%.*]] = zext i32 [[MUL381]] to i64 +// CHECK1-NEXT: [[MUL383:%.*]] = mul nsw i64 [[DIV373]], [[CONV382]] +// CHECK1-NEXT: [[SUB384:%.*]] = sub nsw i64 [[SUB332]], [[MUL383]] +// CHECK1-NEXT: [[TMP198:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], 
align 4 +// CHECK1-NEXT: [[TMP199:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB385:%.*]] = sub i32 [[TMP198]], [[TMP199]] +// CHECK1-NEXT: [[SUB386:%.*]] = sub i32 [[SUB385]], 1 +// CHECK1-NEXT: [[ADD387:%.*]] = add i32 [[SUB386]], 1 +// CHECK1-NEXT: [[DIV388:%.*]] = udiv i32 [[ADD387]], 1 +// CHECK1-NEXT: [[MUL389:%.*]] = mul i32 1, [[DIV388]] +// CHECK1-NEXT: [[CONV390:%.*]] = zext i32 [[MUL389]] to i64 +// CHECK1-NEXT: [[DIV391:%.*]] = sdiv i64 [[SUB384]], [[CONV390]] +// CHECK1-NEXT: [[TMP200:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP201:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB392:%.*]] = sub i32 [[TMP200]], [[TMP201]] +// CHECK1-NEXT: [[SUB393:%.*]] = sub i32 [[SUB392]], 1 +// CHECK1-NEXT: [[ADD394:%.*]] = add i32 [[SUB393]], 1 +// CHECK1-NEXT: [[DIV395:%.*]] = udiv i32 [[ADD394]], 1 +// CHECK1-NEXT: [[MUL396:%.*]] = mul i32 1, [[DIV395]] +// CHECK1-NEXT: [[CONV397:%.*]] = zext i32 [[MUL396]] to i64 +// CHECK1-NEXT: [[MUL398:%.*]] = mul nsw i64 [[DIV391]], [[CONV397]] +// CHECK1-NEXT: [[SUB399:%.*]] = sub nsw i64 [[SUB301]], [[MUL398]] +// CHECK1-NEXT: [[MUL400:%.*]] = mul nsw i64 [[SUB399]], 1 +// CHECK1-NEXT: [[ADD401:%.*]] = add nsw i64 [[CONV218]], [[MUL400]] +// CHECK1-NEXT: [[CONV402:%.*]] = trunc i64 [[ADD401]] to i32 +// CHECK1-NEXT: store i32 [[CONV402]], ptr [[DOTTILE_0_IV_K52]], align 4 +// CHECK1-NEXT: [[TMP202:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_8]], align 4 +// CHECK1-NEXT: [[TMP203:%.*]] = load i32, ptr [[DOTTILE_0_IV_K52]], align 4 +// CHECK1-NEXT: [[TMP204:%.*]] = load i32, ptr [[DOTNEW_STEP10]], align 4 +// CHECK1-NEXT: [[MUL403:%.*]] = mul i32 [[TMP203]], [[TMP204]] +// CHECK1-NEXT: [[ADD404:%.*]] = add i32 [[TMP202]], [[MUL403]] +// CHECK1-NEXT: store i32 [[ADD404]], ptr [[K]], align 4 +// CHECK1-NEXT: [[TMP205:%.*]] = load i32, ptr [[I49]], align 4 +// CHECK1-NEXT: [[TMP206:%.*]] = load i32, ptr [[J50]], align 4 +// CHECK1-NEXT: [[TMP207:%.*]] = load i32, ptr [[K]], align 4 +// CHECK1-NEXT: call void (...) 
@body(i32 noundef [[TMP205]], i32 noundef [[TMP206]], i32 noundef [[TMP207]]) +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP208:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[ADD405:%.*]] = add nsw i64 [[TMP208]], 1 +// CHECK1-NEXT: store i64 [[ADD405]], ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_Z4foo8ILi64EEviii +// CHECK1-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR0]] comdat { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[STEP_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[_TMP2:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[_TMP3:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTNEW_STEP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_5:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTNEW_STEP7:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_8:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_9:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTNEW_STEP10:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_11:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_14:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_16:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_17:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_22:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTFLOOR_0_IV_K:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTILE_0_IV_K:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I49:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[J50:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTFLOOR_0_IV_K51:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTILE_0_IV_K52:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]]) +// CHECK1-NEXT: store i32 [[START]], ptr [[START_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[END]], ptr [[END_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[STEP]], ptr [[STEP_ADDR]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// 
CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[END_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_4]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[STEP_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[END_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[STEP_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP6]], ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP7]], ptr [[K]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP8]], ptr [[DOTCAPTURE_EXPR_8]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[END_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP9]], ptr [[DOTCAPTURE_EXPR_9]], align 4 +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[STEP_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTNEW_STEP10]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_9]], align 4 +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_8]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub i32 [[TMP11]], [[TMP12]] +// CHECK1-NEXT: [[SUB12:%.*]] = sub i32 [[SUB]], 1 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTNEW_STEP10]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add i32 [[SUB12]], [[TMP13]] +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTNEW_STEP10]], align 4 +// CHECK1-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP14]] +// CHECK1-NEXT: [[SUB13:%.*]] = sub i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB13]], ptr [[DOTCAPTURE_EXPR_11]], align 4 +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_11]], align 4 +// CHECK1-NEXT: [[ADD15:%.*]] = add i32 [[TMP15]], 1 +// CHECK1-NEXT: store i32 [[ADD15]], ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[_TMP2]], align 4 +// CHECK1-NEXT: store i32 [[TMP16]], ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_11]], align 4 +// CHECK1-NEXT: [[ADD18:%.*]] = add i32 [[TMP17]], 1 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[_TMP2]], align 4 +// CHECK1-NEXT: [[ADD19:%.*]] = add i32 [[TMP18]], 64 +// CHECK1-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD18]], [[ADD19]] +// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_11]], align 4 +// CHECK1-NEXT: [[ADD20:%.*]] = add i32 [[TMP19]], 1 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[_TMP2]], align 4 +// CHECK1-NEXT: [[ADD21:%.*]] = add i32 [[TMP20]], 64 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[ADD20]], [[COND_TRUE]] ], [ [[ADD21]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4 +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB23:%.*]] = sub i32 [[TMP21]], [[TMP22]] +// CHECK1-NEXT: [[SUB24:%.*]] = sub i32 [[SUB23]], 1 +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr 
[[DOTNEW_STEP]], align 4 +// CHECK1-NEXT: [[ADD25:%.*]] = add i32 [[SUB24]], [[TMP23]] +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4 +// CHECK1-NEXT: [[DIV26:%.*]] = udiv i32 [[ADD25]], [[TMP24]] +// CHECK1-NEXT: [[CONV:%.*]] = zext i32 [[DIV26]] to i64 +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB27:%.*]] = sub i32 [[TMP25]], [[TMP26]] +// CHECK1-NEXT: [[SUB28:%.*]] = sub i32 [[SUB27]], 1 +// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD29:%.*]] = add i32 [[SUB28]], [[TMP27]] +// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV30:%.*]] = udiv i32 [[ADD29]], [[TMP28]] +// CHECK1-NEXT: [[CONV31:%.*]] = zext i32 [[DIV30]] to i64 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV31]] +// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB32:%.*]] = sub i32 [[TMP29]], -63 +// CHECK1-NEXT: [[DIV33:%.*]] = udiv i32 [[SUB32]], 64 +// CHECK1-NEXT: [[CONV34:%.*]] = zext i32 [[DIV33]] to i64 +// CHECK1-NEXT: [[MUL35:%.*]] = mul nsw i64 [[MUL]], [[CONV34]] +// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB36:%.*]] = sub i32 [[TMP30]], [[TMP31]] +// CHECK1-NEXT: [[SUB37:%.*]] = sub i32 [[SUB36]], 1 +// CHECK1-NEXT: [[ADD38:%.*]] = add i32 [[SUB37]], 1 +// CHECK1-NEXT: [[DIV39:%.*]] = udiv i32 [[ADD38]], 1 +// CHECK1-NEXT: [[CONV40:%.*]] = zext i32 [[DIV39]] to i64 +// CHECK1-NEXT: [[MUL41:%.*]] = mul nsw i64 [[MUL35]], [[CONV40]] +// CHECK1-NEXT: [[SUB42:%.*]] = sub nsw i64 [[MUL41]], 1 +// CHECK1-NEXT: store i64 [[SUB42]], ptr [[DOTCAPTURE_EXPR_22]], align 8 +// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: store i32 [[TMP32]], ptr [[I]], align 4 +// CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: store i32 [[TMP33]], ptr [[J]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[DOTFLOOR_0_IV_K]], align 4 +// CHECK1-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: store i32 [[TMP34]], ptr [[DOTTILE_0_IV_K]], align 4 +// CHECK1-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4 +// CHECK1-NEXT: [[CMP43:%.*]] = icmp slt i32 [[TMP35]], [[TMP36]] +// CHECK1-NEXT: br i1 [[CMP43]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: land.lhs.true: +// CHECK1-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[CMP44:%.*]] = icmp slt i32 [[TMP37]], [[TMP38]] +// CHECK1-NEXT: br i1 [[CMP44]], label [[LAND_LHS_TRUE45:%.*]], label [[OMP_PRECOND_END]] +// CHECK1: land.lhs.true45: +// CHECK1-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[CMP46:%.*]] = icmp ult i32 0, [[TMP39]] +// CHECK1-NEXT: br i1 [[CMP46]], label [[LAND_LHS_TRUE47:%.*]], label [[OMP_PRECOND_END]] +// CHECK1: land.lhs.true47: +// CHECK1-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[CMP48:%.*]] = icmp ult i32 [[TMP40]], [[TMP41]] 
+// CHECK1-NEXT: br i1 [[CMP48]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i64 0, ptr [[DOTOMP_LB]], align 8 +// CHECK1-NEXT: [[TMP42:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_22]], align 8 +// CHECK1-NEXT: store i64 [[TMP42]], ptr [[DOTOMP_UB]], align 8 +// CHECK1-NEXT: store i64 1, ptr [[DOTOMP_STRIDE]], align 8 +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_8(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i64 1, i64 1) +// CHECK1-NEXT: [[TMP43:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8 +// CHECK1-NEXT: [[TMP44:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_22]], align 8 +// CHECK1-NEXT: [[CMP53:%.*]] = icmp sgt i64 [[TMP43]], [[TMP44]] +// CHECK1-NEXT: br i1 [[CMP53]], label [[COND_TRUE54:%.*]], label [[COND_FALSE55:%.*]] +// CHECK1: cond.true54: +// CHECK1-NEXT: [[TMP45:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_22]], align 8 +// CHECK1-NEXT: br label [[COND_END56:%.*]] +// CHECK1: cond.false55: +// CHECK1-NEXT: [[TMP46:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8 +// CHECK1-NEXT: br label [[COND_END56]] +// CHECK1: cond.end56: +// CHECK1-NEXT: [[COND57:%.*]] = phi i64 [ [[TMP45]], [[COND_TRUE54]] ], [ [[TMP46]], [[COND_FALSE55]] ] +// CHECK1-NEXT: store i64 [[COND57]], ptr [[DOTOMP_UB]], align 8 +// CHECK1-NEXT: [[TMP47:%.*]] = load i64, ptr [[DOTOMP_LB]], align 8 +// CHECK1-NEXT: store i64 [[TMP47]], ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP48:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP49:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8 +// CHECK1-NEXT: [[CMP58:%.*]] = icmp sle i64 [[TMP48]], [[TMP49]] +// CHECK1-NEXT: br i1 [[CMP58]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CONV59:%.*]] = sext i32 [[TMP50]] to i64 +// CHECK1-NEXT: [[TMP51:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP52:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP53:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB60:%.*]] = sub i32 [[TMP52]], [[TMP53]] +// CHECK1-NEXT: [[SUB61:%.*]] = sub i32 [[SUB60]], 1 +// CHECK1-NEXT: [[TMP54:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD62:%.*]] = add i32 [[SUB61]], [[TMP54]] +// CHECK1-NEXT: [[TMP55:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV63:%.*]] = udiv i32 [[ADD62]], [[TMP55]] +// CHECK1-NEXT: [[MUL64:%.*]] = mul i32 1, [[DIV63]] +// CHECK1-NEXT: [[TMP56:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB65:%.*]] = sub i32 [[TMP56]], -63 +// CHECK1-NEXT: [[DIV66:%.*]] = udiv i32 [[SUB65]], 64 +// CHECK1-NEXT: [[MUL67:%.*]] = mul i32 [[MUL64]], [[DIV66]] +// CHECK1-NEXT: [[TMP57:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP58:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB68:%.*]] = sub i32 [[TMP57]], [[TMP58]] +// CHECK1-NEXT: [[SUB69:%.*]] = sub i32 [[SUB68]], 1 +// CHECK1-NEXT: [[ADD70:%.*]] = add i32 [[SUB69]], 1 +// CHECK1-NEXT: [[DIV71:%.*]] = udiv i32 [[ADD70]], 1 +// CHECK1-NEXT: [[MUL72:%.*]] = mul i32 [[MUL67]], [[DIV71]] +// CHECK1-NEXT: [[CONV73:%.*]] = zext i32 [[MUL72]] to i64 +// 
CHECK1-NEXT: [[DIV74:%.*]] = sdiv i64 [[TMP51]], [[CONV73]] +// CHECK1-NEXT: [[TMP59:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4 +// CHECK1-NEXT: [[CONV75:%.*]] = sext i32 [[TMP59]] to i64 +// CHECK1-NEXT: [[MUL76:%.*]] = mul nsw i64 [[DIV74]], [[CONV75]] +// CHECK1-NEXT: [[ADD77:%.*]] = add nsw i64 [[CONV59]], [[MUL76]] +// CHECK1-NEXT: [[CONV78:%.*]] = trunc i64 [[ADD77]] to i32 +// CHECK1-NEXT: store i32 [[CONV78]], ptr [[I49]], align 4 +// CHECK1-NEXT: [[TMP60:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[CONV79:%.*]] = sext i32 [[TMP60]] to i64 +// CHECK1-NEXT: [[TMP61:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP62:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP63:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP64:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB80:%.*]] = sub i32 [[TMP63]], [[TMP64]] +// CHECK1-NEXT: [[SUB81:%.*]] = sub i32 [[SUB80]], 1 +// CHECK1-NEXT: [[TMP65:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD82:%.*]] = add i32 [[SUB81]], [[TMP65]] +// CHECK1-NEXT: [[TMP66:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV83:%.*]] = udiv i32 [[ADD82]], [[TMP66]] +// CHECK1-NEXT: [[MUL84:%.*]] = mul i32 1, [[DIV83]] +// CHECK1-NEXT: [[TMP67:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB85:%.*]] = sub i32 [[TMP67]], -63 +// CHECK1-NEXT: [[DIV86:%.*]] = udiv i32 [[SUB85]], 64 +// CHECK1-NEXT: [[MUL87:%.*]] = mul i32 [[MUL84]], [[DIV86]] +// CHECK1-NEXT: [[TMP68:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP69:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB88:%.*]] = sub i32 [[TMP68]], [[TMP69]] +// CHECK1-NEXT: [[SUB89:%.*]] = sub i32 [[SUB88]], 1 +// CHECK1-NEXT: [[ADD90:%.*]] = add i32 [[SUB89]], 1 +// CHECK1-NEXT: [[DIV91:%.*]] = udiv i32 [[ADD90]], 1 +// CHECK1-NEXT: [[MUL92:%.*]] = mul i32 [[MUL87]], [[DIV91]] +// CHECK1-NEXT: [[CONV93:%.*]] = zext i32 [[MUL92]] to i64 +// CHECK1-NEXT: [[DIV94:%.*]] = sdiv i64 [[TMP62]], [[CONV93]] +// CHECK1-NEXT: [[TMP70:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP71:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB95:%.*]] = sub i32 [[TMP70]], [[TMP71]] +// CHECK1-NEXT: [[SUB96:%.*]] = sub i32 [[SUB95]], 1 +// CHECK1-NEXT: [[TMP72:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD97:%.*]] = add i32 [[SUB96]], [[TMP72]] +// CHECK1-NEXT: [[TMP73:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV98:%.*]] = udiv i32 [[ADD97]], [[TMP73]] +// CHECK1-NEXT: [[MUL99:%.*]] = mul i32 1, [[DIV98]] +// CHECK1-NEXT: [[TMP74:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB100:%.*]] = sub i32 [[TMP74]], -63 +// CHECK1-NEXT: [[DIV101:%.*]] = udiv i32 [[SUB100]], 64 +// CHECK1-NEXT: [[MUL102:%.*]] = mul i32 [[MUL99]], [[DIV101]] +// CHECK1-NEXT: [[TMP75:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP76:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB103:%.*]] = sub i32 [[TMP75]], [[TMP76]] +// CHECK1-NEXT: [[SUB104:%.*]] = sub i32 [[SUB103]], 1 +// CHECK1-NEXT: [[ADD105:%.*]] = add i32 [[SUB104]], 1 +// CHECK1-NEXT: [[DIV106:%.*]] = udiv i32 [[ADD105]], 1 +// CHECK1-NEXT: [[MUL107:%.*]] = mul i32 [[MUL102]], [[DIV106]] +// CHECK1-NEXT: [[CONV108:%.*]] = zext i32 [[MUL107]] to i64 +// CHECK1-NEXT: [[MUL109:%.*]] = mul 
nsw i64 [[DIV94]], [[CONV108]] +// CHECK1-NEXT: [[SUB110:%.*]] = sub nsw i64 [[TMP61]], [[MUL109]] +// CHECK1-NEXT: [[TMP77:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB111:%.*]] = sub i32 [[TMP77]], -63 +// CHECK1-NEXT: [[DIV112:%.*]] = udiv i32 [[SUB111]], 64 +// CHECK1-NEXT: [[MUL113:%.*]] = mul i32 1, [[DIV112]] +// CHECK1-NEXT: [[TMP78:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP79:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB114:%.*]] = sub i32 [[TMP78]], [[TMP79]] +// CHECK1-NEXT: [[SUB115:%.*]] = sub i32 [[SUB114]], 1 +// CHECK1-NEXT: [[ADD116:%.*]] = add i32 [[SUB115]], 1 +// CHECK1-NEXT: [[DIV117:%.*]] = udiv i32 [[ADD116]], 1 +// CHECK1-NEXT: [[MUL118:%.*]] = mul i32 [[MUL113]], [[DIV117]] +// CHECK1-NEXT: [[CONV119:%.*]] = zext i32 [[MUL118]] to i64 +// CHECK1-NEXT: [[DIV120:%.*]] = sdiv i64 [[SUB110]], [[CONV119]] +// CHECK1-NEXT: [[TMP80:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[CONV121:%.*]] = sext i32 [[TMP80]] to i64 +// CHECK1-NEXT: [[MUL122:%.*]] = mul nsw i64 [[DIV120]], [[CONV121]] +// CHECK1-NEXT: [[ADD123:%.*]] = add nsw i64 [[CONV79]], [[MUL122]] +// CHECK1-NEXT: [[CONV124:%.*]] = trunc i64 [[ADD123]] to i32 +// CHECK1-NEXT: store i32 [[CONV124]], ptr [[J50]], align 4 +// CHECK1-NEXT: [[TMP81:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP82:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP83:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP84:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB125:%.*]] = sub i32 [[TMP83]], [[TMP84]] +// CHECK1-NEXT: [[SUB126:%.*]] = sub i32 [[SUB125]], 1 +// CHECK1-NEXT: [[TMP85:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD127:%.*]] = add i32 [[SUB126]], [[TMP85]] +// CHECK1-NEXT: [[TMP86:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV128:%.*]] = udiv i32 [[ADD127]], [[TMP86]] +// CHECK1-NEXT: [[MUL129:%.*]] = mul i32 1, [[DIV128]] +// CHECK1-NEXT: [[TMP87:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB130:%.*]] = sub i32 [[TMP87]], -63 +// CHECK1-NEXT: [[DIV131:%.*]] = udiv i32 [[SUB130]], 64 +// CHECK1-NEXT: [[MUL132:%.*]] = mul i32 [[MUL129]], [[DIV131]] +// CHECK1-NEXT: [[TMP88:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP89:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB133:%.*]] = sub i32 [[TMP88]], [[TMP89]] +// CHECK1-NEXT: [[SUB134:%.*]] = sub i32 [[SUB133]], 1 +// CHECK1-NEXT: [[ADD135:%.*]] = add i32 [[SUB134]], 1 +// CHECK1-NEXT: [[DIV136:%.*]] = udiv i32 [[ADD135]], 1 +// CHECK1-NEXT: [[MUL137:%.*]] = mul i32 [[MUL132]], [[DIV136]] +// CHECK1-NEXT: [[CONV138:%.*]] = zext i32 [[MUL137]] to i64 +// CHECK1-NEXT: [[DIV139:%.*]] = sdiv i64 [[TMP82]], [[CONV138]] +// CHECK1-NEXT: [[TMP90:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP91:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB140:%.*]] = sub i32 [[TMP90]], [[TMP91]] +// CHECK1-NEXT: [[SUB141:%.*]] = sub i32 [[SUB140]], 1 +// CHECK1-NEXT: [[TMP92:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD142:%.*]] = add i32 [[SUB141]], [[TMP92]] +// CHECK1-NEXT: [[TMP93:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV143:%.*]] = udiv i32 [[ADD142]], [[TMP93]] +// CHECK1-NEXT: [[MUL144:%.*]] = mul i32 1, [[DIV143]] +// CHECK1-NEXT: [[TMP94:%.*]] 
= load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB145:%.*]] = sub i32 [[TMP94]], -63 +// CHECK1-NEXT: [[DIV146:%.*]] = udiv i32 [[SUB145]], 64 +// CHECK1-NEXT: [[MUL147:%.*]] = mul i32 [[MUL144]], [[DIV146]] +// CHECK1-NEXT: [[TMP95:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP96:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB148:%.*]] = sub i32 [[TMP95]], [[TMP96]] +// CHECK1-NEXT: [[SUB149:%.*]] = sub i32 [[SUB148]], 1 +// CHECK1-NEXT: [[ADD150:%.*]] = add i32 [[SUB149]], 1 +// CHECK1-NEXT: [[DIV151:%.*]] = udiv i32 [[ADD150]], 1 +// CHECK1-NEXT: [[MUL152:%.*]] = mul i32 [[MUL147]], [[DIV151]] +// CHECK1-NEXT: [[CONV153:%.*]] = zext i32 [[MUL152]] to i64 +// CHECK1-NEXT: [[MUL154:%.*]] = mul nsw i64 [[DIV139]], [[CONV153]] +// CHECK1-NEXT: [[SUB155:%.*]] = sub nsw i64 [[TMP81]], [[MUL154]] +// CHECK1-NEXT: [[TMP97:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP98:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP99:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP100:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB156:%.*]] = sub i32 [[TMP99]], [[TMP100]] +// CHECK1-NEXT: [[SUB157:%.*]] = sub i32 [[SUB156]], 1 +// CHECK1-NEXT: [[TMP101:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD158:%.*]] = add i32 [[SUB157]], [[TMP101]] +// CHECK1-NEXT: [[TMP102:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV159:%.*]] = udiv i32 [[ADD158]], [[TMP102]] +// CHECK1-NEXT: [[MUL160:%.*]] = mul i32 1, [[DIV159]] +// CHECK1-NEXT: [[TMP103:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB161:%.*]] = sub i32 [[TMP103]], -63 +// CHECK1-NEXT: [[DIV162:%.*]] = udiv i32 [[SUB161]], 64 +// CHECK1-NEXT: [[MUL163:%.*]] = mul i32 [[MUL160]], [[DIV162]] +// CHECK1-NEXT: [[TMP104:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP105:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB164:%.*]] = sub i32 [[TMP104]], [[TMP105]] +// CHECK1-NEXT: [[SUB165:%.*]] = sub i32 [[SUB164]], 1 +// CHECK1-NEXT: [[ADD166:%.*]] = add i32 [[SUB165]], 1 +// CHECK1-NEXT: [[DIV167:%.*]] = udiv i32 [[ADD166]], 1 +// CHECK1-NEXT: [[MUL168:%.*]] = mul i32 [[MUL163]], [[DIV167]] +// CHECK1-NEXT: [[CONV169:%.*]] = zext i32 [[MUL168]] to i64 +// CHECK1-NEXT: [[DIV170:%.*]] = sdiv i64 [[TMP98]], [[CONV169]] +// CHECK1-NEXT: [[TMP106:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP107:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB171:%.*]] = sub i32 [[TMP106]], [[TMP107]] +// CHECK1-NEXT: [[SUB172:%.*]] = sub i32 [[SUB171]], 1 +// CHECK1-NEXT: [[TMP108:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD173:%.*]] = add i32 [[SUB172]], [[TMP108]] +// CHECK1-NEXT: [[TMP109:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV174:%.*]] = udiv i32 [[ADD173]], [[TMP109]] +// CHECK1-NEXT: [[MUL175:%.*]] = mul i32 1, [[DIV174]] +// CHECK1-NEXT: [[TMP110:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB176:%.*]] = sub i32 [[TMP110]], -63 +// CHECK1-NEXT: [[DIV177:%.*]] = udiv i32 [[SUB176]], 64 +// CHECK1-NEXT: [[MUL178:%.*]] = mul i32 [[MUL175]], [[DIV177]] +// CHECK1-NEXT: [[TMP111:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP112:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB179:%.*]] = 
sub i32 [[TMP111]], [[TMP112]] +// CHECK1-NEXT: [[SUB180:%.*]] = sub i32 [[SUB179]], 1 +// CHECK1-NEXT: [[ADD181:%.*]] = add i32 [[SUB180]], 1 +// CHECK1-NEXT: [[DIV182:%.*]] = udiv i32 [[ADD181]], 1 +// CHECK1-NEXT: [[MUL183:%.*]] = mul i32 [[MUL178]], [[DIV182]] +// CHECK1-NEXT: [[CONV184:%.*]] = zext i32 [[MUL183]] to i64 +// CHECK1-NEXT: [[MUL185:%.*]] = mul nsw i64 [[DIV170]], [[CONV184]] +// CHECK1-NEXT: [[SUB186:%.*]] = sub nsw i64 [[TMP97]], [[MUL185]] +// CHECK1-NEXT: [[TMP113:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB187:%.*]] = sub i32 [[TMP113]], -63 +// CHECK1-NEXT: [[DIV188:%.*]] = udiv i32 [[SUB187]], 64 +// CHECK1-NEXT: [[MUL189:%.*]] = mul i32 1, [[DIV188]] +// CHECK1-NEXT: [[TMP114:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP115:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB190:%.*]] = sub i32 [[TMP114]], [[TMP115]] +// CHECK1-NEXT: [[SUB191:%.*]] = sub i32 [[SUB190]], 1 +// CHECK1-NEXT: [[ADD192:%.*]] = add i32 [[SUB191]], 1 +// CHECK1-NEXT: [[DIV193:%.*]] = udiv i32 [[ADD192]], 1 +// CHECK1-NEXT: [[MUL194:%.*]] = mul i32 [[MUL189]], [[DIV193]] +// CHECK1-NEXT: [[CONV195:%.*]] = zext i32 [[MUL194]] to i64 +// CHECK1-NEXT: [[DIV196:%.*]] = sdiv i64 [[SUB186]], [[CONV195]] +// CHECK1-NEXT: [[TMP116:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB197:%.*]] = sub i32 [[TMP116]], -63 +// CHECK1-NEXT: [[DIV198:%.*]] = udiv i32 [[SUB197]], 64 +// CHECK1-NEXT: [[MUL199:%.*]] = mul i32 1, [[DIV198]] +// CHECK1-NEXT: [[TMP117:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP118:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB200:%.*]] = sub i32 [[TMP117]], [[TMP118]] +// CHECK1-NEXT: [[SUB201:%.*]] = sub i32 [[SUB200]], 1 +// CHECK1-NEXT: [[ADD202:%.*]] = add i32 [[SUB201]], 1 +// CHECK1-NEXT: [[DIV203:%.*]] = udiv i32 [[ADD202]], 1 +// CHECK1-NEXT: [[MUL204:%.*]] = mul i32 [[MUL199]], [[DIV203]] +// CHECK1-NEXT: [[CONV205:%.*]] = zext i32 [[MUL204]] to i64 +// CHECK1-NEXT: [[MUL206:%.*]] = mul nsw i64 [[DIV196]], [[CONV205]] +// CHECK1-NEXT: [[SUB207:%.*]] = sub nsw i64 [[SUB155]], [[MUL206]] +// CHECK1-NEXT: [[TMP119:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP120:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB208:%.*]] = sub i32 [[TMP119]], [[TMP120]] +// CHECK1-NEXT: [[SUB209:%.*]] = sub i32 [[SUB208]], 1 +// CHECK1-NEXT: [[ADD210:%.*]] = add i32 [[SUB209]], 1 +// CHECK1-NEXT: [[DIV211:%.*]] = udiv i32 [[ADD210]], 1 +// CHECK1-NEXT: [[MUL212:%.*]] = mul i32 1, [[DIV211]] +// CHECK1-NEXT: [[CONV213:%.*]] = zext i32 [[MUL212]] to i64 +// CHECK1-NEXT: [[DIV214:%.*]] = sdiv i64 [[SUB207]], [[CONV213]] +// CHECK1-NEXT: [[MUL215:%.*]] = mul nsw i64 [[DIV214]], 64 +// CHECK1-NEXT: [[ADD216:%.*]] = add nsw i64 0, [[MUL215]] +// CHECK1-NEXT: [[CONV217:%.*]] = trunc i64 [[ADD216]] to i32 +// CHECK1-NEXT: store i32 [[CONV217]], ptr [[DOTFLOOR_0_IV_K51]], align 4 +// CHECK1-NEXT: [[TMP121:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[CONV218:%.*]] = zext i32 [[TMP121]] to i64 +// CHECK1-NEXT: [[TMP122:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP123:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP124:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP125:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB219:%.*]] = sub i32 [[TMP124]], 
[[TMP125]] +// CHECK1-NEXT: [[SUB220:%.*]] = sub i32 [[SUB219]], 1 +// CHECK1-NEXT: [[TMP126:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD221:%.*]] = add i32 [[SUB220]], [[TMP126]] +// CHECK1-NEXT: [[TMP127:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV222:%.*]] = udiv i32 [[ADD221]], [[TMP127]] +// CHECK1-NEXT: [[MUL223:%.*]] = mul i32 1, [[DIV222]] +// CHECK1-NEXT: [[TMP128:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB224:%.*]] = sub i32 [[TMP128]], -63 +// CHECK1-NEXT: [[DIV225:%.*]] = udiv i32 [[SUB224]], 64 +// CHECK1-NEXT: [[MUL226:%.*]] = mul i32 [[MUL223]], [[DIV225]] +// CHECK1-NEXT: [[TMP129:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP130:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB227:%.*]] = sub i32 [[TMP129]], [[TMP130]] +// CHECK1-NEXT: [[SUB228:%.*]] = sub i32 [[SUB227]], 1 +// CHECK1-NEXT: [[ADD229:%.*]] = add i32 [[SUB228]], 1 +// CHECK1-NEXT: [[DIV230:%.*]] = udiv i32 [[ADD229]], 1 +// CHECK1-NEXT: [[MUL231:%.*]] = mul i32 [[MUL226]], [[DIV230]] +// CHECK1-NEXT: [[CONV232:%.*]] = zext i32 [[MUL231]] to i64 +// CHECK1-NEXT: [[DIV233:%.*]] = sdiv i64 [[TMP123]], [[CONV232]] +// CHECK1-NEXT: [[TMP131:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP132:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB234:%.*]] = sub i32 [[TMP131]], [[TMP132]] +// CHECK1-NEXT: [[SUB235:%.*]] = sub i32 [[SUB234]], 1 +// CHECK1-NEXT: [[TMP133:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD236:%.*]] = add i32 [[SUB235]], [[TMP133]] +// CHECK1-NEXT: [[TMP134:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV237:%.*]] = udiv i32 [[ADD236]], [[TMP134]] +// CHECK1-NEXT: [[MUL238:%.*]] = mul i32 1, [[DIV237]] +// CHECK1-NEXT: [[TMP135:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB239:%.*]] = sub i32 [[TMP135]], -63 +// CHECK1-NEXT: [[DIV240:%.*]] = udiv i32 [[SUB239]], 64 +// CHECK1-NEXT: [[MUL241:%.*]] = mul i32 [[MUL238]], [[DIV240]] +// CHECK1-NEXT: [[TMP136:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP137:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB242:%.*]] = sub i32 [[TMP136]], [[TMP137]] +// CHECK1-NEXT: [[SUB243:%.*]] = sub i32 [[SUB242]], 1 +// CHECK1-NEXT: [[ADD244:%.*]] = add i32 [[SUB243]], 1 +// CHECK1-NEXT: [[DIV245:%.*]] = udiv i32 [[ADD244]], 1 +// CHECK1-NEXT: [[MUL246:%.*]] = mul i32 [[MUL241]], [[DIV245]] +// CHECK1-NEXT: [[CONV247:%.*]] = zext i32 [[MUL246]] to i64 +// CHECK1-NEXT: [[MUL248:%.*]] = mul nsw i64 [[DIV233]], [[CONV247]] +// CHECK1-NEXT: [[SUB249:%.*]] = sub nsw i64 [[TMP122]], [[MUL248]] +// CHECK1-NEXT: [[TMP138:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP139:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP140:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP141:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB250:%.*]] = sub i32 [[TMP140]], [[TMP141]] +// CHECK1-NEXT: [[SUB251:%.*]] = sub i32 [[SUB250]], 1 +// CHECK1-NEXT: [[TMP142:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD252:%.*]] = add i32 [[SUB251]], [[TMP142]] +// CHECK1-NEXT: [[TMP143:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV253:%.*]] = udiv i32 [[ADD252]], [[TMP143]] +// CHECK1-NEXT: [[MUL254:%.*]] = mul i32 1, [[DIV253]] +// 
CHECK1-NEXT: [[TMP144:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB255:%.*]] = sub i32 [[TMP144]], -63 +// CHECK1-NEXT: [[DIV256:%.*]] = udiv i32 [[SUB255]], 64 +// CHECK1-NEXT: [[MUL257:%.*]] = mul i32 [[MUL254]], [[DIV256]] +// CHECK1-NEXT: [[TMP145:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP146:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB258:%.*]] = sub i32 [[TMP145]], [[TMP146]] +// CHECK1-NEXT: [[SUB259:%.*]] = sub i32 [[SUB258]], 1 +// CHECK1-NEXT: [[ADD260:%.*]] = add i32 [[SUB259]], 1 +// CHECK1-NEXT: [[DIV261:%.*]] = udiv i32 [[ADD260]], 1 +// CHECK1-NEXT: [[MUL262:%.*]] = mul i32 [[MUL257]], [[DIV261]] +// CHECK1-NEXT: [[CONV263:%.*]] = zext i32 [[MUL262]] to i64 +// CHECK1-NEXT: [[DIV264:%.*]] = sdiv i64 [[TMP139]], [[CONV263]] +// CHECK1-NEXT: [[TMP147:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP148:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB265:%.*]] = sub i32 [[TMP147]], [[TMP148]] +// CHECK1-NEXT: [[SUB266:%.*]] = sub i32 [[SUB265]], 1 +// CHECK1-NEXT: [[TMP149:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD267:%.*]] = add i32 [[SUB266]], [[TMP149]] +// CHECK1-NEXT: [[TMP150:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV268:%.*]] = udiv i32 [[ADD267]], [[TMP150]] +// CHECK1-NEXT: [[MUL269:%.*]] = mul i32 1, [[DIV268]] +// CHECK1-NEXT: [[TMP151:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB270:%.*]] = sub i32 [[TMP151]], -63 +// CHECK1-NEXT: [[DIV271:%.*]] = udiv i32 [[SUB270]], 64 +// CHECK1-NEXT: [[MUL272:%.*]] = mul i32 [[MUL269]], [[DIV271]] +// CHECK1-NEXT: [[TMP152:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP153:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB273:%.*]] = sub i32 [[TMP152]], [[TMP153]] +// CHECK1-NEXT: [[SUB274:%.*]] = sub i32 [[SUB273]], 1 +// CHECK1-NEXT: [[ADD275:%.*]] = add i32 [[SUB274]], 1 +// CHECK1-NEXT: [[DIV276:%.*]] = udiv i32 [[ADD275]], 1 +// CHECK1-NEXT: [[MUL277:%.*]] = mul i32 [[MUL272]], [[DIV276]] +// CHECK1-NEXT: [[CONV278:%.*]] = zext i32 [[MUL277]] to i64 +// CHECK1-NEXT: [[MUL279:%.*]] = mul nsw i64 [[DIV264]], [[CONV278]] +// CHECK1-NEXT: [[SUB280:%.*]] = sub nsw i64 [[TMP138]], [[MUL279]] +// CHECK1-NEXT: [[TMP154:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB281:%.*]] = sub i32 [[TMP154]], -63 +// CHECK1-NEXT: [[DIV282:%.*]] = udiv i32 [[SUB281]], 64 +// CHECK1-NEXT: [[MUL283:%.*]] = mul i32 1, [[DIV282]] +// CHECK1-NEXT: [[TMP155:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP156:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB284:%.*]] = sub i32 [[TMP155]], [[TMP156]] +// CHECK1-NEXT: [[SUB285:%.*]] = sub i32 [[SUB284]], 1 +// CHECK1-NEXT: [[ADD286:%.*]] = add i32 [[SUB285]], 1 +// CHECK1-NEXT: [[DIV287:%.*]] = udiv i32 [[ADD286]], 1 +// CHECK1-NEXT: [[MUL288:%.*]] = mul i32 [[MUL283]], [[DIV287]] +// CHECK1-NEXT: [[CONV289:%.*]] = zext i32 [[MUL288]] to i64 +// CHECK1-NEXT: [[DIV290:%.*]] = sdiv i64 [[SUB280]], [[CONV289]] +// CHECK1-NEXT: [[TMP157:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB291:%.*]] = sub i32 [[TMP157]], -63 +// CHECK1-NEXT: [[DIV292:%.*]] = udiv i32 [[SUB291]], 64 +// CHECK1-NEXT: [[MUL293:%.*]] = mul i32 1, [[DIV292]] +// CHECK1-NEXT: [[TMP158:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 
+// CHECK1-NEXT: [[TMP159:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB294:%.*]] = sub i32 [[TMP158]], [[TMP159]] +// CHECK1-NEXT: [[SUB295:%.*]] = sub i32 [[SUB294]], 1 +// CHECK1-NEXT: [[ADD296:%.*]] = add i32 [[SUB295]], 1 +// CHECK1-NEXT: [[DIV297:%.*]] = udiv i32 [[ADD296]], 1 +// CHECK1-NEXT: [[MUL298:%.*]] = mul i32 [[MUL293]], [[DIV297]] +// CHECK1-NEXT: [[CONV299:%.*]] = zext i32 [[MUL298]] to i64 +// CHECK1-NEXT: [[MUL300:%.*]] = mul nsw i64 [[DIV290]], [[CONV299]] +// CHECK1-NEXT: [[SUB301:%.*]] = sub nsw i64 [[SUB249]], [[MUL300]] +// CHECK1-NEXT: [[TMP160:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP161:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP162:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP163:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB302:%.*]] = sub i32 [[TMP162]], [[TMP163]] +// CHECK1-NEXT: [[SUB303:%.*]] = sub i32 [[SUB302]], 1 +// CHECK1-NEXT: [[TMP164:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD304:%.*]] = add i32 [[SUB303]], [[TMP164]] +// CHECK1-NEXT: [[TMP165:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV305:%.*]] = udiv i32 [[ADD304]], [[TMP165]] +// CHECK1-NEXT: [[MUL306:%.*]] = mul i32 1, [[DIV305]] +// CHECK1-NEXT: [[TMP166:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB307:%.*]] = sub i32 [[TMP166]], -63 +// CHECK1-NEXT: [[DIV308:%.*]] = udiv i32 [[SUB307]], 64 +// CHECK1-NEXT: [[MUL309:%.*]] = mul i32 [[MUL306]], [[DIV308]] +// CHECK1-NEXT: [[TMP167:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP168:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB310:%.*]] = sub i32 [[TMP167]], [[TMP168]] +// CHECK1-NEXT: [[SUB311:%.*]] = sub i32 [[SUB310]], 1 +// CHECK1-NEXT: [[ADD312:%.*]] = add i32 [[SUB311]], 1 +// CHECK1-NEXT: [[DIV313:%.*]] = udiv i32 [[ADD312]], 1 +// CHECK1-NEXT: [[MUL314:%.*]] = mul i32 [[MUL309]], [[DIV313]] +// CHECK1-NEXT: [[CONV315:%.*]] = zext i32 [[MUL314]] to i64 +// CHECK1-NEXT: [[DIV316:%.*]] = sdiv i64 [[TMP161]], [[CONV315]] +// CHECK1-NEXT: [[TMP169:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP170:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB317:%.*]] = sub i32 [[TMP169]], [[TMP170]] +// CHECK1-NEXT: [[SUB318:%.*]] = sub i32 [[SUB317]], 1 +// CHECK1-NEXT: [[TMP171:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD319:%.*]] = add i32 [[SUB318]], [[TMP171]] +// CHECK1-NEXT: [[TMP172:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV320:%.*]] = udiv i32 [[ADD319]], [[TMP172]] +// CHECK1-NEXT: [[MUL321:%.*]] = mul i32 1, [[DIV320]] +// CHECK1-NEXT: [[TMP173:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB322:%.*]] = sub i32 [[TMP173]], -63 +// CHECK1-NEXT: [[DIV323:%.*]] = udiv i32 [[SUB322]], 64 +// CHECK1-NEXT: [[MUL324:%.*]] = mul i32 [[MUL321]], [[DIV323]] +// CHECK1-NEXT: [[TMP174:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP175:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB325:%.*]] = sub i32 [[TMP174]], [[TMP175]] +// CHECK1-NEXT: [[SUB326:%.*]] = sub i32 [[SUB325]], 1 +// CHECK1-NEXT: [[ADD327:%.*]] = add i32 [[SUB326]], 1 +// CHECK1-NEXT: [[DIV328:%.*]] = udiv i32 [[ADD327]], 1 +// CHECK1-NEXT: [[MUL329:%.*]] = mul i32 [[MUL324]], [[DIV328]] +// CHECK1-NEXT: [[CONV330:%.*]] = 
zext i32 [[MUL329]] to i64 +// CHECK1-NEXT: [[MUL331:%.*]] = mul nsw i64 [[DIV316]], [[CONV330]] +// CHECK1-NEXT: [[SUB332:%.*]] = sub nsw i64 [[TMP160]], [[MUL331]] +// CHECK1-NEXT: [[TMP176:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP177:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP178:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP179:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB333:%.*]] = sub i32 [[TMP178]], [[TMP179]] +// CHECK1-NEXT: [[SUB334:%.*]] = sub i32 [[SUB333]], 1 +// CHECK1-NEXT: [[TMP180:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD335:%.*]] = add i32 [[SUB334]], [[TMP180]] +// CHECK1-NEXT: [[TMP181:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV336:%.*]] = udiv i32 [[ADD335]], [[TMP181]] +// CHECK1-NEXT: [[MUL337:%.*]] = mul i32 1, [[DIV336]] +// CHECK1-NEXT: [[TMP182:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB338:%.*]] = sub i32 [[TMP182]], -63 +// CHECK1-NEXT: [[DIV339:%.*]] = udiv i32 [[SUB338]], 64 +// CHECK1-NEXT: [[MUL340:%.*]] = mul i32 [[MUL337]], [[DIV339]] +// CHECK1-NEXT: [[TMP183:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP184:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB341:%.*]] = sub i32 [[TMP183]], [[TMP184]] +// CHECK1-NEXT: [[SUB342:%.*]] = sub i32 [[SUB341]], 1 +// CHECK1-NEXT: [[ADD343:%.*]] = add i32 [[SUB342]], 1 +// CHECK1-NEXT: [[DIV344:%.*]] = udiv i32 [[ADD343]], 1 +// CHECK1-NEXT: [[MUL345:%.*]] = mul i32 [[MUL340]], [[DIV344]] +// CHECK1-NEXT: [[CONV346:%.*]] = zext i32 [[MUL345]] to i64 +// CHECK1-NEXT: [[DIV347:%.*]] = sdiv i64 [[TMP177]], [[CONV346]] +// CHECK1-NEXT: [[TMP185:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP186:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK1-NEXT: [[SUB348:%.*]] = sub i32 [[TMP185]], [[TMP186]] +// CHECK1-NEXT: [[SUB349:%.*]] = sub i32 [[SUB348]], 1 +// CHECK1-NEXT: [[TMP187:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[ADD350:%.*]] = add i32 [[SUB349]], [[TMP187]] +// CHECK1-NEXT: [[TMP188:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK1-NEXT: [[DIV351:%.*]] = udiv i32 [[ADD350]], [[TMP188]] +// CHECK1-NEXT: [[MUL352:%.*]] = mul i32 1, [[DIV351]] +// CHECK1-NEXT: [[TMP189:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB353:%.*]] = sub i32 [[TMP189]], -63 +// CHECK1-NEXT: [[DIV354:%.*]] = udiv i32 [[SUB353]], 64 +// CHECK1-NEXT: [[MUL355:%.*]] = mul i32 [[MUL352]], [[DIV354]] +// CHECK1-NEXT: [[TMP190:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP191:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB356:%.*]] = sub i32 [[TMP190]], [[TMP191]] +// CHECK1-NEXT: [[SUB357:%.*]] = sub i32 [[SUB356]], 1 +// CHECK1-NEXT: [[ADD358:%.*]] = add i32 [[SUB357]], 1 +// CHECK1-NEXT: [[DIV359:%.*]] = udiv i32 [[ADD358]], 1 +// CHECK1-NEXT: [[MUL360:%.*]] = mul i32 [[MUL355]], [[DIV359]] +// CHECK1-NEXT: [[CONV361:%.*]] = zext i32 [[MUL360]] to i64 +// CHECK1-NEXT: [[MUL362:%.*]] = mul nsw i64 [[DIV347]], [[CONV361]] +// CHECK1-NEXT: [[SUB363:%.*]] = sub nsw i64 [[TMP176]], [[MUL362]] +// CHECK1-NEXT: [[TMP192:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB364:%.*]] = sub i32 [[TMP192]], -63 +// CHECK1-NEXT: [[DIV365:%.*]] = udiv i32 [[SUB364]], 64 +// CHECK1-NEXT: [[MUL366:%.*]] = mul i32 1, 
[[DIV365]] +// CHECK1-NEXT: [[TMP193:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP194:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB367:%.*]] = sub i32 [[TMP193]], [[TMP194]] +// CHECK1-NEXT: [[SUB368:%.*]] = sub i32 [[SUB367]], 1 +// CHECK1-NEXT: [[ADD369:%.*]] = add i32 [[SUB368]], 1 +// CHECK1-NEXT: [[DIV370:%.*]] = udiv i32 [[ADD369]], 1 +// CHECK1-NEXT: [[MUL371:%.*]] = mul i32 [[MUL366]], [[DIV370]] +// CHECK1-NEXT: [[CONV372:%.*]] = zext i32 [[MUL371]] to i64 +// CHECK1-NEXT: [[DIV373:%.*]] = sdiv i64 [[SUB363]], [[CONV372]] +// CHECK1-NEXT: [[TMP195:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK1-NEXT: [[SUB374:%.*]] = sub i32 [[TMP195]], -63 +// CHECK1-NEXT: [[DIV375:%.*]] = udiv i32 [[SUB374]], 64 +// CHECK1-NEXT: [[MUL376:%.*]] = mul i32 1, [[DIV375]] +// CHECK1-NEXT: [[TMP196:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP197:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB377:%.*]] = sub i32 [[TMP196]], [[TMP197]] +// CHECK1-NEXT: [[SUB378:%.*]] = sub i32 [[SUB377]], 1 +// CHECK1-NEXT: [[ADD379:%.*]] = add i32 [[SUB378]], 1 +// CHECK1-NEXT: [[DIV380:%.*]] = udiv i32 [[ADD379]], 1 +// CHECK1-NEXT: [[MUL381:%.*]] = mul i32 [[MUL376]], [[DIV380]] +// CHECK1-NEXT: [[CONV382:%.*]] = zext i32 [[MUL381]] to i64 +// CHECK1-NEXT: [[MUL383:%.*]] = mul nsw i64 [[DIV373]], [[CONV382]] +// CHECK1-NEXT: [[SUB384:%.*]] = sub nsw i64 [[SUB332]], [[MUL383]] +// CHECK1-NEXT: [[TMP198:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP199:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB385:%.*]] = sub i32 [[TMP198]], [[TMP199]] +// CHECK1-NEXT: [[SUB386:%.*]] = sub i32 [[SUB385]], 1 +// CHECK1-NEXT: [[ADD387:%.*]] = add i32 [[SUB386]], 1 +// CHECK1-NEXT: [[DIV388:%.*]] = udiv i32 [[ADD387]], 1 +// CHECK1-NEXT: [[MUL389:%.*]] = mul i32 1, [[DIV388]] +// CHECK1-NEXT: [[CONV390:%.*]] = zext i32 [[MUL389]] to i64 +// CHECK1-NEXT: [[DIV391:%.*]] = sdiv i64 [[SUB384]], [[CONV390]] +// CHECK1-NEXT: [[TMP200:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP201:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK1-NEXT: [[SUB392:%.*]] = sub i32 [[TMP200]], [[TMP201]] +// CHECK1-NEXT: [[SUB393:%.*]] = sub i32 [[SUB392]], 1 +// CHECK1-NEXT: [[ADD394:%.*]] = add i32 [[SUB393]], 1 +// CHECK1-NEXT: [[DIV395:%.*]] = udiv i32 [[ADD394]], 1 +// CHECK1-NEXT: [[MUL396:%.*]] = mul i32 1, [[DIV395]] +// CHECK1-NEXT: [[CONV397:%.*]] = zext i32 [[MUL396]] to i64 +// CHECK1-NEXT: [[MUL398:%.*]] = mul nsw i64 [[DIV391]], [[CONV397]] +// CHECK1-NEXT: [[SUB399:%.*]] = sub nsw i64 [[SUB301]], [[MUL398]] +// CHECK1-NEXT: [[MUL400:%.*]] = mul nsw i64 [[SUB399]], 1 +// CHECK1-NEXT: [[ADD401:%.*]] = add nsw i64 [[CONV218]], [[MUL400]] +// CHECK1-NEXT: [[CONV402:%.*]] = trunc i64 [[ADD401]] to i32 +// CHECK1-NEXT: store i32 [[CONV402]], ptr [[DOTTILE_0_IV_K52]], align 4 +// CHECK1-NEXT: [[TMP202:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_8]], align 4 +// CHECK1-NEXT: [[TMP203:%.*]] = load i32, ptr [[DOTTILE_0_IV_K52]], align 4 +// CHECK1-NEXT: [[TMP204:%.*]] = load i32, ptr [[DOTNEW_STEP10]], align 4 +// CHECK1-NEXT: [[MUL403:%.*]] = mul i32 [[TMP203]], [[TMP204]] +// CHECK1-NEXT: [[ADD404:%.*]] = add i32 [[TMP202]], [[MUL403]] +// CHECK1-NEXT: store i32 [[ADD404]], ptr [[K]], align 4 +// CHECK1-NEXT: [[TMP205:%.*]] = load i32, ptr [[I49]], align 4 +// CHECK1-NEXT: [[TMP206:%.*]] = load i32, ptr [[J50]], 
align 4 +// CHECK1-NEXT: [[TMP207:%.*]] = load i32, ptr [[K]], align 4 +// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP205]], i32 noundef [[TMP206]], i32 noundef [[TMP207]]) +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP208:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[ADD405:%.*]] = add nsw i64 [[TMP208]], 1 +// CHECK1-NEXT: store i64 [[ADD405]], ptr [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]]) +// CHECK1-NEXT: ret void +// +// // CHECK1-LABEL: define {{[^@]+}}@foo9 // CHECK1-SAME: () #[[ATTR0]] { // CHECK1-NEXT: entry: @@ -597,14 +2286,14 @@ extern "C" void foo10() { // CHECK1-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTPERMUTED_1_IV___BEGIN2]], align 8 // CHECK1-NEXT: [[INC:%.*]] = add nsw i64 [[TMP17]], 1 // CHECK1-NEXT: store i64 [[INC]], ptr [[DOTPERMUTED_1_IV___BEGIN2]], align 8 -// CHECK1-NEXT: br label [[FOR_COND7]], !llvm.loop [[LOOP8:![0-9]+]] +// CHECK1-NEXT: br label [[FOR_COND7]], !llvm.loop [[LOOP13:![0-9]+]] // CHECK1: for.end: // CHECK1-NEXT: br label [[FOR_INC13:%.*]] // CHECK1: for.inc13: // CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_I]], align 4 // CHECK1-NEXT: [[INC14:%.*]] = add nsw i32 [[TMP18]], 1 // CHECK1-NEXT: store i32 [[INC14]], ptr [[DOTPERMUTED_0_IV_I]], align 4 -// CHECK1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP9:![0-9]+]] +// CHECK1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] // CHECK1: for.end15: // CHECK1-NEXT: ret void // @@ -1040,6 +2729,38 @@ extern "C" void foo10() { // CHECK2-NEXT: ret void // // +// CHECK2-LABEL: define {{[^@]+}}@foo1 +// CHECK2-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[STEP_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 [[START]], ptr [[START_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[END]], ptr [[END_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[STEP]], ptr [[STEP_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP0]], ptr [[I]], align 4 +// CHECK2-NEXT: br label [[FOR_COND:%.*]] +// CHECK2: for.cond: +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[I]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[END_ADDR]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], [[TMP2]] +// CHECK2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK2: for.body: +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[I]], align 4 +// CHECK2-NEXT: call void (...) 
@body(i32 noundef [[TMP3]]) +// CHECK2-NEXT: br label [[FOR_INC:%.*]] +// CHECK2: for.inc: +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[STEP_ADDR]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP5]], [[TMP4]] +// CHECK2-NEXT: store i32 [[ADD]], ptr [[I]], align 4 +// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] +// CHECK2: for.end: +// CHECK2-NEXT: ret void +// +// // CHECK2-LABEL: define {{[^@]+}}@foo10 // CHECK2-SAME: () #[[ATTR0]] { // CHECK2-NEXT: entry: @@ -1566,14 +3287,14 @@ extern "C" void foo10() { // CHECK2-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_I]], align 4 // CHECK2-NEXT: [[INC:%.*]] = add i32 [[TMP28]], 1 // CHECK2-NEXT: store i32 [[INC]], ptr [[DOTPERMUTED_1_IV_I]], align 4 -// CHECK2-NEXT: br label [[FOR_COND16]], !llvm.loop [[LOOP3:![0-9]+]] +// CHECK2-NEXT: br label [[FOR_COND16]], !llvm.loop [[LOOP5:![0-9]+]] // CHECK2: for.end: // CHECK2-NEXT: br label [[FOR_INC22:%.*]] // CHECK2: for.inc22: // CHECK2-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_J]], align 4 // CHECK2-NEXT: [[INC23:%.*]] = add i32 [[TMP29]], 1 // CHECK2-NEXT: store i32 [[INC23]], ptr [[DOTPERMUTED_0_IV_J]], align 4 -// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]] +// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP6:![0-9]+]] // CHECK2: for.end24: // CHECK2-NEXT: ret void // @@ -1646,7 +3367,7 @@ extern "C" void foo10() { // CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_I]], align 4 // CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP12]], 1 // CHECK2-NEXT: store i32 [[INC]], ptr [[DOTPERMUTED_1_IV_I]], align 4 -// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP6:![0-9]+]] +// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]] // CHECK2: for.end: // CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK2: omp.body.continue: @@ -1743,7 +3464,7 @@ extern "C" void foo10() { // CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_I]], align 4 // CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP14]], 1 // CHECK2-NEXT: store i32 [[INC]], ptr [[DOTPERMUTED_1_IV_I]], align 4 -// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]] +// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP8:![0-9]+]] // CHECK2: for.end: // CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK2: omp.body.continue: @@ -1761,6 +3482,96 @@ extern "C" void foo10() { // CHECK2-NEXT: ret void // // +// CHECK2-LABEL: define {{[^@]+}}@foo5 +// CHECK2-SAME: () #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[_TMP2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]]) +// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 63, ptr [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: call void 
@__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], 63 +// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 63, [[COND_TRUE]] ], [ [[TMP2]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP3:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]] +// CHECK2-NEXT: br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP6]], 16 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 3 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 7, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], ptr [[I]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[DIV4:%.*]] = sdiv i32 [[TMP8]], 16 +// CHECK2-NEXT: [[MUL5:%.*]] = mul nsw i32 [[DIV4]], 16 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], [[MUL5]] +// CHECK2-NEXT: [[DIV6:%.*]] = sdiv i32 [[SUB]], 4 +// CHECK2-NEXT: [[MUL7:%.*]] = mul nsw i32 [[DIV6]], 3 +// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 7, [[MUL7]] +// CHECK2-NEXT: store i32 [[ADD8]], ptr [[J]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[DIV9:%.*]] = sdiv i32 [[TMP10]], 16 +// CHECK2-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 16 +// CHECK2-NEXT: [[SUB11:%.*]] = sub nsw i32 [[TMP9]], [[MUL10]] +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[DIV12:%.*]] = sdiv i32 [[TMP12]], 16 +// CHECK2-NEXT: [[MUL13:%.*]] = mul nsw i32 [[DIV12]], 16 +// CHECK2-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP11]], [[MUL13]] +// CHECK2-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 4 +// CHECK2-NEXT: [[MUL16:%.*]] = mul nsw i32 [[DIV15]], 4 +// CHECK2-NEXT: [[SUB17:%.*]] = sub nsw i32 [[SUB11]], [[MUL16]] +// CHECK2-NEXT: [[MUL18:%.*]] = mul nsw i32 [[SUB17]], 3 +// CHECK2-NEXT: [[ADD19:%.*]] = add nsw i32 7, [[MUL18]] +// CHECK2-NEXT: store i32 [[ADD19]], ptr [[K]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[J]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[K]], align 4 +// CHECK2-NEXT: call void (...) 
@body(i32 noundef [[TMP13]], i32 noundef [[TMP14]], i32 noundef [[TMP15]]) +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[ADD20:%.*]] = add nsw i32 [[TMP16]], 1 +// CHECK2-NEXT: store i32 [[ADD20]], ptr [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]]) +// CHECK2-NEXT: ret void +// +// // CHECK2-LABEL: define {{[^@]+}}@foo6 // CHECK2-SAME: () #[[ATTR0]] { // CHECK2-NEXT: entry: @@ -1899,6 +3710,101 @@ extern "C" void foo10() { // CHECK2-NEXT: ret void // // +// CHECK2-LABEL: define {{[^@]+}}@foo7 +// CHECK2-SAME: () #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[L:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTPERMUTED_0_IV_J:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTPERMUTED_1_IV_K:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTPERMUTED_2_IV_L:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTPERMUTED_3_IV_I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 7, ptr [[I]], align 4 +// CHECK2-NEXT: store i32 7, ptr [[J]], align 4 +// CHECK2-NEXT: store i32 7, ptr [[K]], align 4 +// CHECK2-NEXT: store i32 7, ptr [[L]], align 4 +// CHECK2-NEXT: store i32 0, ptr [[DOTPERMUTED_0_IV_J]], align 4 +// CHECK2-NEXT: br label [[FOR_COND:%.*]] +// CHECK2: for.cond: +// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_J]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 4 +// CHECK2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END24:%.*]] +// CHECK2: for.body: +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_J]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP1]], 3 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 7, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], ptr [[J]], align 4 +// CHECK2-NEXT: store i32 0, ptr [[DOTPERMUTED_1_IV_K]], align 4 +// CHECK2-NEXT: br label [[FOR_COND1:%.*]] +// CHECK2: for.cond1: +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_K]], align 4 +// CHECK2-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP2]], 4 +// CHECK2-NEXT: br i1 [[CMP2]], label [[FOR_BODY3:%.*]], label [[FOR_END21:%.*]] +// CHECK2: for.body3: +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_K]], align 4 +// CHECK2-NEXT: [[MUL4:%.*]] = mul nsw i32 [[TMP3]], 3 +// CHECK2-NEXT: [[ADD5:%.*]] = add nsw i32 7, [[MUL4]] +// CHECK2-NEXT: store i32 [[ADD5]], ptr [[K]], align 4 +// CHECK2-NEXT: store i32 0, ptr [[DOTPERMUTED_2_IV_L]], align 4 +// CHECK2-NEXT: br label [[FOR_COND6:%.*]] +// CHECK2: for.cond6: +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTPERMUTED_2_IV_L]], align 4 +// CHECK2-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP4]], 4 +// CHECK2-NEXT: br i1 [[CMP7]], label [[FOR_BODY8:%.*]], label [[FOR_END18:%.*]] +// CHECK2: for.body8: +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTPERMUTED_2_IV_L]], align 4 +// CHECK2-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP5]], 3 +// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 7, [[MUL9]] +// CHECK2-NEXT: store i32 [[ADD10]], ptr [[L]], align 4 
+// CHECK2-NEXT: store i32 0, ptr [[DOTPERMUTED_3_IV_I]], align 4 +// CHECK2-NEXT: br label [[FOR_COND11:%.*]] +// CHECK2: for.cond11: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTPERMUTED_3_IV_I]], align 4 +// CHECK2-NEXT: [[CMP12:%.*]] = icmp slt i32 [[TMP6]], 4 +// CHECK2-NEXT: br i1 [[CMP12]], label [[FOR_BODY13:%.*]], label [[FOR_END:%.*]] +// CHECK2: for.body13: +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTPERMUTED_3_IV_I]], align 4 +// CHECK2-NEXT: [[MUL14:%.*]] = mul nsw i32 [[TMP7]], 3 +// CHECK2-NEXT: [[ADD15:%.*]] = add nsw i32 7, [[MUL14]] +// CHECK2-NEXT: store i32 [[ADD15]], ptr [[I]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[I]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[J]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[K]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[L]], align 4 +// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP8]], i32 noundef [[TMP9]], i32 noundef [[TMP10]], i32 noundef [[TMP11]]) +// CHECK2-NEXT: br label [[FOR_INC:%.*]] +// CHECK2: for.inc: +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTPERMUTED_3_IV_I]], align 4 +// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP12]], 1 +// CHECK2-NEXT: store i32 [[INC]], ptr [[DOTPERMUTED_3_IV_I]], align 4 +// CHECK2-NEXT: br label [[FOR_COND11]], !llvm.loop [[LOOP9:![0-9]+]] +// CHECK2: for.end: +// CHECK2-NEXT: br label [[FOR_INC16:%.*]] +// CHECK2: for.inc16: +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTPERMUTED_2_IV_L]], align 4 +// CHECK2-NEXT: [[INC17:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK2-NEXT: store i32 [[INC17]], ptr [[DOTPERMUTED_2_IV_L]], align 4 +// CHECK2-NEXT: br label [[FOR_COND6]], !llvm.loop [[LOOP10:![0-9]+]] +// CHECK2: for.end18: +// CHECK2-NEXT: br label [[FOR_INC19:%.*]] +// CHECK2: for.inc19: +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_K]], align 4 +// CHECK2-NEXT: [[INC20:%.*]] = add nsw i32 [[TMP14]], 1 +// CHECK2-NEXT: store i32 [[INC20]], ptr [[DOTPERMUTED_1_IV_K]], align 4 +// CHECK2-NEXT: br label [[FOR_COND1]], !llvm.loop [[LOOP11:![0-9]+]] +// CHECK2: for.end21: +// CHECK2-NEXT: br label [[FOR_INC22:%.*]] +// CHECK2: for.inc22: +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_J]], align 4 +// CHECK2-NEXT: [[INC23:%.*]] = add nsw i32 [[TMP15]], 1 +// CHECK2-NEXT: store i32 [[INC23]], ptr [[DOTPERMUTED_0_IV_J]], align 4 +// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] +// CHECK2: for.end24: +// CHECK2-NEXT: ret void +// +// // CHECK2-LABEL: define {{[^@]+}}@foo9 // CHECK2-SAME: () #[[ATTR0]] { // CHECK2-NEXT: entry: @@ -1977,14 +3883,1442 @@ extern "C" void foo10() { // CHECK2-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTPERMUTED_1_IV___BEGIN2]], align 8 // CHECK2-NEXT: [[INC:%.*]] = add nsw i64 [[TMP17]], 1 // CHECK2-NEXT: store i64 [[INC]], ptr [[DOTPERMUTED_1_IV___BEGIN2]], align 8 -// CHECK2-NEXT: br label [[FOR_COND7]], !llvm.loop [[LOOP8:![0-9]+]] +// CHECK2-NEXT: br label [[FOR_COND7]], !llvm.loop [[LOOP13:![0-9]+]] // CHECK2: for.end: // CHECK2-NEXT: br label [[FOR_INC13:%.*]] // CHECK2: for.inc13: // CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_I]], align 4 // CHECK2-NEXT: [[INC14:%.*]] = add nsw i32 [[TMP18]], 1 // CHECK2-NEXT: store i32 [[INC14]], ptr [[DOTPERMUTED_0_IV_I]], align 4 -// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP9:![0-9]+]] +// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] // CHECK2: for.end15: // CHECK2-NEXT: ret void // +// +// CHECK2-LABEL: define {{[^@]+}}@tfoo8 +// 
CHECK2-SAME: () #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: call void @_Z4foo8ILi32EEviii(i32 noundef 0, i32 noundef 42, i32 noundef 1) +// CHECK2-NEXT: call void @_Z4foo8ILi64EEviii(i32 noundef 0, i32 noundef 42, i32 noundef 3) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@_Z4foo8ILi32EEviii +// CHECK2-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR0]] comdat { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[STEP_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[_TMP2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[_TMP3:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTNEW_STEP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_5:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTNEW_STEP7:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_8:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_9:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTNEW_STEP10:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_11:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_14:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_16:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_17:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_22:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTFLOOR_0_IV_K:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTILE_0_IV_K:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I49:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[J50:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTFLOOR_0_IV_K51:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTILE_0_IV_K52:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]]) +// CHECK2-NEXT: store i32 [[START]], ptr [[START_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[END]], ptr [[END_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[STEP]], ptr [[STEP_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[END_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_4]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[STEP_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[END_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = load 
i32, ptr [[STEP_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP6]], ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP7]], ptr [[K]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP8]], ptr [[DOTCAPTURE_EXPR_8]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[END_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP9]], ptr [[DOTCAPTURE_EXPR_9]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[STEP_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP10]], ptr [[DOTNEW_STEP10]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_9]], align 4 +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_8]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub i32 [[TMP11]], [[TMP12]] +// CHECK2-NEXT: [[SUB12:%.*]] = sub i32 [[SUB]], 1 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTNEW_STEP10]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add i32 [[SUB12]], [[TMP13]] +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTNEW_STEP10]], align 4 +// CHECK2-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP14]] +// CHECK2-NEXT: [[SUB13:%.*]] = sub i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB13]], ptr [[DOTCAPTURE_EXPR_11]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_11]], align 4 +// CHECK2-NEXT: [[ADD15:%.*]] = add i32 [[TMP15]], 1 +// CHECK2-NEXT: store i32 [[ADD15]], ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[_TMP2]], align 4 +// CHECK2-NEXT: store i32 [[TMP16]], ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_11]], align 4 +// CHECK2-NEXT: [[ADD18:%.*]] = add i32 [[TMP17]], 1 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[_TMP2]], align 4 +// CHECK2-NEXT: [[ADD19:%.*]] = add i32 [[TMP18]], 32 +// CHECK2-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD18]], [[ADD19]] +// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_11]], align 4 +// CHECK2-NEXT: [[ADD20:%.*]] = add i32 [[TMP19]], 1 +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[_TMP2]], align 4 +// CHECK2-NEXT: [[ADD21:%.*]] = add i32 [[TMP20]], 32 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[ADD20]], [[COND_TRUE]] ], [ [[ADD21]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4 +// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB23:%.*]] = sub i32 [[TMP21]], [[TMP22]] +// CHECK2-NEXT: [[SUB24:%.*]] = sub i32 [[SUB23]], 1 +// CHECK2-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4 +// CHECK2-NEXT: [[ADD25:%.*]] = add i32 [[SUB24]], [[TMP23]] +// CHECK2-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4 +// CHECK2-NEXT: [[DIV26:%.*]] = udiv i32 [[ADD25]], [[TMP24]] +// CHECK2-NEXT: [[CONV:%.*]] = zext i32 [[DIV26]] to i64 +// CHECK2-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB27:%.*]] = sub i32 [[TMP25]], [[TMP26]] +// CHECK2-NEXT: [[SUB28:%.*]] = sub i32 [[SUB27]], 1 +// CHECK2-NEXT: [[TMP27:%.*]] = load i32, ptr 
[[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD29:%.*]] = add i32 [[SUB28]], [[TMP27]] +// CHECK2-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV30:%.*]] = udiv i32 [[ADD29]], [[TMP28]] +// CHECK2-NEXT: [[CONV31:%.*]] = zext i32 [[DIV30]] to i64 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV31]] +// CHECK2-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB32:%.*]] = sub i32 [[TMP29]], -31 +// CHECK2-NEXT: [[DIV33:%.*]] = udiv i32 [[SUB32]], 32 +// CHECK2-NEXT: [[CONV34:%.*]] = zext i32 [[DIV33]] to i64 +// CHECK2-NEXT: [[MUL35:%.*]] = mul nsw i64 [[MUL]], [[CONV34]] +// CHECK2-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB36:%.*]] = sub i32 [[TMP30]], [[TMP31]] +// CHECK2-NEXT: [[SUB37:%.*]] = sub i32 [[SUB36]], 1 +// CHECK2-NEXT: [[ADD38:%.*]] = add i32 [[SUB37]], 1 +// CHECK2-NEXT: [[DIV39:%.*]] = udiv i32 [[ADD38]], 1 +// CHECK2-NEXT: [[CONV40:%.*]] = zext i32 [[DIV39]] to i64 +// CHECK2-NEXT: [[MUL41:%.*]] = mul nsw i64 [[MUL35]], [[CONV40]] +// CHECK2-NEXT: [[SUB42:%.*]] = sub nsw i64 [[MUL41]], 1 +// CHECK2-NEXT: store i64 [[SUB42]], ptr [[DOTCAPTURE_EXPR_22]], align 8 +// CHECK2-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: store i32 [[TMP32]], ptr [[I]], align 4 +// CHECK2-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: store i32 [[TMP33]], ptr [[J]], align 4 +// CHECK2-NEXT: store i32 0, ptr [[DOTFLOOR_0_IV_K]], align 4 +// CHECK2-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: store i32 [[TMP34]], ptr [[DOTTILE_0_IV_K]], align 4 +// CHECK2-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4 +// CHECK2-NEXT: [[CMP43:%.*]] = icmp slt i32 [[TMP35]], [[TMP36]] +// CHECK2-NEXT: br i1 [[CMP43]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: land.lhs.true: +// CHECK2-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[CMP44:%.*]] = icmp slt i32 [[TMP37]], [[TMP38]] +// CHECK2-NEXT: br i1 [[CMP44]], label [[LAND_LHS_TRUE45:%.*]], label [[OMP_PRECOND_END]] +// CHECK2: land.lhs.true45: +// CHECK2-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[CMP46:%.*]] = icmp ult i32 0, [[TMP39]] +// CHECK2-NEXT: br i1 [[CMP46]], label [[LAND_LHS_TRUE47:%.*]], label [[OMP_PRECOND_END]] +// CHECK2: land.lhs.true47: +// CHECK2-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[CMP48:%.*]] = icmp ult i32 [[TMP40]], [[TMP41]] +// CHECK2-NEXT: br i1 [[CMP48]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i64 0, ptr [[DOTOMP_LB]], align 8 +// CHECK2-NEXT: [[TMP42:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_22]], align 8 +// CHECK2-NEXT: store i64 [[TMP42]], ptr [[DOTOMP_UB]], align 8 +// CHECK2-NEXT: store i64 1, ptr [[DOTOMP_STRIDE]], align 8 +// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_8(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], 
ptr [[DOTOMP_STRIDE]], i64 1, i64 1) +// CHECK2-NEXT: [[TMP43:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8 +// CHECK2-NEXT: [[TMP44:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_22]], align 8 +// CHECK2-NEXT: [[CMP53:%.*]] = icmp sgt i64 [[TMP43]], [[TMP44]] +// CHECK2-NEXT: br i1 [[CMP53]], label [[COND_TRUE54:%.*]], label [[COND_FALSE55:%.*]] +// CHECK2: cond.true54: +// CHECK2-NEXT: [[TMP45:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_22]], align 8 +// CHECK2-NEXT: br label [[COND_END56:%.*]] +// CHECK2: cond.false55: +// CHECK2-NEXT: [[TMP46:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8 +// CHECK2-NEXT: br label [[COND_END56]] +// CHECK2: cond.end56: +// CHECK2-NEXT: [[COND57:%.*]] = phi i64 [ [[TMP45]], [[COND_TRUE54]] ], [ [[TMP46]], [[COND_FALSE55]] ] +// CHECK2-NEXT: store i64 [[COND57]], ptr [[DOTOMP_UB]], align 8 +// CHECK2-NEXT: [[TMP47:%.*]] = load i64, ptr [[DOTOMP_LB]], align 8 +// CHECK2-NEXT: store i64 [[TMP47]], ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP48:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP49:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8 +// CHECK2-NEXT: [[CMP58:%.*]] = icmp sle i64 [[TMP48]], [[TMP49]] +// CHECK2-NEXT: br i1 [[CMP58]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CONV59:%.*]] = sext i32 [[TMP50]] to i64 +// CHECK2-NEXT: [[TMP51:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP52:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP53:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB60:%.*]] = sub i32 [[TMP52]], [[TMP53]] +// CHECK2-NEXT: [[SUB61:%.*]] = sub i32 [[SUB60]], 1 +// CHECK2-NEXT: [[TMP54:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD62:%.*]] = add i32 [[SUB61]], [[TMP54]] +// CHECK2-NEXT: [[TMP55:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV63:%.*]] = udiv i32 [[ADD62]], [[TMP55]] +// CHECK2-NEXT: [[MUL64:%.*]] = mul i32 1, [[DIV63]] +// CHECK2-NEXT: [[TMP56:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB65:%.*]] = sub i32 [[TMP56]], -31 +// CHECK2-NEXT: [[DIV66:%.*]] = udiv i32 [[SUB65]], 32 +// CHECK2-NEXT: [[MUL67:%.*]] = mul i32 [[MUL64]], [[DIV66]] +// CHECK2-NEXT: [[TMP57:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP58:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB68:%.*]] = sub i32 [[TMP57]], [[TMP58]] +// CHECK2-NEXT: [[SUB69:%.*]] = sub i32 [[SUB68]], 1 +// CHECK2-NEXT: [[ADD70:%.*]] = add i32 [[SUB69]], 1 +// CHECK2-NEXT: [[DIV71:%.*]] = udiv i32 [[ADD70]], 1 +// CHECK2-NEXT: [[MUL72:%.*]] = mul i32 [[MUL67]], [[DIV71]] +// CHECK2-NEXT: [[CONV73:%.*]] = zext i32 [[MUL72]] to i64 +// CHECK2-NEXT: [[DIV74:%.*]] = sdiv i64 [[TMP51]], [[CONV73]] +// CHECK2-NEXT: [[TMP59:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4 +// CHECK2-NEXT: [[CONV75:%.*]] = sext i32 [[TMP59]] to i64 +// CHECK2-NEXT: [[MUL76:%.*]] = mul nsw i64 [[DIV74]], [[CONV75]] +// CHECK2-NEXT: [[ADD77:%.*]] = add nsw i64 [[CONV59]], [[MUL76]] +// CHECK2-NEXT: [[CONV78:%.*]] = trunc i64 [[ADD77]] to i32 +// CHECK2-NEXT: store i32 [[CONV78]], ptr [[I49]], align 4 +// CHECK2-NEXT: [[TMP60:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[CONV79:%.*]] = sext i32 [[TMP60]] to i64 +// CHECK2-NEXT: 
[[TMP61:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP62:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP63:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP64:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB80:%.*]] = sub i32 [[TMP63]], [[TMP64]] +// CHECK2-NEXT: [[SUB81:%.*]] = sub i32 [[SUB80]], 1 +// CHECK2-NEXT: [[TMP65:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD82:%.*]] = add i32 [[SUB81]], [[TMP65]] +// CHECK2-NEXT: [[TMP66:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV83:%.*]] = udiv i32 [[ADD82]], [[TMP66]] +// CHECK2-NEXT: [[MUL84:%.*]] = mul i32 1, [[DIV83]] +// CHECK2-NEXT: [[TMP67:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB85:%.*]] = sub i32 [[TMP67]], -31 +// CHECK2-NEXT: [[DIV86:%.*]] = udiv i32 [[SUB85]], 32 +// CHECK2-NEXT: [[MUL87:%.*]] = mul i32 [[MUL84]], [[DIV86]] +// CHECK2-NEXT: [[TMP68:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP69:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB88:%.*]] = sub i32 [[TMP68]], [[TMP69]] +// CHECK2-NEXT: [[SUB89:%.*]] = sub i32 [[SUB88]], 1 +// CHECK2-NEXT: [[ADD90:%.*]] = add i32 [[SUB89]], 1 +// CHECK2-NEXT: [[DIV91:%.*]] = udiv i32 [[ADD90]], 1 +// CHECK2-NEXT: [[MUL92:%.*]] = mul i32 [[MUL87]], [[DIV91]] +// CHECK2-NEXT: [[CONV93:%.*]] = zext i32 [[MUL92]] to i64 +// CHECK2-NEXT: [[DIV94:%.*]] = sdiv i64 [[TMP62]], [[CONV93]] +// CHECK2-NEXT: [[TMP70:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP71:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB95:%.*]] = sub i32 [[TMP70]], [[TMP71]] +// CHECK2-NEXT: [[SUB96:%.*]] = sub i32 [[SUB95]], 1 +// CHECK2-NEXT: [[TMP72:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD97:%.*]] = add i32 [[SUB96]], [[TMP72]] +// CHECK2-NEXT: [[TMP73:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV98:%.*]] = udiv i32 [[ADD97]], [[TMP73]] +// CHECK2-NEXT: [[MUL99:%.*]] = mul i32 1, [[DIV98]] +// CHECK2-NEXT: [[TMP74:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB100:%.*]] = sub i32 [[TMP74]], -31 +// CHECK2-NEXT: [[DIV101:%.*]] = udiv i32 [[SUB100]], 32 +// CHECK2-NEXT: [[MUL102:%.*]] = mul i32 [[MUL99]], [[DIV101]] +// CHECK2-NEXT: [[TMP75:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP76:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB103:%.*]] = sub i32 [[TMP75]], [[TMP76]] +// CHECK2-NEXT: [[SUB104:%.*]] = sub i32 [[SUB103]], 1 +// CHECK2-NEXT: [[ADD105:%.*]] = add i32 [[SUB104]], 1 +// CHECK2-NEXT: [[DIV106:%.*]] = udiv i32 [[ADD105]], 1 +// CHECK2-NEXT: [[MUL107:%.*]] = mul i32 [[MUL102]], [[DIV106]] +// CHECK2-NEXT: [[CONV108:%.*]] = zext i32 [[MUL107]] to i64 +// CHECK2-NEXT: [[MUL109:%.*]] = mul nsw i64 [[DIV94]], [[CONV108]] +// CHECK2-NEXT: [[SUB110:%.*]] = sub nsw i64 [[TMP61]], [[MUL109]] +// CHECK2-NEXT: [[TMP77:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB111:%.*]] = sub i32 [[TMP77]], -31 +// CHECK2-NEXT: [[DIV112:%.*]] = udiv i32 [[SUB111]], 32 +// CHECK2-NEXT: [[MUL113:%.*]] = mul i32 1, [[DIV112]] +// CHECK2-NEXT: [[TMP78:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP79:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB114:%.*]] = sub i32 [[TMP78]], [[TMP79]] +// CHECK2-NEXT: [[SUB115:%.*]] 
= sub i32 [[SUB114]], 1 +// CHECK2-NEXT: [[ADD116:%.*]] = add i32 [[SUB115]], 1 +// CHECK2-NEXT: [[DIV117:%.*]] = udiv i32 [[ADD116]], 1 +// CHECK2-NEXT: [[MUL118:%.*]] = mul i32 [[MUL113]], [[DIV117]] +// CHECK2-NEXT: [[CONV119:%.*]] = zext i32 [[MUL118]] to i64 +// CHECK2-NEXT: [[DIV120:%.*]] = sdiv i64 [[SUB110]], [[CONV119]] +// CHECK2-NEXT: [[TMP80:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[CONV121:%.*]] = sext i32 [[TMP80]] to i64 +// CHECK2-NEXT: [[MUL122:%.*]] = mul nsw i64 [[DIV120]], [[CONV121]] +// CHECK2-NEXT: [[ADD123:%.*]] = add nsw i64 [[CONV79]], [[MUL122]] +// CHECK2-NEXT: [[CONV124:%.*]] = trunc i64 [[ADD123]] to i32 +// CHECK2-NEXT: store i32 [[CONV124]], ptr [[J50]], align 4 +// CHECK2-NEXT: [[TMP81:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP82:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP83:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP84:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB125:%.*]] = sub i32 [[TMP83]], [[TMP84]] +// CHECK2-NEXT: [[SUB126:%.*]] = sub i32 [[SUB125]], 1 +// CHECK2-NEXT: [[TMP85:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD127:%.*]] = add i32 [[SUB126]], [[TMP85]] +// CHECK2-NEXT: [[TMP86:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV128:%.*]] = udiv i32 [[ADD127]], [[TMP86]] +// CHECK2-NEXT: [[MUL129:%.*]] = mul i32 1, [[DIV128]] +// CHECK2-NEXT: [[TMP87:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB130:%.*]] = sub i32 [[TMP87]], -31 +// CHECK2-NEXT: [[DIV131:%.*]] = udiv i32 [[SUB130]], 32 +// CHECK2-NEXT: [[MUL132:%.*]] = mul i32 [[MUL129]], [[DIV131]] +// CHECK2-NEXT: [[TMP88:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP89:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB133:%.*]] = sub i32 [[TMP88]], [[TMP89]] +// CHECK2-NEXT: [[SUB134:%.*]] = sub i32 [[SUB133]], 1 +// CHECK2-NEXT: [[ADD135:%.*]] = add i32 [[SUB134]], 1 +// CHECK2-NEXT: [[DIV136:%.*]] = udiv i32 [[ADD135]], 1 +// CHECK2-NEXT: [[MUL137:%.*]] = mul i32 [[MUL132]], [[DIV136]] +// CHECK2-NEXT: [[CONV138:%.*]] = zext i32 [[MUL137]] to i64 +// CHECK2-NEXT: [[DIV139:%.*]] = sdiv i64 [[TMP82]], [[CONV138]] +// CHECK2-NEXT: [[TMP90:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP91:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB140:%.*]] = sub i32 [[TMP90]], [[TMP91]] +// CHECK2-NEXT: [[SUB141:%.*]] = sub i32 [[SUB140]], 1 +// CHECK2-NEXT: [[TMP92:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD142:%.*]] = add i32 [[SUB141]], [[TMP92]] +// CHECK2-NEXT: [[TMP93:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV143:%.*]] = udiv i32 [[ADD142]], [[TMP93]] +// CHECK2-NEXT: [[MUL144:%.*]] = mul i32 1, [[DIV143]] +// CHECK2-NEXT: [[TMP94:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB145:%.*]] = sub i32 [[TMP94]], -31 +// CHECK2-NEXT: [[DIV146:%.*]] = udiv i32 [[SUB145]], 32 +// CHECK2-NEXT: [[MUL147:%.*]] = mul i32 [[MUL144]], [[DIV146]] +// CHECK2-NEXT: [[TMP95:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP96:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB148:%.*]] = sub i32 [[TMP95]], [[TMP96]] +// CHECK2-NEXT: [[SUB149:%.*]] = sub i32 [[SUB148]], 1 +// CHECK2-NEXT: [[ADD150:%.*]] = add i32 [[SUB149]], 1 +// CHECK2-NEXT: [[DIV151:%.*]] = udiv 
i32 [[ADD150]], 1 +// CHECK2-NEXT: [[MUL152:%.*]] = mul i32 [[MUL147]], [[DIV151]] +// CHECK2-NEXT: [[CONV153:%.*]] = zext i32 [[MUL152]] to i64 +// CHECK2-NEXT: [[MUL154:%.*]] = mul nsw i64 [[DIV139]], [[CONV153]] +// CHECK2-NEXT: [[SUB155:%.*]] = sub nsw i64 [[TMP81]], [[MUL154]] +// CHECK2-NEXT: [[TMP97:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP98:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP99:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP100:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB156:%.*]] = sub i32 [[TMP99]], [[TMP100]] +// CHECK2-NEXT: [[SUB157:%.*]] = sub i32 [[SUB156]], 1 +// CHECK2-NEXT: [[TMP101:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD158:%.*]] = add i32 [[SUB157]], [[TMP101]] +// CHECK2-NEXT: [[TMP102:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV159:%.*]] = udiv i32 [[ADD158]], [[TMP102]] +// CHECK2-NEXT: [[MUL160:%.*]] = mul i32 1, [[DIV159]] +// CHECK2-NEXT: [[TMP103:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB161:%.*]] = sub i32 [[TMP103]], -31 +// CHECK2-NEXT: [[DIV162:%.*]] = udiv i32 [[SUB161]], 32 +// CHECK2-NEXT: [[MUL163:%.*]] = mul i32 [[MUL160]], [[DIV162]] +// CHECK2-NEXT: [[TMP104:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP105:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB164:%.*]] = sub i32 [[TMP104]], [[TMP105]] +// CHECK2-NEXT: [[SUB165:%.*]] = sub i32 [[SUB164]], 1 +// CHECK2-NEXT: [[ADD166:%.*]] = add i32 [[SUB165]], 1 +// CHECK2-NEXT: [[DIV167:%.*]] = udiv i32 [[ADD166]], 1 +// CHECK2-NEXT: [[MUL168:%.*]] = mul i32 [[MUL163]], [[DIV167]] +// CHECK2-NEXT: [[CONV169:%.*]] = zext i32 [[MUL168]] to i64 +// CHECK2-NEXT: [[DIV170:%.*]] = sdiv i64 [[TMP98]], [[CONV169]] +// CHECK2-NEXT: [[TMP106:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP107:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB171:%.*]] = sub i32 [[TMP106]], [[TMP107]] +// CHECK2-NEXT: [[SUB172:%.*]] = sub i32 [[SUB171]], 1 +// CHECK2-NEXT: [[TMP108:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD173:%.*]] = add i32 [[SUB172]], [[TMP108]] +// CHECK2-NEXT: [[TMP109:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV174:%.*]] = udiv i32 [[ADD173]], [[TMP109]] +// CHECK2-NEXT: [[MUL175:%.*]] = mul i32 1, [[DIV174]] +// CHECK2-NEXT: [[TMP110:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB176:%.*]] = sub i32 [[TMP110]], -31 +// CHECK2-NEXT: [[DIV177:%.*]] = udiv i32 [[SUB176]], 32 +// CHECK2-NEXT: [[MUL178:%.*]] = mul i32 [[MUL175]], [[DIV177]] +// CHECK2-NEXT: [[TMP111:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP112:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB179:%.*]] = sub i32 [[TMP111]], [[TMP112]] +// CHECK2-NEXT: [[SUB180:%.*]] = sub i32 [[SUB179]], 1 +// CHECK2-NEXT: [[ADD181:%.*]] = add i32 [[SUB180]], 1 +// CHECK2-NEXT: [[DIV182:%.*]] = udiv i32 [[ADD181]], 1 +// CHECK2-NEXT: [[MUL183:%.*]] = mul i32 [[MUL178]], [[DIV182]] +// CHECK2-NEXT: [[CONV184:%.*]] = zext i32 [[MUL183]] to i64 +// CHECK2-NEXT: [[MUL185:%.*]] = mul nsw i64 [[DIV170]], [[CONV184]] +// CHECK2-NEXT: [[SUB186:%.*]] = sub nsw i64 [[TMP97]], [[MUL185]] +// CHECK2-NEXT: [[TMP113:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB187:%.*]] = sub i32 [[TMP113]], -31 
+// CHECK2-NEXT: [[DIV188:%.*]] = udiv i32 [[SUB187]], 32 +// CHECK2-NEXT: [[MUL189:%.*]] = mul i32 1, [[DIV188]] +// CHECK2-NEXT: [[TMP114:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP115:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB190:%.*]] = sub i32 [[TMP114]], [[TMP115]] +// CHECK2-NEXT: [[SUB191:%.*]] = sub i32 [[SUB190]], 1 +// CHECK2-NEXT: [[ADD192:%.*]] = add i32 [[SUB191]], 1 +// CHECK2-NEXT: [[DIV193:%.*]] = udiv i32 [[ADD192]], 1 +// CHECK2-NEXT: [[MUL194:%.*]] = mul i32 [[MUL189]], [[DIV193]] +// CHECK2-NEXT: [[CONV195:%.*]] = zext i32 [[MUL194]] to i64 +// CHECK2-NEXT: [[DIV196:%.*]] = sdiv i64 [[SUB186]], [[CONV195]] +// CHECK2-NEXT: [[TMP116:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB197:%.*]] = sub i32 [[TMP116]], -31 +// CHECK2-NEXT: [[DIV198:%.*]] = udiv i32 [[SUB197]], 32 +// CHECK2-NEXT: [[MUL199:%.*]] = mul i32 1, [[DIV198]] +// CHECK2-NEXT: [[TMP117:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP118:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB200:%.*]] = sub i32 [[TMP117]], [[TMP118]] +// CHECK2-NEXT: [[SUB201:%.*]] = sub i32 [[SUB200]], 1 +// CHECK2-NEXT: [[ADD202:%.*]] = add i32 [[SUB201]], 1 +// CHECK2-NEXT: [[DIV203:%.*]] = udiv i32 [[ADD202]], 1 +// CHECK2-NEXT: [[MUL204:%.*]] = mul i32 [[MUL199]], [[DIV203]] +// CHECK2-NEXT: [[CONV205:%.*]] = zext i32 [[MUL204]] to i64 +// CHECK2-NEXT: [[MUL206:%.*]] = mul nsw i64 [[DIV196]], [[CONV205]] +// CHECK2-NEXT: [[SUB207:%.*]] = sub nsw i64 [[SUB155]], [[MUL206]] +// CHECK2-NEXT: [[TMP119:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP120:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB208:%.*]] = sub i32 [[TMP119]], [[TMP120]] +// CHECK2-NEXT: [[SUB209:%.*]] = sub i32 [[SUB208]], 1 +// CHECK2-NEXT: [[ADD210:%.*]] = add i32 [[SUB209]], 1 +// CHECK2-NEXT: [[DIV211:%.*]] = udiv i32 [[ADD210]], 1 +// CHECK2-NEXT: [[MUL212:%.*]] = mul i32 1, [[DIV211]] +// CHECK2-NEXT: [[CONV213:%.*]] = zext i32 [[MUL212]] to i64 +// CHECK2-NEXT: [[DIV214:%.*]] = sdiv i64 [[SUB207]], [[CONV213]] +// CHECK2-NEXT: [[MUL215:%.*]] = mul nsw i64 [[DIV214]], 32 +// CHECK2-NEXT: [[ADD216:%.*]] = add nsw i64 0, [[MUL215]] +// CHECK2-NEXT: [[CONV217:%.*]] = trunc i64 [[ADD216]] to i32 +// CHECK2-NEXT: store i32 [[CONV217]], ptr [[DOTFLOOR_0_IV_K51]], align 4 +// CHECK2-NEXT: [[TMP121:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[CONV218:%.*]] = zext i32 [[TMP121]] to i64 +// CHECK2-NEXT: [[TMP122:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP123:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP124:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP125:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB219:%.*]] = sub i32 [[TMP124]], [[TMP125]] +// CHECK2-NEXT: [[SUB220:%.*]] = sub i32 [[SUB219]], 1 +// CHECK2-NEXT: [[TMP126:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD221:%.*]] = add i32 [[SUB220]], [[TMP126]] +// CHECK2-NEXT: [[TMP127:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV222:%.*]] = udiv i32 [[ADD221]], [[TMP127]] +// CHECK2-NEXT: [[MUL223:%.*]] = mul i32 1, [[DIV222]] +// CHECK2-NEXT: [[TMP128:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB224:%.*]] = sub i32 [[TMP128]], -31 +// CHECK2-NEXT: [[DIV225:%.*]] = udiv i32 [[SUB224]], 32 +// 
CHECK2-NEXT: [[MUL226:%.*]] = mul i32 [[MUL223]], [[DIV225]] +// CHECK2-NEXT: [[TMP129:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP130:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB227:%.*]] = sub i32 [[TMP129]], [[TMP130]] +// CHECK2-NEXT: [[SUB228:%.*]] = sub i32 [[SUB227]], 1 +// CHECK2-NEXT: [[ADD229:%.*]] = add i32 [[SUB228]], 1 +// CHECK2-NEXT: [[DIV230:%.*]] = udiv i32 [[ADD229]], 1 +// CHECK2-NEXT: [[MUL231:%.*]] = mul i32 [[MUL226]], [[DIV230]] +// CHECK2-NEXT: [[CONV232:%.*]] = zext i32 [[MUL231]] to i64 +// CHECK2-NEXT: [[DIV233:%.*]] = sdiv i64 [[TMP123]], [[CONV232]] +// CHECK2-NEXT: [[TMP131:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP132:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB234:%.*]] = sub i32 [[TMP131]], [[TMP132]] +// CHECK2-NEXT: [[SUB235:%.*]] = sub i32 [[SUB234]], 1 +// CHECK2-NEXT: [[TMP133:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD236:%.*]] = add i32 [[SUB235]], [[TMP133]] +// CHECK2-NEXT: [[TMP134:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV237:%.*]] = udiv i32 [[ADD236]], [[TMP134]] +// CHECK2-NEXT: [[MUL238:%.*]] = mul i32 1, [[DIV237]] +// CHECK2-NEXT: [[TMP135:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB239:%.*]] = sub i32 [[TMP135]], -31 +// CHECK2-NEXT: [[DIV240:%.*]] = udiv i32 [[SUB239]], 32 +// CHECK2-NEXT: [[MUL241:%.*]] = mul i32 [[MUL238]], [[DIV240]] +// CHECK2-NEXT: [[TMP136:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP137:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB242:%.*]] = sub i32 [[TMP136]], [[TMP137]] +// CHECK2-NEXT: [[SUB243:%.*]] = sub i32 [[SUB242]], 1 +// CHECK2-NEXT: [[ADD244:%.*]] = add i32 [[SUB243]], 1 +// CHECK2-NEXT: [[DIV245:%.*]] = udiv i32 [[ADD244]], 1 +// CHECK2-NEXT: [[MUL246:%.*]] = mul i32 [[MUL241]], [[DIV245]] +// CHECK2-NEXT: [[CONV247:%.*]] = zext i32 [[MUL246]] to i64 +// CHECK2-NEXT: [[MUL248:%.*]] = mul nsw i64 [[DIV233]], [[CONV247]] +// CHECK2-NEXT: [[SUB249:%.*]] = sub nsw i64 [[TMP122]], [[MUL248]] +// CHECK2-NEXT: [[TMP138:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP139:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP140:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP141:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB250:%.*]] = sub i32 [[TMP140]], [[TMP141]] +// CHECK2-NEXT: [[SUB251:%.*]] = sub i32 [[SUB250]], 1 +// CHECK2-NEXT: [[TMP142:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD252:%.*]] = add i32 [[SUB251]], [[TMP142]] +// CHECK2-NEXT: [[TMP143:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV253:%.*]] = udiv i32 [[ADD252]], [[TMP143]] +// CHECK2-NEXT: [[MUL254:%.*]] = mul i32 1, [[DIV253]] +// CHECK2-NEXT: [[TMP144:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB255:%.*]] = sub i32 [[TMP144]], -31 +// CHECK2-NEXT: [[DIV256:%.*]] = udiv i32 [[SUB255]], 32 +// CHECK2-NEXT: [[MUL257:%.*]] = mul i32 [[MUL254]], [[DIV256]] +// CHECK2-NEXT: [[TMP145:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP146:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB258:%.*]] = sub i32 [[TMP145]], [[TMP146]] +// CHECK2-NEXT: [[SUB259:%.*]] = sub i32 [[SUB258]], 1 +// CHECK2-NEXT: [[ADD260:%.*]] = add i32 [[SUB259]], 1 +// CHECK2-NEXT: 
[[DIV261:%.*]] = udiv i32 [[ADD260]], 1 +// CHECK2-NEXT: [[MUL262:%.*]] = mul i32 [[MUL257]], [[DIV261]] +// CHECK2-NEXT: [[CONV263:%.*]] = zext i32 [[MUL262]] to i64 +// CHECK2-NEXT: [[DIV264:%.*]] = sdiv i64 [[TMP139]], [[CONV263]] +// CHECK2-NEXT: [[TMP147:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP148:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB265:%.*]] = sub i32 [[TMP147]], [[TMP148]] +// CHECK2-NEXT: [[SUB266:%.*]] = sub i32 [[SUB265]], 1 +// CHECK2-NEXT: [[TMP149:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD267:%.*]] = add i32 [[SUB266]], [[TMP149]] +// CHECK2-NEXT: [[TMP150:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV268:%.*]] = udiv i32 [[ADD267]], [[TMP150]] +// CHECK2-NEXT: [[MUL269:%.*]] = mul i32 1, [[DIV268]] +// CHECK2-NEXT: [[TMP151:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB270:%.*]] = sub i32 [[TMP151]], -31 +// CHECK2-NEXT: [[DIV271:%.*]] = udiv i32 [[SUB270]], 32 +// CHECK2-NEXT: [[MUL272:%.*]] = mul i32 [[MUL269]], [[DIV271]] +// CHECK2-NEXT: [[TMP152:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP153:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB273:%.*]] = sub i32 [[TMP152]], [[TMP153]] +// CHECK2-NEXT: [[SUB274:%.*]] = sub i32 [[SUB273]], 1 +// CHECK2-NEXT: [[ADD275:%.*]] = add i32 [[SUB274]], 1 +// CHECK2-NEXT: [[DIV276:%.*]] = udiv i32 [[ADD275]], 1 +// CHECK2-NEXT: [[MUL277:%.*]] = mul i32 [[MUL272]], [[DIV276]] +// CHECK2-NEXT: [[CONV278:%.*]] = zext i32 [[MUL277]] to i64 +// CHECK2-NEXT: [[MUL279:%.*]] = mul nsw i64 [[DIV264]], [[CONV278]] +// CHECK2-NEXT: [[SUB280:%.*]] = sub nsw i64 [[TMP138]], [[MUL279]] +// CHECK2-NEXT: [[TMP154:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB281:%.*]] = sub i32 [[TMP154]], -31 +// CHECK2-NEXT: [[DIV282:%.*]] = udiv i32 [[SUB281]], 32 +// CHECK2-NEXT: [[MUL283:%.*]] = mul i32 1, [[DIV282]] +// CHECK2-NEXT: [[TMP155:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP156:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB284:%.*]] = sub i32 [[TMP155]], [[TMP156]] +// CHECK2-NEXT: [[SUB285:%.*]] = sub i32 [[SUB284]], 1 +// CHECK2-NEXT: [[ADD286:%.*]] = add i32 [[SUB285]], 1 +// CHECK2-NEXT: [[DIV287:%.*]] = udiv i32 [[ADD286]], 1 +// CHECK2-NEXT: [[MUL288:%.*]] = mul i32 [[MUL283]], [[DIV287]] +// CHECK2-NEXT: [[CONV289:%.*]] = zext i32 [[MUL288]] to i64 +// CHECK2-NEXT: [[DIV290:%.*]] = sdiv i64 [[SUB280]], [[CONV289]] +// CHECK2-NEXT: [[TMP157:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB291:%.*]] = sub i32 [[TMP157]], -31 +// CHECK2-NEXT: [[DIV292:%.*]] = udiv i32 [[SUB291]], 32 +// CHECK2-NEXT: [[MUL293:%.*]] = mul i32 1, [[DIV292]] +// CHECK2-NEXT: [[TMP158:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP159:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB294:%.*]] = sub i32 [[TMP158]], [[TMP159]] +// CHECK2-NEXT: [[SUB295:%.*]] = sub i32 [[SUB294]], 1 +// CHECK2-NEXT: [[ADD296:%.*]] = add i32 [[SUB295]], 1 +// CHECK2-NEXT: [[DIV297:%.*]] = udiv i32 [[ADD296]], 1 +// CHECK2-NEXT: [[MUL298:%.*]] = mul i32 [[MUL293]], [[DIV297]] +// CHECK2-NEXT: [[CONV299:%.*]] = zext i32 [[MUL298]] to i64 +// CHECK2-NEXT: [[MUL300:%.*]] = mul nsw i64 [[DIV290]], [[CONV299]] +// CHECK2-NEXT: [[SUB301:%.*]] = sub nsw i64 [[SUB249]], [[MUL300]] +// CHECK2-NEXT: [[TMP160:%.*]] = 
load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP161:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP162:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP163:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB302:%.*]] = sub i32 [[TMP162]], [[TMP163]] +// CHECK2-NEXT: [[SUB303:%.*]] = sub i32 [[SUB302]], 1 +// CHECK2-NEXT: [[TMP164:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD304:%.*]] = add i32 [[SUB303]], [[TMP164]] +// CHECK2-NEXT: [[TMP165:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV305:%.*]] = udiv i32 [[ADD304]], [[TMP165]] +// CHECK2-NEXT: [[MUL306:%.*]] = mul i32 1, [[DIV305]] +// CHECK2-NEXT: [[TMP166:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB307:%.*]] = sub i32 [[TMP166]], -31 +// CHECK2-NEXT: [[DIV308:%.*]] = udiv i32 [[SUB307]], 32 +// CHECK2-NEXT: [[MUL309:%.*]] = mul i32 [[MUL306]], [[DIV308]] +// CHECK2-NEXT: [[TMP167:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP168:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB310:%.*]] = sub i32 [[TMP167]], [[TMP168]] +// CHECK2-NEXT: [[SUB311:%.*]] = sub i32 [[SUB310]], 1 +// CHECK2-NEXT: [[ADD312:%.*]] = add i32 [[SUB311]], 1 +// CHECK2-NEXT: [[DIV313:%.*]] = udiv i32 [[ADD312]], 1 +// CHECK2-NEXT: [[MUL314:%.*]] = mul i32 [[MUL309]], [[DIV313]] +// CHECK2-NEXT: [[CONV315:%.*]] = zext i32 [[MUL314]] to i64 +// CHECK2-NEXT: [[DIV316:%.*]] = sdiv i64 [[TMP161]], [[CONV315]] +// CHECK2-NEXT: [[TMP169:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP170:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB317:%.*]] = sub i32 [[TMP169]], [[TMP170]] +// CHECK2-NEXT: [[SUB318:%.*]] = sub i32 [[SUB317]], 1 +// CHECK2-NEXT: [[TMP171:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD319:%.*]] = add i32 [[SUB318]], [[TMP171]] +// CHECK2-NEXT: [[TMP172:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV320:%.*]] = udiv i32 [[ADD319]], [[TMP172]] +// CHECK2-NEXT: [[MUL321:%.*]] = mul i32 1, [[DIV320]] +// CHECK2-NEXT: [[TMP173:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB322:%.*]] = sub i32 [[TMP173]], -31 +// CHECK2-NEXT: [[DIV323:%.*]] = udiv i32 [[SUB322]], 32 +// CHECK2-NEXT: [[MUL324:%.*]] = mul i32 [[MUL321]], [[DIV323]] +// CHECK2-NEXT: [[TMP174:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP175:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB325:%.*]] = sub i32 [[TMP174]], [[TMP175]] +// CHECK2-NEXT: [[SUB326:%.*]] = sub i32 [[SUB325]], 1 +// CHECK2-NEXT: [[ADD327:%.*]] = add i32 [[SUB326]], 1 +// CHECK2-NEXT: [[DIV328:%.*]] = udiv i32 [[ADD327]], 1 +// CHECK2-NEXT: [[MUL329:%.*]] = mul i32 [[MUL324]], [[DIV328]] +// CHECK2-NEXT: [[CONV330:%.*]] = zext i32 [[MUL329]] to i64 +// CHECK2-NEXT: [[MUL331:%.*]] = mul nsw i64 [[DIV316]], [[CONV330]] +// CHECK2-NEXT: [[SUB332:%.*]] = sub nsw i64 [[TMP160]], [[MUL331]] +// CHECK2-NEXT: [[TMP176:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP177:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP178:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP179:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB333:%.*]] = sub i32 [[TMP178]], [[TMP179]] +// CHECK2-NEXT: [[SUB334:%.*]] = sub i32 [[SUB333]], 1 +// CHECK2-NEXT: 
[[TMP180:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD335:%.*]] = add i32 [[SUB334]], [[TMP180]] +// CHECK2-NEXT: [[TMP181:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV336:%.*]] = udiv i32 [[ADD335]], [[TMP181]] +// CHECK2-NEXT: [[MUL337:%.*]] = mul i32 1, [[DIV336]] +// CHECK2-NEXT: [[TMP182:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB338:%.*]] = sub i32 [[TMP182]], -31 +// CHECK2-NEXT: [[DIV339:%.*]] = udiv i32 [[SUB338]], 32 +// CHECK2-NEXT: [[MUL340:%.*]] = mul i32 [[MUL337]], [[DIV339]] +// CHECK2-NEXT: [[TMP183:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP184:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB341:%.*]] = sub i32 [[TMP183]], [[TMP184]] +// CHECK2-NEXT: [[SUB342:%.*]] = sub i32 [[SUB341]], 1 +// CHECK2-NEXT: [[ADD343:%.*]] = add i32 [[SUB342]], 1 +// CHECK2-NEXT: [[DIV344:%.*]] = udiv i32 [[ADD343]], 1 +// CHECK2-NEXT: [[MUL345:%.*]] = mul i32 [[MUL340]], [[DIV344]] +// CHECK2-NEXT: [[CONV346:%.*]] = zext i32 [[MUL345]] to i64 +// CHECK2-NEXT: [[DIV347:%.*]] = sdiv i64 [[TMP177]], [[CONV346]] +// CHECK2-NEXT: [[TMP185:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP186:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB348:%.*]] = sub i32 [[TMP185]], [[TMP186]] +// CHECK2-NEXT: [[SUB349:%.*]] = sub i32 [[SUB348]], 1 +// CHECK2-NEXT: [[TMP187:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD350:%.*]] = add i32 [[SUB349]], [[TMP187]] +// CHECK2-NEXT: [[TMP188:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV351:%.*]] = udiv i32 [[ADD350]], [[TMP188]] +// CHECK2-NEXT: [[MUL352:%.*]] = mul i32 1, [[DIV351]] +// CHECK2-NEXT: [[TMP189:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB353:%.*]] = sub i32 [[TMP189]], -31 +// CHECK2-NEXT: [[DIV354:%.*]] = udiv i32 [[SUB353]], 32 +// CHECK2-NEXT: [[MUL355:%.*]] = mul i32 [[MUL352]], [[DIV354]] +// CHECK2-NEXT: [[TMP190:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP191:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB356:%.*]] = sub i32 [[TMP190]], [[TMP191]] +// CHECK2-NEXT: [[SUB357:%.*]] = sub i32 [[SUB356]], 1 +// CHECK2-NEXT: [[ADD358:%.*]] = add i32 [[SUB357]], 1 +// CHECK2-NEXT: [[DIV359:%.*]] = udiv i32 [[ADD358]], 1 +// CHECK2-NEXT: [[MUL360:%.*]] = mul i32 [[MUL355]], [[DIV359]] +// CHECK2-NEXT: [[CONV361:%.*]] = zext i32 [[MUL360]] to i64 +// CHECK2-NEXT: [[MUL362:%.*]] = mul nsw i64 [[DIV347]], [[CONV361]] +// CHECK2-NEXT: [[SUB363:%.*]] = sub nsw i64 [[TMP176]], [[MUL362]] +// CHECK2-NEXT: [[TMP192:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB364:%.*]] = sub i32 [[TMP192]], -31 +// CHECK2-NEXT: [[DIV365:%.*]] = udiv i32 [[SUB364]], 32 +// CHECK2-NEXT: [[MUL366:%.*]] = mul i32 1, [[DIV365]] +// CHECK2-NEXT: [[TMP193:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP194:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB367:%.*]] = sub i32 [[TMP193]], [[TMP194]] +// CHECK2-NEXT: [[SUB368:%.*]] = sub i32 [[SUB367]], 1 +// CHECK2-NEXT: [[ADD369:%.*]] = add i32 [[SUB368]], 1 +// CHECK2-NEXT: [[DIV370:%.*]] = udiv i32 [[ADD369]], 1 +// CHECK2-NEXT: [[MUL371:%.*]] = mul i32 [[MUL366]], [[DIV370]] +// CHECK2-NEXT: [[CONV372:%.*]] = zext i32 [[MUL371]] to i64 +// CHECK2-NEXT: [[DIV373:%.*]] = sdiv i64 [[SUB363]], [[CONV372]] +// 
CHECK2-NEXT: [[TMP195:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB374:%.*]] = sub i32 [[TMP195]], -31 +// CHECK2-NEXT: [[DIV375:%.*]] = udiv i32 [[SUB374]], 32 +// CHECK2-NEXT: [[MUL376:%.*]] = mul i32 1, [[DIV375]] +// CHECK2-NEXT: [[TMP196:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP197:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB377:%.*]] = sub i32 [[TMP196]], [[TMP197]] +// CHECK2-NEXT: [[SUB378:%.*]] = sub i32 [[SUB377]], 1 +// CHECK2-NEXT: [[ADD379:%.*]] = add i32 [[SUB378]], 1 +// CHECK2-NEXT: [[DIV380:%.*]] = udiv i32 [[ADD379]], 1 +// CHECK2-NEXT: [[MUL381:%.*]] = mul i32 [[MUL376]], [[DIV380]] +// CHECK2-NEXT: [[CONV382:%.*]] = zext i32 [[MUL381]] to i64 +// CHECK2-NEXT: [[MUL383:%.*]] = mul nsw i64 [[DIV373]], [[CONV382]] +// CHECK2-NEXT: [[SUB384:%.*]] = sub nsw i64 [[SUB332]], [[MUL383]] +// CHECK2-NEXT: [[TMP198:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP199:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB385:%.*]] = sub i32 [[TMP198]], [[TMP199]] +// CHECK2-NEXT: [[SUB386:%.*]] = sub i32 [[SUB385]], 1 +// CHECK2-NEXT: [[ADD387:%.*]] = add i32 [[SUB386]], 1 +// CHECK2-NEXT: [[DIV388:%.*]] = udiv i32 [[ADD387]], 1 +// CHECK2-NEXT: [[MUL389:%.*]] = mul i32 1, [[DIV388]] +// CHECK2-NEXT: [[CONV390:%.*]] = zext i32 [[MUL389]] to i64 +// CHECK2-NEXT: [[DIV391:%.*]] = sdiv i64 [[SUB384]], [[CONV390]] +// CHECK2-NEXT: [[TMP200:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP201:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB392:%.*]] = sub i32 [[TMP200]], [[TMP201]] +// CHECK2-NEXT: [[SUB393:%.*]] = sub i32 [[SUB392]], 1 +// CHECK2-NEXT: [[ADD394:%.*]] = add i32 [[SUB393]], 1 +// CHECK2-NEXT: [[DIV395:%.*]] = udiv i32 [[ADD394]], 1 +// CHECK2-NEXT: [[MUL396:%.*]] = mul i32 1, [[DIV395]] +// CHECK2-NEXT: [[CONV397:%.*]] = zext i32 [[MUL396]] to i64 +// CHECK2-NEXT: [[MUL398:%.*]] = mul nsw i64 [[DIV391]], [[CONV397]] +// CHECK2-NEXT: [[SUB399:%.*]] = sub nsw i64 [[SUB301]], [[MUL398]] +// CHECK2-NEXT: [[MUL400:%.*]] = mul nsw i64 [[SUB399]], 1 +// CHECK2-NEXT: [[ADD401:%.*]] = add nsw i64 [[CONV218]], [[MUL400]] +// CHECK2-NEXT: [[CONV402:%.*]] = trunc i64 [[ADD401]] to i32 +// CHECK2-NEXT: store i32 [[CONV402]], ptr [[DOTTILE_0_IV_K52]], align 4 +// CHECK2-NEXT: [[TMP202:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_8]], align 4 +// CHECK2-NEXT: [[TMP203:%.*]] = load i32, ptr [[DOTTILE_0_IV_K52]], align 4 +// CHECK2-NEXT: [[TMP204:%.*]] = load i32, ptr [[DOTNEW_STEP10]], align 4 +// CHECK2-NEXT: [[MUL403:%.*]] = mul i32 [[TMP203]], [[TMP204]] +// CHECK2-NEXT: [[ADD404:%.*]] = add i32 [[TMP202]], [[MUL403]] +// CHECK2-NEXT: store i32 [[ADD404]], ptr [[K]], align 4 +// CHECK2-NEXT: [[TMP205:%.*]] = load i32, ptr [[I49]], align 4 +// CHECK2-NEXT: [[TMP206:%.*]] = load i32, ptr [[J50]], align 4 +// CHECK2-NEXT: [[TMP207:%.*]] = load i32, ptr [[K]], align 4 +// CHECK2-NEXT: call void (...) 
@body(i32 noundef [[TMP205]], i32 noundef [[TMP206]], i32 noundef [[TMP207]]) +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP208:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[ADD405:%.*]] = add nsw i64 [[TMP208]], 1 +// CHECK2-NEXT: store i64 [[ADD405]], ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]]) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@_Z4foo8ILi64EEviii +// CHECK2-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR0]] comdat { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[STEP_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[_TMP2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[_TMP3:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTNEW_STEP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_5:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTNEW_STEP7:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_8:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_9:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTNEW_STEP10:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_11:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_14:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_16:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_17:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_22:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTFLOOR_0_IV_K:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTILE_0_IV_K:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I49:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[J50:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTFLOOR_0_IV_K51:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTILE_0_IV_K52:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]]) +// CHECK2-NEXT: store i32 [[START]], ptr [[START_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[END]], ptr [[END_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[STEP]], ptr [[STEP_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// 
CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[END_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_4]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[STEP_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[END_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[STEP_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP6]], ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP7]], ptr [[K]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP8]], ptr [[DOTCAPTURE_EXPR_8]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[END_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP9]], ptr [[DOTCAPTURE_EXPR_9]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[STEP_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP10]], ptr [[DOTNEW_STEP10]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_9]], align 4 +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_8]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub i32 [[TMP11]], [[TMP12]] +// CHECK2-NEXT: [[SUB12:%.*]] = sub i32 [[SUB]], 1 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTNEW_STEP10]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add i32 [[SUB12]], [[TMP13]] +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTNEW_STEP10]], align 4 +// CHECK2-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP14]] +// CHECK2-NEXT: [[SUB13:%.*]] = sub i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB13]], ptr [[DOTCAPTURE_EXPR_11]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_11]], align 4 +// CHECK2-NEXT: [[ADD15:%.*]] = add i32 [[TMP15]], 1 +// CHECK2-NEXT: store i32 [[ADD15]], ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[_TMP2]], align 4 +// CHECK2-NEXT: store i32 [[TMP16]], ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_11]], align 4 +// CHECK2-NEXT: [[ADD18:%.*]] = add i32 [[TMP17]], 1 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[_TMP2]], align 4 +// CHECK2-NEXT: [[ADD19:%.*]] = add i32 [[TMP18]], 64 +// CHECK2-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD18]], [[ADD19]] +// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_11]], align 4 +// CHECK2-NEXT: [[ADD20:%.*]] = add i32 [[TMP19]], 1 +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[_TMP2]], align 4 +// CHECK2-NEXT: [[ADD21:%.*]] = add i32 [[TMP20]], 64 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[ADD20]], [[COND_TRUE]] ], [ [[ADD21]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4 +// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB23:%.*]] = sub i32 [[TMP21]], [[TMP22]] +// CHECK2-NEXT: [[SUB24:%.*]] = sub i32 [[SUB23]], 1 +// CHECK2-NEXT: [[TMP23:%.*]] = load i32, ptr 
[[DOTNEW_STEP]], align 4 +// CHECK2-NEXT: [[ADD25:%.*]] = add i32 [[SUB24]], [[TMP23]] +// CHECK2-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4 +// CHECK2-NEXT: [[DIV26:%.*]] = udiv i32 [[ADD25]], [[TMP24]] +// CHECK2-NEXT: [[CONV:%.*]] = zext i32 [[DIV26]] to i64 +// CHECK2-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB27:%.*]] = sub i32 [[TMP25]], [[TMP26]] +// CHECK2-NEXT: [[SUB28:%.*]] = sub i32 [[SUB27]], 1 +// CHECK2-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD29:%.*]] = add i32 [[SUB28]], [[TMP27]] +// CHECK2-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV30:%.*]] = udiv i32 [[ADD29]], [[TMP28]] +// CHECK2-NEXT: [[CONV31:%.*]] = zext i32 [[DIV30]] to i64 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV31]] +// CHECK2-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB32:%.*]] = sub i32 [[TMP29]], -63 +// CHECK2-NEXT: [[DIV33:%.*]] = udiv i32 [[SUB32]], 64 +// CHECK2-NEXT: [[CONV34:%.*]] = zext i32 [[DIV33]] to i64 +// CHECK2-NEXT: [[MUL35:%.*]] = mul nsw i64 [[MUL]], [[CONV34]] +// CHECK2-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB36:%.*]] = sub i32 [[TMP30]], [[TMP31]] +// CHECK2-NEXT: [[SUB37:%.*]] = sub i32 [[SUB36]], 1 +// CHECK2-NEXT: [[ADD38:%.*]] = add i32 [[SUB37]], 1 +// CHECK2-NEXT: [[DIV39:%.*]] = udiv i32 [[ADD38]], 1 +// CHECK2-NEXT: [[CONV40:%.*]] = zext i32 [[DIV39]] to i64 +// CHECK2-NEXT: [[MUL41:%.*]] = mul nsw i64 [[MUL35]], [[CONV40]] +// CHECK2-NEXT: [[SUB42:%.*]] = sub nsw i64 [[MUL41]], 1 +// CHECK2-NEXT: store i64 [[SUB42]], ptr [[DOTCAPTURE_EXPR_22]], align 8 +// CHECK2-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: store i32 [[TMP32]], ptr [[I]], align 4 +// CHECK2-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: store i32 [[TMP33]], ptr [[J]], align 4 +// CHECK2-NEXT: store i32 0, ptr [[DOTFLOOR_0_IV_K]], align 4 +// CHECK2-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: store i32 [[TMP34]], ptr [[DOTTILE_0_IV_K]], align 4 +// CHECK2-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4 +// CHECK2-NEXT: [[CMP43:%.*]] = icmp slt i32 [[TMP35]], [[TMP36]] +// CHECK2-NEXT: br i1 [[CMP43]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: land.lhs.true: +// CHECK2-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[CMP44:%.*]] = icmp slt i32 [[TMP37]], [[TMP38]] +// CHECK2-NEXT: br i1 [[CMP44]], label [[LAND_LHS_TRUE45:%.*]], label [[OMP_PRECOND_END]] +// CHECK2: land.lhs.true45: +// CHECK2-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[CMP46:%.*]] = icmp ult i32 0, [[TMP39]] +// CHECK2-NEXT: br i1 [[CMP46]], label [[LAND_LHS_TRUE47:%.*]], label [[OMP_PRECOND_END]] +// CHECK2: land.lhs.true47: +// CHECK2-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[CMP48:%.*]] = icmp ult i32 [[TMP40]], [[TMP41]] 
+// CHECK2-NEXT: br i1 [[CMP48]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i64 0, ptr [[DOTOMP_LB]], align 8 +// CHECK2-NEXT: [[TMP42:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_22]], align 8 +// CHECK2-NEXT: store i64 [[TMP42]], ptr [[DOTOMP_UB]], align 8 +// CHECK2-NEXT: store i64 1, ptr [[DOTOMP_STRIDE]], align 8 +// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_8(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i64 1, i64 1) +// CHECK2-NEXT: [[TMP43:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8 +// CHECK2-NEXT: [[TMP44:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_22]], align 8 +// CHECK2-NEXT: [[CMP53:%.*]] = icmp sgt i64 [[TMP43]], [[TMP44]] +// CHECK2-NEXT: br i1 [[CMP53]], label [[COND_TRUE54:%.*]], label [[COND_FALSE55:%.*]] +// CHECK2: cond.true54: +// CHECK2-NEXT: [[TMP45:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_22]], align 8 +// CHECK2-NEXT: br label [[COND_END56:%.*]] +// CHECK2: cond.false55: +// CHECK2-NEXT: [[TMP46:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8 +// CHECK2-NEXT: br label [[COND_END56]] +// CHECK2: cond.end56: +// CHECK2-NEXT: [[COND57:%.*]] = phi i64 [ [[TMP45]], [[COND_TRUE54]] ], [ [[TMP46]], [[COND_FALSE55]] ] +// CHECK2-NEXT: store i64 [[COND57]], ptr [[DOTOMP_UB]], align 8 +// CHECK2-NEXT: [[TMP47:%.*]] = load i64, ptr [[DOTOMP_LB]], align 8 +// CHECK2-NEXT: store i64 [[TMP47]], ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP48:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP49:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8 +// CHECK2-NEXT: [[CMP58:%.*]] = icmp sle i64 [[TMP48]], [[TMP49]] +// CHECK2-NEXT: br i1 [[CMP58]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CONV59:%.*]] = sext i32 [[TMP50]] to i64 +// CHECK2-NEXT: [[TMP51:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP52:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP53:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB60:%.*]] = sub i32 [[TMP52]], [[TMP53]] +// CHECK2-NEXT: [[SUB61:%.*]] = sub i32 [[SUB60]], 1 +// CHECK2-NEXT: [[TMP54:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD62:%.*]] = add i32 [[SUB61]], [[TMP54]] +// CHECK2-NEXT: [[TMP55:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV63:%.*]] = udiv i32 [[ADD62]], [[TMP55]] +// CHECK2-NEXT: [[MUL64:%.*]] = mul i32 1, [[DIV63]] +// CHECK2-NEXT: [[TMP56:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB65:%.*]] = sub i32 [[TMP56]], -63 +// CHECK2-NEXT: [[DIV66:%.*]] = udiv i32 [[SUB65]], 64 +// CHECK2-NEXT: [[MUL67:%.*]] = mul i32 [[MUL64]], [[DIV66]] +// CHECK2-NEXT: [[TMP57:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP58:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB68:%.*]] = sub i32 [[TMP57]], [[TMP58]] +// CHECK2-NEXT: [[SUB69:%.*]] = sub i32 [[SUB68]], 1 +// CHECK2-NEXT: [[ADD70:%.*]] = add i32 [[SUB69]], 1 +// CHECK2-NEXT: [[DIV71:%.*]] = udiv i32 [[ADD70]], 1 +// CHECK2-NEXT: [[MUL72:%.*]] = mul i32 [[MUL67]], [[DIV71]] +// CHECK2-NEXT: [[CONV73:%.*]] = zext i32 [[MUL72]] to i64 +// 
CHECK2-NEXT: [[DIV74:%.*]] = sdiv i64 [[TMP51]], [[CONV73]] +// CHECK2-NEXT: [[TMP59:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4 +// CHECK2-NEXT: [[CONV75:%.*]] = sext i32 [[TMP59]] to i64 +// CHECK2-NEXT: [[MUL76:%.*]] = mul nsw i64 [[DIV74]], [[CONV75]] +// CHECK2-NEXT: [[ADD77:%.*]] = add nsw i64 [[CONV59]], [[MUL76]] +// CHECK2-NEXT: [[CONV78:%.*]] = trunc i64 [[ADD77]] to i32 +// CHECK2-NEXT: store i32 [[CONV78]], ptr [[I49]], align 4 +// CHECK2-NEXT: [[TMP60:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[CONV79:%.*]] = sext i32 [[TMP60]] to i64 +// CHECK2-NEXT: [[TMP61:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP62:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP63:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP64:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB80:%.*]] = sub i32 [[TMP63]], [[TMP64]] +// CHECK2-NEXT: [[SUB81:%.*]] = sub i32 [[SUB80]], 1 +// CHECK2-NEXT: [[TMP65:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD82:%.*]] = add i32 [[SUB81]], [[TMP65]] +// CHECK2-NEXT: [[TMP66:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV83:%.*]] = udiv i32 [[ADD82]], [[TMP66]] +// CHECK2-NEXT: [[MUL84:%.*]] = mul i32 1, [[DIV83]] +// CHECK2-NEXT: [[TMP67:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB85:%.*]] = sub i32 [[TMP67]], -63 +// CHECK2-NEXT: [[DIV86:%.*]] = udiv i32 [[SUB85]], 64 +// CHECK2-NEXT: [[MUL87:%.*]] = mul i32 [[MUL84]], [[DIV86]] +// CHECK2-NEXT: [[TMP68:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP69:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB88:%.*]] = sub i32 [[TMP68]], [[TMP69]] +// CHECK2-NEXT: [[SUB89:%.*]] = sub i32 [[SUB88]], 1 +// CHECK2-NEXT: [[ADD90:%.*]] = add i32 [[SUB89]], 1 +// CHECK2-NEXT: [[DIV91:%.*]] = udiv i32 [[ADD90]], 1 +// CHECK2-NEXT: [[MUL92:%.*]] = mul i32 [[MUL87]], [[DIV91]] +// CHECK2-NEXT: [[CONV93:%.*]] = zext i32 [[MUL92]] to i64 +// CHECK2-NEXT: [[DIV94:%.*]] = sdiv i64 [[TMP62]], [[CONV93]] +// CHECK2-NEXT: [[TMP70:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP71:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB95:%.*]] = sub i32 [[TMP70]], [[TMP71]] +// CHECK2-NEXT: [[SUB96:%.*]] = sub i32 [[SUB95]], 1 +// CHECK2-NEXT: [[TMP72:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD97:%.*]] = add i32 [[SUB96]], [[TMP72]] +// CHECK2-NEXT: [[TMP73:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV98:%.*]] = udiv i32 [[ADD97]], [[TMP73]] +// CHECK2-NEXT: [[MUL99:%.*]] = mul i32 1, [[DIV98]] +// CHECK2-NEXT: [[TMP74:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB100:%.*]] = sub i32 [[TMP74]], -63 +// CHECK2-NEXT: [[DIV101:%.*]] = udiv i32 [[SUB100]], 64 +// CHECK2-NEXT: [[MUL102:%.*]] = mul i32 [[MUL99]], [[DIV101]] +// CHECK2-NEXT: [[TMP75:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP76:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB103:%.*]] = sub i32 [[TMP75]], [[TMP76]] +// CHECK2-NEXT: [[SUB104:%.*]] = sub i32 [[SUB103]], 1 +// CHECK2-NEXT: [[ADD105:%.*]] = add i32 [[SUB104]], 1 +// CHECK2-NEXT: [[DIV106:%.*]] = udiv i32 [[ADD105]], 1 +// CHECK2-NEXT: [[MUL107:%.*]] = mul i32 [[MUL102]], [[DIV106]] +// CHECK2-NEXT: [[CONV108:%.*]] = zext i32 [[MUL107]] to i64 +// CHECK2-NEXT: [[MUL109:%.*]] = mul 
nsw i64 [[DIV94]], [[CONV108]] +// CHECK2-NEXT: [[SUB110:%.*]] = sub nsw i64 [[TMP61]], [[MUL109]] +// CHECK2-NEXT: [[TMP77:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB111:%.*]] = sub i32 [[TMP77]], -63 +// CHECK2-NEXT: [[DIV112:%.*]] = udiv i32 [[SUB111]], 64 +// CHECK2-NEXT: [[MUL113:%.*]] = mul i32 1, [[DIV112]] +// CHECK2-NEXT: [[TMP78:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP79:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB114:%.*]] = sub i32 [[TMP78]], [[TMP79]] +// CHECK2-NEXT: [[SUB115:%.*]] = sub i32 [[SUB114]], 1 +// CHECK2-NEXT: [[ADD116:%.*]] = add i32 [[SUB115]], 1 +// CHECK2-NEXT: [[DIV117:%.*]] = udiv i32 [[ADD116]], 1 +// CHECK2-NEXT: [[MUL118:%.*]] = mul i32 [[MUL113]], [[DIV117]] +// CHECK2-NEXT: [[CONV119:%.*]] = zext i32 [[MUL118]] to i64 +// CHECK2-NEXT: [[DIV120:%.*]] = sdiv i64 [[SUB110]], [[CONV119]] +// CHECK2-NEXT: [[TMP80:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[CONV121:%.*]] = sext i32 [[TMP80]] to i64 +// CHECK2-NEXT: [[MUL122:%.*]] = mul nsw i64 [[DIV120]], [[CONV121]] +// CHECK2-NEXT: [[ADD123:%.*]] = add nsw i64 [[CONV79]], [[MUL122]] +// CHECK2-NEXT: [[CONV124:%.*]] = trunc i64 [[ADD123]] to i32 +// CHECK2-NEXT: store i32 [[CONV124]], ptr [[J50]], align 4 +// CHECK2-NEXT: [[TMP81:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP82:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP83:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP84:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB125:%.*]] = sub i32 [[TMP83]], [[TMP84]] +// CHECK2-NEXT: [[SUB126:%.*]] = sub i32 [[SUB125]], 1 +// CHECK2-NEXT: [[TMP85:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD127:%.*]] = add i32 [[SUB126]], [[TMP85]] +// CHECK2-NEXT: [[TMP86:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV128:%.*]] = udiv i32 [[ADD127]], [[TMP86]] +// CHECK2-NEXT: [[MUL129:%.*]] = mul i32 1, [[DIV128]] +// CHECK2-NEXT: [[TMP87:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB130:%.*]] = sub i32 [[TMP87]], -63 +// CHECK2-NEXT: [[DIV131:%.*]] = udiv i32 [[SUB130]], 64 +// CHECK2-NEXT: [[MUL132:%.*]] = mul i32 [[MUL129]], [[DIV131]] +// CHECK2-NEXT: [[TMP88:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP89:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB133:%.*]] = sub i32 [[TMP88]], [[TMP89]] +// CHECK2-NEXT: [[SUB134:%.*]] = sub i32 [[SUB133]], 1 +// CHECK2-NEXT: [[ADD135:%.*]] = add i32 [[SUB134]], 1 +// CHECK2-NEXT: [[DIV136:%.*]] = udiv i32 [[ADD135]], 1 +// CHECK2-NEXT: [[MUL137:%.*]] = mul i32 [[MUL132]], [[DIV136]] +// CHECK2-NEXT: [[CONV138:%.*]] = zext i32 [[MUL137]] to i64 +// CHECK2-NEXT: [[DIV139:%.*]] = sdiv i64 [[TMP82]], [[CONV138]] +// CHECK2-NEXT: [[TMP90:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP91:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB140:%.*]] = sub i32 [[TMP90]], [[TMP91]] +// CHECK2-NEXT: [[SUB141:%.*]] = sub i32 [[SUB140]], 1 +// CHECK2-NEXT: [[TMP92:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD142:%.*]] = add i32 [[SUB141]], [[TMP92]] +// CHECK2-NEXT: [[TMP93:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV143:%.*]] = udiv i32 [[ADD142]], [[TMP93]] +// CHECK2-NEXT: [[MUL144:%.*]] = mul i32 1, [[DIV143]] +// CHECK2-NEXT: [[TMP94:%.*]] 
= load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB145:%.*]] = sub i32 [[TMP94]], -63 +// CHECK2-NEXT: [[DIV146:%.*]] = udiv i32 [[SUB145]], 64 +// CHECK2-NEXT: [[MUL147:%.*]] = mul i32 [[MUL144]], [[DIV146]] +// CHECK2-NEXT: [[TMP95:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP96:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB148:%.*]] = sub i32 [[TMP95]], [[TMP96]] +// CHECK2-NEXT: [[SUB149:%.*]] = sub i32 [[SUB148]], 1 +// CHECK2-NEXT: [[ADD150:%.*]] = add i32 [[SUB149]], 1 +// CHECK2-NEXT: [[DIV151:%.*]] = udiv i32 [[ADD150]], 1 +// CHECK2-NEXT: [[MUL152:%.*]] = mul i32 [[MUL147]], [[DIV151]] +// CHECK2-NEXT: [[CONV153:%.*]] = zext i32 [[MUL152]] to i64 +// CHECK2-NEXT: [[MUL154:%.*]] = mul nsw i64 [[DIV139]], [[CONV153]] +// CHECK2-NEXT: [[SUB155:%.*]] = sub nsw i64 [[TMP81]], [[MUL154]] +// CHECK2-NEXT: [[TMP97:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP98:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP99:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP100:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB156:%.*]] = sub i32 [[TMP99]], [[TMP100]] +// CHECK2-NEXT: [[SUB157:%.*]] = sub i32 [[SUB156]], 1 +// CHECK2-NEXT: [[TMP101:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD158:%.*]] = add i32 [[SUB157]], [[TMP101]] +// CHECK2-NEXT: [[TMP102:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV159:%.*]] = udiv i32 [[ADD158]], [[TMP102]] +// CHECK2-NEXT: [[MUL160:%.*]] = mul i32 1, [[DIV159]] +// CHECK2-NEXT: [[TMP103:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB161:%.*]] = sub i32 [[TMP103]], -63 +// CHECK2-NEXT: [[DIV162:%.*]] = udiv i32 [[SUB161]], 64 +// CHECK2-NEXT: [[MUL163:%.*]] = mul i32 [[MUL160]], [[DIV162]] +// CHECK2-NEXT: [[TMP104:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP105:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB164:%.*]] = sub i32 [[TMP104]], [[TMP105]] +// CHECK2-NEXT: [[SUB165:%.*]] = sub i32 [[SUB164]], 1 +// CHECK2-NEXT: [[ADD166:%.*]] = add i32 [[SUB165]], 1 +// CHECK2-NEXT: [[DIV167:%.*]] = udiv i32 [[ADD166]], 1 +// CHECK2-NEXT: [[MUL168:%.*]] = mul i32 [[MUL163]], [[DIV167]] +// CHECK2-NEXT: [[CONV169:%.*]] = zext i32 [[MUL168]] to i64 +// CHECK2-NEXT: [[DIV170:%.*]] = sdiv i64 [[TMP98]], [[CONV169]] +// CHECK2-NEXT: [[TMP106:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP107:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB171:%.*]] = sub i32 [[TMP106]], [[TMP107]] +// CHECK2-NEXT: [[SUB172:%.*]] = sub i32 [[SUB171]], 1 +// CHECK2-NEXT: [[TMP108:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD173:%.*]] = add i32 [[SUB172]], [[TMP108]] +// CHECK2-NEXT: [[TMP109:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV174:%.*]] = udiv i32 [[ADD173]], [[TMP109]] +// CHECK2-NEXT: [[MUL175:%.*]] = mul i32 1, [[DIV174]] +// CHECK2-NEXT: [[TMP110:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB176:%.*]] = sub i32 [[TMP110]], -63 +// CHECK2-NEXT: [[DIV177:%.*]] = udiv i32 [[SUB176]], 64 +// CHECK2-NEXT: [[MUL178:%.*]] = mul i32 [[MUL175]], [[DIV177]] +// CHECK2-NEXT: [[TMP111:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP112:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB179:%.*]] = 
sub i32 [[TMP111]], [[TMP112]] +// CHECK2-NEXT: [[SUB180:%.*]] = sub i32 [[SUB179]], 1 +// CHECK2-NEXT: [[ADD181:%.*]] = add i32 [[SUB180]], 1 +// CHECK2-NEXT: [[DIV182:%.*]] = udiv i32 [[ADD181]], 1 +// CHECK2-NEXT: [[MUL183:%.*]] = mul i32 [[MUL178]], [[DIV182]] +// CHECK2-NEXT: [[CONV184:%.*]] = zext i32 [[MUL183]] to i64 +// CHECK2-NEXT: [[MUL185:%.*]] = mul nsw i64 [[DIV170]], [[CONV184]] +// CHECK2-NEXT: [[SUB186:%.*]] = sub nsw i64 [[TMP97]], [[MUL185]] +// CHECK2-NEXT: [[TMP113:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB187:%.*]] = sub i32 [[TMP113]], -63 +// CHECK2-NEXT: [[DIV188:%.*]] = udiv i32 [[SUB187]], 64 +// CHECK2-NEXT: [[MUL189:%.*]] = mul i32 1, [[DIV188]] +// CHECK2-NEXT: [[TMP114:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP115:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB190:%.*]] = sub i32 [[TMP114]], [[TMP115]] +// CHECK2-NEXT: [[SUB191:%.*]] = sub i32 [[SUB190]], 1 +// CHECK2-NEXT: [[ADD192:%.*]] = add i32 [[SUB191]], 1 +// CHECK2-NEXT: [[DIV193:%.*]] = udiv i32 [[ADD192]], 1 +// CHECK2-NEXT: [[MUL194:%.*]] = mul i32 [[MUL189]], [[DIV193]] +// CHECK2-NEXT: [[CONV195:%.*]] = zext i32 [[MUL194]] to i64 +// CHECK2-NEXT: [[DIV196:%.*]] = sdiv i64 [[SUB186]], [[CONV195]] +// CHECK2-NEXT: [[TMP116:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB197:%.*]] = sub i32 [[TMP116]], -63 +// CHECK2-NEXT: [[DIV198:%.*]] = udiv i32 [[SUB197]], 64 +// CHECK2-NEXT: [[MUL199:%.*]] = mul i32 1, [[DIV198]] +// CHECK2-NEXT: [[TMP117:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP118:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB200:%.*]] = sub i32 [[TMP117]], [[TMP118]] +// CHECK2-NEXT: [[SUB201:%.*]] = sub i32 [[SUB200]], 1 +// CHECK2-NEXT: [[ADD202:%.*]] = add i32 [[SUB201]], 1 +// CHECK2-NEXT: [[DIV203:%.*]] = udiv i32 [[ADD202]], 1 +// CHECK2-NEXT: [[MUL204:%.*]] = mul i32 [[MUL199]], [[DIV203]] +// CHECK2-NEXT: [[CONV205:%.*]] = zext i32 [[MUL204]] to i64 +// CHECK2-NEXT: [[MUL206:%.*]] = mul nsw i64 [[DIV196]], [[CONV205]] +// CHECK2-NEXT: [[SUB207:%.*]] = sub nsw i64 [[SUB155]], [[MUL206]] +// CHECK2-NEXT: [[TMP119:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP120:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB208:%.*]] = sub i32 [[TMP119]], [[TMP120]] +// CHECK2-NEXT: [[SUB209:%.*]] = sub i32 [[SUB208]], 1 +// CHECK2-NEXT: [[ADD210:%.*]] = add i32 [[SUB209]], 1 +// CHECK2-NEXT: [[DIV211:%.*]] = udiv i32 [[ADD210]], 1 +// CHECK2-NEXT: [[MUL212:%.*]] = mul i32 1, [[DIV211]] +// CHECK2-NEXT: [[CONV213:%.*]] = zext i32 [[MUL212]] to i64 +// CHECK2-NEXT: [[DIV214:%.*]] = sdiv i64 [[SUB207]], [[CONV213]] +// CHECK2-NEXT: [[MUL215:%.*]] = mul nsw i64 [[DIV214]], 64 +// CHECK2-NEXT: [[ADD216:%.*]] = add nsw i64 0, [[MUL215]] +// CHECK2-NEXT: [[CONV217:%.*]] = trunc i64 [[ADD216]] to i32 +// CHECK2-NEXT: store i32 [[CONV217]], ptr [[DOTFLOOR_0_IV_K51]], align 4 +// CHECK2-NEXT: [[TMP121:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[CONV218:%.*]] = zext i32 [[TMP121]] to i64 +// CHECK2-NEXT: [[TMP122:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP123:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP124:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP125:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB219:%.*]] = sub i32 [[TMP124]], 
[[TMP125]] +// CHECK2-NEXT: [[SUB220:%.*]] = sub i32 [[SUB219]], 1 +// CHECK2-NEXT: [[TMP126:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD221:%.*]] = add i32 [[SUB220]], [[TMP126]] +// CHECK2-NEXT: [[TMP127:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV222:%.*]] = udiv i32 [[ADD221]], [[TMP127]] +// CHECK2-NEXT: [[MUL223:%.*]] = mul i32 1, [[DIV222]] +// CHECK2-NEXT: [[TMP128:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB224:%.*]] = sub i32 [[TMP128]], -63 +// CHECK2-NEXT: [[DIV225:%.*]] = udiv i32 [[SUB224]], 64 +// CHECK2-NEXT: [[MUL226:%.*]] = mul i32 [[MUL223]], [[DIV225]] +// CHECK2-NEXT: [[TMP129:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP130:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB227:%.*]] = sub i32 [[TMP129]], [[TMP130]] +// CHECK2-NEXT: [[SUB228:%.*]] = sub i32 [[SUB227]], 1 +// CHECK2-NEXT: [[ADD229:%.*]] = add i32 [[SUB228]], 1 +// CHECK2-NEXT: [[DIV230:%.*]] = udiv i32 [[ADD229]], 1 +// CHECK2-NEXT: [[MUL231:%.*]] = mul i32 [[MUL226]], [[DIV230]] +// CHECK2-NEXT: [[CONV232:%.*]] = zext i32 [[MUL231]] to i64 +// CHECK2-NEXT: [[DIV233:%.*]] = sdiv i64 [[TMP123]], [[CONV232]] +// CHECK2-NEXT: [[TMP131:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP132:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB234:%.*]] = sub i32 [[TMP131]], [[TMP132]] +// CHECK2-NEXT: [[SUB235:%.*]] = sub i32 [[SUB234]], 1 +// CHECK2-NEXT: [[TMP133:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD236:%.*]] = add i32 [[SUB235]], [[TMP133]] +// CHECK2-NEXT: [[TMP134:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV237:%.*]] = udiv i32 [[ADD236]], [[TMP134]] +// CHECK2-NEXT: [[MUL238:%.*]] = mul i32 1, [[DIV237]] +// CHECK2-NEXT: [[TMP135:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB239:%.*]] = sub i32 [[TMP135]], -63 +// CHECK2-NEXT: [[DIV240:%.*]] = udiv i32 [[SUB239]], 64 +// CHECK2-NEXT: [[MUL241:%.*]] = mul i32 [[MUL238]], [[DIV240]] +// CHECK2-NEXT: [[TMP136:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP137:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB242:%.*]] = sub i32 [[TMP136]], [[TMP137]] +// CHECK2-NEXT: [[SUB243:%.*]] = sub i32 [[SUB242]], 1 +// CHECK2-NEXT: [[ADD244:%.*]] = add i32 [[SUB243]], 1 +// CHECK2-NEXT: [[DIV245:%.*]] = udiv i32 [[ADD244]], 1 +// CHECK2-NEXT: [[MUL246:%.*]] = mul i32 [[MUL241]], [[DIV245]] +// CHECK2-NEXT: [[CONV247:%.*]] = zext i32 [[MUL246]] to i64 +// CHECK2-NEXT: [[MUL248:%.*]] = mul nsw i64 [[DIV233]], [[CONV247]] +// CHECK2-NEXT: [[SUB249:%.*]] = sub nsw i64 [[TMP122]], [[MUL248]] +// CHECK2-NEXT: [[TMP138:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP139:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP140:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP141:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB250:%.*]] = sub i32 [[TMP140]], [[TMP141]] +// CHECK2-NEXT: [[SUB251:%.*]] = sub i32 [[SUB250]], 1 +// CHECK2-NEXT: [[TMP142:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD252:%.*]] = add i32 [[SUB251]], [[TMP142]] +// CHECK2-NEXT: [[TMP143:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV253:%.*]] = udiv i32 [[ADD252]], [[TMP143]] +// CHECK2-NEXT: [[MUL254:%.*]] = mul i32 1, [[DIV253]] +// 
CHECK2-NEXT: [[TMP144:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB255:%.*]] = sub i32 [[TMP144]], -63 +// CHECK2-NEXT: [[DIV256:%.*]] = udiv i32 [[SUB255]], 64 +// CHECK2-NEXT: [[MUL257:%.*]] = mul i32 [[MUL254]], [[DIV256]] +// CHECK2-NEXT: [[TMP145:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP146:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB258:%.*]] = sub i32 [[TMP145]], [[TMP146]] +// CHECK2-NEXT: [[SUB259:%.*]] = sub i32 [[SUB258]], 1 +// CHECK2-NEXT: [[ADD260:%.*]] = add i32 [[SUB259]], 1 +// CHECK2-NEXT: [[DIV261:%.*]] = udiv i32 [[ADD260]], 1 +// CHECK2-NEXT: [[MUL262:%.*]] = mul i32 [[MUL257]], [[DIV261]] +// CHECK2-NEXT: [[CONV263:%.*]] = zext i32 [[MUL262]] to i64 +// CHECK2-NEXT: [[DIV264:%.*]] = sdiv i64 [[TMP139]], [[CONV263]] +// CHECK2-NEXT: [[TMP147:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP148:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB265:%.*]] = sub i32 [[TMP147]], [[TMP148]] +// CHECK2-NEXT: [[SUB266:%.*]] = sub i32 [[SUB265]], 1 +// CHECK2-NEXT: [[TMP149:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD267:%.*]] = add i32 [[SUB266]], [[TMP149]] +// CHECK2-NEXT: [[TMP150:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV268:%.*]] = udiv i32 [[ADD267]], [[TMP150]] +// CHECK2-NEXT: [[MUL269:%.*]] = mul i32 1, [[DIV268]] +// CHECK2-NEXT: [[TMP151:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB270:%.*]] = sub i32 [[TMP151]], -63 +// CHECK2-NEXT: [[DIV271:%.*]] = udiv i32 [[SUB270]], 64 +// CHECK2-NEXT: [[MUL272:%.*]] = mul i32 [[MUL269]], [[DIV271]] +// CHECK2-NEXT: [[TMP152:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP153:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB273:%.*]] = sub i32 [[TMP152]], [[TMP153]] +// CHECK2-NEXT: [[SUB274:%.*]] = sub i32 [[SUB273]], 1 +// CHECK2-NEXT: [[ADD275:%.*]] = add i32 [[SUB274]], 1 +// CHECK2-NEXT: [[DIV276:%.*]] = udiv i32 [[ADD275]], 1 +// CHECK2-NEXT: [[MUL277:%.*]] = mul i32 [[MUL272]], [[DIV276]] +// CHECK2-NEXT: [[CONV278:%.*]] = zext i32 [[MUL277]] to i64 +// CHECK2-NEXT: [[MUL279:%.*]] = mul nsw i64 [[DIV264]], [[CONV278]] +// CHECK2-NEXT: [[SUB280:%.*]] = sub nsw i64 [[TMP138]], [[MUL279]] +// CHECK2-NEXT: [[TMP154:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB281:%.*]] = sub i32 [[TMP154]], -63 +// CHECK2-NEXT: [[DIV282:%.*]] = udiv i32 [[SUB281]], 64 +// CHECK2-NEXT: [[MUL283:%.*]] = mul i32 1, [[DIV282]] +// CHECK2-NEXT: [[TMP155:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP156:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB284:%.*]] = sub i32 [[TMP155]], [[TMP156]] +// CHECK2-NEXT: [[SUB285:%.*]] = sub i32 [[SUB284]], 1 +// CHECK2-NEXT: [[ADD286:%.*]] = add i32 [[SUB285]], 1 +// CHECK2-NEXT: [[DIV287:%.*]] = udiv i32 [[ADD286]], 1 +// CHECK2-NEXT: [[MUL288:%.*]] = mul i32 [[MUL283]], [[DIV287]] +// CHECK2-NEXT: [[CONV289:%.*]] = zext i32 [[MUL288]] to i64 +// CHECK2-NEXT: [[DIV290:%.*]] = sdiv i64 [[SUB280]], [[CONV289]] +// CHECK2-NEXT: [[TMP157:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB291:%.*]] = sub i32 [[TMP157]], -63 +// CHECK2-NEXT: [[DIV292:%.*]] = udiv i32 [[SUB291]], 64 +// CHECK2-NEXT: [[MUL293:%.*]] = mul i32 1, [[DIV292]] +// CHECK2-NEXT: [[TMP158:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 
+// CHECK2-NEXT: [[TMP159:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB294:%.*]] = sub i32 [[TMP158]], [[TMP159]] +// CHECK2-NEXT: [[SUB295:%.*]] = sub i32 [[SUB294]], 1 +// CHECK2-NEXT: [[ADD296:%.*]] = add i32 [[SUB295]], 1 +// CHECK2-NEXT: [[DIV297:%.*]] = udiv i32 [[ADD296]], 1 +// CHECK2-NEXT: [[MUL298:%.*]] = mul i32 [[MUL293]], [[DIV297]] +// CHECK2-NEXT: [[CONV299:%.*]] = zext i32 [[MUL298]] to i64 +// CHECK2-NEXT: [[MUL300:%.*]] = mul nsw i64 [[DIV290]], [[CONV299]] +// CHECK2-NEXT: [[SUB301:%.*]] = sub nsw i64 [[SUB249]], [[MUL300]] +// CHECK2-NEXT: [[TMP160:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP161:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP162:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP163:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB302:%.*]] = sub i32 [[TMP162]], [[TMP163]] +// CHECK2-NEXT: [[SUB303:%.*]] = sub i32 [[SUB302]], 1 +// CHECK2-NEXT: [[TMP164:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD304:%.*]] = add i32 [[SUB303]], [[TMP164]] +// CHECK2-NEXT: [[TMP165:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV305:%.*]] = udiv i32 [[ADD304]], [[TMP165]] +// CHECK2-NEXT: [[MUL306:%.*]] = mul i32 1, [[DIV305]] +// CHECK2-NEXT: [[TMP166:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB307:%.*]] = sub i32 [[TMP166]], -63 +// CHECK2-NEXT: [[DIV308:%.*]] = udiv i32 [[SUB307]], 64 +// CHECK2-NEXT: [[MUL309:%.*]] = mul i32 [[MUL306]], [[DIV308]] +// CHECK2-NEXT: [[TMP167:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP168:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB310:%.*]] = sub i32 [[TMP167]], [[TMP168]] +// CHECK2-NEXT: [[SUB311:%.*]] = sub i32 [[SUB310]], 1 +// CHECK2-NEXT: [[ADD312:%.*]] = add i32 [[SUB311]], 1 +// CHECK2-NEXT: [[DIV313:%.*]] = udiv i32 [[ADD312]], 1 +// CHECK2-NEXT: [[MUL314:%.*]] = mul i32 [[MUL309]], [[DIV313]] +// CHECK2-NEXT: [[CONV315:%.*]] = zext i32 [[MUL314]] to i64 +// CHECK2-NEXT: [[DIV316:%.*]] = sdiv i64 [[TMP161]], [[CONV315]] +// CHECK2-NEXT: [[TMP169:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP170:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB317:%.*]] = sub i32 [[TMP169]], [[TMP170]] +// CHECK2-NEXT: [[SUB318:%.*]] = sub i32 [[SUB317]], 1 +// CHECK2-NEXT: [[TMP171:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD319:%.*]] = add i32 [[SUB318]], [[TMP171]] +// CHECK2-NEXT: [[TMP172:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV320:%.*]] = udiv i32 [[ADD319]], [[TMP172]] +// CHECK2-NEXT: [[MUL321:%.*]] = mul i32 1, [[DIV320]] +// CHECK2-NEXT: [[TMP173:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB322:%.*]] = sub i32 [[TMP173]], -63 +// CHECK2-NEXT: [[DIV323:%.*]] = udiv i32 [[SUB322]], 64 +// CHECK2-NEXT: [[MUL324:%.*]] = mul i32 [[MUL321]], [[DIV323]] +// CHECK2-NEXT: [[TMP174:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP175:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB325:%.*]] = sub i32 [[TMP174]], [[TMP175]] +// CHECK2-NEXT: [[SUB326:%.*]] = sub i32 [[SUB325]], 1 +// CHECK2-NEXT: [[ADD327:%.*]] = add i32 [[SUB326]], 1 +// CHECK2-NEXT: [[DIV328:%.*]] = udiv i32 [[ADD327]], 1 +// CHECK2-NEXT: [[MUL329:%.*]] = mul i32 [[MUL324]], [[DIV328]] +// CHECK2-NEXT: [[CONV330:%.*]] = 
zext i32 [[MUL329]] to i64 +// CHECK2-NEXT: [[MUL331:%.*]] = mul nsw i64 [[DIV316]], [[CONV330]] +// CHECK2-NEXT: [[SUB332:%.*]] = sub nsw i64 [[TMP160]], [[MUL331]] +// CHECK2-NEXT: [[TMP176:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP177:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[TMP178:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP179:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB333:%.*]] = sub i32 [[TMP178]], [[TMP179]] +// CHECK2-NEXT: [[SUB334:%.*]] = sub i32 [[SUB333]], 1 +// CHECK2-NEXT: [[TMP180:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD335:%.*]] = add i32 [[SUB334]], [[TMP180]] +// CHECK2-NEXT: [[TMP181:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV336:%.*]] = udiv i32 [[ADD335]], [[TMP181]] +// CHECK2-NEXT: [[MUL337:%.*]] = mul i32 1, [[DIV336]] +// CHECK2-NEXT: [[TMP182:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB338:%.*]] = sub i32 [[TMP182]], -63 +// CHECK2-NEXT: [[DIV339:%.*]] = udiv i32 [[SUB338]], 64 +// CHECK2-NEXT: [[MUL340:%.*]] = mul i32 [[MUL337]], [[DIV339]] +// CHECK2-NEXT: [[TMP183:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP184:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB341:%.*]] = sub i32 [[TMP183]], [[TMP184]] +// CHECK2-NEXT: [[SUB342:%.*]] = sub i32 [[SUB341]], 1 +// CHECK2-NEXT: [[ADD343:%.*]] = add i32 [[SUB342]], 1 +// CHECK2-NEXT: [[DIV344:%.*]] = udiv i32 [[ADD343]], 1 +// CHECK2-NEXT: [[MUL345:%.*]] = mul i32 [[MUL340]], [[DIV344]] +// CHECK2-NEXT: [[CONV346:%.*]] = zext i32 [[MUL345]] to i64 +// CHECK2-NEXT: [[DIV347:%.*]] = sdiv i64 [[TMP177]], [[CONV346]] +// CHECK2-NEXT: [[TMP185:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP186:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_5]], align 4 +// CHECK2-NEXT: [[SUB348:%.*]] = sub i32 [[TMP185]], [[TMP186]] +// CHECK2-NEXT: [[SUB349:%.*]] = sub i32 [[SUB348]], 1 +// CHECK2-NEXT: [[TMP187:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[ADD350:%.*]] = add i32 [[SUB349]], [[TMP187]] +// CHECK2-NEXT: [[TMP188:%.*]] = load i32, ptr [[DOTNEW_STEP7]], align 4 +// CHECK2-NEXT: [[DIV351:%.*]] = udiv i32 [[ADD350]], [[TMP188]] +// CHECK2-NEXT: [[MUL352:%.*]] = mul i32 1, [[DIV351]] +// CHECK2-NEXT: [[TMP189:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB353:%.*]] = sub i32 [[TMP189]], -63 +// CHECK2-NEXT: [[DIV354:%.*]] = udiv i32 [[SUB353]], 64 +// CHECK2-NEXT: [[MUL355:%.*]] = mul i32 [[MUL352]], [[DIV354]] +// CHECK2-NEXT: [[TMP190:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP191:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB356:%.*]] = sub i32 [[TMP190]], [[TMP191]] +// CHECK2-NEXT: [[SUB357:%.*]] = sub i32 [[SUB356]], 1 +// CHECK2-NEXT: [[ADD358:%.*]] = add i32 [[SUB357]], 1 +// CHECK2-NEXT: [[DIV359:%.*]] = udiv i32 [[ADD358]], 1 +// CHECK2-NEXT: [[MUL360:%.*]] = mul i32 [[MUL355]], [[DIV359]] +// CHECK2-NEXT: [[CONV361:%.*]] = zext i32 [[MUL360]] to i64 +// CHECK2-NEXT: [[MUL362:%.*]] = mul nsw i64 [[DIV347]], [[CONV361]] +// CHECK2-NEXT: [[SUB363:%.*]] = sub nsw i64 [[TMP176]], [[MUL362]] +// CHECK2-NEXT: [[TMP192:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB364:%.*]] = sub i32 [[TMP192]], -63 +// CHECK2-NEXT: [[DIV365:%.*]] = udiv i32 [[SUB364]], 64 +// CHECK2-NEXT: [[MUL366:%.*]] = mul i32 1, 
[[DIV365]] +// CHECK2-NEXT: [[TMP193:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP194:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB367:%.*]] = sub i32 [[TMP193]], [[TMP194]] +// CHECK2-NEXT: [[SUB368:%.*]] = sub i32 [[SUB367]], 1 +// CHECK2-NEXT: [[ADD369:%.*]] = add i32 [[SUB368]], 1 +// CHECK2-NEXT: [[DIV370:%.*]] = udiv i32 [[ADD369]], 1 +// CHECK2-NEXT: [[MUL371:%.*]] = mul i32 [[MUL366]], [[DIV370]] +// CHECK2-NEXT: [[CONV372:%.*]] = zext i32 [[MUL371]] to i64 +// CHECK2-NEXT: [[DIV373:%.*]] = sdiv i64 [[SUB363]], [[CONV372]] +// CHECK2-NEXT: [[TMP195:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_14]], align 4 +// CHECK2-NEXT: [[SUB374:%.*]] = sub i32 [[TMP195]], -63 +// CHECK2-NEXT: [[DIV375:%.*]] = udiv i32 [[SUB374]], 64 +// CHECK2-NEXT: [[MUL376:%.*]] = mul i32 1, [[DIV375]] +// CHECK2-NEXT: [[TMP196:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP197:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB377:%.*]] = sub i32 [[TMP196]], [[TMP197]] +// CHECK2-NEXT: [[SUB378:%.*]] = sub i32 [[SUB377]], 1 +// CHECK2-NEXT: [[ADD379:%.*]] = add i32 [[SUB378]], 1 +// CHECK2-NEXT: [[DIV380:%.*]] = udiv i32 [[ADD379]], 1 +// CHECK2-NEXT: [[MUL381:%.*]] = mul i32 [[MUL376]], [[DIV380]] +// CHECK2-NEXT: [[CONV382:%.*]] = zext i32 [[MUL381]] to i64 +// CHECK2-NEXT: [[MUL383:%.*]] = mul nsw i64 [[DIV373]], [[CONV382]] +// CHECK2-NEXT: [[SUB384:%.*]] = sub nsw i64 [[SUB332]], [[MUL383]] +// CHECK2-NEXT: [[TMP198:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP199:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB385:%.*]] = sub i32 [[TMP198]], [[TMP199]] +// CHECK2-NEXT: [[SUB386:%.*]] = sub i32 [[SUB385]], 1 +// CHECK2-NEXT: [[ADD387:%.*]] = add i32 [[SUB386]], 1 +// CHECK2-NEXT: [[DIV388:%.*]] = udiv i32 [[ADD387]], 1 +// CHECK2-NEXT: [[MUL389:%.*]] = mul i32 1, [[DIV388]] +// CHECK2-NEXT: [[CONV390:%.*]] = zext i32 [[MUL389]] to i64 +// CHECK2-NEXT: [[DIV391:%.*]] = sdiv i64 [[SUB384]], [[CONV390]] +// CHECK2-NEXT: [[TMP200:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP201:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_16]], align 4 +// CHECK2-NEXT: [[SUB392:%.*]] = sub i32 [[TMP200]], [[TMP201]] +// CHECK2-NEXT: [[SUB393:%.*]] = sub i32 [[SUB392]], 1 +// CHECK2-NEXT: [[ADD394:%.*]] = add i32 [[SUB393]], 1 +// CHECK2-NEXT: [[DIV395:%.*]] = udiv i32 [[ADD394]], 1 +// CHECK2-NEXT: [[MUL396:%.*]] = mul i32 1, [[DIV395]] +// CHECK2-NEXT: [[CONV397:%.*]] = zext i32 [[MUL396]] to i64 +// CHECK2-NEXT: [[MUL398:%.*]] = mul nsw i64 [[DIV391]], [[CONV397]] +// CHECK2-NEXT: [[SUB399:%.*]] = sub nsw i64 [[SUB301]], [[MUL398]] +// CHECK2-NEXT: [[MUL400:%.*]] = mul nsw i64 [[SUB399]], 1 +// CHECK2-NEXT: [[ADD401:%.*]] = add nsw i64 [[CONV218]], [[MUL400]] +// CHECK2-NEXT: [[CONV402:%.*]] = trunc i64 [[ADD401]] to i32 +// CHECK2-NEXT: store i32 [[CONV402]], ptr [[DOTTILE_0_IV_K52]], align 4 +// CHECK2-NEXT: [[TMP202:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_8]], align 4 +// CHECK2-NEXT: [[TMP203:%.*]] = load i32, ptr [[DOTTILE_0_IV_K52]], align 4 +// CHECK2-NEXT: [[TMP204:%.*]] = load i32, ptr [[DOTNEW_STEP10]], align 4 +// CHECK2-NEXT: [[MUL403:%.*]] = mul i32 [[TMP203]], [[TMP204]] +// CHECK2-NEXT: [[ADD404:%.*]] = add i32 [[TMP202]], [[MUL403]] +// CHECK2-NEXT: store i32 [[ADD404]], ptr [[K]], align 4 +// CHECK2-NEXT: [[TMP205:%.*]] = load i32, ptr [[I49]], align 4 +// CHECK2-NEXT: [[TMP206:%.*]] = load i32, ptr [[J50]], 
align 4 +// CHECK2-NEXT: [[TMP207:%.*]] = load i32, ptr [[K]], align 4 +// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP205]], i32 noundef [[TMP206]], i32 noundef [[TMP207]]) +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP208:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: [[ADD405:%.*]] = add nsw i64 [[TMP208]], 1 +// CHECK2-NEXT: store i64 [[ADD405]], ptr [[DOTOMP_IV]], align 8 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]]) +// CHECK2-NEXT: ret void + diff --git a/clang/test/OpenMP/interchange_messages.cpp b/clang/test/OpenMP/interchange_messages.cpp index 175c2f1efa744..7b9a4963445e4 100644 --- a/clang/test/OpenMP/interchange_messages.cpp +++ b/clang/test/OpenMP/interchange_messages.cpp @@ -2,12 +2,102 @@ void func() { + // expected-error@+1 {{expected '('}} + #pragma omp interchange permutation + for (int i = 0; i < 7; ++i) + for (int j = 0; j < 13; ++j) + ; + + // expected-error@+2 {{expected expression}} + // expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}} + #pragma omp interchange permutation( + for (int i = 0; i < 7; ++i) + for (int j = 0; j < 13; ++j) + ; + + // expected-error@+1 {{expected expression}} + #pragma omp interchange permutation() + for (int i = 0; i < 7; ++i) + for (int j = 0; j < 13; ++j) + ; + + // expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}} + #pragma omp interchange permutation(1 + for (int i = 0; i < 7; ++i) + for (int j = 0; j < 13; ++j) + ; + + // expected-error@+2 {{expected expression}} + // expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}} + #pragma omp interchange permutation(1, + for (int i = 0; i < 7; ++i) + for (int j = 0; j < 13; ++j) + ; + + // expected-error@+1 {{expected expression}} + #pragma omp interchange permutation(1,) + for (int i = 0; i < 7; ++i) + for (int j = 0; j < 13; ++j) + ; + + // expected-error@+2 {{expected expression}} + // expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}} + #pragma omp interchange permutation(5+ + for (int i = 0; i < 7; ++i) + for (int j = 0; j < 13; ++j) + ; + + // expected-error@+1 {{expected expression}} + #pragma omp interchange permutation(5+) + for (int i = 0; i < 7; ++i) + for (int j = 0; j < 13; ++j) + ; + + // expected-error@+1 {{expected expression}} + #pragma omp interchange permutation(for) + for (int i = 0; i < 7; ++i) + for (int j = 0; j < 13; ++j) + ; + + // expected-error@+1 {{permutation index must be at least 1 and at most 1}} + #pragma omp interchange permutation(0) + for (int i = 0; i < 7; ++i) + for (int j = 0; j < 13; ++j) + ; + + // expected-error@+1 {{permutation index must be at least 1 and at most 2}} + #pragma omp interchange permutation(1,3) + for (int i = 0; i < 7; ++i) + for (int j = 0; j < 13; ++j) + ; + + // expected-error@+1 {{index 1 must appear exactly once in the permutation clause}} + #pragma omp interchange permutation(1,1) + for (int i = 0; i < 7; ++i) + for (int j = 0; j < 13; ++j) + ; + + // expected-error@+4 {{expression is not an integral constant expression}} + // expected-note@+3 
{{read of non-const variable 'a' is not allowed in a constant expression}} + // expected-note@+1 {{declared here}} + int a; + #pragma omp interchange permutation(a) + for (int i = 0; i < 7; ++i) + for (int j = 0; j < 13; ++j) + ; + // expected-warning@+1 {{extra tokens at the end of '#pragma omp interchange' are ignored}} #pragma omp interchange foo for (int i = 0; i < 7; ++i) for (int j = 0; j < 13; ++j) ; + // expected-error@+1 {{directive '#pragma omp interchange' cannot contain more than one 'permutation' clause}} + #pragma omp interchange permutation(2,1) permutation(2,1) + for (int i = 0; i < 7; ++i) + for (int j = 0; j < 13; ++j) + ; + // expected-error@+1 {{unexpected OpenMP clause 'collapse' in directive '#pragma omp interchange'}} #pragma omp interchange collapse(2) for (int i = 0; i < 7; ++i) @@ -75,3 +165,67 @@ void func() { ; } + + +template +static void templated_func() { + // In a template context, but expression itself not instantiation-dependent + + // expected-error@+1 {{permutation index must be at least 1 and at most 2}} + #pragma omp interchange permutation(0,1) + for (int i = 0; i < 7; ++i) + for (int j = 0; j<7; ++j) + ; + + // expected-error@+1 {{index 1 must appear exactly once in the permutation clause}} + #pragma omp interchange permutation(1,1) + for (int i = 0; i < 7; ++i) + for (int j = 0; j<7; ++j) + ; +} + + + +template +static void templated_func_value_dependent() { + // expected-error@+1 {{permutation index must be at least 1 and at most 2}} + #pragma omp interchange permutation(S,S+1) + for (int i = 0; i < 7; ++i) + for (int j = 0; j<7; ++j) + ; + + // expected-error@+1 {{index 1 must appear exactly once in the permutation clause}} + #pragma omp interchange permutation(S+1,S+1) + for (int i = 0; i < 7; ++i) + for (int j = 0; j<7; ++j) + ; +} + + +template +static void templated_func_type_dependent() { + constexpr T s = 0; + + // expected-error@+1 {{permutation index must be at least 1 and at most 2}} + #pragma omp interchange permutation(s,s+1) + for (int i = 0; i < 7; ++i) + for (int j = 0; j<7; ++j) + ; + + // expected-error@+1 {{index 1 must appear exactly once in the permutation clause}} + #pragma omp interchange permutation(s+1,s+1) + for (int i = 0; i < 7; ++i) + for (int j = 0; j<7; ++j) + ; +} + + +void template_inst() { + // expected-note@+1 {{in instantiation of function template specialization 'templated_func' requested here}} + templated_func(); + // expected-note@+1 {{in instantiation of function template specialization 'templated_func_value_dependent<0>' requested here}} + templated_func_value_dependent<0>(); + // expected-note@+1 {{in instantiation of function template specialization 'templated_func_type_dependent' requested here}} + templated_func_type_dependent(); +} + diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index e6c323775c999..c282a9071391e 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -2387,6 +2387,12 @@ void OMPClauseEnqueue::VisitOMPSizesClause(const OMPSizesClause *C) { Visitor->AddStmt(E); } +void OMPClauseEnqueue::VisitOMPPermutationClause( + const OMPPermutationClause *C) { + for (auto E : C->getArgsRefs()) + Visitor->AddStmt(E); +} + void OMPClauseEnqueue::VisitOMPFullClause(const OMPFullClause *C) {} void OMPClauseEnqueue::VisitOMPPartialClause(const OMPPartialClause *C) { diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp index c93767eb8a550..91c1b08dd8c2f 100644 --- a/flang/lib/Lower/OpenMP/Clauses.cpp +++ 
b/flang/lib/Lower/OpenMP/Clauses.cpp @@ -1161,6 +1161,12 @@ Sizes make(const parser::OmpClause::Sizes &inp, return Sizes{/*SizeList=*/makeList(inp.v, makeExprFn(semaCtx))}; } +Permutation make(const parser::OmpClause::Permutation &inp, + semantics::SemanticsContext &semaCtx) { + // inp.v -> std::list + return Permutation{/*ArgList=*/makeList(inp.v, makeExprFn(semaCtx))}; +} + TaskReduction make(const parser::OmpClause::TaskReduction &inp, semantics::SemanticsContext &semaCtx) { // inp.v -> parser::OmpReductionClause diff --git a/flang/lib/Lower/OpenMP/Clauses.h b/flang/lib/Lower/OpenMP/Clauses.h index 51bf0eab0f8d0..62f3df3e3ee95 100644 --- a/flang/lib/Lower/OpenMP/Clauses.h +++ b/flang/lib/Lower/OpenMP/Clauses.h @@ -250,6 +250,7 @@ using Shared = tomp::clause::SharedT; using Simdlen = tomp::clause::SimdlenT; using Simd = tomp::clause::SimdT; using Sizes = tomp::clause::SizesT; +using Permutation = tomp::clause::PermutationT; using TaskReduction = tomp::clause::TaskReductionT; using ThreadLimit = tomp::clause::ThreadLimitT; using Threads = tomp::clause::ThreadsT; diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index cc2930cbd7ded..214d6b4a91087 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -350,6 +350,8 @@ TYPE_PARSER( parenthesized(scalarIntConstantExpr))) || "SIZES" >> construct(construct( parenthesized(nonemptyList(scalarIntExpr)))) || + "PERMUTATION" >> construct(construct( + parenthesized(nonemptyList(scalarIntExpr)))) || "THREADS" >> construct(construct()) || "THREAD_LIMIT" >> construct(construct( parenthesized(scalarIntExpr))) || diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 6eec4d5d31751..cf90c92bbf3c4 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -2402,6 +2402,7 @@ CHECK_SIMPLE_CLAUSE(Relaxed, OMPC_relaxed) CHECK_SIMPLE_CLAUSE(SeqCst, OMPC_seq_cst) CHECK_SIMPLE_CLAUSE(Simd, OMPC_simd) CHECK_SIMPLE_CLAUSE(Sizes, OMPC_sizes) +CHECK_SIMPLE_CLAUSE(Permutation, OMPC_permutation) CHECK_SIMPLE_CLAUSE(TaskReduction, OMPC_task_reduction) CHECK_SIMPLE_CLAUSE(Uniform, OMPC_uniform) CHECK_SIMPLE_CLAUSE(Unknown, OMPC_unknown) diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h index 43b2fb558e0ea..ac34ddafc5e72 100644 --- a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h +++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h @@ -956,6 +956,14 @@ struct PartialT { OPT(UnrollFactor) v; }; +// V6.0: `permutation` clause +template // +struct PermutationT { + using ArgList = ListT; + using WrapperTrait = std::true_type; + ArgList v; +}; + // V5.2: [12.4] `priority` clause template // struct PriorityT { @@ -1267,9 +1275,9 @@ using WrapperClausesT = std::variant< NovariantsT, NumTeamsT, NumThreadsT, OrderedT, PartialT, PriorityT, PrivateT, ProcBindT, SafelenT, SeverityT, SharedT, - SimdlenT, SizesT, ThreadLimitT, - UniformT, UpdateT, UseDeviceAddrT, - UseDevicePtrT, UsesAllocatorsT>; + SimdlenT, SizesT, PermutationT, + ThreadLimitT, UniformT, UpdateT, + UseDeviceAddrT, UseDevicePtrT, UsesAllocatorsT>; template using UnionOfAllClausesT = typename type::Union< // diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td index fcf087d1f9c6e..f2f09812a8690 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMP.td +++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td @@ -349,6 +349,11 @@ def OMPC_Partial: Clause<"partial"> { 
let flangClass = "ScalarIntConstantExpr"; let isValueOptional = true; } +def OMPC_Permutation: Clause<"permutation"> { + let clangClass = "OMPPermutationClause"; + let flangClass = "ScalarIntExpr"; + let isValueList = true; +} def OMPC_Priority : Clause<"priority"> { let clangClass = "OMPPriorityClause"; let flangClass = "ScalarIntExpr"; @@ -766,6 +771,9 @@ def OMP_For : Directive<"for"> { let category = CA_Executable; } def OMP_Interchange : Directive<"interchange"> { + let allowedOnceClauses = [ + VersionedClause, + ]; let association = AS_Loop; let category = CA_Executable; } From fa3258ecb8f18702bb45fa7f7c5c436be9e575cc Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 9 Oct 2024 13:58:58 +0100 Subject: [PATCH 013/119] [VPlan] Sink retrieving legacy costs to more specific computeCost impls. (#109708) Make legacy cost retrieval independent of getInstructionForCost by sinking it to more specific ::computeCost implementation (specifically VPInterleaveRecipe::computeCost and VPSingleDefRecipe::computeCost). Inline getInstructionForCost to VPRecipeBase::cost(), as it is now only used to decide which recipes to skip during cost computation and when to apply forced costs. PR: https://github.com/llvm/llvm-project/pull/109708 --- llvm/lib/Transforms/Vectorize/VPlan.h | 16 ++++ .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 80 ++++++++++++------- 2 files changed, 65 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 97677f97b90da..68a62638b9d58 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -930,6 +930,10 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { const Instruction *getUnderlyingInstr() const { return cast(getUnderlyingValue()); } + + /// Return the cost of this VPSingleDefRecipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; }; /// Class to record LLVM IR flag for a recipe along with it. @@ -1411,6 +1415,10 @@ class VPIRInstruction : public VPRecipeBase { void execute(VPTransformState &State) override; + /// Return the cost of this VPIRInstruction. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + Instruction &getInstruction() { return I; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -2391,6 +2399,10 @@ class VPInterleaveRecipe : public VPRecipeBase { /// Generate the wide load or store, and shuffles. void execute(VPTransformState &State) override; + /// Return the cost of this VPInterleaveRecipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, @@ -2624,6 +2636,10 @@ class VPBranchOnMaskRecipe : public VPRecipeBase { /// conditional branch. void execute(VPTransformState &State) override; + /// Return the cost of this VPBranchOnMaskRecipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. 
void print(raw_ostream &O, const Twine &Indent, diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 2468616be0bd7..ba94cd2958766 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -280,33 +280,28 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB, insertBefore(BB, I); } -/// Return the underlying instruction to be used for computing \p R's cost via -/// the legacy cost model. Return nullptr if there's no suitable instruction. -static Instruction *getInstructionForCost(const VPRecipeBase *R) { - if (auto *S = dyn_cast(R)) - return dyn_cast_or_null(S->getUnderlyingValue()); - if (auto *IG = dyn_cast(R)) - return IG->getInsertPos(); - // Currently the legacy cost model only calculates the instruction cost with - // underlying instruction. Removing the WidenMem here will prevent - // force-target-instruction-cost overwriting the cost of recipe with - // underlying instruction which is inconsistent with the legacy model. - // TODO: Remove WidenMem from this function when we don't need to compare to - // the legacy model. - if (auto *WidenMem = dyn_cast(R)) - return &WidenMem->getIngredient(); - return nullptr; -} - InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) { - auto *UI = getInstructionForCost(this); - if (UI && Ctx.skipCostComputation(UI, VF.isVector())) - return 0; - - InstructionCost RecipeCost = computeCost(VF, Ctx); - if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 && - RecipeCost.isValid()) - RecipeCost = InstructionCost(ForceTargetInstructionCost); + // Get the underlying instruction for the recipe, if there is one. It is used + // to + // * decide if cost computation should be skipped for this recipe, + // * apply forced target instruction cost. + Instruction *UI = nullptr; + if (auto *S = dyn_cast(this)) + UI = dyn_cast_or_null(S->getUnderlyingValue()); + else if (auto *IG = dyn_cast(this)) + UI = IG->getInsertPos(); + else if (auto *WidenMem = dyn_cast(this)) + UI = &WidenMem->getIngredient(); + + InstructionCost RecipeCost; + if (UI && Ctx.skipCostComputation(UI, VF.isVector())) { + RecipeCost = 0; + } else { + RecipeCost = computeCost(VF, Ctx); + if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 && + RecipeCost.isValid()) + RecipeCost = InstructionCost(ForceTargetInstructionCost); + } LLVM_DEBUG({ dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": "; @@ -317,11 +312,14 @@ InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) { InstructionCost VPRecipeBase::computeCost(ElementCount VF, VPCostContext &Ctx) const { - // Compute the cost for the recipe falling back to the legacy cost model using - // the underlying instruction. If there is no underlying instruction, returns - // 0. - Instruction *UI = getInstructionForCost(this); - if (UI && isa(this)) { + llvm_unreachable("subclasses should implement computeCost"); +} + +InstructionCost VPSingleDefRecipe::computeCost(ElementCount VF, + VPCostContext &Ctx) const { + Instruction *UI = dyn_cast_or_null(getUnderlyingValue()); + if (isa(this)) { + assert(UI && "VPReplicateRecipe must have an underlying instruction"); // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan // transform, avoid computing their cost multiple times for now. 
Ctx.SkipCostComputation.insert(UI); @@ -870,6 +868,13 @@ void VPIRInstruction::execute(VPTransformState &State) { State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator())); } +InstructionCost VPIRInstruction::computeCost(ElementCount VF, + VPCostContext &Ctx) const { + // The recipe wraps an existing IR instruction on the border of VPlan's scope, + // hence it does not contribute to the cost-modeling for the VPlan. + return 0; +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPIRInstruction::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { @@ -2210,6 +2215,14 @@ void VPBranchOnMaskRecipe::execute(VPTransformState &State) { ReplaceInstWithInst(CurrentTerminator, CondBr); } +InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF, + VPCostContext &Ctx) const { + // The legacy cost model doesn't assign costs to branches for individual + // replicate regions. Match the current behavior in the VPlan cost model for + // now. + return 0; +} + void VPPredInstPHIRecipe::execute(VPTransformState &State) { assert(State.Lane && "Predicated instruction PHI works per instance."); Instruction *ScalarPredInst = @@ -2892,6 +2905,11 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, } #endif +InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF, + VPCostContext &Ctx) const { + return Ctx.getLegacyCost(IG->getInsertPos(), VF); +} + void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { Value *Start = getStartValue()->getLiveInIRValue(); PHINode *Phi = PHINode::Create(Start->getType(), 2, "index"); From c47f3e8c7027fbf13495dc865c28d852bf77836d Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 9 Oct 2024 13:48:17 +0100 Subject: [PATCH 014/119] [X86] combineSelect - Fold select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A) Matches what we already do in LowerVSETCC to reuse an existing constant Fixes #110875 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 13 ++++ llvm/test/CodeGen/X86/vselect-pcmp.ll | 89 ++++++++++++------------- 2 files changed, 57 insertions(+), 45 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index fd8291bfaea7c..ddbe82b1de5cf 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -46915,6 +46915,19 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(CondVT, CondNot), RHS, LHS); + // select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A) + if (Cond.getOpcode() == X86ISD::PCMPEQ && + Cond.getOperand(0).getOpcode() == ISD::AND && + ISD::isBuildVectorAllZeros(Cond.getOperand(1).getNode()) && + isConstantPowerOf2(Cond.getOperand(0).getOperand(1), + Cond.getScalarValueSizeInBits(), + /*AllowUndefs=*/true) && + Cond.hasOneUse()) { + Cond = DAG.getNode(X86ISD::PCMPEQ, DL, CondVT, Cond.getOperand(0), + Cond.getOperand(0).getOperand(1)); + return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS); + } + // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the // signbit. 
if (Cond.getOpcode() == X86ISD::PCMPGT && diff --git a/llvm/test/CodeGen/X86/vselect-pcmp.ll b/llvm/test/CodeGen/X86/vselect-pcmp.ll index 1cf59ea2ab7ad..eecb298ca3ec4 100644 --- a/llvm/test/CodeGen/X86/vselect-pcmp.ll +++ b/llvm/test/CodeGen/X86/vselect-pcmp.ll @@ -643,18 +643,18 @@ define <16 x i8> @blend_splat1_mask_cond_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x ; ; AVX512F-LABEL: blend_splat1_mask_cond_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 +; AVX512F-NEXT: vpblendvb %xmm0, %xmm2, %xmm1, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: blend_splat1_mask_cond_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm0 = xmm2 ^ (xmm0 & (xmm1 ^ xmm2)) +; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm0 = xmm1 ^ (xmm0 & (xmm2 ^ xmm1)) ; AVX512VL-NEXT: retq ; ; XOP-LABEL: blend_splat1_mask_cond_v16i8: @@ -795,18 +795,18 @@ define <32 x i8> @blend_splatmax_mask_cond_v32i8(<32 x i8> %x, <32 x i8> %y, <32 ; ; AVX512F-LABEL: blend_splatmax_mask_cond_v32i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 +; AVX512F-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: blend_splatmax_mask_cond_v32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm1 ^ ymm2)) +; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & (ymm2 ^ ymm1)) ; AVX512VL-NEXT: retq ; ; XOP-LABEL: blend_splatmax_mask_cond_v32i8: @@ -972,18 +972,18 @@ define <16 x i8> @blend_splat_mask_cond_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x ; ; AVX512F-LABEL: blend_splat_mask_cond_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512F-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 +; AVX512F-NEXT: vpblendvb %xmm0, %xmm2, %xmm1, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: blend_splat_mask_cond_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; 
AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VL-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm0 = xmm2 ^ (xmm0 & (xmm1 ^ xmm2)) +; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm0 = xmm1 ^ (xmm0 & (xmm2 ^ xmm1)) ; AVX512VL-NEXT: retq ; ; XOP-LABEL: blend_splat_mask_cond_v16i8: @@ -1002,10 +1002,10 @@ define <16 x i8> @blend_splat_mask_cond_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x define <2 x i64> @blend_mask_cond_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) { ; AVX1-LABEL: blend_mask_cond_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,4] +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: blend_mask_cond_v2i64: @@ -1126,26 +1126,26 @@ define <8 x i16> @blend_mask_cond_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %z define <16 x i8> @blend_mask_cond_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %z) { ; AVX12-LABEL: blend_mask_cond_v16i8: ; AVX12: # %bb.0: -; AVX12-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX12-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX12-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,4,4,4,4,2,2,2,2] +; AVX12-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX12-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 -; AVX12-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 +; AVX12-NEXT: vpblendvb %xmm0, %xmm2, %xmm1, %xmm0 ; AVX12-NEXT: retq ; ; AVX512F-LABEL: blend_mask_cond_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,4,4,4,4,2,2,2,2] +; AVX512F-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 +; AVX512F-NEXT: vpblendvb %xmm0, %xmm2, %xmm1, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: blend_mask_cond_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,4,4,4,4,2,2,2,2] +; AVX512VL-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm0 = xmm2 ^ (xmm0 & (xmm1 ^ xmm2)) +; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm0 = xmm1 ^ (xmm0 & (xmm2 ^ xmm1)) ; AVX512VL-NEXT: retq ; ; XOP-LABEL: blend_mask_cond_v16i8: @@ -1326,26 +1326,26 @@ define <32 x i8> @blend_mask_cond_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %z ; ; AVX2-LABEL: blend_mask_cond_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,4,4,4,4,2,2,2,2,1,2,4,8,16,32,64,128,4,4,4,4,128,4,2,16] +; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 +; AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: blend_mask_cond_v32i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = 
[1,2,4,8,16,32,64,128,4,4,4,4,2,2,2,2,1,2,4,8,16,32,64,128,4,4,4,4,128,4,2,16] +; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 +; AVX512F-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: blend_mask_cond_v32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,4,4,4,4,2,2,2,2,1,2,4,8,16,32,64,128,4,4,4,4,128,4,2,16] +; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm1 ^ ymm2)) +; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & (ymm2 ^ ymm1)) ; AVX512VL-NEXT: retq ; ; XOP-LABEL: blend_mask_cond_v32i8: @@ -1736,17 +1736,16 @@ define <64 x i8> @PR110875(<32 x i8> %a0, <32 x i8> %a1, i64 %a2) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovq %rdi, %xmm2 ; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23] -; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19] +; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19] +; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23] ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm5 = [20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20] +; AVX2-NEXT: vpblendvb %ymm3, %ymm5, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpblendvb %ymm2, %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: PR110875: From 15dc2d5c5e55e6f0b9e3a78d352698b8873f2566 Mon Sep 17 00:00:00 2001 From: Victor Mustya Date: Wed, 9 Oct 2024 06:11:55 -0700 Subject: [PATCH 015/119] [IR] Prevent implicit SymbolTableListTraits template instantiation (#111600) The `SymbolTableListTraits` template is explicitly instantiated for the following types: * `llvm/lib/IR/Function.cpp` - `BasicBlock` * `llvm/lib/IR/Module.cpp` - `Function` - `GlobalAlias` - `GlobalIFunc` - `GlobalVariable` When LLVM is built on Windows with the `LLVM_EXPORT_SYMBOLS_FOR_PLUGINS` option enabled, the implicit instantiation of the template prevents the `SymbolTableListTraits` template from being exported. This causes link errors when the template or IR API is used in a plugin. This change prevents the template being implicitly instantiated for these types. 
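For illustration, the general `extern template` idiom this change relies on looks roughly like the sketch below. The names (`ListTraits`, `Node`) are hypothetical, simplified stand-ins for `SymbolTableListTraits` and its node types; this is not the actual LLVM code, just a minimal example of how an explicit instantiation declaration suppresses implicit instantiation.

```cpp
// traits.h -- hypothetical, simplified illustration of the idiom.
template <typename NodeTy> class ListTraits {
public:
  void addNode(NodeTy *N);
};

struct Node {};

// Tell every including translation unit NOT to instantiate ListTraits<Node>
// implicitly; the single explicit instantiation below provides the
// definitions instead.
extern template class ListTraits<Node>;

// traits.cpp -- the one explicit instantiation definition.
template <typename NodeTy> void ListTraits<NodeTy>::addNode(NodeTy *) {}
template class ListTraits<Node>;
```

The intent is that only the translation unit containing the explicit instantiation definition emits the template's symbols, which every other unit (including plugins) then references rather than re-instantiates.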
--- llvm/include/llvm/IR/SymbolTableListTraits.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/llvm/include/llvm/IR/SymbolTableListTraits.h b/llvm/include/llvm/IR/SymbolTableListTraits.h index bd31fca5e525b..fcf6f0fb15280 100644 --- a/llvm/include/llvm/IR/SymbolTableListTraits.h +++ b/llvm/include/llvm/IR/SymbolTableListTraits.h @@ -106,6 +106,15 @@ class SymbolTableListTraits : public ilist_alloc_traits { static ValueSymbolTable *toPtr(ValueSymbolTable &R) { return &R; } }; +// The SymbolTableListTraits template is explicitly instantiated for the +// following data types, so add extern template statements to prevent implicit +// instantiation. +extern template class SymbolTableListTraits; +extern template class SymbolTableListTraits; +extern template class SymbolTableListTraits; +extern template class SymbolTableListTraits; +extern template class SymbolTableListTraits; + /// List that automatically updates parent links and symbol tables. /// /// When nodes are inserted into and removed from this list, the associated From d25f1a19c8cc68f3ff4659192605d39c35474cc8 Mon Sep 17 00:00:00 2001 From: simpal01 Date: Wed, 9 Oct 2024 14:19:11 +0100 Subject: [PATCH 016/119] Add 64bit atomic check in the is_always_lock_free_pass test. (#111540) Currently this test is completely xfailed as part of the patch https://github.com/llvm/llvm-project/pull/106077. But this test works on A and R profile, not in v7M profile. Because the test contain cases in which m-profile will fail for atomic types greater than 4 bytes in size. --- .../std/atomics/atomics.lockfree/is_always_lock_free.pass.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/test/std/atomics/atomics.lockfree/is_always_lock_free.pass.cpp b/libcxx/test/std/atomics/atomics.lockfree/is_always_lock_free.pass.cpp index 1ebe31375079b..e922bc7413514 100644 --- a/libcxx/test/std/atomics/atomics.lockfree/is_always_lock_free.pass.cpp +++ b/libcxx/test/std/atomics/atomics.lockfree/is_always_lock_free.pass.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++03, c++11, c++14 -// XFAIL: LIBCXX-PICOLIBC-FIXME +// XFAIL: !has-64-bit-atomics // // From 1e357cde4836d034d2f7a6d9af099eef23271756 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 9 Oct 2024 17:23:50 +0400 Subject: [PATCH 017/119] AMDGPU: Use pointer types more consistently (#111651) This was using addrspace 0 and 1 pointers interchangably. This works out since they happen to use the same size, but consistently query or use the correct one. --- .../lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp | 16 +++++++--------- .../AMDGPU/lower-ctor-dtor-constexpr-alias.ll | 2 +- llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll | 2 +- .../CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll | 2 +- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp index 6e878a9701876..ea11002bb6a5f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp @@ -77,12 +77,12 @@ static void createInitOrFiniCalls(Function &F, bool IsCtor) { auto *LoopBB = BasicBlock::Create(C, "while.entry", &F); auto *ExitBB = BasicBlock::Create(C, "while.end", &F); Type *PtrTy = IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS); + ArrayType *PtrArrayTy = ArrayType::get(PtrTy, 0); auto *Begin = M.getOrInsertGlobal( - IsCtor ? 
"__init_array_start" : "__fini_array_start", - ArrayType::get(PtrTy, 0), [&]() { + IsCtor ? "__init_array_start" : "__fini_array_start", PtrArrayTy, [&]() { return new GlobalVariable( - M, ArrayType::get(PtrTy, 0), + M, PtrArrayTy, /*isConstant=*/true, GlobalValue::ExternalLinkage, /*Initializer=*/nullptr, IsCtor ? "__init_array_start" : "__fini_array_start", @@ -90,10 +90,9 @@ static void createInitOrFiniCalls(Function &F, bool IsCtor) { /*AddressSpace=*/AMDGPUAS::GLOBAL_ADDRESS); }); auto *End = M.getOrInsertGlobal( - IsCtor ? "__init_array_end" : "__fini_array_end", - ArrayType::get(PtrTy, 0), [&]() { + IsCtor ? "__init_array_end" : "__fini_array_end", PtrArrayTy, [&]() { return new GlobalVariable( - M, ArrayType::get(PtrTy, 0), + M, PtrArrayTy, /*isConstant=*/true, GlobalValue::ExternalLinkage, /*Initializer=*/nullptr, IsCtor ? "__init_array_end" : "__fini_array_end", @@ -117,7 +116,7 @@ static void createInitOrFiniCalls(Function &F, bool IsCtor) { auto *Size = IRB.CreateAShr(ByteSize, ConstantInt::get(Int64Ty, 3)); auto *Offset = IRB.CreateSub(Size, ConstantInt::get(Int64Ty, 1)); Start = IRB.CreateInBoundsGEP( - ArrayType::get(IRB.getPtrTy(), 0), Begin, + PtrArrayTy, Begin, ArrayRef({ConstantInt::get(Int64Ty, 0), Offset})); Stop = Begin; } @@ -128,8 +127,7 @@ static void createInitOrFiniCalls(Function &F, bool IsCtor) { LoopBB, ExitBB); IRB.SetInsertPoint(LoopBB); auto *CallBackPHI = IRB.CreatePHI(PtrTy, 2, "ptr"); - auto *CallBack = IRB.CreateLoad(IRB.getPtrTy(F.getAddressSpace()), - CallBackPHI, "callback"); + auto *CallBack = IRB.CreateLoad(F.getType(), CallBackPHI, "callback"); IRB.CreateCall(CallBackTy, CallBack); auto *NewCallBack = IRB.CreateConstGEP1_64(PtrTy, CallBackPHI, IsCtor ? 1 : -1, "next"); diff --git a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll index 2ad40ef5e5476..a87e07cb57e05 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll @@ -66,7 +66,7 @@ define void @bar() addrspace(1) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = ashr i64 sub (i64 ptrtoint (ptr addrspace(1) @__fini_array_end to i64), i64 ptrtoint (ptr addrspace(1) @__fini_array_start to i64)), 3 ; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [0 x ptr], ptr addrspace(1) @__fini_array_start, i64 0, i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [0 x ptr addrspace(1)], ptr addrspace(1) @__fini_array_start, i64 0, i64 [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp uge ptr addrspace(1) [[TMP2]], @__fini_array_start ; CHECK-NEXT: br i1 [[TMP3]], label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]] ; CHECK: while.entry: diff --git a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll index 503f3b1d896f9..a423b320db559 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll @@ -81,7 +81,7 @@ define internal void @bar() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = ashr i64 sub (i64 ptrtoint (ptr addrspace(1) @__fini_array_end to i64), i64 ptrtoint (ptr addrspace(1) @__fini_array_start to i64)), 3 ; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [0 x ptr], ptr addrspace(1) @__fini_array_start, i64 0, i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [0 x ptr addrspace(1)], ptr addrspace(1) @__fini_array_start, i64 0, 
i64 [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp uge ptr addrspace(1) [[TMP2]], @__fini_array_start ; CHECK-NEXT: br i1 [[TMP3]], label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]] ; CHECK: while.entry: diff --git a/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll b/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll index 9d00b676d6610..309ecb17e79ed 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll @@ -73,7 +73,7 @@ define internal void @bar.5() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = ashr i64 sub (i64 ptrtoint (ptr addrspace(1) @__fini_array_end to i64), i64 ptrtoint (ptr addrspace(1) @__fini_array_start to i64)), 3 ; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [0 x ptr], ptr addrspace(1) @__fini_array_start, i64 0, i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [0 x ptr addrspace(1)], ptr addrspace(1) @__fini_array_start, i64 0, i64 [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp uge ptr addrspace(1) [[TMP2]], @__fini_array_start ; CHECK-NEXT: br i1 [[TMP3]], label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]] ; CHECK: while.entry: From a9ebdbb5ac7de7a028f6060b789196a43aea7580 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Wed, 9 Oct 2024 09:24:23 -0400 Subject: [PATCH 018/119] [MLIR] Vector: turn the ExtractStridedSlice rewrite pattern from #111541 into a canonicalization (#111614) This is a reasonable canonicalization because `extract` is more constrained than `extract_strided_slices`, so there is no loss of semantics here, just lifting an op to a special-case higher/constrained op. And the additional `shape_cast` is merely adding leading unit dims to match the original result type. Context: discussion on #111541. I wasn't sure how this would turn out, but in the process of writing this PR, I discovered at least 2 bugs in the pattern introduced in #111541, which shows the value of shared canonicalization patterns which are exercised on a high number of testcases. --------- Signed-off-by: Benoit Jacob --- .../Vector/Transforms/VectorRewritePatterns.h | 5 -- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 79 ++++++++++++++++++- ...sertExtractStridedSliceRewritePatterns.cpp | 69 ---------------- mlir/test/Dialect/Vector/canonicalize.mlir | 49 ++++++++++++ ...uous-extract-strided-slice-to-extract.mlir | 24 ------ .../Dialect/Vector/TestVectorTransforms.cpp | 23 ------ 6 files changed, 127 insertions(+), 122 deletions(-) delete mode 100644 mlir/test/Dialect/Vector/vector-contiguous-extract-strided-slice-to-extract.mlir diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h b/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h index ec1de7fa66aa0..a59f06f3c1ef1 100644 --- a/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h +++ b/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h @@ -235,11 +235,6 @@ void populateVectorExtractStridedSliceToExtractInsertChainPatterns( std::function controlFn = nullptr, PatternBenefit benefit = 1); -/// Pattern to rewrite simple cases of N-D extract_strided_slice, where the -/// slice is contiguous, into extract and shape_cast. -void populateVectorContiguousExtractStridedSliceToExtractPatterns( - RewritePatternSet &patterns, PatternBenefit benefit = 1); - /// Populate `patterns` with a pattern to break down 1-D vector.bitcast ops /// based on the destination vector shape. 
Bitcasts from a lower bitwidth /// element type to a higher bitwidth one are extracted from the lower bitwidth diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index 1718530b4aa16..a2abe1619454f 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -3772,6 +3772,82 @@ class StridedSliceSplat final : public OpRewritePattern { } }; +/// Pattern to rewrite simple cases of N-D extract_strided_slice, where the +/// slice is contiguous, into extract and shape_cast. +/// +/// Example: +/// Before: +/// %1 = vector.extract_strided_slice %arg0 { +/// offsets = [0, 0, 0, 0, 0], +/// sizes = [1, 1, 1, 1, 8], +/// strides = [1, 1, 1, 1, 1] +/// } : vector<8x1x1x2x8xi8> to vector<1x1x1x1x8xi8> +/// After: +/// %0 = vector.extract %arg0[0, 0, 0, 0] +/// : vector<8xi8> from vector<8x1x1x2x8xi8> +/// %1 = vector.shape_cast %0 +/// : vector<8xi8> to vector<1x1x1x1x8xi8> +/// +class ContiguousExtractStridedSliceToExtract final + : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(ExtractStridedSliceOp op, + PatternRewriter &rewriter) const override { + if (op.hasNonUnitStrides()) + return failure(); + Value source = op.getOperand(); + auto sourceType = cast(source.getType()); + if (sourceType.isScalable() || sourceType.getRank() == 0) + return failure(); + + // Compute the number of offsets to pass to ExtractOp::build. That is the + // difference between the source rank and the desired slice rank. We walk + // the dimensions from innermost out, and stop when the next slice dimension + // is not full-size. + SmallVector sizes = getI64SubArray(op.getSizes()); + int numOffsets; + for (numOffsets = sizes.size(); numOffsets > 0; --numOffsets) { + if (sizes[numOffsets - 1] != sourceType.getDimSize(numOffsets - 1)) + break; + } + + // If the created extract op would have no offsets, then this whole + // extract_strided_slice is the identity and should have been handled by + // other canonicalizations. + if (numOffsets == 0) + return failure(); + + // If not even the inner-most dimension is full-size, this op can't be + // rewritten as an ExtractOp. + if (numOffsets == sourceType.getRank() && + static_cast(sizes.size()) == sourceType.getRank()) + return failure(); + + // The outer dimensions must have unit size. + for (int i = 0; i < numOffsets; ++i) { + if (sizes[i] != 1) + return failure(); + } + + // Avoid generating slices that have leading unit dimensions. The shape_cast + // op that we create below would take bad generic fallback patterns + // (ShapeCastOpRewritePattern). + while (sizes[numOffsets] == 1 && + numOffsets < static_cast(sizes.size()) - 1) { + ++numOffsets; + } + + SmallVector offsets = getI64SubArray(op.getOffsets()); + auto extractOffsets = ArrayRef(offsets).take_front(numOffsets); + Value extract = rewriter.create(op->getLoc(), source, + extractOffsets); + rewriter.replaceOpWithNewOp(op, op.getType(), extract); + return success(); + } +}; + } // namespace void ExtractStridedSliceOp::getCanonicalizationPatterns( @@ -3780,7 +3856,8 @@ void ExtractStridedSliceOp::getCanonicalizationPatterns( // ConstantMaskOp and ExtractStridedSliceOp(ConstantOp) -> ConstantOp. 
results.add(context); + StridedSliceSplat, ContiguousExtractStridedSliceToExtract>( + context); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorInsertExtractStridedSliceRewritePatterns.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorInsertExtractStridedSliceRewritePatterns.cpp index ad845608f18d1..ec2ef3fc7501c 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorInsertExtractStridedSliceRewritePatterns.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorInsertExtractStridedSliceRewritePatterns.cpp @@ -329,81 +329,12 @@ class DecomposeNDExtractStridedSlice } }; -/// Pattern to rewrite simple cases of N-D extract_strided_slice, where the -/// slice is contiguous, into extract and shape_cast. -/// -/// Example: -/// Before: -/// %1 = vector.extract_strided_slice %arg0 {offsets = [0, 0, 0, 0, 0], -/// sizes = [1, 1, 1, 1, 8], strides = [1, 1, 1, 1, 1]} : -/// vector<8x1x1x2x8xi8> to vector<1x1x1x1x8xi8> -/// After: -/// %0 = vector.extract %arg0[0, 0, 0, 0] : vector<8xi8> from -/// vector<8x1x1x2x8xi8> %1 = vector.shape_cast %0 : vector<8xi8> to -/// vector<1x1x1x1x8xi8> -/// -class ContiguousExtractStridedSliceToExtract final - : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(ExtractStridedSliceOp op, - PatternRewriter &rewriter) const override { - if (op.hasNonUnitStrides()) { - return failure(); - } - Value source = op.getOperand(); - auto sourceType = cast(source.getType()); - if (sourceType.isScalable()) { - return failure(); - } - - // Compute the number of offsets to pass to ExtractOp::build. That is the - // difference between the source rank and the desired slice rank. We walk - // the dimensions from innermost out, and stop when the next slice dimension - // is not full-size. - SmallVector sizes = getI64SubArray(op.getSizes()); - int numOffsets; - for (numOffsets = sourceType.getRank(); numOffsets > 0; --numOffsets) { - if (sizes[numOffsets - 1] != sourceType.getDimSize(numOffsets - 1)) { - break; - } - } - - // If not even the inner-most dimension is full-size, this op can't be - // rewritten as an ExtractOp. - if (numOffsets == sourceType.getRank()) { - return failure(); - } - - // Avoid generating slices that have unit outer dimensions. The shape_cast - // op that we create below would take bad generic fallback patterns - // (ShapeCastOpRewritePattern). 
- while (sizes[numOffsets] == 1 && numOffsets < sourceType.getRank() - 1) { - ++numOffsets; - } - - SmallVector offsets = getI64SubArray(op.getOffsets()); - auto extractOffsets = ArrayRef(offsets).take_front(numOffsets); - Value extract = rewriter.create(op->getLoc(), source, - extractOffsets); - rewriter.replaceOpWithNewOp(op, op.getType(), extract); - return success(); - } -}; - void vector::populateVectorInsertExtractStridedSliceDecompositionPatterns( RewritePatternSet &patterns, PatternBenefit benefit) { patterns.add(patterns.getContext(), benefit); } -void vector::populateVectorContiguousExtractStridedSliceToExtractPatterns( - RewritePatternSet &patterns, PatternBenefit benefit) { - patterns.add(patterns.getContext(), - benefit); -} - void vector::populateVectorExtractStridedSliceToExtractInsertChainPatterns( RewritePatternSet &patterns, std::function controlFn, diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir index b7c78de4b5bd8..6d6bc199e601c 100644 --- a/mlir/test/Dialect/Vector/canonicalize.mlir +++ b/mlir/test/Dialect/Vector/canonicalize.mlir @@ -2742,3 +2742,52 @@ func.func @vector_insert_const_regression(%arg0: i8) -> vector<4xi8> { %1 = vector.insert %arg0, %0 [0] : i8 into vector<4xi8> return %1 : vector<4xi8> } + +// ----- + +// CHECK-LABEL: @contiguous_extract_strided_slices_to_extract +// CHECK: %[[EXTRACT:.+]] = vector.extract {{.*}}[0, 0, 0, 0, 0] : vector<4xi32> from vector<8x1x2x1x1x4xi32> +// CHECK-NEXT: return %[[EXTRACT]] : vector<4xi32> +func.func @contiguous_extract_strided_slices_to_extract(%arg0 : vector<8x1x2x1x1x4xi32>) -> vector<4xi32> { + %1 = vector.extract_strided_slice %arg0 {offsets = [0, 0, 0, 0, 0, 0], sizes = [1, 1, 1, 1, 1, 4], strides = [1, 1, 1, 1, 1, 1]} : vector<8x1x2x1x1x4xi32> to vector<1x1x1x1x1x4xi32> + %2 = vector.shape_cast %1 : vector<1x1x1x1x1x4xi32> to vector<4xi32> + return %2 : vector<4xi32> +} + +// ----- + +// CHECK-LABEL: @contiguous_extract_strided_slices_to_extract_shorter_size_list +// CHECK: %[[EXTRACT:.+]] = vector.extract {{.*}}[0, 0, 0, 0] : vector<1x4xi32> from vector<8x1x2x1x1x4xi32> +// CHECK-NEXT: return %[[EXTRACT]] : vector<1x4xi32> +func.func @contiguous_extract_strided_slices_to_extract_shorter_size_list(%arg0 : vector<8x1x2x1x1x4xi32>) -> vector<1x4xi32> { + %1 = vector.extract_strided_slice %arg0 {offsets = [0, 0, 0, 0, 0], sizes = [1, 1, 1, 1, 1], strides = [1, 1, 1, 1, 1]} : vector<8x1x2x1x1x4xi32> to vector<1x1x1x1x1x4xi32> + %2 = vector.shape_cast %1 : vector<1x1x1x1x1x4xi32> to vector<1x4xi32> + return %2 : vector<1x4xi32> +} + +// ----- + +// CHECK-LABEL: @contiguous_extract_strided_slices_to_extract_failure_non_unit_outer_size +// CHECK-NEXT: vector.extract_strided_slice +func.func @contiguous_extract_strided_slices_to_extract_failure_non_unit_outer_size(%arg0 : vector<8x1x2x1x1x4xi32>) -> vector<8x1x1x1x1x4xi32> { + %1 = vector.extract_strided_slice %arg0 {offsets = [0, 0, 0, 0, 0, 0], sizes = [8, 1, 1, 1, 1, 4], strides = [1, 1, 1, 1, 1, 1]} : vector<8x1x2x1x1x4xi32> to vector<8x1x1x1x1x4xi32> + return %1 : vector<8x1x1x1x1x4xi32> +} + +// ----- + +// CHECK-LABEL: @contiguous_extract_strided_slices_to_extract_failure_non_full_size +// CHECK-NEXT: vector.extract_strided_slice +func.func @contiguous_extract_strided_slices_to_extract_failure_non_full_size(%arg0 : vector<8x1x2x1x1x4xi32>) -> vector<1x1x1x1x1x2xi32> { + %1 = vector.extract_strided_slice %arg0 {offsets = [0, 0, 0, 0, 0, 0], sizes = [1, 1, 1, 1, 1, 2], strides = [1, 1, 1, 1, 1, 1]} : 
vector<8x1x2x1x1x4xi32> to vector<1x1x1x1x1x2xi32> + return %1 : vector<1x1x1x1x1x2xi32> +} + +// ----- + +// CHECK-LABEL: @contiguous_extract_strided_slices_to_extract_failure_non_full_inner_size +// CHECK-NEXT: vector.extract_strided_slice +func.func @contiguous_extract_strided_slices_to_extract_failure_non_full_inner_size(%arg0 : vector<8x1x2x1x1x4xi32>) -> vector<1x1x2x1x1x1xi32> { + %1 = vector.extract_strided_slice %arg0 {offsets = [0, 0, 0, 0, 0, 0], sizes = [1, 1, 2, 1, 1, 1], strides = [1, 1, 1, 1, 1, 1]} : vector<8x1x2x1x1x4xi32> to vector<1x1x2x1x1x1xi32> + return %1 : vector<1x1x2x1x1x1xi32> +} diff --git a/mlir/test/Dialect/Vector/vector-contiguous-extract-strided-slice-to-extract.mlir b/mlir/test/Dialect/Vector/vector-contiguous-extract-strided-slice-to-extract.mlir deleted file mode 100644 index d1401ad7853fc..0000000000000 --- a/mlir/test/Dialect/Vector/vector-contiguous-extract-strided-slice-to-extract.mlir +++ /dev/null @@ -1,24 +0,0 @@ -// RUN: mlir-opt --test-vector-contiguous-extract-strided-slice-to-extract %s | FileCheck %s - -// CHECK-LABEL: @contiguous -// CHECK: %[[EXTRACT:.+]] = vector.extract {{.*}}[0, 0, 0, 0, 0] : vector<4xi32> from vector<8x1x2x1x1x4xi32> -// CHECK-NEXT: return %[[EXTRACT]] : vector<4xi32> -func.func @contiguous(%arg0 : vector<8x1x2x1x1x4xi32>) -> vector<4xi32> { - %1 = vector.extract_strided_slice %arg0 {offsets = [0, 0, 0, 0, 0, 0], sizes = [1, 1, 1, 1, 1, 4], strides = [1, 1, 1, 1, 1, 1]} : vector<8x1x2x1x1x4xi32> to vector<1x1x1x1x1x4xi32> - %2 = vector.shape_cast %1 : vector<1x1x1x1x1x4xi32> to vector<4xi32> - return %2 : vector<4xi32> -} - -// CHECK-LABEL: @non_full_size -// CHECK-NEXT: vector.extract_strided_slice -func.func @non_full_size(%arg0 : vector<8x1x2x1x1x4xi32>) -> vector<1x1x1x1x1x2xi32> { - %1 = vector.extract_strided_slice %arg0 {offsets = [0, 0, 0, 0, 0, 0], sizes = [1, 1, 1, 1, 1, 2], strides = [1, 1, 1, 1, 1, 1]} : vector<8x1x2x1x1x4xi32> to vector<1x1x1x1x1x2xi32> - return %1 : vector<1x1x1x1x1x2xi32> -} - -// CHECK-LABEL: @non_full_inner_size -// CHECK-NEXT: vector.extract_strided_slice -func.func @non_full_inner_size(%arg0 : vector<8x1x2x1x1x4xi32>) -> vector<1x1x2x1x1x1xi32> { - %1 = vector.extract_strided_slice %arg0 {offsets = [0, 0, 0, 0, 0, 0], sizes = [1, 1, 2, 1, 1, 1], strides = [1, 1, 1, 1, 1, 1]} : vector<8x1x2x1x1x4xi32> to vector<1x1x2x1x1x1xi32> - return %1 : vector<1x1x2x1x1x1xi32> -} diff --git a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp index d91e955b70641..72aaa7dc4f897 100644 --- a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp +++ b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp @@ -709,27 +709,6 @@ struct TestVectorExtractStridedSliceLowering } }; -struct TestVectorContiguousExtractStridedSliceToExtract - : public PassWrapper> { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID( - TestVectorExtractStridedSliceLowering) - - StringRef getArgument() const final { - return "test-vector-contiguous-extract-strided-slice-to-extract"; - } - StringRef getDescription() const final { - return "Test lowering patterns that rewrite simple cases of N-D " - "extract_strided_slice, where the slice is contiguous, into extract " - "and shape_cast"; - } - void runOnOperation() override { - RewritePatternSet patterns(&getContext()); - populateVectorContiguousExtractStridedSliceToExtractPatterns(patterns); - (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)); - } -}; - struct TestVectorBreakDownBitCast : public 
PassWrapper> { @@ -956,8 +935,6 @@ void registerTestVectorLowerings() { PassRegistration(); - PassRegistration(); - PassRegistration(); PassRegistration(); From 390943f25b18a352bb3a72fe1b0908df355f77d9 Mon Sep 17 00:00:00 2001 From: Leandro Lupori Date: Wed, 9 Oct 2024 10:37:46 -0300 Subject: [PATCH 019/119] [flang] Implement conversion of compatible derived types (#111165) With some restrictions, BIND(C) derived types can be converted to compatible BIND(C) derived types. Semantics already support this, but ConvertOp was missing the conversion of such types. Fixes https://github.com/llvm/llvm-project/issues/107783 --- flang/lib/Optimizer/Builder/FIRBuilder.cpp | 5 ++++- flang/lib/Optimizer/CodeGen/CodeGen.cpp | 25 ++++++++++++++++++++++ flang/lib/Optimizer/Dialect/FIROps.cpp | 12 ++++++++++- flang/test/Fir/convert-to-llvm.fir | 25 ++++++++++++++++++++++ flang/test/Fir/invalid.fir | 8 +++++++ 5 files changed, 73 insertions(+), 2 deletions(-) diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp index 9ad37c8df434a..8fa695a5c0c2e 100644 --- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp +++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp @@ -479,7 +479,10 @@ mlir::Value fir::factory::createConvert(mlir::OpBuilder &builder, mlir::Location loc, mlir::Type toTy, mlir::Value val) { if (val.getType() != toTy) { - assert(!fir::isa_derived(toTy)); + assert((!fir::isa_derived(toTy) || + mlir::cast(val.getType()).getTypeList() == + mlir::cast(toTy).getTypeList()) && + "incompatible record types"); return builder.create(loc, toTy, val); } return val; diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index 1611de9e6389a..15fcc09c6219a 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -660,6 +660,31 @@ struct ConvertOpConversion : public fir::FIROpConversion { auto loc = convert.getLoc(); auto i1Type = mlir::IntegerType::get(convert.getContext(), 1); + if (mlir::isa(toFirTy)) { + // Convert to compatible BIND(C) record type. + // Double check that the record types are compatible (it should have + // already been checked by the verifier). + assert(mlir::cast(fromFirTy).getTypeList() == + mlir::cast(toFirTy).getTypeList() && + "incompatible record types"); + + auto toStTy = mlir::cast(toTy); + mlir::Value val = rewriter.create(loc, toStTy); + auto indexTypeMap = toStTy.getSubelementIndexMap(); + assert(indexTypeMap.has_value() && "invalid record type"); + + for (auto [attr, type] : indexTypeMap.value()) { + int64_t index = mlir::cast(attr).getInt(); + auto extVal = + rewriter.create(loc, op0, index); + val = + rewriter.create(loc, val, extVal, index); + } + + rewriter.replaceOp(convert, val); + return mlir::success(); + } + if (mlir::isa(fromFirTy) || mlir::isa(toFirTy)) { // By specification fir::LogicalType value may be any number, diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index 8fdc06f6fce3f..90ce8b8760591 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -1410,6 +1410,15 @@ bool fir::ConvertOp::areVectorsCompatible(mlir::Type inTy, mlir::Type outTy) { return true; } +static bool areRecordsCompatible(mlir::Type inTy, mlir::Type outTy) { + // Both records must have the same field types. + // Trust frontend semantics for in-depth checks, such as if both records + // have the BIND(C) attribute. 
+ auto inRecTy = mlir::dyn_cast(inTy); + auto outRecTy = mlir::dyn_cast(outTy); + return inRecTy && outRecTy && inRecTy.getTypeList() == outRecTy.getTypeList(); +} + bool fir::ConvertOp::canBeConverted(mlir::Type inType, mlir::Type outType) { if (inType == outType) return true; @@ -1428,7 +1437,8 @@ bool fir::ConvertOp::canBeConverted(mlir::Type inType, mlir::Type outType) { (fir::isBoxedRecordType(inType) && fir::isPolymorphicType(outType)) || (fir::isPolymorphicType(inType) && fir::isPolymorphicType(outType)) || (fir::isPolymorphicType(inType) && mlir::isa(outType)) || - areVectorsCompatible(inType, outType); + areVectorsCompatible(inType, outType) || + areRecordsCompatible(inType, outType); } llvm::LogicalResult fir::ConvertOp::verify() { diff --git a/flang/test/Fir/convert-to-llvm.fir b/flang/test/Fir/convert-to-llvm.fir index 0c17d7c25a8c8..1182a0a10f218 100644 --- a/flang/test/Fir/convert-to-llvm.fir +++ b/flang/test/Fir/convert-to-llvm.fir @@ -816,6 +816,31 @@ func.func @convert_complex16(%arg0 : complex) -> complex { // ----- +// Test `fir.convert` operation conversion between compatible fir.record types. + +func.func @convert_record(%arg0 : !fir.type<_QMmod1Trec{i:i32,f:f64,c:!llvm.struct<(f32, f32)>,cstr:!fir.array<4x!fir.char<1>>}>) -> + !fir.type<_QMmod2Trec{i:i32,f:f64,c:!llvm.struct<(f32, f32)>,cstr:!fir.array<4x!fir.char<1>>}> { + %0 = fir.convert %arg0 : (!fir.type<_QMmod1Trec{i:i32,f:f64,c:!llvm.struct<(f32, f32)>,cstr:!fir.array<4x!fir.char<1>>}>) -> + !fir.type<_QMmod2Trec{i:i32,f:f64,c:!llvm.struct<(f32, f32)>,cstr:!fir.array<4x!fir.char<1>>}> + return %0 : !fir.type<_QMmod2Trec{i:i32,f:f64,c:!llvm.struct<(f32, f32)>,cstr:!fir.array<4x!fir.char<1>>}> +} + +// CHECK-LABEL: func @convert_record( +// CHECK-SAME: %[[ARG0:.*]]: [[MOD1_REC:!llvm.struct<"_QMmod1Trec", \(i32, f64, struct<\(f32, f32\)>, array<4 x array<1 x i8>>\)>]]) -> +// CHECK-SAME: [[MOD2_REC:!llvm.struct<"_QMmod2Trec", \(i32, f64, struct<\(f32, f32\)>, array<4 x array<1 x i8>>\)>]] +// CHECK: %{{.*}} = llvm.mlir.undef : [[MOD2_REC]] +// CHECK-DAG: %[[I:.*]] = llvm.extractvalue %[[ARG0]][0] : [[MOD1_REC]] +// CHECK-DAG: %{{.*}} = llvm.insertvalue %[[I]], %{{.*}}[0] : [[MOD2_REC]] +// CHECK-DAG: %[[F:.*]] = llvm.extractvalue %[[ARG0]][1] : [[MOD1_REC]] +// CHECK-DAG: %{{.*}} = llvm.insertvalue %[[F]], %{{.*}}[1] : [[MOD2_REC]] +// CHECK-DAG: %[[C:.*]] = llvm.extractvalue %[[ARG0]][2] : [[MOD1_REC]] +// CHECK-DAG: %{{.*}} = llvm.insertvalue %[[C]], %{{.*}}[2] : [[MOD2_REC]] +// CHECK-DAG: %[[CSTR:.*]] = llvm.extractvalue %[[ARG0]][3] : [[MOD1_REC]] +// CHECK-DAG: %{{.*}} = llvm.insertvalue %[[CSTR]], %{{.*}}[3] : [[MOD2_REC]] +// CHECK: llvm.return %{{.*}} : [[MOD2_REC]] + +// ----- + // Test `fir.store` --> `llvm.store` conversion func.func @test_store_index(%val_to_store : index, %addr : !fir.ref) { diff --git a/flang/test/Fir/invalid.fir b/flang/test/Fir/invalid.fir index 086a426db5642..7e3f9d6498412 100644 --- a/flang/test/Fir/invalid.fir +++ b/flang/test/Fir/invalid.fir @@ -965,6 +965,14 @@ func.func @fp_to_logical(%arg0: f32) -> !fir.logical<4> { // ----- +func.func @rec_to_rec(%arg0: !fir.type) -> !fir.type { + // expected-error@+1{{'fir.convert' op invalid type conversion}} + %0 = fir.convert %arg0 : (!fir.type) -> !fir.type + return %0 : !fir.type +} + +// ----- + func.func @bad_box_offset(%not_a_box : !fir.ref) { // expected-error@+1{{'fir.box_offset' op box_ref operand must have !fir.ref> type}} %addr1 = fir.box_offset %not_a_box base_addr : (!fir.ref) -> !fir.llvm_ptr> From 
6f8e855150534358ea8c9301960c7c83119b1394 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Wed, 9 Oct 2024 15:42:19 +0200 Subject: [PATCH 020/119] [clang][bytecode] Implement __builtin_ai32_addcarryx* (#111671) --- clang/lib/AST/ByteCode/Function.h | 4 ++ clang/lib/AST/ByteCode/InterpBuiltin.cpp | 50 ++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/clang/lib/AST/ByteCode/Function.h b/clang/lib/AST/ByteCode/Function.h index 640bfa65644f0..7fe9aeb110120 100644 --- a/clang/lib/AST/ByteCode/Function.h +++ b/clang/lib/AST/ByteCode/Function.h @@ -222,6 +222,10 @@ class Function final { return ParamOffsets[ParamIndex]; } + PrimType getParamType(unsigned ParamIndex) const { + return ParamTypes[ParamIndex]; + } + private: /// Construct a function representing an actual function. Function(Program &P, FunctionDeclTy Source, unsigned ArgSize, diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 98381254886e2..7d811b7baea7c 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -38,6 +38,15 @@ static T getParam(const InterpFrame *Frame, unsigned Index) { return Frame->getParam(Offset); } +// static APSInt getAPSIntParam(InterpStack &Stk, size_t Offset = 0) { +static APSInt getAPSIntParam(const InterpFrame *Frame, unsigned Index) { + APSInt R; + unsigned Offset = Frame->getFunction()->getParamOffset(Index); + INT_TYPE_SWITCH(Frame->getFunction()->getParamType(Index), + R = Frame->getParam(Offset).toAPSInt()); + return R; +} + PrimType getIntPrimType(const InterpState &S) { const TargetInfo &TI = S.getASTContext().getTargetInfo(); unsigned IntWidth = TI.getIntWidth(); @@ -1273,6 +1282,39 @@ static bool interp__builtin_ia32_pext(InterpState &S, CodePtr OpPC, return true; } +static bool interp__builtin_ia32_addcarry_subborrow(InterpState &S, + CodePtr OpPC, + const InterpFrame *Frame, + const Function *Func, + const CallExpr *Call) { + unsigned BuiltinOp = Func->getBuiltinID(); + APSInt CarryIn = getAPSIntParam(Frame, 0); + APSInt LHS = getAPSIntParam(Frame, 1); + APSInt RHS = getAPSIntParam(Frame, 2); + + bool IsAdd = BuiltinOp == clang::X86::BI__builtin_ia32_addcarryx_u32 || + BuiltinOp == clang::X86::BI__builtin_ia32_addcarryx_u64; + + unsigned BitWidth = LHS.getBitWidth(); + unsigned CarryInBit = CarryIn.ugt(0) ? 1 : 0; + APInt ExResult = + IsAdd ? 
(LHS.zext(BitWidth + 1) + (RHS.zext(BitWidth + 1) + CarryInBit)) + : (LHS.zext(BitWidth + 1) - (RHS.zext(BitWidth + 1) + CarryInBit)); + + APInt Result = ExResult.extractBits(BitWidth, 0); + APSInt CarryOut = + APSInt(ExResult.extractBits(1, BitWidth), /*IsUnsigned=*/true); + + Pointer &CarryOutPtr = S.Stk.peek(); + QualType CarryOutType = Call->getArg(3)->getType()->getPointeeType(); + PrimType CarryOutT = *S.getContext().classify(CarryOutType); + assignInteger(CarryOutPtr, CarryOutT, APSInt(Result, true)); + + pushInteger(S, CarryOut, Call->getType()); + + return true; +} + static bool interp__builtin_os_log_format_buffer_size(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, @@ -1898,6 +1940,14 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F, return false; break; + case clang::X86::BI__builtin_ia32_addcarryx_u32: + case clang::X86::BI__builtin_ia32_addcarryx_u64: + case clang::X86::BI__builtin_ia32_subborrow_u32: + case clang::X86::BI__builtin_ia32_subborrow_u64: + if (!interp__builtin_ia32_addcarry_subborrow(S, OpPC, Frame, F, Call)) + return false; + break; + case Builtin::BI__builtin_os_log_format_buffer_size: if (!interp__builtin_os_log_format_buffer_size(S, OpPC, Frame, F, Call)) return false; From 7d9f9938ff788aa7565c61ce6f391264750515af Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 9 Oct 2024 06:43:35 -0700 Subject: [PATCH 021/119] [Transform] Avoid repeated hash lookups (NFC) (#111620) --- mlir/lib/Dialect/Transform/Transforms/CheckUses.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Dialect/Transform/Transforms/CheckUses.cpp b/mlir/lib/Dialect/Transform/Transforms/CheckUses.cpp index 561d3d5b05af6..e6db819b51c22 100644 --- a/mlir/lib/Dialect/Transform/Transforms/CheckUses.cpp +++ b/mlir/lib/Dialect/Transform/Transforms/CheckUses.cpp @@ -39,11 +39,11 @@ template const llvm::SmallPtrSet & getReachableImpl(Block *block, FnTy getNextNodes, DenseMap> &cache) { - auto it = cache.find(block); - if (it != cache.end()) + auto [it, inserted] = cache.try_emplace(block); + if (!inserted) return it->getSecond(); - llvm::SmallPtrSet &reachable = cache[block]; + llvm::SmallPtrSet &reachable = it->second; SmallVector worklist; worklist.push_back(block); while (!worklist.empty()) { From 48e4d67537ba4cc2cec2600628015f4ae167d88c Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 9 Oct 2024 06:44:20 -0700 Subject: [PATCH 022/119] [DSE] Simplify code with MapVector::operator[] (NFC) (#111621) --- llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index ce8c988ba531d..6fce46a624c9c 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -2259,10 +2259,7 @@ DSEState::eliminateDeadDefs(const MemoryLocationWrapper &KillingLocWrapper) { KillingLocWrapper.MemLoc, DeadLocWrapper.MemLoc, KillingOffset, DeadOffset); if (OR == OW_MaybePartial) { - auto Iter = - IOLs.insert(std::make_pair( - DeadLocWrapper.DefInst->getParent(), InstOverlapIntervalsTy())); - auto &IOL = Iter.first->second; + auto &IOL = IOLs[DeadLocWrapper.DefInst->getParent()]; OR = isPartialOverwrite(KillingLocWrapper.MemLoc, DeadLocWrapper.MemLoc, KillingOffset, DeadOffset, DeadLocWrapper.DefInst, IOL); From bda4fc05cfcc4eb3c9d69e0acaa9533bfe5b3de3 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 9 Oct 2024 06:44:58 
-0700 Subject: [PATCH 023/119] [NVPTX] Avoid repeated map lookups (NFC) (#111627) --- llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 12f6161cbf61b..7f4e1035e7a70 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -1145,13 +1145,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, const Function *demotedFunc = nullptr; if (!processDemoted && canDemoteGlobalVar(GVar, demotedFunc)) { O << "// " << GVar->getName() << " has been demoted\n"; - if (localDecls.find(demotedFunc) != localDecls.end()) - localDecls[demotedFunc].push_back(GVar); - else { - std::vector temp; - temp.push_back(GVar); - localDecls[demotedFunc] = temp; - } + localDecls[demotedFunc].push_back(GVar); return; } @@ -1368,10 +1362,11 @@ void NVPTXAsmPrinter::AggBuffer::printWords(raw_ostream &os) { } void NVPTXAsmPrinter::emitDemotedVars(const Function *f, raw_ostream &O) { - if (localDecls.find(f) == localDecls.end()) + auto It = localDecls.find(f); + if (It == localDecls.end()) return; - std::vector &gvars = localDecls[f]; + std::vector &gvars = It->second; const NVPTXTargetMachine &NTM = static_cast(TM); const NVPTXSubtarget &STI = From 1ad5f315ae1cd44369a72cc8dc44fc9ef0cbe638 Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Wed, 9 Oct 2024 15:45:16 +0200 Subject: [PATCH 024/119] [Clang] Avoid a crash when parsing an invalid pseudo-destructor (#111666) Fixes #111460. --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/Sema/SemaExprCXX.cpp | 3 ++- clang/test/Parser/cxx2c-pack-indexing.cpp | 11 +++++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 583c1e6b4215c..a4bb303a2bc42 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -481,6 +481,7 @@ Bug Fixes to C++ Support - Clang now uses the correct set of template argument lists when comparing the constraints of out-of-line definitions and member templates explicitly specialized for a given implicit instantiation of a class template. (#GH102320) +- Fix a crash when parsing a pseudo destructor involving an invalid type. 
(#GH111460) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index d490452e91c3b..8e9bcb10a80b4 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -8429,7 +8429,8 @@ ExprResult Sema::ActOnPseudoDestructorExpr(Scope *S, Expr *Base, QualType ObjectType; QualType T; TypeLocBuilder TLB; - if (CheckArrow(*this, ObjectType, Base, OpKind, OpLoc)) + if (CheckArrow(*this, ObjectType, Base, OpKind, OpLoc) || + DS.getTypeSpecType() == DeclSpec::TST_error) return ExprError(); switch (DS.getTypeSpecType()) { diff --git a/clang/test/Parser/cxx2c-pack-indexing.cpp b/clang/test/Parser/cxx2c-pack-indexing.cpp index 1b84ddfc6c20a..c279bdd7af8c4 100644 --- a/clang/test/Parser/cxx2c-pack-indexing.cpp +++ b/clang/test/Parser/cxx2c-pack-indexing.cpp @@ -63,3 +63,14 @@ struct base { int main() { SS().f(0); } + + +namespace GH11460 { +template +requires( ); // expected-error {{expected expression}} +struct SS { + void f( ) { + (*p).~T...[](); // expected-error {{use of undeclared identifier 'p'}} + } +}; +} From c911b0a73ca08e7cd2409486aff721f7a6339432 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 9 Oct 2024 06:45:44 -0700 Subject: [PATCH 025/119] [clang-tidy] Avoid repeated hash lookups (NFC) (#111628) --- .../clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp index 9c3c7cc70c187..225e867c9b24f 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp @@ -474,10 +474,8 @@ void ProTypeMemberInitCheck::checkMissingMemberInitializer( // It only includes fields that have not been fixed SmallPtrSet AllFieldsToInit; forEachField(ClassDecl, FieldsToInit, [&](const FieldDecl *F) { - if (!HasRecordClassMemberSet.contains(F)) { + if (HasRecordClassMemberSet.insert(F).second) AllFieldsToInit.insert(F); - HasRecordClassMemberSet.insert(F); - } }); if (FieldsToInit.empty()) return; From 01a0e85ab7183144816bd569b7ab3899663b3a0c Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 9 Oct 2024 06:46:07 -0700 Subject: [PATCH 026/119] [Conversion] Avoid repeated hash lookups (NFC) (#111637) --- mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp index 2bcd082fb3e82..dece254c325fc 100644 --- a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp +++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp @@ -509,12 +509,11 @@ static LogicalResult processParallelLoop( ensureLaunchIndependent(cloningMap.lookupOrDefault(step))}); // todo(herhut,ravishankarm): Update the behavior of setMappingAttr // when this condition is relaxed. - if (bounds.contains(processor)) { + if (!bounds.try_emplace(processor, launchBound).second) { return rewriter.notifyMatchFailure( parallelOp, "cannot redefine the bound for processor " + Twine(static_cast(processor))); } - bounds[processor] = launchBound; } if (!boundIsPrecise) { // We are using an approximation, create a surrounding conditional. 
From f59b151f094376e135955810f523dcf6b5acde80 Mon Sep 17 00:00:00 2001 From: Mikhail Goncharov Date: Wed, 9 Oct 2024 15:51:52 +0200 Subject: [PATCH 027/119] [bazel] port 8e2ccdc4deedd463a20237b4d842b4c51f9fe603 --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 3e7ca13742f40..e52439f00879f 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -5391,6 +5391,7 @@ cc_library( ":MemorySlotInterfacesIncGen", ":SideEffectInterfaces", ":Support", + ":ViewLikeInterface", "//llvm:AsmParser", "//llvm:BinaryFormat", "//llvm:BitReader", @@ -5434,6 +5435,7 @@ cc_library( ":NVVMDialect", ":Pass", ":TransformUtils", + ":ViewLikeInterface", "//llvm:BinaryFormat", "//llvm:Support", ], @@ -5804,6 +5806,7 @@ td_library( ":MemorySlotInterfacesTdFiles", ":OpBaseTdFiles", ":SideEffectInterfacesTdFiles", + ":ViewLikeInterfaceTdFiles", ], ) From e85fcb763173590fdcd5cb922b7ca1fc97cf170b Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 9 Oct 2024 18:03:35 +0400 Subject: [PATCH 028/119] AMDGPU: Add instruction flags when lowering ctor/dtor (#111652) These should be well behaved address computations. --- llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp | 10 +++++++--- .../CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll | 4 ++-- llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll | 4 ++-- llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll | 4 ++-- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp index ea11002bb6a5f..a774ad53b5bed 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp @@ -112,9 +112,13 @@ static void createInitOrFiniCalls(Function &F, bool IsCtor) { Type *Int64Ty = IntegerType::getInt64Ty(C); auto *EndPtr = IRB.CreatePtrToInt(End, Int64Ty); auto *BeginPtr = IRB.CreatePtrToInt(Begin, Int64Ty); - auto *ByteSize = IRB.CreateSub(EndPtr, BeginPtr); - auto *Size = IRB.CreateAShr(ByteSize, ConstantInt::get(Int64Ty, 3)); - auto *Offset = IRB.CreateSub(Size, ConstantInt::get(Int64Ty, 1)); + auto *ByteSize = IRB.CreateSub(EndPtr, BeginPtr, "", /*HasNUW=*/true, + /*HasNSW=*/true); + auto *Size = IRB.CreateAShr(ByteSize, ConstantInt::get(Int64Ty, 3), "", + /*isExact=*/true); + auto *Offset = + IRB.CreateSub(Size, ConstantInt::get(Int64Ty, 1), "", /*HasNUW=*/true, + /*HasNSW=*/true); Start = IRB.CreateInBoundsGEP( PtrArrayTy, Begin, ArrayRef({ConstantInt::get(Int64Ty, 0), Offset})); diff --git a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll index a87e07cb57e05..968871af2d059 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll @@ -64,8 +64,8 @@ define void @bar() addrspace(1) { ; CHECK-LABEL: define weak_odr amdgpu_kernel void @amdgcn.device.fini( ; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = ashr i64 sub (i64 ptrtoint (ptr addrspace(1) @__fini_array_end to i64), i64 ptrtoint (ptr addrspace(1) @__fini_array_start to i64)), 3 -; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = ashr exact i64 sub nuw nsw (i64 ptrtoint (ptr addrspace(1) @__fini_array_end to i64), i64 ptrtoint (ptr 
addrspace(1) @__fini_array_start to i64)), 3 +; CHECK-NEXT: [[TMP1:%.*]] = sub nuw nsw i64 [[TMP0]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [0 x ptr addrspace(1)], ptr addrspace(1) @__fini_array_start, i64 0, i64 [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp uge ptr addrspace(1) [[TMP2]], @__fini_array_start ; CHECK-NEXT: br i1 [[TMP3]], label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]] diff --git a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll index a423b320db559..98497a64e3204 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll @@ -79,8 +79,8 @@ define internal void @bar() { ; CHECK-LABEL: define weak_odr amdgpu_kernel void @amdgcn.device.fini( ; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = ashr i64 sub (i64 ptrtoint (ptr addrspace(1) @__fini_array_end to i64), i64 ptrtoint (ptr addrspace(1) @__fini_array_start to i64)), 3 -; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = ashr exact i64 sub nuw nsw (i64 ptrtoint (ptr addrspace(1) @__fini_array_end to i64), i64 ptrtoint (ptr addrspace(1) @__fini_array_start to i64)), 3 +; CHECK-NEXT: [[TMP1:%.*]] = sub nuw nsw i64 [[TMP0]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [0 x ptr addrspace(1)], ptr addrspace(1) @__fini_array_start, i64 0, i64 [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp uge ptr addrspace(1) [[TMP2]], @__fini_array_start ; CHECK-NEXT: br i1 [[TMP3]], label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]] diff --git a/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll b/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll index 309ecb17e79ed..a137f31c7aeec 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll @@ -71,8 +71,8 @@ define internal void @bar.5() { ; CHECK-LABEL: define weak_odr amdgpu_kernel void @amdgcn.device.fini( ; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = ashr i64 sub (i64 ptrtoint (ptr addrspace(1) @__fini_array_end to i64), i64 ptrtoint (ptr addrspace(1) @__fini_array_start to i64)), 3 -; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = ashr exact i64 sub nuw nsw (i64 ptrtoint (ptr addrspace(1) @__fini_array_end to i64), i64 ptrtoint (ptr addrspace(1) @__fini_array_start to i64)), 3 +; CHECK-NEXT: [[TMP1:%.*]] = sub nuw nsw i64 [[TMP0]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [0 x ptr addrspace(1)], ptr addrspace(1) @__fini_array_start, i64 0, i64 [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp uge ptr addrspace(1) [[TMP2]], @__fini_array_start ; CHECK-NEXT: br i1 [[TMP3]], label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]] From 665457815f11118f7e755a471f33606c8562a4be Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Tue, 8 Oct 2024 16:44:10 +0000 Subject: [PATCH 029/119] [LLVM][AArch64] Enable SVEIntrinsicOpts at all optimisation levels. --- flang/test/Driver/default-backend-pipelines.f90 | 4 ++-- llvm/lib/Target/AArch64/AArch64TargetMachine.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/flang/test/Driver/default-backend-pipelines.f90 b/flang/test/Driver/default-backend-pipelines.f90 index e6910c170f117..12b929e9fa40f 100644 --- a/flang/test/Driver/default-backend-pipelines.f90 +++ b/flang/test/Driver/default-backend-pipelines.f90 @@ -5,9 +5,9 @@ ! 
RUN: %flang_fc1 -S -O2 %s -triple aarch64-unknown-linux-gnu -mllvm -debug-pass=Structure -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK-O2 ! RUN: %flang_fc1 -S -O3 %s -triple aarch64-unknown-linux-gnu -mllvm -debug-pass=Structure -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK-O3 -! CHECK-O2-NOT: SVE intrinsics optimizations +! CHECK-O2-NOT: Optimize selects -! CHECK-O3: SVE intrinsics optimizations +! CHECK-O3: Optimize selects subroutine simple_loop integer :: i diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 9f96f6c5e83ec..7b0ae23358673 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -588,7 +588,7 @@ void AArch64PassConfig::addIRPasses() { // Expand any SVE vector library calls that we can't code generate directly. if (EnableSVEIntrinsicOpts && - TM->getOptLevel() == CodeGenOptLevel::Aggressive) + TM->getOptLevel() != CodeGenOptLevel::None) addPass(createSVEIntrinsicOptsPass()); // Cmpxchg instructions are often used with a subsequent comparison to From c4d288d9e21f44bc4a0f26e7655eba851a9b6225 Mon Sep 17 00:00:00 2001 From: Kareem Ergawy Date: Wed, 9 Oct 2024 16:47:06 +0200 Subject: [PATCH 030/119] [flang][OpenMP] Don't check unlabelled `cycle` branching for target loops (#111656) Properly handles `cycle` branching inside target distribute loops. --- flang/lib/Semantics/check-directive-structure.h | 3 +++ flang/test/Semantics/OpenMP/do05-positivecase.f90 | 15 +++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/flang/lib/Semantics/check-directive-structure.h b/flang/lib/Semantics/check-directive-structure.h index a1aff52f3a684..2a9cb785a882f 100644 --- a/flang/lib/Semantics/check-directive-structure.h +++ b/flang/lib/Semantics/check-directive-structure.h @@ -74,6 +74,9 @@ template class NoBranchingEnforce { case llvm::omp::Directive::OMPD_distribute_parallel_for: case llvm::omp::Directive::OMPD_distribute_simd: case llvm::omp::Directive::OMPD_distribute_parallel_for_simd: + case llvm::omp::Directive::OMPD_target_teams_distribute_parallel_do: + case llvm::omp::Directive:: + OMPD_target_teams_distribute_parallel_do_simd: return; default: break; diff --git a/flang/test/Semantics/OpenMP/do05-positivecase.f90 b/flang/test/Semantics/OpenMP/do05-positivecase.f90 index 3b512a5b4f25e..5e1b1b86f72f6 100644 --- a/flang/test/Semantics/OpenMP/do05-positivecase.f90 +++ b/flang/test/Semantics/OpenMP/do05-positivecase.f90 @@ -42,4 +42,19 @@ program omp_do end do !$omp end parallel + !$omp target teams distribute parallel do + !DEF:/omp_do/OtherConstruct4/i (OmpPrivate ,OmpPreDetermined) HostAssoc INTEGER(4) + do i=1,100 + !REF:/omp_do/OtherConstruct4/i + if(i<10) cycle + end do + !$omp end target teams distribute parallel do + + !$omp target teams distribute parallel do simd + !DEF:/omp_do/OtherConstruct5/i (OmpLinear,OmpPreDetermined) HostAssoc INTEGER(4) + do i=1,100 + !REF:/omp_do/OtherConstruct5/i + if(i<10) cycle + end do + !$omp end target teams distribute parallel do simd end program omp_do From 1731bb79a97537c71f916f1e70a442a6615599d0 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 9 Oct 2024 19:13:06 +0400 Subject: [PATCH 031/119] llvm-reduce: Fix not checking shouldKeep in special-globals reduction (#111647) --- ...cial-globals-missing-should-keep-assert.ll | 20 +++++++++++++++++++ .../deltas/ReduceSpecialGlobals.cpp | 6 ++++-- 2 files changed, 24 insertions(+), 2 deletions(-) create mode 100644 
llvm/test/tools/llvm-reduce/special-globals-missing-should-keep-assert.ll diff --git a/llvm/test/tools/llvm-reduce/special-globals-missing-should-keep-assert.ll b/llvm/test/tools/llvm-reduce/special-globals-missing-should-keep-assert.ll new file mode 100644 index 0000000000000..e13cd298da4b1 --- /dev/null +++ b/llvm/test/tools/llvm-reduce/special-globals-missing-should-keep-assert.ll @@ -0,0 +1,20 @@ +; RUN: llvm-reduce -abort-on-invalid-reduction --delta-passes=special-globals --test FileCheck --test-arg --check-prefix=CHECK-INTERESTINGNESS --test-arg %s --test-arg --input-file %s -o %t.0 +; RUN: FileCheck --implicit-check-not=define --check-prefix=CHECK-FINAL %s < %t.0 + +; Check that we don't hit "input module no longer interesting after +; counting chunks" The special-globals reduction was not checking +; shouldKeep before unconditionally erasing it. + +; CHECK-INTERESTINGNESS: llvm.used +; CHECK-FINAL: llvm.used +; CHECK-FINAL: define void @kept_used +; CHECK-FINAL: define void @other +@llvm.used = appending global [2 x ptr] [ptr @kept_used, ptr @other ] + +define void @kept_used() { + ret void +} + +define void @other() { + ret void +} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceSpecialGlobals.cpp b/llvm/tools/llvm-reduce/deltas/ReduceSpecialGlobals.cpp index 5b124a4052960..aadd038033d5c 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceSpecialGlobals.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceSpecialGlobals.cpp @@ -33,8 +33,10 @@ static void extractSpecialGlobalsFromModule(Oracle &O, for (StringRef Name : SpecialGlobalNames) { if (auto *Used = Program.getNamedGlobal(Name)) { - Used->replaceAllUsesWith(getDefaultValue(Used->getType())); - Used->eraseFromParent(); + if (!O.shouldKeep()) { + Used->replaceAllUsesWith(getDefaultValue(Used->getType())); + Used->eraseFromParent(); + } } } } From e637a5c9fef866158018dcaecc3c385d157460f5 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Wed, 9 Oct 2024 17:13:22 +0200 Subject: [PATCH 032/119] [clang][bytecode] Only allow lossless ptr-to-int casts (#111669) Only allow those casts if the bitwidth of the two types match. --- clang/lib/AST/ByteCode/Interp.cpp | 41 +++++++++++++++++++++++++++ clang/lib/AST/ByteCode/Interp.h | 43 ++++------------------------- clang/test/AST/ByteCode/codegen.cpp | 4 +++ 3 files changed, 51 insertions(+), 37 deletions(-) diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index 050de67c2e77d..82e11743cc529 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -22,6 +22,7 @@ #include "clang/AST/Expr.h" #include "clang/AST/ExprCXX.h" #include "clang/Basic/DiagnosticSema.h" +#include "clang/Basic/TargetInfo.h" #include "llvm/ADT/APSInt.h" #include "llvm/ADT/StringExtras.h" #include @@ -1415,6 +1416,46 @@ bool InvalidShuffleVectorIndex(InterpState &S, CodePtr OpPC, uint32_t Index) { return false; } +bool CheckPointerToIntegralCast(InterpState &S, CodePtr OpPC, + const Pointer &Ptr, unsigned BitWidth) { + if (Ptr.isDummy()) + return false; + + const SourceInfo &E = S.Current->getSource(OpPC); + S.CCEDiag(E, diag::note_constexpr_invalid_cast) + << 2 << S.getLangOpts().CPlusPlus << S.Current->getRange(OpPC); + + if (Ptr.isBlockPointer() && !Ptr.isZero()) { + // Only allow based lvalue casts if they are lossless. 
+ if (S.getASTContext().getTargetInfo().getPointerWidth(LangAS::Default) != + BitWidth) + return Invalid(S, OpPC); + } + return true; +} + +bool CastPointerIntegralAP(InterpState &S, CodePtr OpPC, uint32_t BitWidth) { + const Pointer &Ptr = S.Stk.pop(); + + if (!CheckPointerToIntegralCast(S, OpPC, Ptr, BitWidth)) + return false; + + S.Stk.push>( + IntegralAP::from(Ptr.getIntegerRepresentation(), BitWidth)); + return true; +} + +bool CastPointerIntegralAPS(InterpState &S, CodePtr OpPC, uint32_t BitWidth) { + const Pointer &Ptr = S.Stk.pop(); + + if (!CheckPointerToIntegralCast(S, OpPC, Ptr, BitWidth)) + return false; + + S.Stk.push>( + IntegralAP::from(Ptr.getIntegerRepresentation(), BitWidth)); + return true; +} + // https://github.com/llvm/llvm-project/issues/102513 #if defined(_WIN32) && !defined(__clang__) && !defined(NDEBUG) #pragma optimize("", off) diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index 2c5538d221bf0..4170891002447 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -2289,53 +2289,22 @@ static inline bool CastFloatingIntegralAPS(InterpState &S, CodePtr OpPC, return CheckFloatResult(S, OpPC, F, Status, FPO); } +bool CheckPointerToIntegralCast(InterpState &S, CodePtr OpPC, + const Pointer &Ptr, unsigned BitWidth); +bool CastPointerIntegralAP(InterpState &S, CodePtr OpPC, uint32_t BitWidth); +bool CastPointerIntegralAPS(InterpState &S, CodePtr OpPC, uint32_t BitWidth); + template ::T> bool CastPointerIntegral(InterpState &S, CodePtr OpPC) { const Pointer &Ptr = S.Stk.pop(); - if (Ptr.isDummy()) + if (!CheckPointerToIntegralCast(S, OpPC, Ptr, T::bitWidth())) return false; - const SourceInfo &E = S.Current->getSource(OpPC); - S.CCEDiag(E, diag::note_constexpr_invalid_cast) - << 2 << S.getLangOpts().CPlusPlus << S.Current->getRange(OpPC); - S.Stk.push(T::from(Ptr.getIntegerRepresentation())); return true; } -static inline bool CastPointerIntegralAP(InterpState &S, CodePtr OpPC, - uint32_t BitWidth) { - const Pointer &Ptr = S.Stk.pop(); - - if (Ptr.isDummy()) - return false; - - const SourceInfo &E = S.Current->getSource(OpPC); - S.CCEDiag(E, diag::note_constexpr_invalid_cast) - << 2 << S.getLangOpts().CPlusPlus << S.Current->getRange(OpPC); - - S.Stk.push>( - IntegralAP::from(Ptr.getIntegerRepresentation(), BitWidth)); - return true; -} - -static inline bool CastPointerIntegralAPS(InterpState &S, CodePtr OpPC, - uint32_t BitWidth) { - const Pointer &Ptr = S.Stk.pop(); - - if (Ptr.isDummy()) - return false; - - const SourceInfo &E = S.Current->getSource(OpPC); - S.CCEDiag(E, diag::note_constexpr_invalid_cast) - << 2 << S.getLangOpts().CPlusPlus << S.Current->getRange(OpPC); - - S.Stk.push>( - IntegralAP::from(Ptr.getIntegerRepresentation(), BitWidth)); - return true; -} - template ::T> static inline bool CastIntegralFixedPoint(InterpState &S, CodePtr OpPC, uint32_t FPS) { diff --git a/clang/test/AST/ByteCode/codegen.cpp b/clang/test/AST/ByteCode/codegen.cpp index 12d8b5a5c548e..ea2c812f30f6f 100644 --- a/clang/test/AST/ByteCode/codegen.cpp +++ b/clang/test/AST/ByteCode/codegen.cpp @@ -1,6 +1,10 @@ // RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -o - %s -fexperimental-new-constant-interpreter | FileCheck %s +#ifdef __SIZEOF_INT128__ +// CHECK: @PR11705 = global i128 0 +__int128_t PR11705 = (__int128_t)&PR11705; +#endif int arr[2]; // CHECK: @pastEnd = constant ptr getelementptr (i8, ptr @arr, i64 8) From 
72a957ba4c8ef059f1572f6d4ee0cba8dc615268 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Wed, 9 Oct 2024 16:17:43 +0100 Subject: [PATCH 033/119] [Cuda] Handle -fcuda-short-ptr even with -nocudalib (#111682) When passed -nocudalib/-nogpulib, Cuda's argument handling would bail out before handling -fcuda-short-ptr, meaning the frontend and backend data layouts would mismatch. --- clang/lib/Driver/ToolChains/Cuda.cpp | 8 ++++---- clang/test/Driver/cuda-short-ptr.cu | 6 ++++++ 2 files changed, 10 insertions(+), 4 deletions(-) create mode 100644 clang/test/Driver/cuda-short-ptr.cu diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index 509cd87b28c37..7a70cf1c5694f 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -848,6 +848,10 @@ void CudaToolChain::addClangTargetOptions( if (CudaInstallation.version() >= CudaVersion::CUDA_90) CC1Args.push_back("-fcuda-allow-variadic-functions"); + if (DriverArgs.hasFlag(options::OPT_fcuda_short_ptr, + options::OPT_fno_cuda_short_ptr, false)) + CC1Args.append({"-mllvm", "--nvptx-short-ptr"}); + if (DriverArgs.hasArg(options::OPT_nogpulib)) return; @@ -873,10 +877,6 @@ void CudaToolChain::addClangTargetOptions( clang::CudaVersion CudaInstallationVersion = CudaInstallation.version(); - if (DriverArgs.hasFlag(options::OPT_fcuda_short_ptr, - options::OPT_fno_cuda_short_ptr, false)) - CC1Args.append({"-mllvm", "--nvptx-short-ptr"}); - if (CudaInstallationVersion >= CudaVersion::UNKNOWN) CC1Args.push_back( DriverArgs.MakeArgString(Twine("-target-sdk-version=") + diff --git a/clang/test/Driver/cuda-short-ptr.cu b/clang/test/Driver/cuda-short-ptr.cu new file mode 100644 index 0000000000000..e0ae4505e0b56 --- /dev/null +++ b/clang/test/Driver/cuda-short-ptr.cu @@ -0,0 +1,6 @@ +// Checks that cuda compilation does the right thing when passed -fcuda-short-ptr + +// RUN: %clang -### --target=x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=sm_20 -fcuda-short-ptr -nocudainc -nocudalib --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 | FileCheck %s + +// CHECK: "-mllvm" "--nvptx-short-ptr" +// CHECK-SAME: "-fcuda-short-ptr" From c136d3237a3c6230cfe1ab3f0f6790f903c54a27 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 9 Oct 2024 16:20:03 +0100 Subject: [PATCH 034/119] [VectorCombine] Do not try to operate on OperandBundles. (#111635) This bails out if we see an intrinsic with an operand bundle on it, to make sure we don't process the bundles incorrectly. Fixes #110382. --- .../Transforms/Vectorize/VectorCombine.cpp | 59 ++++++++++--------- .../AArch64/shuffletoidentity.ll | 48 +++++++++++++++ 2 files changed, 79 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index a2ab5d9666407..627edb680dfa1 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1984,33 +1984,35 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) { // We need each element to be the same type of value, and check that each // element has a single use. 
- if (all_of(drop_begin(Item), [Item](InstLane IL) { - Value *FrontV = Item.front().first->get(); - if (!IL.first) - return true; - Value *V = IL.first->get(); - if (auto *I = dyn_cast(V); I && !I->hasOneUse()) - return false; - if (V->getValueID() != FrontV->getValueID()) - return false; - if (auto *CI = dyn_cast(V)) - if (CI->getPredicate() != cast(FrontV)->getPredicate()) - return false; - if (auto *CI = dyn_cast(V)) - if (CI->getSrcTy() != cast(FrontV)->getSrcTy()) - return false; - if (auto *SI = dyn_cast(V)) - if (!isa(SI->getOperand(0)->getType()) || - SI->getOperand(0)->getType() != - cast(FrontV)->getOperand(0)->getType()) - return false; - if (isa(V) && !isa(V)) - return false; - auto *II = dyn_cast(V); - return !II || (isa(FrontV) && - II->getIntrinsicID() == - cast(FrontV)->getIntrinsicID()); - })) { + auto CheckLaneIsEquivalentToFirst = [Item](InstLane IL) { + Value *FrontV = Item.front().first->get(); + if (!IL.first) + return true; + Value *V = IL.first->get(); + if (auto *I = dyn_cast(V); I && !I->hasOneUse()) + return false; + if (V->getValueID() != FrontV->getValueID()) + return false; + if (auto *CI = dyn_cast(V)) + if (CI->getPredicate() != cast(FrontV)->getPredicate()) + return false; + if (auto *CI = dyn_cast(V)) + if (CI->getSrcTy() != cast(FrontV)->getSrcTy()) + return false; + if (auto *SI = dyn_cast(V)) + if (!isa(SI->getOperand(0)->getType()) || + SI->getOperand(0)->getType() != + cast(FrontV)->getOperand(0)->getType()) + return false; + if (isa(V) && !isa(V)) + return false; + auto *II = dyn_cast(V); + return !II || (isa(FrontV) && + II->getIntrinsicID() == + cast(FrontV)->getIntrinsicID() && + !II->hasOperandBundles()); + }; + if (all_of(drop_begin(Item), CheckLaneIsEquivalentToFirst)) { // Check the operator is one that we support. if (isa(FrontU)) { // We exclude div/rem in case they hit UB from poison lanes. 
@@ -2038,7 +2040,8 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) { Worklist.push_back(generateInstLaneVectorFromOperand(Item, 2)); continue; } else if (auto *II = dyn_cast(FrontU); - II && isTriviallyVectorizable(II->getIntrinsicID())) { + II && isTriviallyVectorizable(II->getIntrinsicID()) && + !II->hasOperandBundles()) { for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) { if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op)) { if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) { diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll index af04fb0ab4621..66fe11369d88b 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll @@ -1066,4 +1066,52 @@ entry: ret <2 x float> %4 } +define <16 x i64> @operandbundles(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c) { +; CHECK-LABEL: @operandbundles( +; CHECK-NEXT: [[CALL:%.*]] = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i64> [[C:%.*]]) [ "jl_roots"(ptr addrspace(10) null, ptr addrspace(10) null) ] +; CHECK-NEXT: [[SHUFFLEVECTOR:%.*]] = shufflevector <4 x i64> [[CALL]], <4 x i64> poison, <16 x i32> +; CHECK-NEXT: [[SHUFFLEVECTOR1:%.*]] = shufflevector <16 x i64> [[SHUFFLEVECTOR]], <16 x i64> undef, <16 x i32> +; CHECK-NEXT: ret <16 x i64> [[SHUFFLEVECTOR1]] +; + %call = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c) [ "jl_roots"(ptr addrspace(10) null, ptr addrspace(10) null) ] + %shufflevector = shufflevector <4 x i64> %call, <4 x i64> poison, <16 x i32> + %shufflevector1 = shufflevector <16 x i64> %shufflevector, <16 x i64> undef, <16 x i32> + ret <16 x i64> %shufflevector1 +} + +define <8 x i8> @operandbundles_first(<8 x i8> %a) { +; CHECK-LABEL: @operandbundles_first( +; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[ABT:%.*]] = call <4 x i8> @llvm.abs.v4i8(<4 x i8> [[AT]], i1 false) [ "jl_roots"(ptr addrspace(10) null, ptr addrspace(10) null) ] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[AT]], <4 x i8> [[AB]], <8 x i32> +; CHECK-NEXT: [[R:%.*]] = call <8 x i8> @llvm.abs.v8i8(<8 x i8> [[TMP1]], i1 false) +; CHECK-NEXT: ret <8 x i8> [[R]] +; + %ab = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> + %at = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> + %abt = call <4 x i8> @llvm.abs.v4i8(<4 x i8> %at, i1 false) [ "jl_roots"(ptr addrspace(10) null, ptr addrspace(10) null) ] + %abb = call <4 x i8> @llvm.abs.v4i8(<4 x i8> %ab, i1 false) + %r = shufflevector <4 x i8> %abt, <4 x i8> %abb, <8 x i32> + ret <8 x i8> %r +} + +define <8 x i8> @operandbundles_second(<8 x i8> %a) { +; CHECK-LABEL: @operandbundles_second( +; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[ABB:%.*]] = call <4 x i8> @llvm.abs.v4i8(<4 x i8> [[AB]], i1 false) [ "jl_roots"(ptr addrspace(10) null, ptr addrspace(10) null) ] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[AT]], <4 x i8> [[AB]], <8 x i32> +; CHECK-NEXT: [[R:%.*]] = call <8 x i8> @llvm.abs.v8i8(<8 x i8> [[TMP1]], i1 false) +; CHECK-NEXT: ret <8 x i8> [[R]] +; + %ab = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> + %at = 
shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> + %abt = call <4 x i8> @llvm.abs.v4i8(<4 x i8> %at, i1 false) + %abb = call <4 x i8> @llvm.abs.v4i8(<4 x i8> %ab, i1 false) [ "jl_roots"(ptr addrspace(10) null, ptr addrspace(10) null) ] + %r = shufflevector <4 x i8> %abt, <4 x i8> %abb, <8 x i32> + ret <8 x i8> %r +} + +declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>) declare void @use(<4 x i8>) From d905a3c51b31775791c15132a1b3613d75871853 Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Wed, 9 Oct 2024 08:26:30 -0700 Subject: [PATCH 035/119] [NFC] Format MachineVerifier.cpp to remove extra indentation (#111602) Many structs in this class have the wrong indentation. To generate this diff, I touched the first line of each struct and then ran `git clang-format`. This will make blaming more difficult, but this autoformatting is difficult to avoid triggering. I think it's best to push this as one NFC PR. --- llvm/lib/CodeGen/MachineVerifier.cpp | 501 ++++++++++++++------------- 1 file changed, 251 insertions(+), 250 deletions(-) diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index 24a0f41775cc1..b7218afdd3820 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -93,257 +93,258 @@ using namespace llvm; namespace { - struct MachineVerifier { - MachineVerifier(MachineFunctionAnalysisManager &MFAM, const char *b, - raw_ostream *OS) - : MFAM(&MFAM), OS(OS ? *OS : nulls()), Banner(b) {} - - MachineVerifier(Pass *pass, const char *b, raw_ostream *OS) - : PASS(pass), OS(OS ? *OS : nulls()), Banner(b) {} - - MachineVerifier(const char *b, LiveVariables *LiveVars, - LiveIntervals *LiveInts, LiveStacks *LiveStks, - SlotIndexes *Indexes, raw_ostream *OS) - : OS(OS ? *OS : nulls()), Banner(b), LiveVars(LiveVars), - LiveInts(LiveInts), LiveStks(LiveStks), Indexes(Indexes) {} - - unsigned verify(const MachineFunction &MF); - - MachineFunctionAnalysisManager *MFAM = nullptr; - Pass *const PASS = nullptr; - raw_ostream &OS; - const char *Banner; - const MachineFunction *MF = nullptr; - const TargetMachine *TM = nullptr; - const TargetInstrInfo *TII = nullptr; - const TargetRegisterInfo *TRI = nullptr; - const MachineRegisterInfo *MRI = nullptr; - const RegisterBankInfo *RBI = nullptr; - - unsigned foundErrors = 0; - - // Avoid querying the MachineFunctionProperties for each operand. - bool isFunctionRegBankSelected = false; - bool isFunctionSelected = false; - bool isFunctionTracksDebugUserValues = false; - - using RegVector = SmallVector; - using RegMaskVector = SmallVector; - using RegSet = DenseSet; - using RegMap = DenseMap; - using BlockSet = SmallPtrSet; - - const MachineInstr *FirstNonPHI = nullptr; - const MachineInstr *FirstTerminator = nullptr; - BlockSet FunctionBlocks; - - BitVector regsReserved; - RegSet regsLive; - RegVector regsDefined, regsDead, regsKilled; - RegMaskVector regMasks; - - SlotIndex lastIndex; - - // Add Reg and any sub-registers to RV - void addRegWithSubRegs(RegVector &RV, Register Reg) { - RV.push_back(Reg); - if (Reg.isPhysical()) - append_range(RV, TRI->subregs(Reg.asMCReg())); - } - - struct BBInfo { - // Is this MBB reachable from the MF entry point? - bool reachable = false; - - // Vregs that must be live in because they are used without being - // defined. Map value is the user. vregsLiveIn doesn't include regs - // that only are used by PHI nodes. - RegMap vregsLiveIn; - - // Regs killed in MBB. 
They may be defined again, and will then be in both - // regsKilled and regsLiveOut. - RegSet regsKilled; - - // Regs defined in MBB and live out. Note that vregs passing through may - // be live out without being mentioned here. - RegSet regsLiveOut; - - // Vregs that pass through MBB untouched. This set is disjoint from - // regsKilled and regsLiveOut. - RegSet vregsPassed; - - // Vregs that must pass through MBB because they are needed by a successor - // block. This set is disjoint from regsLiveOut. - RegSet vregsRequired; - - // Set versions of block's predecessor and successor lists. - BlockSet Preds, Succs; - - BBInfo() = default; - - // Add register to vregsRequired if it belongs there. Return true if - // anything changed. - bool addRequired(Register Reg) { - if (!Reg.isVirtual()) - return false; - if (regsLiveOut.count(Reg)) - return false; - return vregsRequired.insert(Reg).second; - } +struct MachineVerifier { + MachineVerifier(MachineFunctionAnalysisManager &MFAM, const char *b, + raw_ostream *OS) + : MFAM(&MFAM), OS(OS ? *OS : nulls()), Banner(b) {} + + MachineVerifier(Pass *pass, const char *b, raw_ostream *OS) + : PASS(pass), OS(OS ? *OS : nulls()), Banner(b) {} + + MachineVerifier(const char *b, LiveVariables *LiveVars, + LiveIntervals *LiveInts, LiveStacks *LiveStks, + SlotIndexes *Indexes, raw_ostream *OS) + : OS(OS ? *OS : nulls()), Banner(b), LiveVars(LiveVars), + LiveInts(LiveInts), LiveStks(LiveStks), Indexes(Indexes) {} + + unsigned verify(const MachineFunction &MF); + + MachineFunctionAnalysisManager *MFAM = nullptr; + Pass *const PASS = nullptr; + raw_ostream &OS; + const char *Banner; + const MachineFunction *MF = nullptr; + const TargetMachine *TM = nullptr; + const TargetInstrInfo *TII = nullptr; + const TargetRegisterInfo *TRI = nullptr; + const MachineRegisterInfo *MRI = nullptr; + const RegisterBankInfo *RBI = nullptr; + + unsigned foundErrors = 0; + + // Avoid querying the MachineFunctionProperties for each operand. + bool isFunctionRegBankSelected = false; + bool isFunctionSelected = false; + bool isFunctionTracksDebugUserValues = false; + + using RegVector = SmallVector; + using RegMaskVector = SmallVector; + using RegSet = DenseSet; + using RegMap = DenseMap; + using BlockSet = SmallPtrSet; + + const MachineInstr *FirstNonPHI = nullptr; + const MachineInstr *FirstTerminator = nullptr; + BlockSet FunctionBlocks; + + BitVector regsReserved; + RegSet regsLive; + RegVector regsDefined, regsDead, regsKilled; + RegMaskVector regMasks; + + SlotIndex lastIndex; + + // Add Reg and any sub-registers to RV + void addRegWithSubRegs(RegVector &RV, Register Reg) { + RV.push_back(Reg); + if (Reg.isPhysical()) + append_range(RV, TRI->subregs(Reg.asMCReg())); + } + + struct BBInfo { + // Is this MBB reachable from the MF entry point? + bool reachable = false; + + // Vregs that must be live in because they are used without being + // defined. Map value is the user. vregsLiveIn doesn't include regs + // that only are used by PHI nodes. + RegMap vregsLiveIn; + + // Regs killed in MBB. They may be defined again, and will then be in both + // regsKilled and regsLiveOut. + RegSet regsKilled; + + // Regs defined in MBB and live out. Note that vregs passing through may + // be live out without being mentioned here. + RegSet regsLiveOut; + + // Vregs that pass through MBB untouched. This set is disjoint from + // regsKilled and regsLiveOut. + RegSet vregsPassed; + + // Vregs that must pass through MBB because they are needed by a successor + // block. 
This set is disjoint from regsLiveOut. + RegSet vregsRequired; + + // Set versions of block's predecessor and successor lists. + BlockSet Preds, Succs; + + BBInfo() = default; + + // Add register to vregsRequired if it belongs there. Return true if + // anything changed. + bool addRequired(Register Reg) { + if (!Reg.isVirtual()) + return false; + if (regsLiveOut.count(Reg)) + return false; + return vregsRequired.insert(Reg).second; + } - // Same for a full set. - bool addRequired(const RegSet &RS) { - bool Changed = false; - for (Register Reg : RS) - Changed |= addRequired(Reg); - return Changed; - } + // Same for a full set. + bool addRequired(const RegSet &RS) { + bool Changed = false; + for (Register Reg : RS) + Changed |= addRequired(Reg); + return Changed; + } - // Same for a full map. - bool addRequired(const RegMap &RM) { - bool Changed = false; - for (const auto &I : RM) - Changed |= addRequired(I.first); - return Changed; - } + // Same for a full map. + bool addRequired(const RegMap &RM) { + bool Changed = false; + for (const auto &I : RM) + Changed |= addRequired(I.first); + return Changed; + } - // Live-out registers are either in regsLiveOut or vregsPassed. - bool isLiveOut(Register Reg) const { - return regsLiveOut.count(Reg) || vregsPassed.count(Reg); - } - }; + // Live-out registers are either in regsLiveOut or vregsPassed. + bool isLiveOut(Register Reg) const { + return regsLiveOut.count(Reg) || vregsPassed.count(Reg); + } + }; - // Extra register info per MBB. - DenseMap MBBInfoMap; - - bool isReserved(Register Reg) { - return Reg.id() < regsReserved.size() && regsReserved.test(Reg.id()); - } - - bool isAllocatable(Register Reg) const { - return Reg.id() < TRI->getNumRegs() && TRI->isInAllocatableClass(Reg) && - !regsReserved.test(Reg.id()); - } - - // Analysis information if available - LiveVariables *LiveVars = nullptr; - LiveIntervals *LiveInts = nullptr; - LiveStacks *LiveStks = nullptr; - SlotIndexes *Indexes = nullptr; - - // This is calculated only when trying to verify convergence control tokens. - // Similar to the LLVM IR verifier, we calculate this locally instead of - // relying on the pass manager. - MachineDominatorTree DT; - - void visitMachineFunctionBefore(); - void visitMachineBasicBlockBefore(const MachineBasicBlock *MBB); - void visitMachineBundleBefore(const MachineInstr *MI); - - /// Verify that all of \p MI's virtual register operands are scalars. - /// \returns True if all virtual register operands are scalar. False - /// otherwise. 
- bool verifyAllRegOpsScalar(const MachineInstr &MI, - const MachineRegisterInfo &MRI); - bool verifyVectorElementMatch(LLT Ty0, LLT Ty1, const MachineInstr *MI); - - bool verifyGIntrinsicSideEffects(const MachineInstr *MI); - bool verifyGIntrinsicConvergence(const MachineInstr *MI); - void verifyPreISelGenericInstruction(const MachineInstr *MI); - - void visitMachineInstrBefore(const MachineInstr *MI); - void visitMachineOperand(const MachineOperand *MO, unsigned MONum); - void visitMachineBundleAfter(const MachineInstr *MI); - void visitMachineBasicBlockAfter(const MachineBasicBlock *MBB); - void visitMachineFunctionAfter(); - - void report(const char *msg, const MachineFunction *MF); - void report(const char *msg, const MachineBasicBlock *MBB); - void report(const char *msg, const MachineInstr *MI); - void report(const char *msg, const MachineOperand *MO, unsigned MONum, - LLT MOVRegType = LLT{}); - void report(const Twine &Msg, const MachineInstr *MI); - - void report_context(const LiveInterval &LI) const; - void report_context(const LiveRange &LR, Register VRegUnit, - LaneBitmask LaneMask) const; - void report_context(const LiveRange::Segment &S) const; - void report_context(const VNInfo &VNI) const; - void report_context(SlotIndex Pos) const; - void report_context(MCPhysReg PhysReg) const; - void report_context_liverange(const LiveRange &LR) const; - void report_context_lanemask(LaneBitmask LaneMask) const; - void report_context_vreg(Register VReg) const; - void report_context_vreg_regunit(Register VRegOrUnit) const; - - void verifyInlineAsm(const MachineInstr *MI); - - void checkLiveness(const MachineOperand *MO, unsigned MONum); - void checkLivenessAtUse(const MachineOperand *MO, unsigned MONum, - SlotIndex UseIdx, const LiveRange &LR, - Register VRegOrUnit, - LaneBitmask LaneMask = LaneBitmask::getNone()); - void checkLivenessAtDef(const MachineOperand *MO, unsigned MONum, - SlotIndex DefIdx, const LiveRange &LR, - Register VRegOrUnit, bool SubRangeCheck = false, - LaneBitmask LaneMask = LaneBitmask::getNone()); - - void markReachable(const MachineBasicBlock *MBB); - void calcRegsPassed(); - void checkPHIOps(const MachineBasicBlock &MBB); - - void calcRegsRequired(); - void verifyLiveVariables(); - void verifyLiveIntervals(); - void verifyLiveInterval(const LiveInterval&); - void verifyLiveRangeValue(const LiveRange &, const VNInfo *, Register, + // Extra register info per MBB. + DenseMap MBBInfoMap; + + bool isReserved(Register Reg) { + return Reg.id() < regsReserved.size() && regsReserved.test(Reg.id()); + } + + bool isAllocatable(Register Reg) const { + return Reg.id() < TRI->getNumRegs() && TRI->isInAllocatableClass(Reg) && + !regsReserved.test(Reg.id()); + } + + // Analysis information if available + LiveVariables *LiveVars = nullptr; + LiveIntervals *LiveInts = nullptr; + LiveStacks *LiveStks = nullptr; + SlotIndexes *Indexes = nullptr; + + // This is calculated only when trying to verify convergence control tokens. + // Similar to the LLVM IR verifier, we calculate this locally instead of + // relying on the pass manager. + MachineDominatorTree DT; + + void visitMachineFunctionBefore(); + void visitMachineBasicBlockBefore(const MachineBasicBlock *MBB); + void visitMachineBundleBefore(const MachineInstr *MI); + + /// Verify that all of \p MI's virtual register operands are scalars. + /// \returns True if all virtual register operands are scalar. False + /// otherwise. 
+ bool verifyAllRegOpsScalar(const MachineInstr &MI, + const MachineRegisterInfo &MRI); + bool verifyVectorElementMatch(LLT Ty0, LLT Ty1, const MachineInstr *MI); + + bool verifyGIntrinsicSideEffects(const MachineInstr *MI); + bool verifyGIntrinsicConvergence(const MachineInstr *MI); + void verifyPreISelGenericInstruction(const MachineInstr *MI); + + void visitMachineInstrBefore(const MachineInstr *MI); + void visitMachineOperand(const MachineOperand *MO, unsigned MONum); + void visitMachineBundleAfter(const MachineInstr *MI); + void visitMachineBasicBlockAfter(const MachineBasicBlock *MBB); + void visitMachineFunctionAfter(); + + void report(const char *msg, const MachineFunction *MF); + void report(const char *msg, const MachineBasicBlock *MBB); + void report(const char *msg, const MachineInstr *MI); + void report(const char *msg, const MachineOperand *MO, unsigned MONum, + LLT MOVRegType = LLT{}); + void report(const Twine &Msg, const MachineInstr *MI); + + void report_context(const LiveInterval &LI) const; + void report_context(const LiveRange &LR, Register VRegUnit, + LaneBitmask LaneMask) const; + void report_context(const LiveRange::Segment &S) const; + void report_context(const VNInfo &VNI) const; + void report_context(SlotIndex Pos) const; + void report_context(MCPhysReg PhysReg) const; + void report_context_liverange(const LiveRange &LR) const; + void report_context_lanemask(LaneBitmask LaneMask) const; + void report_context_vreg(Register VReg) const; + void report_context_vreg_regunit(Register VRegOrUnit) const; + + void verifyInlineAsm(const MachineInstr *MI); + + void checkLiveness(const MachineOperand *MO, unsigned MONum); + void checkLivenessAtUse(const MachineOperand *MO, unsigned MONum, + SlotIndex UseIdx, const LiveRange &LR, + Register VRegOrUnit, + LaneBitmask LaneMask = LaneBitmask::getNone()); + void checkLivenessAtDef(const MachineOperand *MO, unsigned MONum, + SlotIndex DefIdx, const LiveRange &LR, + Register VRegOrUnit, bool SubRangeCheck = false, + LaneBitmask LaneMask = LaneBitmask::getNone()); + + void markReachable(const MachineBasicBlock *MBB); + void calcRegsPassed(); + void checkPHIOps(const MachineBasicBlock &MBB); + + void calcRegsRequired(); + void verifyLiveVariables(); + void verifyLiveIntervals(); + void verifyLiveInterval(const LiveInterval &); + void verifyLiveRangeValue(const LiveRange &, const VNInfo *, Register, + LaneBitmask); + void verifyLiveRangeSegment(const LiveRange &, + const LiveRange::const_iterator I, Register, LaneBitmask); - void verifyLiveRangeSegment(const LiveRange &, - const LiveRange::const_iterator I, Register, - LaneBitmask); - void verifyLiveRange(const LiveRange &, Register, - LaneBitmask LaneMask = LaneBitmask::getNone()); + void verifyLiveRange(const LiveRange &, Register, + LaneBitmask LaneMask = LaneBitmask::getNone()); - void verifyStackFrame(); + void verifyStackFrame(); - void verifySlotIndexes() const; - void verifyProperties(const MachineFunction &MF); - }; - - struct MachineVerifierLegacyPass : public MachineFunctionPass { - static char ID; // Pass ID, replacement for typeid + void verifySlotIndexes() const; + void verifyProperties(const MachineFunction &MF); +}; - const std::string Banner; +struct MachineVerifierLegacyPass : public MachineFunctionPass { + static char ID; // Pass ID, replacement for typeid - MachineVerifierLegacyPass(std::string banner = std::string()) - : MachineFunctionPass(ID), Banner(std::move(banner)) { - initializeMachineVerifierLegacyPassPass(*PassRegistry::getPassRegistry()); - } + const 
std::string Banner; - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addUsedIfAvailable(); - AU.addUsedIfAvailable(); - AU.addUsedIfAvailable(); - AU.addUsedIfAvailable(); - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); - } + MachineVerifierLegacyPass(std::string banner = std::string()) + : MachineFunctionPass(ID), Banner(std::move(banner)) { + initializeMachineVerifierLegacyPassPass(*PassRegistry::getPassRegistry()); + } - bool runOnMachineFunction(MachineFunction &MF) override { - // Skip functions that have known verification problems. - // FIXME: Remove this mechanism when all problematic passes have been - // fixed. - if (MF.getProperties().hasProperty( - MachineFunctionProperties::Property::FailsVerification)) - return false; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addUsedIfAvailable(); + AU.addUsedIfAvailable(); + AU.addUsedIfAvailable(); + AU.addUsedIfAvailable(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } - unsigned FoundErrors = - MachineVerifier(this, Banner.c_str(), &errs()).verify(MF); - if (FoundErrors) - report_fatal_error("Found "+Twine(FoundErrors)+" machine code errors."); + bool runOnMachineFunction(MachineFunction &MF) override { + // Skip functions that have known verification problems. + // FIXME: Remove this mechanism when all problematic passes have been + // fixed. + if (MF.getProperties().hasProperty( + MachineFunctionProperties::Property::FailsVerification)) return false; - } - }; + + unsigned FoundErrors = + MachineVerifier(this, Banner.c_str(), &errs()).verify(MF); + if (FoundErrors) + report_fatal_error("Found " + Twine(FoundErrors) + + " machine code errors."); + return false; + } +}; } // end anonymous namespace @@ -3846,18 +3847,18 @@ namespace { // integer, we can't tell whether it is a FrameSetup or FrameDestroy if the // value is zero. // We use a bool plus an integer to capture the stack state. - struct StackStateOfBB { - StackStateOfBB() = default; - StackStateOfBB(int EntryVal, int ExitVal, bool EntrySetup, bool ExitSetup) : - EntryValue(EntryVal), ExitValue(ExitVal), EntryIsSetup(EntrySetup), - ExitIsSetup(ExitSetup) {} - - // Can be negative, which means we are setting up a frame. - int EntryValue = 0; - int ExitValue = 0; - bool EntryIsSetup = false; - bool ExitIsSetup = false; - }; +struct StackStateOfBB { + StackStateOfBB() = default; + StackStateOfBB(int EntryVal, int ExitVal, bool EntrySetup, bool ExitSetup) + : EntryValue(EntryVal), ExitValue(ExitVal), EntryIsSetup(EntrySetup), + ExitIsSetup(ExitSetup) {} + + // Can be negative, which means we are setting up a frame. + int EntryValue = 0; + int ExitValue = 0; + bool EntryIsSetup = false; + bool ExitIsSetup = false; +}; } // end anonymous namespace From 774893dcd929c370bad714a70a7d670bb2d6f649 Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Wed, 9 Oct 2024 10:40:49 -0500 Subject: [PATCH 036/119] [mlir][ROCDL] Plumb through AMDGPU memory access metadata (#110916) The LLVM backend has moved from function-wide attributes for making assurances about potentially unsafe atomic operations (like "unsafe-fp-atomics") to metadata on individual atomic operations. This commit adds support for generating this metadata from MLIR. 
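For illustration, here is a minimal MLIR sketch of the intended usage. It mirrors the rocdl.mlir test added below; the function name @atomic_with_amdgpu_metadata is a placeholder, while the ops and the rocdl.* unit attributes are the ones introduced by this patch:

```mlir
llvm.func @atomic_with_amdgpu_metadata(%ptr: !llvm.ptr<1>, %data: f32) {
  // Each unit attribute below is translated to an empty !amdgpu.* metadata
  // node on the generated atomicrmw instruction.
  llvm.atomicrmw fadd %ptr, %data monotonic {
    rocdl.ignore_denormal_mode,
    rocdl.no_fine_grained_memory,
    rocdl.no_remote_memory} : !llvm.ptr<1>, f32
  llvm.return
}
```

When exported to LLVM IR, the attributes become !amdgpu.ignore.denormal.mode, !amdgpu.no.fine.grained.memory and !amdgpu.no.remote.memory metadata on the instruction, and rocdl.last_use likewise becomes !amdgpu.last.use on loads, as checked by the updated test.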
--------- Co-authored-by: Quinn Dawkins --- .../mlir/Dialect/LLVMIR/LLVMAttrDefs.td | 1 + mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 9 +++++-- .../ROCDL/ROCDLToLLVMIRTranslation.cpp | 27 ++++++++++++++++++- mlir/test/Target/LLVMIR/rocdl.mlir | 23 ++++++++++++++++ 4 files changed, 57 insertions(+), 3 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td index 80c22a357287b..c298c8277eb0c 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td @@ -1071,6 +1071,7 @@ def LLVM_ConstantRangeAttr : LLVM_Attr<"ConstantRange", "constant_range"> { Syntax: ``` `<` `i`(width($lower)) $lower `,` $upper `>` + ``` }]; let builders = [ diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index aae2cf88ded04..b80d9ae88910c 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -58,7 +58,12 @@ def ROCDL_Dialect : Dialect { "::mlir::StringAttr":$flat_work_group_size, "::mlir::IntegerAttr":$max_flat_work_group_size, "::mlir::IntegerAttr":$waves_per_eu, - "::mlir::BoolAttr":$unsafe_fp_atomics + "::mlir::BoolAttr":$unsafe_fp_atomics, + // Correspond to LLVM metadata of the same name + "::mlir::UnitAttr":$last_use, + "::mlir::UnitAttr":$no_remote_memory, + "::mlir::UnitAttr":$no_fine_grained_memory, + "::mlir::UnitAttr":$ignore_denormal_mode ); let useDefaultAttributePrinterParser = 1; @@ -88,7 +93,7 @@ class ROCDL_IntrPure1Op : class ROCDL_IntrOp overloadedResults, list overloadedOperands, list traits, int numResults, - int requiresAccessGroup = 0, int requiresAliasAnalysis = 0, list immArgPositions = [], + int requiresAccessGroup = 0, int requiresAliasAnalysis = 0, list immArgPositions = [], list immArgAttrNames = []> : LLVM_IntrOpBase(attribute.getNameDialect()); + llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext(); if (dialect->getKernelAttrHelper().getName() == attribute.getName()) { auto func = dyn_cast(op); if (!func) @@ -198,7 +199,6 @@ class ROCDLDialectLLVMIRTranslationInterface if (!value) return op->emitOpError(Twine(attribute.getName()) + " must be a dense i32 array attribute"); - llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext(); SmallVector metadata; llvm::Type *i32 = llvm::IntegerType::get(llvmContext, 32); for (int32_t i : value.asArrayRef()) { @@ -210,6 +210,31 @@ class ROCDLDialectLLVMIRTranslationInterface llvm::MDNode *node = llvm::MDNode::get(llvmContext, metadata); llvmFunc->setMetadata("reqd_work_group_size", node); } + + // Atomic and nontemporal metadata + if (dialect->getLastUseAttrHelper().getName() == attribute.getName()) { + for (llvm::Instruction *i : instructions) + i->setMetadata("amdgpu.last.use", llvm::MDNode::get(llvmContext, {})); + } + if (dialect->getNoRemoteMemoryAttrHelper().getName() == + attribute.getName()) { + for (llvm::Instruction *i : instructions) + i->setMetadata("amdgpu.no.remote.memory", + llvm::MDNode::get(llvmContext, {})); + } + if (dialect->getNoFineGrainedMemoryAttrHelper().getName() == + attribute.getName()) { + for (llvm::Instruction *i : instructions) + i->setMetadata("amdgpu.no.fine.grained.memory", + llvm::MDNode::get(llvmContext, {})); + } + if (dialect->getIgnoreDenormalModeAttrHelper().getName() == + attribute.getName()) { + for (llvm::Instruction *i : instructions) + i->setMetadata("amdgpu.ignore.denormal.mode", + llvm::MDNode::get(llvmContext, {})); + } + return 
success(); } }; diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 08c2d4e647797..97276b087b7e9 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -564,11 +564,34 @@ llvm.func @rocdl_8bit_floats(%source: i32, %stoch: i32) -> i32 { } llvm.func @rocdl_16bit_packed_floats(%sourceA: f32, %sourceB: f32) -> vector<2xf16> { + // CHECK-LABEL: @rocdl_16bit_packed_floats // CHECK: call <2 x half> @llvm.amdgcn.cvt.pkrtz(float {{.*}}, float {{.*}}) %source = rocdl.cvt.pkrtz %sourceA, %sourceB : vector<2xf16> llvm.return %source : vector<2xf16> } +llvm.func @rocdl_atomic_attrs(%ptr: !llvm.ptr<1>, %data: f32) { + // CHECK-LABEL: @rocdl_atomic_attrs + // CHECK: atomicrmw + // CHECK-SAME: !amdgpu.ignore.denormal.mode + // CHECK-SAME: !amdgpu.no.fine.grained.memory + // CHECK-SAME: !amdgpu.no.remote.memory + llvm.atomicrmw fadd %ptr, %data monotonic { + rocdl.ignore_denormal_mode, + rocdl.no_fine_grained_memory, + rocdl.no_remote_memory} : !llvm.ptr<1>, f32 + llvm.return +} + +llvm.func @rocdl_last_use(%ptr: !llvm.ptr<1>) -> i32 { + // CHECK-LABEL: @rocdl_last_use + // CHECK: %[[ret:.+]] = load + // CHECK-SAME: !amdgpu.last.use + // CHECK: ret i32 %[[ret]] + %ret = llvm.load %ptr {rocdl.last_use} : !llvm.ptr<1> -> i32 + llvm.return %ret : i32 +} + // CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="true" } // CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024" // CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128" From 18952bdcd6f987620e6396261c2bb444e428e07e Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Wed, 9 Oct 2024 12:18:59 -0400 Subject: [PATCH 037/119] [gn build] Fix up win/x86 flags and add stage2_unix_x86 (#111595) --- llvm/utils/gn/build/BUILD.gn | 8 ++++++-- llvm/utils/gn/build/toolchain/BUILD.gn | 7 +++++++ llvm/utils/gn/build/toolchain/target_flags.gni | 9 +++++++-- llvm/utils/gn/secondary/compiler-rt/BUILD.gn | 1 + llvm/utils/gn/secondary/compiler-rt/lib/BUILD.gn | 2 +- llvm/utils/gn/secondary/compiler-rt/test/test.gni | 2 +- 6 files changed, 23 insertions(+), 6 deletions(-) diff --git a/llvm/utils/gn/build/BUILD.gn b/llvm/utils/gn/build/BUILD.gn index 0b0f62721d374..9b5254e4c9f0b 100644 --- a/llvm/utils/gn/build/BUILD.gn +++ b/llvm/utils/gn/build/BUILD.gn @@ -61,7 +61,7 @@ config("compiler_defaults") { } asmflags = target_flags - cflags = target_flags + cflags = target_flags + target_cflags cflags_cc = [] ldflags = target_flags + target_ldflags @@ -343,7 +343,11 @@ config("compiler_defaults") { ldflags += [ "/winsysroot:" + rebase_path(sysroot, root_build_dir) ] # FIXME: Remove once PR54409 is fixed. 
- ldflags += [ "/machine:x64" ] + if (current_cpu == "x64") { + ldflags += [ "/machine:x64" ] + } else if (current_cpu == "x86") { + ldflags += [ "/machine:x86" ] + } } } else if (current_os != "ios" && current_os != "mac" && current_os != "android") { diff --git a/llvm/utils/gn/build/toolchain/BUILD.gn b/llvm/utils/gn/build/toolchain/BUILD.gn index e4852549bba6c..ce2e6df1b69f5 100644 --- a/llvm/utils/gn/build/toolchain/BUILD.gn +++ b/llvm/utils/gn/build/toolchain/BUILD.gn @@ -209,6 +209,13 @@ stage2_unix_toolchain("stage2_unix") { } } +stage2_unix_toolchain("stage2_unix_x86") { + toolchain_args = { + current_os = host_os + current_cpu = "x86" + } +} + if (android_ndk_path != "") { # Android compiler-rt libraries don't really work with per-target runtime # directories yet so force it off. diff --git a/llvm/utils/gn/build/toolchain/target_flags.gni b/llvm/utils/gn/build/toolchain/target_flags.gni index 50d31a3da85fc..cdfab75ed8bcd 100644 --- a/llvm/utils/gn/build/toolchain/target_flags.gni +++ b/llvm/utils/gn/build/toolchain/target_flags.gni @@ -8,6 +8,7 @@ import("//llvm/utils/gn/build/toolchain/compiler.gni") # COMPILER_RT_TEST_COMPILER_CFLAGS). target_flags = [] +target_cflags = [] target_ldflags = [] if (current_os == "android") { @@ -55,6 +56,10 @@ if (current_os == "android") { target_flags += [ "--target=$llvm_current_triple" ] } -if (current_cpu == "x86" && current_os != "win") { - target_flags += [ "-m32" ] +if (current_cpu == "x86") { + if (current_os == "win") { + target_cflags += [ "-m32" ] + } else { + target_flags += [ "-m32" ] + } } diff --git a/llvm/utils/gn/secondary/compiler-rt/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/BUILD.gn index b31016108419e..c8d3917ccde31 100644 --- a/llvm/utils/gn/secondary/compiler-rt/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/BUILD.gn @@ -12,6 +12,7 @@ if (current_os == "win" || win_sysroot != "") { } if (current_os != "win") { supported_toolchains += [ "//llvm/utils/gn/build/toolchain:stage2_unix" ] + supported_toolchains += [ "//llvm/utils/gn/build/toolchain:stage2_unix_x86" ] } supported_toolchains += supported_android_toolchains if (llvm_build_AArch64) { diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/BUILD.gn index 02c20483b7a8b..57b86f53254f5 100644 --- a/llvm/utils/gn/secondary/compiler-rt/lib/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/lib/BUILD.gn @@ -4,7 +4,7 @@ group("lib") { "//compiler-rt/lib/builtins", "//compiler-rt/lib/cfi:ignorelist($host_toolchain)", ] - if (current_os == "linux") { + if (current_os == "linux" && current_cpu == "x64") { deps += [ "//compiler-rt/lib/msan" ] } if (current_os == "linux" || current_os == "android") { diff --git a/llvm/utils/gn/secondary/compiler-rt/test/test.gni b/llvm/utils/gn/secondary/compiler-rt/test/test.gni index 2d1aa0721248e..01de4ee09256c 100644 --- a/llvm/utils/gn/secondary/compiler-rt/test/test.gni +++ b/llvm/utils/gn/secondary/compiler-rt/test/test.gni @@ -10,7 +10,7 @@ declare_args() { target_flags_string = "" -foreach(flag, target_flags + target_ldflags) { +foreach(flag, target_flags + target_cflags + target_ldflags) { if (target_flags_string != "") { target_flags_string += " " } From 2e47b93fd29ad6ef13a4134f3b0be3c42e91180c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 9 Oct 2024 18:34:17 +0200 Subject: [PATCH 038/119] [ARM] Honour -mno-movt in stack protector handling (#109022) When -mno-movt is passed to Clang, the ARM codegen correctly avoids movt/movw pairs to take the address of 
__stack_chk_guard in the stack protector code emitted into the function pro- and epilogues. However, the Thumb2 codegen fails to do so, and happily emits movw/movt pairs unless it is generating an ELF binary and the symbol might be in a different DSO. Let's incorporate a check for useMovt() in the logic here, so movt/movw are never emitted when -mno-movt is specified. Suggestions welcome for how/where to add a test case for this. Signed-off-by: Ard Biesheuvel --- llvm/lib/Target/ARM/Thumb2InstrInfo.cpp | 5 +++- llvm/test/CodeGen/ARM/stack-guard-nomovt.ll | 32 +++++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/ARM/stack-guard-nomovt.ll diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index d1e07b6703a5e..27f86389a3856 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -264,8 +264,11 @@ void Thumb2InstrInfo::expandLoadStackGuard( } const auto *GV = cast((*MI->memoperands_begin())->getValue()); - if (MF.getSubtarget().isTargetELF() && !GV->isDSOLocal()) + const ARMSubtarget &Subtarget = MF.getSubtarget(); + if (Subtarget.isTargetELF() && !GV->isDSOLocal()) expandLoadStackGuardBase(MI, ARM::t2LDRLIT_ga_pcrel, ARM::t2LDRi12); + else if (!Subtarget.useMovt()) + expandLoadStackGuardBase(MI, ARM::tLDRLIT_ga_abs, ARM::t2LDRi12); else if (MF.getTarget().isPositionIndependent()) expandLoadStackGuardBase(MI, ARM::t2MOV_ga_pcrel, ARM::t2LDRi12); else diff --git a/llvm/test/CodeGen/ARM/stack-guard-nomovt.ll b/llvm/test/CodeGen/ARM/stack-guard-nomovt.ll new file mode 100644 index 0000000000000..6802dabfda87a --- /dev/null +++ b/llvm/test/CodeGen/ARM/stack-guard-nomovt.ll @@ -0,0 +1,32 @@ +; RUN: llc -relocation-model=static -mattr=+no-movt < %s | FileCheck %s + +target triple = "thumbv7a-linux-gnueabi" + +define i32 @test1() #0 { +; CHECK-LABEL: test1: +; CHECK: @ %bb.0: +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: sub.w sp, sp, #1032 +; CHECK-NEXT: ldr r0, .LCPI0_0 +; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: str.w r0, [sp, #1028] +; CHECK-NEXT: add r0, sp, #4 +; CHECK-NEXT: bl foo +; CHECK-NEXT: ldr.w r0, [sp, #1028] +; CHECK-NEXT: ldr r1, .LCPI0_0 +; CHECK-NEXT: ldr r1, [r1] +; CHECK-NEXT: cmp r1, r0 +; CHECK-NEXT: ittt eq +; CHECK-NEXT: moveq r0, #0 +; CHECK-NEXT: addeq.w sp, sp, #1032 +; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: .LBB0_1: +; CHECK-NEXT: bl __stack_chk_fail + %a1 = alloca [256 x i32], align 4 + call void @foo(ptr %a1) #3 + ret i32 0 +} + +declare void @foo(ptr) + +attributes #0 = { nounwind sspstrong } From cf5bbeb533d49fd3f3c174af2239188e2d47b7db Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Wed, 9 Oct 2024 16:33:47 +0000 Subject: [PATCH 039/119] [gn build] Remove unix x86 stage2 toolchain It's breaking the bots, e.g. 
http://45.33.8.238/linux/149792/step_3.txt --- llvm/utils/gn/secondary/compiler-rt/BUILD.gn | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/utils/gn/secondary/compiler-rt/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/BUILD.gn index c8d3917ccde31..b31016108419e 100644 --- a/llvm/utils/gn/secondary/compiler-rt/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/BUILD.gn @@ -12,7 +12,6 @@ if (current_os == "win" || win_sysroot != "") { } if (current_os != "win") { supported_toolchains += [ "//llvm/utils/gn/build/toolchain:stage2_unix" ] - supported_toolchains += [ "//llvm/utils/gn/build/toolchain:stage2_unix_x86" ] } supported_toolchains += supported_android_toolchains if (llvm_build_AArch64) { From 1553cb5d3b14a0516c2796c295a3b32d147d13d0 Mon Sep 17 00:00:00 2001 From: George Burgess IV Date: Wed, 9 Oct 2024 10:40:53 -0600 Subject: [PATCH 040/119] [Sema] Support negation/parens with __builtin_available (#111439) At present, `__builtin_available` is really restrictive with its use. Overall, this seems like a good thing, since the analyses behind it are not very expensive. That said, it's very straightforward to support these two cases: ``` if ((__builtin_available(foo, *))) { // ... } ``` and ``` if (!__builtin_available(foo, *)) { // ... } else { // ... } ``` Seems nice to do so. --- clang/lib/Sema/SemaAvailability.cpp | 51 ++++++++++++++++++++++------- clang/test/Sema/attr-availability.c | 25 +++++++++++++- 2 files changed, 64 insertions(+), 12 deletions(-) diff --git a/clang/lib/Sema/SemaAvailability.cpp b/clang/lib/Sema/SemaAvailability.cpp index e04cbeec16555..798cabaa31a47 100644 --- a/clang/lib/Sema/SemaAvailability.cpp +++ b/clang/lib/Sema/SemaAvailability.cpp @@ -1005,25 +1005,54 @@ bool DiagnoseUnguardedAvailability::VisitTypeLoc(TypeLoc Ty) { return true; } +struct ExtractedAvailabilityExpr { + const ObjCAvailabilityCheckExpr *E = nullptr; + bool isNegated = false; +}; + +ExtractedAvailabilityExpr extractAvailabilityExpr(const Expr *IfCond) { + const auto *E = IfCond; + bool IsNegated = false; + while (true) { + E = E->IgnoreParens(); + if (const auto *AE = dyn_cast(E)) { + return ExtractedAvailabilityExpr{AE, IsNegated}; + } + + const auto *UO = dyn_cast(E); + if (!UO || UO->getOpcode() != UO_LNot) { + return ExtractedAvailabilityExpr{}; + } + E = UO->getSubExpr(); + IsNegated = !IsNegated; + } +} + bool DiagnoseUnguardedAvailability::TraverseIfStmt(IfStmt *If) { - VersionTuple CondVersion; - if (auto *E = dyn_cast(If->getCond())) { - CondVersion = E->getVersion(); - - // If we're using the '*' case here or if this check is redundant, then we - // use the enclosing version to check both branches. - if (CondVersion.empty() || CondVersion <= AvailabilityStack.back()) - return TraverseStmt(If->getThen()) && TraverseStmt(If->getElse()); - } else { + ExtractedAvailabilityExpr IfCond = extractAvailabilityExpr(If->getCond()); + if (!IfCond.E) { // This isn't an availability checking 'if', we can just continue. return Base::TraverseIfStmt(If); } + VersionTuple CondVersion = IfCond.E->getVersion(); + // If we're using the '*' case here or if this check is redundant, then we + // use the enclosing version to check both branches. 
+ if (CondVersion.empty() || CondVersion <= AvailabilityStack.back()) { + return TraverseStmt(If->getThen()) && TraverseStmt(If->getElse()); + } + + auto *Guarded = If->getThen(); + auto *Unguarded = If->getElse(); + if (IfCond.isNegated) { + std::swap(Guarded, Unguarded); + } + AvailabilityStack.push_back(CondVersion); - bool ShouldContinue = TraverseStmt(If->getThen()); + bool ShouldContinue = TraverseStmt(Guarded); AvailabilityStack.pop_back(); - return ShouldContinue && TraverseStmt(If->getElse()); + return ShouldContinue && TraverseStmt(Unguarded); } } // end anonymous namespace diff --git a/clang/test/Sema/attr-availability.c b/clang/test/Sema/attr-availability.c index a5cc602a8fa9d..a496c5271f2a3 100644 --- a/clang/test/Sema/attr-availability.c +++ b/clang/test/Sema/attr-availability.c @@ -40,7 +40,7 @@ void test_10095131(void) { #ifdef WARN_PARTIAL // FIXME: This note should point to the declaration with the availability // attribute. -// expected-note@+2 {{'PartiallyAvailable' has been marked as being introduced in macOS 10.8 here, but the deployment target is macOS 10.5}} +// expected-note@+2 5 {{'PartiallyAvailable' has been marked as being introduced in macOS 10.8 here, but the deployment target is macOS 10.5}} #endif extern void PartiallyAvailable(void) ; void with_redeclaration(void) { @@ -53,6 +53,29 @@ void with_redeclaration(void) { enum PartialEnum p = kPartialEnumConstant; } +#ifdef WARN_PARTIAL +void conditional_warnings() { + if (__builtin_available(macos 10.8, *)) { + PartiallyAvailable(); + } else { + PartiallyAvailable(); // expected-warning {{only available on macOS 10.8 or newer}} expected-note {{enclose 'PartiallyAvailable'}} + } + if (!__builtin_available(macos 10.8, *)) { + PartiallyAvailable(); // expected-warning {{only available on macOS 10.8 or newer}} expected-note {{enclose 'PartiallyAvailable'}} + } else { + PartiallyAvailable(); + } + if (!!!(!__builtin_available(macos 10.8, *))) { + PartiallyAvailable(); + } else { + PartiallyAvailable(); // expected-warning {{only available on macOS 10.8 or newer}} expected-note {{enclose 'PartiallyAvailable'}} + } + if (~__builtin_available(macos 10.8, *)) { // expected-warning {{does not guard availability here}} + PartiallyAvailable(); // expected-warning {{only available on macOS 10.8 or newer}} expected-note {{enclose 'PartiallyAvailable'}} + } +} +#endif + __attribute__((availability(macos, unavailable))) // expected-warning {{attribute 'availability' is ignored}} enum { NSDataWritingFileProtectionWriteOnly = 0x30000000, From 17bc959961aa5792821516b547100316fc886ab4 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Wed, 9 Oct 2024 09:54:11 -0700 Subject: [PATCH 041/119] [AMDGPU] Optionally Use GCNRPTrackers during scheduling (#93090) This adds the ability to use the GCNRPTrackers during scheduling. These trackers have several advantages over the generic trackers: 1. global live-thru trackers, 2. subregister based RP deltas, and 3. flexible vreg -> PressureSet mappings. This feature is off-by-default to ease with the roll-out process. In particular, when using the optional trackers, the scheduler will still maintain the generic trackers leading to unnecessary compile time. 
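For orientation, a rough C++ sketch (not part of this patch) of how the optional trackers are consulted when scoring a scheduling candidate, following the logic added to getRegisterPressures() in GCNSchedStrategy.cpp below; the free function name speculatePressure is a placeholder:

```cpp
#include "GCNRegPressure.h"           // GCNDownwardRPTracker, GCNUpwardRPTracker
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/ScheduleDAG.h" // SUnit

using namespace llvm;

// Speculate the register pressure after scheduling SU at the current
// boundary, without mutating the trackers owned by the strategy.
static GCNRegPressure speculatePressure(bool AtTop, SUnit *SU,
                                        const GCNDownwardRPTracker &Down,
                                        const GCNUpwardRPTracker &Up,
                                        const SIRegisterInfo *SRI) {
  MachineInstr *MI = SU->getInstr();
  if (AtTop) {
    // Top-down: bump a copy of the downward tracker across MI.
    GCNDownwardRPTracker Tmp(Down);
    return Tmp.bumpDownwardPressure(MI, SRI);
  }
  // Bottom-up: recede a copy of the upward tracker over MI.
  GCNUpwardRPTracker Tmp(Up);
  Tmp.recede(*MI);
  return Tmp.getPressure();
}
```

The real trackers are only advanced in schedNode() once a node has actually been picked, and the whole path is gated behind the new off-by-default -amdgpu-use-amdgpu-trackers option (the tests below exercise it via llc ... -amdgpu-use-amdgpu-trackers=1).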
--- .../Target/AMDGPU/GCNIterativeScheduler.cpp | 2 +- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 199 +++++- llvm/lib/Target/AMDGPU/GCNRegPressure.h | 96 ++- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 148 +++- llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 60 +- .../CodeGen/AMDGPU/high-RP-reschedule.mir | 10 +- llvm/test/CodeGen/AMDGPU/pr51516.mir | 6 +- .../schedule-amdgpu-tracker-physreg-crash.ll | 65 ++ .../AMDGPU/schedule-amdgpu-tracker-physreg.ll | 491 +++++++++++++ .../AMDGPU/schedule-amdgpu-trackers.ll | 647 ++++++++++++++++++ ...schedule-regpressure-ilp-metric-spills.mir | 17 +- .../AMDGPU/schedule-relaxed-occupancy.ll | 10 +- 12 files changed, 1672 insertions(+), 79 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index 13504508e2fb2..da065e8d8cb6b 100644 --- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -480,7 +480,7 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, " "target occupancy = " << TgtOcc << '\n'); - GCNMaxOccupancySchedStrategy LStrgy(Context); + GCNMaxOccupancySchedStrategy LStrgy(Context, /*IsLegacyScheduler=*/true); unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy()); for (int I = 0; I < NumPasses; ++I) { diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 7c633b2bce7bc..d46c4cf23a221 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -296,6 +296,63 @@ collectVirtualRegUses(SmallVectorImpl &RegMaskPairs, } } +/// Mostly copy/paste from CodeGen/RegisterPressure.cpp +static LaneBitmask getLanesWithProperty( + const LiveIntervals &LIS, const MachineRegisterInfo &MRI, + bool TrackLaneMasks, Register RegUnit, SlotIndex Pos, + LaneBitmask SafeDefault, + function_ref Property) { + if (RegUnit.isVirtual()) { + const LiveInterval &LI = LIS.getInterval(RegUnit); + LaneBitmask Result; + if (TrackLaneMasks && LI.hasSubRanges()) { + for (const LiveInterval::SubRange &SR : LI.subranges()) { + if (Property(SR, Pos)) + Result |= SR.LaneMask; + } + } else if (Property(LI, Pos)) { + Result = TrackLaneMasks ? MRI.getMaxLaneMaskForVReg(RegUnit) + : LaneBitmask::getAll(); + } + + return Result; + } + + const LiveRange *LR = LIS.getCachedRegUnit(RegUnit); + if (LR == nullptr) + return SafeDefault; + return Property(*LR, Pos) ? LaneBitmask::getAll() : LaneBitmask::getNone(); +} + +/// Mostly copy/paste from CodeGen/RegisterPressure.cpp +/// Helper to find a vreg use between two indices {PriorUseIdx, NextUseIdx}. +/// The query starts with a lane bitmask which gets lanes/bits removed for every +/// use we find. +static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask, + SlotIndex PriorUseIdx, SlotIndex NextUseIdx, + const MachineRegisterInfo &MRI, + const SIRegisterInfo *TRI, + const LiveIntervals *LIS, + bool Upward = false) { + for (const MachineOperand &MO : MRI.use_nodbg_operands(Reg)) { + if (MO.isUndef()) + continue; + const MachineInstr *MI = MO.getParent(); + SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot(); + bool InRange = Upward ? 
(InstSlot > PriorUseIdx && InstSlot <= NextUseIdx) + : (InstSlot >= PriorUseIdx && InstSlot < NextUseIdx); + if (!InRange) + continue; + + unsigned SubRegIdx = MO.getSubReg(); + LaneBitmask UseMask = TRI->getSubRegIndexLaneMask(SubRegIdx); + LastUseMask &= ~UseMask; + if (LastUseMask.none()) + return LaneBitmask::getNone(); + } + return LastUseMask; +} + /////////////////////////////////////////////////////////////////////////////// // GCNRPTracker @@ -354,17 +411,28 @@ void GCNRPTracker::reset(const MachineInstr &MI, MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs); } -//////////////////////////////////////////////////////////////////////////////// -// GCNUpwardRPTracker - -void GCNUpwardRPTracker::reset(const MachineRegisterInfo &MRI_, - const LiveRegSet &LiveRegs_) { +void GCNRPTracker::reset(const MachineRegisterInfo &MRI_, + const LiveRegSet &LiveRegs_) { MRI = &MRI_; LiveRegs = LiveRegs_; LastTrackedMI = nullptr; MaxPressure = CurPressure = getRegPressure(MRI_, LiveRegs_); } +/// Mostly copy/paste from CodeGen/RegisterPressure.cpp +LaneBitmask GCNRPTracker::getLastUsedLanes(Register RegUnit, + SlotIndex Pos) const { + return getLanesWithProperty( + LIS, *MRI, true, RegUnit, Pos.getBaseIndex(), LaneBitmask::getNone(), + [](const LiveRange &LR, SlotIndex Pos) { + const LiveRange::Segment *S = LR.getSegmentContaining(Pos); + return S != nullptr && S->end == Pos.getRegSlot(); + }); +} + +//////////////////////////////////////////////////////////////////////////////// +// GCNUpwardRPTracker + void GCNUpwardRPTracker::recede(const MachineInstr &MI) { assert(MRI && "call reset first"); @@ -441,25 +509,37 @@ bool GCNDownwardRPTracker::reset(const MachineInstr &MI, return true; } -bool GCNDownwardRPTracker::advanceBeforeNext() { +bool GCNDownwardRPTracker::advanceBeforeNext(MachineInstr *MI, + bool UseInternalIterator) { assert(MRI && "call reset first"); - if (!LastTrackedMI) - return NextMI == MBBEnd; - - assert(NextMI == MBBEnd || !NextMI->isDebugInstr()); + SlotIndex SI; + const MachineInstr *CurrMI; + if (UseInternalIterator) { + if (!LastTrackedMI) + return NextMI == MBBEnd; + + assert(NextMI == MBBEnd || !NextMI->isDebugInstr()); + CurrMI = LastTrackedMI; + + SI = NextMI == MBBEnd + ? LIS.getInstructionIndex(*LastTrackedMI).getDeadSlot() + : LIS.getInstructionIndex(*NextMI).getBaseIndex(); + } else { //! UseInternalIterator + SI = LIS.getInstructionIndex(*MI).getBaseIndex(); + CurrMI = MI; + } - SlotIndex SI = NextMI == MBBEnd - ? LIS.getInstructionIndex(*LastTrackedMI).getDeadSlot() - : LIS.getInstructionIndex(*NextMI).getBaseIndex(); assert(SI.isValid()); // Remove dead registers or mask bits. 
SmallSet SeenRegs; - for (auto &MO : LastTrackedMI->operands()) { + for (auto &MO : CurrMI->operands()) { if (!MO.isReg() || !MO.getReg().isVirtual()) continue; if (MO.isUse() && !MO.readsReg()) continue; + if (!UseInternalIterator && MO.isDef()) + continue; if (!SeenRegs.insert(MO.getReg()).second) continue; const LiveInterval &LI = LIS.getInterval(MO.getReg()); @@ -492,15 +572,22 @@ bool GCNDownwardRPTracker::advanceBeforeNext() { LastTrackedMI = nullptr; - return NextMI == MBBEnd; + return UseInternalIterator && (NextMI == MBBEnd); } -void GCNDownwardRPTracker::advanceToNext() { - LastTrackedMI = &*NextMI++; - NextMI = skipDebugInstructionsForward(NextMI, MBBEnd); +void GCNDownwardRPTracker::advanceToNext(MachineInstr *MI, + bool UseInternalIterator) { + if (UseInternalIterator) { + LastTrackedMI = &*NextMI++; + NextMI = skipDebugInstructionsForward(NextMI, MBBEnd); + } else { + LastTrackedMI = MI; + } + + const MachineInstr *CurrMI = LastTrackedMI; // Add new registers or mask bits. - for (const auto &MO : LastTrackedMI->all_defs()) { + for (const auto &MO : CurrMI->all_defs()) { Register Reg = MO.getReg(); if (!Reg.isVirtual()) continue; @@ -513,11 +600,16 @@ void GCNDownwardRPTracker::advanceToNext() { MaxPressure = max(MaxPressure, CurPressure); } -bool GCNDownwardRPTracker::advance() { - if (NextMI == MBBEnd) +bool GCNDownwardRPTracker::advance(MachineInstr *MI, bool UseInternalIterator) { + if (UseInternalIterator && NextMI == MBBEnd) return false; - advanceBeforeNext(); - advanceToNext(); + + advanceBeforeNext(MI, UseInternalIterator); + advanceToNext(MI, UseInternalIterator); + if (!UseInternalIterator) { + // We must remove any dead def lanes from the current RP + advanceBeforeNext(MI, true); + } return true; } @@ -559,6 +651,67 @@ Printable llvm::reportMismatch(const GCNRPTracker::LiveRegSet &LISLR, }); } +GCNRegPressure +GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI, + const SIRegisterInfo *TRI) const { + assert(!MI->isDebugOrPseudoInstr() && "Expect a nondebug instruction."); + + SlotIndex SlotIdx; + SlotIdx = LIS.getInstructionIndex(*MI).getRegSlot(); + + // Account for register pressure similar to RegPressureTracker::recede(). + RegisterOperands RegOpers; + RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/false); + RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx); + GCNRegPressure TempPressure = CurPressure; + + for (const RegisterMaskPair &Use : RegOpers.Uses) { + Register Reg = Use.RegUnit; + if (!Reg.isVirtual()) + continue; + LaneBitmask LastUseMask = getLastUsedLanes(Reg, SlotIdx); + if (LastUseMask.none()) + continue; + // The LastUseMask is queried from the liveness information of instruction + // which may be further down the schedule. Some lanes may actually not be + // last uses for the current position. + // FIXME: allow the caller to pass in the list of vreg uses that remain + // to be bottom-scheduled to avoid searching uses at each query. + SlotIndex CurrIdx; + const MachineBasicBlock *MBB = MI->getParent(); + MachineBasicBlock::const_iterator IdxPos = skipDebugInstructionsForward( + LastTrackedMI ? LastTrackedMI : MBB->begin(), MBB->end()); + if (IdxPos == MBB->end()) { + CurrIdx = LIS.getMBBEndIdx(MBB); + } else { + CurrIdx = LIS.getInstructionIndex(*IdxPos).getRegSlot(); + } + + LastUseMask = + findUseBetween(Reg, LastUseMask, CurrIdx, SlotIdx, *MRI, TRI, &LIS); + if (LastUseMask.none()) + continue; + + LaneBitmask LiveMask = + LiveRegs.contains(Reg) ? 
LiveRegs.at(Reg) : LaneBitmask(0); + LaneBitmask NewMask = LiveMask & ~LastUseMask; + TempPressure.inc(Reg, LiveMask, NewMask, *MRI); + } + + // Generate liveness for defs. + for (const RegisterMaskPair &Def : RegOpers.Defs) { + Register Reg = Def.RegUnit; + if (!Reg.isVirtual()) + continue; + LaneBitmask LiveMask = + LiveRegs.contains(Reg) ? LiveRegs.at(Reg) : LaneBitmask(0); + LaneBitmask NewMask = LiveMask | Def.LaneMask; + TempPressure.inc(Reg, LiveMask, NewMask, *MRI); + } + + return TempPressure; +} + bool GCNUpwardRPTracker::isValid() const { const auto &SI = LIS.getInstructionIndex(*LastTrackedMI).getBaseIndex(); const auto LISLR = llvm::getLiveRegs(SI, LIS, *MRI); diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index d419fcc802c60..06c3d9027db1b 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -19,6 +19,7 @@ #include "GCNSubtarget.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/RegisterPressure.h" #include namespace llvm { @@ -149,6 +150,9 @@ inline GCNRegPressure operator-(const GCNRegPressure &P1, return Diff; } +/////////////////////////////////////////////////////////////////////////////// +// GCNRPTracker + class GCNRPTracker { public: using LiveRegSet = DenseMap; @@ -165,7 +169,14 @@ class GCNRPTracker { void reset(const MachineInstr &MI, const LiveRegSet *LiveRegsCopy, bool After); + /// Mostly copy/paste from CodeGen/RegisterPressure.cpp + void bumpDeadDefs(ArrayRef DeadDefs); + + LaneBitmask getLastUsedLanes(Register RegUnit, SlotIndex Pos) const; + public: + // reset tracker and set live register set to the specified value. + void reset(const MachineRegisterInfo &MRI_, const LiveRegSet &LiveRegs_); // live regs for the current state const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; } const MachineInstr *getLastTrackedMI() const { return LastTrackedMI; } @@ -182,34 +193,38 @@ class GCNRPTracker { GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS, const MachineRegisterInfo &MRI); +//////////////////////////////////////////////////////////////////////////////// +// GCNUpwardRPTracker + class GCNUpwardRPTracker : public GCNRPTracker { public: GCNUpwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {} - // reset tracker and set live register set to the specified value. - void reset(const MachineRegisterInfo &MRI_, const LiveRegSet &LiveRegs_); + using GCNRPTracker::reset; - // reset tracker at the specified slot index. + /// reset tracker at the specified slot index \p SI. void reset(const MachineRegisterInfo &MRI, SlotIndex SI) { - reset(MRI, llvm::getLiveRegs(SI, LIS, MRI)); + GCNRPTracker::reset(MRI, llvm::getLiveRegs(SI, LIS, MRI)); } - // reset tracker to the end of the MBB. + /// reset tracker to the end of the \p MBB. void reset(const MachineBasicBlock &MBB) { reset(MBB.getParent()->getRegInfo(), LIS.getSlotIndexes()->getMBBEndIdx(&MBB)); } - // reset tracker to the point just after MI (in program order). + /// reset tracker to the point just after \p MI (in program order). void reset(const MachineInstr &MI) { reset(MI.getMF()->getRegInfo(), LIS.getInstructionIndex(MI).getDeadSlot()); } - // move to the state just before the MI (in program order). + /// Move to the state of RP just before the \p MI . If \p UseInternalIterator + /// is set, also update the internal iterators. Setting \p UseInternalIterator + /// to false allows for an externally managed iterator / program order. 
void recede(const MachineInstr &MI); - // checks whether the tracker's state after receding MI corresponds - // to reported by LIS. + /// \p returns whether the tracker's state after receding MI corresponds + /// to reported by LIS. bool isValid() const; const GCNRegPressure &getMaxPressure() const { return MaxPressure; } @@ -223,6 +238,9 @@ class GCNUpwardRPTracker : public GCNRPTracker { } }; +//////////////////////////////////////////////////////////////////////////////// +// GCNDownwardRPTracker + class GCNDownwardRPTracker : public GCNRPTracker { // Last position of reset or advanceBeforeNext MachineBasicBlock::const_iterator NextMI; @@ -232,37 +250,65 @@ class GCNDownwardRPTracker : public GCNRPTracker { public: GCNDownwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {} + using GCNRPTracker::reset; + MachineBasicBlock::const_iterator getNext() const { return NextMI; } - // Return MaxPressure and clear it. + /// \p return MaxPressure and clear it. GCNRegPressure moveMaxPressure() { auto Res = MaxPressure; MaxPressure.clear(); return Res; } - // Reset tracker to the point before the MI - // filling live regs upon this point using LIS. - // Returns false if block is empty except debug values. + /// Reset tracker to the point before the \p MI + /// filling \p LiveRegs upon this point using LIS. + /// \p returns false if block is empty except debug values. bool reset(const MachineInstr &MI, const LiveRegSet *LiveRegs = nullptr); - // Move to the state right before the next MI or after the end of MBB. - // Returns false if reached end of the block. - bool advanceBeforeNext(); - - // Move to the state at the MI, advanceBeforeNext has to be called first. - void advanceToNext(); - - // Move to the state at the next MI. Returns false if reached end of block. - bool advance(); - - // Advance instructions until before End. + /// Move to the state right before the next MI or after the end of MBB. + /// \p returns false if reached end of the block. + /// If \p UseInternalIterator is true, then internal iterators are used and + /// set to process in program order. If \p UseInternalIterator is false, then + /// it is assumed that the tracker is using an externally managed iterator, + /// and advance* calls will not update the state of the iterator. In such + /// cases, the tracker will move to the state right before the provided \p MI + /// and use LIS for RP calculations. + bool advanceBeforeNext(MachineInstr *MI = nullptr, + bool UseInternalIterator = true); + + /// Move to the state at the MI, advanceBeforeNext has to be called first. + /// If \p UseInternalIterator is true, then internal iterators are used and + /// set to process in program order. If \p UseInternalIterator is false, then + /// it is assumed that the tracker is using an externally managed iterator, + /// and advance* calls will not update the state of the iterator. In such + /// cases, the tracker will move to the state at the provided \p MI . + void advanceToNext(MachineInstr *MI = nullptr, + bool UseInternalIterator = true); + + /// Move to the state at the next MI. \p returns false if reached end of + /// block. If \p UseInternalIterator is true, then internal iterators are used + /// and set to process in program order. If \p UseInternalIterator is false, + /// then it is assumed that the tracker is using an externally managed + /// iterator, and advance* calls will not update the state of the iterator. 
In + /// such cases, the tracker will move to the state right before the provided + /// \p MI and use LIS for RP calculations. + bool advance(MachineInstr *MI = nullptr, bool UseInternalIterator = true); + + /// Advance instructions until before \p End. bool advance(MachineBasicBlock::const_iterator End); - // Reset to Begin and advance to End. + /// Reset to \p Begin and advance to \p End. bool advance(MachineBasicBlock::const_iterator Begin, MachineBasicBlock::const_iterator End, const LiveRegSet *LiveRegsCopy = nullptr); + + /// Mostly copy/paste from CodeGen/RegisterPressure.cpp + /// Calculate the impact \p MI will have on CurPressure and \return the + /// speculated pressure. In order to support RP Speculation, this does not + /// rely on the implicit program ordering in the LiveIntervals. + GCNRegPressure bumpDownwardPressure(const MachineInstr *MI, + const SIRegisterInfo *TRI) const; }; /// \returns the LaneMask of live lanes of \p Reg at position \p SI. Only the diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index d6958d9055fad..11c95675aeeaf 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -58,11 +58,17 @@ static cl::opt "Wave Limited (amdgpu-limit-wave-threshold)."), cl::init(false)); +static cl::opt GCNTrackers( + "amdgpu-use-amdgpu-trackers", cl::Hidden, + cl::desc("Use the AMDGPU specific RPTrackers during scheduling"), + cl::init(false)); + const unsigned ScheduleMetrics::ScaleFactor = 100; GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), - HasHighPressure(false) {} + DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) { +} void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) { GenericScheduler::initialize(DAG); @@ -148,17 +154,38 @@ static bool canUsePressureDiffs(const SUnit &SU) { return true; } -static void getRegisterPressures(bool AtTop, - const RegPressureTracker &RPTracker, SUnit *SU, - std::vector &Pressure, - std::vector &MaxPressure) { +static void getRegisterPressures( + bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU, + std::vector &Pressure, std::vector &MaxPressure, + GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker, + ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) { // getDownwardPressure() and getUpwardPressure() make temporary changes to // the tracker, so we need to pass those function a non-const copy. RegPressureTracker &TempTracker = const_cast(RPTracker); - if (AtTop) - TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure); - else - TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure); + if (!GCNTrackers) { + AtTop + ? 
TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure) + : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure); + + return; + } + + // GCNTrackers + Pressure.resize(4, 0); + MachineInstr *MI = SU->getInstr(); + GCNRegPressure NewPressure; + if (AtTop) { + GCNDownwardRPTracker TempDownwardTracker(DownwardTracker); + NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI); + } else { + GCNUpwardRPTracker TempUpwardTracker(UpwardTracker); + TempUpwardTracker.recede(*MI); + NewPressure = TempUpwardTracker.getPressure(); + } + Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum(); + Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = + NewPressure.getArchVGPRNum(); + Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum(); } void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, @@ -187,8 +214,9 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, // // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of // PressureDiffs. - if (AtTop || !canUsePressureDiffs(*SU)) { - getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure); + if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) { + getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure, + DownwardTracker, UpwardTracker, DAG, SRI); } else { // Reserve 4 slots. Pressure.resize(4, 0); @@ -206,7 +234,8 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, #ifdef EXPENSIVE_CHECKS std::vector CheckPressure, CheckMaxPressure; - getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure); + getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure, + TheTracker, UpwardTracker, DAG, SRI); if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] != CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] || Pressure[AMDGPU::RegisterPressureSets::VGPR_32] != @@ -294,8 +323,16 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, unsigned SGPRPressure = 0; unsigned VGPRPressure = 0; if (DAG->isTrackingPressure()) { - SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; - VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; + if (!GCNTrackers) { + SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; + VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; + } else { + GCNRPTracker *T = IsBottomUp + ? static_cast(&UpwardTracker) + : static_cast(&DownwardTracker); + SGPRPressure = T->getPressure().getSGPRNum(); + VGPRPressure = T->getPressure().getArchVGPRNum(); + } } ReadyQueue &Q = Zone.Available; for (SUnit *SU : Q) { @@ -444,6 +481,16 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) { return SU; } +void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { + if (GCNTrackers) { + MachineInstr *MI = SU->getInstr(); + IsTopNode ? 
(void)DownwardTracker.advance(MI, false) + : UpwardTracker.recede(*MI); + } + + return GenericScheduler::schedNode(SU, IsTopNode); +} + GCNSchedStageID GCNSchedStrategy::getCurrentStage() { assert(CurrentStage && CurrentStage != SchedStages.end()); return *CurrentStage; @@ -470,12 +517,13 @@ GCNSchedStageID GCNSchedStrategy::getNextStage() const { } GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( - const MachineSchedContext *C) + const MachineSchedContext *C, bool IsLegacyScheduler) : GCNSchedStrategy(C) { SchedStages.push_back(GCNSchedStageID::OccInitialSchedule); SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule); SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule); SchedStages.push_back(GCNSchedStageID::PreRARematerialize); + GCNTrackers = GCNTrackers & !IsLegacyScheduler; } GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C) @@ -571,7 +619,8 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive( MachineSchedContext *C, std::unique_ptr S) : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget()), MFI(*MF.getInfo()), - StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) { + StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy), + RegionLiveOuts(this, /*IsLiveOut=*/true) { LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n"); if (RelaxedOcc) { @@ -613,6 +662,14 @@ GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const { return RPTracker.moveMaxPressure(); } +static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin, + MachineBasicBlock::iterator RegionEnd) { + auto REnd = RegionEnd == RegionBegin->getParent()->end() + ? std::prev(RegionEnd) + : RegionEnd; + return &*skipDebugInstructionsBackward(REnd, RegionBegin); +} + void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx, const MachineBasicBlock *MBB) { GCNDownwardRPTracker RPTracker(*LIS); @@ -687,20 +744,45 @@ void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx, } DenseMap -GCNScheduleDAGMILive::getBBLiveInMap() const { +GCNScheduleDAGMILive::getRegionLiveInMap() const { assert(!Regions.empty()); - std::vector BBStarters; - BBStarters.reserve(Regions.size()); + std::vector RegionFirstMIs; + RegionFirstMIs.reserve(Regions.size()); auto I = Regions.rbegin(), E = Regions.rend(); auto *BB = I->first->getParent(); do { auto *MI = &*skipDebugInstructionsForward(I->first, I->second); - BBStarters.push_back(MI); + RegionFirstMIs.push_back(MI); do { ++I; } while (I != E && I->first->getParent() == BB); } while (I != E); - return getLiveRegMap(BBStarters, false /*After*/, *LIS); + return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS); +} + +DenseMap +GCNScheduleDAGMILive::getRegionLiveOutMap() const { + assert(!Regions.empty()); + std::vector RegionLastMIs; + RegionLastMIs.reserve(Regions.size()); + for (auto &[RegionBegin, RegionEnd] : reverse(Regions)) + RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd)); + + return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS); +} + +void RegionPressureMap::buildLiveRegMap() { + IdxToInstruction.clear(); + + RegionLiveRegMap = + IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap(); + for (unsigned I = 0; I < DAG->Regions.size(); I++) { + MachineInstr *RegionKey = + IsLiveOut + ? 
getLastMIForRegion(DAG->Regions[I].first, DAG->Regions[I].second) + : &*DAG->Regions[I].first; + IdxToInstruction[I] = RegionKey; + } } void GCNScheduleDAGMILive::finalizeSchedule() { @@ -726,8 +808,11 @@ void GCNScheduleDAGMILive::finalizeSchedule() { void GCNScheduleDAGMILive::runSchedStages() { LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n"); - if (!Regions.empty()) - BBLiveInMap = getBBLiveInMap(); + if (!Regions.empty()) { + BBLiveInMap = getRegionLiveInMap(); + if (GCNTrackers) + RegionLiveOuts.buildLiveRegMap(); + } GCNSchedStrategy &S = static_cast(*SchedImpl); while (S.advanceStage()) { @@ -745,6 +830,19 @@ void GCNScheduleDAGMILive::runSchedStages() { continue; } + if (GCNTrackers) { + GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker(); + GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker(); + GCNRPTracker::LiveRegSet *RegionLiveIns = + &LiveIns[Stage->getRegionIdx()]; + + reinterpret_cast(DownwardTracker) + ->reset(MRI, *RegionLiveIns); + reinterpret_cast(UpwardTracker) + ->reset(MRI, RegionLiveOuts.getLiveRegsForRegionIdx( + Stage->getRegionIdx())); + } + ScheduleDAGMILive::schedule(); Stage->finalizeGCNRegion(); } @@ -1015,6 +1113,7 @@ void GCNSchedStage::finalizeGCNRegion() { void GCNSchedStage::checkScheduling() { // Check the results of scheduling. PressureAfter = DAG.getRealRegPressure(RegionIdx); + LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter)); LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n"); @@ -1586,6 +1685,9 @@ bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST, DAG.Regions = NewRegions; DAG.RescheduleRegions = NewRescheduleRegions; + if (GCNTrackers) + DAG.RegionLiveOuts.buildLiveRegMap(); + SIMachineFunctionInfo &MFI = *MF.getInfo(); MFI.increaseOccupancy(MF, ++DAG.MinOccupancy); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index f0aea2bc4ab86..64d517038f90e 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -70,6 +70,12 @@ class GCNSchedStrategy : public GenericScheduler { // Pointer to the current SchedStageID. SmallVectorImpl::iterator CurrentStage = nullptr; + // GCN RP Tracker for top-down scheduling + mutable GCNDownwardRPTracker DownwardTracker; + + // GCN RP Tracker for botttom-up scheduling + mutable GCNUpwardRPTracker UpwardTracker; + public: // schedule() have seen register pressure over the critical limits and had to // track register pressure for actual scheduling heuristics. @@ -102,6 +108,8 @@ class GCNSchedStrategy : public GenericScheduler { SUnit *pickNode(bool &IsTopNode) override; + void schedNode(SUnit *SU, bool IsTopNode) override; + void initialize(ScheduleDAGMI *DAG) override; unsigned getTargetOccupancy() { return TargetOccupancy; } @@ -116,13 +124,18 @@ class GCNSchedStrategy : public GenericScheduler { bool hasNextStage() const; GCNSchedStageID getNextStage() const; + + GCNDownwardRPTracker *getDownwardTracker() { return &DownwardTracker; } + + GCNUpwardRPTracker *getUpwardTracker() { return &UpwardTracker; } }; /// The goal of this scheduling strategy is to maximize kernel occupancy (i.e. /// maximum number of waves per simd). 
class GCNMaxOccupancySchedStrategy final : public GCNSchedStrategy { public: - GCNMaxOccupancySchedStrategy(const MachineSchedContext *C); + GCNMaxOccupancySchedStrategy(const MachineSchedContext *C, + bool IsLegacyScheduler = false); }; /// The goal of this scheduling strategy is to maximize ILP for a single wave @@ -163,6 +176,32 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ScheduleMetrics &Sm) { return OS; } +class GCNScheduleDAGMILive; +class RegionPressureMap { + GCNScheduleDAGMILive *DAG; + // The live in/out pressure as indexed by the first or last MI in the region + // before scheduling. + DenseMap RegionLiveRegMap; + // The mapping of RegionIDx to key instruction + DenseMap IdxToInstruction; + // Whether we are calculating LiveOuts or LiveIns + bool IsLiveOut; + +public: + RegionPressureMap() {} + RegionPressureMap(GCNScheduleDAGMILive *GCNDAG, bool LiveOut) + : DAG(GCNDAG), IsLiveOut(LiveOut) {} + // Build the Instr->LiveReg and RegionIdx->Instr maps + void buildLiveRegMap(); + + // Retrieve the LiveReg for a given RegionIdx + GCNRPTracker::LiveRegSet &getLiveRegsForRegionIdx(unsigned RegionIdx) { + assert(IdxToInstruction.find(RegionIdx) != IdxToInstruction.end()); + MachineInstr *Key = IdxToInstruction[RegionIdx]; + return RegionLiveRegMap[Key]; + } +}; + class GCNScheduleDAGMILive final : public ScheduleDAGMILive { friend class GCNSchedStage; friend class OccInitialScheduleStage; @@ -170,6 +209,7 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { friend class ClusteredLowOccStage; friend class PreRARematStage; friend class ILPInitialScheduleStage; + friend class RegionPressureMap; const GCNSubtarget &ST; @@ -211,9 +251,22 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { // Temporary basic block live-in cache. DenseMap MBBLiveIns; + // The map of the initial first region instruction to region live in registers DenseMap BBLiveInMap; - DenseMap getBBLiveInMap() const; + // Calculate the map of the initial first region instruction to region live in + // registers + DenseMap getRegionLiveInMap() const; + + // Calculate the map of the initial last region instruction to region live out + // registers + DenseMap + getRegionLiveOutMap() const; + + // The live out registers per region. These are internally stored as a map of + // the initial last region instruction to region live out registers, but can + // be retreived with the regionIdx by calls to getLiveRegsForRegionIdx. + RegionPressureMap RegionLiveOuts; // Return current region pressure. GCNRegPressure getRealRegPressure(unsigned RegionIdx) const; @@ -311,6 +364,9 @@ class GCNSchedStage { return DAG.RegionsWithExcessRP[RegionIdx]; } + // The region number this stage is currently working on + unsigned getRegionIdx() { return RegionIdx; } + // Returns true if the new schedule may result in more spilling. 
bool mayCauseSpilling(unsigned WavesAfter); diff --git a/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir b/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir index e9005e94ce5db..d57450baea911 100644 --- a/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir +++ b/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir @@ -1,11 +1,17 @@ # REQUIRES: asserts -# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-scheduler -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -run-pass=machine-scheduler -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -run-pass=machine-scheduler -amdgpu-use-amdgpu-trackers=1 -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN-GCNTRACKER %s --- | define amdgpu_kernel void @high-RP-reschedule() { ret void } ... -# CHECK: Unclustered High Register Pressure Reschedule stage successfully increased occupancy to 4 +# GCN: Unclustered High Register Pressure Reschedule stage successfully increased occupancy to 4 + +# GCN-GCNTRACKER: Occupancy before scheduling: 3, after 4. +# GCN-GCNTRACKER-NEXT: Ending scheduling stage: Max Occupancy Initial Schedule + +# When using the GCN Trackers, the scheduler is able to acieve desired occupancy without running high-RP-reschedule stage. --- name: high-RP-reschedule diff --git a/llvm/test/CodeGen/AMDGPU/pr51516.mir b/llvm/test/CodeGen/AMDGPU/pr51516.mir index 4be102f7860ea..f496a4b06bb23 100644 --- a/llvm/test/CodeGen/AMDGPU/pr51516.mir +++ b/llvm/test/CodeGen/AMDGPU/pr51516.mir @@ -1,4 +1,5 @@ -# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -verify-machineinstrs -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -amdgpu-use-amdgpu-trackers=1 -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s # Check that %3 was not rematerialized before the last store since its operand %1 # is killed by that store. 
@@ -7,6 +8,9 @@ # GCN: renamable $vgpr33_vgpr34_vgpr35_vgpr36 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) # GCN: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr47, killed renamable $vgpr29_vgpr30_vgpr31_vgpr32, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec, implicit killed renamable $vgpr46 +# GCN-GCNTRACKER-LABEL: name: global_sextload_v32i32_to_v32i64 +# GCN-GCNTRACKER-NOT: SI_SPILL + --- name: global_sextload_v32i32_to_v32i64 tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll new file mode 100644 index 0000000000000..79187f51af0d2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll @@ -0,0 +1,65 @@ +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack -amdgpu-use-amdgpu-trackers=1 2>&1 < %s | FileCheck -check-prefixes=ERR-GCNTRACKERS %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack 2>&1 < %s | FileCheck -check-prefixes=GCN %s + +%asm.output = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, i32, ; sgprs + <16 x i32>, <7 x i32>, ; vgprs + i64 ; vcc + } + +%asm.output2 = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, i32, ; sgprs + <16 x i32>, <5 x i32>, ; vgprs + i64 ; vcc + } + +%asm.output3 = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, ; sgprs + <16 x i32>, <6 x i32>, ; vgprs + i64 ; vcc + } + +; ERR-GCNTRACKERS: ran out of registers during register allocation +; GCN-NOT: ran out of registers during register allocation + +; FIXME: GCN Trackers do not track pressure from PhysRegs, so scheduling is actually worse + +define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 { + %alloca0 = alloca [4096 x i32], align 64, addrspace(5) + %alloca1 = alloca i32, align 4, addrspace(5) + call void asm sideeffect "; use alloca0 $0", "v"(ptr addrspace(5) %alloca0) + + %asm = call %asm.output asm sideeffect + "; def $0, $1, $2, $3, $4, $5, $6, $7, $8", + "={s[0:15]},={s[16:31]},={s[32:47]},={s[48:55]},={s[56:57]},={s58},={v[0:15]},={v[16:22]},={vcc}"() + + %s0 = extractvalue %asm.output %asm, 0 + %s1 = extractvalue %asm.output %asm, 1 + %s2 = extractvalue %asm.output %asm, 2 + %s3 = extractvalue %asm.output %asm, 3 + %s4 = extractvalue %asm.output %asm, 4 + %s5 = extractvalue %asm.output %asm, 5 + + %v0 = extractvalue %asm.output %asm, 6 + %v1 = extractvalue %asm.output %asm, 7 + + %vcc = extractvalue %asm.output %asm, 8 + + ; scc is unavailable since it is live in + call void asm sideeffect "; use $0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10", + "{s[0:15]},{s[16:31]},{s[32:47]},{s[48:55]},{s[56:57]},{s58},{v[0:15]},{v[16:22]},{vcc},{s59},{scc}"( + <16 x i32> %s0, + <16 x i32> %s1, + <16 x i32> %s2, + <8 x i32> %s3, + <2 x i32> %s4, + i32 %s5, + <16 x i32> %v0, + <7 x i32> %v1, + i64 %vcc, + ptr addrspace(5) %alloca1, + i32 0) ; use of scc + + ret void +} + +attributes #0 = { nounwind alignstack=64 "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="10,10" "no-realign-stack" } +attributes #1 = { nounwind alignstack=16 "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="10,10" "no-realign-stack" } + diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll new file mode 100644 index 
0000000000000..c490c76f4531d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll @@ -0,0 +1,491 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-s-branch-bits=5 -amdgpu-long-branch-factor=0 < %s | FileCheck --check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-s-branch-bits=5 -amdgpu-long-branch-factor=0 -amdgpu-use-amdgpu-trackers=1 < %s | FileCheck --check-prefix=GCN-GCNTRACKERS %s + +; CHECK-LABEL: {{^}}spill: +; GCN: codeLenInByte = 1000 +; GCN-GCNTRACKERS: codeLenInByte = 1016 +; GCN: NumSgprs: 104 +; GCN-GCNTRACKERS: NumSgprs: 104 +; GCN: NumVgprs: 1 +; GCN-GCNTRACKERS: NumVgprs: 2 +; GCN: ScratchSize: 0 +; GCN-GCNTRACKERS: ScratchSize: 0 +; GCN: Occupancy: 5 +; GCN-GCNTRACKERS: Occupancy: 5 + +; FIXME: GCN Trackers do not track pressure from PhysRegs, so scheduling is actually worse + +define amdgpu_kernel void @spill(ptr addrspace(1) %arg, i32 %cnd) #0 { +entry: + %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={s0}"() #0 + %sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={s1}"() #0 + %sgpr2 = tail call i32 asm sideeffect "s_mov_b32 s2, 0", "={s2}"() #0 + %sgpr3 = tail call i32 asm sideeffect "s_mov_b32 s3, 0", "={s3}"() #0 + %sgpr4 = tail call i32 asm sideeffect "s_mov_b32 s4, 0", "={s4}"() #0 + %sgpr5 = tail call i32 asm sideeffect "s_mov_b32 s5, 0", "={s5}"() #0 + %sgpr6 = tail call i32 asm sideeffect "s_mov_b32 s6, 0", "={s6}"() #0 + %sgpr7 = tail call i32 asm sideeffect "s_mov_b32 s7, 0", "={s7}"() #0 + %sgpr8 = tail call i32 asm sideeffect "s_mov_b32 s8, 0", "={s8}"() #0 + %sgpr9 = tail call i32 asm sideeffect "s_mov_b32 s9, 0", "={s9}"() #0 + %sgpr10 = tail call i32 asm sideeffect "s_mov_b32 s10, 0", "={s10}"() #0 + %sgpr11 = tail call i32 asm sideeffect "s_mov_b32 s11, 0", "={s11}"() #0 + %sgpr12 = tail call i32 asm sideeffect "s_mov_b32 s12, 0", "={s12}"() #0 + %sgpr13 = tail call i32 asm sideeffect "s_mov_b32 s13, 0", "={s13}"() #0 + %sgpr14 = tail call i32 asm sideeffect "s_mov_b32 s14, 0", "={s14}"() #0 + %sgpr15 = tail call i32 asm sideeffect "s_mov_b32 s15, 0", "={s15}"() #0 + %sgpr16 = tail call i32 asm sideeffect "s_mov_b32 s16, 0", "={s16}"() #0 + %sgpr17 = tail call i32 asm sideeffect "s_mov_b32 s17, 0", "={s17}"() #0 + %sgpr18 = tail call i32 asm sideeffect "s_mov_b32 s18, 0", "={s18}"() #0 + %sgpr19 = tail call i32 asm sideeffect "s_mov_b32 s19, 0", "={s19}"() #0 + %sgpr20 = tail call i32 asm sideeffect "s_mov_b32 s20, 0", "={s20}"() #0 + %sgpr21 = tail call i32 asm sideeffect "s_mov_b32 s21, 0", "={s21}"() #0 + %sgpr22 = tail call i32 asm sideeffect "s_mov_b32 s22, 0", "={s22}"() #0 + %sgpr23 = tail call i32 asm sideeffect "s_mov_b32 s23, 0", "={s23}"() #0 + %sgpr24 = tail call i32 asm sideeffect "s_mov_b32 s24, 0", "={s24}"() #0 + %sgpr25 = tail call i32 asm sideeffect "s_mov_b32 s25, 0", "={s25}"() #0 + %sgpr26 = tail call i32 asm sideeffect "s_mov_b32 s26, 0", "={s26}"() #0 + %sgpr27 = tail call i32 asm sideeffect "s_mov_b32 s27, 0", "={s27}"() #0 + %sgpr28 = tail call i32 asm sideeffect "s_mov_b32 s28, 0", "={s28}"() #0 + %sgpr29 = tail call i32 asm sideeffect "s_mov_b32 s29, 0", "={s29}"() #0 + %sgpr30 = tail call i32 asm sideeffect "s_mov_b32 s30, 0", "={s30}"() #0 + %sgpr31 = tail call i32 asm sideeffect "s_mov_b32 s31, 0", "={s31}"() #0 + %sgpr32 = tail call i32 asm sideeffect "s_mov_b32 s32, 0", "={s32}"() #0 + %sgpr33 = tail call i32 asm sideeffect "s_mov_b32 s33, 0", "={s33}"() #0 + %sgpr34 = tail call i32 asm sideeffect "s_mov_b32 s34, 0", "={s34}"() #0 + 
%sgpr35 = tail call i32 asm sideeffect "s_mov_b32 s35, 0", "={s35}"() #0 + %sgpr36 = tail call i32 asm sideeffect "s_mov_b32 s36, 0", "={s36}"() #0 + %sgpr37 = tail call i32 asm sideeffect "s_mov_b32 s37, 0", "={s37}"() #0 + %sgpr38 = tail call i32 asm sideeffect "s_mov_b32 s38, 0", "={s38}"() #0 + %sgpr39 = tail call i32 asm sideeffect "s_mov_b32 s39, 0", "={s39}"() #0 + %sgpr40 = tail call i32 asm sideeffect "s_mov_b32 s40, 0", "={s40}"() #0 + %sgpr41 = tail call i32 asm sideeffect "s_mov_b32 s41, 0", "={s41}"() #0 + %sgpr42 = tail call i32 asm sideeffect "s_mov_b32 s42, 0", "={s42}"() #0 + %sgpr43 = tail call i32 asm sideeffect "s_mov_b32 s43, 0", "={s43}"() #0 + %sgpr44 = tail call i32 asm sideeffect "s_mov_b32 s44, 0", "={s44}"() #0 + %sgpr45 = tail call i32 asm sideeffect "s_mov_b32 s45, 0", "={s45}"() #0 + %sgpr46 = tail call i32 asm sideeffect "s_mov_b32 s46, 0", "={s46}"() #0 + %sgpr47 = tail call i32 asm sideeffect "s_mov_b32 s47, 0", "={s47}"() #0 + %sgpr48 = tail call i32 asm sideeffect "s_mov_b32 s48, 0", "={s48}"() #0 + %sgpr49 = tail call i32 asm sideeffect "s_mov_b32 s49, 0", "={s49}"() #0 + %sgpr50 = tail call i32 asm sideeffect "s_mov_b32 s50, 0", "={s50}"() #0 + %sgpr51 = tail call i32 asm sideeffect "s_mov_b32 s51, 0", "={s51}"() #0 + %sgpr52 = tail call i32 asm sideeffect "s_mov_b32 s52, 0", "={s52}"() #0 + %sgpr53 = tail call i32 asm sideeffect "s_mov_b32 s53, 0", "={s53}"() #0 + %sgpr54 = tail call i32 asm sideeffect "s_mov_b32 s54, 0", "={s54}"() #0 + %sgpr55 = tail call i32 asm sideeffect "s_mov_b32 s55, 0", "={s55}"() #0 + %sgpr56 = tail call i32 asm sideeffect "s_mov_b32 s56, 0", "={s56}"() #0 + %sgpr57 = tail call i32 asm sideeffect "s_mov_b32 s57, 0", "={s57}"() #0 + %sgpr58 = tail call i32 asm sideeffect "s_mov_b32 s58, 0", "={s58}"() #0 + %sgpr59 = tail call i32 asm sideeffect "s_mov_b32 s59, 0", "={s59}"() #0 + %sgpr60 = tail call i32 asm sideeffect "s_mov_b32 s60, 0", "={s60}"() #0 + %sgpr61 = tail call i32 asm sideeffect "s_mov_b32 s61, 0", "={s61}"() #0 + %sgpr62 = tail call i32 asm sideeffect "s_mov_b32 s62, 0", "={s62}"() #0 + %sgpr63 = tail call i32 asm sideeffect "s_mov_b32 s63, 0", "={s63}"() #0 + %sgpr64 = tail call i32 asm sideeffect "s_mov_b32 s64, 0", "={s64}"() #0 + %sgpr65 = tail call i32 asm sideeffect "s_mov_b32 s65, 0", "={s65}"() #0 + %sgpr66 = tail call i32 asm sideeffect "s_mov_b32 s66, 0", "={s66}"() #0 + %sgpr67 = tail call i32 asm sideeffect "s_mov_b32 s67, 0", "={s67}"() #0 + %sgpr68 = tail call i32 asm sideeffect "s_mov_b32 s68, 0", "={s68}"() #0 + %sgpr69 = tail call i32 asm sideeffect "s_mov_b32 s69, 0", "={s69}"() #0 + %sgpr70 = tail call i32 asm sideeffect "s_mov_b32 s70, 0", "={s70}"() #0 + %sgpr71 = tail call i32 asm sideeffect "s_mov_b32 s71, 0", "={s71}"() #0 + %sgpr72 = tail call i32 asm sideeffect "s_mov_b32 s72, 0", "={s72}"() #0 + %sgpr73 = tail call i32 asm sideeffect "s_mov_b32 s73, 0", "={s73}"() #0 + %sgpr74 = tail call i32 asm sideeffect "s_mov_b32 s74, 0", "={s74}"() #0 + %sgpr75 = tail call i32 asm sideeffect "s_mov_b32 s75, 0", "={s75}"() #0 + %sgpr76 = tail call i32 asm sideeffect "s_mov_b32 s76, 0", "={s76}"() #0 + %sgpr77 = tail call i32 asm sideeffect "s_mov_b32 s77, 0", "={s77}"() #0 + %sgpr78 = tail call i32 asm sideeffect "s_mov_b32 s78, 0", "={s78}"() #0 + %sgpr79 = tail call i32 asm sideeffect "s_mov_b32 s79, 0", "={s79}"() #0 + %sgpr80 = tail call i32 asm sideeffect "s_mov_b32 s80, 0", "={s80}"() #0 + %sgpr81 = tail call i32 asm sideeffect "s_mov_b32 s81, 0", "={s81}"() #0 + %sgpr82 = tail call i32 asm 
sideeffect "s_mov_b32 s82, 0", "={s82}"() #0 + %sgpr83 = tail call i32 asm sideeffect "s_mov_b32 s83, 0", "={s83}"() #0 + %sgpr84 = tail call i32 asm sideeffect "s_mov_b32 s84, 0", "={s84}"() #0 + %sgpr85 = tail call i32 asm sideeffect "s_mov_b32 s85, 0", "={s85}"() #0 + %sgpr86 = tail call i32 asm sideeffect "s_mov_b32 s86, 0", "={s86}"() #0 + %sgpr87 = tail call i32 asm sideeffect "s_mov_b32 s87, 0", "={s87}"() #0 + %sgpr88 = tail call i32 asm sideeffect "s_mov_b32 s88, 0", "={s88}"() #0 + %sgpr89 = tail call i32 asm sideeffect "s_mov_b32 s89, 0", "={s89}"() #0 + %sgpr90 = tail call i32 asm sideeffect "s_mov_b32 s90, 0", "={s90}"() #0 + %sgpr91 = tail call i32 asm sideeffect "s_mov_b32 s91, 0", "={s91}"() #0 + %sgpr92 = tail call i32 asm sideeffect "s_mov_b32 s92, 0", "={s92}"() #0 + %sgpr93 = tail call i32 asm sideeffect "s_mov_b32 s93, 0", "={s93}"() #0 + %sgpr94 = tail call i32 asm sideeffect "s_mov_b32 s94, 0", "={s94}"() #0 + %sgpr95 = tail call i32 asm sideeffect "s_mov_b32 s95, 0", "={s95}"() #0 + %sgpr96 = tail call i32 asm sideeffect "s_mov_b32 s96, 0", "={s96}"() #0 + %sgpr97 = tail call i32 asm sideeffect "s_mov_b32 s97, 0", "={s97}"() #0 + %sgpr98 = tail call i32 asm sideeffect "s_mov_b32 s98, 0", "={s98}"() #0 + %sgpr99 = tail call i32 asm sideeffect "s_mov_b32 s99, 0", "={s99}"() #0 + %sgpr100 = tail call i32 asm sideeffect "s_mov_b32 s100, 0", "={s100}"() #0 + %sgpr101 = tail call i32 asm sideeffect "s_mov_b32 s101, 0", "={s101}"() #0 + %vcc_lo = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_lo}"() #0 + %vcc_hi = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_hi}"() #0 + %cmp = icmp eq i32 %cnd, 0 + br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch + +bb2: ; 68 bytes + ; 64 byte asm + call void asm sideeffect + "v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64",""() #0 + br label %bb3 + +bb3: + tail call void asm sideeffect "; reg use $0", "{s0}"(i32 %sgpr0) #0 + tail call void asm sideeffect "; reg use $0", "{s1}"(i32 %sgpr1) #0 + tail call void asm sideeffect "; reg use $0", "{s2}"(i32 %sgpr2) #0 + tail call void asm sideeffect "; reg use $0", "{s3}"(i32 %sgpr3) #0 + tail call void asm sideeffect "; reg use $0", "{s4}"(i32 %sgpr4) #0 + tail call void asm sideeffect "; reg use $0", "{s5}"(i32 %sgpr5) #0 + tail call void asm sideeffect "; reg use $0", "{s6}"(i32 %sgpr6) #0 + tail call void asm sideeffect "; reg use $0", "{s7}"(i32 %sgpr7) #0 + tail call void asm sideeffect "; reg use $0", "{s8}"(i32 %sgpr8) #0 + tail call void asm sideeffect "; reg use $0", "{s9}"(i32 %sgpr9) #0 + tail call void asm sideeffect "; reg use $0", "{s10}"(i32 %sgpr10) #0 + tail call void asm sideeffect "; reg use $0", "{s11}"(i32 %sgpr11) #0 + tail call void asm sideeffect "; reg use $0", "{s12}"(i32 %sgpr12) #0 + tail call void asm sideeffect "; reg use $0", "{s13}"(i32 %sgpr13) #0 + tail call void asm sideeffect "; reg use $0", "{s14}"(i32 %sgpr14) #0 + tail call void asm sideeffect "; reg use $0", "{s15}"(i32 %sgpr15) #0 + tail call void asm sideeffect "; reg use $0", "{s16}"(i32 %sgpr16) #0 + tail call void asm sideeffect "; reg use $0", "{s17}"(i32 %sgpr17) #0 + tail call void asm sideeffect "; reg use $0", "{s18}"(i32 %sgpr18) #0 + tail call void asm sideeffect "; reg use $0", "{s19}"(i32 %sgpr19) #0 + tail call void asm sideeffect "; reg use $0", "{s20}"(i32 %sgpr20) #0 + tail call void asm sideeffect "; reg use $0", "{s21}"(i32 %sgpr21) #0 + tail call void asm sideeffect "; reg use $0", "{s22}"(i32 %sgpr22) #0 + tail call 
void asm sideeffect "; reg use $0", "{s23}"(i32 %sgpr23) #0 + tail call void asm sideeffect "; reg use $0", "{s24}"(i32 %sgpr24) #0 + tail call void asm sideeffect "; reg use $0", "{s25}"(i32 %sgpr25) #0 + tail call void asm sideeffect "; reg use $0", "{s26}"(i32 %sgpr26) #0 + tail call void asm sideeffect "; reg use $0", "{s27}"(i32 %sgpr27) #0 + tail call void asm sideeffect "; reg use $0", "{s28}"(i32 %sgpr28) #0 + tail call void asm sideeffect "; reg use $0", "{s29}"(i32 %sgpr29) #0 + tail call void asm sideeffect "; reg use $0", "{s30}"(i32 %sgpr30) #0 + tail call void asm sideeffect "; reg use $0", "{s31}"(i32 %sgpr31) #0 + tail call void asm sideeffect "; reg use $0", "{s32}"(i32 %sgpr32) #0 + tail call void asm sideeffect "; reg use $0", "{s33}"(i32 %sgpr33) #0 + tail call void asm sideeffect "; reg use $0", "{s34}"(i32 %sgpr34) #0 + tail call void asm sideeffect "; reg use $0", "{s35}"(i32 %sgpr35) #0 + tail call void asm sideeffect "; reg use $0", "{s36}"(i32 %sgpr36) #0 + tail call void asm sideeffect "; reg use $0", "{s37}"(i32 %sgpr37) #0 + tail call void asm sideeffect "; reg use $0", "{s38}"(i32 %sgpr38) #0 + tail call void asm sideeffect "; reg use $0", "{s39}"(i32 %sgpr39) #0 + tail call void asm sideeffect "; reg use $0", "{s40}"(i32 %sgpr40) #0 + tail call void asm sideeffect "; reg use $0", "{s41}"(i32 %sgpr41) #0 + tail call void asm sideeffect "; reg use $0", "{s42}"(i32 %sgpr42) #0 + tail call void asm sideeffect "; reg use $0", "{s43}"(i32 %sgpr43) #0 + tail call void asm sideeffect "; reg use $0", "{s44}"(i32 %sgpr44) #0 + tail call void asm sideeffect "; reg use $0", "{s45}"(i32 %sgpr45) #0 + tail call void asm sideeffect "; reg use $0", "{s46}"(i32 %sgpr46) #0 + tail call void asm sideeffect "; reg use $0", "{s47}"(i32 %sgpr47) #0 + tail call void asm sideeffect "; reg use $0", "{s48}"(i32 %sgpr48) #0 + tail call void asm sideeffect "; reg use $0", "{s49}"(i32 %sgpr49) #0 + tail call void asm sideeffect "; reg use $0", "{s50}"(i32 %sgpr50) #0 + tail call void asm sideeffect "; reg use $0", "{s51}"(i32 %sgpr51) #0 + tail call void asm sideeffect "; reg use $0", "{s52}"(i32 %sgpr52) #0 + tail call void asm sideeffect "; reg use $0", "{s53}"(i32 %sgpr53) #0 + tail call void asm sideeffect "; reg use $0", "{s54}"(i32 %sgpr54) #0 + tail call void asm sideeffect "; reg use $0", "{s55}"(i32 %sgpr55) #0 + tail call void asm sideeffect "; reg use $0", "{s56}"(i32 %sgpr56) #0 + tail call void asm sideeffect "; reg use $0", "{s57}"(i32 %sgpr57) #0 + tail call void asm sideeffect "; reg use $0", "{s58}"(i32 %sgpr58) #0 + tail call void asm sideeffect "; reg use $0", "{s59}"(i32 %sgpr59) #0 + tail call void asm sideeffect "; reg use $0", "{s60}"(i32 %sgpr60) #0 + tail call void asm sideeffect "; reg use $0", "{s61}"(i32 %sgpr61) #0 + tail call void asm sideeffect "; reg use $0", "{s62}"(i32 %sgpr62) #0 + tail call void asm sideeffect "; reg use $0", "{s63}"(i32 %sgpr63) #0 + tail call void asm sideeffect "; reg use $0", "{s64}"(i32 %sgpr64) #0 + tail call void asm sideeffect "; reg use $0", "{s65}"(i32 %sgpr65) #0 + tail call void asm sideeffect "; reg use $0", "{s66}"(i32 %sgpr66) #0 + tail call void asm sideeffect "; reg use $0", "{s67}"(i32 %sgpr67) #0 + tail call void asm sideeffect "; reg use $0", "{s68}"(i32 %sgpr68) #0 + tail call void asm sideeffect "; reg use $0", "{s69}"(i32 %sgpr69) #0 + tail call void asm sideeffect "; reg use $0", "{s70}"(i32 %sgpr70) #0 + tail call void asm sideeffect "; reg use $0", "{s71}"(i32 %sgpr71) #0 + tail call void asm sideeffect "; reg 
use $0", "{s72}"(i32 %sgpr72) #0 + tail call void asm sideeffect "; reg use $0", "{s73}"(i32 %sgpr73) #0 + tail call void asm sideeffect "; reg use $0", "{s74}"(i32 %sgpr74) #0 + tail call void asm sideeffect "; reg use $0", "{s75}"(i32 %sgpr75) #0 + tail call void asm sideeffect "; reg use $0", "{s76}"(i32 %sgpr76) #0 + tail call void asm sideeffect "; reg use $0", "{s77}"(i32 %sgpr77) #0 + tail call void asm sideeffect "; reg use $0", "{s78}"(i32 %sgpr78) #0 + tail call void asm sideeffect "; reg use $0", "{s79}"(i32 %sgpr79) #0 + tail call void asm sideeffect "; reg use $0", "{s80}"(i32 %sgpr80) #0 + tail call void asm sideeffect "; reg use $0", "{s81}"(i32 %sgpr81) #0 + tail call void asm sideeffect "; reg use $0", "{s82}"(i32 %sgpr82) #0 + tail call void asm sideeffect "; reg use $0", "{s83}"(i32 %sgpr83) #0 + tail call void asm sideeffect "; reg use $0", "{s84}"(i32 %sgpr84) #0 + tail call void asm sideeffect "; reg use $0", "{s85}"(i32 %sgpr85) #0 + tail call void asm sideeffect "; reg use $0", "{s86}"(i32 %sgpr86) #0 + tail call void asm sideeffect "; reg use $0", "{s87}"(i32 %sgpr87) #0 + tail call void asm sideeffect "; reg use $0", "{s88}"(i32 %sgpr88) #0 + tail call void asm sideeffect "; reg use $0", "{s89}"(i32 %sgpr89) #0 + tail call void asm sideeffect "; reg use $0", "{s90}"(i32 %sgpr90) #0 + tail call void asm sideeffect "; reg use $0", "{s91}"(i32 %sgpr91) #0 + tail call void asm sideeffect "; reg use $0", "{s92}"(i32 %sgpr92) #0 + tail call void asm sideeffect "; reg use $0", "{s93}"(i32 %sgpr93) #0 + tail call void asm sideeffect "; reg use $0", "{s94}"(i32 %sgpr94) #0 + tail call void asm sideeffect "; reg use $0", "{s95}"(i32 %sgpr95) #0 + tail call void asm sideeffect "; reg use $0", "{s96}"(i32 %sgpr96) #0 + tail call void asm sideeffect "; reg use $0", "{s97}"(i32 %sgpr97) #0 + tail call void asm sideeffect "; reg use $0", "{s98}"(i32 %sgpr98) #0 + tail call void asm sideeffect "; reg use $0", "{s99}"(i32 %sgpr99) #0 + tail call void asm sideeffect "; reg use $0", "{s100}"(i32 %sgpr100) #0 + tail call void asm sideeffect "; reg use $0", "{s101}"(i32 %sgpr101) #0 + tail call void asm sideeffect "; reg use $0", "{vcc_lo}"(i32 %vcc_lo) #0 + tail call void asm sideeffect "; reg use $0", "{vcc_hi}"(i32 %vcc_hi) #0 + ret void +} + +; CHECK-LABEL: {{^}}spill_func: +; GCN: codeLenInByte = 1612 +; GCN-GCNTRACKERS: codeLenInByte = 1660 +; GCN: NumSgprs: 104 +; GCN-GCNTRACKERS: NumSgprs: 104 +; GCN: NumVgprs: 3 +; GCN-GCNTRACKERS: NumVgprs: 4 +; GCN: ScratchSize: 12 +; GCN-GCNTRACKERS: ScratchSize: 16 + +define void @spill_func(ptr addrspace(1) %arg) #0 { +entry: + %cnd = tail call i32 @llvm.amdgcn.workgroup.id.x() #0 + %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={s0}"() #0 + %sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={s1}"() #0 + %sgpr2 = tail call i32 asm sideeffect "s_mov_b32 s2, 0", "={s2}"() #0 + %sgpr3 = tail call i32 asm sideeffect "s_mov_b32 s3, 0", "={s3}"() #0 + %sgpr4 = tail call i32 asm sideeffect "s_mov_b32 s4, 0", "={s4}"() #0 + %sgpr5 = tail call i32 asm sideeffect "s_mov_b32 s5, 0", "={s5}"() #0 + %sgpr6 = tail call i32 asm sideeffect "s_mov_b32 s6, 0", "={s6}"() #0 + %sgpr7 = tail call i32 asm sideeffect "s_mov_b32 s7, 0", "={s7}"() #0 + %sgpr8 = tail call i32 asm sideeffect "s_mov_b32 s8, 0", "={s8}"() #0 + %sgpr9 = tail call i32 asm sideeffect "s_mov_b32 s9, 0", "={s9}"() #0 + %sgpr10 = tail call i32 asm sideeffect "s_mov_b32 s10, 0", "={s10}"() #0 + %sgpr11 = tail call i32 asm sideeffect "s_mov_b32 s11, 0", "={s11}"() #0 + 
%sgpr12 = tail call i32 asm sideeffect "s_mov_b32 s12, 0", "={s12}"() #0 + %sgpr13 = tail call i32 asm sideeffect "s_mov_b32 s13, 0", "={s13}"() #0 + %sgpr14 = tail call i32 asm sideeffect "s_mov_b32 s14, 0", "={s14}"() #0 + %sgpr15 = tail call i32 asm sideeffect "s_mov_b32 s15, 0", "={s15}"() #0 + %sgpr16 = tail call i32 asm sideeffect "s_mov_b32 s16, 0", "={s16}"() #0 + %sgpr17 = tail call i32 asm sideeffect "s_mov_b32 s17, 0", "={s17}"() #0 + %sgpr18 = tail call i32 asm sideeffect "s_mov_b32 s18, 0", "={s18}"() #0 + %sgpr19 = tail call i32 asm sideeffect "s_mov_b32 s19, 0", "={s19}"() #0 + %sgpr20 = tail call i32 asm sideeffect "s_mov_b32 s20, 0", "={s20}"() #0 + %sgpr21 = tail call i32 asm sideeffect "s_mov_b32 s21, 0", "={s21}"() #0 + %sgpr22 = tail call i32 asm sideeffect "s_mov_b32 s22, 0", "={s22}"() #0 + %sgpr23 = tail call i32 asm sideeffect "s_mov_b32 s23, 0", "={s23}"() #0 + %sgpr24 = tail call i32 asm sideeffect "s_mov_b32 s24, 0", "={s24}"() #0 + %sgpr25 = tail call i32 asm sideeffect "s_mov_b32 s25, 0", "={s25}"() #0 + %sgpr26 = tail call i32 asm sideeffect "s_mov_b32 s26, 0", "={s26}"() #0 + %sgpr27 = tail call i32 asm sideeffect "s_mov_b32 s27, 0", "={s27}"() #0 + %sgpr28 = tail call i32 asm sideeffect "s_mov_b32 s28, 0", "={s28}"() #0 + %sgpr29 = tail call i32 asm sideeffect "s_mov_b32 s29, 0", "={s29}"() #0 + %sgpr30 = tail call i32 asm sideeffect "s_mov_b32 s30, 0", "={s30}"() #0 + %sgpr31 = tail call i32 asm sideeffect "s_mov_b32 s31, 0", "={s31}"() #0 + %sgpr32 = tail call i32 asm sideeffect "s_mov_b32 s32, 0", "={s32}"() #0 + %sgpr33 = tail call i32 asm sideeffect "s_mov_b32 s33, 0", "={s33}"() #0 + %sgpr34 = tail call i32 asm sideeffect "s_mov_b32 s34, 0", "={s34}"() #0 + %sgpr35 = tail call i32 asm sideeffect "s_mov_b32 s35, 0", "={s35}"() #0 + %sgpr36 = tail call i32 asm sideeffect "s_mov_b32 s36, 0", "={s36}"() #0 + %sgpr37 = tail call i32 asm sideeffect "s_mov_b32 s37, 0", "={s37}"() #0 + %sgpr38 = tail call i32 asm sideeffect "s_mov_b32 s38, 0", "={s38}"() #0 + %sgpr39 = tail call i32 asm sideeffect "s_mov_b32 s39, 0", "={s39}"() #0 + %sgpr40 = tail call i32 asm sideeffect "s_mov_b32 s40, 0", "={s40}"() #0 + %sgpr41 = tail call i32 asm sideeffect "s_mov_b32 s41, 0", "={s41}"() #0 + %sgpr42 = tail call i32 asm sideeffect "s_mov_b32 s42, 0", "={s42}"() #0 + %sgpr43 = tail call i32 asm sideeffect "s_mov_b32 s43, 0", "={s43}"() #0 + %sgpr44 = tail call i32 asm sideeffect "s_mov_b32 s44, 0", "={s44}"() #0 + %sgpr45 = tail call i32 asm sideeffect "s_mov_b32 s45, 0", "={s45}"() #0 + %sgpr46 = tail call i32 asm sideeffect "s_mov_b32 s46, 0", "={s46}"() #0 + %sgpr47 = tail call i32 asm sideeffect "s_mov_b32 s47, 0", "={s47}"() #0 + %sgpr48 = tail call i32 asm sideeffect "s_mov_b32 s48, 0", "={s48}"() #0 + %sgpr49 = tail call i32 asm sideeffect "s_mov_b32 s49, 0", "={s49}"() #0 + %sgpr50 = tail call i32 asm sideeffect "s_mov_b32 s50, 0", "={s50}"() #0 + %sgpr51 = tail call i32 asm sideeffect "s_mov_b32 s51, 0", "={s51}"() #0 + %sgpr52 = tail call i32 asm sideeffect "s_mov_b32 s52, 0", "={s52}"() #0 + %sgpr53 = tail call i32 asm sideeffect "s_mov_b32 s53, 0", "={s53}"() #0 + %sgpr54 = tail call i32 asm sideeffect "s_mov_b32 s54, 0", "={s54}"() #0 + %sgpr55 = tail call i32 asm sideeffect "s_mov_b32 s55, 0", "={s55}"() #0 + %sgpr56 = tail call i32 asm sideeffect "s_mov_b32 s56, 0", "={s56}"() #0 + %sgpr57 = tail call i32 asm sideeffect "s_mov_b32 s57, 0", "={s57}"() #0 + %sgpr58 = tail call i32 asm sideeffect "s_mov_b32 s58, 0", "={s58}"() #0 + %sgpr59 = tail call i32 asm 
sideeffect "s_mov_b32 s59, 0", "={s59}"() #0 + %sgpr60 = tail call i32 asm sideeffect "s_mov_b32 s60, 0", "={s60}"() #0 + %sgpr61 = tail call i32 asm sideeffect "s_mov_b32 s61, 0", "={s61}"() #0 + %sgpr62 = tail call i32 asm sideeffect "s_mov_b32 s62, 0", "={s62}"() #0 + %sgpr63 = tail call i32 asm sideeffect "s_mov_b32 s63, 0", "={s63}"() #0 + %sgpr64 = tail call i32 asm sideeffect "s_mov_b32 s64, 0", "={s64}"() #0 + %sgpr65 = tail call i32 asm sideeffect "s_mov_b32 s65, 0", "={s65}"() #0 + %sgpr66 = tail call i32 asm sideeffect "s_mov_b32 s66, 0", "={s66}"() #0 + %sgpr67 = tail call i32 asm sideeffect "s_mov_b32 s67, 0", "={s67}"() #0 + %sgpr68 = tail call i32 asm sideeffect "s_mov_b32 s68, 0", "={s68}"() #0 + %sgpr69 = tail call i32 asm sideeffect "s_mov_b32 s69, 0", "={s69}"() #0 + %sgpr70 = tail call i32 asm sideeffect "s_mov_b32 s70, 0", "={s70}"() #0 + %sgpr71 = tail call i32 asm sideeffect "s_mov_b32 s71, 0", "={s71}"() #0 + %sgpr72 = tail call i32 asm sideeffect "s_mov_b32 s72, 0", "={s72}"() #0 + %sgpr73 = tail call i32 asm sideeffect "s_mov_b32 s73, 0", "={s73}"() #0 + %sgpr74 = tail call i32 asm sideeffect "s_mov_b32 s74, 0", "={s74}"() #0 + %sgpr75 = tail call i32 asm sideeffect "s_mov_b32 s75, 0", "={s75}"() #0 + %sgpr76 = tail call i32 asm sideeffect "s_mov_b32 s76, 0", "={s76}"() #0 + %sgpr77 = tail call i32 asm sideeffect "s_mov_b32 s77, 0", "={s77}"() #0 + %sgpr78 = tail call i32 asm sideeffect "s_mov_b32 s78, 0", "={s78}"() #0 + %sgpr79 = tail call i32 asm sideeffect "s_mov_b32 s79, 0", "={s79}"() #0 + %sgpr80 = tail call i32 asm sideeffect "s_mov_b32 s80, 0", "={s80}"() #0 + %sgpr81 = tail call i32 asm sideeffect "s_mov_b32 s81, 0", "={s81}"() #0 + %sgpr82 = tail call i32 asm sideeffect "s_mov_b32 s82, 0", "={s82}"() #0 + %sgpr83 = tail call i32 asm sideeffect "s_mov_b32 s83, 0", "={s83}"() #0 + %sgpr84 = tail call i32 asm sideeffect "s_mov_b32 s84, 0", "={s84}"() #0 + %sgpr85 = tail call i32 asm sideeffect "s_mov_b32 s85, 0", "={s85}"() #0 + %sgpr86 = tail call i32 asm sideeffect "s_mov_b32 s86, 0", "={s86}"() #0 + %sgpr87 = tail call i32 asm sideeffect "s_mov_b32 s87, 0", "={s87}"() #0 + %sgpr88 = tail call i32 asm sideeffect "s_mov_b32 s88, 0", "={s88}"() #0 + %sgpr89 = tail call i32 asm sideeffect "s_mov_b32 s89, 0", "={s89}"() #0 + %sgpr90 = tail call i32 asm sideeffect "s_mov_b32 s90, 0", "={s90}"() #0 + %sgpr91 = tail call i32 asm sideeffect "s_mov_b32 s91, 0", "={s91}"() #0 + %sgpr92 = tail call i32 asm sideeffect "s_mov_b32 s92, 0", "={s92}"() #0 + %sgpr93 = tail call i32 asm sideeffect "s_mov_b32 s93, 0", "={s93}"() #0 + %sgpr94 = tail call i32 asm sideeffect "s_mov_b32 s94, 0", "={s94}"() #0 + %sgpr95 = tail call i32 asm sideeffect "s_mov_b32 s95, 0", "={s95}"() #0 + %sgpr96 = tail call i32 asm sideeffect "s_mov_b32 s96, 0", "={s96}"() #0 + %sgpr97 = tail call i32 asm sideeffect "s_mov_b32 s97, 0", "={s97}"() #0 + %sgpr98 = tail call i32 asm sideeffect "s_mov_b32 s98, 0", "={s98}"() #0 + %sgpr99 = tail call i32 asm sideeffect "s_mov_b32 s99, 0", "={s99}"() #0 + %sgpr100 = tail call i32 asm sideeffect "s_mov_b32 s100, 0", "={s100}"() #0 + %sgpr101 = tail call i32 asm sideeffect "s_mov_b32 s101, 0", "={s101}"() #0 + %vcc_lo = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_lo}"() #0 + %vcc_hi = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_hi}"() #0 + %cmp = icmp eq i32 %cnd, 0 + br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch + +bb2: ; 68 bytes + ; 64 byte asm + call void asm sideeffect + "v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + 
v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64",""() #0 + br label %bb3 + +bb3: + tail call void asm sideeffect "; reg use $0", "{s0}"(i32 %sgpr0) #0 + tail call void asm sideeffect "; reg use $0", "{s1}"(i32 %sgpr1) #0 + tail call void asm sideeffect "; reg use $0", "{s2}"(i32 %sgpr2) #0 + tail call void asm sideeffect "; reg use $0", "{s3}"(i32 %sgpr3) #0 + tail call void asm sideeffect "; reg use $0", "{s4}"(i32 %sgpr4) #0 + tail call void asm sideeffect "; reg use $0", "{s5}"(i32 %sgpr5) #0 + tail call void asm sideeffect "; reg use $0", "{s6}"(i32 %sgpr6) #0 + tail call void asm sideeffect "; reg use $0", "{s7}"(i32 %sgpr7) #0 + tail call void asm sideeffect "; reg use $0", "{s8}"(i32 %sgpr8) #0 + tail call void asm sideeffect "; reg use $0", "{s9}"(i32 %sgpr9) #0 + tail call void asm sideeffect "; reg use $0", "{s10}"(i32 %sgpr10) #0 + tail call void asm sideeffect "; reg use $0", "{s11}"(i32 %sgpr11) #0 + tail call void asm sideeffect "; reg use $0", "{s12}"(i32 %sgpr12) #0 + tail call void asm sideeffect "; reg use $0", "{s13}"(i32 %sgpr13) #0 + tail call void asm sideeffect "; reg use $0", "{s14}"(i32 %sgpr14) #0 + tail call void asm sideeffect "; reg use $0", "{s15}"(i32 %sgpr15) #0 + tail call void asm sideeffect "; reg use $0", "{s16}"(i32 %sgpr16) #0 + tail call void asm sideeffect "; reg use $0", "{s17}"(i32 %sgpr17) #0 + tail call void asm sideeffect "; reg use $0", "{s18}"(i32 %sgpr18) #0 + tail call void asm sideeffect "; reg use $0", "{s19}"(i32 %sgpr19) #0 + tail call void asm sideeffect "; reg use $0", "{s20}"(i32 %sgpr20) #0 + tail call void asm sideeffect "; reg use $0", "{s21}"(i32 %sgpr21) #0 + tail call void asm sideeffect "; reg use $0", "{s22}"(i32 %sgpr22) #0 + tail call void asm sideeffect "; reg use $0", "{s23}"(i32 %sgpr23) #0 + tail call void asm sideeffect "; reg use $0", "{s24}"(i32 %sgpr24) #0 + tail call void asm sideeffect "; reg use $0", "{s25}"(i32 %sgpr25) #0 + tail call void asm sideeffect "; reg use $0", "{s26}"(i32 %sgpr26) #0 + tail call void asm sideeffect "; reg use $0", "{s27}"(i32 %sgpr27) #0 + tail call void asm sideeffect "; reg use $0", "{s28}"(i32 %sgpr28) #0 + tail call void asm sideeffect "; reg use $0", "{s29}"(i32 %sgpr29) #0 + tail call void asm sideeffect "; reg use $0", "{s30}"(i32 %sgpr30) #0 + tail call void asm sideeffect "; reg use $0", "{s31}"(i32 %sgpr31) #0 + tail call void asm sideeffect "; reg use $0", "{s32}"(i32 %sgpr32) #0 + tail call void asm sideeffect "; reg use $0", "{s33}"(i32 %sgpr33) #0 + tail call void asm sideeffect "; reg use $0", "{s34}"(i32 %sgpr34) #0 + tail call void asm sideeffect "; reg use $0", "{s35}"(i32 %sgpr35) #0 + tail call void asm sideeffect "; reg use $0", "{s36}"(i32 %sgpr36) #0 + tail call void asm sideeffect "; reg use $0", "{s37}"(i32 %sgpr37) #0 + tail call void asm sideeffect "; reg use $0", "{s38}"(i32 %sgpr38) #0 + tail call void asm sideeffect "; reg use $0", "{s39}"(i32 %sgpr39) #0 + tail call void asm sideeffect "; reg use $0", "{s40}"(i32 %sgpr40) #0 + tail call void asm sideeffect "; reg use $0", "{s41}"(i32 %sgpr41) #0 + tail call void asm sideeffect "; reg use $0", "{s42}"(i32 %sgpr42) #0 + tail call void asm sideeffect "; reg use $0", "{s43}"(i32 %sgpr43) #0 + tail call void asm sideeffect "; reg use $0", "{s44}"(i32 %sgpr44) #0 + tail call void asm sideeffect "; reg use $0", "{s45}"(i32 %sgpr45) #0 + tail call void asm sideeffect "; reg use $0", "{s46}"(i32 %sgpr46) #0 + tail call void asm sideeffect "; reg use $0", "{s47}"(i32 %sgpr47) #0 + tail call void asm sideeffect "; reg 
use $0", "{s48}"(i32 %sgpr48) #0 + tail call void asm sideeffect "; reg use $0", "{s49}"(i32 %sgpr49) #0 + tail call void asm sideeffect "; reg use $0", "{s50}"(i32 %sgpr50) #0 + tail call void asm sideeffect "; reg use $0", "{s51}"(i32 %sgpr51) #0 + tail call void asm sideeffect "; reg use $0", "{s52}"(i32 %sgpr52) #0 + tail call void asm sideeffect "; reg use $0", "{s53}"(i32 %sgpr53) #0 + tail call void asm sideeffect "; reg use $0", "{s54}"(i32 %sgpr54) #0 + tail call void asm sideeffect "; reg use $0", "{s55}"(i32 %sgpr55) #0 + tail call void asm sideeffect "; reg use $0", "{s56}"(i32 %sgpr56) #0 + tail call void asm sideeffect "; reg use $0", "{s57}"(i32 %sgpr57) #0 + tail call void asm sideeffect "; reg use $0", "{s58}"(i32 %sgpr58) #0 + tail call void asm sideeffect "; reg use $0", "{s59}"(i32 %sgpr59) #0 + tail call void asm sideeffect "; reg use $0", "{s60}"(i32 %sgpr60) #0 + tail call void asm sideeffect "; reg use $0", "{s61}"(i32 %sgpr61) #0 + tail call void asm sideeffect "; reg use $0", "{s62}"(i32 %sgpr62) #0 + tail call void asm sideeffect "; reg use $0", "{s63}"(i32 %sgpr63) #0 + tail call void asm sideeffect "; reg use $0", "{s64}"(i32 %sgpr64) #0 + tail call void asm sideeffect "; reg use $0", "{s65}"(i32 %sgpr65) #0 + tail call void asm sideeffect "; reg use $0", "{s66}"(i32 %sgpr66) #0 + tail call void asm sideeffect "; reg use $0", "{s67}"(i32 %sgpr67) #0 + tail call void asm sideeffect "; reg use $0", "{s68}"(i32 %sgpr68) #0 + tail call void asm sideeffect "; reg use $0", "{s69}"(i32 %sgpr69) #0 + tail call void asm sideeffect "; reg use $0", "{s70}"(i32 %sgpr70) #0 + tail call void asm sideeffect "; reg use $0", "{s71}"(i32 %sgpr71) #0 + tail call void asm sideeffect "; reg use $0", "{s72}"(i32 %sgpr72) #0 + tail call void asm sideeffect "; reg use $0", "{s73}"(i32 %sgpr73) #0 + tail call void asm sideeffect "; reg use $0", "{s74}"(i32 %sgpr74) #0 + tail call void asm sideeffect "; reg use $0", "{s75}"(i32 %sgpr75) #0 + tail call void asm sideeffect "; reg use $0", "{s76}"(i32 %sgpr76) #0 + tail call void asm sideeffect "; reg use $0", "{s77}"(i32 %sgpr77) #0 + tail call void asm sideeffect "; reg use $0", "{s78}"(i32 %sgpr78) #0 + tail call void asm sideeffect "; reg use $0", "{s79}"(i32 %sgpr79) #0 + tail call void asm sideeffect "; reg use $0", "{s80}"(i32 %sgpr80) #0 + tail call void asm sideeffect "; reg use $0", "{s81}"(i32 %sgpr81) #0 + tail call void asm sideeffect "; reg use $0", "{s82}"(i32 %sgpr82) #0 + tail call void asm sideeffect "; reg use $0", "{s83}"(i32 %sgpr83) #0 + tail call void asm sideeffect "; reg use $0", "{s84}"(i32 %sgpr84) #0 + tail call void asm sideeffect "; reg use $0", "{s85}"(i32 %sgpr85) #0 + tail call void asm sideeffect "; reg use $0", "{s86}"(i32 %sgpr86) #0 + tail call void asm sideeffect "; reg use $0", "{s87}"(i32 %sgpr87) #0 + tail call void asm sideeffect "; reg use $0", "{s88}"(i32 %sgpr88) #0 + tail call void asm sideeffect "; reg use $0", "{s89}"(i32 %sgpr89) #0 + tail call void asm sideeffect "; reg use $0", "{s90}"(i32 %sgpr90) #0 + tail call void asm sideeffect "; reg use $0", "{s91}"(i32 %sgpr91) #0 + tail call void asm sideeffect "; reg use $0", "{s92}"(i32 %sgpr92) #0 + tail call void asm sideeffect "; reg use $0", "{s93}"(i32 %sgpr93) #0 + tail call void asm sideeffect "; reg use $0", "{s94}"(i32 %sgpr94) #0 + tail call void asm sideeffect "; reg use $0", "{s95}"(i32 %sgpr95) #0 + tail call void asm sideeffect "; reg use $0", "{s96}"(i32 %sgpr96) #0 + tail call void asm sideeffect "; reg use $0", "{s97}"(i32 
%sgpr97) #0 + tail call void asm sideeffect "; reg use $0", "{s98}"(i32 %sgpr98) #0 + tail call void asm sideeffect "; reg use $0", "{s99}"(i32 %sgpr99) #0 + tail call void asm sideeffect "; reg use $0", "{s100}"(i32 %sgpr100) #0 + tail call void asm sideeffect "; reg use $0", "{s101}"(i32 %sgpr101) #0 + tail call void asm sideeffect "; reg use $0", "{vcc_lo}"(i32 %vcc_lo) #0 + tail call void asm sideeffect "; reg use $0", "{vcc_hi}"(i32 %vcc_hi) #0 + ret void +} + +declare i32 @llvm.amdgcn.workgroup.id.x() #0 + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll new file mode 100644 index 0000000000000..53f533ebb2842 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll @@ -0,0 +1,647 @@ +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-misched < %s | FileCheck --check-prefixes=GFX11-PAL %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=GFX11-PAL-GCNTRACKERS %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -verify-misched < %s | FileCheck --check-prefixes=TONGA %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=TONGA-GCNTRACKERS %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched < %s | FileCheck --check-prefixes=GFX908 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=GFX908-GCNTRACKERS %s +; RUN: llc -mtriple=amdgcn -verify-misched < %s | FileCheck --check-prefixes=GENERIC %s +; RUN: llc -mtriple=amdgcn -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=GENERIC-GCNTRACKERS %s + +; GCN Trackers are sensitive to minor changes in RP, and will avoid scheduling certain instructions, which, if scheduled, +; allow scheduling of other instructions which reduce RP + +; CHECK-LABEL: {{^}}return_72xi32: +; GFX11-PAL: codeLenInByte = 768 +; GFX11-PAL-GCNTRACKERS: codeLenInByte = 888 +; GFX11-PAL: NumSgprs: 33 +; GFX11-PAL-GCNTRACKERS: NumSgprs: 33 +; GFX11-PAL: NumVgprs: 64 +; GFX11-PAL-GCNTRACKERS: NumVgprs: 64 +; GFX11-PAL: ScratchSize: 220 +; GFX11-PAL-GCNTRACKERS: ScratchSize: 248 + + +; CHECK-LABEL: {{^}}call_72xi32: +; GFX11-PAL: codeLenInByte = 1300 +; GFX11-PAL-GCNTRACKERS: codeLenInByte = 1372 +; GFX11-PAL: NumSgprs: 35 +; GFX11-PAL-GCNTRACKERS: NumSgprs: 35 +; GFX11-PAL: NumVgprs: 64 +; GFX11-PAL-GCNTRACKERS: NumVgprs: 64 +; GFX11-PAL: ScratchSize: 2780 +; GFX11-PAL-GCNTRACKERS: ScratchSize: 2808 + + +define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { + ret <72 x i32> %val +} + +define amdgpu_gfx void @call_72xi32() #1 { +entry: + %ret.0 = call amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> zeroinitializer) + %val.0 = insertelement <72 x i32> %ret.0, i32 42, i32 0 + %val.1 = insertelement <72 x i32> %val.0, i32 24, i32 58 + %ret.1 = call amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val.1) + ret void +} + +; CHECK-LABEL: {{^}}global_extload_v16f16_to_v16f64: +; TONGA: codeLenInByte = 420 +; TONGA-GCNTRACKERS: codeLenInByte = 436 +; TONGA: NumSgprs: 96 +; TONGA-GCNTRACKERS: NumSgprs: 96 +; TONGA: NumVgprs: 33 +; TONGA-GCNTRACKERS: NumVgprs: 25 +; TONGA: Occupancy: 7 +; TONGA-GCNTRACKERS: Occupancy: 8 + + +define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out, ptr addrspace(1) %in) { + %val = 
load <16 x half>, ptr addrspace(1) %in + %cvt = fpext <16 x half> %val to <16 x double> + store <16 x double> %cvt, ptr addrspace(1) %out + ret void +} + +; CHECK-LABEL: {{^}}constant_zextload_v64i16_to_v64i32: +; GENERIC: codeLenInByte = 860 +; GENERIC-GCNTRACKERS: codeLenInByte = 860 +; GENERIC: NumSgprs: 71 +; GENERIC-GCNTRACKERS: NumSgprs: 54 +; GENERIC: NumVgprs: 16 +; GENERIC-GCNTRACKERS: NumVgprs: 16 +; GENERIC: Occupancy: 7 +; GENERIC-GCNTRACKERS: Occupancy: 8 + +define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) { + %load = load <64 x i16>, ptr addrspace(4) %in + %ext = zext <64 x i16> %load to <64 x i32> + store <64 x i32> %ext, ptr addrspace(1) %out + ret void +} + +; CHECK-LABEL: {{^}}excess_soft_clause_reg_pressure: +; GFX908: codeLenInByte = 1436 +; GFX908-GCNTRACKERS: codeLenInByte = 1436 +; GFX908: NumSgprs: 56 +; GFX908-GCNTRACKERS: NumSgprs: 56 +; GFX908: NumVgprs: 43 +; GFX908-GCNTRACKERS: NumVgprs: 39 +; GFX908: Occupancy: 5 +; GFX908-GCNTRACKERS: Occupancy: 6 + + +define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspace(4) %wei_ptr, ptr addrspace(1) %out_ptr, ptr addrspace(1) %in) { +entry: + %i = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %i2 = load i64, ptr addrspace(4) %i, align 8 + %i3 = tail call i32 @llvm.amdgcn.workgroup.id.x() + %i4 = shl i32 %i3, 8 + %i5 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !5 + %i6 = add i32 %i4, %i5 + %i7 = trunc i64 %i2 to i32 + %conv = add i32 %i6, %i7 + %conv.frozen = freeze i32 %conv + %div = udiv i32 %conv.frozen, 49 + %add.ptr22 = getelementptr inbounds float, ptr addrspace(4) %wei_ptr, i64 undef + %in.ptr1 = getelementptr inbounds float, ptr addrspace(1) %in, i32 %i5 + br label %for.cond28.preheader + +for.cond28.preheader: ; preds = %for.cond28.preheader, %entry + %accum.sroa.110.0 = phi float [ 0.000000e+00, %entry ], [ %i251, %for.cond28.preheader ] + %accum.sroa.106.0 = phi float [ 0.000000e+00, %entry ], [ %i247, %for.cond28.preheader ] + %accum.sroa.102.0 = phi float [ 0.000000e+00, %entry ], [ %i243, %for.cond28.preheader ] + %accum.sroa.98.0 = phi float [ 0.000000e+00, %entry ], [ %i239, %for.cond28.preheader ] + %accum.sroa.94.0 = phi float [ 0.000000e+00, %entry ], [ %i235, %for.cond28.preheader ] + %accum.sroa.90.0 = phi float [ 0.000000e+00, %entry ], [ %i231, %for.cond28.preheader ] + %accum.sroa.86.0 = phi float [ 0.000000e+00, %entry ], [ %i227, %for.cond28.preheader ] + %accum.sroa.82.0 = phi float [ 0.000000e+00, %entry ], [ %i223, %for.cond28.preheader ] + %accum.sroa.78.0 = phi float [ 0.000000e+00, %entry ], [ %i219, %for.cond28.preheader ] + %accum.sroa.74.0 = phi float [ 0.000000e+00, %entry ], [ %i215, %for.cond28.preheader ] + %accum.sroa.70.0 = phi float [ 0.000000e+00, %entry ], [ %i211, %for.cond28.preheader ] + %accum.sroa.66.0 = phi float [ 0.000000e+00, %entry ], [ %i207, %for.cond28.preheader ] + %accum.sroa.62.0 = phi float [ 0.000000e+00, %entry ], [ %i203, %for.cond28.preheader ] + %accum.sroa.58.0 = phi float [ 0.000000e+00, %entry ], [ %i199, %for.cond28.preheader ] + %accum.sroa.54.0 = phi float [ 0.000000e+00, %entry ], [ %i195, %for.cond28.preheader ] + %accum.sroa.50.0 = phi float [ 0.000000e+00, %entry ], [ %i191, %for.cond28.preheader ] + %accum.sroa.46.0 = phi float [ 0.000000e+00, %entry ], [ %i187, %for.cond28.preheader ] + %accum.sroa.42.0 = phi float [ 0.000000e+00, %entry ], [ %i183, %for.cond28.preheader ] + %accum.sroa.38.0 = phi float [ 0.000000e+00, %entry ], [ %i179, 
%for.cond28.preheader ] + %accum.sroa.34.0 = phi float [ 0.000000e+00, %entry ], [ %i175, %for.cond28.preheader ] + %accum.sroa.30.0 = phi float [ 0.000000e+00, %entry ], [ %i171, %for.cond28.preheader ] + %accum.sroa.26.0 = phi float [ 0.000000e+00, %entry ], [ %i167, %for.cond28.preheader ] + %accum.sroa.22.0 = phi float [ 0.000000e+00, %entry ], [ %i163, %for.cond28.preheader ] + %accum.sroa.18.0 = phi float [ 0.000000e+00, %entry ], [ %i159, %for.cond28.preheader ] + %accum.sroa.14.0 = phi float [ 0.000000e+00, %entry ], [ %i155, %for.cond28.preheader ] + %accum.sroa.10.0 = phi float [ 0.000000e+00, %entry ], [ %i151, %for.cond28.preheader ] + %accum.sroa.6.0 = phi float [ 0.000000e+00, %entry ], [ %i147, %for.cond28.preheader ] + %accum.sroa.0.0 = phi float [ 0.000000e+00, %entry ], [ %i143, %for.cond28.preheader ] + %accum.sroa.114.0 = phi float [ 0.000000e+00, %entry ], [ %i255, %for.cond28.preheader ] + %accum.sroa.118.0 = phi float [ 0.000000e+00, %entry ], [ %i259, %for.cond28.preheader ] + %accum.sroa.122.0 = phi float [ 0.000000e+00, %entry ], [ %i263, %for.cond28.preheader ] + %accum.sroa.126.0 = phi float [ 0.000000e+00, %entry ], [ %i267, %for.cond28.preheader ] + %i_ptr.0288 = phi ptr addrspace(1) [ %in.ptr1, %entry ], [ %add.ptr47.3, %for.cond28.preheader ] + %w_ptr.0287 = phi ptr addrspace(4) [ %add.ptr22, %entry ], [ %add.ptr74, %for.cond28.preheader ] + %ci.0286 = phi i32 [ 0, %entry ], [ %inc116, %for.cond28.preheader ] + %i8 = load float, ptr addrspace(1) %i_ptr.0288, align 4 + %add.ptr47 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 49 + %i9 = load float, ptr addrspace(1) %add.ptr47, align 4 + %add.ptr47.1 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 98 + %i10 = load float, ptr addrspace(1) %add.ptr47.1, align 4 + %add.ptr47.2 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 147 + %i11 = load float, ptr addrspace(1) %add.ptr47.2, align 4 + %i12 = load float, ptr addrspace(4) %w_ptr.0287, align 4 + %add.ptr66 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1024 + %i13 = load float, ptr addrspace(4) %add.ptr66, align 4 + %add.ptr66.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2048 + %i14 = load float, ptr addrspace(4) %add.ptr66.1, align 4 + %add.ptr66.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3072 + %i15 = load float, ptr addrspace(4) %add.ptr66.2, align 4 + %add.ptr70 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1 + %i16 = load float, ptr addrspace(4) %add.ptr70, align 4 + %add.ptr66.1291 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1025 + %i17 = load float, ptr addrspace(4) %add.ptr66.1291, align 4 + %add.ptr66.1.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2049 + %i18 = load float, ptr addrspace(4) %add.ptr66.1.1, align 4 + %add.ptr66.2.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3073 + %i19 = load float, ptr addrspace(4) %add.ptr66.2.1, align 4 + %add.ptr70.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2 + %i20 = load float, ptr addrspace(4) %add.ptr70.1, align 4 + %add.ptr66.2293 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1026 + %i21 = load float, ptr addrspace(4) %add.ptr66.2293, align 4 + %add.ptr66.1.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2050 + %i22 = load float, ptr addrspace(4) %add.ptr66.1.2, align 4 + %add.ptr66.2.2 = getelementptr inbounds float, ptr addrspace(4) 
%w_ptr.0287, i64 3074 + %i23 = load float, ptr addrspace(4) %add.ptr66.2.2, align 4 + %add.ptr70.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3 + %i24 = load float, ptr addrspace(4) %add.ptr70.2, align 4 + %add.ptr66.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1027 + %i25 = load float, ptr addrspace(4) %add.ptr66.3, align 4 + %add.ptr66.1.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2051 + %i26 = load float, ptr addrspace(4) %add.ptr66.1.3, align 4 + %add.ptr66.2.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3075 + %i27 = load float, ptr addrspace(4) %add.ptr66.2.3, align 4 + %add.ptr70.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 4 + %i28 = load float, ptr addrspace(4) %add.ptr70.3, align 4 + %add.ptr66.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1028 + %i29 = load float, ptr addrspace(4) %add.ptr66.4, align 4 + %add.ptr66.1.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2052 + %i30 = load float, ptr addrspace(4) %add.ptr66.1.4, align 4 + %add.ptr66.2.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3076 + %i31 = load float, ptr addrspace(4) %add.ptr66.2.4, align 4 + %add.ptr70.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 5 + %i32 = load float, ptr addrspace(4) %add.ptr70.4, align 4 + %add.ptr66.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1029 + %i33 = load float, ptr addrspace(4) %add.ptr66.5, align 4 + %add.ptr66.1.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2053 + %i34 = load float, ptr addrspace(4) %add.ptr66.1.5, align 4 + %add.ptr66.2.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3077 + %i35 = load float, ptr addrspace(4) %add.ptr66.2.5, align 4 + %add.ptr70.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 6 + %i36 = load float, ptr addrspace(4) %add.ptr70.5, align 4 + %add.ptr66.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1030 + %i37 = load float, ptr addrspace(4) %add.ptr66.6, align 4 + %add.ptr66.1.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2054 + %i38 = load float, ptr addrspace(4) %add.ptr66.1.6, align 4 + %add.ptr66.2.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3078 + %i39 = load float, ptr addrspace(4) %add.ptr66.2.6, align 4 + %add.ptr70.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 7 + %i40 = load float, ptr addrspace(4) %add.ptr70.6, align 4 + %add.ptr66.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1031 + %i41 = load float, ptr addrspace(4) %add.ptr66.7, align 4 + %add.ptr66.1.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2055 + %i42 = load float, ptr addrspace(4) %add.ptr66.1.7, align 4 + %add.ptr66.2.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3079 + %i43 = load float, ptr addrspace(4) %add.ptr66.2.7, align 4 + %add.ptr70.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 8 + %i44 = load float, ptr addrspace(4) %add.ptr70.7, align 4 + %add.ptr66.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1032 + %i45 = load float, ptr addrspace(4) %add.ptr66.8, align 4 + %add.ptr66.1.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2056 + %i46 = load float, ptr addrspace(4) %add.ptr66.1.8, align 4 + %add.ptr66.2.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 
3080 + %i47 = load float, ptr addrspace(4) %add.ptr66.2.8, align 4 + %add.ptr70.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 9 + %i48 = load float, ptr addrspace(4) %add.ptr70.8, align 4 + %add.ptr66.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1033 + %i49 = load float, ptr addrspace(4) %add.ptr66.9, align 4 + %add.ptr66.1.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2057 + %i50 = load float, ptr addrspace(4) %add.ptr66.1.9, align 4 + %add.ptr66.2.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3081 + %i51 = load float, ptr addrspace(4) %add.ptr66.2.9, align 4 + %add.ptr70.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 10 + %i52 = load float, ptr addrspace(4) %add.ptr70.9, align 4 + %add.ptr66.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1034 + %i53 = load float, ptr addrspace(4) %add.ptr66.10, align 4 + %add.ptr66.1.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2058 + %i54 = load float, ptr addrspace(4) %add.ptr66.1.10, align 4 + %add.ptr66.2.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3082 + %i55 = load float, ptr addrspace(4) %add.ptr66.2.10, align 4 + %add.ptr70.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 11 + %i56 = load float, ptr addrspace(4) %add.ptr70.10, align 4 + %add.ptr66.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1035 + %i57 = load float, ptr addrspace(4) %add.ptr66.11, align 4 + %add.ptr66.1.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2059 + %i58 = load float, ptr addrspace(4) %add.ptr66.1.11, align 4 + %add.ptr66.2.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3083 + %i59 = load float, ptr addrspace(4) %add.ptr66.2.11, align 4 + %add.ptr70.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 12 + %i60 = load float, ptr addrspace(4) %add.ptr70.11, align 4 + %add.ptr66.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1036 + %i61 = load float, ptr addrspace(4) %add.ptr66.12, align 4 + %add.ptr66.1.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2060 + %i62 = load float, ptr addrspace(4) %add.ptr66.1.12, align 4 + %add.ptr66.2.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3084 + %i63 = load float, ptr addrspace(4) %add.ptr66.2.12, align 4 + %add.ptr70.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 13 + %i64 = load float, ptr addrspace(4) %add.ptr70.12, align 4 + %add.ptr66.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1037 + %i65 = load float, ptr addrspace(4) %add.ptr66.13, align 4 + %add.ptr66.1.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2061 + %i66 = load float, ptr addrspace(4) %add.ptr66.1.13, align 4 + %add.ptr66.2.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3085 + %i67 = load float, ptr addrspace(4) %add.ptr66.2.13, align 4 + %add.ptr70.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 14 + %i68 = load float, ptr addrspace(4) %add.ptr70.13, align 4 + %add.ptr66.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1038 + %i69 = load float, ptr addrspace(4) %add.ptr66.14, align 4 + %add.ptr66.1.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2062 + %i70 = load float, ptr addrspace(4) %add.ptr66.1.14, align 4 + %add.ptr66.2.14 = getelementptr inbounds float, ptr 
addrspace(4) %w_ptr.0287, i64 3086 + %i71 = load float, ptr addrspace(4) %add.ptr66.2.14, align 4 + %add.ptr70.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 15 + %i72 = load float, ptr addrspace(4) %add.ptr70.14, align 4 + %add.ptr66.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1039 + %i73 = load float, ptr addrspace(4) %add.ptr66.15, align 4 + %add.ptr66.1.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2063 + %i74 = load float, ptr addrspace(4) %add.ptr66.1.15, align 4 + %add.ptr66.2.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3087 + %i75 = load float, ptr addrspace(4) %add.ptr66.2.15, align 4 + %add.ptr70.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 16 + %i76 = load float, ptr addrspace(4) %add.ptr70.15, align 4 + %add.ptr66.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1040 + %i77 = load float, ptr addrspace(4) %add.ptr66.16, align 4 + %add.ptr66.1.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2064 + %i78 = load float, ptr addrspace(4) %add.ptr66.1.16, align 4 + %add.ptr66.2.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3088 + %i79 = load float, ptr addrspace(4) %add.ptr66.2.16, align 4 + %add.ptr70.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 17 + %i80 = load float, ptr addrspace(4) %add.ptr70.16, align 4 + %add.ptr66.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1041 + %i81 = load float, ptr addrspace(4) %add.ptr66.17, align 4 + %add.ptr66.1.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2065 + %i82 = load float, ptr addrspace(4) %add.ptr66.1.17, align 4 + %add.ptr66.2.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3089 + %i83 = load float, ptr addrspace(4) %add.ptr66.2.17, align 4 + %add.ptr70.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 18 + %i84 = load float, ptr addrspace(4) %add.ptr70.17, align 4 + %add.ptr66.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1042 + %i85 = load float, ptr addrspace(4) %add.ptr66.18, align 4 + %add.ptr66.1.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2066 + %i86 = load float, ptr addrspace(4) %add.ptr66.1.18, align 4 + %add.ptr66.2.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3090 + %i87 = load float, ptr addrspace(4) %add.ptr66.2.18, align 4 + %add.ptr70.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 19 + %i88 = load float, ptr addrspace(4) %add.ptr70.18, align 4 + %add.ptr66.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1043 + %i89 = load float, ptr addrspace(4) %add.ptr66.19, align 4 + %add.ptr66.1.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2067 + %i90 = load float, ptr addrspace(4) %add.ptr66.1.19, align 4 + %add.ptr66.2.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3091 + %i91 = load float, ptr addrspace(4) %add.ptr66.2.19, align 4 + %add.ptr70.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 20 + %i92 = load float, ptr addrspace(4) %add.ptr70.19, align 4 + %add.ptr66.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1044 + %i93 = load float, ptr addrspace(4) %add.ptr66.20, align 4 + %add.ptr66.1.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2068 + %i94 = load float, ptr addrspace(4) %add.ptr66.1.20, align 4 + %add.ptr66.2.20 = 
getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3092 + %i95 = load float, ptr addrspace(4) %add.ptr66.2.20, align 4 + %add.ptr70.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 21 + %i96 = load float, ptr addrspace(4) %add.ptr70.20, align 4 + %add.ptr66.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1045 + %i97 = load float, ptr addrspace(4) %add.ptr66.21, align 4 + %add.ptr66.1.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2069 + %i98 = load float, ptr addrspace(4) %add.ptr66.1.21, align 4 + %add.ptr66.2.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3093 + %i99 = load float, ptr addrspace(4) %add.ptr66.2.21, align 4 + %add.ptr70.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 22 + %i100 = load float, ptr addrspace(4) %add.ptr70.21, align 4 + %add.ptr66.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1046 + %i101 = load float, ptr addrspace(4) %add.ptr66.22, align 4 + %add.ptr66.1.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2070 + %i102 = load float, ptr addrspace(4) %add.ptr66.1.22, align 4 + %add.ptr66.2.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3094 + %i103 = load float, ptr addrspace(4) %add.ptr66.2.22, align 4 + %add.ptr70.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 23 + %i104 = load float, ptr addrspace(4) %add.ptr70.22, align 4 + %add.ptr66.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1047 + %i105 = load float, ptr addrspace(4) %add.ptr66.23, align 4 + %add.ptr66.1.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2071 + %i106 = load float, ptr addrspace(4) %add.ptr66.1.23, align 4 + %add.ptr66.2.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3095 + %i107 = load float, ptr addrspace(4) %add.ptr66.2.23, align 4 + %add.ptr70.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 24 + %i108 = load float, ptr addrspace(4) %add.ptr70.23, align 4 + %add.ptr66.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1048 + %i109 = load float, ptr addrspace(4) %add.ptr66.24, align 4 + %add.ptr66.1.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2072 + %i110 = load float, ptr addrspace(4) %add.ptr66.1.24, align 4 + %add.ptr66.2.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3096 + %i111 = load float, ptr addrspace(4) %add.ptr66.2.24, align 4 + %add.ptr70.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 25 + %i112 = load float, ptr addrspace(4) %add.ptr70.24, align 4 + %add.ptr66.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1049 + %i113 = load float, ptr addrspace(4) %add.ptr66.25, align 4 + %add.ptr66.1.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2073 + %i114 = load float, ptr addrspace(4) %add.ptr66.1.25, align 4 + %add.ptr66.2.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3097 + %i115 = load float, ptr addrspace(4) %add.ptr66.2.25, align 4 + %add.ptr70.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 26 + %i116 = load float, ptr addrspace(4) %add.ptr70.25, align 4 + %add.ptr66.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1050 + %i117 = load float, ptr addrspace(4) %add.ptr66.26, align 4 + %add.ptr66.1.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2074 + %i118 = load float, ptr 
addrspace(4) %add.ptr66.1.26, align 4 + %add.ptr66.2.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3098 + %i119 = load float, ptr addrspace(4) %add.ptr66.2.26, align 4 + %add.ptr70.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 27 + %i120 = load float, ptr addrspace(4) %add.ptr70.26, align 4 + %add.ptr66.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1051 + %i121 = load float, ptr addrspace(4) %add.ptr66.27, align 4 + %add.ptr66.1.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2075 + %i122 = load float, ptr addrspace(4) %add.ptr66.1.27, align 4 + %add.ptr66.2.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3099 + %i123 = load float, ptr addrspace(4) %add.ptr66.2.27, align 4 + %add.ptr70.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 28 + %i124 = load float, ptr addrspace(4) %add.ptr70.27, align 4 + %add.ptr66.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1052 + %i125 = load float, ptr addrspace(4) %add.ptr66.28, align 4 + %add.ptr66.1.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2076 + %i126 = load float, ptr addrspace(4) %add.ptr66.1.28, align 4 + %add.ptr66.2.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3100 + %i127 = load float, ptr addrspace(4) %add.ptr66.2.28, align 4 + %add.ptr70.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 29 + %i128 = load float, ptr addrspace(4) %add.ptr70.28, align 4 + %add.ptr66.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1053 + %i129 = load float, ptr addrspace(4) %add.ptr66.29, align 4 + %add.ptr66.1.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2077 + %i130 = load float, ptr addrspace(4) %add.ptr66.1.29, align 4 + %add.ptr66.2.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3101 + %i131 = load float, ptr addrspace(4) %add.ptr66.2.29, align 4 + %add.ptr70.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 30 + %i132 = load float, ptr addrspace(4) %add.ptr70.29, align 4 + %add.ptr66.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1054 + %i133 = load float, ptr addrspace(4) %add.ptr66.30, align 4 + %add.ptr66.1.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2078 + %i134 = load float, ptr addrspace(4) %add.ptr66.1.30, align 4 + %add.ptr66.2.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3102 + %i135 = load float, ptr addrspace(4) %add.ptr66.2.30, align 4 + %add.ptr70.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 31 + %i136 = load float, ptr addrspace(4) %add.ptr70.30, align 4 + %add.ptr66.31 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1055 + %i137 = load float, ptr addrspace(4) %add.ptr66.31, align 4 + %add.ptr66.1.31 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2079 + %i138 = load float, ptr addrspace(4) %add.ptr66.1.31, align 4 + %add.ptr66.2.31 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3103 + %i139 = load float, ptr addrspace(4) %add.ptr66.2.31, align 4 + %add.ptr47.3 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 196 + %i140 = tail call float @llvm.fmuladd.f32(float %i8, float %i12, float %accum.sroa.0.0) + %i141 = tail call float @llvm.fmuladd.f32(float %i9, float %i13, float %i140) + %i142 = tail call float @llvm.fmuladd.f32(float %i10, float %i14, float %i141) + %i143 = tail 
call float @llvm.fmuladd.f32(float %i11, float %i15, float %i142) + %i144 = tail call float @llvm.fmuladd.f32(float %i8, float %i16, float %accum.sroa.6.0) + %i145 = tail call float @llvm.fmuladd.f32(float %i9, float %i17, float %i144) + %i146 = tail call float @llvm.fmuladd.f32(float %i10, float %i18, float %i145) + %i147 = tail call float @llvm.fmuladd.f32(float %i11, float %i19, float %i146) + %i148 = tail call float @llvm.fmuladd.f32(float %i8, float %i20, float %accum.sroa.10.0) + %i149 = tail call float @llvm.fmuladd.f32(float %i9, float %i21, float %i148) + %i150 = tail call float @llvm.fmuladd.f32(float %i10, float %i22, float %i149) + %i151 = tail call float @llvm.fmuladd.f32(float %i11, float %i23, float %i150) + %i152 = tail call float @llvm.fmuladd.f32(float %i8, float %i24, float %accum.sroa.14.0) + %i153 = tail call float @llvm.fmuladd.f32(float %i9, float %i25, float %i152) + %i154 = tail call float @llvm.fmuladd.f32(float %i10, float %i26, float %i153) + %i155 = tail call float @llvm.fmuladd.f32(float %i11, float %i27, float %i154) + %i156 = tail call float @llvm.fmuladd.f32(float %i8, float %i28, float %accum.sroa.18.0) + %i157 = tail call float @llvm.fmuladd.f32(float %i9, float %i29, float %i156) + %i158 = tail call float @llvm.fmuladd.f32(float %i10, float %i30, float %i157) + %i159 = tail call float @llvm.fmuladd.f32(float %i11, float %i31, float %i158) + %i160 = tail call float @llvm.fmuladd.f32(float %i8, float %i32, float %accum.sroa.22.0) + %i161 = tail call float @llvm.fmuladd.f32(float %i9, float %i33, float %i160) + %i162 = tail call float @llvm.fmuladd.f32(float %i10, float %i34, float %i161) + %i163 = tail call float @llvm.fmuladd.f32(float %i11, float %i35, float %i162) + %i164 = tail call float @llvm.fmuladd.f32(float %i8, float %i36, float %accum.sroa.26.0) + %i165 = tail call float @llvm.fmuladd.f32(float %i9, float %i37, float %i164) + %i166 = tail call float @llvm.fmuladd.f32(float %i10, float %i38, float %i165) + %i167 = tail call float @llvm.fmuladd.f32(float %i11, float %i39, float %i166) + %i168 = tail call float @llvm.fmuladd.f32(float %i8, float %i40, float %accum.sroa.30.0) + %i169 = tail call float @llvm.fmuladd.f32(float %i9, float %i41, float %i168) + %i170 = tail call float @llvm.fmuladd.f32(float %i10, float %i42, float %i169) + %i171 = tail call float @llvm.fmuladd.f32(float %i11, float %i43, float %i170) + %i172 = tail call float @llvm.fmuladd.f32(float %i8, float %i44, float %accum.sroa.34.0) + %i173 = tail call float @llvm.fmuladd.f32(float %i9, float %i45, float %i172) + %i174 = tail call float @llvm.fmuladd.f32(float %i10, float %i46, float %i173) + %i175 = tail call float @llvm.fmuladd.f32(float %i11, float %i47, float %i174) + %i176 = tail call float @llvm.fmuladd.f32(float %i8, float %i48, float %accum.sroa.38.0) + %i177 = tail call float @llvm.fmuladd.f32(float %i9, float %i49, float %i176) + %i178 = tail call float @llvm.fmuladd.f32(float %i10, float %i50, float %i177) + %i179 = tail call float @llvm.fmuladd.f32(float %i11, float %i51, float %i178) + %i180 = tail call float @llvm.fmuladd.f32(float %i8, float %i52, float %accum.sroa.42.0) + %i181 = tail call float @llvm.fmuladd.f32(float %i9, float %i53, float %i180) + %i182 = tail call float @llvm.fmuladd.f32(float %i10, float %i54, float %i181) + %i183 = tail call float @llvm.fmuladd.f32(float %i11, float %i55, float %i182) + %i184 = tail call float @llvm.fmuladd.f32(float %i8, float %i56, float %accum.sroa.46.0) + %i185 = tail call float @llvm.fmuladd.f32(float %i9, float %i57, 
float %i184) + %i186 = tail call float @llvm.fmuladd.f32(float %i10, float %i58, float %i185) + %i187 = tail call float @llvm.fmuladd.f32(float %i11, float %i59, float %i186) + %i188 = tail call float @llvm.fmuladd.f32(float %i8, float %i60, float %accum.sroa.50.0) + %i189 = tail call float @llvm.fmuladd.f32(float %i9, float %i61, float %i188) + %i190 = tail call float @llvm.fmuladd.f32(float %i10, float %i62, float %i189) + %i191 = tail call float @llvm.fmuladd.f32(float %i11, float %i63, float %i190) + %i192 = tail call float @llvm.fmuladd.f32(float %i8, float %i64, float %accum.sroa.54.0) + %i193 = tail call float @llvm.fmuladd.f32(float %i9, float %i65, float %i192) + %i194 = tail call float @llvm.fmuladd.f32(float %i10, float %i66, float %i193) + %i195 = tail call float @llvm.fmuladd.f32(float %i11, float %i67, float %i194) + %i196 = tail call float @llvm.fmuladd.f32(float %i8, float %i68, float %accum.sroa.58.0) + %i197 = tail call float @llvm.fmuladd.f32(float %i9, float %i69, float %i196) + %i198 = tail call float @llvm.fmuladd.f32(float %i10, float %i70, float %i197) + %i199 = tail call float @llvm.fmuladd.f32(float %i11, float %i71, float %i198) + %i200 = tail call float @llvm.fmuladd.f32(float %i8, float %i72, float %accum.sroa.62.0) + %i201 = tail call float @llvm.fmuladd.f32(float %i9, float %i73, float %i200) + %i202 = tail call float @llvm.fmuladd.f32(float %i10, float %i74, float %i201) + %i203 = tail call float @llvm.fmuladd.f32(float %i11, float %i75, float %i202) + %i204 = tail call float @llvm.fmuladd.f32(float %i8, float %i76, float %accum.sroa.66.0) + %i205 = tail call float @llvm.fmuladd.f32(float %i9, float %i77, float %i204) + %i206 = tail call float @llvm.fmuladd.f32(float %i10, float %i78, float %i205) + %i207 = tail call float @llvm.fmuladd.f32(float %i11, float %i79, float %i206) + %i208 = tail call float @llvm.fmuladd.f32(float %i8, float %i80, float %accum.sroa.70.0) + %i209 = tail call float @llvm.fmuladd.f32(float %i9, float %i81, float %i208) + %i210 = tail call float @llvm.fmuladd.f32(float %i10, float %i82, float %i209) + %i211 = tail call float @llvm.fmuladd.f32(float %i11, float %i83, float %i210) + %i212 = tail call float @llvm.fmuladd.f32(float %i8, float %i84, float %accum.sroa.74.0) + %i213 = tail call float @llvm.fmuladd.f32(float %i9, float %i85, float %i212) + %i214 = tail call float @llvm.fmuladd.f32(float %i10, float %i86, float %i213) + %i215 = tail call float @llvm.fmuladd.f32(float %i11, float %i87, float %i214) + %i216 = tail call float @llvm.fmuladd.f32(float %i8, float %i88, float %accum.sroa.78.0) + %i217 = tail call float @llvm.fmuladd.f32(float %i9, float %i89, float %i216) + %i218 = tail call float @llvm.fmuladd.f32(float %i10, float %i90, float %i217) + %i219 = tail call float @llvm.fmuladd.f32(float %i11, float %i91, float %i218) + %i220 = tail call float @llvm.fmuladd.f32(float %i8, float %i92, float %accum.sroa.82.0) + %i221 = tail call float @llvm.fmuladd.f32(float %i9, float %i93, float %i220) + %i222 = tail call float @llvm.fmuladd.f32(float %i10, float %i94, float %i221) + %i223 = tail call float @llvm.fmuladd.f32(float %i11, float %i95, float %i222) + %i224 = tail call float @llvm.fmuladd.f32(float %i8, float %i96, float %accum.sroa.86.0) + %i225 = tail call float @llvm.fmuladd.f32(float %i9, float %i97, float %i224) + %i226 = tail call float @llvm.fmuladd.f32(float %i10, float %i98, float %i225) + %i227 = tail call float @llvm.fmuladd.f32(float %i11, float %i99, float %i226) + %i228 = tail call float @llvm.fmuladd.f32(float 
%i8, float %i100, float %accum.sroa.90.0) + %i229 = tail call float @llvm.fmuladd.f32(float %i9, float %i101, float %i228) + %i230 = tail call float @llvm.fmuladd.f32(float %i10, float %i102, float %i229) + %i231 = tail call float @llvm.fmuladd.f32(float %i11, float %i103, float %i230) + %i232 = tail call float @llvm.fmuladd.f32(float %i8, float %i104, float %accum.sroa.94.0) + %i233 = tail call float @llvm.fmuladd.f32(float %i9, float %i105, float %i232) + %i234 = tail call float @llvm.fmuladd.f32(float %i10, float %i106, float %i233) + %i235 = tail call float @llvm.fmuladd.f32(float %i11, float %i107, float %i234) + %i236 = tail call float @llvm.fmuladd.f32(float %i8, float %i108, float %accum.sroa.98.0) + %i237 = tail call float @llvm.fmuladd.f32(float %i9, float %i109, float %i236) + %i238 = tail call float @llvm.fmuladd.f32(float %i10, float %i110, float %i237) + %i239 = tail call float @llvm.fmuladd.f32(float %i11, float %i111, float %i238) + %i240 = tail call float @llvm.fmuladd.f32(float %i8, float %i112, float %accum.sroa.102.0) + %i241 = tail call float @llvm.fmuladd.f32(float %i9, float %i113, float %i240) + %i242 = tail call float @llvm.fmuladd.f32(float %i10, float %i114, float %i241) + %i243 = tail call float @llvm.fmuladd.f32(float %i11, float %i115, float %i242) + %i244 = tail call float @llvm.fmuladd.f32(float %i8, float %i116, float %accum.sroa.106.0) + %i245 = tail call float @llvm.fmuladd.f32(float %i9, float %i117, float %i244) + %i246 = tail call float @llvm.fmuladd.f32(float %i10, float %i118, float %i245) + %i247 = tail call float @llvm.fmuladd.f32(float %i11, float %i119, float %i246) + %i248 = tail call float @llvm.fmuladd.f32(float %i8, float %i120, float %accum.sroa.110.0) + %i249 = tail call float @llvm.fmuladd.f32(float %i9, float %i121, float %i248) + %i250 = tail call float @llvm.fmuladd.f32(float %i10, float %i122, float %i249) + %i251 = tail call float @llvm.fmuladd.f32(float %i11, float %i123, float %i250) + %i252 = tail call float @llvm.fmuladd.f32(float %i8, float %i124, float %accum.sroa.114.0) + %i253 = tail call float @llvm.fmuladd.f32(float %i9, float %i125, float %i252) + %i254 = tail call float @llvm.fmuladd.f32(float %i10, float %i126, float %i253) + %i255 = tail call float @llvm.fmuladd.f32(float %i11, float %i127, float %i254) + %i256 = tail call float @llvm.fmuladd.f32(float %i8, float %i128, float %accum.sroa.118.0) + %i257 = tail call float @llvm.fmuladd.f32(float %i9, float %i129, float %i256) + %i258 = tail call float @llvm.fmuladd.f32(float %i10, float %i130, float %i257) + %i259 = tail call float @llvm.fmuladd.f32(float %i11, float %i131, float %i258) + %i260 = tail call float @llvm.fmuladd.f32(float %i8, float %i132, float %accum.sroa.122.0) + %i261 = tail call float @llvm.fmuladd.f32(float %i9, float %i133, float %i260) + %i262 = tail call float @llvm.fmuladd.f32(float %i10, float %i134, float %i261) + %i263 = tail call float @llvm.fmuladd.f32(float %i11, float %i135, float %i262) + %i264 = tail call float @llvm.fmuladd.f32(float %i8, float %i136, float %accum.sroa.126.0) + %i265 = tail call float @llvm.fmuladd.f32(float %i9, float %i137, float %i264) + %i266 = tail call float @llvm.fmuladd.f32(float %i10, float %i138, float %i265) + %i267 = tail call float @llvm.fmuladd.f32(float %i11, float %i139, float %i266) + %add.ptr74 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 4096 + %inc116 = add nuw nsw i32 %ci.0286, 1 + %exitcond.not = icmp eq i32 %inc116, 512 + br i1 %exitcond.not, label %for.cond.cleanup26, label 
%for.cond28.preheader + +for.cond.cleanup26: ; preds = %for.cond28.preheader + %mul119 = shl nuw nsw i32 undef, 1 + %mul120 = mul i32 %div, 200704 + %mul121 = mul i32 undef, 6272 + %add122 = add i32 %mul120, %mul121 + %mul123 = mul nuw nsw i32 undef, 28 + %add124 = add i32 %add122, %mul123 + %add126 = add i32 %add124, %mul119 + %idx.ext127 = zext i32 %add126 to i64 + %add.ptr128 = getelementptr inbounds float, ptr addrspace(1) %out_ptr, i64 %idx.ext127 + store float %i143, ptr addrspace(1) %add.ptr128, align 4 + %add.ptr184 = getelementptr inbounds float, ptr addrspace(1) %add.ptr128, i64 196 + store float %i147, ptr addrspace(1) %add.ptr184, align 4 + %add.ptr167.1 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184, i64 14 + store float 0.000000e+00, ptr addrspace(1) %add.ptr167.1, align 4 + %add.ptr175.1.1 = getelementptr inbounds float, ptr addrspace(1) %add.ptr167.1, i64 1 + store float 0.000000e+00, ptr addrspace(1) %add.ptr175.1.1, align 4 + %add.ptr184.1 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184, i64 196 + store float %i151, ptr addrspace(1) %add.ptr184.1, align 4 + %add.ptr184.2 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.1, i64 196 + store float %i155, ptr addrspace(1) %add.ptr184.2, align 4 + %add.ptr184.3 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.2, i64 196 + store float %i159, ptr addrspace(1) %add.ptr184.3, align 4 + %add.ptr184.4 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.3, i64 196 + store float %i163, ptr addrspace(1) %add.ptr184.4, align 4 + %add.ptr154.5 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.4, i64 1 + store float 0.000000e+00, ptr addrspace(1) %add.ptr154.5, align 4 + %add.ptr184.5 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.4, i64 196 + store float %i167, ptr addrspace(1) %add.ptr184.5, align 4 + %add.ptr154.6 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.5, i64 1 + store float 0.000000e+00, ptr addrspace(1) %add.ptr154.6, align 4 + %add.ptr184.6 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.5, i64 196 + store float %i171, ptr addrspace(1) %add.ptr184.6, align 4 + %add.ptr184.7 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.6, i64 196 + store float %i175, ptr addrspace(1) %add.ptr184.7, align 4 + %add.ptr167.8 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.7, i64 14 + store float 0.000000e+00, ptr addrspace(1) %add.ptr167.8, align 4 + %add.ptr175.1.8 = getelementptr inbounds float, ptr addrspace(1) %add.ptr167.8, i64 1 + store float 0.000000e+00, ptr addrspace(1) %add.ptr175.1.8, align 4 + %add.ptr184.8 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.7, i64 196 + store float %i179, ptr addrspace(1) %add.ptr184.8, align 4 + %add.ptr184.9 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.8, i64 196 + store float %i183, ptr addrspace(1) %add.ptr184.9, align 4 + %add.ptr184.10 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.9, i64 196 + store float %i187, ptr addrspace(1) %add.ptr184.10, align 4 + %add.ptr184.11 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.10, i64 196 + store float %i191, ptr addrspace(1) %add.ptr184.11, align 4 + %add.ptr184.12 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.11, i64 196 + store float %i195, ptr addrspace(1) %add.ptr184.12, align 4 + %add.ptr184.13 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.12, i64 196 + store float %i199, ptr addrspace(1) %add.ptr184.13, align 4 + %add.ptr184.14 
= getelementptr inbounds float, ptr addrspace(1) %add.ptr184.13, i64 196 + store float %i203, ptr addrspace(1) %add.ptr184.14, align 4 + %add.ptr184.15 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.14, i64 196 + store float %i207, ptr addrspace(1) %add.ptr184.15, align 4 + %add.ptr184.16 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.15, i64 196 + store float %i211, ptr addrspace(1) %add.ptr184.16, align 4 + %add.ptr184.17 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.16, i64 196 + store float %i215, ptr addrspace(1) %add.ptr184.17, align 4 + %add.ptr184.18 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.17, i64 196 + store float %i219, ptr addrspace(1) %add.ptr184.18, align 4 + %add.ptr184.19 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.18, i64 196 + store float %i223, ptr addrspace(1) %add.ptr184.19, align 4 + %add.ptr184.20 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.19, i64 196 + store float %i227, ptr addrspace(1) %add.ptr184.20, align 4 + %add.ptr184.21 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.20, i64 196 + store float %i231, ptr addrspace(1) %add.ptr184.21, align 4 + %add.ptr184.22 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.21, i64 196 + store float %i235, ptr addrspace(1) %add.ptr184.22, align 4 + %add.ptr184.23 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.22, i64 196 + store float %i239, ptr addrspace(1) %add.ptr184.23, align 4 + %add.ptr184.24 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.23, i64 196 + store float %i243, ptr addrspace(1) %add.ptr184.24, align 4 + %add.ptr184.25 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.24, i64 196 + store float %i247, ptr addrspace(1) %add.ptr184.25, align 4 + %add.ptr184.26 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.25, i64 196 + store float %i251, ptr addrspace(1) %add.ptr184.26, align 4 + %add.ptr184.27 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.26, i64 196 + store float %i255, ptr addrspace(1) %add.ptr184.27, align 4 + %add.ptr184.28 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.27, i64 196 + store float %i259, ptr addrspace(1) %add.ptr184.28, align 4 + %add.ptr184.29 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.28, i64 196 + store float %i263, ptr addrspace(1) %add.ptr184.29, align 4 + %add.ptr184.30 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.29, i64 196 + store float %i267, ptr addrspace(1) %add.ptr184.30, align 4 + ret void +} + + + +declare float @llvm.fmuladd.f32(float, float, float) #2 +declare i32 @llvm.amdgcn.workitem.id.x() #3 +declare i32 @llvm.amdgcn.workgroup.id.x() #3 +declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #3 + +!0 = !{i32 1, i32 2, i32 1, i32 0} +!1 = !{!"none", !"none", !"none", !"none"} +!2 = !{!"ptr", !"ptr", !"ptr", !"float"} +!3 = !{!"restrict const", !"restrict const", !"restrict", !""} +!4 = !{i32 256, i32 1, i32 1} +!5 = !{i32 0, i32 1024} + +attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1" } +attributes #1 = { nounwind "amdgpu-num-vgpr"="64" } +attributes #2 = { nofree nosync nounwind readnone speculatable willreturn } +attributes #3 = { nounwind readnone speculatable willreturn } \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir index 14bb4310c619e..34d203e0de2ff 
100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir @@ -1,4 +1,5 @@ -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,1 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,2 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,2 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s --- | define amdgpu_kernel void @no_sched_metric_due_to_spills() #0 { @@ -11,6 +12,20 @@ # GCN-LABEL: name: no_sched_metric_due_to_spills # GCN-NOT: SI_SPILL_ # GCN: S_ENDPGM + +# GCN-GCNTRACKER-LABEL: name: no_sched_metric_due_to_spills +# GCN-GCNTRACKER: SI_SPILL_V32_SAVE +# GCN-GCNTRACKER: SI_SPILL_V32_SAVE +# GCN-GCNTRACKER: SI_SPILL_V32_SAVE +# GCN-GCNTRACKER: SI_SPILL_V32_SAVE +# GCN-GCNTRACKER: SI_SPILL_V32_SAVE +# GCN-GCNTRACKER: SI_SPILL_V32_SAVE +# GCN-GCNTRACKER: S_ENDPGM + +# When using the GCN Trackers, the scheduler is able to achieve the desired occupancy without running the high-RP-reschedule stage. However, the RP is still high, +# and RA is unable to allocate without spills. By running the high-RP-reschedule stage we would have further decreased RP, which provides increased +# flexibility for RA. + --- name: no_sched_metric_due_to_spills tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll b/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll index 94815558bf3d6..71f8d91874f04 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll @@ -1,16 +1,24 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefix=OCC %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs < %s | FileCheck --check-prefix=OCC-GCNTRACKER %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs -amdgpu-schedule-relaxed-occupancy=true < %s | FileCheck --check-prefix=RELAX %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs -amdgpu-schedule-relaxed-occupancy=true < %s | FileCheck --check-prefix=RELAX-GCNTRACKER %s ; Using -amgpu-schedule-relaxed-occupancy allows scheduler to produce better ILP by further relaxing occupancy target -; GCN-LABEL: {{^}}load_fma_store: +; CHECK-LABEL: {{^}}load_fma_store: ; OCC: NumVgprs: 32 +; OCC-GCNTRACKER: NumVgprs: 24 ; RELAX: NumVgprs: 64 +; RELAX-GCNTRACKER: NumVgprs: 60 ; OCC: NumVGPRsForWavesPerEU: 32 +; OCC-GCNTRACKER: NumVGPRsForWavesPerEU: 24 ; RELAX: NumVGPRsForWavesPerEU: 64 +; RELAX-GCNTRACKER: NumVGPRsForWavesPerEU: 60 ; OCC: Occupancy: 8 +; OCC-GCNTRACKER: Occupancy: 8 ; RELAX: Occupancy: 4 +; RELAX-GCNTRACKER: Occupancy: 4 define amdgpu_kernel void @load_fma_store(ptr addrspace(3) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) #1 { bb: From ec450b19004a653f3db3ad50e88fbf6529a9d841 Mon Sep 17 00:00:00 2001 From: Adam Siemieniuk Date: Wed, 9 Oct 2024 18:59:14 +0200 Subject: [PATCH 042/119] [mlir][xegpu] Allow out-of-bounds writes (#110811) Relaxes vector.transfer_write lowering to allow out-of-bound writes. This aligns lowering with the current hardware specification which does not update bytes in out-of-bound locations during block stores.
--- .../VectorToXeGPU/VectorToXeGPU.cpp | 11 +++---- .../transfer-write-to-xegpu.mlir | 29 ++++++++++++------- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp index e9acda657b3c2..215e1b1b87452 100644 --- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp +++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp @@ -218,18 +218,15 @@ struct TransferWriteLowering if (failed(transferPreconditions(rewriter, writeOp))) return failure(); - if (writeOp.hasOutOfBoundsDim()) - return rewriter.notifyMatchFailure(writeOp, - "Unsupported out-of-bounds write"); AffineMap map = writeOp.getPermutationMap(); if (!map.isMinorIdentity()) return rewriter.notifyMatchFailure(writeOp, "Expects identity map"); VectorType vecTy = writeOp.getVectorType(); - auto descType = - xegpu::TensorDescType::get(vecTy.getShape(), vecTy.getElementType(), - /*array_length=*/1, /*boundary_check=*/false, - xegpu::MemorySpace::Global); + auto descType = xegpu::TensorDescType::get( + vecTy.getShape(), vecTy.getElementType(), + /*array_length=*/1, /*boundary_check=*/writeOp.hasOutOfBoundsDim(), + xegpu::MemorySpace::Global); xegpu::CreateNdDescOp ndDesc = createNdDescriptor( rewriter, loc, descType, dyn_cast>(writeOp.getSource()), diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir index 361919c47b097..076760fe21dc8 100644 --- a/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir @@ -66,30 +66,37 @@ func.func @store_dynamic_source(%vec: vector<8x16xf32>, // ----- -func.func @no_store_transposed(%vec: vector<8x16xf32>, - %source: memref<32x64xf32>, %offset: index) { +func.func @store_out_of_bounds(%vec: vector<8x16xf32>, + %source: memref<7x64xf32>, %offset: index) { vector.transfer_write %vec, %source[%offset, %offset] - {permutation_map = affine_map<(d0, d1) -> (d1, d0)>, - in_bounds = [true, true]} - : vector<8x16xf32>, memref<32x64xf32> + {in_bounds = [false, true]} + : vector<8x16xf32>, memref<7x64xf32> return } -// CHECK-LABEL: @no_store_transposed( -// CHECK: vector.transfer_write +// CHECK-LABEL: @store_out_of_bounds( +// CHECK-SAME: %[[VEC:.+]]: vector<8x16xf32>, +// CHECK-SAME: %[[SRC:.+]]: memref<7x64xf32>, +// CHECK-SAME: %[[OFFSET:.+]]: index +// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc +// CHECK-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]]] +// CHECK-SAME: memref<7x64xf32> -> !xegpu.tensor_desc<8x16xf32, +// CHECK-SAME: boundary_check = true +// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> // ----- -func.func @no_store_out_of_bounds(%vec: vector<8x16xf32>, +func.func @no_store_transposed(%vec: vector<8x16xf32>, %source: memref<32x64xf32>, %offset: index) { vector.transfer_write %vec, %source[%offset, %offset] - {in_bounds = [false, true]} + {permutation_map = affine_map<(d0, d1) -> (d1, d0)>, + in_bounds = [true, true]} : vector<8x16xf32>, memref<32x64xf32> return } -// CHECK-LABEL: @no_store_out_of_bounds( -// CHECK: vector.transfer_write +// CHECK-LABEL: @no_store_transposed( +// CHECK: vector.transfer_write // ----- From 18d655fdcce4d17080e6cb2721f93f6db856277e Mon Sep 17 00:00:00 2001 From: Amara Emerson Date: Wed, 9 Oct 2024 10:12:07 -0700 Subject: [PATCH 043/119] [SimplifyCFG][NFC] Improve compile time for TryToSimplifyUncondBranchFromEmptyBlock optimization. 
(#110715) In some pathological cases this optimization can spend an unreasonable amount of time populating the set for predecessors of the successor block. This change sinks some of that initialization to the point where it's actually necessary so we can take advantage of the existing early-exits. rdar://137063034 --- llvm/lib/Transforms/Utils/Local.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index cfe40f91f9a5d..f3b8623ebb0f8 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -1023,7 +1023,6 @@ static void replaceUndefValuesInPhi(PHINode *PN, static bool CanRedirectPredsOfEmptyBBToSucc(BasicBlock *BB, BasicBlock *Succ, const SmallPtrSetImpl &BBPreds, - const SmallPtrSetImpl &SuccPreds, BasicBlock *&CommonPred) { // There must be phis in BB, otherwise BB will be merged into Succ directly @@ -1042,7 +1041,7 @@ CanRedirectPredsOfEmptyBBToSucc(BasicBlock *BB, BasicBlock *Succ, // Get the single common predecessor of both BB and Succ. Return false // when there are more than one common predecessors. - for (BasicBlock *SuccPred : SuccPreds) { + for (BasicBlock *SuccPred : predecessors(Succ)) { if (BBPreds.count(SuccPred)) { if (CommonPred) return false; @@ -1166,7 +1165,6 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB, return false; SmallPtrSet BBPreds(pred_begin(BB), pred_end(BB)); - SmallPtrSet SuccPreds(pred_begin(Succ), pred_end(Succ)); // The single common predecessor of BB and Succ when BB cannot be killed BasicBlock *CommonPred = nullptr; @@ -1175,9 +1173,8 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB, // Even if we can not fold BB into Succ, we may be able to redirect the // predecessors of BB to Succ. - bool BBPhisMergeable = - BBKillable || - CanRedirectPredsOfEmptyBBToSucc(BB, Succ, BBPreds, SuccPreds, CommonPred); + bool BBPhisMergeable = BBKillable || CanRedirectPredsOfEmptyBBToSucc( + BB, Succ, BBPreds, CommonPred); if ((!BBKillable && !BBPhisMergeable) || introduceTooManyPhiEntries(BB, Succ)) return false; @@ -1302,7 +1299,7 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB, // All predecessors of BB (except the common predecessor) will be moved to // Succ. Updates.reserve(Updates.size() + 2 * pred_size(BB) + 1); - + SmallPtrSet SuccPreds(pred_begin(Succ), pred_end(Succ)); for (auto *PredOfBB : predecessors(BB)) { // Do not modify those common predecessors of BB and Succ if (!SuccPreds.contains(PredOfBB)) From 13cd43aa6fa1dc5bfb96119db43b8c549386a86e Mon Sep 17 00:00:00 2001 From: Saiyedul Islam Date: Wed, 9 Oct 2024 22:51:02 +0530 Subject: [PATCH 044/119] [Clang][OpenMP] Do not use feature option during packaging (#111702) Clang-offload-packager allows packaging of images based on an arbitrary list of key-value pairs where only triple-key is mandatory. Using target features as a key during packaging is not correct, as clang does not allow packaging multiple images in one binary which only differ in a target feature. TargetID features (xnack and sramecc) are handled using arch-key anyway and not as target features.
--- clang/lib/Driver/ToolChains/Clang.cpp | 9 +-------- clang/test/Driver/amdgpu-openmp-toolchain.c | 2 +- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 66ec0a7fd32f9..5b09f97c40b48 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -9107,13 +9107,6 @@ void OffloadPackager::ConstructJob(Compilation &C, const JobAction &JA, llvm::copy_if(Features, std::back_inserter(FeatureArgs), [](StringRef Arg) { return !Arg.starts_with("-target"); }); - if (TC->getTriple().isAMDGPU()) { - for (StringRef Feature : llvm::split(Arch.split(':').second, ':')) { - FeatureArgs.emplace_back( - Args.MakeArgString(Feature.take_back() + Feature.drop_back())); - } - } - // TODO: We need to pass in the full target-id and handle it properly in the // linker wrapper. SmallVector Parts{ @@ -9123,7 +9116,7 @@ void OffloadPackager::ConstructJob(Compilation &C, const JobAction &JA, "kind=" + Kind.str(), }; - if (TC->getDriver().isUsingOffloadLTO() || TC->getTriple().isAMDGPU()) + if (TC->getDriver().isUsingOffloadLTO()) for (StringRef Feature : FeatureArgs) Parts.emplace_back("feature=" + Feature.str()); diff --git a/clang/test/Driver/amdgpu-openmp-toolchain.c b/clang/test/Driver/amdgpu-openmp-toolchain.c index 184819b790c4f..f596708047c15 100644 --- a/clang/test/Driver/amdgpu-openmp-toolchain.c +++ b/clang/test/Driver/amdgpu-openmp-toolchain.c @@ -64,7 +64,7 @@ // RUN: %clang -### -target x86_64-pc-linux-gnu -fopenmp --offload-arch=gfx90a:sramecc-:xnack+ \ // RUN: -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-TARGET-ID // CHECK-TARGET-ID: "-cc1" "-triple" "amdgcn-amd-amdhsa" {{.*}} "-target-cpu" "gfx90a" "-target-feature" "-sramecc" "-target-feature" "+xnack" -// CHECK-TARGET-ID: clang-offload-packager{{.*}}arch=gfx90a:sramecc-:xnack+,kind=openmp,feature=-sramecc,feature=+xnack +// CHECK-TARGET-ID: clang-offload-packager{{.*}}arch=gfx90a:sramecc-:xnack+,kind=openmp // RUN: not %clang -### -target x86_64-pc-linux-gnu -fopenmp --offload-arch=gfx90a,gfx90a:xnack+ \ // RUN: -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-TARGET-ID-ERROR From 3a08551a0337e999d5d8ca0b0e591d1a2b934865 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Wed, 9 Oct 2024 10:24:06 -0700 Subject: [PATCH 045/119] [AMDGPU] Fix expensive check Change-Id: I0b26d5db6d3da8936ab25ee2b1e9002840b9853e --- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 11c95675aeeaf..57f517bfba0eb 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -235,7 +235,7 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, #ifdef EXPENSIVE_CHECKS std::vector CheckPressure, CheckMaxPressure; getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure, - TheTracker, UpwardTracker, DAG, SRI); + DownwardTracker, UpwardTracker, DAG, SRI); if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] != CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] || Pressure[AMDGPU::RegisterPressureSets::VGPR_32] != From 4e33afee5a167d5293edeef15e414c0dbbcf3cef Mon Sep 17 00:00:00 2001 From: Shourya Goel Date: Wed, 9 Oct 2024 22:55:49 +0530 Subject: [PATCH 046/119] [libc][math] Implement `issignaling` and `iscanonical` macro. 
(#111403) #109201 --- libc/cmake/modules/LLVMLibCTestRules.cmake | 4 ++ .../llvm-libc-macros/math-function-macros.h | 17 +++++-- libc/test/include/CMakeLists.txt | 51 ++++++++----------- libc/test/include/IsSignalingTest.h | 49 ------------------ libc/test/include/iscanonical_test.c | 29 +++++++++++ libc/test/include/issignaling_test.c | 13 +++-- libc/test/include/issignaling_test.cpp | 18 ------- libc/test/include/issignalingf_test.cpp | 18 ------- libc/test/include/issignalingl_test.cpp | 18 ------- 9 files changed, 76 insertions(+), 141 deletions(-) delete mode 100644 libc/test/include/IsSignalingTest.h create mode 100644 libc/test/include/iscanonical_test.c delete mode 100644 libc/test/include/issignaling_test.cpp delete mode 100644 libc/test/include/issignalingf_test.cpp delete mode 100644 libc/test/include/issignalingl_test.cpp diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index 1f6ccb27f35f0..35cc6fe46be53 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -68,6 +68,10 @@ function(get_object_files_for_test result skipped_entrypoints_list) endif() get_target_property(object_file_raw ${dep} "OBJECT_FILE_RAW") if(object_file_raw) + # TODO: Remove this once we stop suffixing the target with ".__internal__" + if(fq_target_name STREQUAL "libc.test.include.issignaling_c_test.__unit__" OR fq_target_name STREQUAL "libc.test.include.iscanonical_c_test.__unit__") + string(REPLACE ".__internal__" "" object_file_raw ${object_file_raw}) + endif() list(APPEND dep_obj ${object_file_raw}) endif() elseif(${dep_type} STREQUAL ${ENTRYPOINT_OBJ_VENDOR_TARGET_TYPE}) diff --git a/libc/include/llvm-libc-macros/math-function-macros.h b/libc/include/llvm-libc-macros/math-function-macros.h index c740eb2d18825..21d09f1f5e1a4 100644 --- a/libc/include/llvm-libc-macros/math-function-macros.h +++ b/libc/include/llvm-libc-macros/math-function-macros.h @@ -11,6 +11,19 @@ #include "math-macros.h" +#ifndef __cplusplus +#define issignaling(x) \ + _Generic((x), \ + float: issignalingf, \ + double: issignaling, \ + long double: issignalingl)(x) +#define iscanonical(x) \ + _Generic((x), \ + float: iscanonicalf, \ + double: iscanonical, \ + long double: iscanonicall)(x) +#endif + #define isfinite(x) __builtin_isfinite(x) #define isinf(x) __builtin_isinf(x) #define isnan(x) __builtin_isnan(x) @@ -20,9 +33,5 @@ __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL, FP_ZERO, x) #define isnormal(x) __builtin_isnormal(x) #define issubnormal(x) (fpclassify(x) == FP_SUBNORMAL) -#if (defined(__clang__) && __clang_major__ >= 18) || \ - (defined(__GNUC__) && __GNUC__ >= 13) -#define issignaling(x) __builtin_issignaling(x) -#endif #endif // LLVM_LIBC_MACROS_MATH_FUNCTION_MACROS_H diff --git a/libc/test/include/CMakeLists.txt b/libc/test/include/CMakeLists.txt index dd8f21bdd07ae..9218516062879 100644 --- a/libc/test/include/CMakeLists.txt +++ b/libc/test/include/CMakeLists.txt @@ -81,36 +81,6 @@ add_libc_test( libc.include.llvm-libc-macros.stdckdint_macros ) -add_libc_test( - issignaling_test - SUITE - libc_include_tests - SRCS - issignaling_test.cpp - DEPENDS - libc.include.llvm-libc-macros.math_function_macros -) - -add_libc_test( - issignalingf_test - SUITE - libc_include_tests - SRCS - issignalingf_test.cpp - DEPENDS - libc.include.llvm-libc-macros.math_function_macros -) - -add_libc_test( - issignalingl_test - SUITE - libc_include_tests - SRCS - issignalingl_test.cpp - DEPENDS - 
libc.include.llvm-libc-macros.math_function_macros -) - add_libc_test( issubnormal_test SUITE @@ -409,6 +379,27 @@ add_libc_test( -Werror DEPENDS libc.include.llvm-libc-macros.math_function_macros + libc.src.math.issignaling + libc.src.math.issignalingf + libc.src.math.issignalingl +) + +add_libc_test( + iscanonical_c_test + C_TEST + UNIT_TEST_ONLY + SUITE + libc_include_tests + SRCS + iscanonical_test.c + COMPILE_OPTIONS + -Wall + -Werror + DEPENDS + libc.include.llvm-libc-macros.math_function_macros + libc.src.math.iscanonical + libc.src.math.iscanonicalf + libc.src.math.iscanonicall ) add_libc_test( diff --git a/libc/test/include/IsSignalingTest.h b/libc/test/include/IsSignalingTest.h deleted file mode 100644 index c369cfe090ed3..0000000000000 --- a/libc/test/include/IsSignalingTest.h +++ /dev/null @@ -1,49 +0,0 @@ -//===-- Utility class to test the issignaling macro ------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_TEST_INCLUDE_MATH_ISSIGNALING_H -#define LLVM_LIBC_TEST_INCLUDE_MATH_ISSIGNALING_H - -#include "test/UnitTest/FPMatcher.h" -#include "test/UnitTest/Test.h" - -#include "include/llvm-libc-macros/math-function-macros.h" - -template -class IsSignalingTest : public LIBC_NAMESPACE::testing::Test { - DECLARE_SPECIAL_CONSTANTS(T) - -public: - typedef int (*IsSignalingFunc)(T); - - void testSpecialNumbers(IsSignalingFunc func) { - EXPECT_EQ(func(aNaN), 0); - EXPECT_EQ(func(neg_aNaN), 0); - EXPECT_EQ(func(sNaN), 1); - EXPECT_EQ(func(neg_sNaN), 1); - EXPECT_EQ(func(inf), 0); - EXPECT_EQ(func(neg_inf), 0); - EXPECT_EQ(func(min_normal), 0); - EXPECT_EQ(func(max_normal), 0); - EXPECT_EQ(func(neg_max_normal), 0); - EXPECT_EQ(func(min_denormal), 0); - EXPECT_EQ(func(neg_min_denormal), 0); - EXPECT_EQ(func(max_denormal), 0); - EXPECT_EQ(func(zero), 0); - EXPECT_EQ(func(neg_zero), 0); - } -}; - -#define LIST_ISSIGNALING_TESTS(T, func) \ - using LlvmLibcIsSignalingTest = IsSignalingTest; \ - TEST_F(LlvmLibcIsSignalingTest, SpecialNumbers) { \ - auto issignaling_func = [](T x) { return func(x); }; \ - testSpecialNumbers(issignaling_func); \ - } - -#endif // LLVM_LIBC_TEST_INCLUDE_MATH_ISSIGNALING_H diff --git a/libc/test/include/iscanonical_test.c b/libc/test/include/iscanonical_test.c new file mode 100644 index 0000000000000..c0ad23b21826d --- /dev/null +++ b/libc/test/include/iscanonical_test.c @@ -0,0 +1,29 @@ +//===-- Unittests for iscanonical macro -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDSList-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +int iscanonical(double); +int iscanonicalf(float); +int iscanonicall(long double); + +#include "include/llvm-libc-macros/math-function-macros.h" + +#include + +// check if macro is defined +#ifndef iscanonical +#error "iscanonical macro is not defined" +#else +int main(void) { + assert(iscanonical(__builtin_nans("")) == 0); + assert(iscanonical(__builtin_nansf("")) == 0); + assert(iscanonical(__builtin_nansl("")) == 0); + assert(iscanonical(1.819f) == 1); + assert(iscanonical(-1.726) == 1); + assert(iscanonical(1.426L) == 1); + return 0; +} +#endif diff --git a/libc/test/include/issignaling_test.c b/libc/test/include/issignaling_test.c index 2c080696404ae..c89970c225469 100644 --- a/libc/test/include/issignaling_test.c +++ b/libc/test/include/issignaling_test.c @@ -5,20 +5,25 @@ // SPDSList-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +int issignaling(double); +int issignalingf(float); +int issignalingl(long double); + #include "include/llvm-libc-macros/math-function-macros.h" #include -// TODO: enable the test unconditionally when issignaling macro is fixed for -// older compiler +// check if macro is defined +#ifndef issignaling +#error "issignaling macro is not defined" +#else int main(void) { -#ifdef issignaling assert(issignaling(__builtin_nans("")) == 1); assert(issignaling(__builtin_nansf("")) == 1); assert(issignaling(__builtin_nansl("")) == 1); assert(issignaling(1.819f) == 0); assert(issignaling(-1.726) == 0); assert(issignaling(1.426L) == 0); -#endif return 0; } +#endif diff --git a/libc/test/include/issignaling_test.cpp b/libc/test/include/issignaling_test.cpp deleted file mode 100644 index 3d25ea394c835..0000000000000 --- a/libc/test/include/issignaling_test.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//===-- Unittest for issignaling[d] macro ---------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDSList-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "IsSignalingTest.h" -#include "include/llvm-libc-macros/math-function-macros.h" - -// TODO: enable the test unconditionally when issignaling macro is fixed for -// older compiler -#ifdef issignaling -LIST_ISSIGNALING_TESTS(double, issignaling) -#else -TEST(LlvmLibcIsSignalingTest, Skip) {} -#endif diff --git a/libc/test/include/issignalingf_test.cpp b/libc/test/include/issignalingf_test.cpp deleted file mode 100644 index 02426ceb24ac8..0000000000000 --- a/libc/test/include/issignalingf_test.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//===-- Unittest for issignaling[f] macro ---------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDSList-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "IsSignalingTest.h" -#include "include/llvm-libc-macros/math-function-macros.h" - -// TODO: enable the test unconditionally when issignaling macro is fixed for -// older compiler -#ifdef issignaling -LIST_ISSIGNALING_TESTS(float, issignaling) -#else -TEST(LlvmLibcIsSignalingTest, Skip) {} -#endif diff --git a/libc/test/include/issignalingl_test.cpp b/libc/test/include/issignalingl_test.cpp deleted file mode 100644 index 9897647fb1077..0000000000000 --- a/libc/test/include/issignalingl_test.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//===-- Unittest for issignaling[l] macro ---------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDSList-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "IsSignalingTest.h" -#include "include/llvm-libc-macros/math-function-macros.h" - -// TODO: enable the test unconditionally when issignaling macro is fixed for -// older compiler -#ifdef issignaling -LIST_ISSIGNALING_TESTS(long double, issignaling) -#else -TEST(LlvmLibcIsSignalingTest, Skip) {} -#endif From ee0e17a4d8b42278ded1217e415073e8bce88b2a Mon Sep 17 00:00:00 2001 From: vporpo Date: Wed, 9 Oct 2024 10:29:48 -0700 Subject: [PATCH 047/119] [SandboxVec][DAG] Drop RAR and fix dependency scanning loop (#111715) --- .../Vectorize/SandboxVectorizer/DependencyGraph.h | 1 - .../SandboxVectorizer/DependencyGraph.cpp | 14 ++++++-------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h index b1fe67d446be0..134adc4b21ab1 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h @@ -173,7 +173,6 @@ class DependencyGraph { enum class DependencyType { ReadAfterWrite, ///> Memory dependency write -> read WriteAfterWrite, ///> Memory dependency write -> write - ReadAfterRead, ///> Memory dependency read -> read WriteAfterRead, ///> Memory dependency read -> write Control, ///> Control-related dependency, like with PHI/Terminator Other, ///> Currently used for stack related instrs diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp index b88b0e89b9e79..82f253d4c6323 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp @@ -62,8 +62,6 @@ DependencyGraph::getRoughDepType(Instruction *FromI, Instruction *ToI) { } else if (FromI->mayReadFromMemory()) { if (ToI->mayWriteToMemory()) return DependencyType::WriteAfterRead; - if (ToI->mayReadFromMemory()) - return DependencyType::ReadAfterRead; } if (isa(FromI) || isa(ToI)) return DependencyType::Control; @@ -103,7 +101,7 @@ bool DependencyGraph::alias(Instruction *SrcI, Instruction *DstI, // TODO: Check AABudget ModRefInfo SrcModRef = isOrdered(SrcI) - ? ModRefInfo::Mod + ? 
ModRefInfo::ModRef : Utils::aliasAnalysisGetModRefInfo(*BatchAA, SrcI, *DstLocOpt); switch (DepType) { case DependencyType::ReadAfterWrite: @@ -119,8 +117,6 @@ bool DependencyGraph::alias(Instruction *SrcI, Instruction *DstI, bool DependencyGraph::hasDep(Instruction *SrcI, Instruction *DstI) { DependencyType RoughDepType = getRoughDepType(SrcI, DstI); switch (RoughDepType) { - case DependencyType::ReadAfterRead: - return false; case DependencyType::ReadAfterWrite: case DependencyType::WriteAfterWrite: case DependencyType::WriteAfterRead: @@ -175,9 +171,11 @@ Interval DependencyGraph::extend(ArrayRef Instrs) { } // Create the dependencies. auto DstRange = MemDGNodeIntervalBuilder::make(InstrInterval, *this); - for (MemDGNode &DstN : drop_begin(DstRange)) { - auto SrcRange = Interval(DstRange.top(), DstN.getPrevNode()); - scanAndAddDeps(DstN, SrcRange); + if (!DstRange.empty()) { + for (MemDGNode &DstN : drop_begin(DstRange)) { + auto SrcRange = Interval(DstRange.top(), DstN.getPrevNode()); + scanAndAddDeps(DstN, SrcRange); + } } return InstrInterval; From 10ada4ae738b9d93174e516ca841e61a8f4fd612 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Wed, 9 Oct 2024 10:37:05 -0700 Subject: [PATCH 048/119] [SandboxVectorizer] Use sbvec-passes flag to create a pipeline of Region passes after BottomUpVec. (#111223) The main change is that the main SandboxVectorizer pass no longer has a pipeline of function passes. Now it is a wrapper that creates sandbox IR from functions before calling BottomUpVec. BottomUpVec now builds its own RegionPassManager from the `sbvec-passes` flag, using a PassRegistry.def file. For now, these region passes are not run (BottomUpVec doesn't create Regions yet), and only a null pass for testing exists. This commit also changes the ownership model for sandboxir::PassManager: instead of having a PassRegistry that owns passes, and PassManagers that contain non-owning pointers to the passes, now PassManager owns (via unique pointers) the passes it contains. PassRegistry is now deleted, and the logic to parse and create a pass pipeline is now in PassManager::setPassPipeline. --- llvm/include/llvm/SandboxIR/PassManager.h | 82 ++++++++------ .../SandboxVectorizer/Passes/BottomUpVec.h | 8 +- .../SandboxVectorizer/Passes/NullPass.h | 19 ++++ .../SandboxVectorizer/SandboxVectorizer.h | 10 +- llvm/lib/SandboxIR/PassManager.cpp | 44 +------- .../SandboxVectorizer/Passes/BottomUpVec.cpp | 46 +++++++- .../SandboxVectorizer/Passes/PassRegistry.def | 22 ++++ .../SandboxVectorizer/SandboxVectorizer.cpp | 41 +------ .../default_pass_pipeline.ll | 3 +- .../SandboxVectorizer/user_pass_pipeline.ll | 8 +- llvm/unittests/SandboxIR/PassTest.cpp | 104 +++++++++--------- 11 files changed, 204 insertions(+), 183 deletions(-) create mode 100644 llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/NullPass.h create mode 100644 llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/PassRegistry.def diff --git a/llvm/include/llvm/SandboxIR/PassManager.h b/llvm/include/llvm/SandboxIR/PassManager.h index 54192c6bf1333..247c43615f576 100644 --- a/llvm/include/llvm/SandboxIR/PassManager.h +++ b/llvm/include/llvm/SandboxIR/PassManager.h @@ -18,6 +18,8 @@ #ifndef LLVM_SANDBOXIR_PASSMANAGER_H #define LLVM_SANDBOXIR_PASSMANAGER_H +#include + #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/SandboxIR/Pass.h" @@ -32,25 +34,65 @@ template class PassManager : public ParentPass { protected: /// The list of passes that this pass manager will run. 
- SmallVector Passes; + SmallVector> Passes; PassManager(StringRef Name) : ParentPass(Name) {} PassManager(const PassManager &) = delete; + PassManager(PassManager &&) = default; virtual ~PassManager() = default; PassManager &operator=(const PassManager &) = delete; public: /// Adds \p Pass to the pass pipeline. - void addPass(ContainedPass *Pass) { + void addPass(std::unique_ptr Pass) { // TODO: Check that Pass's class type works with this PassManager type. - Passes.push_back(Pass); + Passes.push_back(std::move(Pass)); + } + + using CreatePassFunc = + std::function(StringRef)>; + + /// Parses \p Pipeline as a comma-separated sequence of pass names and sets + /// the pass pipeline, using \p CreatePass to instantiate passes by name. + /// + /// After calling this function, the PassManager contains only the specified + /// pipeline, any previously added passes are cleared. + void setPassPipeline(StringRef Pipeline, CreatePassFunc CreatePass) { + static constexpr const char EndToken = '\0'; + static constexpr const char PassDelimToken = ','; + + assert(Passes.empty() && + "setPassPipeline called on a non-empty sandboxir::PassManager"); + // Add EndToken to the end to ease parsing. + std::string PipelineStr = std::string(Pipeline) + EndToken; + int FlagBeginIdx = 0; + + for (auto [Idx, C] : enumerate(PipelineStr)) { + // Keep moving Idx until we find the end of the pass name. + bool FoundDelim = C == EndToken || C == PassDelimToken; + if (!FoundDelim) + continue; + unsigned Sz = Idx - FlagBeginIdx; + std::string PassName(&PipelineStr[FlagBeginIdx], Sz); + FlagBeginIdx = Idx + 1; + + // Get the pass that corresponds to PassName and add it to the pass + // manager. + auto Pass = CreatePass(PassName); + if (Pass == nullptr) { + errs() << "Pass '" << PassName << "' not registered!\n"; + exit(1); + } + addPass(std::move(Pass)); + } } + #ifndef NDEBUG void print(raw_ostream &OS) const override { OS << this->getName(); OS << "("; // TODO: This should call Pass->print(OS) because Pass may be a PM. - interleave(Passes, OS, [&OS](auto *Pass) { OS << Pass->getName(); }, ","); + interleave(Passes, OS, [&OS](auto &Pass) { OS << Pass->getName(); }, ","); OS << ")"; } LLVM_DUMP_METHOD void dump() const override { @@ -79,38 +121,6 @@ class RegionPassManager final : public PassManager { bool runOnRegion(Region &R) final; }; -/// Owns the passes and provides an API to get a pass by its name. -class PassRegistry { - SmallVector, 8> Passes; - DenseMap NameToPassMap; - -public: - static constexpr const char PassDelimToken = ','; - PassRegistry() = default; - /// Registers \p PassPtr and takes ownership. - Pass ®isterPass(std::unique_ptr &&PassPtr) { - auto &PassRef = *PassPtr.get(); - NameToPassMap[PassRef.getName()] = &PassRef; - Passes.push_back(std::move(PassPtr)); - return PassRef; - } - /// \Returns the pass with name \p Name, or null if not registered. - Pass *getPassByName(StringRef Name) const { - auto It = NameToPassMap.find(Name); - return It != NameToPassMap.end() ? It->second : nullptr; - } - /// Creates a pass pipeline and returns the first pass manager. 
- FunctionPassManager &parseAndCreatePassPipeline(StringRef Pipeline); - -#ifndef NDEBUG - void print(raw_ostream &OS) const { - for (const auto &PassPtr : Passes) - OS << PassPtr->getName() << "\n"; - } - LLVM_DUMP_METHOD void dump() const; -#endif -}; - } // namespace llvm::sandboxir #endif // LLVM_SANDBOXIR_PASSMANAGER_H diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h index a2108f07c28e5..02abdf0a1ef0d 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h @@ -15,18 +15,24 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/SandboxIR/Constant.h" #include "llvm/SandboxIR/Pass.h" +#include "llvm/SandboxIR/PassManager.h" #include "llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h" namespace llvm::sandboxir { +class RegionPassManager; + class BottomUpVec final : public FunctionPass { bool Change = false; LegalityAnalysis Legality; void vectorizeRec(ArrayRef Bndl); void tryVectorize(ArrayRef Seeds); + // The PM containing the pipeline of region passes. + RegionPassManager RPM; + public: - BottomUpVec() : FunctionPass("bottom-up-vec") {} + BottomUpVec(); bool runOnFunction(Function &F) final; }; diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/NullPass.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/NullPass.h new file mode 100644 index 0000000000000..75b9f42520156 --- /dev/null +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/NullPass.h @@ -0,0 +1,19 @@ +#ifndef LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_NULLPASS_H +#define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_NULLPASS_H + +#include "llvm/SandboxIR/Pass.h" + +namespace llvm::sandboxir { + +class Region; + +/// A Region pass that does nothing, for use as a placeholder in tests. +class NullPass final : public RegionPass { +public: + NullPass() : RegionPass("null") {} + bool runOnRegion(Region &R) final { return false; } +}; + +} // namespace llvm::sandboxir + +#endif // LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_NULLPASS_H diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h index dd9f02d327264..1dcd976bf751c 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h @@ -8,7 +8,10 @@ #ifndef LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_SANDBOXVECTORIZER_H #define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_SANDBOXVECTORIZER_H +#include + #include "llvm/IR/PassManager.h" +#include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h" namespace llvm { @@ -17,10 +20,13 @@ class TargetTransformInfo; class SandboxVectorizerPass : public PassInfoMixin { TargetTransformInfo *TTI = nullptr; -public: - PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + // The main vectorizer pass. 
+ sandboxir::BottomUpVec BottomUpVecPass; bool runImpl(Function &F); + +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; } // namespace llvm diff --git a/llvm/lib/SandboxIR/PassManager.cpp b/llvm/lib/SandboxIR/PassManager.cpp index 95bc5e56bb3ec..3a1cfa1d367a2 100644 --- a/llvm/lib/SandboxIR/PassManager.cpp +++ b/llvm/lib/SandboxIR/PassManager.cpp @@ -8,11 +8,11 @@ #include "llvm/SandboxIR/PassManager.h" -using namespace llvm::sandboxir; +namespace llvm::sandboxir { bool FunctionPassManager::runOnFunction(Function &F) { bool Change = false; - for (FunctionPass *Pass : Passes) { + for (auto &Pass : Passes) { Change |= Pass->runOnFunction(F); // TODO: run the verifier. } @@ -22,7 +22,7 @@ bool FunctionPassManager::runOnFunction(Function &F) { bool RegionPassManager::runOnRegion(Region &R) { bool Change = false; - for (RegionPass *Pass : Passes) { + for (auto &Pass : Passes) { Change |= Pass->runOnRegion(R); // TODO: run the verifier. } @@ -30,40 +30,4 @@ bool RegionPassManager::runOnRegion(Region &R) { return Change; } -FunctionPassManager & -PassRegistry::parseAndCreatePassPipeline(StringRef Pipeline) { - static constexpr const char EndToken = '\0'; - // Add EndToken to the end to ease parsing. - std::string PipelineStr = std::string(Pipeline) + EndToken; - int FlagBeginIdx = 0; - // Start with a FunctionPassManager. - auto &InitialPM = static_cast( - registerPass(std::make_unique("init-fpm"))); - - for (auto [Idx, C] : enumerate(PipelineStr)) { - // Keep moving Idx until we find the end of the pass name. - bool FoundDelim = C == EndToken || C == PassDelimToken; - if (!FoundDelim) - continue; - unsigned Sz = Idx - FlagBeginIdx; - std::string PassName(&PipelineStr[FlagBeginIdx], Sz); - FlagBeginIdx = Idx + 1; - - // Get the pass that corresponds to PassName and add it to the pass manager. - auto *Pass = getPassByName(PassName); - if (Pass == nullptr) { - errs() << "Pass '" << PassName << "' not registered!\n"; - exit(1); - } - // TODO: This is safe for now, but would require proper upcasting once we - // add more Pass sub-classes. - InitialPM.addPass(static_cast(Pass)); - } - return InitialPM; -} -#ifndef NDEBUG -void PassRegistry::dump() const { - print(dbgs()); - dbgs() << "\n"; -} -#endif // NDEBUG +} // namespace llvm::sandboxir diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp index c59abd09d4362..77198f932a3ec 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp @@ -10,10 +10,41 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/SandboxIR/Function.h" #include "llvm/SandboxIR/Instruction.h" - -using namespace llvm::sandboxir; +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/NullPass.h" namespace llvm::sandboxir { + +static cl::opt + PrintPassPipeline("sbvec-print-pass-pipeline", cl::init(false), cl::Hidden, + cl::desc("Prints the pass pipeline and returns.")); + +/// A magic string for the default pass pipeline. +static const char *DefaultPipelineMagicStr = "*"; + +static cl::opt UserDefinedPassPipeline( + "sbvec-passes", cl::init(DefaultPipelineMagicStr), cl::Hidden, + cl::desc("Comma-separated list of vectorizer passes. 
If not set " + "we run the predefined pipeline.")); + +static std::unique_ptr createRegionPass(StringRef Name) { +#define REGION_PASS(NAME, CREATE_PASS) \ + if (Name == NAME) \ + return std::make_unique(CREATE_PASS); +#include "PassRegistry.def" + return nullptr; +} + +BottomUpVec::BottomUpVec() : FunctionPass("bottom-up-vec"), RPM("rpm") { + // Create a pipeline to be run on each Region created by BottomUpVec. + if (UserDefinedPassPipeline == DefaultPipelineMagicStr) { + // TODO: Add default passes to RPM. + } else { + // Create the user-defined pipeline. + RPM.setPassPipeline(UserDefinedPassPipeline, createRegionPass); + } +} + // TODO: This is a temporary function that returns some seeds. // Replace this with SeedCollector's function when it lands. static llvm::SmallVector collectSeeds(BasicBlock &BB) { @@ -34,8 +65,6 @@ static SmallVector getOperand(ArrayRef Bndl, return Operands; } -} // namespace llvm::sandboxir - void BottomUpVec::vectorizeRec(ArrayRef Bndl) { auto LegalityRes = Legality.canVectorize(Bndl); switch (LegalityRes.getSubclassID()) { @@ -53,14 +82,23 @@ void BottomUpVec::vectorizeRec(ArrayRef Bndl) { void BottomUpVec::tryVectorize(ArrayRef Bndl) { vectorizeRec(Bndl); } bool BottomUpVec::runOnFunction(Function &F) { + if (PrintPassPipeline) { + RPM.printPipeline(outs()); + return false; + } + Change = false; // TODO: Start from innermost BBs first for (auto &BB : F) { // TODO: Replace with proper SeedCollector function. auto Seeds = collectSeeds(BB); // TODO: Slice Seeds into smaller chunks. + // TODO: If vectorization succeeds, run the RegionPassManager on the + // resulting region. if (Seeds.size() >= 2) tryVectorize(Seeds); } return Change; } + +} // namespace llvm::sandboxir diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/PassRegistry.def b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/PassRegistry.def new file mode 100644 index 0000000000000..bbb0dcba1ea51 --- /dev/null +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/PassRegistry.def @@ -0,0 +1,22 @@ +//===- PassRegistry.def - Registry of passes --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is used as the registry of sub-passes that are part of the +// SandboxVectorizer pass. +// +//===----------------------------------------------------------------------===// + +// NOTE: NO INCLUDE GUARD DESIRED! 
+ +#ifndef REGION_PASS +#define REGION_PASS(NAME, CREATE_PASS) +#endif + +REGION_PASS("null", NullPass()) + +#undef REGION_PASS diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp index 80afcb499a2c2..cbaf2b6288d92 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp @@ -9,8 +9,6 @@ #include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/SandboxIR/Constant.h" -#include "llvm/SandboxIR/PassManager.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h" using namespace llvm; @@ -18,18 +16,6 @@ using namespace llvm; #define SV_NAME "sandbox-vectorizer" #define DEBUG_TYPE SV_NAME -cl::opt - PrintPassPipeline("sbvec-print-pass-pipeline", cl::init(false), cl::Hidden, - cl::desc("Prints the pass pipeline and returns.")); - -/// A magic string for the default pass pipeline. -const char *DefaultPipelineMagicStr = "*"; - -cl::opt UserDefinedPassPipeline( - "sbvec-passes", cl::init(DefaultPipelineMagicStr), cl::Hidden, - cl::desc("Comma-separated list of vectorizer passes. If not set " - "we run the predefined pipeline.")); - PreservedAnalyses SandboxVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) { TTI = &AM.getResult(F); @@ -56,31 +42,8 @@ bool SandboxVectorizerPass::runImpl(Function &LLVMF) { return false; } + // Create SandboxIR for LLVMF and run BottomUpVec on it. sandboxir::Context Ctx(LLVMF.getContext()); - // Create SandboxIR for `LLVMF`. sandboxir::Function &F = *Ctx.createFunction(&LLVMF); - // Create the passes and register them with the PassRegistry. - sandboxir::PassRegistry PR; - auto &BottomUpVecPass = static_cast( - PR.registerPass(std::make_unique())); - - sandboxir::FunctionPassManager *PM = nullptr; - if (UserDefinedPassPipeline == DefaultPipelineMagicStr) { - // Create the default pass pipeline. - PM = &static_cast(PR.registerPass( - std::make_unique("pm"))); - PM->addPass(&BottomUpVecPass); - } else { - // Create the user-defined pipeline. - PM = &PR.parseAndCreatePassPipeline(UserDefinedPassPipeline); - } - - if (PrintPassPipeline) { - PM->printPipeline(outs()); - return false; - } - - // Run the pass pipeline. - bool Change = PM->runOnFunction(F); - return Change; + return BottomUpVecPass.runOnFunction(F); } diff --git a/llvm/test/Transforms/SandboxVectorizer/default_pass_pipeline.ll b/llvm/test/Transforms/SandboxVectorizer/default_pass_pipeline.ll index 5ccd64d9f487a..86bfbee636478 100644 --- a/llvm/test/Transforms/SandboxVectorizer/default_pass_pipeline.ll +++ b/llvm/test/Transforms/SandboxVectorizer/default_pass_pipeline.ll @@ -4,8 +4,7 @@ ; This checks the default pass pipeline for the sandbox vectorizer. 
define void @pipeline() { -; CHECK: pm -; CHECK: bottom-up-vec +; CHECK: rpm ; CHECK-EMPTY: ret void } diff --git a/llvm/test/Transforms/SandboxVectorizer/user_pass_pipeline.ll b/llvm/test/Transforms/SandboxVectorizer/user_pass_pipeline.ll index 2879fbba1b9c0..2e6dab0aa29c7 100644 --- a/llvm/test/Transforms/SandboxVectorizer/user_pass_pipeline.ll +++ b/llvm/test/Transforms/SandboxVectorizer/user_pass_pipeline.ll @@ -1,12 +1,12 @@ -; RUN: opt -passes=sandbox-vectorizer -sbvec-print-pass-pipeline -sbvec-passes=bottom-up-vec,bottom-up-vec %s -disable-output | FileCheck %s +; RUN: opt -passes=sandbox-vectorizer -sbvec-print-pass-pipeline -sbvec-passes=null,null %s -disable-output | FileCheck %s ; !!!WARNING!!! This won't get updated by update_test_checks.py ! ; This checks the user defined pass pipeline. define void @pipeline() { -; CHECK: pm -; CHECK: bottom-up-vec -; CHECK: bottom-up-vec +; CHECK: rpm +; CHECK: null +; CHECK: null ; CHECK-EMPTY: ret void } diff --git a/llvm/unittests/SandboxIR/PassTest.cpp b/llvm/unittests/SandboxIR/PassTest.cpp index b380ae9fd475a..ae7284ecf2deb 100644 --- a/llvm/unittests/SandboxIR/PassTest.cpp +++ b/llvm/unittests/SandboxIR/PassTest.cpp @@ -180,12 +180,10 @@ define void @foo() { }; unsigned BBCnt1 = 0; unsigned BBCnt2 = 0; - TestPass1 TPass1(BBCnt1); - TestPass2 TPass2(BBCnt2); FunctionPassManager FPM("test-fpm"); - FPM.addPass(&TPass1); - FPM.addPass(&TPass2); + FPM.addPass(std::make_unique(BBCnt1)); + FPM.addPass(std::make_unique(BBCnt2)); // Check runOnFunction(). FPM.runOnFunction(*F); EXPECT_EQ(BBCnt1, 1u); @@ -238,12 +236,10 @@ define i8 @foo(i8 %v0, i8 %v1) { }; unsigned InstCount1 = 0; unsigned InstCount2 = 0; - TestPass1 TPass1(InstCount1); - TestPass2 TPass2(InstCount2); RegionPassManager RPM("test-rpm"); - RPM.addPass(&TPass1); - RPM.addPass(&TPass2); + RPM.addPass(std::make_unique(InstCount1)); + RPM.addPass(std::make_unique(InstCount2)); // Check runOnRegion(). llvm::SmallVector> Regions = Region::createRegionsFromMD(*F); @@ -260,62 +256,60 @@ define i8 @foo(i8 %v0, i8 %v1) { #endif // NDEBUG } -TEST_F(PassTest, PassRegistry) { - class TestPass1 final : public FunctionPass { - public: - TestPass1() : FunctionPass("test-pass1") {} - bool runOnFunction(Function &F) final { return false; } - }; - class TestPass2 final : public FunctionPass { - public: - TestPass2() : FunctionPass("test-pass2") {} - bool runOnFunction(Function &F) final { return false; } - }; - - PassRegistry Registry; - auto &TP1 = Registry.registerPass(std::make_unique()); - auto &TP2 = Registry.registerPass(std::make_unique()); - - // Check getPassByName(). - EXPECT_EQ(Registry.getPassByName("test-pass1"), &TP1); - EXPECT_EQ(Registry.getPassByName("test-pass2"), &TP2); - -#ifndef NDEBUG - // Check print(). 
- std::string Buff; - llvm::raw_string_ostream SS(Buff); - Registry.print(SS); - EXPECT_EQ(Buff, "test-pass1\ntest-pass2\n"); -#endif // NDEBUG +TEST_F(PassTest, SetPassPipeline) { + auto *F = parseFunction(R"IR( +define void @f() { + ret void } +)IR", + "f"); + class FooPass final : public FunctionPass { + std::string &Str; -TEST_F(PassTest, ParsePassPipeline) { - class TestPass1 final : public FunctionPass { public: - TestPass1() : FunctionPass("test-pass1") {} - bool runOnFunction(Function &F) final { return false; } + FooPass(std::string &Str) : FunctionPass("foo-pass"), Str(Str) {} + bool runOnFunction(Function &F) final { + Str += "foo"; + return false; + } }; - class TestPass2 final : public FunctionPass { + class BarPass final : public FunctionPass { + std::string &Str; + public: - TestPass2() : FunctionPass("test-pass2") {} - bool runOnFunction(Function &F) final { return false; } + BarPass(std::string &Str) : FunctionPass("bar-pass"), Str(Str) {} + bool runOnFunction(Function &F) final { + Str += "bar"; + return false; + } }; - PassRegistry Registry; - Registry.registerPass(std::make_unique()); - Registry.registerPass(std::make_unique()); + std::string Str; + auto CreatePass = + [&Str](llvm::StringRef Name) -> std::unique_ptr { + if (Name == "foo") + return std::make_unique(Str); + if (Name == "bar") + return std::make_unique(Str); + return nullptr; + }; - [[maybe_unused]] auto &FPM = - Registry.parseAndCreatePassPipeline("test-pass1,test-pass2,test-pass1"); + FunctionPassManager FPM("test-fpm"); + FPM.setPassPipeline("foo,bar,foo", CreatePass); + FPM.runOnFunction(*F); + EXPECT_EQ(Str, "foobarfoo"); + + // A second call to setPassPipeline will trigger an assertion in debug mode. #ifndef NDEBUG - std::string Buff; - llvm::raw_string_ostream SS(Buff); - FPM.print(SS); - EXPECT_EQ(Buff, "init-fpm(test-pass1,test-pass2,test-pass1)"); -#endif // NDEBUG + EXPECT_DEATH(FPM.setPassPipeline("bar,bar,foo", CreatePass), + "setPassPipeline called on a non-empty sandboxir::PassManager"); +#endif - EXPECT_DEATH(Registry.parseAndCreatePassPipeline("bad-pass-name"), + // Fresh PM for the death tests so they die from bad pipeline strings, rather + // than from multiple setPassPipeline calls. + FunctionPassManager FPM2("test-fpm"); + EXPECT_DEATH(FPM2.setPassPipeline("bad-pass-name", CreatePass), ".*not registered.*"); - EXPECT_DEATH(Registry.parseAndCreatePassPipeline(""), ".*not registered.*"); - EXPECT_DEATH(Registry.parseAndCreatePassPipeline(","), ".*not registered.*"); + EXPECT_DEATH(FPM2.setPassPipeline("", CreatePass), ".*not registered.*"); + EXPECT_DEATH(FPM2.setPassPipeline(",", CreatePass), ".*not registered.*"); } From a075e785b8f4e2323ce89b742185386314909f21 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 9 Oct 2024 21:38:47 +0400 Subject: [PATCH 049/119] AMDGPU: Fix incorrectly selecting fp8/bf8 conversion intrinsics (#107291) Trying to codegen these on targets without the instructions should fail to select. Not sure if all the predicates are correct. We had a fake one disconnected to a feature which was always true. 
Fixes: SWDEV-482274 --- llvm/lib/Target/AMDGPU/AMDGPU.td | 8 ++ llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 +- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 11 +- .../AMDGPU/llvm.amdgcn.cvt.f32.fp8.err.ll | 100 ++++++++++++++++++ 4 files changed, 117 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f32.fp8.err.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index dc94edf85586f..25117544d6a84 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -685,6 +685,13 @@ def FeatureFP8ConversionInsts : SubtargetFeature<"fp8-conversion-insts", "Has fp8 and bf8 conversion instructions" >; +def FeatureCvtFP8VOP1Bug : SubtargetFeature<"cvt-fp8-vop1-bug", + "HasCvtFP8Vop1Bug", + "true", + "FP8/BF8 VOP1 form of conversion to F32 is unreliable", + [FeatureFP8ConversionInsts] +>; + def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst", "HasPkFmacF16Inst", "true", @@ -1439,6 +1446,7 @@ def FeatureISAVersion9_4_Common : FeatureSet< FeatureMAIInsts, FeatureFP8Insts, FeatureFP8ConversionInsts, + FeatureCvtFP8VOP1Bug, FeaturePkFmacF16Inst, FeatureAtomicFaddRtnInsts, FeatureAtomicFaddNoRtnInsts, diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index e6b7342d5fffc..1945812609316 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -158,6 +158,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasMAIInsts = false; bool HasFP8Insts = false; bool HasFP8ConversionInsts = false; + bool HasCvtFP8Vop1Bug = false; bool HasPkFmacF16Inst = false; bool HasAtomicFMinFMaxF32GlobalInsts = false; bool HasAtomicFMinFMaxF64GlobalInsts = false; @@ -1352,7 +1353,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasSplitBarriers() const { return getGeneration() >= GFX12; } // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable. - bool hasCvtFP8VOP1Bug() const { return true; } + bool hasCvtFP8VOP1Bug() const { return HasCvtFP8Vop1Bug; } // \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a // no-return form. 
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 871a7c3c2579e..be98d201a64a7 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -582,7 +582,7 @@ class Cvt_F32_F8_Pat; -let SubtargetPredicate = isGFX9Only in { +let SubtargetPredicate = HasFP8ConversionInsts in { let OtherPredicates = [HasCvtFP8VOP1Bug] in { def : GCNPat<(f32 (int_amdgcn_cvt_f32_fp8 i32:$src, 0)), (V_CVT_F32_FP8_sdwa 0, $src, 0, 0, 0)>; @@ -590,18 +590,21 @@ let OtherPredicates = [HasCvtFP8VOP1Bug] in { (V_CVT_F32_BF8_sdwa 0, $src, 0, 0, 0)>; } -let OtherPredicates = [HasNoCvtFP8VOP1Bug] in { +let OtherPredicates = [HasNoCvtFP8VOP1Bug, HasSDWA] in { // FIXME: HasSDWA is a substitute for !gfx12 def : GCNPat<(f32 (int_amdgcn_cvt_f32_fp8 i32:$src, 0)), (V_CVT_F32_FP8_e32 $src)>; def : GCNPat<(f32 (int_amdgcn_cvt_f32_bf8 i32:$src, 0)), (V_CVT_F32_BF8_e32 $src)>; } +let OtherPredicates = [HasSDWA] in { foreach Index = [1, 2, 3] in { def : Cvt_F32_F8_Pat; def : Cvt_F32_F8_Pat; } -} // End SubtargetPredicate = isGFX9Only +} // End OtherPredicates = [HasSDWA] + +} // End SubtargetPredicate = HasFP8ConversionInsts class Cvt_PK_F32_F8_Pat : GCNPat< @@ -611,7 +614,7 @@ class Cvt_PK_F32_F8_Pat; -let SubtargetPredicate = isGFX9Only in { +let SubtargetPredicate = HasFP8ConversionInsts, OtherPredicates = [HasSDWA] in { foreach Index = [0, -1] in { def : Cvt_PK_F32_F8_Pat; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f32.fp8.err.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f32.fp8.err.ll new file mode 100644 index 0000000000000..29812993d541e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f32.fp8.err.ll @@ -0,0 +1,100 @@ +; RUN: split-file %s %t + +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/fp8-byte0-err.ll 2>&1 | FileCheck -check-prefix=ERR-FP8-BYTE0-ERR %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/fp8-byte1-err.ll 2>&1 | FileCheck -check-prefix=ERR-FP8-BYTE1-ERR %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/bf8-byte0-err.ll 2>&1 | FileCheck -check-prefix=ERR-BF8-BYTE0-ERR %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/bf8-byte1-err.ll 2>&1 | FileCheck -check-prefix=ERR-BF8-BYTE1-ERR %s + +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/fp8-byte0-err.ll 2>&1 | FileCheck -check-prefix=ERR-FP8-BYTE0-ERR %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/fp8-byte1-err.ll 2>&1 | FileCheck -check-prefix=ERR-FP8-BYTE1-ERR %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/bf8-byte0-err.ll 2>&1 | FileCheck -check-prefix=ERR-BF8-BYTE0-ERR %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/bf8-byte1-err.ll 2>&1 | FileCheck -check-prefix=ERR-BF8-BYTE1-ERR %s + + +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/fp8-byte0-err.ll 2>&1 | FileCheck -check-prefix=ERR-FP8-BYTE0-ERR-GISEL %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/fp8-byte1-err.ll 2>&1 | FileCheck -check-prefix=ERR-FP8-BYTE1-ERR-GISEL %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/bf8-byte0-err.ll 2>&1 | FileCheck -check-prefix=ERR-BF8-BYTE0-ERR-GISEL %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn 
-mcpu=gfx908 -filetype=null %t/bf8-byte1-err.ll 2>&1 | FileCheck -check-prefix=ERR-BF8-BYTE1-ERR-GISEL %s + +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/fp8-byte0-err.ll 2>&1 | FileCheck -check-prefix=ERR-FP8-BYTE0-ERR-GISEL %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/fp8-byte1-err.ll 2>&1 | FileCheck -check-prefix=ERR-FP8-BYTE1-ERR-GISEL %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/bf8-byte0-err.ll 2>&1 | FileCheck -check-prefix=ERR-BF8-BYTE0-ERR-GISEL %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/bf8-byte1-err.ll 2>&1 | FileCheck -check-prefix=ERR-BF8-BYTE1-ERR-GISEL %s + + + +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/pk-fp8-word0-err.ll 2>&1 | FileCheck -check-prefix=ERR-PK-FP8-WORD0-ERR %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/pk-fp8-word1-err.ll 2>&1 | FileCheck -check-prefix=ERR-PK-FP8-WORD1-ERR %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/pk-bf8-word0-err.ll 2>&1 | FileCheck -check-prefix=ERR-PK-BF8-WORD0-ERR %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/pk-bf8-word1-err.ll 2>&1 | FileCheck -check-prefix=ERR-PK-BF8-WORD1-ERR %s + +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/pk-fp8-word0-err.ll 2>&1 | FileCheck -check-prefix=ERR-PK-FP8-WORD0-ERR %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/pk-fp8-word1-err.ll 2>&1 | FileCheck -check-prefix=ERR-PK-FP8-WORD1-ERR %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/pk-bf8-word0-err.ll 2>&1 | FileCheck -check-prefix=ERR-PK-BF8-WORD0-ERR %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/pk-bf8-word1-err.ll 2>&1 | FileCheck -check-prefix=ERR-PK-BF8-WORD1-ERR %s + + +;--- fp8-byte0-err.ll +; ERR-FP8-BYTE0-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.cvt.f32.fp8 +; ERR-FP8-BYTE0-ERR-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.cvt.f32.fp8), %{{[0-9]+}}:vgpr(s32), 0 + +define float @test_cvt_f32_fp8_byte0(i32 %a) { + %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 0) + ret float %ret +} + +;--- fp8-byte1-err.ll +; ERR-FP8-BYTE1-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.cvt.f32.fp8 +; ERR-FP8-BYTE1-ERR-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.cvt.f32.fp8), %{{[0-9]+}}:vgpr(s32), 1 +define float @test_cvt_f32_fp8_byte1(i32 %a) { + %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1) + ret float %ret +} + +;--- bf8-byte0-err.ll +; ERR-BF8-BYTE0-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.cvt.f32.bf8 +; ERR-BF8-BYTE0-ERR-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.cvt.f32.bf8), %{{[0-9]+}}:vgpr(s32), 0 +define float @test_cvt_f32_bf8_byte0(i32 %a) { + %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0) + ret float %ret +} + +;--- bf8-byte1-err.ll +; ERR-BF8-BYTE1-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.cvt.f32.bf8 +; ERR-BF8-BYTE1-ERR-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.cvt.f32.bf8), %{{[0-9]+}}:vgpr(s32), 1 +define float @test_cvt_f32_bf8_byte1(i32 
%a) { + %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 1) + ret float %ret +} + +;--- pk-fp8-word0-err.ll +; ERR-PK-FP8-WORD0-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.cvt.pk.f32.fp8 +; ERR-PK-FP8-WORD0-ERR-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.cvt.pk.f32.fp8), %{{[0-9]+}}:vgpr(s32), 0 +define <2 x float> @test_cvt_pk_f32_fp8_word0(i32 %a) { + %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 false) + ret <2 x float> %ret +} + +;--- pk-fp8-word1-err.ll +; ERR-PK-FP8-WORD1-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.cvt.pk.f32.fp8 +; ERR-PK-FP8-WORD1-ERR-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.cvt.pk.f32.fp8), %{{[0-9]+}}:vgpr(s32), 1 +define <2 x float> @test_cvt_pk_f32_fp8_word1(i32 %a) { + %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true) + ret <2 x float> %ret +} + +;--- pk-bf8-word0-err.ll +; ERR-PK-BF8-WORD0-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.cvt.pk.f32.bf8 +; ERR-PK-BF8-WORD0-ERR-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.cvt.pk.f32.bf8), %{{[0-9]+}}:vgpr(s32), 0 +define <2 x float> @test_cvt_pk_f32_bf8_word0(i32 %a) { + %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false) + ret <2 x float> %ret +} + +;--- pk-bf8-word1-err.ll +; ERR-PK-BF8-WORD1-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.cvt.pk.f32.bf8 +; ERR-PK-BF8-WORD1-ERR-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.cvt.pk.f32.bf8), %{{[0-9]+}}:vgpr(s32), 1 +define <2 x float> @test_cvt_pk_f32_bf8_word1(i32 %a) { + %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 true) + ret <2 x float> %ret +} From dc09f9644144a9598837a3684414603edb175e51 Mon Sep 17 00:00:00 2001 From: Wael Yehia Date: Wed, 9 Oct 2024 17:53:16 +0000 Subject: [PATCH 050/119] [test] remove profile file at the start of profile/instrprof-write-file-atexit-explicitly.c --- .../test/profile/instrprof-write-file-atexit-explicitly.c | 1 + 1 file changed, 1 insertion(+) diff --git a/compiler-rt/test/profile/instrprof-write-file-atexit-explicitly.c b/compiler-rt/test/profile/instrprof-write-file-atexit-explicitly.c index 18c365af50034..6923e1d1d55d6 100644 --- a/compiler-rt/test/profile/instrprof-write-file-atexit-explicitly.c +++ b/compiler-rt/test/profile/instrprof-write-file-atexit-explicitly.c @@ -1,3 +1,4 @@ +// RUN: rm -f %t.profraw // RUN: %clang_profgen -o %t -O3 %s // RUN: %run %t %t.profraw // RUN: llvm-profdata merge -o %t.profdata %t.profraw From 102c384b5792eaa4e1b0095f9794637a23196ea3 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Wed, 9 Oct 2024 10:45:23 -0700 Subject: [PATCH 051/119] Revert "[SandboxVectorizer] Use sbvec-passes flag to create a pipeline of Region passes after BottomUpVec." (#111727) Reverts llvm/llvm-project#111223 It broke one of the build bots: LLVM Buildbot has detected a new failure on builder flang-aarch64-libcxx running on linaro-flang-aarch64-libcxx while building llvm at step 5 "build-unified-tree". 
Full details are available at: https://lab.llvm.org/buildbot/#/builders/89/builds/8127 --- llvm/include/llvm/SandboxIR/PassManager.h | 82 ++++++-------- .../SandboxVectorizer/Passes/BottomUpVec.h | 8 +- .../SandboxVectorizer/Passes/NullPass.h | 19 ---- .../SandboxVectorizer/SandboxVectorizer.h | 10 +- llvm/lib/SandboxIR/PassManager.cpp | 44 +++++++- .../SandboxVectorizer/Passes/BottomUpVec.cpp | 46 +------- .../SandboxVectorizer/Passes/PassRegistry.def | 22 ---- .../SandboxVectorizer/SandboxVectorizer.cpp | 41 ++++++- .../default_pass_pipeline.ll | 3 +- .../SandboxVectorizer/user_pass_pipeline.ll | 8 +- llvm/unittests/SandboxIR/PassTest.cpp | 104 +++++++++--------- 11 files changed, 183 insertions(+), 204 deletions(-) delete mode 100644 llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/NullPass.h delete mode 100644 llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/PassRegistry.def diff --git a/llvm/include/llvm/SandboxIR/PassManager.h b/llvm/include/llvm/SandboxIR/PassManager.h index 247c43615f576..54192c6bf1333 100644 --- a/llvm/include/llvm/SandboxIR/PassManager.h +++ b/llvm/include/llvm/SandboxIR/PassManager.h @@ -18,8 +18,6 @@ #ifndef LLVM_SANDBOXIR_PASSMANAGER_H #define LLVM_SANDBOXIR_PASSMANAGER_H -#include - #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/SandboxIR/Pass.h" @@ -34,65 +32,25 @@ template class PassManager : public ParentPass { protected: /// The list of passes that this pass manager will run. - SmallVector> Passes; + SmallVector Passes; PassManager(StringRef Name) : ParentPass(Name) {} PassManager(const PassManager &) = delete; - PassManager(PassManager &&) = default; virtual ~PassManager() = default; PassManager &operator=(const PassManager &) = delete; public: /// Adds \p Pass to the pass pipeline. - void addPass(std::unique_ptr Pass) { + void addPass(ContainedPass *Pass) { // TODO: Check that Pass's class type works with this PassManager type. - Passes.push_back(std::move(Pass)); - } - - using CreatePassFunc = - std::function(StringRef)>; - - /// Parses \p Pipeline as a comma-separated sequence of pass names and sets - /// the pass pipeline, using \p CreatePass to instantiate passes by name. - /// - /// After calling this function, the PassManager contains only the specified - /// pipeline, any previously added passes are cleared. - void setPassPipeline(StringRef Pipeline, CreatePassFunc CreatePass) { - static constexpr const char EndToken = '\0'; - static constexpr const char PassDelimToken = ','; - - assert(Passes.empty() && - "setPassPipeline called on a non-empty sandboxir::PassManager"); - // Add EndToken to the end to ease parsing. - std::string PipelineStr = std::string(Pipeline) + EndToken; - int FlagBeginIdx = 0; - - for (auto [Idx, C] : enumerate(PipelineStr)) { - // Keep moving Idx until we find the end of the pass name. - bool FoundDelim = C == EndToken || C == PassDelimToken; - if (!FoundDelim) - continue; - unsigned Sz = Idx - FlagBeginIdx; - std::string PassName(&PipelineStr[FlagBeginIdx], Sz); - FlagBeginIdx = Idx + 1; - - // Get the pass that corresponds to PassName and add it to the pass - // manager. - auto Pass = CreatePass(PassName); - if (Pass == nullptr) { - errs() << "Pass '" << PassName << "' not registered!\n"; - exit(1); - } - addPass(std::move(Pass)); - } + Passes.push_back(Pass); } - #ifndef NDEBUG void print(raw_ostream &OS) const override { OS << this->getName(); OS << "("; // TODO: This should call Pass->print(OS) because Pass may be a PM. 
- interleave(Passes, OS, [&OS](auto &Pass) { OS << Pass->getName(); }, ","); + interleave(Passes, OS, [&OS](auto *Pass) { OS << Pass->getName(); }, ","); OS << ")"; } LLVM_DUMP_METHOD void dump() const override { @@ -121,6 +79,38 @@ class RegionPassManager final : public PassManager { bool runOnRegion(Region &R) final; }; +/// Owns the passes and provides an API to get a pass by its name. +class PassRegistry { + SmallVector, 8> Passes; + DenseMap NameToPassMap; + +public: + static constexpr const char PassDelimToken = ','; + PassRegistry() = default; + /// Registers \p PassPtr and takes ownership. + Pass ®isterPass(std::unique_ptr &&PassPtr) { + auto &PassRef = *PassPtr.get(); + NameToPassMap[PassRef.getName()] = &PassRef; + Passes.push_back(std::move(PassPtr)); + return PassRef; + } + /// \Returns the pass with name \p Name, or null if not registered. + Pass *getPassByName(StringRef Name) const { + auto It = NameToPassMap.find(Name); + return It != NameToPassMap.end() ? It->second : nullptr; + } + /// Creates a pass pipeline and returns the first pass manager. + FunctionPassManager &parseAndCreatePassPipeline(StringRef Pipeline); + +#ifndef NDEBUG + void print(raw_ostream &OS) const { + for (const auto &PassPtr : Passes) + OS << PassPtr->getName() << "\n"; + } + LLVM_DUMP_METHOD void dump() const; +#endif +}; + } // namespace llvm::sandboxir #endif // LLVM_SANDBOXIR_PASSMANAGER_H diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h index 02abdf0a1ef0d..a2108f07c28e5 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h @@ -15,24 +15,18 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/SandboxIR/Constant.h" #include "llvm/SandboxIR/Pass.h" -#include "llvm/SandboxIR/PassManager.h" #include "llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h" namespace llvm::sandboxir { -class RegionPassManager; - class BottomUpVec final : public FunctionPass { bool Change = false; LegalityAnalysis Legality; void vectorizeRec(ArrayRef Bndl); void tryVectorize(ArrayRef Seeds); - // The PM containing the pipeline of region passes. - RegionPassManager RPM; - public: - BottomUpVec(); + BottomUpVec() : FunctionPass("bottom-up-vec") {} bool runOnFunction(Function &F) final; }; diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/NullPass.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/NullPass.h deleted file mode 100644 index 75b9f42520156..0000000000000 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/NullPass.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_NULLPASS_H -#define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_NULLPASS_H - -#include "llvm/SandboxIR/Pass.h" - -namespace llvm::sandboxir { - -class Region; - -/// A Region pass that does nothing, for use as a placeholder in tests. 
-class NullPass final : public RegionPass { -public: - NullPass() : RegionPass("null") {} - bool runOnRegion(Region &R) final { return false; } -}; - -} // namespace llvm::sandboxir - -#endif // LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_NULLPASS_H diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h index 1dcd976bf751c..dd9f02d327264 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h @@ -8,10 +8,7 @@ #ifndef LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_SANDBOXVECTORIZER_H #define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_SANDBOXVECTORIZER_H -#include - #include "llvm/IR/PassManager.h" -#include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h" namespace llvm { @@ -20,13 +17,10 @@ class TargetTransformInfo; class SandboxVectorizerPass : public PassInfoMixin { TargetTransformInfo *TTI = nullptr; - // The main vectorizer pass. - sandboxir::BottomUpVec BottomUpVecPass; - - bool runImpl(Function &F); - public: PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + + bool runImpl(Function &F); }; } // namespace llvm diff --git a/llvm/lib/SandboxIR/PassManager.cpp b/llvm/lib/SandboxIR/PassManager.cpp index 3a1cfa1d367a2..95bc5e56bb3ec 100644 --- a/llvm/lib/SandboxIR/PassManager.cpp +++ b/llvm/lib/SandboxIR/PassManager.cpp @@ -8,11 +8,11 @@ #include "llvm/SandboxIR/PassManager.h" -namespace llvm::sandboxir { +using namespace llvm::sandboxir; bool FunctionPassManager::runOnFunction(Function &F) { bool Change = false; - for (auto &Pass : Passes) { + for (FunctionPass *Pass : Passes) { Change |= Pass->runOnFunction(F); // TODO: run the verifier. } @@ -22,7 +22,7 @@ bool FunctionPassManager::runOnFunction(Function &F) { bool RegionPassManager::runOnRegion(Region &R) { bool Change = false; - for (auto &Pass : Passes) { + for (RegionPass *Pass : Passes) { Change |= Pass->runOnRegion(R); // TODO: run the verifier. } @@ -30,4 +30,40 @@ bool RegionPassManager::runOnRegion(Region &R) { return Change; } -} // namespace llvm::sandboxir +FunctionPassManager & +PassRegistry::parseAndCreatePassPipeline(StringRef Pipeline) { + static constexpr const char EndToken = '\0'; + // Add EndToken to the end to ease parsing. + std::string PipelineStr = std::string(Pipeline) + EndToken; + int FlagBeginIdx = 0; + // Start with a FunctionPassManager. + auto &InitialPM = static_cast( + registerPass(std::make_unique("init-fpm"))); + + for (auto [Idx, C] : enumerate(PipelineStr)) { + // Keep moving Idx until we find the end of the pass name. + bool FoundDelim = C == EndToken || C == PassDelimToken; + if (!FoundDelim) + continue; + unsigned Sz = Idx - FlagBeginIdx; + std::string PassName(&PipelineStr[FlagBeginIdx], Sz); + FlagBeginIdx = Idx + 1; + + // Get the pass that corresponds to PassName and add it to the pass manager. + auto *Pass = getPassByName(PassName); + if (Pass == nullptr) { + errs() << "Pass '" << PassName << "' not registered!\n"; + exit(1); + } + // TODO: This is safe for now, but would require proper upcasting once we + // add more Pass sub-classes. 
+ InitialPM.addPass(static_cast(Pass)); + } + return InitialPM; +} +#ifndef NDEBUG +void PassRegistry::dump() const { + print(dbgs()); + dbgs() << "\n"; +} +#endif // NDEBUG diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp index 77198f932a3ec..c59abd09d4362 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp @@ -10,41 +10,10 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/SandboxIR/Function.h" #include "llvm/SandboxIR/Instruction.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/NullPass.h" -namespace llvm::sandboxir { - -static cl::opt - PrintPassPipeline("sbvec-print-pass-pipeline", cl::init(false), cl::Hidden, - cl::desc("Prints the pass pipeline and returns.")); - -/// A magic string for the default pass pipeline. -static const char *DefaultPipelineMagicStr = "*"; - -static cl::opt UserDefinedPassPipeline( - "sbvec-passes", cl::init(DefaultPipelineMagicStr), cl::Hidden, - cl::desc("Comma-separated list of vectorizer passes. If not set " - "we run the predefined pipeline.")); - -static std::unique_ptr createRegionPass(StringRef Name) { -#define REGION_PASS(NAME, CREATE_PASS) \ - if (Name == NAME) \ - return std::make_unique(CREATE_PASS); -#include "PassRegistry.def" - return nullptr; -} - -BottomUpVec::BottomUpVec() : FunctionPass("bottom-up-vec"), RPM("rpm") { - // Create a pipeline to be run on each Region created by BottomUpVec. - if (UserDefinedPassPipeline == DefaultPipelineMagicStr) { - // TODO: Add default passes to RPM. - } else { - // Create the user-defined pipeline. - RPM.setPassPipeline(UserDefinedPassPipeline, createRegionPass); - } -} +using namespace llvm::sandboxir; +namespace llvm::sandboxir { // TODO: This is a temporary function that returns some seeds. // Replace this with SeedCollector's function when it lands. static llvm::SmallVector collectSeeds(BasicBlock &BB) { @@ -65,6 +34,8 @@ static SmallVector getOperand(ArrayRef Bndl, return Operands; } +} // namespace llvm::sandboxir + void BottomUpVec::vectorizeRec(ArrayRef Bndl) { auto LegalityRes = Legality.canVectorize(Bndl); switch (LegalityRes.getSubclassID()) { @@ -82,23 +53,14 @@ void BottomUpVec::vectorizeRec(ArrayRef Bndl) { void BottomUpVec::tryVectorize(ArrayRef Bndl) { vectorizeRec(Bndl); } bool BottomUpVec::runOnFunction(Function &F) { - if (PrintPassPipeline) { - RPM.printPipeline(outs()); - return false; - } - Change = false; // TODO: Start from innermost BBs first for (auto &BB : F) { // TODO: Replace with proper SeedCollector function. auto Seeds = collectSeeds(BB); // TODO: Slice Seeds into smaller chunks. - // TODO: If vectorization succeeds, run the RegionPassManager on the - // resulting region. if (Seeds.size() >= 2) tryVectorize(Seeds); } return Change; } - -} // namespace llvm::sandboxir diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/PassRegistry.def b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/PassRegistry.def deleted file mode 100644 index bbb0dcba1ea51..0000000000000 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/PassRegistry.def +++ /dev/null @@ -1,22 +0,0 @@ -//===- PassRegistry.def - Registry of passes --------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file is used as the registry of sub-passes that are part of the -// SandboxVectorizer pass. -// -//===----------------------------------------------------------------------===// - -// NOTE: NO INCLUDE GUARD DESIRED! - -#ifndef REGION_PASS -#define REGION_PASS(NAME, CREATE_PASS) -#endif - -REGION_PASS("null", NullPass()) - -#undef REGION_PASS diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp index cbaf2b6288d92..80afcb499a2c2 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp @@ -9,6 +9,8 @@ #include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/SandboxIR/Constant.h" +#include "llvm/SandboxIR/PassManager.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h" using namespace llvm; @@ -16,6 +18,18 @@ using namespace llvm; #define SV_NAME "sandbox-vectorizer" #define DEBUG_TYPE SV_NAME +cl::opt + PrintPassPipeline("sbvec-print-pass-pipeline", cl::init(false), cl::Hidden, + cl::desc("Prints the pass pipeline and returns.")); + +/// A magic string for the default pass pipeline. +const char *DefaultPipelineMagicStr = "*"; + +cl::opt UserDefinedPassPipeline( + "sbvec-passes", cl::init(DefaultPipelineMagicStr), cl::Hidden, + cl::desc("Comma-separated list of vectorizer passes. If not set " + "we run the predefined pipeline.")); + PreservedAnalyses SandboxVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) { TTI = &AM.getResult(F); @@ -42,8 +56,31 @@ bool SandboxVectorizerPass::runImpl(Function &LLVMF) { return false; } - // Create SandboxIR for LLVMF and run BottomUpVec on it. sandboxir::Context Ctx(LLVMF.getContext()); + // Create SandboxIR for `LLVMF`. sandboxir::Function &F = *Ctx.createFunction(&LLVMF); - return BottomUpVecPass.runOnFunction(F); + // Create the passes and register them with the PassRegistry. + sandboxir::PassRegistry PR; + auto &BottomUpVecPass = static_cast( + PR.registerPass(std::make_unique())); + + sandboxir::FunctionPassManager *PM = nullptr; + if (UserDefinedPassPipeline == DefaultPipelineMagicStr) { + // Create the default pass pipeline. + PM = &static_cast(PR.registerPass( + std::make_unique("pm"))); + PM->addPass(&BottomUpVecPass); + } else { + // Create the user-defined pipeline. + PM = &PR.parseAndCreatePassPipeline(UserDefinedPassPipeline); + } + + if (PrintPassPipeline) { + PM->printPipeline(outs()); + return false; + } + + // Run the pass pipeline. + bool Change = PM->runOnFunction(F); + return Change; } diff --git a/llvm/test/Transforms/SandboxVectorizer/default_pass_pipeline.ll b/llvm/test/Transforms/SandboxVectorizer/default_pass_pipeline.ll index 86bfbee636478..5ccd64d9f487a 100644 --- a/llvm/test/Transforms/SandboxVectorizer/default_pass_pipeline.ll +++ b/llvm/test/Transforms/SandboxVectorizer/default_pass_pipeline.ll @@ -4,7 +4,8 @@ ; This checks the default pass pipeline for the sandbox vectorizer. 
define void @pipeline() { -; CHECK: rpm +; CHECK: pm +; CHECK: bottom-up-vec ; CHECK-EMPTY: ret void } diff --git a/llvm/test/Transforms/SandboxVectorizer/user_pass_pipeline.ll b/llvm/test/Transforms/SandboxVectorizer/user_pass_pipeline.ll index 2e6dab0aa29c7..2879fbba1b9c0 100644 --- a/llvm/test/Transforms/SandboxVectorizer/user_pass_pipeline.ll +++ b/llvm/test/Transforms/SandboxVectorizer/user_pass_pipeline.ll @@ -1,12 +1,12 @@ -; RUN: opt -passes=sandbox-vectorizer -sbvec-print-pass-pipeline -sbvec-passes=null,null %s -disable-output | FileCheck %s +; RUN: opt -passes=sandbox-vectorizer -sbvec-print-pass-pipeline -sbvec-passes=bottom-up-vec,bottom-up-vec %s -disable-output | FileCheck %s ; !!!WARNING!!! This won't get updated by update_test_checks.py ! ; This checks the user defined pass pipeline. define void @pipeline() { -; CHECK: rpm -; CHECK: null -; CHECK: null +; CHECK: pm +; CHECK: bottom-up-vec +; CHECK: bottom-up-vec ; CHECK-EMPTY: ret void } diff --git a/llvm/unittests/SandboxIR/PassTest.cpp b/llvm/unittests/SandboxIR/PassTest.cpp index ae7284ecf2deb..b380ae9fd475a 100644 --- a/llvm/unittests/SandboxIR/PassTest.cpp +++ b/llvm/unittests/SandboxIR/PassTest.cpp @@ -180,10 +180,12 @@ define void @foo() { }; unsigned BBCnt1 = 0; unsigned BBCnt2 = 0; + TestPass1 TPass1(BBCnt1); + TestPass2 TPass2(BBCnt2); FunctionPassManager FPM("test-fpm"); - FPM.addPass(std::make_unique(BBCnt1)); - FPM.addPass(std::make_unique(BBCnt2)); + FPM.addPass(&TPass1); + FPM.addPass(&TPass2); // Check runOnFunction(). FPM.runOnFunction(*F); EXPECT_EQ(BBCnt1, 1u); @@ -236,10 +238,12 @@ define i8 @foo(i8 %v0, i8 %v1) { }; unsigned InstCount1 = 0; unsigned InstCount2 = 0; + TestPass1 TPass1(InstCount1); + TestPass2 TPass2(InstCount2); RegionPassManager RPM("test-rpm"); - RPM.addPass(std::make_unique(InstCount1)); - RPM.addPass(std::make_unique(InstCount2)); + RPM.addPass(&TPass1); + RPM.addPass(&TPass2); // Check runOnRegion(). llvm::SmallVector> Regions = Region::createRegionsFromMD(*F); @@ -256,60 +260,62 @@ define i8 @foo(i8 %v0, i8 %v1) { #endif // NDEBUG } -TEST_F(PassTest, SetPassPipeline) { - auto *F = parseFunction(R"IR( -define void @f() { - ret void -} -)IR", - "f"); - class FooPass final : public FunctionPass { - std::string &Str; - +TEST_F(PassTest, PassRegistry) { + class TestPass1 final : public FunctionPass { public: - FooPass(std::string &Str) : FunctionPass("foo-pass"), Str(Str) {} - bool runOnFunction(Function &F) final { - Str += "foo"; - return false; - } + TestPass1() : FunctionPass("test-pass1") {} + bool runOnFunction(Function &F) final { return false; } }; - class BarPass final : public FunctionPass { - std::string &Str; - + class TestPass2 final : public FunctionPass { public: - BarPass(std::string &Str) : FunctionPass("bar-pass"), Str(Str) {} - bool runOnFunction(Function &F) final { - Str += "bar"; - return false; - } + TestPass2() : FunctionPass("test-pass2") {} + bool runOnFunction(Function &F) final { return false; } }; - std::string Str; - auto CreatePass = - [&Str](llvm::StringRef Name) -> std::unique_ptr { - if (Name == "foo") - return std::make_unique(Str); - if (Name == "bar") - return std::make_unique(Str); - return nullptr; + PassRegistry Registry; + auto &TP1 = Registry.registerPass(std::make_unique()); + auto &TP2 = Registry.registerPass(std::make_unique()); + + // Check getPassByName(). + EXPECT_EQ(Registry.getPassByName("test-pass1"), &TP1); + EXPECT_EQ(Registry.getPassByName("test-pass2"), &TP2); + +#ifndef NDEBUG + // Check print(). 
+ std::string Buff; + llvm::raw_string_ostream SS(Buff); + Registry.print(SS); + EXPECT_EQ(Buff, "test-pass1\ntest-pass2\n"); +#endif // NDEBUG +} + +TEST_F(PassTest, ParsePassPipeline) { + class TestPass1 final : public FunctionPass { + public: + TestPass1() : FunctionPass("test-pass1") {} + bool runOnFunction(Function &F) final { return false; } + }; + class TestPass2 final : public FunctionPass { + public: + TestPass2() : FunctionPass("test-pass2") {} + bool runOnFunction(Function &F) final { return false; } }; - FunctionPassManager FPM("test-fpm"); - FPM.setPassPipeline("foo,bar,foo", CreatePass); - FPM.runOnFunction(*F); - EXPECT_EQ(Str, "foobarfoo"); + PassRegistry Registry; + Registry.registerPass(std::make_unique()); + Registry.registerPass(std::make_unique()); - // A second call to setPassPipeline will trigger an assertion in debug mode. + [[maybe_unused]] auto &FPM = + Registry.parseAndCreatePassPipeline("test-pass1,test-pass2,test-pass1"); #ifndef NDEBUG - EXPECT_DEATH(FPM.setPassPipeline("bar,bar,foo", CreatePass), - "setPassPipeline called on a non-empty sandboxir::PassManager"); -#endif + std::string Buff; + llvm::raw_string_ostream SS(Buff); + FPM.print(SS); + EXPECT_EQ(Buff, "init-fpm(test-pass1,test-pass2,test-pass1)"); +#endif // NDEBUG - // Fresh PM for the death tests so they die from bad pipeline strings, rather - // than from multiple setPassPipeline calls. - FunctionPassManager FPM2("test-fpm"); - EXPECT_DEATH(FPM2.setPassPipeline("bad-pass-name", CreatePass), + EXPECT_DEATH(Registry.parseAndCreatePassPipeline("bad-pass-name"), ".*not registered.*"); - EXPECT_DEATH(FPM2.setPassPipeline("", CreatePass), ".*not registered.*"); - EXPECT_DEATH(FPM2.setPassPipeline(",", CreatePass), ".*not registered.*"); + EXPECT_DEATH(Registry.parseAndCreatePassPipeline(""), ".*not registered.*"); + EXPECT_DEATH(Registry.parseAndCreatePassPipeline(","), ".*not registered.*"); } From 208584d91ae138d752d89436e3df12fa8f2e60a8 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Wed, 9 Oct 2024 20:00:33 +0200 Subject: [PATCH 052/119] [clang][bytecode] Fix source range of uncalled base dtor (#111683) Make this emit the same source range as the current interpreter. 
--- clang/lib/AST/ByteCode/EvaluationResult.cpp | 5 +++-- clang/test/Misc/constexpr-subobj-init-source-ranges.cpp | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/ByteCode/EvaluationResult.cpp b/clang/lib/AST/ByteCode/EvaluationResult.cpp index 627d4b2f65be9..c0d116cdf26c4 100644 --- a/clang/lib/AST/ByteCode/EvaluationResult.cpp +++ b/clang/lib/AST/ByteCode/EvaluationResult.cpp @@ -130,8 +130,9 @@ static bool CheckFieldsInitialized(InterpState &S, SourceLocation Loc, const Descriptor *Desc = BasePtr.getDeclDesc(); if (const auto *CD = dyn_cast_if_present(R->getDecl())) { const auto &BS = *std::next(CD->bases_begin(), I); - S.FFDiag(BS.getBaseTypeLoc(), diag::note_constexpr_uninitialized_base) - << B.Desc->getType() << BS.getSourceRange(); + SourceLocation TypeBeginLoc = BS.getBaseTypeLoc(); + S.FFDiag(TypeBeginLoc, diag::note_constexpr_uninitialized_base) + << B.Desc->getType() << SourceRange(TypeBeginLoc, BS.getEndLoc()); } else { S.FFDiag(Desc->getLocation(), diag::note_constexpr_uninitialized_base) << B.Desc->getType(); diff --git a/clang/test/Misc/constexpr-subobj-init-source-ranges.cpp b/clang/test/Misc/constexpr-subobj-init-source-ranges.cpp index 342da2d886668..990d1056d6d46 100644 --- a/clang/test/Misc/constexpr-subobj-init-source-ranges.cpp +++ b/clang/test/Misc/constexpr-subobj-init-source-ranges.cpp @@ -1,4 +1,5 @@ // RUN: not %clang_cc1 -fsyntax-only -fdiagnostics-print-source-range-info %s 2>&1 | FileCheck %s --strict-whitespace +// RUN: not %clang_cc1 -fsyntax-only -fdiagnostics-print-source-range-info -fexperimental-new-constant-interpreter %s 2>&1 | FileCheck %s --strict-whitespace struct DelBase { constexpr DelBase() = delete; From 1bb52e94621d2cba4f34504697cb0ea83805cb98 Mon Sep 17 00:00:00 2001 From: Nathan Lanza Date: Wed, 9 Oct 2024 14:20:50 -0400 Subject: [PATCH 053/119] [CIR] Build out AST consumer patterns to reach the entry point into CIRGen Build out the necessary infrastructure for the main entry point into ClangIR generation -- CIRGenModule. A set of boilerplate classes exist to facilitate this -- CIRGenerator, CIRGenAction, EmitCIRAction and CIRGenConsumer. These all mirror the corresponding types from LLVM generation by Clang's CodeGen. The main entry point to CIR generation is `CIRGenModule::buildTopLevelDecl`. It is currently just an empty function. We've added a test to ensure that the pipeline reaches this point and doesn't fail, but does nothing else. This will be removed in one of the subsequent patches that'll add basic `cir.func` emission. This patch also re-adds `-emit-cir` to the driver. lib/Driver/Driver.cpp requires that a driver flag exists to facilirate the selection of the right actions for the driver to create. Without a driver flag you get the standard behaviors of `-S`, `-c`, etc. If we want to emit CIR IR and, eventually, bytecode we'll need a driver flag to force this. This is why `-emit-llvm` is a driver flag. Notably, `-emit-llvm-bc` as a cc1 flag doesn't ever do the right thing. Without a driver flag it is incorrectly ignored and an executable is emitted. With `-S` a file named `something.s` is emitted which actually contains bitcode. 
Reviewers: AaronBallman, MaskRay, bcardosolopes Reviewed By: bcardosolopes, AaronBallman Pull Request: https://github.com/llvm/llvm-project/pull/91007 --- clang/include/clang/CIR/CIRGenerator.h | 60 ++++++++++++++++ .../clang/CIR/FrontendAction/CIRGenAction.h | 60 ++++++++++++++++ clang/include/clang/Driver/Options.td | 2 +- clang/lib/CIR/CMakeLists.txt | 2 + clang/lib/CIR/CodeGen/CIRGenModule.cpp | 32 +++++++++ clang/lib/CIR/CodeGen/CIRGenModule.h | 62 ++++++++++++++++ clang/lib/CIR/CodeGen/CIRGenTypeCache.h | 27 +++++++ clang/lib/CIR/CodeGen/CIRGenerator.cpp | 43 +++++++++++ clang/lib/CIR/CodeGen/CMakeLists.txt | 23 ++++++ clang/lib/CIR/FrontendAction/CIRGenAction.cpp | 72 +++++++++++++++++++ clang/lib/CIR/FrontendAction/CMakeLists.txt | 17 +++++ clang/lib/Driver/ToolChains/Clang.cpp | 3 + clang/lib/FrontendTool/CMakeLists.txt | 15 ++++ .../ExecuteCompilerInvocation.cpp | 16 +++++ clang/test/CIR/hello.c | 5 ++ clang/test/CIR/lit.local.cfg | 2 + 16 files changed, 440 insertions(+), 1 deletion(-) create mode 100644 clang/include/clang/CIR/CIRGenerator.h create mode 100644 clang/include/clang/CIR/FrontendAction/CIRGenAction.h create mode 100644 clang/lib/CIR/CodeGen/CIRGenModule.cpp create mode 100644 clang/lib/CIR/CodeGen/CIRGenModule.h create mode 100644 clang/lib/CIR/CodeGen/CIRGenTypeCache.h create mode 100644 clang/lib/CIR/CodeGen/CIRGenerator.cpp create mode 100644 clang/lib/CIR/CodeGen/CMakeLists.txt create mode 100644 clang/lib/CIR/FrontendAction/CIRGenAction.cpp create mode 100644 clang/lib/CIR/FrontendAction/CMakeLists.txt create mode 100644 clang/test/CIR/hello.c create mode 100644 clang/test/CIR/lit.local.cfg diff --git a/clang/include/clang/CIR/CIRGenerator.h b/clang/include/clang/CIR/CIRGenerator.h new file mode 100644 index 0000000000000..9a8930ac46ea9 --- /dev/null +++ b/clang/include/clang/CIR/CIRGenerator.h @@ -0,0 +1,60 @@ +//===- CIRGenerator.h - CIR Generation from Clang AST ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares a simple interface to perform CIR generation from Clang +// AST +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_CIR_CIRGENERATOR_H +#define LLVM_CLANG_CIR_CIRGENERATOR_H + +#include "clang/AST/ASTConsumer.h" +#include "clang/Basic/CodeGenOptions.h" + +#include "llvm/ADT/IntrusiveRefCntPtr.h" +#include "llvm/Support/VirtualFileSystem.h" + +#include + +namespace clang { +class DeclGroupRef; +class DiagnosticsEngine; +} // namespace clang + +namespace mlir { +class MLIRContext; +} // namespace mlir +namespace cir { +class CIRGenModule; + +class CIRGenerator : public clang::ASTConsumer { + virtual void anchor(); + clang::DiagnosticsEngine &diags; + clang::ASTContext *astCtx; + // Only used for debug info. 
+ llvm::IntrusiveRefCntPtr fs; + + const clang::CodeGenOptions &codeGenOpts; + +protected: + std::unique_ptr mlirCtx; + std::unique_ptr cgm; + +public: + CIRGenerator(clang::DiagnosticsEngine &diags, + llvm::IntrusiveRefCntPtr fs, + const clang::CodeGenOptions &cgo); + ~CIRGenerator() override; + void Initialize(clang::ASTContext &astCtx) override; + bool HandleTopLevelDecl(clang::DeclGroupRef group) override; +}; + +} // namespace cir + +#endif // LLVM_CLANG_CIR_CIRGENERATOR_H diff --git a/clang/include/clang/CIR/FrontendAction/CIRGenAction.h b/clang/include/clang/CIR/FrontendAction/CIRGenAction.h new file mode 100644 index 0000000000000..2ab612613b73d --- /dev/null +++ b/clang/include/clang/CIR/FrontendAction/CIRGenAction.h @@ -0,0 +1,60 @@ +//===---- CIRGenAction.h - CIR Code Generation Frontend Action -*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_CIR_CIRGENACTION_H +#define LLVM_CLANG_CIR_CIRGENACTION_H + +#include "clang/Frontend/FrontendAction.h" + +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/OwningOpRef.h" + +namespace mlir { +class MLIRContext; +class ModuleOp; +} // namespace mlir + +namespace cir { +class CIRGenConsumer; + +class CIRGenAction : public clang::ASTFrontendAction { +public: + enum class OutputType { + EmitCIR, + }; + +private: + friend class CIRGenConsumer; + + mlir::OwningOpRef MLIRMod; + + mlir::MLIRContext *MLIRCtx; + +protected: + CIRGenAction(OutputType Action, mlir::MLIRContext *MLIRCtx = nullptr); + + std::unique_ptr + CreateASTConsumer(clang::CompilerInstance &CI, + llvm::StringRef InFile) override; + +public: + ~CIRGenAction() override; + + OutputType Action; +}; + +class EmitCIRAction : public CIRGenAction { + virtual void anchor(); + +public: + EmitCIRAction(mlir::MLIRContext *MLIRCtx = nullptr); +}; + +} // namespace cir + +#endif diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 90f0c4f2df213..9adc0b15f2ea8 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2996,7 +2996,7 @@ defm clangir : BoolFOption<"clangir", PosFlag, NegFlag LLVM pipeline to compile">, BothFlags<[], [ClangOption, CC1Option], "">>; -def emit_cir : Flag<["-"], "emit-cir">, Visibility<[CC1Option]>, +def emit_cir : Flag<["-"], "emit-cir">, Visibility<[ClangOption, CC1Option]>, Group, HelpText<"Build ASTs and then lower to ClangIR">; /// ClangIR-specific options - END diff --git a/clang/lib/CIR/CMakeLists.txt b/clang/lib/CIR/CMakeLists.txt index d2ff200e0da5f..11cca734808df 100644 --- a/clang/lib/CIR/CMakeLists.txt +++ b/clang/lib/CIR/CMakeLists.txt @@ -2,3 +2,5 @@ include_directories(${LLVM_MAIN_SRC_DIR}/../mlir/include) include_directories(${CMAKE_BINARY_DIR}/tools/mlir/include) add_subdirectory(Dialect) +add_subdirectory(CodeGen) +add_subdirectory(FrontendAction) diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp new file mode 100644 index 0000000000000..95e62326939fc --- /dev/null +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -0,0 +1,32 @@ +//===- CIRGenModule.cpp - Per-Module state for CIR generation -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is the internal per-translation-unit state used for CIR translation. +// +//===----------------------------------------------------------------------===// + +#include "CIRGenModule.h" + +#include "clang/AST/ASTContext.h" +#include "clang/AST/DeclBase.h" + +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/MLIRContext.h" + +using namespace cir; +CIRGenModule::CIRGenModule(mlir::MLIRContext &context, + clang::ASTContext &astctx, + const clang::CodeGenOptions &cgo, + DiagnosticsEngine &diags) + : astCtx(astctx), langOpts(astctx.getLangOpts()), + theModule{mlir::ModuleOp::create(mlir::UnknownLoc())}, + target(astCtx.getTargetInfo()) {} + +// Emit code for a single top level declaration. +void CIRGenModule::buildTopLevelDecl(Decl *decl) {} diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h new file mode 100644 index 0000000000000..ab2a1d8864659 --- /dev/null +++ b/clang/lib/CIR/CodeGen/CIRGenModule.h @@ -0,0 +1,62 @@ +//===--- CIRGenModule.h - Per-Module state for CIR gen ----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is the internal per-translation-unit state used for CIR translation. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_LIB_CIR_CODEGEN_CIRGENMODULE_H +#define LLVM_CLANG_LIB_CIR_CODEGEN_CIRGENMODULE_H + +#include "CIRGenTypeCache.h" + +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/MLIRContext.h" + +namespace clang { +class ASTContext; +class CodeGenOptions; +class Decl; +class DiagnosticsEngine; +class LangOptions; +class TargetInfo; +} // namespace clang + +using namespace clang; +namespace cir { + +/// This class organizes the cross-function state that is used while generating +/// CIR code. +class CIRGenModule : public CIRGenTypeCache { + CIRGenModule(CIRGenModule &) = delete; + CIRGenModule &operator=(CIRGenModule &) = delete; + +public: + CIRGenModule(mlir::MLIRContext &context, clang::ASTContext &astctx, + const clang::CodeGenOptions &cgo, + clang::DiagnosticsEngine &diags); + + ~CIRGenModule() = default; + +private: + /// Hold Clang AST information. + clang::ASTContext &astCtx; + + const clang::LangOptions &langOpts; + + /// A "module" matches a c/cpp source file: containing a list of functions. + mlir::ModuleOp theModule; + + const clang::TargetInfo ⌖ + +public: + void buildTopLevelDecl(clang::Decl *decl); +}; +} // namespace cir + +#endif // LLVM_CLANG_LIB_CIR_CODEGEN_CIRGENMODULE_H diff --git a/clang/lib/CIR/CodeGen/CIRGenTypeCache.h b/clang/lib/CIR/CodeGen/CIRGenTypeCache.h new file mode 100644 index 0000000000000..6478e0a078099 --- /dev/null +++ b/clang/lib/CIR/CodeGen/CIRGenTypeCache.h @@ -0,0 +1,27 @@ +//===--- CIRGenTypeCache.h - Commonly used LLVM types and info -*- C++ --*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This structure provides a set of common types useful during CIR emission. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_LIB_CIR_CIRGENTYPECACHE_H +#define LLVM_CLANG_LIB_CIR_CIRGENTYPECACHE_H + +namespace cir { + +/// This structure provides a set of types that are commonly used +/// during IR emission. It's initialized once in CodeGenModule's +/// constructor and then copied around into new CIRGenFunction's. +struct CIRGenTypeCache { + CIRGenTypeCache() = default; +}; + +} // namespace cir + +#endif // LLVM_CLANG_LIB_CIR_CODEGEN_CIRGENTYPECACHE_H diff --git a/clang/lib/CIR/CodeGen/CIRGenerator.cpp b/clang/lib/CIR/CodeGen/CIRGenerator.cpp new file mode 100644 index 0000000000000..159355a99ece8 --- /dev/null +++ b/clang/lib/CIR/CodeGen/CIRGenerator.cpp @@ -0,0 +1,43 @@ +//===--- CIRGenerator.cpp - Emit CIR from ASTs ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This builds an AST and converts it to CIR. +// +//===----------------------------------------------------------------------===// + +#include "CIRGenModule.h" + +#include "clang/AST/DeclGroup.h" +#include "clang/CIR/CIRGenerator.h" + +using namespace cir; +using namespace clang; + +void CIRGenerator::anchor() {} + +CIRGenerator::CIRGenerator(clang::DiagnosticsEngine &diags, + llvm::IntrusiveRefCntPtr vfs, + const CodeGenOptions &cgo) + : diags(diags), fs(std::move(vfs)), codeGenOpts{cgo} {} +CIRGenerator::~CIRGenerator() = default; + +void CIRGenerator::Initialize(ASTContext &astCtx) { + using namespace llvm; + + this->astCtx = &astCtx; + + cgm = std::make_unique(*mlirCtx, astCtx, codeGenOpts, diags); +} + +bool CIRGenerator::HandleTopLevelDecl(DeclGroupRef group) { + + for (Decl *decl : group) + cgm->buildTopLevelDecl(decl); + + return true; +} diff --git a/clang/lib/CIR/CodeGen/CMakeLists.txt b/clang/lib/CIR/CodeGen/CMakeLists.txt new file mode 100644 index 0000000000000..17a3aabfbd7f0 --- /dev/null +++ b/clang/lib/CIR/CodeGen/CMakeLists.txt @@ -0,0 +1,23 @@ +set( + LLVM_LINK_COMPONENTS + Core + Support +) + +get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) + +add_clang_library(clangCIR + CIRGenerator.cpp + CIRGenModule.cpp + + DEPENDS + MLIRCIR + ${dialect_libs} + + LINK_LIBS + clangAST + clangBasic + clangLex + ${dialect_libs} + MLIRCIR +) diff --git a/clang/lib/CIR/FrontendAction/CIRGenAction.cpp b/clang/lib/CIR/FrontendAction/CIRGenAction.cpp new file mode 100644 index 0000000000000..72b9fa0c13c59 --- /dev/null +++ b/clang/lib/CIR/FrontendAction/CIRGenAction.cpp @@ -0,0 +1,72 @@ +//===--- CIRGenAction.cpp - LLVM Code generation Frontend Action ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/CIR/FrontendAction/CIRGenAction.h" +#include "clang/CIR/CIRGenerator.h" +#include "clang/Frontend/CompilerInstance.h" + +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/OwningOpRef.h" + +using namespace cir; +using namespace clang; + +namespace cir { + +class CIRGenConsumer : public clang::ASTConsumer { + + virtual void anchor(); + + std::unique_ptr OutputStream; + + IntrusiveRefCntPtr FS; + std::unique_ptr Gen; + +public: + CIRGenConsumer(CIRGenAction::OutputType Action, + DiagnosticsEngine &DiagnosticsEngine, + IntrusiveRefCntPtr VFS, + const HeaderSearchOptions &HeaderSearchOptions, + const CodeGenOptions &CodeGenOptions, + const TargetOptions &TargetOptions, + const LangOptions &LangOptions, + const FrontendOptions &FEOptions, + std::unique_ptr OS) + : OutputStream(std::move(OS)), FS(VFS), + Gen(std::make_unique(DiagnosticsEngine, std::move(VFS), + CodeGenOptions)) {} + + bool HandleTopLevelDecl(DeclGroupRef D) override { + Gen->HandleTopLevelDecl(D); + return true; + } +}; +} // namespace cir + +void CIRGenConsumer::anchor() {} + +CIRGenAction::CIRGenAction(OutputType Act, mlir::MLIRContext *MLIRCtx) + : MLIRCtx(MLIRCtx ? MLIRCtx : new mlir::MLIRContext), Action(Act) {} + +CIRGenAction::~CIRGenAction() { MLIRMod.release(); } + +std::unique_ptr +CIRGenAction::CreateASTConsumer(CompilerInstance &CI, StringRef InFile) { + std::unique_ptr Out = CI.takeOutputStream(); + + auto Result = std::make_unique( + Action, CI.getDiagnostics(), &CI.getVirtualFileSystem(), + CI.getHeaderSearchOpts(), CI.getCodeGenOpts(), CI.getTargetOpts(), + CI.getLangOpts(), CI.getFrontendOpts(), std::move(Out)); + + return Result; +} + +void EmitCIRAction::anchor() {} +EmitCIRAction::EmitCIRAction(mlir::MLIRContext *MLIRCtx) + : CIRGenAction(OutputType::EmitCIR, MLIRCtx) {} diff --git a/clang/lib/CIR/FrontendAction/CMakeLists.txt b/clang/lib/CIR/FrontendAction/CMakeLists.txt new file mode 100644 index 0000000000000..b0616ab5d64b0 --- /dev/null +++ b/clang/lib/CIR/FrontendAction/CMakeLists.txt @@ -0,0 +1,17 @@ +set(LLVM_LINK_COMPONENTS + Core + Support + ) + +get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) + +add_clang_library(clangCIRFrontendAction + CIRGenAction.cpp + + LINK_LIBS + clangAST + clangFrontend + clangCIR + MLIRCIR + MLIRIR + ) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 5b09f97c40b48..49b07322a21a5 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5139,6 +5139,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, } } + if (Args.hasArg(options::OPT_fclangir)) + CmdArgs.push_back("-fclangir"); + if (IsOpenMPDevice) { // We have to pass the triple of the host if compiling for an OpenMP device. 
std::string NormalizedTriple = diff --git a/clang/lib/FrontendTool/CMakeLists.txt b/clang/lib/FrontendTool/CMakeLists.txt index 51c379ade2704..bfc7652b4c118 100644 --- a/clang/lib/FrontendTool/CMakeLists.txt +++ b/clang/lib/FrontendTool/CMakeLists.txt @@ -12,6 +12,15 @@ set(link_libs clangRewriteFrontend ) +set(deps) + +if(CLANG_ENABLE_CIR) + list(APPEND link_libs + clangCIRFrontendAction + MLIRIR + ) +endif() + if(CLANG_ENABLE_ARCMT) list(APPEND link_libs clangARCMigrate @@ -29,7 +38,13 @@ add_clang_library(clangFrontendTool DEPENDS ClangDriverOptions + ${deps} LINK_LIBS ${link_libs} ) + +if(CLANG_ENABLE_CIR) + target_include_directories(clangFrontendTool PRIVATE ${LLVM_MAIN_SRC_DIR}/../mlir/include) + target_include_directories(clangFrontendTool PRIVATE ${CMAKE_BINARY_DIR}/tools/mlir/include) +endif() diff --git a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp index 7476b1076d103..60fde03289cf3 100644 --- a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp +++ b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp @@ -31,6 +31,11 @@ #include "llvm/Support/BuryPointer.h" #include "llvm/Support/DynamicLibrary.h" #include "llvm/Support/ErrorHandling.h" + +#if CLANG_ENABLE_CIR +#include "clang/CIR/FrontendAction/CIRGenAction.h" +#endif + using namespace clang; using namespace llvm::opt; @@ -42,6 +47,13 @@ CreateFrontendBaseAction(CompilerInstance &CI) { StringRef Action("unknown"); (void)Action; + unsigned UseCIR = CI.getFrontendOpts().UseClangIRPipeline; + frontend::ActionKind Act = CI.getFrontendOpts().ProgramAction; + bool EmitsCIR = Act == EmitCIR; + + if (!UseCIR && EmitsCIR) + llvm::report_fatal_error("-emit-cir and only valid when using -fclangir"); + switch (CI.getFrontendOpts().ProgramAction) { case ASTDeclList: return std::make_unique(); case ASTDump: return std::make_unique(); @@ -54,7 +66,11 @@ CreateFrontendBaseAction(CompilerInstance &CI) { case EmitAssembly: return std::make_unique(); case EmitBC: return std::make_unique(); case EmitCIR: +#if CLANG_ENABLE_CIR + return std::make_unique<::cir::EmitCIRAction>(); +#else llvm_unreachable("CIR suppport not built into clang"); +#endif case EmitHTML: return std::make_unique(); case EmitLLVM: return std::make_unique(); case EmitLLVMOnly: return std::make_unique(); diff --git a/clang/test/CIR/hello.c b/clang/test/CIR/hello.c new file mode 100644 index 0000000000000..61f38d0a5bd01 --- /dev/null +++ b/clang/test/CIR/hello.c @@ -0,0 +1,5 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s | FileCheck --allow-empty %s + +// just confirm that we don't crash +// CHECK-NOT: * +void foo() {} diff --git a/clang/test/CIR/lit.local.cfg b/clang/test/CIR/lit.local.cfg new file mode 100644 index 0000000000000..6afd60f47bff9 --- /dev/null +++ b/clang/test/CIR/lit.local.cfg @@ -0,0 +1,2 @@ +if not config.root.clang_enable_cir: + config.unsupported = True From 1cfe5b89b70c8170da041e9507fd0801be766669 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Wed, 9 Oct 2024 11:32:41 -0700 Subject: [PATCH 054/119] [lldb] Use SEND_ERROR instead of FATAL_ERROR in test/CMakeLists.txt (#111729) Use SEND_ERROR (continue processing, but skip generation) instead of FATAL_ERROR (stop processing and generation). This means that developers get to see all errors at once, instead of seeing just the first error and having to reconfigure to discover the next one. 
--- lldb/test/CMakeLists.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lldb/test/CMakeLists.txt b/lldb/test/CMakeLists.txt index 25037934f636c..1abab96ba7bb1 100644 --- a/lldb/test/CMakeLists.txt +++ b/lldb/test/CMakeLists.txt @@ -3,7 +3,7 @@ # Lit requires a Python3 interpreter, let's be careful and fail early if it's # not present. if (NOT DEFINED Python3_EXECUTABLE) - message(FATAL_ERROR + message(SEND_ERROR "LLDB test suite requires a Python3 interpreter but none " "was found. Please install Python3 or disable tests with " "`LLDB_INCLUDE_TESTS=OFF`.") @@ -22,7 +22,7 @@ if(LLDB_ENFORCE_STRICT_TEST_REQUIREMENTS) foreach(module ${useful_python_modules}) lldb_find_python_module(${module}) if (NOT PY_${module}_FOUND) - message(FATAL_ERROR + message(SEND_ERROR "Python module '${module}' not found. Please install it via pip or via " "your operating system's package manager. Alternatively, disable " "strict testing requirements with " @@ -66,10 +66,10 @@ if (LLDB_TEST_OBJC_GNUSTEP) find_package(GNUstepObjC) if (NOT GNUstepObjC_FOUND) if (LLDB_TEST_OBJC_GNUSTEP_DIR) - message(FATAL_ERROR "Failed to find GNUstep libobjc2 in ${LLDB_TEST_OBJC_GNUSTEP_DIR}. " + message(SEND_ERROR "Failed to find GNUstep libobjc2 in ${LLDB_TEST_OBJC_GNUSTEP_DIR}. " "Please check LLDB_TEST_OBJC_GNUSTEP_DIR or turn off LLDB_TEST_OBJC_GNUSTEP.") else() - message(FATAL_ERROR "Failed to find GNUstep libobjc2. " + message(SEND_ERROR "Failed to find GNUstep libobjc2. " "Please set LLDB_TEST_OBJC_GNUSTEP_DIR or turn off LLDB_TEST_OBJC_GNUSTEP.") endif() endif() @@ -185,7 +185,7 @@ if(TARGET clang) set(LIBCXX_LIBRARY_DIR "${LLDB_TEST_LIBCXX_ROOT_DIR}/lib${LIBCXX_LIBDIR_SUFFIX}") set(LIBCXX_GENERATED_INCLUDE_DIR "${LLDB_TEST_LIBCXX_ROOT_DIR}/include/c++/v1") else() - message(FATAL_ERROR + message(SEND_ERROR "Couldn't find libcxx build in '${LLDB_TEST_LIBCXX_ROOT_DIR}'. To run the " "test-suite for a standalone LLDB build please build libcxx and point " "LLDB_TEST_LIBCXX_ROOT_DIR to it.") @@ -194,7 +194,7 @@ if(TARGET clang) # We require libcxx for the test suite, so if we aren't building it, # provide a helpful error about how to resolve the situation. if(NOT LLDB_HAS_LIBCXX) - message(FATAL_ERROR + message(SEND_ERROR "LLDB test suite requires libc++, but it is currently disabled. " "Please add `libcxx` to `LLVM_ENABLE_RUNTIMES` or disable tests via " "`LLDB_INCLUDE_TESTS=OFF`.") From e82fcda1475b6708b7d314fd7a54e551306d5739 Mon Sep 17 00:00:00 2001 From: Tyler Nowicki Date: Wed, 9 Oct 2024 14:34:19 -0400 Subject: [PATCH 055/119] [Coroutines] Move util headers to include/llvm (#111599) Plugin libraries that use coroutines can do so right now, however, to provide their own ABI they need to be able to use various headers, some of which such are required (such as the ABI header). This change exposes the coro utils and required headers by moving them to include/llvm/Transforms/Coroutines. My experience with our out-of-tree plugin ABI has been that at least these headers are needed. 
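As a rough sketch of the intended use (hypothetical out-of-tree code, not part of this patch; the exact set of virtual methods and the template arguments in the signatures are assumptions and should be checked against the installed ABI.h):

// Hypothetical out-of-tree plugin, buildable once the headers live under
// include/llvm/Transforms/Coroutines.
#include "llvm/Transforms/Coroutines/ABI.h"
#include "llvm/Transforms/Coroutines/CoroShape.h"
#include <functional>
#include <utility>

namespace myplugin {

// Assumed shape of the BaseABI interface, based on the declarations moved here.
class PluginABI : public llvm::coro::BaseABI {
public:
  PluginABI(llvm::Function &F, llvm::coro::Shape &S,
            std::function<bool(llvm::Instruction &)> IsMaterializable)
      : BaseABI(F, S, std::move(IsMaterializable)) {}

  // Normalize the coroutine before the frame is laid out.
  void init() override {}

  // Emit resume/destroy clones following the plugin's own convention.
  void splitCoroutine(llvm::Function &F, llvm::coro::Shape &Shape,
                      llvm::SmallVectorImpl<llvm::Function *> &Clones,
                      llvm::TargetTransformInfo &TTI) override {}
};

} // namespace myplugin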
The headers moved are: * ABI.h (ABI object) * CoroInstr.h (helpers) * Coroshape.h (Shape object) * MaterializationUtils.h (helpers) * SpillingUtils.h (helpers) * SuspendCrossingInfo.h (analysis) This has no code changes other than those required to move the headers and these are: * include guard name changes * include path changes * minor clang-format induced changes * removal of LLVM_LIBRARY_VISIBILITY --- .../llvm}/Transforms/Coroutines/ABI.h | 17 ++-- .../llvm}/Transforms/Coroutines/CoroInstr.h | 88 +++++++++---------- .../llvm}/Transforms/Coroutines/CoroShape.h | 4 +- .../llvm/Transforms/Coroutines/CoroSplit.h | 1 + .../Coroutines/MaterializationUtils.h | 9 +- .../llvm}/Transforms/Coroutines/SpillUtils.h | 5 +- .../Coroutines/SuspendCrossingInfo.h | 6 +- llvm/lib/Transforms/Coroutines/CoroEarly.cpp | 2 +- llvm/lib/Transforms/Coroutines/CoroFrame.cpp | 9 +- llvm/lib/Transforms/Coroutines/CoroInternal.h | 4 +- llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 6 +- llvm/lib/Transforms/Coroutines/Coroutines.cpp | 6 +- .../Coroutines/MaterializationUtils.cpp | 5 +- llvm/lib/Transforms/Coroutines/SpillUtils.cpp | 3 +- .../Coroutines/SuspendCrossingInfo.cpp | 2 +- .../Transforms/Coroutines/ExtraRematTest.cpp | 1 + 16 files changed, 85 insertions(+), 83 deletions(-) rename llvm/{lib => include/llvm}/Transforms/Coroutines/ABI.h (89%) rename llvm/{lib => include/llvm}/Transforms/Coroutines/CoroInstr.h (90%) rename llvm/{lib => include/llvm}/Transforms/Coroutines/CoroShape.h (99%) rename llvm/{lib => include/llvm}/Transforms/Coroutines/MaterializationUtils.h (76%) rename llvm/{lib => include/llvm}/Transforms/Coroutines/SpillUtils.h (93%) rename llvm/{lib => include/llvm}/Transforms/Coroutines/SuspendCrossingInfo.h (97%) diff --git a/llvm/lib/Transforms/Coroutines/ABI.h b/llvm/include/llvm/Transforms/Coroutines/ABI.h similarity index 89% rename from llvm/lib/Transforms/Coroutines/ABI.h rename to llvm/include/llvm/Transforms/Coroutines/ABI.h index 7fa835e84ca33..e7568d275c161 100644 --- a/llvm/lib/Transforms/Coroutines/ABI.h +++ b/llvm/include/llvm/Transforms/Coroutines/ABI.h @@ -12,12 +12,13 @@ // ABI enum and ABI class are used by the Coroutine passes when lowering. //===----------------------------------------------------------------------===// -#ifndef LIB_TRANSFORMS_COROUTINES_ABI_H -#define LIB_TRANSFORMS_COROUTINES_ABI_H +#ifndef LLVM_TRANSFORMS_COROUTINES_ABI_H +#define LLVM_TRANSFORMS_COROUTINES_ABI_H -#include "CoroShape.h" -#include "SuspendCrossingInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Transforms/Coroutines/CoroShape.h" +#include "llvm/Transforms/Coroutines/MaterializationUtils.h" +#include "llvm/Transforms/Coroutines/SuspendCrossingInfo.h" namespace llvm { @@ -30,7 +31,7 @@ namespace coro { // ABI operations. The ABIs (e.g. Switch, Async, Retcon{Once}) are the common // ABIs. 
-class LLVM_LIBRARY_VISIBILITY BaseABI { +class BaseABI { public: BaseABI(Function &F, coro::Shape &S, std::function IsMaterializable) @@ -56,7 +57,7 @@ class LLVM_LIBRARY_VISIBILITY BaseABI { std::function IsMaterializable; }; -class LLVM_LIBRARY_VISIBILITY SwitchABI : public BaseABI { +class SwitchABI : public BaseABI { public: SwitchABI(Function &F, coro::Shape &S, std::function IsMaterializable) @@ -69,7 +70,7 @@ class LLVM_LIBRARY_VISIBILITY SwitchABI : public BaseABI { TargetTransformInfo &TTI) override; }; -class LLVM_LIBRARY_VISIBILITY AsyncABI : public BaseABI { +class AsyncABI : public BaseABI { public: AsyncABI(Function &F, coro::Shape &S, std::function IsMaterializable) @@ -82,7 +83,7 @@ class LLVM_LIBRARY_VISIBILITY AsyncABI : public BaseABI { TargetTransformInfo &TTI) override; }; -class LLVM_LIBRARY_VISIBILITY AnyRetconABI : public BaseABI { +class AnyRetconABI : public BaseABI { public: AnyRetconABI(Function &F, coro::Shape &S, std::function IsMaterializable) diff --git a/llvm/lib/Transforms/Coroutines/CoroInstr.h b/llvm/include/llvm/Transforms/Coroutines/CoroInstr.h similarity index 90% rename from llvm/lib/Transforms/Coroutines/CoroInstr.h rename to llvm/include/llvm/Transforms/Coroutines/CoroInstr.h index a31703fe01304..a329a06bf1389 100644 --- a/llvm/lib/Transforms/Coroutines/CoroInstr.h +++ b/llvm/include/llvm/Transforms/Coroutines/CoroInstr.h @@ -22,8 +22,8 @@ // the Coroutine library. //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TRANSFORMS_COROUTINES_COROINSTR_H -#define LLVM_LIB_TRANSFORMS_COROUTINES_COROINSTR_H +#ifndef LLVM_TRANSFORMS_COROUTINES_COROINSTR_H +#define LLVM_TRANSFORMS_COROUTINES_COROINSTR_H #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IntrinsicInst.h" @@ -32,7 +32,7 @@ namespace llvm { /// This class represents the llvm.coro.subfn.addr instruction. -class LLVM_LIBRARY_VISIBILITY CoroSubFnInst : public IntrinsicInst { +class CoroSubFnInst : public IntrinsicInst { enum { FrameArg, IndexArg }; public: @@ -67,7 +67,7 @@ class LLVM_LIBRARY_VISIBILITY CoroSubFnInst : public IntrinsicInst { }; /// This represents the llvm.coro.alloc instruction. -class LLVM_LIBRARY_VISIBILITY CoroAllocInst : public IntrinsicInst { +class CoroAllocInst : public IntrinsicInst { public: // Methods to support type inquiry through isa, cast, and dyn_cast: static bool classof(const IntrinsicInst *I) { @@ -82,7 +82,7 @@ class LLVM_LIBRARY_VISIBILITY CoroAllocInst : public IntrinsicInst { // FIXME: add callback metadata // FIXME: make a proper IntrinisicInst. Currently this is not possible, // because llvm.coro.await.suspend.* can be invoked. -class LLVM_LIBRARY_VISIBILITY CoroAwaitSuspendInst : public CallBase { +class CoroAwaitSuspendInst : public CallBase { enum { AwaiterArg, FrameArg, WrapperArg }; public: @@ -112,7 +112,7 @@ class LLVM_LIBRARY_VISIBILITY CoroAwaitSuspendInst : public CallBase { }; /// This represents a common base class for llvm.coro.id instructions. -class LLVM_LIBRARY_VISIBILITY AnyCoroIdInst : public IntrinsicInst { +class AnyCoroIdInst : public IntrinsicInst { public: CoroAllocInst *getCoroAlloc() { for (User *U : users()) @@ -143,7 +143,7 @@ class LLVM_LIBRARY_VISIBILITY AnyCoroIdInst : public IntrinsicInst { }; /// This represents the llvm.coro.id instruction. 
-class LLVM_LIBRARY_VISIBILITY CoroIdInst : public AnyCoroIdInst { +class CoroIdInst : public AnyCoroIdInst { enum { AlignArg, PromiseArg, CoroutineArg, InfoArg }; public: @@ -232,7 +232,7 @@ class LLVM_LIBRARY_VISIBILITY CoroIdInst : public AnyCoroIdInst { /// This represents either the llvm.coro.id.retcon or /// llvm.coro.id.retcon.once instruction. -class LLVM_LIBRARY_VISIBILITY AnyCoroIdRetconInst : public AnyCoroIdInst { +class AnyCoroIdRetconInst : public AnyCoroIdInst { enum { SizeArg, AlignArg, StorageArg, PrototypeArg, AllocArg, DeallocArg }; public: @@ -246,9 +246,7 @@ class LLVM_LIBRARY_VISIBILITY AnyCoroIdRetconInst : public AnyCoroIdInst { return cast(getArgOperand(AlignArg))->getAlignValue(); } - Value *getStorage() const { - return getArgOperand(StorageArg); - } + Value *getStorage() const { return getArgOperand(StorageArg); } /// Return the prototype for the continuation function. The type, /// attributes, and calling convention of the continuation function(s) @@ -270,8 +268,8 @@ class LLVM_LIBRARY_VISIBILITY AnyCoroIdRetconInst : public AnyCoroIdInst { // Methods to support type inquiry through isa, cast, and dyn_cast: static bool classof(const IntrinsicInst *I) { auto ID = I->getIntrinsicID(); - return ID == Intrinsic::coro_id_retcon - || ID == Intrinsic::coro_id_retcon_once; + return ID == Intrinsic::coro_id_retcon || + ID == Intrinsic::coro_id_retcon_once; } static bool classof(const Value *V) { return isa(V) && classof(cast(V)); @@ -279,8 +277,7 @@ class LLVM_LIBRARY_VISIBILITY AnyCoroIdRetconInst : public AnyCoroIdInst { }; /// This represents the llvm.coro.id.retcon instruction. -class LLVM_LIBRARY_VISIBILITY CoroIdRetconInst - : public AnyCoroIdRetconInst { +class CoroIdRetconInst : public AnyCoroIdRetconInst { public: // Methods to support type inquiry through isa, cast, and dyn_cast: static bool classof(const IntrinsicInst *I) { @@ -292,8 +289,7 @@ class LLVM_LIBRARY_VISIBILITY CoroIdRetconInst }; /// This represents the llvm.coro.id.retcon.once instruction. -class LLVM_LIBRARY_VISIBILITY CoroIdRetconOnceInst - : public AnyCoroIdRetconInst { +class CoroIdRetconOnceInst : public AnyCoroIdRetconInst { public: // Methods to support type inquiry through isa, cast, and dyn_cast: static bool classof(const IntrinsicInst *I) { @@ -305,7 +301,7 @@ class LLVM_LIBRARY_VISIBILITY CoroIdRetconOnceInst }; /// This represents the llvm.coro.id.async instruction. -class LLVM_LIBRARY_VISIBILITY CoroIdAsyncInst : public AnyCoroIdInst { +class CoroIdAsyncInst : public AnyCoroIdInst { enum { SizeArg, AlignArg, StorageArg, AsyncFuncPtrArg }; public: @@ -356,7 +352,7 @@ class LLVM_LIBRARY_VISIBILITY CoroIdAsyncInst : public AnyCoroIdInst { }; /// This represents the llvm.coro.context.alloc instruction. -class LLVM_LIBRARY_VISIBILITY CoroAsyncContextAllocInst : public IntrinsicInst { +class CoroAsyncContextAllocInst : public IntrinsicInst { enum { AsyncFuncPtrArg }; public: @@ -375,8 +371,7 @@ class LLVM_LIBRARY_VISIBILITY CoroAsyncContextAllocInst : public IntrinsicInst { }; /// This represents the llvm.coro.context.dealloc instruction. -class LLVM_LIBRARY_VISIBILITY CoroAsyncContextDeallocInst - : public IntrinsicInst { +class CoroAsyncContextDeallocInst : public IntrinsicInst { enum { AsyncContextArg }; public: @@ -396,7 +391,7 @@ class LLVM_LIBRARY_VISIBILITY CoroAsyncContextDeallocInst /// This represents the llvm.coro.async.resume instruction. /// During lowering this is replaced by the resume function of a suspend point /// (the continuation function). 
-class LLVM_LIBRARY_VISIBILITY CoroAsyncResumeInst : public IntrinsicInst { +class CoroAsyncResumeInst : public IntrinsicInst { public: // Methods to support type inquiry through isa, cast, and dyn_cast: static bool classof(const IntrinsicInst *I) { @@ -408,7 +403,7 @@ class LLVM_LIBRARY_VISIBILITY CoroAsyncResumeInst : public IntrinsicInst { }; /// This represents the llvm.coro.async.size.replace instruction. -class LLVM_LIBRARY_VISIBILITY CoroAsyncSizeReplace : public IntrinsicInst { +class CoroAsyncSizeReplace : public IntrinsicInst { public: // Methods to support type inquiry through isa, cast, and dyn_cast: static bool classof(const IntrinsicInst *I) { @@ -420,7 +415,7 @@ class LLVM_LIBRARY_VISIBILITY CoroAsyncSizeReplace : public IntrinsicInst { }; /// This represents the llvm.coro.frame instruction. -class LLVM_LIBRARY_VISIBILITY CoroFrameInst : public IntrinsicInst { +class CoroFrameInst : public IntrinsicInst { public: // Methods to support type inquiry through isa, cast, and dyn_cast: static bool classof(const IntrinsicInst *I) { @@ -432,7 +427,7 @@ class LLVM_LIBRARY_VISIBILITY CoroFrameInst : public IntrinsicInst { }; /// This represents the llvm.coro.free instruction. -class LLVM_LIBRARY_VISIBILITY CoroFreeInst : public IntrinsicInst { +class CoroFreeInst : public IntrinsicInst { enum { IdArg, FrameArg }; public: @@ -447,8 +442,8 @@ class LLVM_LIBRARY_VISIBILITY CoroFreeInst : public IntrinsicInst { } }; -/// This class represents the llvm.coro.begin instruction. -class LLVM_LIBRARY_VISIBILITY CoroBeginInst : public IntrinsicInst { +/// This class represents the llvm.coro.begin instructions. +class CoroBeginInst : public IntrinsicInst { enum { IdArg, MemArg }; public: @@ -468,7 +463,7 @@ class LLVM_LIBRARY_VISIBILITY CoroBeginInst : public IntrinsicInst { }; /// This represents the llvm.coro.save instruction. -class LLVM_LIBRARY_VISIBILITY CoroSaveInst : public IntrinsicInst { +class CoroSaveInst : public IntrinsicInst { public: // Methods to support type inquiry through isa, cast, and dyn_cast: static bool classof(const IntrinsicInst *I) { @@ -480,7 +475,7 @@ class LLVM_LIBRARY_VISIBILITY CoroSaveInst : public IntrinsicInst { }; /// This represents the llvm.coro.promise instruction. -class LLVM_LIBRARY_VISIBILITY CoroPromiseInst : public IntrinsicInst { +class CoroPromiseInst : public IntrinsicInst { enum { FrameArg, AlignArg, FromArg }; public: @@ -505,7 +500,7 @@ class LLVM_LIBRARY_VISIBILITY CoroPromiseInst : public IntrinsicInst { } }; -class LLVM_LIBRARY_VISIBILITY AnyCoroSuspendInst : public IntrinsicInst { +class AnyCoroSuspendInst : public IntrinsicInst { public: CoroSaveInst *getCoroSave() const; @@ -521,7 +516,7 @@ class LLVM_LIBRARY_VISIBILITY AnyCoroSuspendInst : public IntrinsicInst { }; /// This represents the llvm.coro.suspend instruction. -class LLVM_LIBRARY_VISIBILITY CoroSuspendInst : public AnyCoroSuspendInst { +class CoroSuspendInst : public AnyCoroSuspendInst { enum { SaveArg, FinalArg }; public: @@ -553,7 +548,7 @@ inline CoroSaveInst *AnyCoroSuspendInst::getCoroSave() const { } /// This represents the llvm.coro.suspend.async instruction. -class LLVM_LIBRARY_VISIBILITY CoroSuspendAsyncInst : public AnyCoroSuspendInst { +class CoroSuspendAsyncInst : public AnyCoroSuspendInst { public: enum { StorageArgNoArg, @@ -594,7 +589,7 @@ class LLVM_LIBRARY_VISIBILITY CoroSuspendAsyncInst : public AnyCoroSuspendInst { }; /// This represents the llvm.coro.suspend.retcon instruction. 
-class LLVM_LIBRARY_VISIBILITY CoroSuspendRetconInst : public AnyCoroSuspendInst { +class CoroSuspendRetconInst : public AnyCoroSuspendInst { public: op_iterator value_begin() { return arg_begin(); } const_op_iterator value_begin() const { return arg_begin(); } @@ -619,7 +614,7 @@ class LLVM_LIBRARY_VISIBILITY CoroSuspendRetconInst : public AnyCoroSuspendInst }; /// This represents the llvm.coro.size instruction. -class LLVM_LIBRARY_VISIBILITY CoroSizeInst : public IntrinsicInst { +class CoroSizeInst : public IntrinsicInst { public: // Methods to support type inquiry through isa, cast, and dyn_cast: static bool classof(const IntrinsicInst *I) { @@ -631,7 +626,7 @@ class LLVM_LIBRARY_VISIBILITY CoroSizeInst : public IntrinsicInst { }; /// This represents the llvm.coro.align instruction. -class LLVM_LIBRARY_VISIBILITY CoroAlignInst : public IntrinsicInst { +class CoroAlignInst : public IntrinsicInst { public: // Methods to support type inquiry through isa, cast, and dyn_cast: static bool classof(const IntrinsicInst *I) { @@ -643,7 +638,7 @@ class LLVM_LIBRARY_VISIBILITY CoroAlignInst : public IntrinsicInst { }; /// This represents the llvm.end.results instruction. -class LLVM_LIBRARY_VISIBILITY CoroEndResults : public IntrinsicInst { +class CoroEndResults : public IntrinsicInst { public: op_iterator retval_begin() { return arg_begin(); } const_op_iterator retval_begin() const { return arg_begin(); } @@ -671,7 +666,7 @@ class LLVM_LIBRARY_VISIBILITY CoroEndResults : public IntrinsicInst { } }; -class LLVM_LIBRARY_VISIBILITY AnyCoroEndInst : public IntrinsicInst { +class AnyCoroEndInst : public IntrinsicInst { enum { FrameArg, UnwindArg, TokenArg }; public: @@ -700,7 +695,7 @@ class LLVM_LIBRARY_VISIBILITY AnyCoroEndInst : public IntrinsicInst { }; /// This represents the llvm.coro.end instruction. -class LLVM_LIBRARY_VISIBILITY CoroEndInst : public AnyCoroEndInst { +class CoroEndInst : public AnyCoroEndInst { public: // Methods to support type inquiry through isa, cast, and dyn_cast: static bool classof(const IntrinsicInst *I) { @@ -712,7 +707,7 @@ class LLVM_LIBRARY_VISIBILITY CoroEndInst : public AnyCoroEndInst { }; /// This represents the llvm.coro.end instruction. -class LLVM_LIBRARY_VISIBILITY CoroAsyncEndInst : public AnyCoroEndInst { +class CoroAsyncEndInst : public AnyCoroEndInst { enum { FrameArg, UnwindArg, MustTailCallFuncArg }; public: @@ -736,12 +731,11 @@ class LLVM_LIBRARY_VISIBILITY CoroAsyncEndInst : public AnyCoroEndInst { }; /// This represents the llvm.coro.alloca.alloc instruction. -class LLVM_LIBRARY_VISIBILITY CoroAllocaAllocInst : public IntrinsicInst { +class CoroAllocaAllocInst : public IntrinsicInst { enum { SizeArg, AlignArg }; + public: - Value *getSize() const { - return getArgOperand(SizeArg); - } + Value *getSize() const { return getArgOperand(SizeArg); } Align getAlignment() const { return cast(getArgOperand(AlignArg))->getAlignValue(); } @@ -756,8 +750,9 @@ class LLVM_LIBRARY_VISIBILITY CoroAllocaAllocInst : public IntrinsicInst { }; /// This represents the llvm.coro.alloca.get instruction. -class LLVM_LIBRARY_VISIBILITY CoroAllocaGetInst : public IntrinsicInst { +class CoroAllocaGetInst : public IntrinsicInst { enum { AllocArg }; + public: CoroAllocaAllocInst *getAlloc() const { return cast(getArgOperand(AllocArg)); @@ -773,8 +768,9 @@ class LLVM_LIBRARY_VISIBILITY CoroAllocaGetInst : public IntrinsicInst { }; /// This represents the llvm.coro.alloca.free instruction. 
-class LLVM_LIBRARY_VISIBILITY CoroAllocaFreeInst : public IntrinsicInst { +class CoroAllocaFreeInst : public IntrinsicInst { enum { AllocArg }; + public: CoroAllocaAllocInst *getAlloc() const { return cast(getArgOperand(AllocArg)); @@ -791,4 +787,4 @@ class LLVM_LIBRARY_VISIBILITY CoroAllocaFreeInst : public IntrinsicInst { } // End namespace llvm. -#endif +#endif // LLVM_TRANSFORMS_COROUTINES_COROINSTR_H diff --git a/llvm/lib/Transforms/Coroutines/CoroShape.h b/llvm/include/llvm/Transforms/Coroutines/CoroShape.h similarity index 99% rename from llvm/lib/Transforms/Coroutines/CoroShape.h rename to llvm/include/llvm/Transforms/Coroutines/CoroShape.h index 7daa03beb2542..ea93ced1ce29e 100644 --- a/llvm/lib/Transforms/Coroutines/CoroShape.h +++ b/llvm/include/llvm/Transforms/Coroutines/CoroShape.h @@ -12,9 +12,9 @@ #ifndef LLVM_TRANSFORMS_COROUTINES_COROSHAPE_H #define LLVM_TRANSFORMS_COROUTINES_COROSHAPE_H -#include "CoroInstr.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/PassManager.h" +#include "llvm/Transforms/Coroutines/CoroInstr.h" namespace llvm { @@ -49,7 +49,7 @@ enum class ABI { // Holds structural Coroutine Intrinsics for a particular function and other // values used during CoroSplit pass. -struct LLVM_LIBRARY_VISIBILITY Shape { +struct Shape { CoroBeginInst *CoroBegin = nullptr; SmallVector CoroEnds; SmallVector CoroSizes; diff --git a/llvm/include/llvm/Transforms/Coroutines/CoroSplit.h b/llvm/include/llvm/Transforms/Coroutines/CoroSplit.h index 52b6c4918ada9..a5fd57f8f9dfa 100644 --- a/llvm/include/llvm/Transforms/Coroutines/CoroSplit.h +++ b/llvm/include/llvm/Transforms/Coroutines/CoroSplit.h @@ -18,6 +18,7 @@ #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/LazyCallGraph.h" #include "llvm/IR/PassManager.h" +#include "llvm/Transforms/Coroutines/ABI.h" namespace llvm { diff --git a/llvm/lib/Transforms/Coroutines/MaterializationUtils.h b/llvm/include/llvm/Transforms/Coroutines/MaterializationUtils.h similarity index 76% rename from llvm/lib/Transforms/Coroutines/MaterializationUtils.h rename to llvm/include/llvm/Transforms/Coroutines/MaterializationUtils.h index f391851c97b3b..d8fc0c86a6fb5 100644 --- a/llvm/lib/Transforms/Coroutines/MaterializationUtils.h +++ b/llvm/include/llvm/Transforms/Coroutines/MaterializationUtils.h @@ -6,11 +6,10 @@ // //===----------------------------------------------------------------------===// -#include "SuspendCrossingInfo.h" -#include "llvm/IR/Instruction.h" +#include "llvm/Transforms/Coroutines/SuspendCrossingInfo.h" -#ifndef LIB_TRANSFORMS_COROUTINES_MATERIALIZATIONUTILS_H -#define LIB_TRANSFORMS_COROUTINES_MATERIALIZATIONUTILS_H +#ifndef LLVM_TRANSFORMS_COROUTINES_MATERIALIZATIONUTILS_H +#define LLVM_TRANSFORMS_COROUTINES_MATERIALIZATIONUTILS_H namespace llvm { @@ -27,4 +26,4 @@ void doRematerializations(Function &F, SuspendCrossingInfo &Checker, } // namespace llvm -#endif // LIB_TRANSFORMS_COROUTINES_MATERIALIZATIONUTILS_H +#endif // LLVM_TRANSFORMS_COROUTINES_MATERIALIZATIONUTILS_H diff --git a/llvm/lib/Transforms/Coroutines/SpillUtils.h b/llvm/include/llvm/Transforms/Coroutines/SpillUtils.h similarity index 93% rename from llvm/lib/Transforms/Coroutines/SpillUtils.h rename to llvm/include/llvm/Transforms/Coroutines/SpillUtils.h index 8843b611e0842..6cdf83c0603f4 100644 --- a/llvm/lib/Transforms/Coroutines/SpillUtils.h +++ b/llvm/include/llvm/Transforms/Coroutines/SpillUtils.h @@ -6,8 +6,9 @@ // //===----------------------------------------------------------------------===// -#include "CoroInternal.h" -#include 
"SuspendCrossingInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/Transforms/Coroutines/CoroShape.h" +#include "llvm/Transforms/Coroutines/SuspendCrossingInfo.h" #ifndef LLVM_TRANSFORMS_COROUTINES_SPILLINGINFO_H #define LLVM_TRANSFORMS_COROUTINES_SPILLINGINFO_H diff --git a/llvm/lib/Transforms/Coroutines/SuspendCrossingInfo.h b/llvm/include/llvm/Transforms/Coroutines/SuspendCrossingInfo.h similarity index 97% rename from llvm/lib/Transforms/Coroutines/SuspendCrossingInfo.h rename to llvm/include/llvm/Transforms/Coroutines/SuspendCrossingInfo.h index db889966dcf1d..49cae6dde47e5 100644 --- a/llvm/lib/Transforms/Coroutines/SuspendCrossingInfo.h +++ b/llvm/include/llvm/Transforms/Coroutines/SuspendCrossingInfo.h @@ -12,16 +12,16 @@ // ptrs in the BlockToIndexMapping. //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TRANSFORMS_COROUTINES_SUSPENDCROSSINGINFO_H -#define LLVM_LIB_TRANSFORMS_COROUTINES_SUSPENDCROSSINGINFO_H +#ifndef LLVM_TRANSFORMS_COROUTINES_SUSPENDCROSSINGINFO_H +#define LLVM_TRANSFORMS_COROUTINES_SUSPENDCROSSINGINFO_H -#include "CoroInstr.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallVector.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instruction.h" +#include "llvm/Transforms/Coroutines/CoroInstr.h" namespace llvm { diff --git a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp index 093a81648c92e..a3674306f3e10 100644 --- a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp @@ -8,12 +8,12 @@ #include "llvm/Transforms/Coroutines/CoroEarly.h" #include "CoroInternal.h" -#include "CoroShape.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Module.h" +#include "llvm/Transforms/Coroutines/CoroShape.h" using namespace llvm; diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index 2b43b7a5d027d..021fcc20c1f18 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -15,11 +15,7 @@ // the value into the coroutine frame. 
//===----------------------------------------------------------------------===// -#include "ABI.h" #include "CoroInternal.h" -#include "MaterializationUtils.h" -#include "SpillUtils.h" -#include "SuspendCrossingInfo.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallString.h" @@ -33,6 +29,11 @@ #include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Support/OptimizedStructLayout.h" +#include "llvm/Transforms/Coroutines/ABI.h" +#include "llvm/Transforms/Coroutines/CoroInstr.h" +#include "llvm/Transforms/Coroutines/MaterializationUtils.h" +#include "llvm/Transforms/Coroutines/SpillUtils.h" +#include "llvm/Transforms/Coroutines/SuspendCrossingInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h index 88d0f83c98c9e..a0b52063aca10 100644 --- a/llvm/lib/Transforms/Coroutines/CoroInternal.h +++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h @@ -11,10 +11,10 @@ #ifndef LLVM_LIB_TRANSFORMS_COROUTINES_COROINTERNAL_H #define LLVM_LIB_TRANSFORMS_COROUTINES_COROINTERNAL_H -#include "CoroInstr.h" -#include "CoroShape.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/Transforms/Coroutines/CoroInstr.h" +#include "llvm/Transforms/Coroutines/CoroShape.h" namespace llvm { diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 9aed4f6522a3f..ef1f27118bc14 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -19,10 +19,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Coroutines/CoroSplit.h" -#include "ABI.h" -#include "CoroInstr.h" #include "CoroInternal.h" -#include "MaterializationUtils.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PriorityWorklist.h" #include "llvm/ADT/SmallPtrSet.h" @@ -64,6 +61,9 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/PrettyStackTrace.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Coroutines/ABI.h" +#include "llvm/Transforms/Coroutines/CoroInstr.h" +#include "llvm/Transforms/Coroutines/MaterializationUtils.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/CallGraphUpdater.h" diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp index 184efbfe903d2..f4d9a7a8aa856 100644 --- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -10,10 +10,7 @@ // //===----------------------------------------------------------------------===// -#include "ABI.h" -#include "CoroInstr.h" #include "CoroInternal.h" -#include "CoroShape.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/CallGraph.h" @@ -29,6 +26,9 @@ #include "llvm/IR/Type.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Transforms/Coroutines/ABI.h" +#include "llvm/Transforms/Coroutines/CoroInstr.h" +#include "llvm/Transforms/Coroutines/CoroShape.h" #include "llvm/Transforms/Utils/Local.h" #include #include diff --git a/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp b/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp index 708e8734175f9..c3ea0977d4211 
100644 --- a/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp +++ b/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp @@ -9,12 +9,13 @@ // This file contains classes used to materialize insts after suspends points. //===----------------------------------------------------------------------===// -#include "MaterializationUtils.h" -#include "SpillUtils.h" +#include "llvm/Transforms/Coroutines/MaterializationUtils.h" +#include "CoroInternal.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" +#include "llvm/Transforms/Coroutines/SpillUtils.h" #include using namespace llvm; diff --git a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp index 96b5c8440e5f9..e8609af992122 100644 --- a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp +++ b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp @@ -6,7 +6,8 @@ // //===----------------------------------------------------------------------===// -#include "SpillUtils.h" +#include "llvm/Transforms/Coroutines/SpillUtils.h" +#include "CoroInternal.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/PtrUseVisitor.h" #include "llvm/IR/CFG.h" diff --git a/llvm/lib/Transforms/Coroutines/SuspendCrossingInfo.cpp b/llvm/lib/Transforms/Coroutines/SuspendCrossingInfo.cpp index 84699e653db60..f18f23306befb 100644 --- a/llvm/lib/Transforms/Coroutines/SuspendCrossingInfo.cpp +++ b/llvm/lib/Transforms/Coroutines/SuspendCrossingInfo.cpp @@ -12,7 +12,7 @@ // ptrs in the BlockToIndexMapping. //===----------------------------------------------------------------------===// -#include "SuspendCrossingInfo.h" +#include "llvm/Transforms/Coroutines/SuspendCrossingInfo.h" // The "coro-suspend-crossing" flag is very noisy. There is another debug type, // "coro-frame", which results in leaner debug spew. diff --git a/llvm/unittests/Transforms/Coroutines/ExtraRematTest.cpp b/llvm/unittests/Transforms/Coroutines/ExtraRematTest.cpp index 9aab3cfd9cf10..1d55889a32d7a 100644 --- a/llvm/unittests/Transforms/Coroutines/ExtraRematTest.cpp +++ b/llvm/unittests/Transforms/Coroutines/ExtraRematTest.cpp @@ -11,6 +11,7 @@ #include "llvm/Passes/PassBuilder.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Testing/Support/Error.h" +#include "llvm/Transforms/Coroutines/ABI.h" #include "llvm/Transforms/Coroutines/CoroSplit.h" #include "gtest/gtest.h" From 9200adee266b5bfaa468c5ce2715ed9794e1a7a8 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 9 Oct 2024 14:58:13 -0400 Subject: [PATCH 056/119] [libc++] Narrow the exports for common_type (#111681) Based on a comment in #99473, it seems like `export *` may be overkill. --- libcxx/include/module.modulemap | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 22a1313498e73..3ea91274a9cc9 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -73,9 +73,9 @@ module std_core [system] { module common_reference { header "__type_traits/common_reference.h" } module common_type { header "__type_traits/common_type.h" - // We need to export everything from this module because common_type inherits from __builtin_common_type, - // which needs to be re-exported. - export * + // We need to export those because common_type inherits from either of those based on __builtin_common_type. 
+ export std_core.type_traits.type_identity + export std_core.utility_core.empty } module conditional { header "__type_traits/conditional.h" } module conjunction { header "__type_traits/conjunction.h" } From 749e21860061a1b317916579252a864b92a134d7 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 9 Oct 2024 15:04:56 -0400 Subject: [PATCH 057/119] [libc++][NFC] Remove obsolete --osx-roots parameter to run-buildbot That isn't used anymore since we now run backdeployment testing on the target system directly instead of using pre-packaged roots. --- libcxx/utils/ci/run-buildbot | 8 -------- 1 file changed, 8 deletions(-) diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot index e040f15acc3da..536d627036130 100755 --- a/libcxx/utils/ci/run-buildbot +++ b/libcxx/utils/ci/run-buildbot @@ -28,10 +28,6 @@ ${PROGNAME} [options] --build-dir The directory to use for building the library. By default, this is '/build/'. ---osx-roots Path to pre-downloaded macOS dylibs. By default, we download - them from Green Dragon. This is only relevant at all when - running back-deployment testing if one wants to override - the old dylibs we use to run the tests with different ones. Environment variables CC The C compiler to use, this value is used by CMake. This variable is optional. @@ -66,10 +62,6 @@ while [[ $# -gt 0 ]]; do BUILD_DIR="${2}" shift; shift ;; - --osx-roots) - OSX_ROOTS="${2}" - shift; shift - ;; *) BUILDER="${1}" shift From 4605ba0437728ecf8233ba6dbb52ffba30a22743 Mon Sep 17 00:00:00 2001 From: Tarun Prabhu Date: Wed, 9 Oct 2024 13:09:17 -0600 Subject: [PATCH 058/119] [flang] Link libflangPasses against correct libraries libflangPasses.so was not linked against the correct libraries which caused a build failure with -DBUILD_SHARED_LIBS=On. Fixes #110425 --- flang/lib/Optimizer/Passes/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/flang/lib/Optimizer/Passes/CMakeLists.txt b/flang/lib/Optimizer/Passes/CMakeLists.txt index 3df988940e005..40abbdfbdd651 100644 --- a/flang/lib/Optimizer/Passes/CMakeLists.txt +++ b/flang/lib/Optimizer/Passes/CMakeLists.txt @@ -1,3 +1,6 @@ +get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) +get_property(extension_libs GLOBAL PROPERTY MLIR_EXTENSION_LIBS) + add_flang_library(flangPasses CommandLineOpts.cpp Pipelines.cpp @@ -9,6 +12,8 @@ add_flang_library(flangPasses FIRCodeGen FIRTransforms FlangOpenMPTransforms + ${dialect_libs} + ${extension_libs} FortranCommon HLFIRTransforms MLIRPass From e0737174f944e6da2d3052e57de04ad93503956b Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Wed, 9 Oct 2024 12:18:53 -0700 Subject: [PATCH 059/119] [libc++abi] Rename abort_message to __abort_message (#111413) This is an internal API and the name should reflect that. This is a reland of #108887. 
--- libcxxabi/src/abort_message.cpp | 2 +- libcxxabi/src/abort_message.h | 4 ++-- libcxxabi/src/cxa_default_handlers.cpp | 12 ++++++------ libcxxabi/src/cxa_exception_storage.cpp | 10 +++++----- libcxxabi/src/cxa_guard_impl.h | 2 +- libcxxabi/src/cxa_handlers.cpp | 6 +++--- libcxxabi/src/cxa_thread_atexit.cpp | 2 +- libcxxabi/src/cxa_vector.cpp | 2 +- libcxxabi/src/cxa_virtual.cpp | 4 ++-- libcxxabi/src/demangle/DemangleConfig.h | 2 +- libcxxabi/src/stdlib_new_delete.cpp | 4 ++-- 11 files changed, 25 insertions(+), 25 deletions(-) diff --git a/libcxxabi/src/abort_message.cpp b/libcxxabi/src/abort_message.cpp index 859a5031b93fe..9e5a984807e2c 100644 --- a/libcxxabi/src/abort_message.cpp +++ b/libcxxabi/src/abort_message.cpp @@ -26,7 +26,7 @@ # define _LIBCXXABI_USE_CRASHREPORTER_CLIENT #endif -void abort_message(const char* format, ...) +void __abort_message(const char* format, ...) { // Write message to stderr. We do this before formatting into a // variable-size buffer so that we still get some information if diff --git a/libcxxabi/src/abort_message.h b/libcxxabi/src/abort_message.h index 9764177780191..2c12c429569d3 100644 --- a/libcxxabi/src/abort_message.h +++ b/libcxxabi/src/abort_message.h @@ -12,14 +12,14 @@ #include "cxxabi.h" extern "C" _LIBCXXABI_HIDDEN _LIBCXXABI_NORETURN void -abort_message(const char *format, ...) __attribute__((format(printf, 1, 2))); +__abort_message(const char *format, ...) __attribute__((format(printf, 1, 2))); #ifndef _LIBCXXABI_ASSERT # define _LIBCXXABI_ASSERT(expr, msg) \ do { \ if (!(expr)) { \ char const* __msg = (msg); \ - ::abort_message("%s:%d: %s", __FILE__, __LINE__, __msg); \ + ::__abort_message("%s:%d: %s", __FILE__, __LINE__, __msg); \ } \ } while (false) diff --git a/libcxxabi/src/cxa_default_handlers.cpp b/libcxxabi/src/cxa_default_handlers.cpp index 60e402c55b395..52b1aacae9932 100644 --- a/libcxxabi/src/cxa_default_handlers.cpp +++ b/libcxxabi/src/cxa_default_handlers.cpp @@ -30,18 +30,18 @@ static void demangling_terminate_handler() // If there is no uncaught exception, just note that we're terminating if (!globals) - abort_message("terminating"); + __abort_message("terminating"); __cxa_exception* exception_header = globals->caughtExceptions; if (!exception_header) - abort_message("terminating"); + __abort_message("terminating"); _Unwind_Exception* unwind_exception = reinterpret_cast<_Unwind_Exception*>(exception_header + 1) - 1; // If we're terminating due to a foreign exception if (!__isOurExceptionClass(unwind_exception)) - abort_message("terminating due to %s foreign exception", cause); + __abort_message("terminating due to %s foreign exception", cause); void* thrown_object = __getExceptionClass(unwind_exception) == kOurDependentExceptionClass ? 
@@ -67,19 +67,19 @@ static void demangling_terminate_handler() { // Include the what() message from the exception const std::exception* e = static_cast(thrown_object); - abort_message("terminating due to %s exception of type %s: %s", cause, name, e->what()); + __abort_message("terminating due to %s exception of type %s: %s", cause, name, e->what()); } else { // Else just note that we're terminating due to an exception - abort_message("terminating due to %s exception of type %s", cause, name); + __abort_message("terminating due to %s exception of type %s", cause, name); } } #else // !_LIBCXXABI_NO_EXCEPTIONS __attribute__((noreturn)) static void demangling_terminate_handler() { - abort_message("terminating"); + __abort_message("terminating"); } #endif // !_LIBCXXABI_NO_EXCEPTIONS diff --git a/libcxxabi/src/cxa_exception_storage.cpp b/libcxxabi/src/cxa_exception_storage.cpp index c842da195accb..733f0d470569a 100644 --- a/libcxxabi/src/cxa_exception_storage.cpp +++ b/libcxxabi/src/cxa_exception_storage.cpp @@ -61,12 +61,12 @@ namespace { void _LIBCPP_TLS_DESTRUCTOR_CC destruct_(void *p) { __free_with_fallback(p); if (0 != std::__libcpp_tls_set(key_, NULL)) - abort_message("cannot zero out thread value for __cxa_get_globals()"); + __abort_message("cannot zero out thread value for __cxa_get_globals()"); } void construct_() { if (0 != std::__libcpp_tls_create(&key_, destruct_)) - abort_message("cannot create thread specific key for __cxa_get_globals()"); + __abort_message("cannot create thread specific key for __cxa_get_globals()"); } } // namespace @@ -80,9 +80,9 @@ extern "C" { retVal = static_cast<__cxa_eh_globals*>( __calloc_with_fallback(1, sizeof(__cxa_eh_globals))); if (NULL == retVal) - abort_message("cannot allocate __cxa_eh_globals"); + __abort_message("cannot allocate __cxa_eh_globals"); if (0 != std::__libcpp_tls_set(key_, retVal)) - abort_message("std::__libcpp_tls_set failure in __cxa_get_globals()"); + __abort_message("std::__libcpp_tls_set failure in __cxa_get_globals()"); } return retVal; } @@ -94,7 +94,7 @@ extern "C" { __cxa_eh_globals *__cxa_get_globals_fast() { // First time through, create the key. if (0 != std::__libcpp_execute_once(&flag_, construct_)) - abort_message("execute once failure in __cxa_get_globals_fast()"); + __abort_message("execute once failure in __cxa_get_globals_fast()"); return static_cast<__cxa_eh_globals*>(std::__libcpp_tls_get(key_)); } } // extern "C" diff --git a/libcxxabi/src/cxa_guard_impl.h b/libcxxabi/src/cxa_guard_impl.h index 3e533054098e2..7b05bf32f3eda 100644 --- a/libcxxabi/src/cxa_guard_impl.h +++ b/libcxxabi/src/cxa_guard_impl.h @@ -91,7 +91,7 @@ // the former. #ifdef BUILDING_CXA_GUARD # include "abort_message.h" -# define ABORT_WITH_MESSAGE(...) ::abort_message(__VA_ARGS__) +# define ABORT_WITH_MESSAGE(...) ::__abort_message(__VA_ARGS__) #elif defined(TESTING_CXA_GUARD) # define ABORT_WITH_MESSAGE(...) 
::abort() #else diff --git a/libcxxabi/src/cxa_handlers.cpp b/libcxxabi/src/cxa_handlers.cpp index 344250dde0c7e..f879ff0d8ff18 100644 --- a/libcxxabi/src/cxa_handlers.cpp +++ b/libcxxabi/src/cxa_handlers.cpp @@ -33,7 +33,7 @@ __unexpected(unexpected_handler func) { func(); // unexpected handler should not return - abort_message("unexpected_handler unexpectedly returned"); + __abort_message("unexpected_handler unexpectedly returned"); } __attribute__((noreturn)) @@ -58,13 +58,13 @@ __terminate(terminate_handler func) noexcept #endif // _LIBCXXABI_NO_EXCEPTIONS func(); // handler should not return - abort_message("terminate_handler unexpectedly returned"); + __abort_message("terminate_handler unexpectedly returned"); #ifndef _LIBCXXABI_NO_EXCEPTIONS } catch (...) { // handler should not throw exception - abort_message("terminate_handler unexpectedly threw an exception"); + __abort_message("terminate_handler unexpectedly threw an exception"); } #endif // _LIBCXXABI_NO_EXCEPTIONS } diff --git a/libcxxabi/src/cxa_thread_atexit.cpp b/libcxxabi/src/cxa_thread_atexit.cpp index c6bd0aa323f2e..8546cfe48c397 100644 --- a/libcxxabi/src/cxa_thread_atexit.cpp +++ b/libcxxabi/src/cxa_thread_atexit.cpp @@ -89,7 +89,7 @@ namespace { // __cxa_thread_atexit() may be called arbitrarily late (for example, from // global destructors or atexit() handlers). if (std::__libcpp_tls_create(&dtors_key, run_dtors) != 0) { - abort_message("std::__libcpp_tls_create() failed in __cxa_thread_atexit()"); + __abort_message("std::__libcpp_tls_create() failed in __cxa_thread_atexit()"); } } diff --git a/libcxxabi/src/cxa_vector.cpp b/libcxxabi/src/cxa_vector.cpp index 17d942a6e61c7..857ee27d065c3 100644 --- a/libcxxabi/src/cxa_vector.cpp +++ b/libcxxabi/src/cxa_vector.cpp @@ -121,7 +121,7 @@ void throw_bad_array_new_length() { #ifndef _LIBCXXABI_NO_EXCEPTIONS throw std::bad_array_new_length(); #else - abort_message("__cxa_vec_new failed to allocate memory"); + __abort_message("__cxa_vec_new failed to allocate memory"); #endif } diff --git a/libcxxabi/src/cxa_virtual.cpp b/libcxxabi/src/cxa_virtual.cpp index c868672e00af9..8f4fdd0919f0e 100644 --- a/libcxxabi/src/cxa_virtual.cpp +++ b/libcxxabi/src/cxa_virtual.cpp @@ -13,12 +13,12 @@ namespace __cxxabiv1 { extern "C" { _LIBCXXABI_FUNC_VIS _LIBCXXABI_NORETURN void __cxa_pure_virtual(void) { - abort_message("Pure virtual function called!"); + __abort_message("Pure virtual function called!"); } _LIBCXXABI_FUNC_VIS _LIBCXXABI_NORETURN void __cxa_deleted_virtual(void) { - abort_message("Deleted virtual function called!"); + __abort_message("Deleted virtual function called!"); } } // extern "C" } // abi diff --git a/libcxxabi/src/demangle/DemangleConfig.h b/libcxxabi/src/demangle/DemangleConfig.h index d67d89bdb0692..06fd223f5553f 100644 --- a/libcxxabi/src/demangle/DemangleConfig.h +++ b/libcxxabi/src/demangle/DemangleConfig.h @@ -15,7 +15,7 @@ // build systems to override this value. // https://libcxx.llvm.org/UsingLibcxx.html#enabling-the-safe-libc-mode #ifndef _LIBCPP_VERBOSE_ABORT -#define _LIBCPP_VERBOSE_ABORT(...) abort_message(__VA_ARGS__) +#define _LIBCPP_VERBOSE_ABORT(...) 
__abort_message(__VA_ARGS__) #include "../abort_message.h" #endif diff --git a/libcxxabi/src/stdlib_new_delete.cpp b/libcxxabi/src/stdlib_new_delete.cpp index b802559d479e2..bd576e6aeed74 100644 --- a/libcxxabi/src/stdlib_new_delete.cpp +++ b/libcxxabi/src/stdlib_new_delete.cpp @@ -32,14 +32,14 @@ inline void __throw_bad_alloc_shim() { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw std::bad_alloc(); #else - abort_message("bad_alloc was thrown in -fno-exceptions mode"); + __abort_message("bad_alloc was thrown in -fno-exceptions mode"); #endif } #define _LIBCPP_ASSERT_SHIM(expr, str) \ do { \ if (!expr) \ - abort_message(str); \ + __abort_message(str); \ } while (false) // ------------------ BEGIN COPY ------------------ From d602f935daebce2ae6a023420133b3fa7da3c923 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Wed, 9 Oct 2024 14:22:34 -0500 Subject: [PATCH 060/119] [flang][OpenMP] Treat POINTER variables as valid variable list items (#111722) Follow-up to 418920b3fbdefec5b56ee2b9db96884d0ada7329, which started diagnosing the legality of objects in OpenMP clauses (and caused some test failures). --- flang/lib/Semantics/check-omp-structure.cpp | 10 +++++++--- flang/lib/Semantics/check-omp-structure.h | 1 + flang/test/Semantics/OpenMP/shared-pointer.f90 | 13 +++++++++++++ 3 files changed, 21 insertions(+), 3 deletions(-) create mode 100644 flang/test/Semantics/OpenMP/shared-pointer.f90 diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index cf90c92bbf3c4..a54fa14730321 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -200,8 +200,12 @@ bool OmpStructureChecker::CheckAllowedClause(llvmOmpClause clause) { return CheckAllowed(clause); } +bool OmpStructureChecker::IsVariableListItem(const Symbol &sym) { + return evaluate::IsVariable(sym) || sym.attrs().test(Attr::POINTER); +} + bool OmpStructureChecker::IsExtendedListItem(const Symbol &sym) { - return evaluate::IsVariable(sym) || sym.IsSubprogram(); + return IsVariableListItem(sym) || sym.IsSubprogram(); } bool OmpStructureChecker::IsCloselyNestedRegion(const OmpDirectiveSet &set) { @@ -2351,7 +2355,7 @@ void OmpStructureChecker::Enter(const parser::OmpClause &x) { SymbolSourceMap symbols; GetSymbolsInObjectList(*objList, symbols); for (const auto &[sym, source] : symbols) { - if (!evaluate::IsVariable(sym)) { + if (!IsVariableListItem(*sym)) { deferredNonVariables_.insert({sym, source}); } } @@ -3428,7 +3432,7 @@ void OmpStructureChecker::Enter(const parser::OmpClause::To &x) { SymbolSourceMap symbols; GetSymbolsInObjectList(objList, symbols); for (const auto &[sym, source] : symbols) { - if (!evaluate::IsVariable(*sym)) { + if (!IsVariableListItem(*sym)) { context_.SayWithDecl( *sym, source, "'%s' must be a variable"_err_en_US, sym->name()); } diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h index e6863b53ecfde..a8e60b411e184 100644 --- a/flang/lib/Semantics/check-omp-structure.h +++ b/flang/lib/Semantics/check-omp-structure.h @@ -141,6 +141,7 @@ class OmpStructureChecker private: bool CheckAllowedClause(llvmOmpClause clause); + bool IsVariableListItem(const Symbol &sym); bool IsExtendedListItem(const Symbol &sym); void CheckMultipleOccurrence(semantics::UnorderedSymbolSet &listVars, const std::list &nameList, const parser::CharBlock &item, diff --git a/flang/test/Semantics/OpenMP/shared-pointer.f90 b/flang/test/Semantics/OpenMP/shared-pointer.f90 new file mode 100644 index 
0000000000000..6826086d02a54 --- /dev/null +++ b/flang/test/Semantics/OpenMP/shared-pointer.f90 @@ -0,0 +1,13 @@ +!RUN: %flang_fc1 -fopenmp -emit-fir -o - %s | FileCheck %s +!RUN: bbc -fopenmp -emit-fir -o - %s | FileCheck %s + +!Allow POINTER variables in OpenMP SHARED clause. Check that this +!code compiles. + +!CHECK-LABEL: func.func @_QPfoo +subroutine foo() + procedure(), pointer :: pf + !$omp parallel shared(pf) + !$omp end parallel +end + From f020bf15263f71e76e8b64fd0c333fff9744beae Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 9 Oct 2024 15:22:44 -0400 Subject: [PATCH 061/119] [SLP]Initial support for non-power-of-2 (but whole reg) vectorization for stores Allows non-power-of-2 vectorization for stores, but still requires, that vectorized number of elements forms full vector registers. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/111194 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 69 ++++++++++++++----- .../SLPVectorizer/X86/long-full-reg-stores.ll | 28 ++------ 2 files changed, 58 insertions(+), 39 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 553df1c08f3ae..94de520a2715f 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -278,6 +278,22 @@ static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, return bit_ceil(divideCeil(Sz, NumParts)) * NumParts; } +/// Returns the number of elements of the given type \p Ty, not greater than \p +/// Sz, which forms type, which splits by \p TTI into whole vector types during +/// legalization. +static unsigned +getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, + unsigned Sz) { + if (!isValidElementType(Ty)) + return bit_floor(Sz); + // Find the number of elements, which forms full vectors. + unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz)); + if (NumParts == 0 || NumParts >= Sz) + return bit_floor(Sz); + unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts)); + return (Sz / RegVF) * RegVF; +} + static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl &Mask) { // The ShuffleBuilder implementation use shufflevector to splat an "element". @@ -7716,7 +7732,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } size_t NumUniqueScalarValues = UniqueValues.size(); bool IsFullVectors = hasFullVectorsOrPowerOf2( - *TTI, UniqueValues.front()->getType(), NumUniqueScalarValues); + *TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues); if (NumUniqueScalarValues == VL.size() && (VectorizeNonPowerOf2 || IsFullVectors)) { ReuseShuffleIndices.clear(); @@ -17466,7 +17482,11 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, const unsigned Sz = R.getVectorElementSize(Chain[0]); unsigned VF = Chain.size(); - if (!has_single_bit(Sz) || !has_single_bit(VF) || VF < 2 || VF < MinVF) { + if (!has_single_bit(Sz) || + !hasFullVectorsOrPowerOf2( + *TTI, cast(Chain.front())->getValueOperand()->getType(), + VF) || + VF < 2 || VF < MinVF) { // Check if vectorizing with a non-power-of-2 VF should be considered. At // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost // all vector lanes are used. 
@@ -17484,10 +17504,12 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI); if (all_of(ValOps, IsaPred) && ValOps.size() > 1) { DenseSet Stores(Chain.begin(), Chain.end()); - bool IsPowerOf2 = - has_single_bit(ValOps.size()) || + bool IsAllowedSize = + hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(), + ValOps.size()) || (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1)); - if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load && + if ((!IsAllowedSize && S.getOpcode() && + S.getOpcode() != Instruction::Load && (!S.MainOp->isSafeToRemove() || any_of(ValOps.getArrayRef(), [&](Value *V) { @@ -17498,7 +17520,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, })); }))) || (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) { - Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2; + Size = (!IsAllowedSize && S.getOpcode()) ? 1 : 2; return false; } } @@ -17626,15 +17648,11 @@ bool SLPVectorizerPass::vectorizeStores( unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts); - unsigned MaxRegVF = MaxVF; auto *Store = cast(Operands[0]); Type *StoreTy = Store->getValueOperand()->getType(); Type *ValueTy = StoreTy; if (auto *Trunc = dyn_cast(Store->getValueOperand())) ValueTy = Trunc->getSrcTy(); - if (ValueTy == StoreTy && - R.getVectorElementSize(Store->getValueOperand()) <= EltSize) - MaxVF = std::min(MaxVF, bit_floor(Operands.size())); unsigned MinVF = std::max( 2, PowerOf2Ceil(TTI->getStoreMinimumVF( R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy, @@ -17652,10 +17670,21 @@ bool SLPVectorizerPass::vectorizeStores( // First try vectorizing with a non-power-of-2 VF. At the moment, only // consider cases where VF + 1 is a power-of-2, i.e. almost all vector // lanes are used. - unsigned CandVF = - std::clamp(Operands.size(), MaxVF, MaxRegVF); - if (has_single_bit(CandVF + 1)) + unsigned CandVF = std::clamp(Operands.size(), MinVF, MaxVF); + if (has_single_bit(CandVF + 1)) { NonPowerOf2VF = CandVF; + assert(NonPowerOf2VF != MaxVF && + "Non-power-of-2 VF should not be equal to MaxVF"); + } + } + + unsigned MaxRegVF = MaxVF; + MaxVF = std::min(MaxVF, bit_floor(Operands.size())); + if (MaxVF < MinVF) { + LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF + << ") < " + << "MinVF (" << MinVF << ")\n"); + continue; } unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF); @@ -17810,7 +17839,7 @@ bool SLPVectorizerPass::vectorizeStores( std::bind(IsNotVectorized, Size >= MaxRegVF, std::placeholders::_1))); } - if (!AnyProfitableGraph && Size >= MaxRegVF) + if (!AnyProfitableGraph && Size >= MaxRegVF && has_single_bit(Size)) break; } // All values vectorized - exit. 
@@ -17823,7 +17852,7 @@ bool SLPVectorizerPass::vectorizeStores( (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph))) break; constexpr unsigned StoresLimit = 64; - const unsigned MaxTotalNum = bit_floor(std::min( + const unsigned MaxTotalNum = std::min( Operands.size(), static_cast( End - @@ -17831,8 +17860,13 @@ bool SLPVectorizerPass::vectorizeStores( RangeSizes.begin(), find_if(RangeSizes, std::bind(IsNotVectorized, true, std::placeholders::_1))) + - 1))); - unsigned VF = PowerOf2Ceil(CandidateVFs.front()) * 2; + 1)); + unsigned VF = bit_ceil(CandidateVFs.front()) * 2; + unsigned Limit = + getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum); + CandidateVFs.clear(); + if (bit_floor(Limit) == VF) + CandidateVFs.push_back(Limit); if (VF > MaxTotalNum || VF >= StoresLimit) break; for_each(RangeSizes, [&](std::pair &P) { @@ -17841,7 +17875,6 @@ bool SLPVectorizerPass::vectorizeStores( }); // Last attempt to vectorize max number of elements, if all previous // attempts were unsuccessful because of the cost issues. - CandidateVFs.clear(); CandidateVFs.push_back(VF); } } diff --git a/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll b/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll index b5f993f986c7c..aff66dd7c10ea 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll @@ -4,30 +4,16 @@ define void @test(ptr noalias %0, ptr noalias %1) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: ptr noalias [[TMP0:%.*]], ptr noalias [[TMP1:%.*]]) { -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i64 24 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP1]], i64 48 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP1]], i64 8 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP1]], i64 16 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 24 -; CHECK-NEXT: [[TMP8:%.*]] = load double, ptr [[TMP7]], align 8 -; CHECK-NEXT: store double [[TMP8]], ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP0]], i64 48 -; CHECK-NEXT: [[TMP10:%.*]] = load double, ptr [[TMP9]], align 16 -; CHECK-NEXT: store double [[TMP10]], ptr [[TMP6]], align 16 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 8 -; CHECK-NEXT: [[TMP12:%.*]] = load double, ptr [[TMP11]], align 8 -; CHECK-NEXT: store double [[TMP12]], ptr [[TMP3]], align 8 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP0]], i64 32 -; CHECK-NEXT: [[TMP14:%.*]] = load double, ptr [[TMP13]], align 16 -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP1]], i64 32 -; CHECK-NEXT: store double [[TMP14]], ptr [[TMP15]], align 16 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP0]], i64 56 -; CHECK-NEXT: [[TMP17:%.*]] = load double, ptr [[TMP16]], align 8 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP1]], i64 40 -; CHECK-NEXT: store double [[TMP17]], ptr [[TMP18]], align 8 -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP0]], i64 16 -; CHECK-NEXT: [[TMP20:%.*]] = load double, ptr [[TMP19]], align 16 -; CHECK-NEXT: store double [[TMP20]], ptr [[TMP4]], align 16 +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[TMP9]], align 16 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x double>, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <6 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <6 x i32> +; CHECK-NEXT: 
[[TMP10:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP10]], <6 x i32> +; CHECK-NEXT: store <6 x double> [[TMP13]], ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP0]], i64 40 ; CHECK-NEXT: [[TMP22:%.*]] = load double, ptr [[TMP21]], align 8 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[TMP1]], i64 56 From 74e1062e34542c6c43293da51ad6e1c7d41ced2c Mon Sep 17 00:00:00 2001 From: Zentrik Date: Wed, 9 Oct 2024 20:43:11 +0100 Subject: [PATCH 062/119] [MLIR] Don't build MLIRExecutionEngineShared on Windows (#109524) This disabled the build of `MLIRExecutionEngineShared` because this causes linkage issues in windows for currently unknown reasons. Related issue: https://github.com/llvm/llvm-project/issues/106859. --- mlir/lib/ExecutionEngine/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt index a091944b9ee7d..7fc17b97f0c56 100644 --- a/mlir/lib/ExecutionEngine/CMakeLists.txt +++ b/mlir/lib/ExecutionEngine/CMakeLists.txt @@ -97,7 +97,7 @@ add_mlir_library(MLIRExecutionEngine MLIRTargetLLVMIRExport ) -if(LLVM_BUILD_LLVM_DYLIB) +if(LLVM_BUILD_LLVM_DYLIB AND NOT (WIN32 OR MINGW OR CYGWIN)) # Does not build on windows currently, see #106859 # Build a shared library for the execution engine. Some downstream projects # use this library to build their own CPU runners while preserving dynamic # linkage. From f0fc1d376c85d226e6623a3981da0bf4f4efc2ec Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 9 Oct 2024 12:55:32 -0700 Subject: [PATCH 063/119] [RISCV] Use MCStreamer::emitInstruction instead of calling AsmPrinter::EmitToStreamer. NFC (#111714) This allows us to pass the STI we already have cached instead of AsmPrinter::EmitToStreamer looking it up from the MachineFunction again. My plan is to make EmitHwasanMemaccessSymbols use RISCVAsmPrinter::EmitToStreamer instead of calling MCStreamer::emitInstruction. To do that I need control of the MCSubtargetInfo. --- llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp index 3bed8c4349dac..384a7cf59f063 100644 --- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp +++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp @@ -247,7 +247,7 @@ bool RISCVAsmPrinter::EmitToStreamer(MCStreamer &S, const MCInst &Inst) { bool Res = RISCVRVC::compress(CInst, Inst, *STI); if (Res) ++RISCVNumInstrsCompressed; - AsmPrinter::EmitToStreamer(S, Res ? CInst : Inst); + S.emitInstruction(Res ? CInst : Inst, *STI); return Res; } From 65bd5ed84f8b5e24bbee094a721c386ee6670798 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 9 Oct 2024 13:07:09 -0700 Subject: [PATCH 064/119] [mlir][openacc] Update verifier to catch missing device type attribute (#111586) Operands with device_type support need the corresponding attribute but this was not catches in the verifier if it was missing. The custom parser usually constructs it but creating the op from python could lead to a segfault in the printer. This patch updates the verifier so we catch this early on. 
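As a rough, illustrative sketch only (mirroring the invalid.mlir test added below; the dense-array attribute values are elided here rather than reproduced), an acc.parallel op built in generic form, e.g. from the Python bindings, that carries a num_gangs operand but no corresponding device_type attribute is now rejected by the verifier instead of reaching the printer:

  %0 = "arith.constant"() <{value = 1 : i64}> : () -> i64
  // Rejected with: num_gangs operand count does not match count in segments
  "acc.parallel"(%0) <{numGangsSegments = array<...>, operandSegmentSizes = array<...>}> ({
  }) : (i64) -> ()

The custom assembly form (acc.parallel num_gangs({%0 : i64}) { ... }) is unaffected, since the parser constructs the device_type attribute itself.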
--- mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp | 25 ++++++++++++++----------- mlir/test/Dialect/OpenACC/invalid.mlir | 7 +++++++ 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index 877bd226a0352..919a0853fb604 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -759,20 +759,23 @@ static LogicalResult verifyDeviceTypeAndSegmentCountMatch( Op op, OperandRange operands, DenseI32ArrayAttr segments, ArrayAttr deviceTypes, llvm::StringRef keyword, int32_t maxInSegment = 0) { std::size_t numOperandsInSegments = 0; - - if (!segments) - return success(); - - for (auto segCount : segments.asArrayRef()) { - if (maxInSegment != 0 && segCount > maxInSegment) - return op.emitOpError() << keyword << " expects a maximum of " - << maxInSegment << " values per segment"; - numOperandsInSegments += segCount; + std::size_t nbOfSegments = 0; + + if (segments) { + for (auto segCount : segments.asArrayRef()) { + if (maxInSegment != 0 && segCount > maxInSegment) + return op.emitOpError() << keyword << " expects a maximum of " + << maxInSegment << " values per segment"; + numOperandsInSegments += segCount; + ++nbOfSegments; + } } - if (numOperandsInSegments != operands.size()) + + if ((numOperandsInSegments != operands.size()) || + (!deviceTypes && !operands.empty())) return op.emitOpError() << keyword << " operand count does not match count in segments"; - if (deviceTypes.getValue().size() != (size_t)segments.size()) + if (deviceTypes && deviceTypes.getValue().size() != nbOfSegments) return op.emitOpError() << keyword << " segment count does not match device_type count"; return success(); diff --git a/mlir/test/Dialect/OpenACC/invalid.mlir b/mlir/test/Dialect/OpenACC/invalid.mlir index ec5430420524c..96edb585ae21a 100644 --- a/mlir/test/Dialect/OpenACC/invalid.mlir +++ b/mlir/test/Dialect/OpenACC/invalid.mlir @@ -507,6 +507,13 @@ acc.parallel num_gangs({%i64value: i64, %i64value : i64, %i64value : i64, %i64va // ----- +%0 = "arith.constant"() <{value = 1 : i64}> : () -> i64 +// expected-error@+1 {{num_gangs operand count does not match count in segments}} +"acc.parallel"(%0) <{numGangsSegments = array, operandSegmentSizes = array}> ({ +}) : (i64) -> () + +// ----- + %i64value = arith.constant 1 : i64 acc.parallel { // expected-error@+1 {{'acc.set' op cannot be nested in a compute operation}} From 35684fa4bc8d2288d479cb8aa9d275b14bfefead Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Wed, 9 Oct 2024 13:09:19 -0700 Subject: [PATCH 065/119] [Github] Switch vectorization PR label to vectorizers (#111633) This changes the PR label to match the name of the subscriber team. Fixes #111485. 
--- .github/new-prs-labeler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml index 8cc6c36fa945b..cef4782331510 100644 --- a/.github/new-prs-labeler.yml +++ b/.github/new-prs-labeler.yml @@ -69,7 +69,7 @@ PGO: - llvm/**/llvm-profdata/**/* - llvm/**/llvm-profgen/**/* -vectorization: +vectorizers: - llvm/lib/Transforms/Vectorize/**/* - llvm/include/llvm/Transforms/Vectorize/**/* From 853c43d04a378c379e49db552e856f02a5ad9216 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Wed, 9 Oct 2024 14:30:09 -0700 Subject: [PATCH 066/119] [TTI] NFC: Port TLI.shouldSinkOperands to TTI (#110564) Porting to TTI provides direct access to the instruction cost model, which can enable instruction cost based sinking without introducing code duplication. --- .../llvm/Analysis/TargetTransformInfo.h | 29 ++ .../llvm/Analysis/TargetTransformInfoImpl.h | 7 + llvm/include/llvm/CodeGen/TargetLowering.h | 19 - llvm/lib/Analysis/TargetTransformInfo.cpp | 9 + llvm/lib/CodeGen/CodeGenPrepare.cpp | 6 +- .../Target/AArch64/AArch64ISelLowering.cpp | 416 ----------------- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 3 - .../AArch64/AArch64TargetTransformInfo.cpp | 417 ++++++++++++++++++ .../AArch64/AArch64TargetTransformInfo.h | 5 +- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 19 - llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 3 - .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 19 + .../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 3 + llvm/lib/Target/ARM/ARMISelLowering.cpp | 143 ------ llvm/lib/Target/ARM/ARMISelLowering.h | 2 - .../lib/Target/ARM/ARMTargetTransformInfo.cpp | 146 ++++++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h | 2 + llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 139 ------ llvm/lib/Target/RISCV/RISCVISelLowering.h | 8 - .../Target/RISCV/RISCVTargetTransformInfo.cpp | 142 +++++- .../Target/RISCV/RISCVTargetTransformInfo.h | 9 + .../WebAssembly/WebAssemblyISelLowering.cpp | 24 - .../WebAssembly/WebAssemblyISelLowering.h | 2 - .../WebAssemblyTargetTransformInfo.cpp | 24 + .../WebAssemblyTargetTransformInfo.h | 6 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 80 ---- llvm/lib/Target/X86/X86ISelLowering.h | 6 - .../lib/Target/X86/X86TargetTransformInfo.cpp | 79 ++++ llvm/lib/Target/X86/X86TargetTransformInfo.h | 5 + 29 files changed, 901 insertions(+), 871 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 2befacea4df86..5c5da5e06c1bf 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1747,6 +1747,21 @@ class TargetTransformInfo { bool hasActiveVectorLength(unsigned Opcode, Type *DataType, Align Alignment) const; + /// Return true if sinking I's operands to the same basic block as I is + /// profitable, e.g. because the operands can be folded into a target + /// instruction during instruction selection. After calling the function + /// \p Ops contains the Uses to sink ordered by dominance (dominating users + /// come first). + bool isProfitableToSinkOperands(Instruction *I, + SmallVectorImpl &Ops) const; + + /// Return true if it's significantly cheaper to shift a vector by a uniform + /// scalar than by an amount which will vary across each lane. On x86 before + /// AVX2 for example, there is a "psllw" instruction for the former case, but + /// no simple instruction for a general "a << b" operation on vectors. 
+ /// This should also apply to lowering for vector funnel shifts (rotates). + bool isVectorShiftByScalarCheap(Type *Ty) const; + struct VPLegalization { enum VPTransform { // keep the predicating parameter @@ -2187,6 +2202,11 @@ class TargetTransformInfo::Concept { virtual bool supportsScalableVectors() const = 0; virtual bool hasActiveVectorLength(unsigned Opcode, Type *DataType, Align Alignment) const = 0; + virtual bool + isProfitableToSinkOperands(Instruction *I, + SmallVectorImpl &OpsToSink) const = 0; + + virtual bool isVectorShiftByScalarCheap(Type *Ty) const = 0; virtual VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; virtual bool hasArmWideBranch(bool Thumb) const = 0; @@ -2963,6 +2983,15 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.hasActiveVectorLength(Opcode, DataType, Alignment); } + bool isProfitableToSinkOperands(Instruction *I, + SmallVectorImpl &Ops) const override { + return Impl.isProfitableToSinkOperands(I, Ops); + }; + + bool isVectorShiftByScalarCheap(Type *Ty) const override { + return Impl.isVectorShiftByScalarCheap(Ty); + } + VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const override { return Impl.getVPLegalizationStrategy(PI); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 01a16e7c7b1e5..6d3ce93acbe45 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -977,6 +977,13 @@ class TargetTransformInfoImplBase { return false; } + bool isProfitableToSinkOperands(Instruction *I, + SmallVectorImpl &Ops) const { + return false; + } + + bool isVectorShiftByScalarCheap(Type *Ty) const { return false; } + TargetTransformInfo::VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const { return TargetTransformInfo::VPLegalization( diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 4c76592c42e1e..5ab31a687ec5e 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -2860,15 +2860,6 @@ class TargetLoweringBase { return Value == 0; } - /// Return true if it's significantly cheaper to shift a vector by a uniform - /// scalar than by an amount which will vary across each lane. On x86 before - /// AVX2 for example, there is a "psllw" instruction for the former case, but - /// no simple instruction for a general "a << b" operation on vectors. - /// This should also apply to lowering for vector funnel shifts (rotates). - virtual bool isVectorShiftByScalarCheap(Type *Ty) const { - return false; - } - /// Given a shuffle vector SVI representing a vector splat, return a new /// scalar type of size equal to SVI's scalar type if the new type is more /// profitable. Returns nullptr otherwise. For example under MVE float splats @@ -3085,16 +3076,6 @@ class TargetLoweringBase { /// a larger type. virtual bool signExtendConstant(const ConstantInt *C) const { return false; } - /// Return true if sinking I's operands to the same basic block as I is - /// profitable, e.g. because the operands can be folded into a target - /// instruction during instruction selection. After calling the function - /// \p Ops contains the Uses to sink ordered by dominance (dominating users - /// come first). 
- virtual bool shouldSinkOperands(Instruction *I, - SmallVectorImpl &Ops) const { - return false; - } - /// Try to optimize extending or truncating conversion instructions (like /// zext, trunc, fptoui, uitofp) for the target. virtual bool diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index b612a3331e573..3dc29fc7cd77b 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1359,6 +1359,15 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType, return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment); } +bool TargetTransformInfo::isProfitableToSinkOperands( + Instruction *I, SmallVectorImpl &OpsToSink) const { + return TTIImpl->isProfitableToSinkOperands(I, OpsToSink); +} + +bool TargetTransformInfo::isVectorShiftByScalarCheap(Type *Ty) const { + return TTIImpl->isVectorShiftByScalarCheap(Ty); +} + TargetTransformInfo::Concept::~Concept() = default; TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {} diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 631cc26d6022f..3e09fbad6ab19 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -7274,7 +7274,7 @@ bool CodeGenPrepare::optimizeShiftInst(BinaryOperator *Shift) { // We can't do this effectively in SDAG because we may not be able to // determine if the select operands are splats from within a basic block. Type *Ty = Shift->getType(); - if (!Ty->isVectorTy() || !TLI->isVectorShiftByScalarCheap(Ty)) + if (!Ty->isVectorTy() || !TTI->isVectorShiftByScalarCheap(Ty)) return false; Value *Cond, *TVal, *FVal; if (!match(Shift->getOperand(1), @@ -7309,7 +7309,7 @@ bool CodeGenPrepare::optimizeFunnelShift(IntrinsicInst *Fsh) { // We can't do this effectively in SDAG because we may not be able to // determine if the select operands are splats from within a basic block. Type *Ty = Fsh->getType(); - if (!Ty->isVectorTy() || !TLI->isVectorShiftByScalarCheap(Ty)) + if (!Ty->isVectorTy() || !TTI->isVectorShiftByScalarCheap(Ty)) return false; Value *Cond, *TVal, *FVal; if (!match(Fsh->getOperand(2), @@ -7566,7 +7566,7 @@ bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) { // If the operands of I can be folded into a target instruction together with // I, duplicate and sink them. SmallVector OpsToSink; - if (!TLI->shouldSinkOperands(I, OpsToSink)) + if (!TTI->isProfitableToSinkOperands(I, OpsToSink)) return false; // OpsToSink can contain multiple uses in a use chain (e.g. diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 288fd3639e5eb..381794caeb85b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -16326,422 +16326,6 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { return true; } -static bool isSplatShuffle(Value *V) { - if (auto *Shuf = dyn_cast(V)) - return all_equal(Shuf->getShuffleMask()); - return false; -} - -/// Check if both Op1 and Op2 are shufflevector extracts of either the lower -/// or upper half of the vector elements. -static bool areExtractShuffleVectors(Value *Op1, Value *Op2, - bool AllowSplat = false) { - // Scalable types can't be extract shuffle vectors. 
- if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy()) - return false; - - auto areTypesHalfed = [](Value *FullV, Value *HalfV) { - auto *FullTy = FullV->getType(); - auto *HalfTy = HalfV->getType(); - return FullTy->getPrimitiveSizeInBits().getFixedValue() == - 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue(); - }; - - auto extractHalf = [](Value *FullV, Value *HalfV) { - auto *FullVT = cast(FullV->getType()); - auto *HalfVT = cast(HalfV->getType()); - return FullVT->getNumElements() == 2 * HalfVT->getNumElements(); - }; - - ArrayRef M1, M2; - Value *S1Op1 = nullptr, *S2Op1 = nullptr; - if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) || - !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2)))) - return false; - - // If we allow splats, set S1Op1/S2Op1 to nullptr for the relavant arg so that - // it is not checked as an extract below. - if (AllowSplat && isSplatShuffle(Op1)) - S1Op1 = nullptr; - if (AllowSplat && isSplatShuffle(Op2)) - S2Op1 = nullptr; - - // Check that the operands are half as wide as the result and we extract - // half of the elements of the input vectors. - if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) || - (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2)))) - return false; - - // Check the mask extracts either the lower or upper half of vector - // elements. - int M1Start = 0; - int M2Start = 0; - int NumElements = cast(Op1->getType())->getNumElements() * 2; - if ((S1Op1 && - !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) || - (S2Op1 && - !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start))) - return false; - - if ((M1Start != 0 && M1Start != (NumElements / 2)) || - (M2Start != 0 && M2Start != (NumElements / 2))) - return false; - if (S1Op1 && S2Op1 && M1Start != M2Start) - return false; - - return true; -} - -/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth -/// of the vector elements. -static bool areExtractExts(Value *Ext1, Value *Ext2) { - auto areExtDoubled = [](Instruction *Ext) { - return Ext->getType()->getScalarSizeInBits() == - 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits(); - }; - - if (!match(Ext1, m_ZExtOrSExt(m_Value())) || - !match(Ext2, m_ZExtOrSExt(m_Value())) || - !areExtDoubled(cast(Ext1)) || - !areExtDoubled(cast(Ext2))) - return false; - - return true; -} - -/// Check if Op could be used with vmull_high_p64 intrinsic. -static bool isOperandOfVmullHighP64(Value *Op) { - Value *VectorOperand = nullptr; - ConstantInt *ElementIndex = nullptr; - return match(Op, m_ExtractElt(m_Value(VectorOperand), - m_ConstantInt(ElementIndex))) && - ElementIndex->getValue() == 1 && - isa(VectorOperand->getType()) && - cast(VectorOperand->getType())->getNumElements() == 2; -} - -/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic. -static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) { - return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2); -} - -static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl &Ops) { - // Restrict ourselves to the form CodeGenPrepare typically constructs. - auto *GEP = dyn_cast(Ptrs); - if (!GEP || GEP->getNumOperands() != 2) - return false; - - Value *Base = GEP->getOperand(0); - Value *Offsets = GEP->getOperand(1); - - // We only care about scalar_base+vector_offsets. - if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy()) - return false; - - // Sink extends that would allow us to use 32-bit offset vectors. 
- if (isa(Offsets) || isa(Offsets)) { - auto *OffsetsInst = cast(Offsets); - if (OffsetsInst->getType()->getScalarSizeInBits() > 32 && - OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32) - Ops.push_back(&GEP->getOperandUse(1)); - } - - // Sink the GEP. - return true; -} - -/// We want to sink following cases: -/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale; -/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm); -static bool shouldSinkVScale(Value *Op, SmallVectorImpl &Ops) { - if (match(Op, m_VScale())) - return true; - if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) || - match(Op, m_Mul(m_VScale(), m_ConstantInt()))) { - Ops.push_back(&cast(Op)->getOperandUse(0)); - return true; - } - if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) || - match(Op, m_Mul(m_ZExt(m_VScale()), m_ConstantInt()))) { - Value *ZExtOp = cast(Op)->getOperand(0); - Ops.push_back(&cast(ZExtOp)->getOperandUse(0)); - Ops.push_back(&cast(Op)->getOperandUse(0)); - return true; - } - return false; -} - -/// Check if sinking \p I's operands to I's basic block is profitable, because -/// the operands can be folded into a target instruction, e.g. -/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2). -bool AArch64TargetLowering::shouldSinkOperands( - Instruction *I, SmallVectorImpl &Ops) const { - if (IntrinsicInst *II = dyn_cast(I)) { - switch (II->getIntrinsicID()) { - case Intrinsic::aarch64_neon_smull: - case Intrinsic::aarch64_neon_umull: - if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1), - /*AllowSplat=*/true)) { - Ops.push_back(&II->getOperandUse(0)); - Ops.push_back(&II->getOperandUse(1)); - return true; - } - [[fallthrough]]; - - case Intrinsic::fma: - case Intrinsic::fmuladd: - if (isa(I->getType()) && - cast(I->getType())->getElementType()->isHalfTy() && - !Subtarget->hasFullFP16()) - return false; - [[fallthrough]]; - case Intrinsic::aarch64_neon_sqdmull: - case Intrinsic::aarch64_neon_sqdmulh: - case Intrinsic::aarch64_neon_sqrdmulh: - // Sink splats for index lane variants - if (isSplatShuffle(II->getOperand(0))) - Ops.push_back(&II->getOperandUse(0)); - if (isSplatShuffle(II->getOperand(1))) - Ops.push_back(&II->getOperandUse(1)); - return !Ops.empty(); - case Intrinsic::aarch64_neon_fmlal: - case Intrinsic::aarch64_neon_fmlal2: - case Intrinsic::aarch64_neon_fmlsl: - case Intrinsic::aarch64_neon_fmlsl2: - // Sink splats for index lane variants - if (isSplatShuffle(II->getOperand(1))) - Ops.push_back(&II->getOperandUse(1)); - if (isSplatShuffle(II->getOperand(2))) - Ops.push_back(&II->getOperandUse(2)); - return !Ops.empty(); - case Intrinsic::aarch64_sve_ptest_first: - case Intrinsic::aarch64_sve_ptest_last: - if (auto *IIOp = dyn_cast(II->getOperand(0))) - if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue) - Ops.push_back(&II->getOperandUse(0)); - return !Ops.empty(); - case Intrinsic::aarch64_sme_write_horiz: - case Intrinsic::aarch64_sme_write_vert: - case Intrinsic::aarch64_sme_writeq_horiz: - case Intrinsic::aarch64_sme_writeq_vert: { - auto *Idx = dyn_cast(II->getOperand(1)); - if (!Idx || Idx->getOpcode() != Instruction::Add) - return false; - Ops.push_back(&II->getOperandUse(1)); - return true; - } - case Intrinsic::aarch64_sme_read_horiz: - case Intrinsic::aarch64_sme_read_vert: - case Intrinsic::aarch64_sme_readq_horiz: - case Intrinsic::aarch64_sme_readq_vert: - case Intrinsic::aarch64_sme_ld1b_vert: - case Intrinsic::aarch64_sme_ld1h_vert: - case Intrinsic::aarch64_sme_ld1w_vert: - case 
Intrinsic::aarch64_sme_ld1d_vert: - case Intrinsic::aarch64_sme_ld1q_vert: - case Intrinsic::aarch64_sme_st1b_vert: - case Intrinsic::aarch64_sme_st1h_vert: - case Intrinsic::aarch64_sme_st1w_vert: - case Intrinsic::aarch64_sme_st1d_vert: - case Intrinsic::aarch64_sme_st1q_vert: - case Intrinsic::aarch64_sme_ld1b_horiz: - case Intrinsic::aarch64_sme_ld1h_horiz: - case Intrinsic::aarch64_sme_ld1w_horiz: - case Intrinsic::aarch64_sme_ld1d_horiz: - case Intrinsic::aarch64_sme_ld1q_horiz: - case Intrinsic::aarch64_sme_st1b_horiz: - case Intrinsic::aarch64_sme_st1h_horiz: - case Intrinsic::aarch64_sme_st1w_horiz: - case Intrinsic::aarch64_sme_st1d_horiz: - case Intrinsic::aarch64_sme_st1q_horiz: { - auto *Idx = dyn_cast(II->getOperand(3)); - if (!Idx || Idx->getOpcode() != Instruction::Add) - return false; - Ops.push_back(&II->getOperandUse(3)); - return true; - } - case Intrinsic::aarch64_neon_pmull: - if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) - return false; - Ops.push_back(&II->getOperandUse(0)); - Ops.push_back(&II->getOperandUse(1)); - return true; - case Intrinsic::aarch64_neon_pmull64: - if (!areOperandsOfVmullHighP64(II->getArgOperand(0), - II->getArgOperand(1))) - return false; - Ops.push_back(&II->getArgOperandUse(0)); - Ops.push_back(&II->getArgOperandUse(1)); - return true; - case Intrinsic::masked_gather: - if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops)) - return false; - Ops.push_back(&II->getArgOperandUse(0)); - return true; - case Intrinsic::masked_scatter: - if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops)) - return false; - Ops.push_back(&II->getArgOperandUse(1)); - return true; - default: - return false; - } - } - - // Sink vscales closer to uses for better isel - switch (I->getOpcode()) { - case Instruction::GetElementPtr: - case Instruction::Add: - case Instruction::Sub: - for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) { - if (shouldSinkVScale(I->getOperand(Op), Ops)) { - Ops.push_back(&I->getOperandUse(Op)); - return true; - } - } - break; - default: - break; - } - - if (!I->getType()->isVectorTy()) - return false; - - switch (I->getOpcode()) { - case Instruction::Sub: - case Instruction::Add: { - if (!areExtractExts(I->getOperand(0), I->getOperand(1))) - return false; - - // If the exts' operands extract either the lower or upper elements, we - // can sink them too. - auto Ext1 = cast(I->getOperand(0)); - auto Ext2 = cast(I->getOperand(1)); - if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) { - Ops.push_back(&Ext1->getOperandUse(0)); - Ops.push_back(&Ext2->getOperandUse(0)); - } - - Ops.push_back(&I->getOperandUse(0)); - Ops.push_back(&I->getOperandUse(1)); - - return true; - } - case Instruction::Or: { - // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) -> - // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1) - if (Subtarget->hasNEON()) { - Instruction *OtherAnd, *IA, *IB; - Value *MaskValue; - // MainAnd refers to And instruction that has 'Not' as one of its operands - if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)), - m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))), - m_Instruction(IA)))))) { - if (match(OtherAnd, - m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) { - Instruction *MainAnd = I->getOperand(0) == OtherAnd - ? 
cast(I->getOperand(1)) - : cast(I->getOperand(0)); - - // Both Ands should be in same basic block as Or - if (I->getParent() != MainAnd->getParent() || - I->getParent() != OtherAnd->getParent()) - return false; - - // Non-mask operands of both Ands should also be in same basic block - if (I->getParent() != IA->getParent() || - I->getParent() != IB->getParent()) - return false; - - Ops.push_back(&MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0)); - Ops.push_back(&I->getOperandUse(0)); - Ops.push_back(&I->getOperandUse(1)); - - return true; - } - } - } - - return false; - } - case Instruction::Mul: { - int NumZExts = 0, NumSExts = 0; - for (auto &Op : I->operands()) { - // Make sure we are not already sinking this operand - if (any_of(Ops, [&](Use *U) { return U->get() == Op; })) - continue; - - if (match(&Op, m_SExt(m_Value()))) { - NumSExts++; - continue; - } else if (match(&Op, m_ZExt(m_Value()))) { - NumZExts++; - continue; - } - - ShuffleVectorInst *Shuffle = dyn_cast(Op); - - // If the Shuffle is a splat and the operand is a zext/sext, sinking the - // operand and the s/zext can help create indexed s/umull. This is - // especially useful to prevent i64 mul being scalarized. - if (Shuffle && isSplatShuffle(Shuffle) && - match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) { - Ops.push_back(&Shuffle->getOperandUse(0)); - Ops.push_back(&Op); - if (match(Shuffle->getOperand(0), m_SExt(m_Value()))) - NumSExts++; - else - NumZExts++; - continue; - } - - if (!Shuffle) - continue; - - Value *ShuffleOperand = Shuffle->getOperand(0); - InsertElementInst *Insert = dyn_cast(ShuffleOperand); - if (!Insert) - continue; - - Instruction *OperandInstr = dyn_cast(Insert->getOperand(1)); - if (!OperandInstr) - continue; - - ConstantInt *ElementConstant = - dyn_cast(Insert->getOperand(2)); - // Check that the insertelement is inserting into element 0 - if (!ElementConstant || !ElementConstant->isZero()) - continue; - - unsigned Opcode = OperandInstr->getOpcode(); - if (Opcode == Instruction::SExt) - NumSExts++; - else if (Opcode == Instruction::ZExt) - NumZExts++; - else { - // If we find that the top bits are known 0, then we can sink and allow - // the backend to generate a umull. - unsigned Bitwidth = I->getType()->getScalarSizeInBits(); - APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2); - const DataLayout &DL = I->getDataLayout(); - if (!MaskedValueIsZero(OperandInstr, UpperMask, DL)) - continue; - NumZExts++; - } - - Ops.push_back(&Shuffle->getOperandUse(0)); - Ops.push_back(&Op); - } - - // Is it profitable to sink if we found two of the same type of extends. 
- return !Ops.empty() && (NumSExts == 2 || NumZExts == 2); - } - default: - return false; - } - return false; -} - static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth, unsigned NumElts, bool IsLittleEndian, SmallVectorImpl &Mask) { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 1bae7562f459a..035a802cd49b3 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -688,9 +688,6 @@ class AArch64TargetLowering : public TargetLowering { bool isZExtFree(EVT VT1, EVT VT2) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; - bool shouldSinkOperands(Instruction *I, - SmallVectorImpl &Ops) const override; - bool optimizeExtendOrTruncateConversion( Instruction *I, Loop *L, const TargetTransformInfo &TTI) const override; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 80d5168ae961a..7b74bb2a03a64 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -4672,3 +4672,420 @@ bool AArch64TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1, return TargetTransformInfoImplBase::isLSRCostLess(C1, C2); } + +static bool isSplatShuffle(Value *V) { + if (auto *Shuf = dyn_cast(V)) + return all_equal(Shuf->getShuffleMask()); + return false; +} + +/// Check if both Op1 and Op2 are shufflevector extracts of either the lower +/// or upper half of the vector elements. +static bool areExtractShuffleVectors(Value *Op1, Value *Op2, + bool AllowSplat = false) { + // Scalable types can't be extract shuffle vectors. + if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy()) + return false; + + auto areTypesHalfed = [](Value *FullV, Value *HalfV) { + auto *FullTy = FullV->getType(); + auto *HalfTy = HalfV->getType(); + return FullTy->getPrimitiveSizeInBits().getFixedValue() == + 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue(); + }; + + auto extractHalf = [](Value *FullV, Value *HalfV) { + auto *FullVT = cast(FullV->getType()); + auto *HalfVT = cast(HalfV->getType()); + return FullVT->getNumElements() == 2 * HalfVT->getNumElements(); + }; + + ArrayRef M1, M2; + Value *S1Op1 = nullptr, *S2Op1 = nullptr; + if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) || + !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2)))) + return false; + + // If we allow splats, set S1Op1/S2Op1 to nullptr for the relavant arg so that + // it is not checked as an extract below. + if (AllowSplat && isSplatShuffle(Op1)) + S1Op1 = nullptr; + if (AllowSplat && isSplatShuffle(Op2)) + S2Op1 = nullptr; + + // Check that the operands are half as wide as the result and we extract + // half of the elements of the input vectors. + if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) || + (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2)))) + return false; + + // Check the mask extracts either the lower or upper half of vector + // elements. 
+ int M1Start = 0; + int M2Start = 0; + int NumElements = cast(Op1->getType())->getNumElements() * 2; + if ((S1Op1 && + !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) || + (S2Op1 && + !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start))) + return false; + + if ((M1Start != 0 && M1Start != (NumElements / 2)) || + (M2Start != 0 && M2Start != (NumElements / 2))) + return false; + if (S1Op1 && S2Op1 && M1Start != M2Start) + return false; + + return true; +} + +/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth +/// of the vector elements. +static bool areExtractExts(Value *Ext1, Value *Ext2) { + auto areExtDoubled = [](Instruction *Ext) { + return Ext->getType()->getScalarSizeInBits() == + 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits(); + }; + + if (!match(Ext1, m_ZExtOrSExt(m_Value())) || + !match(Ext2, m_ZExtOrSExt(m_Value())) || + !areExtDoubled(cast(Ext1)) || + !areExtDoubled(cast(Ext2))) + return false; + + return true; +} + +/// Check if Op could be used with vmull_high_p64 intrinsic. +static bool isOperandOfVmullHighP64(Value *Op) { + Value *VectorOperand = nullptr; + ConstantInt *ElementIndex = nullptr; + return match(Op, m_ExtractElt(m_Value(VectorOperand), + m_ConstantInt(ElementIndex))) && + ElementIndex->getValue() == 1 && + isa(VectorOperand->getType()) && + cast(VectorOperand->getType())->getNumElements() == 2; +} + +/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic. +static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) { + return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2); +} + +static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl &Ops) { + // Restrict ourselves to the form CodeGenPrepare typically constructs. + auto *GEP = dyn_cast(Ptrs); + if (!GEP || GEP->getNumOperands() != 2) + return false; + + Value *Base = GEP->getOperand(0); + Value *Offsets = GEP->getOperand(1); + + // We only care about scalar_base+vector_offsets. + if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy()) + return false; + + // Sink extends that would allow us to use 32-bit offset vectors. + if (isa(Offsets) || isa(Offsets)) { + auto *OffsetsInst = cast(Offsets); + if (OffsetsInst->getType()->getScalarSizeInBits() > 32 && + OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32) + Ops.push_back(&GEP->getOperandUse(1)); + } + + // Sink the GEP. + return true; +} + +/// We want to sink following cases: +/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale; +/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm); +static bool shouldSinkVScale(Value *Op, SmallVectorImpl &Ops) { + if (match(Op, m_VScale())) + return true; + if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) || + match(Op, m_Mul(m_VScale(), m_ConstantInt()))) { + Ops.push_back(&cast(Op)->getOperandUse(0)); + return true; + } + if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) || + match(Op, m_Mul(m_ZExt(m_VScale()), m_ConstantInt()))) { + Value *ZExtOp = cast(Op)->getOperand(0); + Ops.push_back(&cast(ZExtOp)->getOperandUse(0)); + Ops.push_back(&cast(Op)->getOperandUse(0)); + return true; + } + return false; +} + +/// Check if sinking \p I's operands to I's basic block is profitable, because +/// the operands can be folded into a target instruction, e.g. +/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2). 
+bool AArch64TTIImpl::isProfitableToSinkOperands( + Instruction *I, SmallVectorImpl &Ops) const { + if (IntrinsicInst *II = dyn_cast(I)) { + switch (II->getIntrinsicID()) { + case Intrinsic::aarch64_neon_smull: + case Intrinsic::aarch64_neon_umull: + if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1), + /*AllowSplat=*/true)) { + Ops.push_back(&II->getOperandUse(0)); + Ops.push_back(&II->getOperandUse(1)); + return true; + } + [[fallthrough]]; + + case Intrinsic::fma: + case Intrinsic::fmuladd: + if (isa(I->getType()) && + cast(I->getType())->getElementType()->isHalfTy() && + !ST->hasFullFP16()) + return false; + [[fallthrough]]; + case Intrinsic::aarch64_neon_sqdmull: + case Intrinsic::aarch64_neon_sqdmulh: + case Intrinsic::aarch64_neon_sqrdmulh: + // Sink splats for index lane variants + if (isSplatShuffle(II->getOperand(0))) + Ops.push_back(&II->getOperandUse(0)); + if (isSplatShuffle(II->getOperand(1))) + Ops.push_back(&II->getOperandUse(1)); + return !Ops.empty(); + case Intrinsic::aarch64_neon_fmlal: + case Intrinsic::aarch64_neon_fmlal2: + case Intrinsic::aarch64_neon_fmlsl: + case Intrinsic::aarch64_neon_fmlsl2: + // Sink splats for index lane variants + if (isSplatShuffle(II->getOperand(1))) + Ops.push_back(&II->getOperandUse(1)); + if (isSplatShuffle(II->getOperand(2))) + Ops.push_back(&II->getOperandUse(2)); + return !Ops.empty(); + case Intrinsic::aarch64_sve_ptest_first: + case Intrinsic::aarch64_sve_ptest_last: + if (auto *IIOp = dyn_cast(II->getOperand(0))) + if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue) + Ops.push_back(&II->getOperandUse(0)); + return !Ops.empty(); + case Intrinsic::aarch64_sme_write_horiz: + case Intrinsic::aarch64_sme_write_vert: + case Intrinsic::aarch64_sme_writeq_horiz: + case Intrinsic::aarch64_sme_writeq_vert: { + auto *Idx = dyn_cast(II->getOperand(1)); + if (!Idx || Idx->getOpcode() != Instruction::Add) + return false; + Ops.push_back(&II->getOperandUse(1)); + return true; + } + case Intrinsic::aarch64_sme_read_horiz: + case Intrinsic::aarch64_sme_read_vert: + case Intrinsic::aarch64_sme_readq_horiz: + case Intrinsic::aarch64_sme_readq_vert: + case Intrinsic::aarch64_sme_ld1b_vert: + case Intrinsic::aarch64_sme_ld1h_vert: + case Intrinsic::aarch64_sme_ld1w_vert: + case Intrinsic::aarch64_sme_ld1d_vert: + case Intrinsic::aarch64_sme_ld1q_vert: + case Intrinsic::aarch64_sme_st1b_vert: + case Intrinsic::aarch64_sme_st1h_vert: + case Intrinsic::aarch64_sme_st1w_vert: + case Intrinsic::aarch64_sme_st1d_vert: + case Intrinsic::aarch64_sme_st1q_vert: + case Intrinsic::aarch64_sme_ld1b_horiz: + case Intrinsic::aarch64_sme_ld1h_horiz: + case Intrinsic::aarch64_sme_ld1w_horiz: + case Intrinsic::aarch64_sme_ld1d_horiz: + case Intrinsic::aarch64_sme_ld1q_horiz: + case Intrinsic::aarch64_sme_st1b_horiz: + case Intrinsic::aarch64_sme_st1h_horiz: + case Intrinsic::aarch64_sme_st1w_horiz: + case Intrinsic::aarch64_sme_st1d_horiz: + case Intrinsic::aarch64_sme_st1q_horiz: { + auto *Idx = dyn_cast(II->getOperand(3)); + if (!Idx || Idx->getOpcode() != Instruction::Add) + return false; + Ops.push_back(&II->getOperandUse(3)); + return true; + } + case Intrinsic::aarch64_neon_pmull: + if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) + return false; + Ops.push_back(&II->getOperandUse(0)); + Ops.push_back(&II->getOperandUse(1)); + return true; + case Intrinsic::aarch64_neon_pmull64: + if (!areOperandsOfVmullHighP64(II->getArgOperand(0), + II->getArgOperand(1))) + return false; + Ops.push_back(&II->getArgOperandUse(0)); + 
Ops.push_back(&II->getArgOperandUse(1)); + return true; + case Intrinsic::masked_gather: + if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops)) + return false; + Ops.push_back(&II->getArgOperandUse(0)); + return true; + case Intrinsic::masked_scatter: + if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops)) + return false; + Ops.push_back(&II->getArgOperandUse(1)); + return true; + default: + return false; + } + } + + // Sink vscales closer to uses for better isel + switch (I->getOpcode()) { + case Instruction::GetElementPtr: + case Instruction::Add: + case Instruction::Sub: + for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) { + if (shouldSinkVScale(I->getOperand(Op), Ops)) { + Ops.push_back(&I->getOperandUse(Op)); + return true; + } + } + break; + default: + break; + } + + if (!I->getType()->isVectorTy()) + return false; + + switch (I->getOpcode()) { + case Instruction::Sub: + case Instruction::Add: { + if (!areExtractExts(I->getOperand(0), I->getOperand(1))) + return false; + + // If the exts' operands extract either the lower or upper elements, we + // can sink them too. + auto Ext1 = cast(I->getOperand(0)); + auto Ext2 = cast(I->getOperand(1)); + if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) { + Ops.push_back(&Ext1->getOperandUse(0)); + Ops.push_back(&Ext2->getOperandUse(0)); + } + + Ops.push_back(&I->getOperandUse(0)); + Ops.push_back(&I->getOperandUse(1)); + + return true; + } + case Instruction::Or: { + // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) -> + // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1) + if (ST->hasNEON()) { + Instruction *OtherAnd, *IA, *IB; + Value *MaskValue; + // MainAnd refers to And instruction that has 'Not' as one of its operands + if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)), + m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))), + m_Instruction(IA)))))) { + if (match(OtherAnd, + m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) { + Instruction *MainAnd = I->getOperand(0) == OtherAnd + ? cast(I->getOperand(1)) + : cast(I->getOperand(0)); + + // Both Ands should be in same basic block as Or + if (I->getParent() != MainAnd->getParent() || + I->getParent() != OtherAnd->getParent()) + return false; + + // Non-mask operands of both Ands should also be in same basic block + if (I->getParent() != IA->getParent() || + I->getParent() != IB->getParent()) + return false; + + Ops.push_back( + &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0)); + Ops.push_back(&I->getOperandUse(0)); + Ops.push_back(&I->getOperandUse(1)); + + return true; + } + } + } + + return false; + } + case Instruction::Mul: { + int NumZExts = 0, NumSExts = 0; + for (auto &Op : I->operands()) { + // Make sure we are not already sinking this operand + if (any_of(Ops, [&](Use *U) { return U->get() == Op; })) + continue; + + if (match(&Op, m_SExt(m_Value()))) { + NumSExts++; + continue; + } else if (match(&Op, m_ZExt(m_Value()))) { + NumZExts++; + continue; + } + + ShuffleVectorInst *Shuffle = dyn_cast(Op); + + // If the Shuffle is a splat and the operand is a zext/sext, sinking the + // operand and the s/zext can help create indexed s/umull. This is + // especially useful to prevent i64 mul being scalarized. 
+ if (Shuffle && isSplatShuffle(Shuffle) && + match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) { + Ops.push_back(&Shuffle->getOperandUse(0)); + Ops.push_back(&Op); + if (match(Shuffle->getOperand(0), m_SExt(m_Value()))) + NumSExts++; + else + NumZExts++; + continue; + } + + if (!Shuffle) + continue; + + Value *ShuffleOperand = Shuffle->getOperand(0); + InsertElementInst *Insert = dyn_cast(ShuffleOperand); + if (!Insert) + continue; + + Instruction *OperandInstr = dyn_cast(Insert->getOperand(1)); + if (!OperandInstr) + continue; + + ConstantInt *ElementConstant = + dyn_cast(Insert->getOperand(2)); + // Check that the insertelement is inserting into element 0 + if (!ElementConstant || !ElementConstant->isZero()) + continue; + + unsigned Opcode = OperandInstr->getOpcode(); + if (Opcode == Instruction::SExt) + NumSExts++; + else if (Opcode == Instruction::ZExt) + NumZExts++; + else { + // If we find that the top bits are known 0, then we can sink and allow + // the backend to generate a umull. + unsigned Bitwidth = I->getType()->getScalarSizeInBits(); + APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2); + const DataLayout &DL = I->getDataLayout(); + if (!MaskedValueIsZero(OperandInstr, UpperMask, DL)) + continue; + NumZExts++; + } + + Ops.push_back(&Shuffle->getOperandUse(0)); + Ops.push_back(&Op); + } + + // Is it profitable to sink if we found two of the same type of extends. + return !Ops.empty() && (NumSExts == 2 || NumZExts == 2); + } + default: + return false; + } + return false; +} diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 28e45207596ec..1d09d67f6ec9e 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -416,7 +416,6 @@ class AArch64TTIImpl : public BasicTTIImplBase { InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const; - /// @} bool enableSelectOptimize() { return ST->enableSelectOptimize(); } @@ -435,6 +434,10 @@ class AArch64TTIImpl : public BasicTTIImplBase { bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2); + + bool isProfitableToSinkOperands(Instruction *I, + SmallVectorImpl &Ops) const; + /// @} }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index cceb89e23f129..0f65df0763cc8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -6043,22 +6043,3 @@ bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const { return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks } - -/// Whether it is profitable to sink the operands of an -/// Instruction I to the basic block of I. -/// This helps using several modifiers (like abs and neg) more often. -bool AMDGPUTargetLowering::shouldSinkOperands( - Instruction *I, SmallVectorImpl &Ops) const { - using namespace PatternMatch; - - for (auto &Op : I->operands()) { - // Ensure we are not already sinking this operand. 
- if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); })) - continue; - - if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value()))) - Ops.push_back(&Op); - } - - return !Ops.empty(); -} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 5c2abd334276c..b2fd31cb2346e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -387,9 +387,6 @@ class AMDGPUTargetLowering : public TargetLowering { MVT getFenceOperandTy(const DataLayout &DL) const override { return MVT::i32; } - - bool shouldSinkOperands(Instruction *I, - SmallVectorImpl &Ops) const override; }; namespace AMDGPUISD { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 0a2d4e6494305..3f4f42377d56e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1183,6 +1183,25 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp); } +/// Whether it is profitable to sink the operands of an +/// Instruction I to the basic block of I. +/// This helps using several modifiers (like abs and neg) more often. +bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I, + SmallVectorImpl &Ops) const { + using namespace PatternMatch; + + for (auto &Op : I->operands()) { + // Ensure we are not already sinking this operand. + if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); })) + continue; + + if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value()))) + Ops.push_back(&Op); + } + + return !Ops.empty(); +} + bool GCNTTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { const TargetMachine &TM = getTLI()->getTargetMachine(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 76785ee456a41..30da002376251 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -237,6 +237,9 @@ class GCNTTIImpl final : public BasicTTIImplBase { ArrayRef Args = {}, const Instruction *CxtI = nullptr); + bool isProfitableToSinkOperands(Instruction *I, + SmallVectorImpl &Ops) const; + bool areInlineCompatible(const Function *Caller, const Function *Callee) const; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 1733424a8b669..bf757edfa8589 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -19283,149 +19283,6 @@ bool ARMTargetLowering::isFNegFree(EVT VT) const { return false; } -/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth -/// of the vector elements. -static bool areExtractExts(Value *Ext1, Value *Ext2) { - auto areExtDoubled = [](Instruction *Ext) { - return Ext->getType()->getScalarSizeInBits() == - 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits(); - }; - - if (!match(Ext1, m_ZExtOrSExt(m_Value())) || - !match(Ext2, m_ZExtOrSExt(m_Value())) || - !areExtDoubled(cast(Ext1)) || - !areExtDoubled(cast(Ext2))) - return false; - - return true; -} - -/// Check if sinking \p I's operands to I's basic block is profitable, because -/// the operands can be folded into a target instruction, e.g. -/// sext/zext can be folded into vsubl. 
-bool ARMTargetLowering::shouldSinkOperands(Instruction *I, - SmallVectorImpl &Ops) const { - if (!I->getType()->isVectorTy()) - return false; - - if (Subtarget->hasNEON()) { - switch (I->getOpcode()) { - case Instruction::Sub: - case Instruction::Add: { - if (!areExtractExts(I->getOperand(0), I->getOperand(1))) - return false; - Ops.push_back(&I->getOperandUse(0)); - Ops.push_back(&I->getOperandUse(1)); - return true; - } - default: - return false; - } - } - - if (!Subtarget->hasMVEIntegerOps()) - return false; - - auto IsFMSMul = [&](Instruction *I) { - if (!I->hasOneUse()) - return false; - auto *Sub = cast(*I->users().begin()); - return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I; - }; - auto IsFMS = [&](Instruction *I) { - if (match(I->getOperand(0), m_FNeg(m_Value())) || - match(I->getOperand(1), m_FNeg(m_Value()))) - return true; - return false; - }; - - auto IsSinker = [&](Instruction *I, int Operand) { - switch (I->getOpcode()) { - case Instruction::Add: - case Instruction::Mul: - case Instruction::FAdd: - case Instruction::ICmp: - case Instruction::FCmp: - return true; - case Instruction::FMul: - return !IsFMSMul(I); - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - return Operand == 1; - case Instruction::Call: - if (auto *II = dyn_cast(I)) { - switch (II->getIntrinsicID()) { - case Intrinsic::fma: - return !IsFMS(I); - case Intrinsic::sadd_sat: - case Intrinsic::uadd_sat: - case Intrinsic::arm_mve_add_predicated: - case Intrinsic::arm_mve_mul_predicated: - case Intrinsic::arm_mve_qadd_predicated: - case Intrinsic::arm_mve_vhadd: - case Intrinsic::arm_mve_hadd_predicated: - case Intrinsic::arm_mve_vqdmull: - case Intrinsic::arm_mve_vqdmull_predicated: - case Intrinsic::arm_mve_vqdmulh: - case Intrinsic::arm_mve_qdmulh_predicated: - case Intrinsic::arm_mve_vqrdmulh: - case Intrinsic::arm_mve_qrdmulh_predicated: - case Intrinsic::arm_mve_fma_predicated: - return true; - case Intrinsic::ssub_sat: - case Intrinsic::usub_sat: - case Intrinsic::arm_mve_sub_predicated: - case Intrinsic::arm_mve_qsub_predicated: - case Intrinsic::arm_mve_hsub_predicated: - case Intrinsic::arm_mve_vhsub: - return Operand == 1; - default: - return false; - } - } - return false; - default: - return false; - } - }; - - for (auto OpIdx : enumerate(I->operands())) { - Instruction *Op = dyn_cast(OpIdx.value().get()); - // Make sure we are not already sinking this operand - if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; })) - continue; - - Instruction *Shuffle = Op; - if (Shuffle->getOpcode() == Instruction::BitCast) - Shuffle = dyn_cast(Shuffle->getOperand(0)); - // We are looking for a splat that can be sunk. 
- if (!Shuffle || - !match(Shuffle, m_Shuffle( - m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()), - m_Undef(), m_ZeroMask()))) - continue; - if (!IsSinker(I, OpIdx.index())) - continue; - - // All uses of the shuffle should be sunk to avoid duplicating it across gpr - // and vector registers - for (Use &U : Op->uses()) { - Instruction *Insn = cast(U.getUser()); - if (!IsSinker(Insn, U.getOperandNo())) - return false; - } - - Ops.push_back(&Shuffle->getOperandUse(0)); - if (Shuffle != Op) - Ops.push_back(&Op->getOperandUse(0)); - Ops.push_back(&OpIdx.value()); - } - return true; -} - Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const { if (!Subtarget->hasMVEIntegerOps()) return nullptr; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index a255e9b6fc365..316f7d3b9bce5 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -460,8 +460,6 @@ class VectorType; bool isTruncateFree(Type *SrcTy, Type *DstTy) const override; bool isTruncateFree(EVT SrcVT, EVT DstVT) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; - bool shouldSinkOperands(Instruction *I, - SmallVectorImpl &Ops) const override; Type* shouldConvertSplatType(ShuffleVectorInst* SVI) const override; bool isFNegFree(EVT VT) const override; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 865e2f3066ef0..835ae98efb852 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -2659,3 +2659,149 @@ bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const { return ST->hasARMOps(); } } + +/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth +/// of the vector elements. +static bool areExtractExts(Value *Ext1, Value *Ext2) { + using namespace PatternMatch; + + auto areExtDoubled = [](Instruction *Ext) { + return Ext->getType()->getScalarSizeInBits() == + 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits(); + }; + + if (!match(Ext1, m_ZExtOrSExt(m_Value())) || + !match(Ext2, m_ZExtOrSExt(m_Value())) || + !areExtDoubled(cast(Ext1)) || + !areExtDoubled(cast(Ext2))) + return false; + + return true; +} + +/// Check if sinking \p I's operands to I's basic block is profitable, because +/// the operands can be folded into a target instruction, e.g. +/// sext/zext can be folded into vsubl. 
+bool ARMTTIImpl::isProfitableToSinkOperands(Instruction *I, + SmallVectorImpl &Ops) const { + using namespace PatternMatch; + + if (!I->getType()->isVectorTy()) + return false; + + if (ST->hasNEON()) { + switch (I->getOpcode()) { + case Instruction::Sub: + case Instruction::Add: { + if (!areExtractExts(I->getOperand(0), I->getOperand(1))) + return false; + Ops.push_back(&I->getOperandUse(0)); + Ops.push_back(&I->getOperandUse(1)); + return true; + } + default: + return false; + } + } + + if (!ST->hasMVEIntegerOps()) + return false; + + auto IsFMSMul = [&](Instruction *I) { + if (!I->hasOneUse()) + return false; + auto *Sub = cast(*I->users().begin()); + return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I; + }; + auto IsFMS = [&](Instruction *I) { + if (match(I->getOperand(0), m_FNeg(m_Value())) || + match(I->getOperand(1), m_FNeg(m_Value()))) + return true; + return false; + }; + + auto IsSinker = [&](Instruction *I, int Operand) { + switch (I->getOpcode()) { + case Instruction::Add: + case Instruction::Mul: + case Instruction::FAdd: + case Instruction::ICmp: + case Instruction::FCmp: + return true; + case Instruction::FMul: + return !IsFMSMul(I); + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + return Operand == 1; + case Instruction::Call: + if (auto *II = dyn_cast(I)) { + switch (II->getIntrinsicID()) { + case Intrinsic::fma: + return !IsFMS(I); + case Intrinsic::sadd_sat: + case Intrinsic::uadd_sat: + case Intrinsic::arm_mve_add_predicated: + case Intrinsic::arm_mve_mul_predicated: + case Intrinsic::arm_mve_qadd_predicated: + case Intrinsic::arm_mve_vhadd: + case Intrinsic::arm_mve_hadd_predicated: + case Intrinsic::arm_mve_vqdmull: + case Intrinsic::arm_mve_vqdmull_predicated: + case Intrinsic::arm_mve_vqdmulh: + case Intrinsic::arm_mve_qdmulh_predicated: + case Intrinsic::arm_mve_vqrdmulh: + case Intrinsic::arm_mve_qrdmulh_predicated: + case Intrinsic::arm_mve_fma_predicated: + return true; + case Intrinsic::ssub_sat: + case Intrinsic::usub_sat: + case Intrinsic::arm_mve_sub_predicated: + case Intrinsic::arm_mve_qsub_predicated: + case Intrinsic::arm_mve_hsub_predicated: + case Intrinsic::arm_mve_vhsub: + return Operand == 1; + default: + return false; + } + } + return false; + default: + return false; + } + }; + + for (auto OpIdx : enumerate(I->operands())) { + Instruction *Op = dyn_cast(OpIdx.value().get()); + // Make sure we are not already sinking this operand + if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; })) + continue; + + Instruction *Shuffle = Op; + if (Shuffle->getOpcode() == Instruction::BitCast) + Shuffle = dyn_cast(Shuffle->getOperand(0)); + // We are looking for a splat that can be sunk. 
+ if (!Shuffle || !match(Shuffle, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), + m_ZeroInt()), + m_Undef(), m_ZeroMask()))) + continue; + if (!IsSinker(I, OpIdx.index())) + continue; + + // All uses of the shuffle should be sunk to avoid duplicating it across gpr + // and vector registers + for (Use &U : Op->uses()) { + Instruction *Insn = cast(U.getUser()); + if (!IsSinker(Insn, U.getOperandNo())) + return false; + } + + Ops.push_back(&Shuffle->getOperandUse(0)); + if (Shuffle != Op) + Ops.push_back(&Op->getOperandUse(0)); + Ops.push_back(&OpIdx.value()); + } + return true; +} diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index 7be53c4bcaa29..b0a75134ee02b 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -335,6 +335,8 @@ class ARMTTIImpl : public BasicTTIImplBase { bool hasArmWideBranch(bool Thumb) const; + bool isProfitableToSinkOperands(Instruction *I, + SmallVectorImpl &Ops) const; /// @} }; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 463887b8b55e6..01fa418e4dbdf 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2068,145 +2068,6 @@ bool RISCVTargetLowering:: return !XC; } -bool RISCVTargetLowering::canSplatOperand(unsigned Opcode, int Operand) const { - switch (Opcode) { - case Instruction::Add: - case Instruction::Sub: - case Instruction::Mul: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - case Instruction::FAdd: - case Instruction::FSub: - case Instruction::FMul: - case Instruction::FDiv: - case Instruction::ICmp: - case Instruction::FCmp: - return true; - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::Select: - return Operand == 1; - default: - return false; - } -} - - -bool RISCVTargetLowering::canSplatOperand(Instruction *I, int Operand) const { - if (!I->getType()->isVectorTy() || !Subtarget.hasVInstructions()) - return false; - - if (canSplatOperand(I->getOpcode(), Operand)) - return true; - - auto *II = dyn_cast(I); - if (!II) - return false; - - switch (II->getIntrinsicID()) { - case Intrinsic::fma: - case Intrinsic::vp_fma: - return Operand == 0 || Operand == 1; - case Intrinsic::vp_shl: - case Intrinsic::vp_lshr: - case Intrinsic::vp_ashr: - case Intrinsic::vp_udiv: - case Intrinsic::vp_sdiv: - case Intrinsic::vp_urem: - case Intrinsic::vp_srem: - case Intrinsic::ssub_sat: - case Intrinsic::vp_ssub_sat: - case Intrinsic::usub_sat: - case Intrinsic::vp_usub_sat: - return Operand == 1; - // These intrinsics are commutative. - case Intrinsic::vp_add: - case Intrinsic::vp_mul: - case Intrinsic::vp_and: - case Intrinsic::vp_or: - case Intrinsic::vp_xor: - case Intrinsic::vp_fadd: - case Intrinsic::vp_fmul: - case Intrinsic::vp_icmp: - case Intrinsic::vp_fcmp: - case Intrinsic::smin: - case Intrinsic::vp_smin: - case Intrinsic::umin: - case Intrinsic::vp_umin: - case Intrinsic::smax: - case Intrinsic::vp_smax: - case Intrinsic::umax: - case Intrinsic::vp_umax: - case Intrinsic::sadd_sat: - case Intrinsic::vp_sadd_sat: - case Intrinsic::uadd_sat: - case Intrinsic::vp_uadd_sat: - // These intrinsics have 'vr' versions. 
- case Intrinsic::vp_sub: - case Intrinsic::vp_fsub: - case Intrinsic::vp_fdiv: - return Operand == 0 || Operand == 1; - default: - return false; - } -} - -/// Check if sinking \p I's operands to I's basic block is profitable, because -/// the operands can be folded into a target instruction, e.g. -/// splats of scalars can fold into vector instructions. -bool RISCVTargetLowering::shouldSinkOperands( - Instruction *I, SmallVectorImpl &Ops) const { - using namespace llvm::PatternMatch; - - if (!I->getType()->isVectorTy() || !Subtarget.hasVInstructions()) - return false; - - // Don't sink splat operands if the target prefers it. Some targets requires - // S2V transfer buffers and we can run out of them copying the same value - // repeatedly. - // FIXME: It could still be worth doing if it would improve vector register - // pressure and prevent a vector spill. - if (!Subtarget.sinkSplatOperands()) - return false; - - for (auto OpIdx : enumerate(I->operands())) { - if (!canSplatOperand(I, OpIdx.index())) - continue; - - Instruction *Op = dyn_cast(OpIdx.value().get()); - // Make sure we are not already sinking this operand - if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; })) - continue; - - // We are looking for a splat that can be sunk. - if (!match(Op, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()), - m_Undef(), m_ZeroMask()))) - continue; - - // Don't sink i1 splats. - if (cast(Op->getType())->getElementType()->isIntegerTy(1)) - continue; - - // All uses of the shuffle should be sunk to avoid duplicating it across gpr - // and vector registers - for (Use &U : Op->uses()) { - Instruction *Insn = cast(U.getUser()); - if (!canSplatOperand(Insn, U.getOperandNo())) - return false; - } - - Ops.push_back(&Op->getOperandUse(0)); - Ops.push_back(&OpIdx.value()); - } - return true; -} - bool RISCVTargetLowering::shouldScalarizeBinop(SDValue VecOp) const { unsigned Opc = VecOp.getOpcode(); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 05581552ab604..3864d58a129e9 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -520,14 +520,6 @@ class RISCVTargetLowering : public TargetLowering { SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override; - /// Return true if the (vector) instruction I will be lowered to an instruction - /// with a scalar splat operand for the given Operand number. - bool canSplatOperand(Instruction *I, int Operand) const; - /// Return true if a vector instruction will lower to a target instruction - /// able to splat the given operand. 
- bool canSplatOperand(unsigned Opcode, int Operand) const; - bool shouldSinkOperands(Instruction *I, - SmallVectorImpl &Ops) const override; bool shouldScalarizeBinop(SDValue VecOp) const override; bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; int getLegalZfaFPImm(const APFloat &Imm, EVT VT) const; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index a61461681f79e..8d18fd63e4a2e 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1979,8 +1979,8 @@ InstructionCost RISCVTTIImpl::getArithmeticInstrCost( } auto getConstantMatCost = - [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost { - if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand)) + [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost { + if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand)) // Two sub-cases: // * Has a 5 bit immediate operand which can be splatted. // * Has a larger immediate which must be materialized in scalar register @@ -2294,3 +2294,141 @@ bool RISCVTTIImpl::shouldConsiderAddressTypePromotion( } return Considerable; } + +bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const { + switch (Opcode) { + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::FAdd: + case Instruction::FSub: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::ICmp: + case Instruction::FCmp: + return true; + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::Select: + return Operand == 1; + default: + return false; + } +} + +bool RISCVTTIImpl::canSplatOperand(Instruction *I, int Operand) const { + if (!I->getType()->isVectorTy() || !ST->hasVInstructions()) + return false; + + if (canSplatOperand(I->getOpcode(), Operand)) + return true; + + auto *II = dyn_cast(I); + if (!II) + return false; + + switch (II->getIntrinsicID()) { + case Intrinsic::fma: + case Intrinsic::vp_fma: + return Operand == 0 || Operand == 1; + case Intrinsic::vp_shl: + case Intrinsic::vp_lshr: + case Intrinsic::vp_ashr: + case Intrinsic::vp_udiv: + case Intrinsic::vp_sdiv: + case Intrinsic::vp_urem: + case Intrinsic::vp_srem: + case Intrinsic::ssub_sat: + case Intrinsic::vp_ssub_sat: + case Intrinsic::usub_sat: + case Intrinsic::vp_usub_sat: + return Operand == 1; + // These intrinsics are commutative. + case Intrinsic::vp_add: + case Intrinsic::vp_mul: + case Intrinsic::vp_and: + case Intrinsic::vp_or: + case Intrinsic::vp_xor: + case Intrinsic::vp_fadd: + case Intrinsic::vp_fmul: + case Intrinsic::vp_icmp: + case Intrinsic::vp_fcmp: + case Intrinsic::smin: + case Intrinsic::vp_smin: + case Intrinsic::umin: + case Intrinsic::vp_umin: + case Intrinsic::smax: + case Intrinsic::vp_smax: + case Intrinsic::umax: + case Intrinsic::vp_umax: + case Intrinsic::sadd_sat: + case Intrinsic::vp_sadd_sat: + case Intrinsic::uadd_sat: + case Intrinsic::vp_uadd_sat: + // These intrinsics have 'vr' versions. 
+ case Intrinsic::vp_sub: + case Intrinsic::vp_fsub: + case Intrinsic::vp_fdiv: + return Operand == 0 || Operand == 1; + default: + return false; + } +} + +/// Check if sinking \p I's operands to I's basic block is profitable, because +/// the operands can be folded into a target instruction, e.g. +/// splats of scalars can fold into vector instructions. +bool RISCVTTIImpl::isProfitableToSinkOperands( + Instruction *I, SmallVectorImpl &Ops) const { + using namespace llvm::PatternMatch; + + if (!I->getType()->isVectorTy() || !ST->hasVInstructions()) + return false; + + // Don't sink splat operands if the target prefers it. Some targets requires + // S2V transfer buffers and we can run out of them copying the same value + // repeatedly. + // FIXME: It could still be worth doing if it would improve vector register + // pressure and prevent a vector spill. + if (!ST->sinkSplatOperands()) + return false; + + for (auto OpIdx : enumerate(I->operands())) { + if (!canSplatOperand(I, OpIdx.index())) + continue; + + Instruction *Op = dyn_cast(OpIdx.value().get()); + // Make sure we are not already sinking this operand + if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; })) + continue; + + // We are looking for a splat that can be sunk. + if (!match(Op, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()), + m_Undef(), m_ZeroMask()))) + continue; + + // Don't sink i1 splats. + if (cast(Op->getType())->getElementType()->isIntegerTy(1)) + continue; + + // All uses of the shuffle should be sunk to avoid duplicating it across gpr + // and vector registers + for (Use &U : Op->uses()) { + Instruction *Insn = cast(U.getUser()); + if (!canSplatOperand(Insn, U.getOperandNo())) + return false; + } + + Ops.push_back(&Op->getOperandUse(0)); + Ops.push_back(&OpIdx.value()); + } + return true; +} \ No newline at end of file diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 65bbd90550855..3f50bd86b9b3b 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -412,6 +412,15 @@ class RISCVTTIImpl : public BasicTTIImplBase { shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader); std::optional getMinPageSize() const { return 4096; } + /// Return true if the (vector) instruction I will be lowered to an + /// instruction with a scalar splat operand for the given Operand number. + bool canSplatOperand(Instruction *I, int Operand) const; + /// Return true if a vector instruction will lower to a target instruction + /// able to splat the given operand. + bool canSplatOperand(unsigned Opcode, int Operand) const; + + bool isProfitableToSinkOperands(Instruction *I, + SmallVectorImpl &Ops) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index fa78bf38f426c..5f76d666823e2 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -843,30 +843,6 @@ bool WebAssemblyTargetLowering::isOffsetFoldingLegal( return isa(GV) ? false : TargetLowering::isOffsetFoldingLegal(GA); } -bool WebAssemblyTargetLowering::shouldSinkOperands( - Instruction *I, SmallVectorImpl &Ops) const { - using namespace llvm::PatternMatch; - - if (!I->getType()->isVectorTy() || !I->isShift()) - return false; - - Value *V = I->getOperand(1); - // We dont need to sink constant splat. 
- if (dyn_cast(V)) - return false; - - if (match(V, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()), - m_Value(), m_ZeroMask()))) { - // Sink insert - Ops.push_back(&cast(V)->getOperandUse(0)); - // Sink shuffle - Ops.push_back(&I->getOperandUse(1)); - return true; - } - - return false; -} - EVT WebAssemblyTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C, EVT VT) const { diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h index 7d9cfb7739e43..139b064aa0423 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -76,8 +76,6 @@ class WebAssemblyTargetLowering final : public TargetLowering { bool isIntDivCheap(EVT VT, AttributeList Attr) const override; bool isVectorLoadExtDesirable(SDValue ExtVal) const override; bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; - bool shouldSinkOperands(Instruction *I, - SmallVectorImpl &Ops) const override; EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index b109594811d97..9fe5e5f27f8da 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -154,3 +154,27 @@ void WebAssemblyTTIImpl::getUnrollingPreferences( bool WebAssemblyTTIImpl::supportsTailCalls() const { return getST()->hasTailCall(); } + +bool WebAssemblyTTIImpl::isProfitableToSinkOperands( + Instruction *I, SmallVectorImpl &Ops) const { + using namespace llvm::PatternMatch; + + if (!I->getType()->isVectorTy() || !I->isShift()) + return false; + + Value *V = I->getOperand(1); + // We dont need to sink constant splat. + if (dyn_cast(V)) + return false; + + if (match(V, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()), + m_Value(), m_ZeroMask()))) { + // Sink insert + Ops.push_back(&cast(V)->getOperandUse(0)); + // Sink shuffle + Ops.push_back(&I->getOperandUse(1)); + return true; + } + + return false; +} diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h index ac3a333991684..2ce6cbf3ba026 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -71,12 +71,16 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase { TTI::ReductionShuffle getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const; - /// @} bool areInlineCompatible(const Function *Caller, const Function *Callee) const; bool supportsTailCalls() const; + + bool isProfitableToSinkOperands(Instruction *I, + SmallVectorImpl &Ops) const; + + /// @} }; } // end namespace llvm diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index ddbe82b1de5cf..70f06b8d3a5f2 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -34671,29 +34671,6 @@ bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL, return true; } -bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const { - unsigned Bits = Ty->getScalarSizeInBits(); - - // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts. 
- // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred. - if (Subtarget.hasXOP() && - (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64)) - return false; - - // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable - // shifts just as cheap as scalar ones. - if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64)) - return false; - - // AVX512BW has shifts such as vpsllvw. - if (Subtarget.hasBWI() && Bits == 16) - return false; - - // Otherwise, it's significantly cheaper to shift by a scalar amount than by a - // fully general vector. - return true; -} - bool X86TargetLowering::isBinOp(unsigned Opcode) const { switch (Opcode) { // These are non-commutative binops. @@ -34808,63 +34785,6 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { return false; } -bool X86TargetLowering::shouldSinkOperands(Instruction *I, - SmallVectorImpl &Ops) const { - using namespace llvm::PatternMatch; - - FixedVectorType *VTy = dyn_cast(I->getType()); - if (!VTy) - return false; - - if (I->getOpcode() == Instruction::Mul && - VTy->getElementType()->isIntegerTy(64)) { - for (auto &Op : I->operands()) { - // Make sure we are not already sinking this operand - if (any_of(Ops, [&](Use *U) { return U->get() == Op; })) - continue; - - // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or - // the PMULUDQ pattern where the input is a zext_inreg from vXi32. - if (Subtarget.hasSSE41() && - match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)), - m_SpecificInt(32)))) { - Ops.push_back(&cast(Op)->getOperandUse(0)); - Ops.push_back(&Op); - } else if (Subtarget.hasSSE2() && - match(Op.get(), - m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) { - Ops.push_back(&Op); - } - } - - return !Ops.empty(); - } - - // A uniform shift amount in a vector shift or funnel shift may be much - // cheaper than a generic variable vector shift, so make that pattern visible - // to SDAG by sinking the shuffle instruction next to the shift. - int ShiftAmountOpNum = -1; - if (I->isShift()) - ShiftAmountOpNum = 1; - else if (auto *II = dyn_cast(I)) { - if (II->getIntrinsicID() == Intrinsic::fshl || - II->getIntrinsicID() == Intrinsic::fshr) - ShiftAmountOpNum = 2; - } - - if (ShiftAmountOpNum == -1) - return false; - - auto *Shuf = dyn_cast(I->getOperand(ShiftAmountOpNum)); - if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 && - isVectorShiftByScalarCheap(I->getType())) { - Ops.push_back(&I->getOperandUse(ShiftAmountOpNum)); - return true; - } - - return false; -} - bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const { if (!Subtarget.is64Bit()) return false; diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 0ab42f032c3ea..a2515ff35e692 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1404,10 +1404,6 @@ namespace llvm { bool isLegalStoreImmediate(int64_t Imm) const override; - /// This is used to enable splatted operand transforms for vector shifts - /// and vector funnel shifts. - bool isVectorShiftByScalarCheap(Type *Ty) const override; - /// Add x86-specific opcodes to the default list. 
bool isBinOp(unsigned Opcode) const override; @@ -1434,8 +1430,6 @@ namespace llvm { bool isZExtFree(EVT VT1, EVT VT2) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; - bool shouldSinkOperands(Instruction *I, - SmallVectorImpl &Ops) const override; bool shouldConvertPhiType(Type *From, Type *To) const override; /// Return true if folding a vector load into ExtVal (a sign, zero, or any diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index aa84e3887c389..413ef0136d5c0 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -6900,3 +6900,82 @@ InstructionCost X86TTIImpl::getBranchMispredictPenalty() const { // TODO: Hook MispredictPenalty of SchedMachineModel into this. return 14; } + +bool X86TTIImpl::isVectorShiftByScalarCheap(Type *Ty) const { + unsigned Bits = Ty->getScalarSizeInBits(); + + // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts. + // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred. + if (ST->hasXOP() && (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64)) + return false; + + // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable + // shifts just as cheap as scalar ones. + if (ST->hasAVX2() && (Bits == 32 || Bits == 64)) + return false; + + // AVX512BW has shifts such as vpsllvw. + if (ST->hasBWI() && Bits == 16) + return false; + + // Otherwise, it's significantly cheaper to shift by a scalar amount than by a + // fully general vector. + return true; +} + +bool X86TTIImpl::isProfitableToSinkOperands(Instruction *I, + SmallVectorImpl &Ops) const { + using namespace llvm::PatternMatch; + + FixedVectorType *VTy = dyn_cast(I->getType()); + if (!VTy) + return false; + + if (I->getOpcode() == Instruction::Mul && + VTy->getElementType()->isIntegerTy(64)) { + for (auto &Op : I->operands()) { + // Make sure we are not already sinking this operand + if (any_of(Ops, [&](Use *U) { return U->get() == Op; })) + continue; + + // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or + // the PMULUDQ pattern where the input is a zext_inreg from vXi32. + if (ST->hasSSE41() && + match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)), + m_SpecificInt(32)))) { + Ops.push_back(&cast(Op)->getOperandUse(0)); + Ops.push_back(&Op); + } else if (ST->hasSSE2() && + match(Op.get(), + m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) { + Ops.push_back(&Op); + } + } + + return !Ops.empty(); + } + + // A uniform shift amount in a vector shift or funnel shift may be much + // cheaper than a generic variable vector shift, so make that pattern visible + // to SDAG by sinking the shuffle instruction next to the shift. 
+ int ShiftAmountOpNum = -1; + if (I->isShift()) + ShiftAmountOpNum = 1; + else if (auto *II = dyn_cast(I)) { + if (II->getIntrinsicID() == Intrinsic::fshl || + II->getIntrinsicID() == Intrinsic::fshr) + ShiftAmountOpNum = 2; + } + + if (ShiftAmountOpNum == -1) + return false; + + auto *Shuf = dyn_cast(I->getOperand(ShiftAmountOpNum)); + if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 && + isVectorShiftByScalarCheap(I->getType())) { + Ops.push_back(&I->getOperandUse(ShiftAmountOpNum)); + return true; + } + + return false; +} diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index c16461b157e07..0100f328ab4bd 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -297,6 +297,11 @@ class X86TTIImpl : public BasicTTIImplBase { InstructionCost getBranchMispredictPenalty() const; + bool isProfitableToSinkOperands(Instruction *I, + SmallVectorImpl &Ops) const; + + bool isVectorShiftByScalarCheap(Type *Ty) const; + private: bool supportsGather() const; InstructionCost getGSVectorCost(unsigned Opcode, TTI::TargetCostKind CostKind, From 115cb402d8ed91f94d22afcc4c2c9ed9def53cc7 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Wed, 9 Oct 2024 14:31:16 -0700 Subject: [PATCH 067/119] [WebAssembly] Don't fold non-nuw add/sub in FastISel (#111278) We should not fold one of add/sub operands into a load/store's offset when `nuw` (no unsigned wrap) is not present, because the address calculation, which adds the offset with the operand, does not wrap. This is handled correctly in the normal ISel: https://github.com/llvm/llvm-project/blob/6de5305b3d7a4a19a29b35d481a8090e2a6d3a7e/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp#L328-L332 but not in FastISel. This positivity check in FastISel is not sufficient to avoid this case fully: https://github.com/llvm/llvm-project/blob/6de5305b3d7a4a19a29b35d481a8090e2a6d3a7e/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp#L348-L352 because 1. Even if RHS is within signed int range, depending on the value of the LHS, the resulting value can exceed uint32 max. 2. When one of the operands is a label, `Address` can contain a `GlobalValue` and a `Reg` at the same time, so the `GlobalValue` becomes incorrectly an offset: https://github.com/llvm/llvm-project/blob/6de5305b3d7a4a19a29b35d481a8090e2a6d3a7e/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp#L53-L69 https://github.com/llvm/llvm-project/blob/6de5305b3d7a4a19a29b35d481a8090e2a6d3a7e/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp#L409-L417 Both cases are in the newly added test. We should handle `SUB` too because `SUB` is the same as `ADD` when RHS's sign changes. I checked why our current normal ISel only handles `ADD`, and the reason it's OK for the normal ISel to handle only `ADD` seems that DAGCombiner replaces `SUB` with `ADD` here: https://github.com/llvm/llvm-project/blob/6de5305b3d7a4a19a29b35d481a8090e2a6d3a7e/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp#L3904-L3907 Fixes #111018. 
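To make the hazard concrete, here is a small illustrative C++ sketch (not part of this
patch; the specific values are made up) showing how an offset that passes a signed
non-negativity check can still disagree with WebAssembly's non-wrapping
effective-address computation:

  #include <cstdint>
  #include <cstdio>

  int main() {
    uint32_t Base = 0x80000008;      // hypothetical runtime pointer (LHS of the add)
    uint32_t Imm = 2147483644;       // positive, so it passes a signed >= 0 check
    uint32_t WrappedIR = Base + Imm; // a non-'nuw' i32 add wraps: 0x00000004
    uint64_t WasmEA = (uint64_t)Base + Imm; // wasm base + offset does not wrap
    std::printf("wrapped i32 address: %#x, wasm effective address: %#llx\n",
                WrappedIR, (unsigned long long)WasmEA);
    // The two disagree whenever Base + Imm >= 2^32, which is why FastISel must
    // bail out instead of folding the operand into the load/store offset.
    return 0;
  }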
--- .../WebAssembly/WebAssemblyFastISel.cpp | 12 ++ .../WebAssembly/fast-isel-no-offset.ll | 106 ++++++++++++++++++ .../CodeGen/WebAssembly/fast-isel-pr47040.ll | 2 +- 3 files changed, 119 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/WebAssembly/fast-isel-no-offset.ll diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 317c6463985dc..7c90fff2a5c1d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -337,6 +337,12 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) { break; } case Instruction::Add: { + // We should not fold operands into an offset when 'nuw' (no unsigned wrap) + // is not present, because the address calculation does not wrap. + if (auto *OFBinOp = dyn_cast(U)) + if (!OFBinOp->hasNoUnsignedWrap()) + break; + // Adds of constants are common and easy enough. const Value *LHS = U->getOperand(0); const Value *RHS = U->getOperand(1); @@ -360,6 +366,12 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) { break; } case Instruction::Sub: { + // We should not fold operands into an offset when 'nuw' (no unsigned wrap) + // is not present, because the address calculation does not wrap. + if (auto *OFBinOp = dyn_cast(U)) + if (!OFBinOp->hasNoUnsignedWrap()) + break; + // Subs of constants are common and easy enough. const Value *LHS = U->getOperand(0); const Value *RHS = U->getOperand(1); diff --git a/llvm/test/CodeGen/WebAssembly/fast-isel-no-offset.ll b/llvm/test/CodeGen/WebAssembly/fast-isel-no-offset.ll new file mode 100644 index 0000000000000..d4ba1f3bc4a45 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/fast-isel-no-offset.ll @@ -0,0 +1,106 @@ +; RUN: llc < %s -asm-verbose=false -fast-isel -fast-isel-abort=1 -verify-machineinstrs | FileCheck %s + +target triple = "wasm32-unknown-unknown" + +; FastISel should not fold one of the add/sub operands into a load/store's +; offset when 'nuw' (no unsigned wrap) is not present, because the address +; calculation does not wrap. When there is an add/sub and nuw is not present, we +; bail out of FastISel. 
+ +@mylabel = external global ptr + +; CHECK-LABEL: dont_fold_non_nuw_add_load: +; CHECK: local.get 0 +; CHECK-NEXT: i32.const 2147483644 +; CHECK-NEXT: i32.add +; CHECK-NEXT: i32.load 0 +define i32 @dont_fold_non_nuw_add_load(ptr %p) { + %q = ptrtoint ptr %p to i32 + %r = add i32 %q, 2147483644 + %s = inttoptr i32 %r to ptr + %t = load i32, ptr %s + ret i32 %t +} + +; CHECK-LABEL: dont_fold_non_nuw_add_store: +; CHECK: local.get 0 +; CHECK-NEXT: i32.const 2147483644 +; CHECK-NEXT: i32.add +; CHECK-NEXT: i32.const 5 +; CHECK-NEXT: i32.store 0 +define void @dont_fold_non_nuw_add_store(ptr %p) { + %q = ptrtoint ptr %p to i32 + %r = add i32 %q, 2147483644 + %s = inttoptr i32 %r to ptr + store i32 5, ptr %s + ret void +} + +; CHECK-LABEL: dont_fold_non_nuw_add_load_2: +; CHECK: i32.const mylabel +; CHECK-NEXT: i32.const -4 +; CHECK-NEXT: i32.add +; CHECK-NEXT: i32.load 0 +define i32 @dont_fold_non_nuw_add_load_2() { + %t = load i32, ptr inttoptr (i32 add (i32 ptrtoint (ptr @mylabel to i32), i32 -4) to ptr), align 4 + ret i32 %t +} + +; CHECK-LABEL: dont_fold_non_nuw_add_store_2: +; CHECK: i32.const mylabel +; CHECK-NEXT: i32.const -4 +; CHECK-NEXT: i32.add +; CHECK-NEXT: i32.const 5 +; CHECK-NEXT: i32.store 0 +define void @dont_fold_non_nuw_add_store_2() { + store i32 5, ptr inttoptr (i32 add (i32 ptrtoint (ptr @mylabel to i32), i32 -4) to ptr), align 4 + ret void +} + +; CHECK-LABEL: dont_fold_non_nuw_sub_load: +; CHECK: local.get 0 +; CHECK-NEXT: i32.const -2147483644 +; CHECK-NEXT: i32.sub +; CHECK-NEXT: i32.load 0 +define i32 @dont_fold_non_nuw_sub_load(ptr %p) { + %q = ptrtoint ptr %p to i32 + %r = sub i32 %q, -2147483644 + %s = inttoptr i32 %r to ptr + %t = load i32, ptr %s + ret i32 %t +} + +; CHECK-LABEL: dont_fold_non_nuw_sub_store: +; CHECK: local.get 0 +; CHECK-NEXT: i32.const -2147483644 +; CHECK-NEXT: i32.sub +; CHECK-NEXT: i32.const 5 +; CHECK-NEXT: i32.store 0 +define void @dont_fold_non_nuw_sub_store(ptr %p) { + %q = ptrtoint ptr %p to i32 + %r = sub i32 %q, -2147483644 + %s = inttoptr i32 %r to ptr + store i32 5, ptr %s + ret void +} + +; CHECK-LABEL: dont_fold_non_nuw_sub_load_2: +; CHECK: i32.const mylabel +; CHECK-NEXT: i32.const 4 +; CHECK-NEXT: i32.sub +; CHECK-NEXT: i32.load 0 +define i32 @dont_fold_non_nuw_sub_load_2() { + %t = load i32, ptr inttoptr (i32 sub (i32 ptrtoint (ptr @mylabel to i32), i32 4) to ptr), align 4 + ret i32 %t +} + +; CHECK-LABEL: dont_fold_non_nuw_sub_store_2: +; CHECK: i32.const mylabel +; CHECK-NEXT: i32.const 4 +; CHECK-NEXT: i32.sub +; CHECK-NEXT: i32.const 5 +; CHECK-NEXT: i32.store 0 +define void @dont_fold_non_nuw_sub_store_2() { + store i32 5, ptr inttoptr (i32 sub (i32 ptrtoint (ptr @mylabel to i32), i32 4) to ptr), align 4 + ret void +} diff --git a/llvm/test/CodeGen/WebAssembly/fast-isel-pr47040.ll b/llvm/test/CodeGen/WebAssembly/fast-isel-pr47040.ll index 6a1304cb9a93a..75cb5b66b3ebe 100644 --- a/llvm/test/CodeGen/WebAssembly/fast-isel-pr47040.ll +++ b/llvm/test/CodeGen/WebAssembly/fast-isel-pr47040.ll @@ -14,7 +14,7 @@ target triple = "wasm32-unknown-unknown" define i32 @foo() { %stack_addr = alloca i32 %stack_i = ptrtoint ptr %stack_addr to i32 - %added = add i32 %stack_i, undef + %added = add nuw i32 %stack_i, undef %added_addr = inttoptr i32 %added to ptr %ret = load i32, ptr %added_addr ret i32 %ret From ac3321f104ae2a0639845f860b05c97568bb24e2 Mon Sep 17 00:00:00 2001 From: Chelsea Cassanova Date: Wed, 9 Oct 2024 14:41:50 -0700 Subject: [PATCH 068/119] [lldb] Add missing include to SBLanguages.h (#111763) SBLanguages.h uses a 
uint16_t but is missing the include for ``, if any file includes this without including that it will cause a build error so this commit adds this include. --- lldb/scripts/generate-sbapi-dwarf-enum.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lldb/scripts/generate-sbapi-dwarf-enum.py b/lldb/scripts/generate-sbapi-dwarf-enum.py index 7fd6037986317..689c7f81bfda0 100755 --- a/lldb/scripts/generate-sbapi-dwarf-enum.py +++ b/lldb/scripts/generate-sbapi-dwarf-enum.py @@ -16,6 +16,8 @@ #ifndef LLDB_API_SBLANGUAGE_H #define LLDB_API_SBLANGUAGE_H +#include + namespace lldb { /// Used by \\ref SBExpressionOptions. /// These enumerations use the same language enumerations as the DWARF From 91dd4ec20e8371ea5e920f5493688e13306a67d2 Mon Sep 17 00:00:00 2001 From: Krystian Stasiowski Date: Wed, 9 Oct 2024 15:43:55 -0600 Subject: [PATCH 069/119] Revert "[clang] Track function template instantiation from definition (#110387)" (#111764) This reverts commit 4336f00f2156970cc0af2816331387a0a4039317. --- clang/docs/ReleaseNotes.rst | 1 - clang/include/clang/AST/Decl.h | 7 -- clang/include/clang/AST/DeclBase.h | 10 +- clang/include/clang/AST/DeclTemplate.h | 9 -- clang/include/clang/Sema/Sema.h | 6 -- clang/lib/AST/Decl.cpp | 1 - clang/lib/Sema/SemaTemplateDeduction.cpp | 17 ++- clang/lib/Sema/SemaTemplateInstantiate.cpp | 17 +-- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 22 +--- clang/lib/Serialization/ASTReaderDecl.cpp | 1 - clang/lib/Serialization/ASTWriterDecl.cpp | 3 +- clang/test/SemaTemplate/GH55509.cpp | 101 ------------------ 12 files changed, 26 insertions(+), 169 deletions(-) delete mode 100644 clang/test/SemaTemplate/GH55509.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index a4bb303a2bc42..29b9fe07f545f 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -468,7 +468,6 @@ Bug Fixes to C++ Support - Fixed an assertion failure in debug mode, and potential crashes in release mode, when diagnosing a failed cast caused indirectly by a failed implicit conversion to the type of the constructor parameter. - Fixed an assertion failure by adjusting integral to boolean vector conversions (#GH108326) -- Clang is now better at keeping track of friend function template instance contexts. (#GH55509) - Fixed an issue deducing non-type template arguments of reference type. (#GH73460) - Fixed an issue in constraint evaluation, where type constraints on the lambda expression containing outer unexpanded parameters were not correctly expanded. (#GH101754) diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index 6afc86710a813..7ff35d73df599 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -2299,13 +2299,6 @@ class FunctionDecl : public DeclaratorDecl, FunctionDeclBits.IsLateTemplateParsed = ILT; } - bool isInstantiatedFromMemberTemplate() const { - return FunctionDeclBits.IsInstantiatedFromMemberTemplate; - } - void setInstantiatedFromMemberTemplate(bool Val = true) { - FunctionDeclBits.IsInstantiatedFromMemberTemplate = Val; - } - /// Whether this function is "trivial" in some specialized C++ senses. /// Can only be true for default constructors, copy constructors, /// copy assignment operators, and destructors. 
Not meaningful until diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h index eb67dc03157e6..ee662ed73d7e0 100644 --- a/clang/include/clang/AST/DeclBase.h +++ b/clang/include/clang/AST/DeclBase.h @@ -1763,8 +1763,6 @@ class DeclContext { uint64_t HasImplicitReturnZero : 1; LLVM_PREFERRED_TYPE(bool) uint64_t IsLateTemplateParsed : 1; - LLVM_PREFERRED_TYPE(bool) - uint64_t IsInstantiatedFromMemberTemplate : 1; /// Kind of contexpr specifier as defined by ConstexprSpecKind. LLVM_PREFERRED_TYPE(ConstexprSpecKind) @@ -1815,7 +1813,7 @@ class DeclContext { }; /// Number of inherited and non-inherited bits in FunctionDeclBitfields. - enum { NumFunctionDeclBits = NumDeclContextBits + 32 }; + enum { NumFunctionDeclBits = NumDeclContextBits + 31 }; /// Stores the bits used by CXXConstructorDecl. If modified /// NumCXXConstructorDeclBits and the accessor @@ -1826,12 +1824,12 @@ class DeclContext { LLVM_PREFERRED_TYPE(FunctionDeclBitfields) uint64_t : NumFunctionDeclBits; - /// 19 bits to fit in the remaining available space. + /// 20 bits to fit in the remaining available space. /// Note that this makes CXXConstructorDeclBitfields take /// exactly 64 bits and thus the width of NumCtorInitializers /// will need to be shrunk if some bit is added to NumDeclContextBitfields, /// NumFunctionDeclBitfields or CXXConstructorDeclBitfields. - uint64_t NumCtorInitializers : 16; + uint64_t NumCtorInitializers : 17; LLVM_PREFERRED_TYPE(bool) uint64_t IsInheritingConstructor : 1; @@ -1845,7 +1843,7 @@ class DeclContext { }; /// Number of inherited and non-inherited bits in CXXConstructorDeclBitfields. - enum { NumCXXConstructorDeclBits = NumFunctionDeclBits + 19 }; + enum { NumCXXConstructorDeclBits = NumFunctionDeclBits + 20 }; /// Stores the bits used by ObjCMethodDecl. /// If modified NumObjCMethodDeclBits and the accessor diff --git a/clang/include/clang/AST/DeclTemplate.h b/clang/include/clang/AST/DeclTemplate.h index 2fb49ec1aea0d..05739f39d2a49 100644 --- a/clang/include/clang/AST/DeclTemplate.h +++ b/clang/include/clang/AST/DeclTemplate.h @@ -1008,15 +1008,6 @@ class FunctionTemplateDecl : public RedeclarableTemplateDecl { return getTemplatedDecl()->isThisDeclarationADefinition(); } - bool isCompatibleWithDefinition() const { - return getTemplatedDecl()->isInstantiatedFromMemberTemplate() || - isThisDeclarationADefinition(); - } - void setInstantiatedFromMemberTemplate(FunctionTemplateDecl *D) { - getTemplatedDecl()->setInstantiatedFromMemberTemplate(); - RedeclarableTemplateDecl::setInstantiatedFromMemberTemplate(D); - } - /// Return the specialization with the provided arguments if it exists, /// otherwise return the insertion point. FunctionDecl *findSpecialization(ArrayRef Args, diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 86053bd7da172..67a6dbeb520a8 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -13017,12 +13017,6 @@ class Sema final : public SemaBase { std::optional> Innermost = std::nullopt, bool RelativeToPrimary = false, bool ForConstraintInstantiation = false); - void getTemplateInstantiationArgs( - MultiLevelTemplateArgumentList &Result, const NamedDecl *D, - const DeclContext *DC = nullptr, bool Final = false, - std::optional> Innermost = std::nullopt, - bool RelativeToPrimary = false, bool ForConstraintInstantiation = false); - /// RAII object to handle the state changes required to synthesize /// a function body. 
class SynthesizedFunctionScope { diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 8f54b5f1589d4..58d11a0312c50 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -3067,7 +3067,6 @@ FunctionDecl::FunctionDecl(Kind DK, ASTContext &C, DeclContext *DC, FunctionDeclBits.IsIneligibleOrNotSelected = false; FunctionDeclBits.HasImplicitReturnZero = false; FunctionDeclBits.IsLateTemplateParsed = false; - FunctionDeclBits.IsInstantiatedFromMemberTemplate = false; FunctionDeclBits.ConstexprKind = static_cast(ConstexprKind); FunctionDeclBits.BodyContainsImmediateEscalatingExpression = false; FunctionDeclBits.InstantiationIsPending = false; diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index dfae0d6cda0d9..aa62cfa7dcbd1 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -3928,7 +3928,22 @@ TemplateDeductionResult Sema::FinishTemplateArgumentDeduction( if (FunctionTemplate->getFriendObjectKind()) Owner = FunctionTemplate->getLexicalDeclContext(); FunctionDecl *FD = FunctionTemplate->getTemplatedDecl(); - + // additional check for inline friend, + // ``` + // template int foo(F1 X); + // template struct A { + // template friend int foo(F1 X) { return A1; } + // }; + // template struct A<1>; + // int a = foo(1.0); + // ``` + const FunctionDecl *FDFriend; + if (FD->getFriendObjectKind() == Decl::FriendObjectKind::FOK_None && + FD->isDefined(FDFriend, /*CheckForPendingFriendDefinition*/ true) && + FDFriend->getFriendObjectKind() != Decl::FriendObjectKind::FOK_None) { + FD = const_cast(FDFriend); + Owner = FD->getLexicalDeclContext(); + } MultiLevelTemplateArgumentList SubstArgs( FunctionTemplate, CanonicalDeducedArgumentList->asArray(), /*Final=*/false); diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index 261ef4edf1759..5b5e50f668b25 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -512,13 +512,13 @@ struct TemplateInstantiationArgumentCollecter } // namespace -void Sema::getTemplateInstantiationArgs( - MultiLevelTemplateArgumentList &Result, const NamedDecl *ND, - const DeclContext *DC, bool Final, +MultiLevelTemplateArgumentList Sema::getTemplateInstantiationArgs( + const NamedDecl *ND, const DeclContext *DC, bool Final, std::optional> Innermost, bool RelativeToPrimary, bool ForConstraintInstantiation) { assert((ND || DC) && "Can't find arguments for a decl if one isn't provided"); // Accumulate the set of template argument lists in this structure. + MultiLevelTemplateArgumentList Result; const Decl *CurDecl = ND; if (!CurDecl) @@ -529,17 +529,6 @@ void Sema::getTemplateInstantiationArgs( do { CurDecl = Collecter.Visit(const_cast(CurDecl)); } while (CurDecl); -} - -MultiLevelTemplateArgumentList Sema::getTemplateInstantiationArgs( - const NamedDecl *ND, const DeclContext *DC, bool Final, - std::optional> Innermost, bool RelativeToPrimary, - bool ForConstraintInstantiation) { - assert((ND || DC) && "Can't find arguments for a decl if one isn't provided"); - // Accumulate the set of template argument lists in this structure. 
- MultiLevelTemplateArgumentList Result; - getTemplateInstantiationArgs(Result, ND, DC, Final, Innermost, - RelativeToPrimary, ForConstraintInstantiation); return Result; } diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 8cdf0b17d2dd2..74f2152e441e1 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -5224,26 +5224,8 @@ void Sema::InstantiateFunctionDefinition(SourceLocation PointOfInstantiation, RebuildTypeSourceInfoForDefaultSpecialMembers(); SetDeclDefaulted(Function, PatternDecl->getLocation()); } else { - DeclContext *DC = Function; - MultiLevelTemplateArgumentList TemplateArgs; - if (auto *Primary = Function->getPrimaryTemplate(); - Primary && - !isGenericLambdaCallOperatorOrStaticInvokerSpecialization(Function)) { - auto It = llvm::find_if(Primary->redecls(), - [](const RedeclarableTemplateDecl *RTD) { - return cast(RTD) - ->isCompatibleWithDefinition(); - }); - assert(It != Primary->redecls().end() && - "Should't get here without a definition"); - DC = (*It)->getLexicalDeclContext(); - if (Function->getTemplateSpecializationKind() != - TSK_ExplicitSpecialization) - TemplateArgs.addOuterTemplateArguments( - Function, Function->getTemplateSpecializationArgs()->asArray(), - /*Final=*/false); - } - getTemplateInstantiationArgs(TemplateArgs, /*D=*/nullptr, DC); + MultiLevelTemplateArgumentList TemplateArgs = getTemplateInstantiationArgs( + Function, Function->getLexicalDeclContext()); // Substitute into the qualifier; we can get a substitution failure here // through evil use of alias templates. diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index a44df84a8bcef..1ccc810f415eb 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -1087,7 +1087,6 @@ void ASTDeclReader::VisitFunctionDecl(FunctionDecl *FD) { FD->setHasImplicitReturnZero(FunctionDeclBits.getNextBit()); FD->setIsMultiVersion(FunctionDeclBits.getNextBit()); FD->setLateTemplateParsed(FunctionDeclBits.getNextBit()); - FD->setInstantiatedFromMemberTemplate(FunctionDeclBits.getNextBit()); FD->setFriendConstraintRefersToEnclosingTemplate( FunctionDeclBits.getNextBit()); FD->setUsesSEHTry(FunctionDeclBits.getNextBit()); diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index dec93317dc7b3..f21cbd11b6ab8 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -626,7 +626,7 @@ void ASTDeclWriter::VisitDeclaratorDecl(DeclaratorDecl *D) { } void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) { - static_assert(DeclContext::NumFunctionDeclBits == 45, + static_assert(DeclContext::NumFunctionDeclBits == 44, "You need to update the serializer after you change the " "FunctionDeclBits"); @@ -732,7 +732,6 @@ void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) { FunctionDeclBits.addBit(D->hasImplicitReturnZero()); FunctionDeclBits.addBit(D->isMultiVersion()); FunctionDeclBits.addBit(D->isLateTemplateParsed()); - FunctionDeclBits.addBit(D->isInstantiatedFromMemberTemplate()); FunctionDeclBits.addBit(D->FriendConstraintRefersToEnclosingTemplate()); FunctionDeclBits.addBit(D->usesSEHTry()); Record.push_back(FunctionDeclBits); diff --git a/clang/test/SemaTemplate/GH55509.cpp b/clang/test/SemaTemplate/GH55509.cpp deleted file mode 100644 index f95833fbed7b1..0000000000000 --- a/clang/test/SemaTemplate/GH55509.cpp +++ 
/dev/null @@ -1,101 +0,0 @@ -// RUN: %clang_cc1 -fsyntax-only -verify -std=c++26 %s - -namespace t1 { - template struct A { - template friend auto cica(const A&, C) { - return N; - } - }; - - template<> struct A<0> { - template friend auto cica(const A<0>&, C); - // expected-note@-1 {{declared here}} - }; - - void test() { - cica(A<0>{}, 0); - // expected-error@-1 {{function 'cica' with deduced return type cannot be used before it is defined}} - - (void)A<1>{}; - cica(A<0>{}, 0); - } -} // namespace t1 -namespace t2 { - template struct A { - template friend auto cica(const A&, C) { - return N; - } - }; - - template<> struct A<0> { - template friend auto cica(const A<0>&, C); - }; - - template {}, nullptr))> - void MakeCica(); - // expected-note@-1 {{candidate function}} - - template void MakeCica(A = {}); - // expected-note@-1 {{candidate function}} - - void test() { - MakeCica<0>(); - - MakeCica<0>(); - // expected-error@-1 {{call to 'MakeCica' is ambiguous}} - } -} // namespace t2 -namespace t3 { - template struct A { - template friend auto cica(const A&, C) { - return N-1; - } - }; - - template<> struct A<0> { - template friend auto cica(const A<0>&, C); - }; - - template - static constexpr bool MakeCica(int); - - template - static constexpr bool MakeCica(short, A = {}); - - template , class Val = decltype(MakeCica(0))> - static constexpr bool has_cica = Val{}; - - constexpr bool cica2 = has_cica<0> || has_cica<0>; -} // namespace t3 -namespace t4 { - template struct A { - template friend auto cica(const A&, C); - }; - - template<> struct A<0> { - template friend auto cica(const A<0>&, C) { - C a; - } - }; - - template struct A<1>; - - void test() { - cica(A<0>{}, 0); - } -} // namespace t4 -namespace regression1 { - template class A; - - template [[gnu::abi_tag("TAG")]] void foo(A); - - template struct A { - friend void foo <>(A); - }; - - template struct A; - - template [[gnu::abi_tag("TAG")]] void foo(A) {} - - template void foo(A); -} // namespace regression1 From c55d68fcc67d70235d6e4b75fe3879ab4d24a6b6 Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Wed, 9 Oct 2024 14:49:09 -0700 Subject: [PATCH 070/119] [clang][deps] Serialize JSON without creating intermediate objects (#111734) The dependency scanner uses the `llvm::json` library for outputting the dependency information. Until now, it created an in-memory representation of the dependency graph using the `llvm::json::Object` hierarchy. This not only creates unnecessary copies of the data, but also forces lexicographical ordering of attributes in the output, both of which I'd like to avoid. This patch adopts the `llvm::json::OStream` API instead and reorders the attribute printing logic such that the existing lexicographical ordering is preserved (for now). 
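As a rough illustration of the API change (this sketch is not taken from the scanner;
the JSON field names and values are invented), the before/after styles look roughly
like this:

  #include "llvm/Support/JSON.h"
  #include "llvm/Support/raw_ostream.h"

  // Old style: build an in-memory tree; llvm::json prints object keys in
  // lexicographic order, so the writer cannot control attribute order.
  void emitWithObject(llvm::raw_ostream &OS) {
    llvm::json::Object O{{"name", "mod"}, {"context-hash", "abc"}};
    OS << llvm::json::Value(std::move(O)) << "\n";
  }

  // New style: stream directly; attributes are emitted exactly in the order
  // they are written, with no intermediate copy of the data.
  void emitWithOStream(llvm::raw_ostream &OS) {
    llvm::json::OStream JOS(OS, /*IndentSize=*/2);
    JOS.object([&] {
      JOS.attribute("context-hash", "abc");
      JOS.attribute("name", "mod");
    });
    OS << "\n";
  }

Because OStream leaves ordering to the caller, the patch spells the attributes out in
lexicographic order itself so the textual output stays the same for now.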
--- clang/tools/clang-scan-deps/ClangScanDeps.cpp | 159 ++++++++++-------- 1 file changed, 87 insertions(+), 72 deletions(-) diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp index b642a37c79e98..7d36cee7a22b3 100644 --- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp +++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp @@ -330,38 +330,46 @@ handleMakeDependencyToolResult(const std::string &Input, return false; } -static llvm::json::Array toJSONSorted(const llvm::StringSet<> &Set) { - std::vector Strings; - for (auto &&I : Set) - Strings.push_back(I.getKey()); +template +static auto toJSONStrings(llvm::json::OStream &JOS, Container &&Strings) { + return [&JOS, Strings = std::forward(Strings)] { + for (StringRef Str : Strings) + JOS.value(Str); + }; +} + +static auto toJSONSorted(llvm::json::OStream &JOS, + const llvm::StringSet<> &Set) { + SmallVector Strings(Set.keys()); llvm::sort(Strings); - return llvm::json::Array(Strings); + return toJSONStrings(JOS, std::move(Strings)); } // Technically, we don't need to sort the dependency list to get determinism. // Leaving these be will simply preserve the import order. -static llvm::json::Array toJSONSorted(std::vector V) { +static auto toJSONSorted(llvm::json::OStream &JOS, std::vector V) { llvm::sort(V); - - llvm::json::Array Ret; - for (const ModuleID &MID : V) - Ret.push_back(llvm::json::Object( - {{"module-name", MID.ModuleName}, {"context-hash", MID.ContextHash}})); - return Ret; + return [&JOS, V = std::move(V)] { + for (const ModuleID &MID : V) + JOS.object([&] { + JOS.attribute("context-hash", StringRef(MID.ContextHash)); + JOS.attribute("module-name", StringRef(MID.ModuleName)); + }); + }; } -static llvm::json::Array -toJSONSorted(llvm::SmallVector &LinkLibs) { - llvm::sort(LinkLibs, [](const Module::LinkLibrary &lhs, - const Module::LinkLibrary &rhs) { - return lhs.Library < rhs.Library; +static auto toJSONSorted(llvm::json::OStream &JOS, + SmallVector LinkLibs) { + llvm::sort(LinkLibs, [](const auto &LHS, const auto &RHS) { + return LHS.Library < RHS.Library; }); - - llvm::json::Array Ret; - for (const Module::LinkLibrary &LL : LinkLibs) - Ret.push_back(llvm::json::Object( - {{"link-name", LL.Library}, {"isFramework", LL.IsFramework}})); - return Ret; + return [&JOS, LinkLibs = std::move(LinkLibs)] { + for (const auto &LL : LinkLibs) + JOS.object([&] { + JOS.attribute("isFramework", LL.IsFramework); + JOS.attribute("link-name", StringRef(LL.Library)); + }); + }; } // Thread safe. 
@@ -450,58 +458,65 @@ class FullDeps { ModuleIDs.push_back(M.first); llvm::sort(ModuleIDs); - using namespace llvm::json; - - Array OutModules; - for (auto &&ModID : ModuleIDs) { - auto &MD = Modules[ModID]; - Object O{{"name", MD.ID.ModuleName}, - {"context-hash", MD.ID.ContextHash}, - {"file-deps", toJSONSorted(MD.FileDeps)}, - {"clang-module-deps", toJSONSorted(MD.ClangModuleDeps)}, - {"clang-modulemap-file", MD.ClangModuleMapFile}, - {"command-line", MD.getBuildArguments()}, - {"link-libraries", toJSONSorted(MD.LinkLibraries)}}; - OutModules.push_back(std::move(O)); - } - - Array TUs; - for (auto &&I : Inputs) { - Array Commands; - if (I.DriverCommandLine.empty()) { - for (const auto &Cmd : I.Commands) { - Object O{ - {"input-file", I.FileName}, - {"clang-context-hash", I.ContextHash}, - {"file-deps", I.FileDeps}, - {"clang-module-deps", toJSONSorted(I.ModuleDeps)}, - {"executable", Cmd.Executable}, - {"command-line", Cmd.Arguments}, - }; - Commands.push_back(std::move(O)); + llvm::json::OStream JOS(OS, /*IndentSize=*/2); + + JOS.object([&] { + JOS.attributeArray("modules", [&] { + for (auto &&ModID : ModuleIDs) { + auto &MD = Modules[ModID]; + JOS.object([&] { + JOS.attributeArray("clang-module-deps", + toJSONSorted(JOS, MD.ClangModuleDeps)); + JOS.attribute("clang-modulemap-file", + StringRef(MD.ClangModuleMapFile)); + JOS.attributeArray("command-line", + toJSONStrings(JOS, MD.getBuildArguments())); + JOS.attribute("context-hash", StringRef(MD.ID.ContextHash)); + JOS.attributeArray("file-deps", toJSONSorted(JOS, MD.FileDeps)); + JOS.attributeArray("link-libraries", + toJSONSorted(JOS, MD.LinkLibraries)); + JOS.attribute("name", StringRef(MD.ID.ModuleName)); + }); } - } else { - Object O{ - {"input-file", I.FileName}, - {"clang-context-hash", I.ContextHash}, - {"file-deps", I.FileDeps}, - {"clang-module-deps", toJSONSorted(I.ModuleDeps)}, - {"executable", "clang"}, - {"command-line", I.DriverCommandLine}, - }; - Commands.push_back(std::move(O)); - } - TUs.push_back(Object{ - {"commands", std::move(Commands)}, }); - } - - Object Output{ - {"modules", std::move(OutModules)}, - {"translation-units", std::move(TUs)}, - }; - OS << llvm::formatv("{0:2}\n", Value(std::move(Output))); + JOS.attributeArray("translation-units", [&] { + for (auto &&I : Inputs) { + JOS.object([&] { + JOS.attributeArray("commands", [&] { + if (I.DriverCommandLine.empty()) { + for (const auto &Cmd : I.Commands) { + JOS.object([&] { + JOS.attribute("clang-context-hash", + StringRef(I.ContextHash)); + JOS.attributeArray("clang-module-deps", + toJSONSorted(JOS, I.ModuleDeps)); + JOS.attributeArray("command-line", + toJSONStrings(JOS, Cmd.Arguments)); + JOS.attribute("executable", StringRef(Cmd.Executable)); + JOS.attributeArray("file-deps", + toJSONStrings(JOS, I.FileDeps)); + JOS.attribute("input-file", StringRef(I.FileName)); + }); + } + } else { + JOS.object([&] { + JOS.attribute("clang-context-hash", StringRef(I.ContextHash)); + JOS.attributeArray("clang-module-deps", + toJSONSorted(JOS, I.ModuleDeps)); + JOS.attributeArray("command-line", + toJSONStrings(JOS, I.DriverCommandLine)); + JOS.attribute("executable", "clang"); + JOS.attributeArray("file-deps", + toJSONStrings(JOS, I.FileDeps)); + JOS.attribute("input-file", StringRef(I.FileName)); + }); + } + }); + }); + } + }); + }); } private: From 1dff3309fd3c956fce9e6f60ff57a26f364733d1 Mon Sep 17 00:00:00 2001 From: Krystian Stasiowski Date: Wed, 9 Oct 2024 15:49:32 -0600 Subject: [PATCH 071/119] Revert "Reapply "[Clang][Sema] Refactor collection of multi-level 
template argument lists (#106585)" (#111173)" (#111766) This reverts commit 4da8ac34f76e707ab94380b94f616457cfd2cb83. --- clang/docs/ReleaseNotes.rst | 3 - clang/include/clang/AST/DeclTemplate.h | 66 +- clang/include/clang/Sema/Sema.h | 25 +- clang/lib/AST/Decl.cpp | 49 +- clang/lib/AST/DeclCXX.cpp | 20 +- clang/lib/AST/DeclTemplate.cpp | 30 +- clang/lib/Sema/SemaConcept.cpp | 29 +- clang/lib/Sema/SemaDecl.cpp | 31 +- clang/lib/Sema/SemaDeclCXX.cpp | 4 +- clang/lib/Sema/SemaTemplate.cpp | 179 +++-- clang/lib/Sema/SemaTemplateDeduction.cpp | 33 +- clang/lib/Sema/SemaTemplateDeductionGuide.cpp | 45 +- clang/lib/Sema/SemaTemplateInstantiate.cpp | 752 +++++++++--------- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 46 +- clang/lib/Serialization/ASTReader.cpp | 3 +- clang/lib/Serialization/ASTReaderDecl.cpp | 18 +- clang/lib/Serialization/ASTWriterDecl.cpp | 17 +- .../temp/temp.constr/temp.constr.decl/p4.cpp | 175 ---- .../CXX/temp/temp.spec/temp.expl.spec/p7.cpp | 178 ----- clang/test/Modules/cxx-templates.cpp | 4 +- 20 files changed, 702 insertions(+), 1005 deletions(-) delete mode 100644 clang/test/CXX/temp/temp.constr/temp.constr.decl/p4.cpp delete mode 100644 clang/test/CXX/temp/temp.spec/temp.expl.spec/p7.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 29b9fe07f545f..16e6a230ef428 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -477,9 +477,6 @@ Bug Fixes to C++ Support in certain friend declarations. (#GH93099) - Clang now instantiates the correct lambda call operator when a lambda's class type is merged across modules. (#GH110401) -- Clang now uses the correct set of template argument lists when comparing the constraints of - out-of-line definitions and member templates explicitly specialized for a given implicit instantiation of - a class template. (#GH102320) - Fix a crash when parsing a pseudo destructor involving an invalid type. (#GH111460) Bug Fixes to AST Handling diff --git a/clang/include/clang/AST/DeclTemplate.h b/clang/include/clang/AST/DeclTemplate.h index 05739f39d2a49..687715a22e9fd 100644 --- a/clang/include/clang/AST/DeclTemplate.h +++ b/clang/include/clang/AST/DeclTemplate.h @@ -781,11 +781,15 @@ class RedeclarableTemplateDecl : public TemplateDecl, EntryType *Entry, void *InsertPos); struct CommonBase { - CommonBase() {} + CommonBase() : InstantiatedFromMember(nullptr, false) {} /// The template from which this was most /// directly instantiated (or null). - RedeclarableTemplateDecl *InstantiatedFromMember = nullptr; + /// + /// The boolean value indicates whether this template + /// was explicitly specialized. + llvm::PointerIntPair + InstantiatedFromMember; /// If non-null, points to an array of specializations (including /// partial specializations) known only by their external declaration IDs. @@ -805,19 +809,14 @@ class RedeclarableTemplateDecl : public TemplateDecl, }; /// Pointer to the common data shared by all declarations of this - /// template, and a flag indicating if the template is a member - /// specialization. - mutable llvm::PointerIntPair Common; - - CommonBase *getCommonPtrInternal() const { return Common.getPointer(); } + /// template. + mutable CommonBase *Common = nullptr; /// Retrieves the "common" pointer shared by all (re-)declarations of /// the same template. Calling this routine may implicitly allocate memory /// for the common pointer. 
CommonBase *getCommonPtr() const; - void setCommonPtr(CommonBase *C) const { Common.setPointer(C); } - virtual CommonBase *newCommon(ASTContext &C) const = 0; // Construct a template decl with name, parameters, and templated element. @@ -858,12 +857,15 @@ class RedeclarableTemplateDecl : public TemplateDecl, /// template<> template /// struct X::Inner { /* ... */ }; /// \endcode - bool isMemberSpecialization() const { return Common.getInt(); } + bool isMemberSpecialization() const { + return getCommonPtr()->InstantiatedFromMember.getInt(); + } /// Note that this member template is a specialization. void setMemberSpecialization() { - assert(!isMemberSpecialization() && "already a member specialization"); - Common.setInt(true); + assert(getCommonPtr()->InstantiatedFromMember.getPointer() && + "Only member templates can be member template specializations"); + getCommonPtr()->InstantiatedFromMember.setInt(true); } /// Retrieve the member template from which this template was @@ -903,12 +905,12 @@ class RedeclarableTemplateDecl : public TemplateDecl, /// void X::f(T, U); /// \endcode RedeclarableTemplateDecl *getInstantiatedFromMemberTemplate() const { - return getCommonPtr()->InstantiatedFromMember; + return getCommonPtr()->InstantiatedFromMember.getPointer(); } void setInstantiatedFromMemberTemplate(RedeclarableTemplateDecl *TD) { - assert(!getCommonPtr()->InstantiatedFromMember); - getCommonPtr()->InstantiatedFromMember = TD; + assert(!getCommonPtr()->InstantiatedFromMember.getPointer()); + getCommonPtr()->InstantiatedFromMember.setPointer(TD); } /// Retrieve the "injected" template arguments that correspond to the @@ -1987,8 +1989,6 @@ class ClassTemplateSpecializationDecl : public CXXRecordDecl, /// template arguments have been deduced. void setInstantiationOf(ClassTemplatePartialSpecializationDecl *PartialSpec, const TemplateArgumentList *TemplateArgs) { - assert(!isa(this) && - "A partial specialization cannot be instantiated from a template"); assert(!SpecializedTemplate.is() && "Already set to a class template partial specialization!"); auto *PS = new (getASTContext()) SpecializedPartialSpecialization(); @@ -2000,8 +2000,6 @@ class ClassTemplateSpecializationDecl : public CXXRecordDecl, /// Note that this class template specialization is an instantiation /// of the given class template. void setInstantiationOf(ClassTemplateDecl *TemplDecl) { - assert(!isa(this) && - "A partial specialization cannot be instantiated from a template"); assert(!SpecializedTemplate.is() && "Previously set to a class template partial specialization!"); SpecializedTemplate = TemplDecl; @@ -2189,11 +2187,18 @@ class ClassTemplatePartialSpecializationDecl /// struct X::Inner { /* ... */ }; /// \endcode bool isMemberSpecialization() const { - return InstantiatedFromMember.getInt(); + const auto *First = + cast(getFirstDecl()); + return First->InstantiatedFromMember.getInt(); } /// Note that this member template is a specialization. - void setMemberSpecialization() { return InstantiatedFromMember.setInt(true); } + void setMemberSpecialization() { + auto *First = cast(getFirstDecl()); + assert(First->InstantiatedFromMember.getPointer() && + "Only member templates can be member template specializations"); + return First->InstantiatedFromMember.setInt(true); + } /// Retrieves the injected specialization type for this partial /// specialization. 
This is not the same as the type-decl-type for @@ -2263,6 +2268,10 @@ class ClassTemplateDecl : public RedeclarableTemplateDecl { return static_cast(RedeclarableTemplateDecl::getCommonPtr()); } + void setCommonPtr(Common *C) { + RedeclarableTemplateDecl::Common = C; + } + public: friend class ASTDeclReader; @@ -2745,8 +2754,6 @@ class VarTemplateSpecializationDecl : public VarDecl, /// template arguments have been deduced. void setInstantiationOf(VarTemplatePartialSpecializationDecl *PartialSpec, const TemplateArgumentList *TemplateArgs) { - assert(!isa(this) && - "A partial specialization cannot be instantiated from a template"); assert(!SpecializedTemplate.is() && "Already set to a variable template partial specialization!"); auto *PS = new (getASTContext()) SpecializedPartialSpecialization(); @@ -2758,8 +2765,6 @@ class VarTemplateSpecializationDecl : public VarDecl, /// Note that this variable template specialization is an instantiation /// of the given variable template. void setInstantiationOf(VarTemplateDecl *TemplDecl) { - assert(!isa(this) && - "A partial specialization cannot be instantiated from a template"); assert(!SpecializedTemplate.is() && "Previously set to a variable template partial specialization!"); SpecializedTemplate = TemplDecl; @@ -2944,11 +2949,18 @@ class VarTemplatePartialSpecializationDecl /// U* X::Inner = (T*)(0) + 1; /// \endcode bool isMemberSpecialization() const { - return InstantiatedFromMember.getInt(); + const auto *First = + cast(getFirstDecl()); + return First->InstantiatedFromMember.getInt(); } /// Note that this member template is a specialization. - void setMemberSpecialization() { return InstantiatedFromMember.setInt(true); } + void setMemberSpecialization() { + auto *First = cast(getFirstDecl()); + assert(First->InstantiatedFromMember.getPointer() && + "Only member templates can be member template specializations"); + return First->InstantiatedFromMember.setInt(true); + } SourceRange getSourceRange() const override LLVM_READONLY; diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 67a6dbeb520a8..51f38a90f3ea0 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -11325,9 +11325,9 @@ class Sema final : public SemaBase { CXXScopeSpec &SS, IdentifierInfo *Name, SourceLocation NameLoc, const ParsedAttributesView &Attr, TemplateParameterList *TemplateParams, AccessSpecifier AS, SourceLocation ModulePrivateLoc, - SourceLocation FriendLoc, - ArrayRef OuterTemplateParamLists, - bool IsMemberSpecialization, SkipBodyInfo *SkipBody = nullptr); + SourceLocation FriendLoc, unsigned NumOuterTemplateParamLists, + TemplateParameterList **OuterTemplateParamLists, + SkipBodyInfo *SkipBody = nullptr); /// Translates template arguments as provided by the parser /// into template arguments used by semantic analysis. @@ -11366,8 +11366,7 @@ class Sema final : public SemaBase { DeclResult ActOnVarTemplateSpecialization( Scope *S, Declarator &D, TypeSourceInfo *DI, LookupResult &Previous, SourceLocation TemplateKWLoc, TemplateParameterList *TemplateParams, - StorageClass SC, bool IsPartialSpecialization, - bool IsMemberSpecialization); + StorageClass SC, bool IsPartialSpecialization); /// Get the specialization of the given variable template corresponding to /// the specified argument list, or a null-but-valid result if the arguments @@ -13008,14 +13007,28 @@ class Sema final : public SemaBase { /// dealing with a specialization. This is only relevant for function /// template specializations. 
/// + /// \param Pattern If non-NULL, indicates the pattern from which we will be + /// instantiating the definition of the given declaration, \p ND. This is + /// used to determine the proper set of template instantiation arguments for + /// friend function template specializations. + /// /// \param ForConstraintInstantiation when collecting arguments, /// ForConstraintInstantiation indicates we should continue looking when /// encountering a lambda generic call operator, and continue looking for /// arguments on an enclosing class template. + /// + /// \param SkipForSpecialization when specified, any template specializations + /// in a traversal would be ignored. + /// \param ForDefaultArgumentSubstitution indicates we should continue looking + /// when encountering a specialized member function template, rather than + /// returning immediately. MultiLevelTemplateArgumentList getTemplateInstantiationArgs( const NamedDecl *D, const DeclContext *DC = nullptr, bool Final = false, std::optional> Innermost = std::nullopt, - bool RelativeToPrimary = false, bool ForConstraintInstantiation = false); + bool RelativeToPrimary = false, const FunctionDecl *Pattern = nullptr, + bool ForConstraintInstantiation = false, + bool SkipForSpecialization = false, + bool ForDefaultArgumentSubstitution = false); /// RAII object to handle the state changes required to synthesize /// a function body. diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 58d11a0312c50..84ef9f74582ef 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -2696,27 +2696,21 @@ VarDecl *VarDecl::getTemplateInstantiationPattern() const { if (isTemplateInstantiation(VDTemplSpec->getTemplateSpecializationKind())) { auto From = VDTemplSpec->getInstantiatedFrom(); if (auto *VTD = From.dyn_cast()) { - while (true) { - VTD = VTD->getMostRecentDecl(); - if (VTD->isMemberSpecialization()) - break; - if (auto *NewVTD = VTD->getInstantiatedFromMemberTemplate()) - VTD = NewVTD; - else + while (!VTD->isMemberSpecialization()) { + auto *NewVTD = VTD->getInstantiatedFromMemberTemplate(); + if (!NewVTD) break; + VTD = NewVTD; } return getDefinitionOrSelf(VTD->getTemplatedDecl()); } if (auto *VTPSD = From.dyn_cast()) { - while (true) { - VTPSD = VTPSD->getMostRecentDecl(); - if (VTPSD->isMemberSpecialization()) - break; - if (auto *NewVTPSD = VTPSD->getInstantiatedFromMember()) - VTPSD = NewVTPSD; - else + while (!VTPSD->isMemberSpecialization()) { + auto *NewVTPSD = VTPSD->getInstantiatedFromMember(); + if (!NewVTPSD) break; + VTPSD = NewVTPSD; } return getDefinitionOrSelf(VTPSD); } @@ -2725,17 +2719,15 @@ VarDecl *VarDecl::getTemplateInstantiationPattern() const { // If this is the pattern of a variable template, find where it was // instantiated from. FIXME: Is this necessary? 
- if (VarTemplateDecl *VTD = VD->getDescribedVarTemplate()) { - while (true) { - VTD = VTD->getMostRecentDecl(); - if (VTD->isMemberSpecialization()) - break; - if (auto *NewVTD = VTD->getInstantiatedFromMemberTemplate()) - VTD = NewVTD; - else + if (VarTemplateDecl *VarTemplate = VD->getDescribedVarTemplate()) { + while (!VarTemplate->isMemberSpecialization()) { + auto *NewVT = VarTemplate->getInstantiatedFromMemberTemplate(); + if (!NewVT) break; + VarTemplate = NewVT; } - return getDefinitionOrSelf(VTD->getTemplatedDecl()); + + return getDefinitionOrSelf(VarTemplate->getTemplatedDecl()); } if (VD == this) @@ -4150,14 +4142,11 @@ FunctionDecl::getTemplateInstantiationPattern(bool ForDefinition) const { if (FunctionTemplateDecl *Primary = getPrimaryTemplate()) { // If we hit a point where the user provided a specialization of this // template, we're done looking. - while (true) { - Primary = Primary->getMostRecentDecl(); - if (ForDefinition && Primary->isMemberSpecialization()) - break; - if (auto *NewPrimary = Primary->getInstantiatedFromMemberTemplate()) - Primary = NewPrimary; - else + while (!ForDefinition || !Primary->isMemberSpecialization()) { + auto *NewPrimary = Primary->getInstantiatedFromMemberTemplate(); + if (!NewPrimary) break; + Primary = NewPrimary; } return getDefinitionOrSelf(Primary->getTemplatedDecl()); diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp index cfc7a9a218f25..1364ccc745ba0 100644 --- a/clang/lib/AST/DeclCXX.cpp +++ b/clang/lib/AST/DeclCXX.cpp @@ -2023,27 +2023,19 @@ const CXXRecordDecl *CXXRecordDecl::getTemplateInstantiationPattern() const { if (auto *TD = dyn_cast(this)) { auto From = TD->getInstantiatedFrom(); if (auto *CTD = From.dyn_cast()) { - while (true) { - CTD = CTD->getMostRecentDecl(); - if (CTD->isMemberSpecialization()) - break; - if (auto *NewCTD = CTD->getInstantiatedFromMemberTemplate()) - CTD = NewCTD; - else + while (auto *NewCTD = CTD->getInstantiatedFromMemberTemplate()) { + if (NewCTD->isMemberSpecialization()) break; + CTD = NewCTD; } return GetDefinitionOrSelf(CTD->getTemplatedDecl()); } if (auto *CTPSD = From.dyn_cast()) { - while (true) { - CTPSD = CTPSD->getMostRecentDecl(); - if (CTPSD->isMemberSpecialization()) - break; - if (auto *NewCTPSD = CTPSD->getInstantiatedFromMemberTemplate()) - CTPSD = NewCTPSD; - else + while (auto *NewCTPSD = CTPSD->getInstantiatedFromMember()) { + if (NewCTPSD->isMemberSpecialization()) break; + CTPSD = NewCTPSD; } return GetDefinitionOrSelf(CTPSD); } diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp index d9b67b7bedf5a..6fe817c5ef1c6 100644 --- a/clang/lib/AST/DeclTemplate.cpp +++ b/clang/lib/AST/DeclTemplate.cpp @@ -309,16 +309,16 @@ bool TemplateDecl::isTypeAlias() const { void RedeclarableTemplateDecl::anchor() {} RedeclarableTemplateDecl::CommonBase *RedeclarableTemplateDecl::getCommonPtr() const { - if (CommonBase *C = getCommonPtrInternal()) - return C; + if (Common) + return Common; // Walk the previous-declaration chain until we either find a declaration // with a common pointer or we run out of previous declarations. SmallVector PrevDecls; for (const RedeclarableTemplateDecl *Prev = getPreviousDecl(); Prev; Prev = Prev->getPreviousDecl()) { - if (CommonBase *C = Prev->getCommonPtrInternal()) { - setCommonPtr(C); + if (Prev->Common) { + Common = Prev->Common; break; } @@ -326,18 +326,18 @@ RedeclarableTemplateDecl::CommonBase *RedeclarableTemplateDecl::getCommonPtr() c } // If we never found a common pointer, allocate one now. 
- if (!getCommonPtrInternal()) { + if (!Common) { // FIXME: If any of the declarations is from an AST file, we probably // need an update record to add the common data. - setCommonPtr(newCommon(getASTContext())); + Common = newCommon(getASTContext()); } // Update any previous declarations we saw with the common pointer. for (const RedeclarableTemplateDecl *Prev : PrevDecls) - Prev->setCommonPtr(getCommonPtrInternal()); + Prev->Common = Common; - return getCommonPtrInternal(); + return Common; } void RedeclarableTemplateDecl::loadLazySpecializationsImpl() const { @@ -463,17 +463,19 @@ void FunctionTemplateDecl::addSpecialization( } void FunctionTemplateDecl::mergePrevDecl(FunctionTemplateDecl *Prev) { + using Base = RedeclarableTemplateDecl; + // If we haven't created a common pointer yet, then it can just be created // with the usual method. - if (!getCommonPtrInternal()) + if (!Base::Common) return; - Common *ThisCommon = static_cast(getCommonPtrInternal()); + Common *ThisCommon = static_cast(Base::Common); Common *PrevCommon = nullptr; SmallVector PreviousDecls; for (; Prev; Prev = Prev->getPreviousDecl()) { - if (CommonBase *C = Prev->getCommonPtrInternal()) { - PrevCommon = static_cast(C); + if (Prev->Base::Common) { + PrevCommon = static_cast(Prev->Base::Common); break; } PreviousDecls.push_back(Prev); @@ -483,7 +485,7 @@ void FunctionTemplateDecl::mergePrevDecl(FunctionTemplateDecl *Prev) { // use this common pointer. if (!PrevCommon) { for (auto *D : PreviousDecls) - D->setCommonPtr(ThisCommon); + D->Base::Common = ThisCommon; return; } @@ -491,7 +493,7 @@ void FunctionTemplateDecl::mergePrevDecl(FunctionTemplateDecl *Prev) { assert(ThisCommon->Specializations.size() == 0 && "Can't merge incompatible declarations!"); - setCommonPtr(PrevCommon); + Base::Common = PrevCommon; } //===----------------------------------------------------------------------===// diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index e36ee06221371..998a148a7d24a 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -585,8 +585,8 @@ static bool CheckConstraintSatisfaction( ArrayRef TemplateArgs = TemplateArgsLists.getNumSubstitutedLevels() > 0 - ? TemplateArgsLists.getInnermost() - : ArrayRef{}; + ? TemplateArgsLists.getOutermost() + : ArrayRef {}; Sema::InstantiatingTemplate Inst(S, TemplateIDRange.getBegin(), Sema::InstantiatingTemplate::ConstraintsCheck{}, const_cast(Template), TemplateArgs, TemplateIDRange); @@ -834,6 +834,7 @@ Sema::SetupConstraintCheckingTemplateArgumentsAndScope( getTemplateInstantiationArgs(FD, FD->getLexicalDeclContext(), /*Final=*/false, /*Innermost=*/std::nullopt, /*RelativeToPrimary=*/true, + /*Pattern=*/nullptr, /*ForConstraintInstantiation=*/true); if (SetupConstraintScope(FD, TemplateArgs, MLTAL, Scope)) return std::nullopt; @@ -909,13 +910,15 @@ bool Sema::CheckFunctionConstraints(const FunctionDecl *FD, // Figure out the to-translation-unit depth for this function declaration for // the purpose of seeing if they differ by constraints. This isn't the same as // getTemplateDepth, because it includes already instantiated parents. 
-static unsigned CalculateTemplateDepthForConstraints(Sema &S, - const NamedDecl *ND) { +static unsigned +CalculateTemplateDepthForConstraints(Sema &S, const NamedDecl *ND, + bool SkipForSpecialization = false) { MultiLevelTemplateArgumentList MLTAL = S.getTemplateInstantiationArgs( ND, ND->getLexicalDeclContext(), /*Final=*/false, /*Innermost=*/std::nullopt, /*RelativeToPrimary=*/true, - /*ForConstraintInstantiation=*/true); + /*Pattern=*/nullptr, + /*ForConstraintInstantiation=*/true, SkipForSpecialization); return MLTAL.getNumLevels(); } @@ -954,7 +957,8 @@ static const Expr *SubstituteConstraintExpressionWithoutSatisfaction( DeclInfo.getDecl(), DeclInfo.getLexicalDeclContext(), /*Final=*/false, /*Innermost=*/std::nullopt, /*RelativeToPrimary=*/true, - /*ForConstraintInstantiation=*/true); + /*Pattern=*/nullptr, /*ForConstraintInstantiation=*/true, + /*SkipForSpecialization*/ false); if (MLTAL.getNumSubstitutedLevels() == 0) return ConstrExpr; @@ -1064,16 +1068,16 @@ bool Sema::AreConstraintExpressionsEqual(const NamedDecl *Old, bool Sema::FriendConstraintsDependOnEnclosingTemplate(const FunctionDecl *FD) { assert(FD->getFriendObjectKind() && "Must be a friend!"); - FunctionTemplateDecl *FTD = FD->getDescribedFunctionTemplate(); // The logic for non-templates is handled in ASTContext::isSameEntity, so we // don't have to bother checking 'DependsOnEnclosingTemplate' for a // non-function-template. - assert(FTD && "Non-function templates don't need to be checked"); + assert(FD->getDescribedFunctionTemplate() && + "Non-function templates don't need to be checked"); SmallVector ACs; - FTD->getAssociatedConstraints(ACs); + FD->getDescribedFunctionTemplate()->getAssociatedConstraints(ACs); - unsigned OldTemplateDepth = FTD->getTemplateParameters()->getDepth(); + unsigned OldTemplateDepth = CalculateTemplateDepthForConstraints(*this, FD); for (const Expr *Constraint : ACs) if (ConstraintExpressionDependsOnEnclosingTemplate(FD, OldTemplateDepth, Constraint)) @@ -1520,6 +1524,7 @@ static bool substituteParameterMappings(Sema &S, NormalizedConstraint &N, CSE->getNamedConcept(), CSE->getNamedConcept()->getLexicalDeclContext(), /*Final=*/false, CSE->getTemplateArguments(), /*RelativeToPrimary=*/true, + /*Pattern=*/nullptr, /*ForConstraintInstantiation=*/true); return substituteParameterMappings(S, N, CSE->getNamedConcept(), MLTAL, @@ -1800,8 +1805,8 @@ bool Sema::IsAtLeastAsConstrained(NamedDecl *D1, return false; } - unsigned Depth1 = CalculateTemplateDepthForConstraints(*this, D1); - unsigned Depth2 = CalculateTemplateDepthForConstraints(*this, D2); + unsigned Depth1 = CalculateTemplateDepthForConstraints(*this, D1, true); + unsigned Depth2 = CalculateTemplateDepthForConstraints(*this, D2, true); for (size_t I = 0; I != AC1.size() && I != AC2.size(); ++I) { if (Depth2 > Depth1) { diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 072f43d360ee1..118873bc93ad4 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -4510,10 +4510,10 @@ void Sema::MergeVarDecl(VarDecl *New, LookupResult &Previous) { adjustDeclContextForDeclaratorDecl(New, Old); // Ensure the template parameters are compatible. 
- if (NewTemplate && !TemplateParameterListsAreEqual( - NewTemplate, NewTemplate->getTemplateParameters(), - OldTemplate, OldTemplate->getTemplateParameters(), - /*Complain=*/true, TPL_TemplateMatch)) + if (NewTemplate && + !TemplateParameterListsAreEqual(NewTemplate->getTemplateParameters(), + OldTemplate->getTemplateParameters(), + /*Complain=*/true, TPL_TemplateMatch)) return New->setInvalidDecl(); // C++ [class.mem]p1: @@ -7663,7 +7663,7 @@ NamedDecl *Sema::ActOnVariableDeclarator( : SourceLocation(); DeclResult Res = ActOnVarTemplateSpecialization( S, D, TInfo, Previous, TemplateKWLoc, TemplateParams, SC, - IsPartialSpecialization, IsMemberSpecialization); + IsPartialSpecialization); if (Res.isInvalid()) return nullptr; NewVD = cast(Res.get()); @@ -7682,10 +7682,6 @@ NamedDecl *Sema::ActOnVariableDeclarator( VarTemplateDecl::Create(Context, DC, D.getIdentifierLoc(), Name, TemplateParams, NewVD); NewVD->setDescribedVarTemplate(NewTemplate); - // If we are providing an explicit specialization of a static variable - // template, make a note of that. - if (IsMemberSpecialization) - NewTemplate->setMemberSpecialization(); } // If this decl has an auto type in need of deduction, make a note of the @@ -8063,6 +8059,12 @@ NamedDecl *Sema::ActOnVariableDeclarator( ? TPC_ClassTemplateMember : TPC_VarTemplate)) NewVD->setInvalidDecl(); + + // If we are providing an explicit specialization of a static variable + // template, make a note of that. + if (PrevVarTemplate && + PrevVarTemplate->getInstantiatedFromMemberTemplate()) + PrevVarTemplate->setMemberSpecialization(); } } @@ -9869,8 +9871,6 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC, NewFD); FunctionTemplate->setLexicalDeclContext(CurContext); NewFD->setDescribedFunctionTemplate(FunctionTemplate); - if (isMemberSpecialization) - FunctionTemplate->setMemberSpecialization(); // For source fidelity, store the other template param lists. if (TemplateParamLists.size() > 1) { @@ -12028,7 +12028,10 @@ bool Sema::CheckFunctionDeclaration(Scope *S, FunctionDecl *NewFD, // If this is an explicit specialization of a member that is a function // template, mark it as a member specialization. - if (IsMemberSpecialization) { + if (IsMemberSpecialization && + NewTemplateDecl->getInstantiatedFromMemberTemplate()) { + NewTemplateDecl->setMemberSpecialization(); + assert(OldTemplateDecl->isMemberSpecialization()); // Explicit specializations of a member template do not inherit deleted // status from the parent member template that they are specializing. if (OldFD->isDeleted()) { @@ -17090,8 +17093,8 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc, DeclResult Result = CheckClassTemplate( S, TagSpec, TUK, KWLoc, SS, Name, NameLoc, Attrs, TemplateParams, AS, ModulePrivateLoc, - /*FriendLoc*/ SourceLocation(), TemplateParameterLists.drop_back(), - isMemberSpecialization, SkipBody); + /*FriendLoc*/ SourceLocation(), TemplateParameterLists.size() - 1, + TemplateParameterLists.data(), SkipBody); return Result.get(); } else { // The "template<>" header is extraneous. 
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 75d82c12e0c1f..9cb2ed02a3f76 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -17416,8 +17416,8 @@ DeclResult Sema::ActOnTemplatedFriendTag( return CheckClassTemplate(S, TagSpec, TagUseKind::Friend, TagLoc, SS, Name, NameLoc, Attr, TemplateParams, AS_public, /*ModulePrivateLoc=*/SourceLocation(), - FriendLoc, TempParamLists.drop_back(), - IsMemberSpecialization) + FriendLoc, TempParamLists.size() - 1, + TempParamLists.data()) .get(); } else { // The "template<>" header is extraneous. diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index dfd56debc75e9..c7d48b81bc034 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -1795,9 +1795,8 @@ DeclResult Sema::CheckClassTemplate( CXXScopeSpec &SS, IdentifierInfo *Name, SourceLocation NameLoc, const ParsedAttributesView &Attr, TemplateParameterList *TemplateParams, AccessSpecifier AS, SourceLocation ModulePrivateLoc, - SourceLocation FriendLoc, - ArrayRef OuterTemplateParamLists, - bool IsMemberSpecialization, SkipBodyInfo *SkipBody) { + SourceLocation FriendLoc, unsigned NumOuterTemplateParamLists, + TemplateParameterList **OuterTemplateParamLists, SkipBodyInfo *SkipBody) { assert(TemplateParams && TemplateParams->size() > 0 && "No template parameters"); assert(TUK != TagUseKind::Reference && @@ -1985,6 +1984,19 @@ DeclResult Sema::CheckClassTemplate( } if (PrevClassTemplate) { + // Ensure that the template parameter lists are compatible. Skip this check + // for a friend in a dependent context: the template parameter list itself + // could be dependent. + if (!(TUK == TagUseKind::Friend && CurContext->isDependentContext()) && + !TemplateParameterListsAreEqual( + TemplateCompareNewDeclInfo(SemanticContext ? SemanticContext + : CurContext, + CurContext, KWLoc), + TemplateParams, PrevClassTemplate, + PrevClassTemplate->getTemplateParameters(), /*Complain=*/true, + TPL_TemplateMatch)) + return true; + // C++ [temp.class]p4: // In a redeclaration, partial specialization, explicit // specialization or explicit instantiation of a class template, @@ -1999,6 +2011,30 @@ DeclResult Sema::CheckClassTemplate( Diag(PrevRecordDecl->getLocation(), diag::note_previous_use); Kind = PrevRecordDecl->getTagKind(); } + + // Check for redefinition of this class template. + if (TUK == TagUseKind::Definition) { + if (TagDecl *Def = PrevRecordDecl->getDefinition()) { + // If we have a prior definition that is not visible, treat this as + // simply making that previous definition visible. + NamedDecl *Hidden = nullptr; + if (SkipBody && !hasVisibleDefinition(Def, &Hidden)) { + SkipBody->ShouldSkip = true; + SkipBody->Previous = Def; + auto *Tmpl = cast(Hidden)->getDescribedClassTemplate(); + assert(Tmpl && "original definition of a class template is not a " + "class template?"); + makeMergedDefinitionVisible(Hidden); + makeMergedDefinitionVisible(Tmpl); + } else { + Diag(NameLoc, diag::err_redefinition) << Name; + Diag(Def->getLocation(), diag::note_previous_definition); + // FIXME: Would it make sense to try to "forget" the previous + // definition, as part of error recovery? 
+ return true; + } + } + } } else if (PrevDecl) { // C++ [temp]p5: // A class template shall not have the same name as any other @@ -2010,6 +2046,23 @@ DeclResult Sema::CheckClassTemplate( return true; } + // Check the template parameter list of this declaration, possibly + // merging in the template parameter list from the previous class + // template declaration. Skip this check for a friend in a dependent + // context, because the template parameter list might be dependent. + if (!(TUK == TagUseKind::Friend && CurContext->isDependentContext()) && + CheckTemplateParameterList( + TemplateParams, + PrevClassTemplate ? GetTemplateParameterList(PrevClassTemplate) + : nullptr, + (SS.isSet() && SemanticContext && SemanticContext->isRecord() && + SemanticContext->isDependentContext()) + ? TPC_ClassTemplateMember + : TUK == TagUseKind::Friend ? TPC_FriendClassTemplate + : TPC_ClassTemplate, + SkipBody)) + Invalid = true; + if (SS.isSet()) { // If the name of the template was qualified, we must be defining the // template out-of-line. @@ -2036,8 +2089,10 @@ DeclResult Sema::CheckClassTemplate( PrevClassTemplate->getTemplatedDecl() : nullptr, /*DelayTypeCreation=*/true); SetNestedNameSpecifier(*this, NewClass, SS); - if (!OuterTemplateParamLists.empty()) - NewClass->setTemplateParameterListsInfo(Context, OuterTemplateParamLists); + if (NumOuterTemplateParamLists > 0) + NewClass->setTemplateParameterListsInfo( + Context, + llvm::ArrayRef(OuterTemplateParamLists, NumOuterTemplateParamLists)); // Add alignment attributes if necessary; these attributes are checked when // the ASTContext lays out the structure. @@ -2050,10 +2105,7 @@ DeclResult Sema::CheckClassTemplate( = ClassTemplateDecl::Create(Context, SemanticContext, NameLoc, DeclarationName(Name), TemplateParams, NewClass); - // If we are providing an explicit specialization of a member that is a - // class template, make a note of that. - if (IsMemberSpecialization) - NewTemplate->setMemberSpecialization(); + if (ShouldAddRedecl) NewTemplate->setPreviousDecl(PrevClassTemplate); @@ -2068,6 +2120,12 @@ DeclResult Sema::CheckClassTemplate( assert(T->isDependentType() && "Class template type is not dependent?"); (void)T; + // If we are providing an explicit specialization of a member that is a + // class template, make a note of that. + if (PrevClassTemplate && + PrevClassTemplate->getInstantiatedFromMemberTemplate()) + PrevClassTemplate->setMemberSpecialization(); + // Set the access specifier. if (!Invalid && TUK != TagUseKind::Friend && NewTemplate->getDeclContext()->isRecord()) @@ -2077,62 +2135,8 @@ DeclResult Sema::CheckClassTemplate( NewClass->setLexicalDeclContext(CurContext); NewTemplate->setLexicalDeclContext(CurContext); - // Ensure that the template parameter lists are compatible. Skip this check - // for a friend in a dependent context: the template parameter list itself - // could be dependent. - if (ShouldAddRedecl && PrevClassTemplate && - !TemplateParameterListsAreEqual( - NewTemplate, TemplateParams, PrevClassTemplate, - PrevClassTemplate->getTemplateParameters(), - /*Complain=*/true, TPL_TemplateMatch)) - return true; - - // Check the template parameter list of this declaration, possibly - // merging in the template parameter list from the previous class - // template declaration. Skip this check for a friend in a dependent - // context, because the template parameter list might be dependent. - if (ShouldAddRedecl && - CheckTemplateParameterList( - TemplateParams, - PrevClassTemplate ? 
PrevClassTemplate->getTemplateParameters() - : nullptr, - (SS.isSet() && SemanticContext && SemanticContext->isRecord() && - SemanticContext->isDependentContext()) - ? TPC_ClassTemplateMember - : TUK == TagUseKind::Friend ? TPC_FriendClassTemplate - : TPC_ClassTemplate, - SkipBody)) - Invalid = true; - - if (TUK == TagUseKind::Definition) { - if (PrevClassTemplate) { - // Check for redefinition of this class template. - if (TagDecl *Def = - PrevClassTemplate->getTemplatedDecl()->getDefinition()) { - // If we have a prior definition that is not visible, treat this as - // simply making that previous definition visible. - NamedDecl *Hidden = nullptr; - if (SkipBody && !hasVisibleDefinition(Def, &Hidden)) { - SkipBody->ShouldSkip = true; - SkipBody->Previous = Def; - auto *Tmpl = cast(Hidden)->getDescribedClassTemplate(); - assert(Tmpl && "original definition of a class template is not a " - "class template?"); - makeMergedDefinitionVisible(Hidden); - makeMergedDefinitionVisible(Tmpl); - } else { - Diag(NameLoc, diag::err_redefinition) << Name; - Diag(Def->getLocation(), diag::note_previous_definition); - // FIXME: Would it make sense to try to "forget" the previous - // definition, as part of error recovery? - return true; - } - } - } - - if (!SkipBody || !SkipBody->ShouldSkip) - NewClass->startDefinition(); - } + if (TUK == TagUseKind::Definition && (!SkipBody || !SkipBody->ShouldSkip)) + NewClass->startDefinition(); ProcessDeclAttributeList(S, NewClass, Attr); ProcessAPINotes(NewClass); @@ -4129,8 +4133,7 @@ void Sema::CheckDeductionGuideTemplate(FunctionTemplateDecl *TD) { DeclResult Sema::ActOnVarTemplateSpecialization( Scope *S, Declarator &D, TypeSourceInfo *DI, LookupResult &Previous, SourceLocation TemplateKWLoc, TemplateParameterList *TemplateParams, - StorageClass SC, bool IsPartialSpecialization, - bool IsMemberSpecialization) { + StorageClass SC, bool IsPartialSpecialization) { // D must be variable template id. assert(D.getName().getKind() == UnqualifiedIdKind::IK_TemplateId && "Variable template specialization is declared with a template id."); @@ -4248,16 +4251,17 @@ DeclResult Sema::ActOnVarTemplateSpecialization( Context, VarTemplate->getDeclContext(), TemplateKWLoc, TemplateNameLoc, TemplateParams, VarTemplate, DI->getType(), DI, SC, CanonicalConverted); - // If we are providing an explicit specialization of a member variable - // template specialization, make a note of that. - if (IsMemberSpecialization) - Partial->setMemberSpecialization(); Partial->setTemplateArgsAsWritten(TemplateArgs); if (!PrevPartial) VarTemplate->AddPartialSpecialization(Partial, InsertPos); Specialization = Partial; + // If we are providing an explicit specialization of a member variable + // template specialization, make a note of that. 
+ if (PrevPartial && PrevPartial->getInstantiatedFromMember()) + PrevPartial->setMemberSpecialization(); + CheckTemplatePartialSpecialization(Partial); } else { // Create a new class template specialization declaration node for @@ -5772,7 +5776,9 @@ bool Sema::CheckTemplateArgumentList( MultiLevelTemplateArgumentList MLTAL = getTemplateInstantiationArgs( Template, NewContext, /*Final=*/false, CanonicalConverted, - /*RelativeToPrimary=*/true, /*ForConceptInstantiation=*/true); + /*RelativeToPrimary=*/true, + /*Pattern=*/nullptr, + /*ForConceptInstantiation=*/true); if (EnsureTemplateArgumentListConstraints( Template, MLTAL, SourceRange(TemplateLoc, TemplateArgs.getRAngleLoc()))) { @@ -8461,12 +8467,15 @@ DeclResult Sema::ActOnClassTemplateSpecialization( Diag(TemplateNameLoc, diag::err_partial_spec_args_match_primary_template) << /*class template*/ 0 << (TUK == TagUseKind::Definition) << FixItHint::CreateRemoval(SourceRange(LAngleLoc, RAngleLoc)); - return CheckClassTemplate( - S, TagSpec, TUK, KWLoc, SS, ClassTemplate->getIdentifier(), - TemplateNameLoc, Attr, TemplateParams, AS_none, - /*ModulePrivateLoc=*/SourceLocation(), - /*FriendLoc*/ SourceLocation(), TemplateParameterLists.drop_back(), - isMemberSpecialization); + return CheckClassTemplate(S, TagSpec, TUK, KWLoc, SS, + ClassTemplate->getIdentifier(), + TemplateNameLoc, + Attr, + TemplateParams, + AS_none, /*ModulePrivateLoc=*/SourceLocation(), + /*FriendLoc*/SourceLocation(), + TemplateParameterLists.size() - 1, + TemplateParameterLists.data()); } // Create a new class template partial specialization declaration node. @@ -8476,11 +8485,6 @@ DeclResult Sema::ActOnClassTemplateSpecialization( ClassTemplatePartialSpecializationDecl::Create( Context, Kind, DC, KWLoc, TemplateNameLoc, TemplateParams, ClassTemplate, CanonicalConverted, CanonType, PrevPartial); - - // If we are providing an explicit specialization of a member class - // template specialization, make a note of that. - if (isMemberSpecialization) - Partial->setMemberSpecialization(); Partial->setTemplateArgsAsWritten(TemplateArgs); SetNestedNameSpecifier(*this, Partial, SS); if (TemplateParameterLists.size() > 1 && SS.isSet()) { @@ -8492,6 +8496,11 @@ DeclResult Sema::ActOnClassTemplateSpecialization( ClassTemplate->AddPartialSpecialization(Partial, InsertPos); Specialization = Partial; + // If we are providing an explicit specialization of a member class + // template specialization, make a note of that. 
+ if (PrevPartial && PrevPartial->getInstantiatedFromMember()) + PrevPartial->setMemberSpecialization(); + CheckTemplatePartialSpecialization(Partial); } else { // Create a new class template specialization declaration node for @@ -11290,8 +11299,8 @@ class ExplicitSpecializationVisibilityChecker { template void checkTemplate(TemplDecl *TD) { - if (TD->getMostRecentDecl()->isMemberSpecialization()) { - if (!CheckMemberSpecialization(TD->getMostRecentDecl())) + if (TD->isMemberSpecialization()) { + if (!CheckMemberSpecialization(TD)) diagnose(TD->getMostRecentDecl(), false); } } diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index aa62cfa7dcbd1..d106874c4c5bd 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -3138,6 +3138,20 @@ template<> struct IsPartialSpecialization { static constexpr bool value = true; }; +template +static bool DeducedArgsNeedReplacement(TemplateDeclT *Template) { + return false; +} +template <> +bool DeducedArgsNeedReplacement( + VarTemplatePartialSpecializationDecl *Spec) { + return !Spec->isClassScopeExplicitSpecialization(); +} +template <> +bool DeducedArgsNeedReplacement( + ClassTemplatePartialSpecializationDecl *Spec) { + return !Spec->isClassScopeExplicitSpecialization(); +} template static TemplateDeductionResult @@ -3148,10 +3162,23 @@ CheckDeducedArgumentConstraints(Sema &S, TemplateDeclT *Template, llvm::SmallVector AssociatedConstraints; Template->getAssociatedConstraints(AssociatedConstraints); + std::optional> Innermost; + // If we don't need to replace the deduced template arguments, + // we can add them immediately as the inner-most argument list. + if (!DeducedArgsNeedReplacement(Template)) + Innermost = CanonicalDeducedArgs; + MultiLevelTemplateArgumentList MLTAL = S.getTemplateInstantiationArgs( - Template, Template->getDeclContext(), /*Final=*/false, - /*Innermost=*/CanonicalDeducedArgs, /*RelativeToPrimary=*/true, - /*ForConstraintInstantiation=*/true); + Template, Template->getDeclContext(), /*Final=*/false, Innermost, + /*RelativeToPrimary=*/true, /*Pattern=*/ + nullptr, /*ForConstraintInstantiation=*/true); + + // getTemplateInstantiationArgs picks up the non-deduced version of the + // template args when this is a variable template partial specialization and + // not class-scope explicit specialization, so replace with Deduced Args + // instead of adding to inner-most. + if (!Innermost) + MLTAL.replaceInnermostTemplateArguments(Template, CanonicalDeducedArgs); if (S.CheckConstraintSatisfaction(Template, AssociatedConstraints, MLTAL, Info.getLocation(), diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp index ca93c840f0321..545da21183c3c 100644 --- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp +++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp @@ -765,7 +765,7 @@ buildAssociatedConstraints(Sema &SemaRef, FunctionTemplateDecl *F, } // Template arguments used to transform the template arguments in // DeducedResults. 
- SmallVector InnerArgsForBuildingRC( + SmallVector TemplateArgsForBuildingRC( F->getTemplateParameters()->size()); // Transform the transformed template args MultiLevelTemplateArgumentList Args; @@ -778,30 +778,33 @@ buildAssociatedConstraints(Sema &SemaRef, FunctionTemplateDecl *F, NamedDecl *TP = F->getTemplateParameters()->getParam(Index); MultiLevelTemplateArgumentList Args; Args.setKind(TemplateSubstitutionKind::Rewrite); - Args.addOuterTemplateArguments(InnerArgsForBuildingRC); + Args.addOuterTemplateArguments(TemplateArgsForBuildingRC); // Rebuild the template parameter with updated depth and index. NamedDecl *NewParam = transformTemplateParameter(SemaRef, F->getDeclContext(), TP, Args, /*NewIndex=*/FirstUndeducedParamIdx, getDepthAndIndex(TP).first + AdjustDepth); FirstUndeducedParamIdx += 1; - assert(InnerArgsForBuildingRC[Index].isNull()); - InnerArgsForBuildingRC[Index] = Context.getInjectedTemplateArg(NewParam); + assert(TemplateArgsForBuildingRC[Index].isNull()); + TemplateArgsForBuildingRC[Index] = + Context.getInjectedTemplateArg(NewParam); continue; } TemplateArgumentLoc Input = SemaRef.getTrivialTemplateArgumentLoc(D, QualType(), SourceLocation{}); TemplateArgumentLoc Output; if (!SemaRef.SubstTemplateArgument(Input, Args, Output)) { - assert(InnerArgsForBuildingRC[Index].isNull() && + assert(TemplateArgsForBuildingRC[Index].isNull() && "InstantiatedArgs must be null before setting"); - InnerArgsForBuildingRC[Index] = Output.getArgument(); + TemplateArgsForBuildingRC[Index] = Output.getArgument(); } } - // A list of template arguments for transforming the require-clause using - // the transformed template arguments as the template argument list of F. - // + // A list of template arguments for transforming the require-clause of F. + // It must contain the entire set of template argument lists. + MultiLevelTemplateArgumentList ArgsForBuildingRC; + ArgsForBuildingRC.setKind(clang::TemplateSubstitutionKind::Rewrite); + ArgsForBuildingRC.addOuterTemplateArguments(TemplateArgsForBuildingRC); // For 2), if the underlying deduction guide F is nested in a class template, // we need the entire template argument list, as the constraint AST in the // require-clause of F remains completely uninstantiated. @@ -824,15 +827,25 @@ buildAssociatedConstraints(Sema &SemaRef, FunctionTemplateDecl *F, // - The occurrence of U in the function parameter is [depth:0, index:0] // - The template parameter of U is [depth:0, index:0] // + // We add the outer template arguments which is [int] to the multi-level arg + // list to ensure that the occurrence U in `C` will be replaced with int + // during the substitution. + // // NOTE: The underlying deduction guide F is instantiated -- either from an // explicitly-written deduction guide member, or from a constructor. - MultiLevelTemplateArgumentList ArgsForBuildingRC = - SemaRef.getTemplateInstantiationArgs(F, F->getLexicalDeclContext(), - /*Final=*/false, - /*Innermost=*/InnerArgsForBuildingRC, - /*RelativeToPrimary=*/true, - /*ForConstraintInstantiation=*/true); - ArgsForBuildingRC.setKind(clang::TemplateSubstitutionKind::Rewrite); + // getInstantiatedFromMemberTemplate() can only handle the former case, so we + // check the DeclContext kind. 
+ if (F->getLexicalDeclContext()->getDeclKind() == + clang::Decl::ClassTemplateSpecialization) { + auto OuterLevelArgs = SemaRef.getTemplateInstantiationArgs( + F, F->getLexicalDeclContext(), + /*Final=*/false, /*Innermost=*/std::nullopt, + /*RelativeToPrimary=*/true, + /*Pattern=*/nullptr, + /*ForConstraintInstantiation=*/true); + for (auto It : OuterLevelArgs) + ArgsForBuildingRC.addOuterTemplateArguments(It.Args); + } ExprResult E = SemaRef.SubstExpr(RC, ArgsForBuildingRC); if (E.isInvalid()) diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index 5b5e50f668b25..7d42cf6b8cced 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -52,6 +52,38 @@ using namespace sema; //===----------------------------------------------------------------------===/ namespace { +namespace TemplateInstArgsHelpers { +struct Response { + const Decl *NextDecl = nullptr; + bool IsDone = false; + bool ClearRelativeToPrimary = true; + static Response Done() { + Response R; + R.IsDone = true; + return R; + } + static Response ChangeDecl(const Decl *ND) { + Response R; + R.NextDecl = ND; + return R; + } + static Response ChangeDecl(const DeclContext *Ctx) { + Response R; + R.NextDecl = Decl::castFromDeclContext(Ctx); + return R; + } + + static Response UseNextDecl(const Decl *CurDecl) { + return ChangeDecl(CurDecl->getDeclContext()); + } + + static Response DontClearRelativeToPrimaryNextDecl(const Decl *CurDecl) { + Response R = Response::UseNextDecl(CurDecl); + R.ClearRelativeToPrimary = false; + return R; + } +}; + // Retrieve the primary template for a lambda call operator. It's // unfortunate that we only have the mappings of call operators rather // than lambda classes. @@ -139,396 +171,374 @@ bool isLambdaEnclosedByTypeAliasDecl( .TraverseType(Underlying); } -struct TemplateInstantiationArgumentCollecter - : DeclVisitor { - Sema &S; - MultiLevelTemplateArgumentList &Result; - std::optional> Innermost; - bool RelativeToPrimary; - bool ForConstraintInstantiation; - - TemplateInstantiationArgumentCollecter( - Sema &S, MultiLevelTemplateArgumentList &Result, - std::optional> Innermost, - bool RelativeToPrimary, bool ForConstraintInstantiation) - : S(S), Result(Result), Innermost(Innermost), - RelativeToPrimary(RelativeToPrimary), - ForConstraintInstantiation(ForConstraintInstantiation) {} - - Decl *Done() { return nullptr; } - - Decl *ChangeDecl(const Decl *D) { - RelativeToPrimary = false; - return const_cast(D); - } - - Decl *ChangeDecl(const DeclContext *DC) { - return ChangeDecl(Decl::castFromDeclContext(DC)); - } - - Decl *UseNextDecl(const Decl *D) { return ChangeDecl(D->getDeclContext()); } - - void AddInnermostTemplateArguments(const Decl *D) { - assert(Innermost); - Result.addOuterTemplateArguments(const_cast(D), *Innermost, - /*Final=*/false); - Innermost.reset(); - } - - void AddOuterTemplateArguments(const Decl *D, ArrayRef Args, - bool Final) { - Result.addOuterTemplateArguments(const_cast(D), Args, Final); +// Add template arguments from a variable template instantiation. +Response +HandleVarTemplateSpec(const VarTemplateSpecializationDecl *VarTemplSpec, + MultiLevelTemplateArgumentList &Result, + bool SkipForSpecialization) { + // For a class-scope explicit specialization, there are no template arguments + // at this level, but there may be enclosing template arguments. 
+ if (VarTemplSpec->isClassScopeExplicitSpecialization()) + return Response::DontClearRelativeToPrimaryNextDecl(VarTemplSpec); + + // We're done when we hit an explicit specialization. + if (VarTemplSpec->getSpecializationKind() == TSK_ExplicitSpecialization && + !isa(VarTemplSpec)) + return Response::Done(); + + // If this variable template specialization was instantiated from a + // specialized member that is a variable template, we're done. + assert(VarTemplSpec->getSpecializedTemplate() && "No variable template?"); + llvm::PointerUnion + Specialized = VarTemplSpec->getSpecializedTemplateOrPartial(); + if (VarTemplatePartialSpecializationDecl *Partial = + Specialized.dyn_cast()) { + if (!SkipForSpecialization) + Result.addOuterTemplateArguments( + Partial, VarTemplSpec->getTemplateInstantiationArgs().asArray(), + /*Final=*/false); + if (Partial->isMemberSpecialization()) + return Response::Done(); + } else { + VarTemplateDecl *Tmpl = Specialized.get(); + if (!SkipForSpecialization) + Result.addOuterTemplateArguments( + Tmpl, VarTemplSpec->getTemplateInstantiationArgs().asArray(), + /*Final=*/false); + if (Tmpl->isMemberSpecialization()) + return Response::Done(); } + return Response::DontClearRelativeToPrimaryNextDecl(VarTemplSpec); +} - Decl *VisitTemplateTemplateParmDecl(TemplateTemplateParmDecl *TTPD) { - if (Innermost) - AddInnermostTemplateArguments(TTPD); - else if (ForConstraintInstantiation) - AddOuterTemplateArguments(nullptr, std::nullopt, /*Final=*/false); - - for (unsigned Depth = TTPD->getDepth() + 1; Depth--;) - AddOuterTemplateArguments(nullptr, std::nullopt, /*Final=*/false); - - return Done(); - } +// If we have a template template parameter with translation unit context, +// then we're performing substitution into a default template argument of +// this template template parameter before we've constructed the template +// that will own this template template parameter. In this case, we +// use empty template parameter lists for all of the outer templates +// to avoid performing any substitutions. +Response +HandleDefaultTempArgIntoTempTempParam(const TemplateTemplateParmDecl *TTP, + MultiLevelTemplateArgumentList &Result) { + for (unsigned I = 0, N = TTP->getDepth() + 1; I != N; ++I) + Result.addOuterTemplateArguments(std::nullopt); + return Response::Done(); +} - Decl *VisitFunctionTemplateDecl(FunctionTemplateDecl *FTD) { - assert( - (ForConstraintInstantiation || Result.getNumSubstitutedLevels() == 0) && - "outer template not instantiated?"); +Response HandlePartialClassTemplateSpec( + const ClassTemplatePartialSpecializationDecl *PartialClassTemplSpec, + MultiLevelTemplateArgumentList &Result, bool SkipForSpecialization) { + if (!SkipForSpecialization) + Result.addOuterRetainedLevels(PartialClassTemplSpec->getTemplateDepth()); + return Response::Done(); +} - if (Innermost) - AddInnermostTemplateArguments(FTD); - else if (ForConstraintInstantiation) - AddOuterTemplateArguments(FTD, FTD->getInjectedTemplateArgs(), - /*Final=*/false); +// Add template arguments from a class template instantiation. +Response +HandleClassTemplateSpec(const ClassTemplateSpecializationDecl *ClassTemplSpec, + MultiLevelTemplateArgumentList &Result, + bool SkipForSpecialization) { + if (!ClassTemplSpec->isClassScopeExplicitSpecialization()) { + // We're done when we hit an explicit specialization. 
+ if (ClassTemplSpec->getSpecializationKind() == TSK_ExplicitSpecialization && + !isa(ClassTemplSpec)) + return Response::Done(); - if (FTD->isMemberSpecialization()) - return Done(); + if (!SkipForSpecialization) + Result.addOuterTemplateArguments( + const_cast(ClassTemplSpec), + ClassTemplSpec->getTemplateInstantiationArgs().asArray(), + /*Final=*/false); - if (FTD->getFriendObjectKind()) - return ChangeDecl(FTD->getLexicalDeclContext()); - return UseNextDecl(FTD); + // If this class template specialization was instantiated from a + // specialized member that is a class template, we're done. + assert(ClassTemplSpec->getSpecializedTemplate() && "No class template?"); + if (ClassTemplSpec->getSpecializedTemplate()->isMemberSpecialization()) + return Response::Done(); + + // If this was instantiated from a partial template specialization, we need + // to get the next level of declaration context from the partial + // specialization, as the ClassTemplateSpecializationDecl's + // DeclContext/LexicalDeclContext will be for the primary template. + if (auto *InstFromPartialTempl = ClassTemplSpec->getSpecializedTemplateOrPartial() + .dyn_cast()) + return Response::ChangeDecl(InstFromPartialTempl->getLexicalDeclContext()); } + return Response::UseNextDecl(ClassTemplSpec); +} - Decl *VisitVarTemplateDecl(VarTemplateDecl *VTD) { - assert( - (ForConstraintInstantiation || Result.getNumSubstitutedLevels() == 0) && - "outer template not instantiated?"); - - if (Innermost) - AddInnermostTemplateArguments(VTD); - else if (ForConstraintInstantiation) - AddOuterTemplateArguments(VTD, VTD->getInjectedTemplateArgs(), - /*Final=*/false); - - if (VTD->isMemberSpecialization()) - return Done(); - - return UseNextDecl(VTD); - } +Response HandleFunction(Sema &SemaRef, const FunctionDecl *Function, + MultiLevelTemplateArgumentList &Result, + const FunctionDecl *Pattern, bool RelativeToPrimary, + bool ForConstraintInstantiation, + bool ForDefaultArgumentSubstitution) { + // Add template arguments from a function template specialization. + if (!RelativeToPrimary && + Function->getTemplateSpecializationKindForInstantiation() == + TSK_ExplicitSpecialization) + return Response::Done(); + + if (!RelativeToPrimary && + Function->getTemplateSpecializationKind() == TSK_ExplicitSpecialization) { + // This is an implicit instantiation of an explicit specialization. We + // don't get any template arguments from this function but might get + // some from an enclosing template. + return Response::UseNextDecl(Function); + } else if (const TemplateArgumentList *TemplateArgs = + Function->getTemplateSpecializationArgs()) { + // Add the template arguments for this specialization. + Result.addOuterTemplateArguments(const_cast(Function), + TemplateArgs->asArray(), + /*Final=*/false); - Decl *VisitVarTemplatePartialSpecializationDecl( - VarTemplatePartialSpecializationDecl *VTPSD) { + if (RelativeToPrimary && + (Function->getTemplateSpecializationKind() == + TSK_ExplicitSpecialization || + (Function->getFriendObjectKind() && + !Function->getPrimaryTemplate()->getFriendObjectKind()))) + return Response::UseNextDecl(Function); + + // If this function was instantiated from a specialized member that is + // a function template, we're done. + assert(Function->getPrimaryTemplate() && "No function template?"); + if (!ForDefaultArgumentSubstitution && + Function->getPrimaryTemplate()->isMemberSpecialization()) + return Response::Done(); + + // If this function is a generic lambda specialization, we are done. 
+    if (!ForConstraintInstantiation &&
+        isGenericLambdaCallOperatorOrStaticInvokerSpecialization(Function))
+      return Response::Done();
+
+  } else if (Function->getDescribedFunctionTemplate()) {
     assert(
         (ForConstraintInstantiation || Result.getNumSubstitutedLevels() == 0) &&
-        "outer template not instantiated?");
-
-    if (Innermost)
-      AddInnermostTemplateArguments(VTPSD);
-    else if (ForConstraintInstantiation)
-      AddOuterTemplateArguments(VTPSD, VTPSD->getTemplateArgs().asArray(),
-                                /*Final=*/false);
-
-    if (VTPSD->isMemberSpecialization())
-      return Done();
-
-    return UseNextDecl(VTPSD);
+        "Outer template not instantiated?");
   }
-
-  Decl *VisitClassTemplateDecl(ClassTemplateDecl *CTD) {
-    assert(
-        (ForConstraintInstantiation || Result.getNumSubstitutedLevels() == 0) &&
-        "outer template not instantiated?");
-
-    if (Innermost)
-      AddInnermostTemplateArguments(CTD);
-    else if (ForConstraintInstantiation)
-      AddOuterTemplateArguments(CTD, CTD->getInjectedTemplateArgs(),
-                                /*Final=*/false);
-
-    if (CTD->isMemberSpecialization())
-      return Done();
-
-    if (CTD->getFriendObjectKind())
-      return ChangeDecl(CTD->getLexicalDeclContext());
-    return UseNextDecl(CTD);
+  // If this is a friend or local declaration and it declares an entity at
+  // namespace scope, take arguments from its lexical parent
+  // instead of its semantic parent, unless of course the pattern we're
+  // instantiating actually comes from the file's context!
+  if ((Function->getFriendObjectKind() || Function->isLocalExternDecl()) &&
+      Function->getNonTransparentDeclContext()->isFileContext() &&
+      (!Pattern || !Pattern->getLexicalDeclContext()->isFileContext())) {
+    return Response::ChangeDecl(Function->getLexicalDeclContext());
   }
 
-  Decl *VisitClassTemplatePartialSpecializationDecl(
-      ClassTemplatePartialSpecializationDecl *CTPSD) {
-    assert(
-        (ForConstraintInstantiation || Result.getNumSubstitutedLevels() == 0) &&
-        "outer template not instantiated?");
-
-    if (Innermost)
-      AddInnermostTemplateArguments(CTPSD);
-    else if (ForConstraintInstantiation)
-      AddOuterTemplateArguments(CTPSD, CTPSD->getTemplateArgs().asArray(),
-                                /*Final=*/false);
+  if (ForConstraintInstantiation && Function->getFriendObjectKind())
+    return Response::ChangeDecl(Function->getLexicalDeclContext());
+  return Response::UseNextDecl(Function);
+}
 
-    if (CTPSD->isMemberSpecialization())
-      return Done();
+Response HandleFunctionTemplateDecl(const FunctionTemplateDecl *FTD,
+                                    MultiLevelTemplateArgumentList &Result) {
+  if (!isa<ClassTemplateSpecializationDecl>(FTD->getDeclContext())) {
+    Result.addOuterTemplateArguments(
+        const_cast<FunctionTemplateDecl *>(FTD),
+        const_cast<FunctionTemplateDecl *>(FTD)->getInjectedTemplateArgs(),
+        /*Final=*/false);
+
+    NestedNameSpecifier *NNS = FTD->getTemplatedDecl()->getQualifier();
+
+    while (const Type *Ty = NNS ? NNS->getAsType() : nullptr) {
+      if (NNS->isInstantiationDependent()) {
+        if (const auto *TSTy = Ty->getAs<TemplateSpecializationType>()) {
+          ArrayRef<TemplateArgument> Arguments = TSTy->template_arguments();
+          // Prefer template arguments from the injected-class-type if possible.
+          // For example,
+          // ```cpp
+          // template <class... Pack> struct S {
+          //   template <class T> void foo();
+          // };
+          // template <class... Pack> template <class T>
+          //           ^^^^^^^^^^^^^ InjectedTemplateArgs
+          //           They're of kind TemplateArgument::Pack, not of
+          //           TemplateArgument::Type.
+          // void S<Pack...>::foo() {}
+          //        ^^^^^^^
+          //        TSTy->template_arguments() (which are of PackExpansionType)
+          // ```
+          // This meets the contract in
+          // TreeTransform::TryExpandParameterPacks that the template arguments
+          // for unexpanded parameters should be of a Pack kind.
+          if (TSTy->isCurrentInstantiation()) {
+            auto *RD = TSTy->getCanonicalTypeInternal()->getAsCXXRecordDecl();
+            if (ClassTemplateDecl *CTD = RD->getDescribedClassTemplate())
+              Arguments = CTD->getInjectedTemplateArgs();
+            else if (auto *Specialization =
+                         dyn_cast<ClassTemplateSpecializationDecl>(RD))
+              Arguments =
+                  Specialization->getTemplateInstantiationArgs().asArray();
+          }
+          Result.addOuterTemplateArguments(
+              TSTy->getTemplateName().getAsTemplateDecl(), Arguments,
+              /*Final=*/false);
+        }
+      }
 
-    return UseNextDecl(CTPSD);
+      NNS = NNS->getPrefix();
+    }
   }
 
-  Decl *VisitTypeAliasTemplateDecl(TypeAliasTemplateDecl *TATD) {
-    assert(
-        (ForConstraintInstantiation || Result.getNumSubstitutedLevels() == 0) &&
-        "outer template not instantiated?");
-    if (Innermost)
-      AddInnermostTemplateArguments(TATD);
-    else if (ForConstraintInstantiation)
-      AddOuterTemplateArguments(TATD, TATD->getInjectedTemplateArgs(),
-                                /*Final=*/false);
-
-    return UseNextDecl(TATD);
-  }
+  return Response::ChangeDecl(FTD->getLexicalDeclContext());
+}
 
-  Decl *VisitConceptDecl(ConceptDecl *CD) {
+Response HandleRecordDecl(Sema &SemaRef, const CXXRecordDecl *Rec,
+                          MultiLevelTemplateArgumentList &Result,
+                          ASTContext &Context,
+                          bool ForConstraintInstantiation) {
+  if (ClassTemplateDecl *ClassTemplate = Rec->getDescribedClassTemplate()) {
     assert(
         (ForConstraintInstantiation || Result.getNumSubstitutedLevels() == 0) &&
-        "outer template not instantiated?");
-    if (Innermost)
-      AddInnermostTemplateArguments(CD);
-
-    return UseNextDecl(CD);
+        "Outer template not instantiated?");
+    if (ClassTemplate->isMemberSpecialization())
+      return Response::Done();
+    if (ForConstraintInstantiation)
+      Result.addOuterTemplateArguments(const_cast<CXXRecordDecl *>(Rec),
+                                       ClassTemplate->getInjectedTemplateArgs(),
+                                       /*Final=*/false);
   }
 
-  Decl *VisitFunctionDecl(FunctionDecl *FD) {
-    assert(!FD->getDescribedFunctionTemplate() &&
-           "not for templated declarations");
-
-    if (!RelativeToPrimary) {
-      // Add template arguments from a function template specialization.
-      if (const MemberSpecializationInfo *MSI =
-              FD->getMemberSpecializationInfo();
-          MSI &&
-          MSI->getTemplateSpecializationKind() == TSK_ExplicitSpecialization)
-        return Done();
-
-      // This is an implicit instantiation of an explicit specialization. We
-      // don't get any template arguments from this function but might get
-      // some from an enclosing template.
-      if (FD->getTemplateSpecializationKind() == TSK_ExplicitSpecialization)
-        return UseNextDecl(FD);
-    }
-
-    if (const TemplateArgumentList *TemplateArgs =
-            FD->getTemplateSpecializationArgs()) {
-      // Add the template arguments for this specialization.
-      if (Innermost)
-        AddInnermostTemplateArguments(FD);
-      else
-        AddOuterTemplateArguments(FD, TemplateArgs->asArray(), /*Final=*/false);
-
-      if (FD->getTemplateSpecializationKind() == TSK_ExplicitSpecialization ||
-          (FD->getFriendObjectKind() &&
-           !FD->getPrimaryTemplate()->getFriendObjectKind()))
-        return UseNextDecl(FD);
-
-      // If this function was instantiated from a specialized member that is
-      // a function template, we're done.
-      assert(FD->getPrimaryTemplate() && "No function template?");
-      if (FD->getPrimaryTemplate()->isMemberSpecialization())
-        return Done();
-
-      // If this function is a generic lambda specialization, we are done.
-      if (!ForConstraintInstantiation &&
-          isGenericLambdaCallOperatorOrStaticInvokerSpecialization(FD))
-        return Done();
-    }
-
-    // If this is a friend or local declaration and it declares an entity at
-    // namespace scope, take arguments from its lexical parent
-    // instead of its semantic parent, unless of course the pattern we're
-    // instantiating actually comes from the file's context!
-    if ((FD->getFriendObjectKind() || FD->isLocalExternDecl()) &&
-        FD->getNonTransparentDeclContext()->isFileContext()) {
-      return ChangeDecl(FD->getLexicalDeclContext());
-    }
-
-    if (ForConstraintInstantiation && FD->getFriendObjectKind())
-      return ChangeDecl(FD->getLexicalDeclContext());
-    return UseNextDecl(FD);
+  if (const MemberSpecializationInfo *MSInfo =
+          Rec->getMemberSpecializationInfo())
+    if (MSInfo->getTemplateSpecializationKind() == TSK_ExplicitSpecialization)
+      return Response::Done();
+
+  bool IsFriend = Rec->getFriendObjectKind() ||
+                  (Rec->getDescribedClassTemplate() &&
+                   Rec->getDescribedClassTemplate()->getFriendObjectKind());
+  if (ForConstraintInstantiation && IsFriend &&
+      Rec->getNonTransparentDeclContext()->isFileContext()) {
+    return Response::ChangeDecl(Rec->getLexicalDeclContext());
   }
 
-  Decl *VisitCXXRecordDecl(CXXRecordDecl *RD) {
-    assert(!RD->getDescribedClassTemplate() &&
-           "not for templated declarations");
-
-    if (const MemberSpecializationInfo *MSI = RD->getMemberSpecializationInfo();
-        MSI &&
-        MSI->getTemplateSpecializationKind() == TSK_ExplicitSpecialization)
-      return Done();
-
-    if (ForConstraintInstantiation && RD->getFriendObjectKind() &&
-        RD->getNonTransparentDeclContext()->isFileContext()) {
-      return ChangeDecl(RD->getLexicalDeclContext());
-    }
-
-    // This is to make sure we pick up the VarTemplateSpecializationDecl or the
-    // TypeAliasTemplateDecl that this lambda is defined inside of.
-    if (RD->isLambda()) {
-      if (Decl *LCD = RD->getLambdaContextDecl())
-        return ChangeDecl(LCD);
-      // Retrieve the template arguments for a using alias declaration.
-      // This is necessary for constraint checking, since we always keep
-      // constraints relative to the primary template.
-      if (auto TypeAlias = getEnclosingTypeAliasTemplateDecl(S);
-          ForConstraintInstantiation && TypeAlias) {
-        if (isLambdaEnclosedByTypeAliasDecl(RD->getLambdaCallOperator(),
-                                            TypeAlias.PrimaryTypeAliasDecl)) {
-          AddOuterTemplateArguments(TypeAlias.Template,
-                                    TypeAlias.AssociatedTemplateArguments,
-                                    /*Final=*/false);
-          // Visit the parent of the current type alias declaration rather than
-          // the lambda thereof.
-          // E.g., in the following example:
-          // struct S {
-          //   template <class> using T = decltype([] {} ());
-          // };
-          // void foo() {
-          //   S::T var;
-          // }
-          // The instantiated lambda expression (which we're visiting at 'var')
-          // has a function DeclContext 'foo' rather than the Record DeclContext
-          // S. This seems to be an oversight to me that we may want to set a
-          // Sema Context from the CXXScopeSpec before substituting into T.
-          return ChangeDecl(TypeAlias.Template->getDeclContext());
-        }
+  // This is to make sure we pick up the VarTemplateSpecializationDecl or the
+  // TypeAliasTemplateDecl that this lambda is defined inside of.
+  if (Rec->isLambda()) {
+    if (const Decl *LCD = Rec->getLambdaContextDecl())
+      return Response::ChangeDecl(LCD);
+    // Retrieve the template arguments for a using alias declaration.
+    // This is necessary for constraint checking, since we always keep
+    // constraints relative to the primary template.
+    if (auto TypeAlias = getEnclosingTypeAliasTemplateDecl(SemaRef);
+        ForConstraintInstantiation && TypeAlias) {
+      if (isLambdaEnclosedByTypeAliasDecl(Rec->getLambdaCallOperator(),
+                                          TypeAlias.PrimaryTypeAliasDecl)) {
+        Result.addOuterTemplateArguments(TypeAlias.Template,
+                                         TypeAlias.AssociatedTemplateArguments,
+                                         /*Final=*/false);
+        // Visit the parent of the current type alias declaration rather than
+        // the lambda thereof.
+        // E.g., in the following example:
+        // struct S {
+        //   template <class> using T = decltype([] {} ());
+        // };
+        // void foo() {
+        //   S::T var;
+        // }
+        // The instantiated lambda expression (which we're visiting at 'var')
+        // has a function DeclContext 'foo' rather than the Record DeclContext
+        // S. This seems to be an oversight to me that we may want to set a
+        // Sema Context from the CXXScopeSpec before substituting into T.
+        return Response::ChangeDecl(TypeAlias.Template->getDeclContext());
       }
     }
-
-    return UseNextDecl(RD);
-  }
-
-  Decl *
-  VisitClassTemplateSpecializationDecl(ClassTemplateSpecializationDecl *CTSD) {
-    // For a class-scope explicit specialization, there are no template
-    // arguments at this level, but there may be enclosing template arguments.
-    if (CTSD->isClassScopeExplicitSpecialization())
-      return UseNextDecl(CTSD);
-
-    // We're done when we hit an explicit specialization.
-    if (CTSD->getSpecializationKind() == TSK_ExplicitSpecialization)
-      return Done();
-
-    if (Innermost)
-      AddInnermostTemplateArguments(CTSD);
-    else
-      AddOuterTemplateArguments(CTSD,
-                                CTSD->getTemplateInstantiationArgs().asArray(),
-                                /*Final=*/false);
-
-    // If this class template specialization was instantiated from a
-    // specialized member that is a class template, we're done.
-    assert(CTSD->getSpecializedTemplate() && "No class template?");
-    llvm::PointerUnion<ClassTemplateDecl *, ClassTemplatePartialSpecializationDecl *>
-        Specialized = CTSD->getSpecializedTemplateOrPartial();
-    if (auto *CTPSD =
-            Specialized.dyn_cast<ClassTemplatePartialSpecializationDecl *>()) {
-      if (CTPSD->isMemberSpecialization())
-        return Done();
-    } else {
-      auto *CTD = Specialized.get<ClassTemplateDecl *>();
-      if (CTD->isMemberSpecialization())
-        return Done();
-    }
-    return UseNextDecl(CTSD);
-  }
-
-  Decl *
-  VisitVarTemplateSpecializationDecl(VarTemplateSpecializationDecl *VTSD) {
-    // For a class-scope explicit specialization, there are no template
-    // arguments at this level, but there may be enclosing template arguments.
-    if (VTSD->isClassScopeExplicitSpecialization())
-      return UseNextDecl(VTSD);
-
-    // We're done when we hit an explicit specialization.
-    if (VTSD->getSpecializationKind() == TSK_ExplicitSpecialization)
-      return Done();
-
-    if (Innermost)
-      AddInnermostTemplateArguments(VTSD);
-    else
-      AddOuterTemplateArguments(VTSD,
-                                VTSD->getTemplateInstantiationArgs().asArray(),
-                                /*Final=*/false);
-
-    // If this variable template specialization was instantiated from a
-    // specialized member that is a variable template, we're done.
-    assert(VTSD->getSpecializedTemplate() && "No variable template?");
-    llvm::PointerUnion<VarTemplateDecl *, VarTemplatePartialSpecializationDecl *>
-        Specialized = VTSD->getSpecializedTemplateOrPartial();
-    if (auto *VTPSD =
-            Specialized.dyn_cast<VarTemplatePartialSpecializationDecl *>()) {
-      if (VTPSD->isMemberSpecialization())
-        return Done();
-    } else {
-      auto *VTD = Specialized.get<VarTemplateDecl *>();
-      if (VTD->isMemberSpecialization())
-        return Done();
-    }
-    return UseNextDecl(VTSD);
-  }
-
-  Decl *VisitImplicitConceptSpecializationDecl(
-      ImplicitConceptSpecializationDecl *ICSD) {
-    AddOuterTemplateArguments(ICSD, ICSD->getTemplateArguments(),
-                              /*Final=*/false);
-    return UseNextDecl(ICSD);
   }
 
-  Decl *VisitDecl(Decl *D) {
-    if (D->isFileContextDecl())
-      return Done();
-
-    if (isa(D))
-      RelativeToPrimary = false;
-
-    return UseNextDecl(D);
-  }
+  return Response::UseNextDecl(Rec);
+}
 
-  Decl *Visit(Decl *D) {
-    if (TemplateDecl *TD = D->getDescribedTemplate())
-      D = TD;
-    return DeclVisitor::Visit(D);
-  }
-};
+Response HandleImplicitConceptSpecializationDecl(
+    const ImplicitConceptSpecializationDecl *CSD,
+    MultiLevelTemplateArgumentList &Result) {
+  Result.addOuterTemplateArguments(
+      const_cast<ImplicitConceptSpecializationDecl *>(CSD),
+      CSD->getTemplateArguments(),
+      /*Final=*/false);
+  return Response::UseNextDecl(CSD);
+}
+Response HandleGenericDeclContext(const Decl *CurDecl) {
+  return Response::UseNextDecl(CurDecl);
+}
+} // namespace TemplateInstArgsHelpers
 } // namespace
 
 MultiLevelTemplateArgumentList Sema::getTemplateInstantiationArgs(
     const NamedDecl *ND, const DeclContext *DC, bool Final,
     std::optional<ArrayRef<TemplateArgument>> Innermost, bool RelativeToPrimary,
-    bool ForConstraintInstantiation) {
+    const FunctionDecl *Pattern, bool ForConstraintInstantiation,
+    bool SkipForSpecialization, bool ForDefaultArgumentSubstitution) {
   assert((ND || DC) && "Can't find arguments for a decl if one isn't provided");
 
   // Accumulate the set of template argument lists in this structure.
   MultiLevelTemplateArgumentList Result;
+
+  using namespace TemplateInstArgsHelpers;
 
   const Decl *CurDecl = ND;
   if (!CurDecl)
     CurDecl = Decl::castFromDeclContext(DC);
 
-  TemplateInstantiationArgumentCollecter Collecter(
-      *this, Result, Innermost, RelativeToPrimary, ForConstraintInstantiation);
-  do {
-    CurDecl = Collecter.Visit(const_cast<Decl *>(CurDecl));
-  } while (CurDecl);
+  if (Innermost) {
+    Result.addOuterTemplateArguments(const_cast<NamedDecl *>(ND), *Innermost,
+                                     Final);
+    // Populate placeholder template arguments for TemplateTemplateParmDecls.
+    // This is essential for the case e.g.
+    //
+    //   template <class> concept Concept = false;
+    //   template