From 25917f5c68ecfff59fbc6db6f643a20aae7eb72f Mon Sep 17 00:00:00 2001
From: Peter Bell
Date: Sun, 10 Nov 2024 00:00:11 +0000
Subject: [PATCH] [InstCombine] Only fold extract element to trunc if vector
 hasOneUse

This fixes a missed optimization caused by the `foldBitcastExtElt` pattern
interfering with other combine patterns. In the case I was hitting, we have
IR that combines two vectors into a new larger vector by extracting elements
and inserting them into the new vector.

```llvm
define <4 x half> @bitcast_extract_insert_to_shuffle(i32 %a, i32 %b) {
  %avec = bitcast i32 %a to <2 x half>
  %a0 = extractelement <2 x half> %avec, i32 0
  %a1 = extractelement <2 x half> %avec, i32 1
  %bvec = bitcast i32 %b to <2 x half>
  %b0 = extractelement <2 x half> %bvec, i32 0
  %b1 = extractelement <2 x half> %bvec, i32 1
  %ins0 = insertelement <4 x half> undef, half %a0, i32 0
  %ins1 = insertelement <4 x half> %ins0, half %a1, i32 1
  %ins2 = insertelement <4 x half> %ins1, half %b0, i32 2
  %ins3 = insertelement <4 x half> %ins2, half %b1, i32 3
  ret <4 x half> %ins3
}
```

With the current behavior, `InstCombine` converts each vector extract
sequence to

```llvm
  %tmp = trunc i32 %a to i16
  %a0 = bitcast i16 %tmp to half
  %a1 = extractelement <2 x half> %avec, i32 1
```

where the extraction of `%a0` is now done by truncating the original
integer. While on its own this is fairly reasonable, here it also blocks the
pattern which converts `extractelement` + `insertelement` into shuffles,
which would give the overall simpler result:

```llvm
define <4 x half> @bitcast_extract_insert_to_shuffle(i32 %a, i32 %b) {
  %avec = bitcast i32 %a to <2 x half>
  %bvec = bitcast i32 %b to <2 x half>
  %ins3 = shufflevector <2 x half> %avec, <2 x half> %bvec, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x half> %ins3
}
```

In this PR I fix the conflict by obeying the `hasOneUse` check even when no
shift instruction is required. In those cases we can't remove the vector
completely anyway, so the pattern has less benefit.

Also, fwiw, I think dropping the `hasOneUse` check for the 0th element might
have been a mistake in the first place: looking at
535c5d56a7bc9966036a11362d8984983a4bf090, the commit message only mentions
loosening the `isDesirableIntType` requirement and doesn't mention changing
the `hasOneUse` check at all.
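
To make the intended behavior concrete, here is a hypothetical single-use
example (the function name and IR below are illustrative, not taken from the
tests in this patch) where the trunc fold is still expected to fire after
this change, because the bitcast vector has exactly one use and element 0
needs no shift on a little-endian target:

```llvm
; Sketch only: %avec has a single use, so foldBitcastExtElt can still
; rewrite the extract of element 0 as a trunc of the original integer.
define half @single_use_extract(i32 %a) {
  %avec = bitcast i32 %a to <2 x half>
  %a0 = extractelement <2 x half> %avec, i32 0
  ret half %a0
}
; expected to simplify to roughly:
;   %tmp = trunc i32 %a to i16
;   %a0 = bitcast i16 %tmp to half
;   ret half %a0
```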
---
 .../InstCombine/InstCombineVectorOps.cpp      |  6 +++---
 .../Transforms/InstCombine/extractelement.ll  | 18 ++++++------------
 .../InstCombine/vector_insertelt_shuffle.ll   | 22 ++++++++++++++++++++++
 3 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 454fe5a91d375..76878ce4ff88f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -205,9 +205,9 @@ Instruction *InstCombinerImpl::foldBitcastExtElt(ExtractElementInst &Ext) {
     if (IsBigEndian)
       ExtIndexC = NumElts.getKnownMinValue() - 1 - ExtIndexC;
     unsigned ShiftAmountC = ExtIndexC * DestWidth;
-    if (!ShiftAmountC ||
-        (isDesirableIntType(X->getType()->getPrimitiveSizeInBits()) &&
-         Ext.getVectorOperand()->hasOneUse())) {
+    if ((!ShiftAmountC ||
+         isDesirableIntType(X->getType()->getPrimitiveSizeInBits())) &&
+        Ext.getVectorOperand()->hasOneUse()) {
       if (ShiftAmountC)
         X = Builder.CreateLShr(X, ShiftAmountC, "extelt.offset");
       if (DestTy->isFloatingPointTy()) {
diff --git a/llvm/test/Transforms/InstCombine/extractelement.ll b/llvm/test/Transforms/InstCombine/extractelement.ll
index 28a4702559c46..2bd719e236137 100644
--- a/llvm/test/Transforms/InstCombine/extractelement.ll
+++ b/llvm/test/Transforms/InstCombine/extractelement.ll
@@ -722,20 +722,14 @@ define i8 @bitcast_scalar_index_variable(i32 %x, i64 %y) {
   ret i8 %r
 }
 
-; extra use is ok if we don't need a shift
+; extra use is not ok, even if we don't need a shift
 define i8 @bitcast_scalar_index0_use(i64 %x) {
-; ANYLE-LABEL: @bitcast_scalar_index0_use(
-; ANYLE-NEXT:    [[V:%.*]] = bitcast i64 [[X:%.*]] to <8 x i8>
-; ANYLE-NEXT:    call void @use(<8 x i8> [[V]])
-; ANYLE-NEXT:    [[R:%.*]] = trunc i64 [[X]] to i8
-; ANYLE-NEXT:    ret i8 [[R]]
-;
-; ANYBE-LABEL: @bitcast_scalar_index0_use(
-; ANYBE-NEXT:    [[V:%.*]] = bitcast i64 [[X:%.*]] to <8 x i8>
-; ANYBE-NEXT:    call void @use(<8 x i8> [[V]])
-; ANYBE-NEXT:    [[R:%.*]] = extractelement <8 x i8> [[V]], i64 0
-; ANYBE-NEXT:    ret i8 [[R]]
+; ANY-LABEL: @bitcast_scalar_index0_use(
+; ANY-NEXT:    [[V:%.*]] = bitcast i64 [[X:%.*]] to <8 x i8>
+; ANY-NEXT:    call void @use(<8 x i8> [[V]])
+; ANY-NEXT:    [[R:%.*]] = extractelement <8 x i8> [[V]], i64 0
+; ANY-NEXT:    ret i8 [[R]]
 ;
   %v = bitcast i64 %x to <8 x i8>
   call void @use(<8 x i8> %v)
   %r = extractelement <8 x i8> %v, i64 0
diff --git a/llvm/test/Transforms/InstCombine/vector_insertelt_shuffle.ll b/llvm/test/Transforms/InstCombine/vector_insertelt_shuffle.ll
index f745a40364211..ab2a7faa107c7 100644
--- a/llvm/test/Transforms/InstCombine/vector_insertelt_shuffle.ll
+++ b/llvm/test/Transforms/InstCombine/vector_insertelt_shuffle.ll
@@ -90,4 +90,26 @@ define <4 x float> @bazzzzzz(<4 x float> %x, i32 %a) {
   ret <4 x float> %ins1
 }
 
+; test that foldBitcastExtElt doesn't interfere with shuffle folding
+
+define <4 x half> @bitcast_extract_insert_to_shuffle(i32 %a, i32 %b) {
+; CHECK-LABEL: @bitcast_extract_insert_to_shuffle(
+; CHECK-NEXT:    [[AVEC:%.*]] = bitcast i32 [[A:%.*]] to <2 x half>
+; CHECK-NEXT:    [[BVEC:%.*]] = bitcast i32 [[B:%.*]] to <2 x half>
+; CHECK-NEXT:    [[INS3:%.*]] = shufflevector <2 x half> [[AVEC]], <2 x half> [[BVEC]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x half> [[INS3]]
+;
+  %avec = bitcast i32 %a to <2 x half>
+  %a0 = extractelement <2 x half> %avec, i32 0
+  %a1 = extractelement <2 x half> %avec, i32 1
+  %bvec = bitcast i32 %b to <2 x half>
+  %b0 = extractelement <2 x half> %bvec, i32 0
+  %b1 = extractelement <2 x half> %bvec, i32 1
+  %ins0 = insertelement <4 x half> undef, half %a0, i32 0
+  %ins1 = insertelement <4 x half> %ins0, half %a1, i32 1
+  %ins2 = insertelement <4 x half> %ins1, half %b0, i32 2
+  %ins3 = insertelement <4 x half> %ins2, half %b1, i32 3
+  ret <4 x half> %ins3
+}
+