Skip to content
20 changes: 18 additions & 2 deletions llvm/lib/Transforms/Vectorize/VectorCombine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2017,8 +2017,24 @@ bool VectorCombine::scalarizeExtExtract(Instruction &I) {

Value *ScalarV = Ext->getOperand(0);
if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV),
&DT))
ScalarV = Builder.CreateFreeze(ScalarV);
&DT)) {
// Check if all lanes are extracted and all extracts trigger UB on poison.
// If so, we do not need to insert a freeze.
SmallDenseSet<uint64_t, 8> ExtractedLanes;
bool AllExtractsHaveUB = true;
for (User *U : Ext->users()) {
auto *Extract = cast<ExtractElementInst>(U);
uint64_t Idx =
cast<ConstantInt>(Extract->getIndexOperand())->getZExtValue();
ExtractedLanes.insert(Idx);
if (!programUndefinedIfPoison(Extract)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is under the assumption that the extract will be executed. The extract might be conditionally executed in a different block. I think the rest of the code does not exclude that possibility?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At least initially, it should probably be enough to restrict this to cases where the extracts are in the same block as the extend, and they are guaranteed to execute, if the extend executes

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, thanks! I added checks that the extracts are in the same block as the extend and that they are executed, and tests showing the freeze will still be inserted if those checks fail.

AllExtractsHaveUB = false;
break;
}
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might be worth to see if this logic can be moved to ValueTracking?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would probably make sense to first add an initial implementation here, and then check if there are other cases where this would help via ValueTracking as follow-up

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good, I've addressed the comments in VectorCombine for now. I think I have a proposal on how to move the logic to ValueTracking, will post it as a follow-up.

if (!AllExtractsHaveUB || ExtractedLanes.size() != SrcTy->getNumElements())
ScalarV = Builder.CreateFreeze(ScalarV);
}
ScalarV = Builder.CreateBitCast(
ScalarV,
IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));
Expand Down
34 changes: 34 additions & 0 deletions llvm/test/Transforms/VectorCombine/AArch64/ext-extract.ll
Original file line number Diff line number Diff line change
Expand Up @@ -346,3 +346,37 @@ entry:
call void @use.i32(i32 %ext.3)
ret void
}

define noundef i32 @zext_v4i8_all_lanes_used_no_freeze(<4 x i8> %src) {
; CHECK-LABEL: define noundef i32 @zext_v4i8_all_lanes_used_no_freeze(
; CHECK-SAME: <4 x i8> [[SRC:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i8> [[SRC]] to i32
; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 24
; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP0]], 16
; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], 255
; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP0]], 8
; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], 255
; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP0]], 255
; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32>
; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0
; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1
; CHECK-NEXT: [[EXT_2:%.*]] = extractelement <4 x i32> [[EXT]], i64 2
; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3
; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[TMP6]], [[TMP5]]
; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[TMP3]]
; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD2]], [[TMP1]]
; CHECK-NEXT: ret i32 [[ADD3]]
;
entry:
%ext = zext nneg <4 x i8> %src to <4 x i32>
%ext.0 = extractelement <4 x i32> %ext, i64 0
%ext.1 = extractelement <4 x i32> %ext, i64 1
%ext.2 = extractelement <4 x i32> %ext, i64 2
%ext.3 = extractelement <4 x i32> %ext, i64 3

%add1 = add i32 %ext.0, %ext.1
%add2 = add i32 %add1, %ext.2
%add3 = add i32 %add2, %ext.3
ret i32 %add3
}