Skip to content

Commit 994a6a3

Browse files
uyoyo0 and RKSimon authored
[VectorCombine] Fix scalarizeExtExtract for big-endian (#157962)
The scalarizeExtExtract transform assumed little-endian lane ordering, causing miscompiles on big-endian targets such as AIX/PowerPC under -O3 -flto.

This patch updates the shift calculation to handle endianness correctly for big-endian targets. No functional change for little-endian targets.

Fixes #158197.

---------

Co-authored-by: Simon Pilgrim <[email protected]>
1 parent 2c091e6 commit 994a6a3

File tree

4 files changed

+69
-2
lines changed

4 files changed

+69
-2
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2014,12 +2014,19 @@ bool VectorCombine::scalarizeExtExtract(Instruction &I) {
20142014
IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));
20152015
uint64_t SrcEltSizeInBits = DL->getTypeSizeInBits(SrcTy->getElementType());
20162016
uint64_t EltBitMask = (1ull << SrcEltSizeInBits) - 1;
2017+
uint64_t TotalBits = DL->getTypeSizeInBits(SrcTy);
2018+
Type *PackedTy = IntegerType::get(SrcTy->getContext(), TotalBits);
2019+
Value *Mask = ConstantInt::get(PackedTy, EltBitMask);
20172020
for (User *U : Ext->users()) {
20182021
auto *Extract = cast<ExtractElementInst>(U);
20192022
uint64_t Idx =
20202023
cast<ConstantInt>(Extract->getIndexOperand())->getZExtValue();
2021-
Value *LShr = Builder.CreateLShr(ScalarV, Idx * SrcEltSizeInBits);
2022-
Value *And = Builder.CreateAnd(LShr, EltBitMask);
2024+
uint64_t ShiftAmt =
2025+
DL->isBigEndian()
2026+
? (TotalBits - SrcEltSizeInBits - Idx * SrcEltSizeInBits)
2027+
: (Idx * SrcEltSizeInBits);
2028+
Value *LShr = Builder.CreateLShr(ScalarV, ShiftAmt);
2029+
Value *And = Builder.CreateAnd(LShr, Mask);
20232030
U->replaceAllUsesWith(And);
20242031
}
20252032
return true;
Lines changed: 36 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,36 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2+
; RUN: opt -passes='vector-combine' -S -mtriple=aarch64-unknown-linux-gnu %s -o - | FileCheck %s --check-prefix=LE
3+
; RUN: opt -passes='vector-combine' -S -mtriple=aarch64_be-unknown-linux-gnu %s -o - | FileCheck %s --check-prefix=BE
4+
5+
define i64 @g(<8 x i8> %v) {
6+
; LE-LABEL: @g(
7+
; LE-NEXT: [[TMP1:%.*]] = freeze <8 x i8> [[V:%.*]]
8+
; LE-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
9+
; LE-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 56
10+
; LE-NEXT: [[TMP4:%.*]] = and i64 [[TMP2]], 255
11+
; LE-NEXT: [[Z:%.*]] = zext <8 x i8> [[V]] to <8 x i64>
12+
; LE-NEXT: [[E0:%.*]] = extractelement <8 x i64> [[Z]], i32 0
13+
; LE-NEXT: [[E7:%.*]] = extractelement <8 x i64> [[Z]], i32 7
14+
; LE-NEXT: [[SUM:%.*]] = add i64 [[TMP4]], [[TMP3]]
15+
; LE-NEXT: ret i64 [[SUM]]
16+
;
17+
; BE-LABEL: @g(
18+
; BE-NEXT: [[TMP1:%.*]] = freeze <8 x i8> [[V:%.*]]
19+
; BE-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
20+
; BE-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], 255
21+
; BE-NEXT: [[TMP4:%.*]] = lshr i64 [[TMP2]], 56
22+
; BE-NEXT: [[Z:%.*]] = zext <8 x i8> [[V]] to <8 x i64>
23+
; BE-NEXT: [[E0:%.*]] = extractelement <8 x i64> [[Z]], i32 0
24+
; BE-NEXT: [[E7:%.*]] = extractelement <8 x i64> [[Z]], i32 7
25+
; BE-NEXT: [[SUM:%.*]] = add i64 [[TMP4]], [[TMP3]]
26+
; BE-NEXT: ret i64 [[SUM]]
27+
;
28+
%z = zext <8 x i8> %v to <8 x i64>
29+
%e0 = extractelement <8 x i64> %z, i32 0
30+
%e7 = extractelement <8 x i64> %z, i32 7
31+
%sum = add i64 %e0, %e7
32+
ret i64 %sum
33+
}
34+
35+
36+
Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,2 @@
1+
if 'PowerPC' not in config.root.targets:
2+
config.unsupported = True
Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,22 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2+
; RUN: opt -passes='vector-combine' -S -mtriple=powerpc64-ibm-aix-xcoff %s -o - | FileCheck %s --check-prefix=BE
3+
4+
define i64 @g(<8 x i8> %v) {
5+
; BE-LABEL: @g(
6+
; BE-NEXT: [[TMP1:%.*]] = freeze <8 x i8> [[V:%.*]]
7+
; BE-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
8+
; BE-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], 255
9+
; BE-NEXT: [[TMP4:%.*]] = lshr i64 [[TMP2]], 56
10+
; BE-NEXT: [[Z:%.*]] = zext <8 x i8> [[V]] to <8 x i64>
11+
; BE-NEXT: [[E0:%.*]] = extractelement <8 x i64> [[Z]], i32 0
12+
; BE-NEXT: [[E7:%.*]] = extractelement <8 x i64> [[Z]], i32 7
13+
; BE-NEXT: [[SUM:%.*]] = add i64 [[TMP4]], [[TMP3]]
14+
; BE-NEXT: ret i64 [[SUM]]
15+
;
16+
%z = zext <8 x i8> %v to <8 x i64>
17+
%e0 = extractelement <8 x i64> %z, i32 0
18+
%e7 = extractelement <8 x i64> %z, i32 7
19+
%sum = add i64 %e0, %e7
20+
ret i64 %sum
21+
}
22+

0 commit comments

Comments
 (0)