6 changes: 6 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -817,6 +817,12 @@ class TargetTransformInfo {
LLVM_ABI bool isLegalMaskedLoad(Type *DataType, Align Alignment,
unsigned AddressSpace) const;

/// Return true if it is legal to widen loads beyond their current width,
/// assuming the result is still well-aligned. For example, converting a load
/// i32 to a load i64, or vectorizing three contiguous load i32s into a load
/// <4 x i32>.
LLVM_ABI bool isLegalToWidenLoads(LLVMContext &Context) const;

/// Return true if the target supports nontemporal store.
LLVM_ABI bool isLegalNTStore(Type *DataType, Align Alignment) const;
/// Return true if the target supports nontemporal load.
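For context, a minimal sketch of how a client pass could consult the new hook before widening a chain of loads; the helper name and call site are assumptions for illustration, since the actual use inside LoadStoreVectorizer.cpp is not rendered in this view.

```cpp
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/LLVMContext.h"

// Hypothetical helper: ask the target whether widening loads is legal at all.
// Targets answer false by default; NVPTX opts in (see NVPTXTTIImpl below).
static bool chainMayBeWidened(const llvm::TargetTransformInfo &TTI,
                              llvm::LLVMContext &Ctx) {
  return TTI.isLegalToWidenLoads(Ctx);
}
```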
2 changes: 2 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -318,6 +318,8 @@ class TargetTransformInfoImplBase {
return false;
}

virtual bool isLegalToWidenLoads(LLVMContext &Context) const { return false; }

virtual bool isLegalNTStore(Type *DataType, Align Alignment) const {
// By default, assume nontemporal memory stores are available for stores
// that are aligned and have a size that is a power of 2.
4 changes: 4 additions & 0 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -476,6 +476,10 @@ bool TargetTransformInfo::isLegalMaskedLoad(Type *DataType, Align Alignment,
return TTIImpl->isLegalMaskedLoad(DataType, Alignment, AddressSpace);
}

bool TargetTransformInfo::isLegalToWidenLoads(LLVMContext &Context) const {
return TTIImpl->isLegalToWidenLoads(Context);
}

bool TargetTransformInfo::isLegalNTStore(Type *DataType,
Align Alignment) const {
return TTIImpl->isLegalNTStore(DataType, Alignment);
4 changes: 4 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -72,6 +72,10 @@ class NVPTXTTIImpl final : public BasicTTIImplBase<NVPTXTTIImpl> {
return isLegalToVectorizeLoadChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool isLegalToWidenLoads(LLVMContext &Context) const override {
return true;
}

// NVPTX has infinite registers of all kinds, but the actual machine doesn't.
// We conservatively return 1 here which is just enough to enable the
// vectorizers but disables heuristics based on the number of registers.
408 changes: 356 additions & 52 deletions llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

Large diffs are not rendered by default.

40 changes: 21 additions & 19 deletions llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
@@ -45,29 +45,31 @@ define half @fh(ptr %p) {
; ENABLED-LABEL: fh(
; ENABLED: {
; ENABLED-NEXT: .reg .b16 %rs<10>;
; ENABLED-NEXT: .reg .b32 %r<13>;
; ENABLED-NEXT: .reg .b32 %r<17>;
; ENABLED-NEXT: .reg .b64 %rd<2>;
; ENABLED-EMPTY:
; ENABLED-NEXT: // %bb.0:
; ENABLED-NEXT: ld.param.b64 %rd1, [fh_param_0];
; ENABLED-NEXT: ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
; ENABLED-NEXT: ld.b16 %rs5, [%rd1+8];
; ENABLED-NEXT: cvt.f32.f16 %r1, %rs2;
; ENABLED-NEXT: cvt.f32.f16 %r2, %rs1;
; ENABLED-NEXT: add.rn.f32 %r3, %r2, %r1;
; ENABLED-NEXT: cvt.rn.f16.f32 %rs6, %r3;
; ENABLED-NEXT: cvt.f32.f16 %r4, %rs4;
; ENABLED-NEXT: cvt.f32.f16 %r5, %rs3;
; ENABLED-NEXT: add.rn.f32 %r6, %r5, %r4;
; ENABLED-NEXT: cvt.rn.f16.f32 %rs7, %r6;
; ENABLED-NEXT: cvt.f32.f16 %r7, %rs7;
; ENABLED-NEXT: cvt.f32.f16 %r8, %rs6;
; ENABLED-NEXT: add.rn.f32 %r9, %r8, %r7;
; ENABLED-NEXT: cvt.rn.f16.f32 %rs8, %r9;
; ENABLED-NEXT: cvt.f32.f16 %r10, %rs8;
; ENABLED-NEXT: cvt.f32.f16 %r11, %rs5;
; ENABLED-NEXT: add.rn.f32 %r12, %r10, %r11;
; ENABLED-NEXT: cvt.rn.f16.f32 %rs9, %r12;
; ENABLED-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
Member

This does not look right. Our input is presumably an array of f16 elements, yet we end up loading 4 x b32 and then appear to ignore the last two elements. It should have been ld.v2.b32, or perhaps the load should have remained ld.v4.f16.

Contributor Author

Note the difference in the number of ld instructions in the PTX. The old output has two load instructions to load 5 b16s: a ld.v4.b16 and a ld.b16. The new version, in the LSV, "extends" the chain of 5 loads to the next power of two, a chain of 8 loads with 3 unused tail elements, vectorizing it into a single load <8 x i16>. This gets lowered by the backend to a ld.v4.b32, with 2.5 elements (containing the packed 5 b16s) ending up being used and the rest unused.

This reduction from two load instructions to one load instruction is an optimization.
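For illustration, the "extend to the next power of two" step described above is just a rounding of the chain's element count; a small self-contained sketch (the helper name is assumed, and the real logic lives in the unrendered LoadStoreVectorizer.cpp changes):

```cpp
#include <bit>
#include <cstdint>

// Round a chain's element count up to the next power of two, e.g. 3 -> 4 and
// 5 -> 8. A chain of 5 x i16 thus becomes one <8 x i16> (16-byte) load, which
// the NVPTX backend emits as a single ld.v4.b32.
uint64_t extendedChainElems(uint64_t NumElems) {
  return std::bit_ceil(NumElems);
}
```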

Member

I had missed the 5th f16 load. The generated code looks correct.

My next question is whether this extension is always beneficial. E.g. if we do that on shared memory, it may increase bank contention due to the extra loads. In the worst case we'd waste ~25% of shared-memory bandwidth for this particular extension from v5f16 to v4b32.

I think we should take AS info into account and have some sort of user-controllable knob to enable/disable the gap filling, if needed. E.g. it's probably always good for loads from global AS; it's a maybe for shared memory (fewer instructions may win over bank conflicts if the extra loads happen to be broadcast to other threads' loads, but would waste bandwidth otherwise); and we can't say much about generic AS, as it could go either way, I think.
For masked writes it's more likely to be a win, as we don't actually write extra data, so the potential downside is a possible register pressure bump.


> if we do that on shared memory, it may increase bank contention due to the extra loads.

I don't think that's a concern for CUDA GPUs. But it's a good idea to add the AS as a parameter to the TTI API; other targets may want to control this feature for specific address spaces.
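To make that concrete, one possible shape of an address-space-aware policy; this is only a sketch, and neither the names nor the policy below are part of this PR (the hook in this revision takes only an LLVMContext):

```cpp
// NVPTX address-space numbers, as defined in NVPTXBaseInfo.h.
constexpr unsigned ADDRESS_SPACE_GENERIC = 0;
constexpr unsigned ADDRESS_SPACE_GLOBAL = 1;
constexpr unsigned ADDRESS_SPACE_SHARED = 3;

// Hypothetical per-AS knob: widen loads from global memory, stay conservative
// for shared memory (possible extra bank traffic) and for generic pointers
// (which may resolve to either).
bool shouldWidenLoadsForAS(unsigned AddrSpace) {
  return AddrSpace == ADDRESS_SPACE_GLOBAL;
}
```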

; ENABLED-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r3; }
; ENABLED-NEXT: mov.b32 {%rs2, %rs3}, %r2;
; ENABLED-NEXT: mov.b32 {%rs4, %rs5}, %r1;
; ENABLED-NEXT: cvt.f32.f16 %r5, %rs5;
; ENABLED-NEXT: cvt.f32.f16 %r6, %rs4;
; ENABLED-NEXT: add.rn.f32 %r7, %r6, %r5;
; ENABLED-NEXT: cvt.rn.f16.f32 %rs6, %r7;
; ENABLED-NEXT: cvt.f32.f16 %r8, %rs3;
; ENABLED-NEXT: cvt.f32.f16 %r9, %rs2;
; ENABLED-NEXT: add.rn.f32 %r10, %r9, %r8;
; ENABLED-NEXT: cvt.rn.f16.f32 %rs7, %r10;
; ENABLED-NEXT: cvt.f32.f16 %r11, %rs7;
; ENABLED-NEXT: cvt.f32.f16 %r12, %rs6;
; ENABLED-NEXT: add.rn.f32 %r13, %r12, %r11;
; ENABLED-NEXT: cvt.rn.f16.f32 %rs8, %r13;
; ENABLED-NEXT: cvt.f32.f16 %r14, %rs8;
; ENABLED-NEXT: cvt.f32.f16 %r15, %rs1;
; ENABLED-NEXT: add.rn.f32 %r16, %r14, %r15;
; ENABLED-NEXT: cvt.rn.f16.f32 %rs9, %r16;
; ENABLED-NEXT: st.param.b16 [func_retval0], %rs9;
; ENABLED-NEXT: ret;
;
6 changes: 2 additions & 4 deletions llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
@@ -171,8 +171,7 @@ define internal fastcc [3 x i32] @callee_St4x3(ptr nocapture noundef readonly by
; CHECK: .func (.param .align 16 .b8 func_retval0[12])
; CHECK-LABEL: callee_St4x3(
; CHECK-NEXT: .param .align 16 .b8 callee_St4x3_param_0[12]
; CHECK: ld.param.v2.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x3_param_0];
; CHECK: ld.param.b32 [[R3:%r[0-9]+]], [callee_St4x3_param_0+8];
; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], %{{.*}}}, [callee_St4x3_param_0];
; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[R1]], [[R2]]};
; CHECK-DAG: st.param.b32 [func_retval0+8], [[R3]];
; CHECK-NEXT: ret;
@@ -394,8 +393,7 @@ define internal fastcc [7 x i32] @callee_St4x7(ptr nocapture noundef readonly by
; CHECK-LABEL: callee_St4x7(
; CHECK-NEXT: .param .align 16 .b8 callee_St4x7_param_0[28]
; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x7_param_0];
; CHECK: ld.param.v2.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]]}, [callee_St4x7_param_0+16];
; CHECK: ld.param.b32 [[R7:%r[0-9]+]], [callee_St4x7_param_0+24];
; CHECK: ld.param.v4.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]], [[R7:%r[0-9]+]], %{{.*}}}, [callee_St4x7_param_0+16];
; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]};
; CHECK-DAG: st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]};
; CHECK-DAG: st.param.b32 [func_retval0+24], [[R7]];
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -110,7 +110,7 @@ define dso_local i32 @foo() {
; CHECK-PTX-NEXT: // %bb.0: // %entry
; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot1;
; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-PTX-NEXT: st.b64 [%SP], 4294967297;
; CHECK-PTX-NEXT: st.v2.b32 [%SP], {1, 1};
; CHECK-PTX-NEXT: st.b32 [%SP+8], 1;
; CHECK-PTX-NEXT: st.b64 [%SP+16], 1;
; CHECK-PTX-NEXT: st.b64 [%SP+24], 4607182418800017408;
81 changes: 81 additions & 0 deletions llvm/test/Transforms/LoadStoreVectorizer/NVPTX/extend-chain.ll
@@ -0,0 +1,81 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s | FileCheck %s

;; Check that the vectorizer extends a Chain to the next power of two,
;; essentially loading more vector elements than the original
;; code. Alignment and other requirements for vectorization should
;; still be met.

define void @load3to4(ptr %p) #0 {
; CHECK-LABEL: define void @load3to4(
; CHECK-SAME: ptr [[P:%.*]]) {
; CHECK-NEXT: [[P_0:%.*]] = getelementptr i32, ptr [[P]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P_0]], align 16
; CHECK-NEXT: [[V01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
; CHECK-NEXT: [[V12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
; CHECK-NEXT: [[V23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
; CHECK-NEXT: [[EXTEND4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
; CHECK-NEXT: ret void
;
%p.0 = getelementptr i32, ptr %p, i32 0
%p.1 = getelementptr i32, ptr %p, i32 1
%p.2 = getelementptr i32, ptr %p, i32 2

%v0 = load i32, ptr %p.0, align 16
%v1 = load i32, ptr %p.1, align 4
%v2 = load i32, ptr %p.2, align 8

ret void
}

define void @load5to8(ptr %p) #0 {
; CHECK-LABEL: define void @load5to8(
; CHECK-SAME: ptr [[P:%.*]]) {
; CHECK-NEXT: [[P_0:%.*]] = getelementptr i16, ptr [[P]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P_0]], align 16
; CHECK-NEXT: [[V05:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0
; CHECK-NEXT: [[V16:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1
; CHECK-NEXT: [[V27:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2
; CHECK-NEXT: [[V38:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3
; CHECK-NEXT: [[V49:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4
; CHECK-NEXT: [[EXTEND10:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5
; CHECK-NEXT: [[EXTEND211:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6
; CHECK-NEXT: [[EXTEND412:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
; CHECK-NEXT: ret void
;
%p.0 = getelementptr i16, ptr %p, i32 0
%p.1 = getelementptr i16, ptr %p, i32 1
%p.2 = getelementptr i16, ptr %p, i32 2
%p.3 = getelementptr i16, ptr %p, i32 3
%p.4 = getelementptr i16, ptr %p, i32 4

%v0 = load i16, ptr %p.0, align 16
%v1 = load i16, ptr %p.1, align 2
%v2 = load i16, ptr %p.2, align 4
%v3 = load i16, ptr %p.3, align 8
%v4 = load i16, ptr %p.4, align 2

ret void
}

define void @load3to4_unaligned(ptr %p) #0 {
; CHECK-LABEL: define void @load3to4_unaligned(
; CHECK-SAME: ptr [[P:%.*]]) {
; CHECK-NEXT: [[P_0:%.*]] = getelementptr i32, ptr [[P]], i32 0
; CHECK-NEXT: [[P_2:%.*]] = getelementptr i32, ptr [[P]], i32 2
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[P_0]], align 8
; CHECK-NEXT: [[V01:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
; CHECK-NEXT: [[V12:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[P_2]], align 8
; CHECK-NEXT: ret void
;
%p.0 = getelementptr i32, ptr %p, i32 0
%p.1 = getelementptr i32, ptr %p, i32 1
%p.2 = getelementptr i32, ptr %p, i32 2

%v0 = load i32, ptr %p.0, align 8
%v1 = load i32, ptr %p.1, align 4
%v2 = load i32, ptr %p.2, align 8

ret void
}
37 changes: 37 additions & 0 deletions llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-cleanup.ll
@@ -0,0 +1,37 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S < %s | FileCheck %s

; Test that gap-fill instructions get deleted if they are not used
%struct.S10 = type { i32, i32, i32, i32 }

; First, confirm that gap-fill instructions get generated and would be vectorized if the alignment is correct
define void @fillTwoGapsCanVectorize(ptr %in) {
; CHECK-LABEL: define void @fillTwoGapsCanVectorize(
; CHECK-SAME: ptr [[IN:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[IN]], align 16
; CHECK-NEXT: [[LOAD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
; CHECK-NEXT: [[GAPFILL4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
; CHECK-NEXT: [[GAPFILL25:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
; CHECK-NEXT: [[LOAD36:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
; CHECK-NEXT: ret void
;
%load0 = load i32, ptr %in, align 16
%getElem = getelementptr i8, ptr %in, i64 12
%load3 = load i32, ptr %getElem, align 4
ret void
}

; Then, confirm that gap-fill instructions get deleted if the alignment prevents vectorization
define void @fillTwoGapsCantVectorize(ptr %in) {
; CHECK-LABEL: define void @fillTwoGapsCantVectorize(
; CHECK-SAME: ptr [[IN:%.*]]) {
; CHECK-NEXT: [[LOAD0:%.*]] = load i32, ptr [[IN]], align 4
; CHECK-NEXT: [[GETELEM:%.*]] = getelementptr i8, ptr [[IN]], i64 12
; CHECK-NEXT: [[LOAD3:%.*]] = load i32, ptr [[GETELEM]], align 4
; CHECK-NEXT: ret void
;
%load0 = load i32, ptr %in, align 4
%getElem = getelementptr i8, ptr %in, i64 12
%load3 = load i32, ptr %getElem, align 4
ret void
}
@@ -0,0 +1,83 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S < %s | FileCheck %s

; Test that gap-fill loads don't lose invariant metadata
%struct.S10 = type { i32, i32, i32, i32 }

; With no gaps, if every load is invariant, the vectorized load will be too.
define i32 @noGaps(ptr %in) {
; CHECK-LABEL: define i32 @noGaps(
; CHECK-SAME: ptr [[IN:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[IN]], align 16, !invariant.load [[META0:![0-9]+]]
; CHECK-NEXT: [[TMP01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
; CHECK-NEXT: [[SUM01:%.*]] = add i32 [[TMP01]], [[TMP12]]
; CHECK-NEXT: [[SUM012:%.*]] = add i32 [[SUM01]], [[TMP23]]
; CHECK-NEXT: [[SUM0123:%.*]] = add i32 [[SUM012]], [[TMP34]]
; CHECK-NEXT: ret i32 [[SUM0123]]
;
%load0 = load i32, ptr %in, align 16, !invariant.load !0
%getElem1 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 1
%load1 = load i32, ptr %getElem1, align 4, !invariant.load !0
%getElem2 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 2
%load2 = load i32, ptr %getElem2, align 4, !invariant.load !0
%getElem3 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 3
%load3 = load i32, ptr %getElem3, align 4, !invariant.load !0
%sum01 = add i32 %load0, %load1
%sum012 = add i32 %sum01, %load2
%sum0123 = add i32 %sum012, %load3
ret i32 %sum0123
}

; If one of the loads is not invariant, the vectorized load will not be invariant.
define i32 @noGapsMissingInvariant(ptr %in) {
; CHECK-LABEL: define i32 @noGapsMissingInvariant(
; CHECK-SAME: ptr [[IN:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[IN]], align 16
; CHECK-NEXT: [[TMP01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
; CHECK-NEXT: [[SUM01:%.*]] = add i32 [[TMP01]], [[TMP12]]
; CHECK-NEXT: [[SUM012:%.*]] = add i32 [[SUM01]], [[TMP23]]
; CHECK-NEXT: [[SUM0123:%.*]] = add i32 [[SUM012]], [[TMP34]]
; CHECK-NEXT: ret i32 [[SUM0123]]
;
%load0 = load i32, ptr %in, align 16, !invariant.load !0
%getElem1 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 1
%load1 = load i32, ptr %getElem1, align 4, !invariant.load !0
%getElem2 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 2
%load2 = load i32, ptr %getElem2, align 4, !invariant.load !0
%getElem3 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 3
%load3 = load i32, ptr %getElem3, align 4
%sum01 = add i32 %load0, %load1
%sum012 = add i32 %sum01, %load2
%sum0123 = add i32 %sum012, %load3
ret i32 %sum0123
}

; With two gaps, if every real load is invariant, the vectorized load will be too.
define i32 @twoGaps(ptr %in) {
; CHECK-LABEL: define i32 @twoGaps(
; CHECK-SAME: ptr [[IN:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[IN]], align 16, !invariant.load [[META0]]
; CHECK-NEXT: [[LOAD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
; CHECK-NEXT: [[GAPFILL4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
; CHECK-NEXT: [[GAPFILL25:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
; CHECK-NEXT: [[LOAD36:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
; CHECK-NEXT: [[SUM:%.*]] = add i32 [[LOAD03]], [[LOAD36]]
; CHECK-NEXT: ret i32 [[SUM]]
;
%load0 = load i32, ptr %in, align 16, !invariant.load !0
%getElem3 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 3
%load3 = load i32, ptr %getElem3, align 4, !invariant.load !0
%sum = add i32 %load0, %load3
ret i32 %sum
}

!0 = !{}
;.
; CHECK: [[META0]] = !{}
;.