[InferAlignment] Propagate alignment between loads/stores of the same base pointer #145733

Merged

35 changes: 35 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -343,6 +343,9 @@ class Vectorizer {
  /// Postcondition: For all i, ret[i][0].second == 0, because the first instr
  /// in the chain is the leader, and an instr touches distance 0 from itself.
  std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);

  /// Propagates the best alignment in a chain of contiguous accesses.
  void propagateBestAlignmentsInChain(ArrayRef<ChainElem> C) const;
};

class LoadStoreVectorizerLegacyPass : public FunctionPass {
@@ -716,6 +719,14 @@ std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
  unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
  unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;

  // We know that the accesses are contiguous. Propagate alignment
  // information so that slices of the chain can still be vectorized.
  propagateBestAlignmentsInChain(C);
  LLVM_DEBUG({
    dbgs() << "LSV: Chain after alignment propagation:\n";
    dumpChain(C);
  });

  std::vector<Chain> Ret;
  for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
    // Find candidate chains of size not greater than the largest vector reg.
@@ -1634,3 +1645,27 @@ std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
.sextOrTrunc(OrigBitWidth);
return std::nullopt;
}

void Vectorizer::propagateBestAlignmentsInChain(ArrayRef<ChainElem> C) const {
  ChainElem BestAlignedElem = C[0];
  Align BestAlignSoFar = getLoadStoreAlignment(C[0].Inst);

  for (const ChainElem &E : C) {
    Align OrigAlign = getLoadStoreAlignment(E.Inst);
    if (OrigAlign > BestAlignSoFar) {
      BestAlignedElem = E;
      BestAlignSoFar = OrigAlign;
    }

    APInt OffsetFromBestAlignedElem =
        E.OffsetFromLeader - BestAlignedElem.OffsetFromLeader;
    assert(OffsetFromBestAlignedElem.isNonNegative());
    // commonAlignment is equivalent to a greatest common power-of-two divisor;
    // it returns the largest power of 2 that divides both A and B.
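    // For example, commonAlignment(Align(16), 20) == Align(4), and
    // commonAlignment(Align(16), 32) == Align(16).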
    Align NewAlign = commonAlignment(
        BestAlignSoFar, OffsetFromBestAlignedElem.getLimitedValue());
    if (NewAlign > OrigAlign)
      setLoadStoreAlignment(E.Inst, NewAlign);
  }
  return;
}
296 changes: 296 additions & 0 deletions llvm/test/Transforms/LoadStoreVectorizer/prop-align.ll
@@ -0,0 +1,296 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -passes=load-store-vectorizer -S < %s | FileCheck %s

; The IR has the first float3 labeled with align 16, and that 16 should
; be propagated such that the second set of 4 values
; can also be vectorized together.
Contributor

Is this papering over a missed middle-end optimization? The middle end should have inferred align 16 for the pointer argument, and then each successive access should already have a refined alignment derived from that.

Contributor Author

I think this is nice to have for a variety of edge cases, but the specific motivator for this change is InstCombine's handling of nested structs. You are correct that if loads/stores are unpacked from aligned loads/stores of aggregates, alignment will be propagated to the unpacked elements. However, with nested structs, InstCombine unpacks one layer at a time, losing alignment context between passes over the worklist.

Before IC:

; RUN: opt < %s -passes=instcombine -S | FileCheck %s
%struct.S1 = type { %struct.float3, %struct.float3, i32, i32 }
%struct.float3 = type { float, float, float }

define void @_Z7init_s1P2S1S0_(ptr noundef %v, ptr noundef %vout) {
  %v.addr = alloca ptr, align 8
  %vout.addr = alloca ptr, align 8
  store ptr %v, ptr %v.addr, align 8
  store ptr %vout, ptr %vout.addr, align 8
  %tmp = load ptr, ptr %vout.addr, align 8
  %tmp1 = load ptr, ptr %v.addr, align 8
  %1 = load %struct.S1, ptr %tmp1, align 16
  store %struct.S1 %1, ptr %tmp, align 16
  ret void
}

After IC:

define void @_Z7init_s1P2S1S0_(ptr noundef %v, ptr noundef %vout) {
  %.unpack.unpack = load float, ptr %v, align 16
  %.unpack.elt7 = getelementptr inbounds nuw i8, ptr %v, i64 4
  %.unpack.unpack8 = load float, ptr %.unpack.elt7, align 4
  %.unpack.elt9 = getelementptr inbounds nuw i8, ptr %v, i64 8
  %.unpack.unpack10 = load float, ptr %.unpack.elt9, align 8
  %.elt1 = getelementptr inbounds nuw i8, ptr %v, i64 12
  %.unpack2.unpack = load float, ptr %.elt1, align 4
  %.unpack2.elt12 = getelementptr inbounds nuw i8, ptr %v, i64 16
  %.unpack2.unpack13 = load float, ptr %.unpack2.elt12, align 4 ; <----------- this should be align 16
  %.unpack2.elt14 = getelementptr inbounds nuw i8, ptr %v, i64 20
  %.unpack2.unpack15 = load float, ptr %.unpack2.elt14, align 4
  %.elt3 = getelementptr inbounds nuw i8, ptr %v, i64 24
  %.unpack4 = load i32, ptr %.elt3, align 8
  %.elt5 = getelementptr inbounds nuw i8, ptr %v, i64 28
  %.unpack6 = load i32, ptr %.elt5, align 4
  store float %.unpack.unpack, ptr %vout, align 16
  %vout.repack23 = getelementptr inbounds nuw i8, ptr %vout, i64 4
  store float %.unpack.unpack8, ptr %vout.repack23, align 4
  %vout.repack25 = getelementptr inbounds nuw i8, ptr %vout, i64 8
  store float %.unpack.unpack10, ptr %vout.repack25, align 8
  %vout.repack17 = getelementptr inbounds nuw i8, ptr %vout, i64 12
  store float %.unpack2.unpack, ptr %vout.repack17, align 4
  %vout.repack17.repack27 = getelementptr inbounds nuw i8, ptr %vout, i64 16
  store float %.unpack2.unpack13, ptr %vout.repack17.repack27, align 4
  %vout.repack17.repack29 = getelementptr inbounds nuw i8, ptr %vout, i64 20
  store float %.unpack2.unpack15, ptr %vout.repack17.repack29, align 4
  %vout.repack19 = getelementptr inbounds nuw i8, ptr %vout, i64 24
  store i32 %.unpack4, ptr %vout.repack19, align 8
  %vout.repack21 = getelementptr inbounds nuw i8, ptr %vout, i64 28
  store i32 %.unpack6, ptr %vout.repack21, align 4
  ret void
}

To visualize what's happening under the hood, InstCombine is unpacking the load in stages like this:
%1 = load %struct.S1, ptr %tmp1, align 16
->
load struct.float3 align 16
load struct.float3 align 4
load i32 align 8
load i32 align 4
->
load float align 16
load float align 4
load float align 8
load float align 4
load float align 4
load float align 4
load i32 align 8
load i32 align 4

Member

If InstCombine is fixed to propagate the alignment properly, will this patch still be useful/needed? If not, then I would agree with Matt that we should try fixing the source of the problem.

Contributor Author (@dakersnar, Jul 9, 2025)

This InstCombine case is the only one I currently know of that benefits from this specific optimization, but I suspect there could be other ways to arrive at a pattern like this.

But let's ignore those hypotheticals, because I understand the desire to stick within the realm of known use cases. The question is, are we comfortable changing the unpackLoadToAggregate algorithm in InstCombine to recurse through nested structs?

for (uint64_t i = 0; i < NumElements; i++) {
  Value *Indices[2] = {
      Zero,
      ConstantInt::get(IdxType, i),
  };
  auto *Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, ArrayRef(Indices),
                                           Name + ".elt");
  auto EltAlign = commonAlignment(Align, Offset.getKnownMinValue());
  auto *L = IC.Builder.CreateAlignedLoad(AT->getElementType(), Ptr,
                                         EltAlign, Name + ".unpack");
  L->setAAMetadata(LI.getAAMetadata());
  V = IC.Builder.CreateInsertValue(V, L, i);
  Offset += EltSize;
}

I feel like implementing that would be a bit of a mess, and it leads to questions like "how many layers deep should it recurse?" Unpacking one layer at a time and then adding the nested struct elements to the IC worklist, to be independently operated on in a later iteration, is a much cleaner solution and feels more in line with the design philosophy of InstCombine. I could be wrong, though; I'm open to feedback or challenges to my assumptions.

And just to be clear, in case my previous explanation was confusing, the reason it would have to recurse through all layers at once is that we cannot attach the knowledge that "the second element of load struct.float3 align 4 is aligned to 16" to the instruction; there is no syntax available to express that.
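To make that concrete, here is a minimal sketch (value names are hypothetical) of the intermediate state after the first unpacking stage: the second float3 load can only carry align 4, even though its element at byte offset 16 from %v is in fact 16-byte aligned.

%a = load %struct.float3, ptr %v, align 16
%b.ptr = getelementptr inbounds i8, ptr %v, i64 12
%b = load %struct.float3, ptr %b.ptr, align 4 ; no way to record that its second element is 16-aligned
%c.ptr = getelementptr inbounds i8, ptr %v, i64 24
%c = load i32, ptr %c.ptr, align 8
%d.ptr = getelementptr inbounds i8, ptr %v, i64 28
%d = load i32, ptr %d.ptr, align 4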

Edit: We would also have to change unpackStoreToAggregate to do this too, further increasing the code complexity.

Contributor

This has the align 16 with opt -attributor-enable=module -O3. The default attribute passes seem to do a worse job.

Contributor

> Are there any existing passes in particular that would benefit from this upgraded alignment?

Isel benefits from better alignment information in general.

> I think the compile time tradeoffs that come with implementing it in InstCombine or InferAlignments would not be worth it. For InstCombine specifically, we have a lot of practical issues with the compile time stress it puts on our compiler at NVIDIA, so we tend to avoid adding complexity to it when possible.

We should definitely not infer better alignments in InstCombine. We specifically split the InferAlignments pass out of InstCombine to reduce the compile time spent repeatedly inferring alignment.

I think that with the scheme I proposed above, doing this in InferAlignment should be close to free, but of course we won't know until we try.

> The problem is that I think if we want to avoid those expensive calls and limit the analysis to "shallow" geps that offset directly off of aligned base pointers, we will miss a lot of cases. There are transformations like StraightLineStrengthReduce that build chains of geps that increase the instruction depth needed to find the underlying object.

I'm not sure we'll be losing much in practice. I'd generally expect that after CSE, cases where alignment can be increased in this manner will have the form of constant-offset GEPs from a common base. All of the tests you added in this PR are of this form. Did you see motivating cases where this does not hold?

Contributor Author

> We should definitely not infer better alignments in InstCombine. We specifically split the InferAlignments pass out of InstCombine to reduce the compile time spent repeatedly inferring alignment.

I agree. There is technically a solution that involves expanding the aggregate unpacking logic to traverse multiple layers at once, which would theoretically solve this problem for "free", but I think there are other tradeoffs that come with that.

> I'm not sure we'll be losing much in practice. I'd generally expect that after CSE, cases where alignment can be increased in this manner will have the form of constant-offset GEPs from a common base.

This is a reasonable argument. Let me investigate this possibility and get back to you all with my findings.

Contributor

> This has the align 16 with opt -attributor-enable=module -O3. The default attribute passes seem to do a worse job.

> Sorry, can you clarify what "this" is? What test/command are you running?

That is the command: opt -attributor-enable=module -O3.

Contributor Author (@dakersnar, Jul 21, 2025)

Thanks, I see what you mean now. That Attributor pass (which does not run by default) labels the argument pointer with the align 16 attribute, and then later InferAlignment uses computeKnownBits (which sees that attribute) to upgrade the load/store.
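As a minimal sketch of that mechanism (the function and value names here are hypothetical, not taken from the PR): once the parameter carries the align 16 attribute, computeKnownBits sees a 16-byte-aligned base plus a constant offset of 16, so InferAlignment can raise the access alignment.

define void @example(ptr align 16 %v) {
  %p = getelementptr inbounds i8, ptr %v, i64 16
  %x = load float, ptr %p, align 4 ; InferAlignment can raise this to align 16
  ret void
}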

I don't know much about the history of the Attributor pass. Is that something we should consider enabling by default? Or maybe we should add the alignment attribute to the argument at some other point in the pipeline, such as in InstCombine when the aggregate is split? Edit: Or maybe an addition to InferAlignments that attempts to add param attributes before the computeKnownBits-based inference?

Contributor Author

Also, it's looking like the InferAlignments change is working. I'll put it up for review soon, and we can weigh it against a possible Attributor fix.

%struct.float3 = type { float, float, float }
%struct.S1 = type { %struct.float3, %struct.float3, i32, i32 }

define void @testStore(ptr nocapture writeonly %1) {
; CHECK-LABEL: define void @testStore(
; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[TMP0]], align 16
; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[TMP0]], i64 0, i32 1, i32 1
; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
; CHECK-NEXT: ret void
;
store float 0.000000e+00, ptr %1, align 16
%getElem = getelementptr inbounds %struct.float3, ptr %1, i64 0, i32 1
store float 0.000000e+00, ptr %getElem, align 4
%getElem8 = getelementptr inbounds %struct.float3, ptr %1, i64 0, i32 2
store float 0.000000e+00, ptr %getElem8, align 8
%getElem9 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1
store float 0.000000e+00, ptr %getElem9, align 4
%getElem10 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1, i32 1
store float 0.000000e+00, ptr %getElem10, align 4
%getElem11 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1, i32 2
store float 0.000000e+00, ptr %getElem11, align 4
%getElem12 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 2
store i32 0, ptr %getElem12, align 8
%getElem13 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 3
store i32 0, ptr %getElem13, align 4
ret void
}

define void @testLoad(ptr nocapture writeonly %1) {
; CHECK-LABEL: define void @testLoad(
; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0]], align 16
; CHECK-NEXT: [[L11:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
; CHECK-NEXT: [[L22:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[TMP0]], i64 0, i32 1, i32 1
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[GETELEM10]], align 16
; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[L55]] to float
; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[L66]] to float
; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
; CHECK-NEXT: ret void
;
%l1 = load float, ptr %1, align 16
%getElem = getelementptr inbounds %struct.float3, ptr %1, i64 0, i32 1
%l2 = load float, ptr %getElem, align 4
%getElem8 = getelementptr inbounds %struct.float3, ptr %1, i64 0, i32 2
%l3 = load float, ptr %getElem8, align 8
%getElem9 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1
%l4 = load float, ptr %getElem9, align 4
%getElem10 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1, i32 1
%l5 = load float, ptr %getElem10, align 4
%getElem11 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1, i32 2
%l6 = load float, ptr %getElem11, align 4
%getElem12 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 2
%l7 = load i32, ptr %getElem12, align 8
%getElem13 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 3
%l8 = load i32, ptr %getElem13, align 4
ret void
}

; Also, test without the struct geps, to see if it still works with i8 geps/ptradd

define void @testStorei8(ptr nocapture writeonly %1) {
; CHECK-LABEL: define void @testStorei8(
; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[TMP0]], align 16
; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
; CHECK-NEXT: ret void
;
store float 0.000000e+00, ptr %1, align 16
%getElem = getelementptr inbounds i8, ptr %1, i64 4
store float 0.000000e+00, ptr %getElem, align 4
%getElem8 = getelementptr inbounds i8, ptr %1, i64 8
store float 0.000000e+00, ptr %getElem8, align 8
%getElem9 = getelementptr inbounds i8, ptr %1, i64 12
store float 0.000000e+00, ptr %getElem9, align 4
%getElem10 = getelementptr inbounds i8, ptr %1, i64 16
store float 0.000000e+00, ptr %getElem10, align 4
%getElem11 = getelementptr inbounds i8, ptr %1, i64 20
store float 0.000000e+00, ptr %getElem11, align 4
%getElem12 = getelementptr inbounds i8, ptr %1, i64 24
store i32 0, ptr %getElem12, align 8
%getElem13 = getelementptr inbounds i8, ptr %1, i64 28
store i32 0, ptr %getElem13, align 4
ret void
}

define void @testLoadi8(ptr nocapture writeonly %1) {
; CHECK-LABEL: define void @testLoadi8(
; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0]], align 16
; CHECK-NEXT: [[L11:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
; CHECK-NEXT: [[L22:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[GETELEM10]], align 16
; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[L55]] to float
; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[L66]] to float
; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
; CHECK-NEXT: ret void
;
%l1 = load float, ptr %1, align 16
%getElem = getelementptr inbounds i8, ptr %1, i64 4
%l2 = load float, ptr %getElem, align 4
%getElem8 = getelementptr inbounds i8, ptr %1, i64 8
%l3 = load float, ptr %getElem8, align 8
%getElem9 = getelementptr inbounds i8, ptr %1, i64 12
%l4 = load float, ptr %getElem9, align 4
%getElem10 = getelementptr inbounds i8, ptr %1, i64 16
%l5 = load float, ptr %getElem10, align 4
%getElem11 = getelementptr inbounds i8, ptr %1, i64 20
%l6 = load float, ptr %getElem11, align 4
%getElem12 = getelementptr inbounds i8, ptr %1, i64 24
%l7 = load i32, ptr %getElem12, align 8
%getElem13 = getelementptr inbounds i8, ptr %1, i64 28
%l8 = load i32, ptr %getElem13, align 4
ret void
Contributor

Can you use more representative tests with some uses? As is, these all optimize to a memset or a no-op.

Contributor Author

Slight pushback: my understanding is that lit tests are most useful when they are minimal reproducers of the problem the optimization is targeting. Adding uses would not really change the nature of this optimization. Tests like llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i1.ll follow this thinking.

If you think it would be better, I could combine each pair of load and store tests into individual tests, storing the result of the loads. Other LSV tests use that pattern a lot.

}


; This version of the test adjusts the struct to hold two i32s at the beginning,
; but still assumes that the first float3 is 16 aligned. If the alignment
; propagation works correctly, it should be able to load this struct in three
; loads: a 2x32, a 4x32, and a 4x32. Without the alignment propagation, the last
; 4x32 will instead be a 2x32 and a 2x32
%struct.S2 = type { i32, i32, %struct.float3, %struct.float3, i32, i32 }

define void @testStore_2(ptr nocapture writeonly %1) {
; CHECK-LABEL: define void @testStore_2(
; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP0]], align 8
; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds [[STRUCT_S2:%.*]], ptr [[TMP0]], i64 0, i32 2
; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[GETELEM1]], align 16
; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds [[STRUCT_S2]], ptr [[TMP0]], i64 0, i32 3, i32 1
; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
; CHECK-NEXT: ret void
;
store i32 0, ptr %1, align 8
%getElem = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 1
store i32 0, ptr %getElem, align 4
%getElem1 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2
store float 0.000000e+00, ptr %getElem1, align 16
%getElem2 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2, i32 1
store float 0.000000e+00, ptr %getElem2, align 4
%getElem8 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2, i32 2
store float 0.000000e+00, ptr %getElem8, align 8
%getElem9 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3
store float 0.000000e+00, ptr %getElem9, align 4
%getElem10 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3, i32 1
store float 0.000000e+00, ptr %getElem10, align 4
%getElem11 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3, i32 2
store float 0.000000e+00, ptr %getElem11, align 4
%getElem12 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 4
store i32 0, ptr %getElem12, align 8
%getElem13 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 5
store i32 0, ptr %getElem13, align 4
ret void
}

define void @testLoad_2(ptr nocapture writeonly %1) {
; CHECK-LABEL: define void @testLoad_2(
; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP0]], align 8
; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds [[STRUCT_S2:%.*]], ptr [[TMP0]], i64 0, i32 2
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[GETELEM1]], align 16
; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds [[STRUCT_S2]], ptr [[TMP0]], i64 0, i32 3, i32 1
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[GETELEM10]], align 16
; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[L77]] to float
; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[L88]] to float
; CHECK-NEXT: [[L99:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
; CHECK-NEXT: [[L010:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
; CHECK-NEXT: ret void
;
%l = load i32, ptr %1, align 8
%getElem = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 1
%l2 = load i32, ptr %getElem, align 4
%getElem1 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2
%l3 = load float, ptr %getElem1, align 16
%getElem2 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2, i32 1
%l4 = load float, ptr %getElem2, align 4
%getElem8 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2, i32 2
%l5 = load float, ptr %getElem8, align 8
%getElem9 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3
%l6 = load float, ptr %getElem9, align 4
%getElem10 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3, i32 1
%l7 = load float, ptr %getElem10, align 4
%getElem11 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3, i32 2
%l8 = load float, ptr %getElem11, align 4
%getElem12 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 4
%l9 = load i32, ptr %getElem12, align 8
%getElem13 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 5
%l0 = load i32, ptr %getElem13, align 4
ret void
}

; Also, test without the struct geps, to see if it still works with i8 geps/ptradd

define void @testStorei8_2(ptr nocapture writeonly %1) {
; CHECK-LABEL: define void @testStorei8_2(
; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP0]], align 8
; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[GETELEM1]], align 16
; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
; CHECK-NEXT: ret void
;
store i32 0, ptr %1, align 8
%getElem = getelementptr inbounds i8, ptr %1, i64 4
store i32 0, ptr %getElem, align 4
%getElem1 = getelementptr inbounds i8, ptr %1, i64 8
store float 0.000000e+00, ptr %getElem1, align 16
%getElem2 = getelementptr inbounds i8, ptr %1, i64 12
store float 0.000000e+00, ptr %getElem2, align 4
%getElem8 = getelementptr inbounds i8, ptr %1, i64 16
store float 0.000000e+00, ptr %getElem8, align 8
%getElem9 = getelementptr inbounds i8, ptr %1, i64 20
store float 0.000000e+00, ptr %getElem9, align 4
%getElem10 = getelementptr inbounds i8, ptr %1, i64 24
store float 0.000000e+00, ptr %getElem10, align 4
%getElem11 = getelementptr inbounds i8, ptr %1, i64 28
store float 0.000000e+00, ptr %getElem11, align 4
%getElem12 = getelementptr inbounds i8, ptr %1, i64 32
store i32 0, ptr %getElem12, align 8
%getElem13 = getelementptr inbounds i8, ptr %1, i64 36
store i32 0, ptr %getElem13, align 4
ret void
}

define void @testLoadi8_2(ptr nocapture writeonly %1) {
; CHECK-LABEL: define void @testLoadi8_2(
; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP0]], align 8
; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[GETELEM1]], align 16
; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[GETELEM10]], align 16
; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[L77]] to float
; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[L88]] to float
; CHECK-NEXT: [[L99:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
; CHECK-NEXT: [[L010:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
; CHECK-NEXT: ret void
;
%l = load i32, ptr %1, align 8
%getElem = getelementptr inbounds i8, ptr %1, i64 4
%l2 = load i32, ptr %getElem, align 4
%getElem1 = getelementptr inbounds i8, ptr %1, i64 8
%l3 = load float, ptr %getElem1, align 16
%getElem2 = getelementptr inbounds i8, ptr %1, i64 12
%l4 = load float, ptr %getElem2, align 4
%getElem8 = getelementptr inbounds i8, ptr %1, i64 16
%l5 = load float, ptr %getElem8, align 8
%getElem9 = getelementptr inbounds i8, ptr %1, i64 20
%l6 = load float, ptr %getElem9, align 4
%getElem10 = getelementptr inbounds i8, ptr %1, i64 24
%l7 = load float, ptr %getElem10, align 4
%getElem11 = getelementptr inbounds i8, ptr %1, i64 28
%l8 = load float, ptr %getElem11, align 4
%getElem12 = getelementptr inbounds i8, ptr %1, i64 32
%l9 = load i32, ptr %getElem12, align 8
%getElem13 = getelementptr inbounds i8, ptr %1, i64 36
%l0 = load i32, ptr %getElem13, align 4
ret void
}