From 84c932d12386866bbf4af33a2540e0c3cb3a3091 Mon Sep 17 00:00:00 2001 From: Zach Goldthorpe Date: Wed, 11 Jun 2025 15:19:20 -0500 Subject: [PATCH 1/4] Extended vector promotion to aggregate types. --- .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 96 ++++--- .../CodeGen/AMDGPU/promote-alloca-structs.ll | 263 ++++++++++++++++++ 2 files changed, 318 insertions(+), 41 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 700dc87d2f821..336e3a1db7e73 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -818,6 +818,28 @@ static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB, return I; } +/// Get the underlying type of a homogeneous aggregate type, or nullptr if the +/// type is non-homogeneous. +static Type *getHomogeneousType(Type *Ty) { + if (auto *VectorTy = dyn_cast(Ty)) + return VectorTy->getElementType(); + if (auto *ArrayTy = dyn_cast(Ty)) + return getHomogeneousType(ArrayTy->getElementType()); + if (auto *StructTy = dyn_cast(Ty)) { + if (StructTy->getNumElements() == 0) + return nullptr; + + auto *Iter = StructTy->element_begin(); + Type *HTy = getHomogeneousType(*Iter); + for (; Iter != StructTy->element_end(); ++Iter) + if (getHomogeneousType(*Iter) != HTy) + return nullptr; + + return HTy; + } + return Ty; +} + // FIXME: Should try to pick the most likely to be profitable allocas first. bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n'); @@ -828,42 +850,43 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { } Type *AllocaTy = Alloca.getAllocatedType(); - auto *VectorTy = dyn_cast(AllocaTy); - if (auto *ArrayTy = dyn_cast(AllocaTy)) { - uint64_t NumElems = 1; - Type *ElemTy; - do { - NumElems *= ArrayTy->getNumElements(); - ElemTy = ArrayTy->getElementType(); - } while ((ArrayTy = dyn_cast(ElemTy))); - - // Check for array of vectors - auto *InnerVectorTy = dyn_cast(ElemTy); - if (InnerVectorTy) { - NumElems *= InnerVectorTy->getNumElements(); - ElemTy = InnerVectorTy->getElementType(); - } + Type *ElemTy = getHomogeneousType(AllocaTy); - if (VectorType::isValidElementType(ElemTy) && NumElems > 0) { - unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8; - if (ElementSize > 0) { - unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy); - // Expand vector if required to match padding of inner type, - // i.e. odd size subvectors. - // Storage size of new vector must match that of alloca for correct - // behaviour of byte offsets and GEP computation. - if (NumElems * ElementSize != AllocaSize) - NumElems = AllocaSize / ElementSize; - if (NumElems > 0 && (AllocaSize % ElementSize) == 0) - VectorTy = FixedVectorType::get(ElemTy, NumElems); - } - } + if (!ElemTy || !VectorType::isValidElementType(ElemTy)) { + LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n"); + return false; } - if (!VectorTy) { - LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n"); + unsigned ElementSizeInBits = DL->getTypeSizeInBits(ElemTy); + if (ElementSizeInBits == 0) { + LLVM_DEBUG(dbgs() << " Cannot create vector of zero-sized elements."); + return false; + } + if (ElementSizeInBits != DL->getTypeAllocSizeInBits(ElemTy)) { + LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size " + "does not match the type's size\n"); return false; } + unsigned ElementSize = ElementSizeInBits / 8; + if (ElementSize == 0) + return false; + + // Calculate the size of the corresponding vector, accounting for padding of + // inner types, e.g., odd-sized subvectors. Storage size of new vector must + // match that of alloca for correct behaviour of byte offsets and GEP + // computation. + unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy); + unsigned NumElems = AllocaSize / ElementSize; + if (NumElems == 0) { + LLVM_DEBUG(dbgs() << " Cannot vectorize an empty aggregate type."); + return false; + } + if (NumElems * ElementSize != AllocaSize) { + LLVM_DEBUG(dbgs() << " Cannot convert type into vector of the same size."); + return false; + } + auto *VectorTy = FixedVectorType::get(ElemTy, NumElems); + assert(VectorTy && "Failed to create vector type."); const unsigned MaxElements = (MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType()); @@ -895,15 +918,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n"); - Type *VecEltTy = VectorTy->getElementType(); - unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy); - if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) { - LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size " - "does not match the type's size\n"); - return false; - } - unsigned ElementSize = ElementSizeInBits / 8; - assert(ElementSize > 0); for (auto *U : Uses) { Instruction *Inst = cast(U->getUser()); @@ -943,7 +957,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { if (auto *GEP = dyn_cast(Inst)) { // If we can't compute a vector index from this GEP, then we can't // promote this alloca to vector. - Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts); + Value *Index = GEPToVectorIndex(GEP, &Alloca, ElemTy, *DL, NewGEPInsts); if (!Index) return RejectUser(Inst, "cannot compute vector index for GEP"); diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll new file mode 100644 index 0000000000000..d09f6ba1e7b68 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll @@ -0,0 +1,263 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca-to-vector -amdgpu-promote-alloca-to-vector-limit=512 -amdgpu-promote-alloca-to-vector-max-regs=32 %s | FileCheck %s + +declare void @clobber_i8(i8) + +define void @test_v4i8(i64 %idx) { +; CHECK-LABEL: define void @test_v4i8( +; CHECK-SAME: i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]] +; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) +; CHECK-NEXT: ret void +; + %stack = alloca <4 x i8>, align 4, addrspace(5) + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + call void @clobber_i8(i8 %val) + ret void +} + +define void @test_a4i8(i64 %idx) { +; CHECK-LABEL: define void @test_a4i8( +; CHECK-SAME: i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]] +; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) +; CHECK-NEXT: ret void +; + %stack = alloca [4 x i8], align 4, addrspace(5) + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + call void @clobber_i8(i8 %val) + ret void +} + +define void @test_a2v4i8(i64 %idx) { +; CHECK-LABEL: define void @test_a2v4i8( +; CHECK-SAME: i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]] +; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) +; CHECK-NEXT: ret void +; + %stack = alloca [2 x <4 x i8>], align 4, addrspace(5) + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + call void @clobber_i8(i8 %val) + ret void +} + +define void @test_a2v3i8(i64 %idx) { +; CHECK-LABEL: define void @test_a2v3i8( +; CHECK-SAME: i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]] +; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) +; CHECK-NEXT: ret void +; + %stack = alloca [2 x <3 x i8>], align 4, addrspace(5) + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + call void @clobber_i8(i8 %val) + ret void +} + +define void @test_a2a4i8(i64 %idx) { +; CHECK-LABEL: define void @test_a2a4i8( +; CHECK-SAME: i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]] +; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) +; CHECK-NEXT: ret void +; + %stack = alloca [2 x [4 x i8]], align 4, addrspace(5) + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + call void @clobber_i8(i8 %val) + ret void +} + +define void @test_a2a3i8(i64 %idx) { +; CHECK-LABEL: define void @test_a2a3i8( +; CHECK-SAME: i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <6 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <6 x i8> [[STACK]], i64 [[IDX]] +; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) +; CHECK-NEXT: ret void +; + %stack = alloca [2 x [3 x i8]], align 4, addrspace(5) + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + call void @clobber_i8(i8 %val) + ret void +} + +define void @test_s1v4i8(i64 %idx) { +; CHECK-LABEL: define void @test_s1v4i8( +; CHECK-SAME: i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]] +; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) +; CHECK-NEXT: ret void +; + %stack = alloca {<4 x i8>}, align 4, addrspace(5) + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + call void @clobber_i8(i8 %val) + ret void +} + +define void @test_s1a4i8(i64 %idx) { +; CHECK-LABEL: define void @test_s1a4i8( +; CHECK-SAME: i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]] +; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) +; CHECK-NEXT: ret void +; + %stack = alloca {[4 x i8]}, align 4, addrspace(5) + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + call void @clobber_i8(i8 %val) + ret void +} + +define void @test_s4i8(i64 %idx) { +; CHECK-LABEL: define void @test_s4i8( +; CHECK-SAME: i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]] +; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) +; CHECK-NEXT: ret void +; + %stack = alloca {i8, i8, i8, i8}, align 4, addrspace(5) + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + call void @clobber_i8(i8 %val) + ret void +} + +define void @test_s2v4i8(i64 %idx) { +; CHECK-LABEL: define void @test_s2v4i8( +; CHECK-SAME: i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]] +; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) +; CHECK-NEXT: ret void +; + %stack = alloca {<4 x i8>, <4 x i8>}, align 4, addrspace(5) + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + call void @clobber_i8(i8 %val) + ret void +} + +define void @test_s2v2i8v4i8(i64 %idx) { +; CHECK-LABEL: define void @test_s2v2i8v4i8( +; CHECK-SAME: i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]] +; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) +; CHECK-NEXT: ret void +; + %stack = alloca {<2 x i8>, <4 x i8>}, align 4, addrspace(5) + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + call void @clobber_i8(i8 %val) + ret void +} + +define void @test_s2v2i8v3i8(i64 %idx) { +; CHECK-LABEL: define void @test_s2v2i8v3i8( +; CHECK-SAME: i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]] +; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) +; CHECK-NEXT: ret void +; + %stack = alloca {<2 x i8>, <3 x i8>}, align 4, addrspace(5) + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + call void @clobber_i8(i8 %val) + ret void +} + +define void @test_s2s2i8s4i8(i64 %idx) { +; CHECK-LABEL: define void @test_s2s2i8s4i8( +; CHECK-SAME: i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <6 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <6 x i8> [[STACK]], i64 [[IDX]] +; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) +; CHECK-NEXT: ret void +; + %stack = alloca {{i8, i8}, {i8, i8, i8, i8}}, align 4, addrspace(5) + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + call void @clobber_i8(i8 %val) + ret void +} + +define void @test_s2s2i8s3i8(i64 %idx) { +; CHECK-LABEL: define void @test_s2s2i8s3i8( +; CHECK-SAME: i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <5 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <5 x i8> [[STACK]], i64 [[IDX]] +; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) +; CHECK-NEXT: ret void +; + %stack = alloca {{i8, i8}, {i8, i8, i8}}, align 4, addrspace(5) + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + call void @clobber_i8(i8 %val) + ret void +} + +define void @test_s3i8s1i8v2i8(i64 %idx) { +; CHECK-LABEL: define void @test_s3i8s1i8v2i8( +; CHECK-SAME: i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]] +; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) +; CHECK-NEXT: ret void +; + %stack = alloca {i8, {i8}, <2 x i8>}, align 4, addrspace(5) + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + call void @clobber_i8(i8 %val) + ret void +} + +; heterogeneous element types are not supported +define void @test_heterogeneous(i64 %idx) { +; CHECK-LABEL: define void @test_heterogeneous( +; CHECK-SAME: i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = alloca { i8, i8, i16 }, align 4, addrspace(5) +; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]] +; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1 +; CHECK-NEXT: call void @clobber_i8(i8 [[VAL]]) +; CHECK-NEXT: ret void +; + %stack = alloca {i8, i8, i16}, align 4, addrspace(5) + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + call void @clobber_i8(i8 %val) + ret void +} + +; empty structs are not supported +define void @test_empty(i64 %idx) { +; CHECK-LABEL: define void @test_empty( +; CHECK-SAME: i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = alloca { i8, {} }, align 4, addrspace(5) +; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]] +; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1 +; CHECK-NEXT: call void @clobber_i8(i8 [[VAL]]) +; CHECK-NEXT: ret void +; + %stack = alloca {i8, {}}, align 4, addrspace(5) + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + call void @clobber_i8(i8 %val) + ret void +} From ed920b76d0c77bf5e2aa5191cfca8c4b676d4dc9 Mon Sep 17 00:00:00 2001 From: Zach Goldthorpe Date: Wed, 11 Jun 2025 17:00:31 -0500 Subject: [PATCH 2/4] Refactored away recursion. --- .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 39 ++++++++++++------- .../CodeGen/AMDGPU/promote-alloca-structs.ll | 34 +++++++++++++++- 2 files changed, 58 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 336e3a1db7e73..ab1c3a5919ea1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -821,23 +821,34 @@ static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB, /// Get the underlying type of a homogeneous aggregate type, or nullptr if the /// type is non-homogeneous. static Type *getHomogeneousType(Type *Ty) { - if (auto *VectorTy = dyn_cast(Ty)) - return VectorTy->getElementType(); - if (auto *ArrayTy = dyn_cast(Ty)) - return getHomogeneousType(ArrayTy->getElementType()); - if (auto *StructTy = dyn_cast(Ty)) { - if (StructTy->getNumElements() == 0) - return nullptr; + Type *ElemTy = nullptr; + SmallVector WorkList; + WorkList.push_back(Ty); + while (!WorkList.empty()) { + Type *CurTy = WorkList.pop_back_val(); - auto *Iter = StructTy->element_begin(); - Type *HTy = getHomogeneousType(*Iter); - for (; Iter != StructTy->element_end(); ++Iter) - if (getHomogeneousType(*Iter) != HTy) - return nullptr; + // Check if the current type is an aggregate type. + if (auto *VectorTy = dyn_cast(CurTy)) { + WorkList.push_back(VectorTy->getElementType()); + continue; + } + if (auto *ArrayTy = dyn_cast(CurTy)) { + WorkList.push_back(ArrayTy->getElementType()); + continue; + } + if (auto *StructTy = dyn_cast(CurTy)) { + WorkList.append(StructTy->element_begin(), StructTy->element_end()); + continue; + } - return HTy; + // If not, it must be the same as all other non-aggregate types. + if (!ElemTy) + ElemTy = CurTy; + else if (ElemTy != CurTy) + return nullptr; } - return Ty; + + return ElemTy; } // FIXME: Should try to pick the most likely to be profitable allocas first. diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll index d09f6ba1e7b68..4840e451e4c4c 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll @@ -228,6 +228,21 @@ define void @test_s3i8s1i8v2i8(i64 %idx) { ret void } +define void @test_s3i8i8s0(i64 %idx) { +; CHECK-LABEL: define void @test_s3i8i8s0( +; CHECK-SAME: i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <2 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i8> [[STACK]], i64 [[IDX]] +; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) +; CHECK-NEXT: ret void +; + %stack = alloca {i8, i8, {}}, align 4, addrspace(5) + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + call void @clobber_i8(i8 %val) + ret void +} + ; heterogeneous element types are not supported define void @test_heterogeneous(i64 %idx) { ; CHECK-LABEL: define void @test_heterogeneous( @@ -245,10 +260,27 @@ define void @test_heterogeneous(i64 %idx) { ret void } -; empty structs are not supported +; empty types are not supported define void @test_empty(i64 %idx) { ; CHECK-LABEL: define void @test_empty( ; CHECK-SAME: i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = alloca {}, align 4, addrspace(5) +; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]] +; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1 +; CHECK-NEXT: call void @clobber_i8(i8 [[VAL]]) +; CHECK-NEXT: ret void +; + %stack = alloca {}, align 4, addrspace(5) + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + call void @clobber_i8(i8 %val) + ret void +} + +; singleton types are not supported +define void @test_singleton(i64 %idx) { +; CHECK-LABEL: define void @test_singleton( +; CHECK-SAME: i64 [[IDX:%.*]]) { ; CHECK-NEXT: [[STACK:%.*]] = alloca { i8, {} }, align 4, addrspace(5) ; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]] ; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1 From 90dc72d134c1c65ea98e3a9be8a04b2bdf5bcf65 Mon Sep 17 00:00:00 2001 From: Zach Goldthorpe Date: Wed, 11 Jun 2025 20:44:35 -0500 Subject: [PATCH 3/4] Minor revisions. --- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 13 ++++++------- llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index ab1c3a5919ea1..e90a3a275f67c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -869,18 +869,16 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { } unsigned ElementSizeInBits = DL->getTypeSizeInBits(ElemTy); - if (ElementSizeInBits == 0) { - LLVM_DEBUG(dbgs() << " Cannot create vector of zero-sized elements."); - return false; - } if (ElementSizeInBits != DL->getTypeAllocSizeInBits(ElemTy)) { LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size " "does not match the type's size\n"); return false; } unsigned ElementSize = ElementSizeInBits / 8; - if (ElementSize == 0) + if (ElementSize == 0) { + LLVM_DEBUG(dbgs() << " Cannot create vector of zero-sized elements\n"); return false; + } // Calculate the size of the corresponding vector, accounting for padding of // inner types, e.g., odd-sized subvectors. Storage size of new vector must @@ -889,11 +887,12 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy); unsigned NumElems = AllocaSize / ElementSize; if (NumElems == 0) { - LLVM_DEBUG(dbgs() << " Cannot vectorize an empty aggregate type."); + LLVM_DEBUG(dbgs() << " Cannot vectorize an empty aggregate type\n"); return false; } if (NumElems * ElementSize != AllocaSize) { - LLVM_DEBUG(dbgs() << " Cannot convert type into vector of the same size."); + LLVM_DEBUG( + dbgs() << " Cannot convert type into vector of the same size\n"); return false; } auto *VectorTy = FixedVectorType::get(ElemTy, NumElems); diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll index 4840e451e4c4c..9c9a88b97ab93 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca-to-vector -amdgpu-promote-alloca-to-vector-limit=512 -amdgpu-promote-alloca-to-vector-max-regs=32 %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca-to-vector -amdgpu-promote-alloca-to-vector-limit=512 %s | FileCheck %s declare void @clobber_i8(i8) From 86a685e17bc3a61888628ecac56ba003aaaf2b35 Mon Sep 17 00:00:00 2001 From: Zach Goldthorpe Date: Thu, 12 Jun 2025 11:28:12 -0500 Subject: [PATCH 4/4] Improved test suite. --- .../CodeGen/AMDGPU/promote-alloca-structs.ll | 297 +++++++++--------- 1 file changed, 144 insertions(+), 153 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll index 9c9a88b97ab93..1cdd027fef89d 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll @@ -1,295 +1,286 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca-to-vector -amdgpu-promote-alloca-to-vector-limit=512 %s | FileCheck %s -declare void @clobber_i8(i8) - -define void @test_v4i8(i64 %idx) { -; CHECK-LABEL: define void @test_v4i8( -; CHECK-SAME: i64 [[IDX:%.*]]) { +define i8 @test_v4i8(i32 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_v4i8( +; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]] -; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) -; CHECK-NEXT: ret void +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] ; %stack = alloca <4 x i8>, align 4, addrspace(5) + store i32 %bits, ptr addrspace(5) %stack %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx %val = load i8, ptr addrspace(5) %ptr, align 1 - call void @clobber_i8(i8 %val) - ret void + ret i8 %val } -define void @test_a4i8(i64 %idx) { -; CHECK-LABEL: define void @test_a4i8( -; CHECK-SAME: i64 [[IDX:%.*]]) { +define i8 @test_a4i8(i32 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_a4i8( +; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]] -; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) -; CHECK-NEXT: ret void +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] ; %stack = alloca [4 x i8], align 4, addrspace(5) + store i32 %bits, ptr addrspace(5) %stack %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx %val = load i8, ptr addrspace(5) %ptr, align 1 - call void @clobber_i8(i8 %val) - ret void + ret i8 %val } -define void @test_a2v4i8(i64 %idx) { -; CHECK-LABEL: define void @test_a2v4i8( -; CHECK-SAME: i64 [[IDX:%.*]]) { +define i8 @test_a2v4i8(i64 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_a2v4i8( +; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]] -; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) -; CHECK-NEXT: ret void +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] ; %stack = alloca [2 x <4 x i8>], align 4, addrspace(5) + store i64 %bits, ptr addrspace(5) %stack %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx %val = load i8, ptr addrspace(5) %ptr, align 1 - call void @clobber_i8(i8 %val) - ret void + ret i8 %val } -define void @test_a2v3i8(i64 %idx) { -; CHECK-LABEL: define void @test_a2v3i8( -; CHECK-SAME: i64 [[IDX:%.*]]) { +define i8 @test_a2v3i8(i64 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_a2v3i8( +; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]] -; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) -; CHECK-NEXT: ret void +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] ; %stack = alloca [2 x <3 x i8>], align 4, addrspace(5) + store i64 %bits, ptr addrspace(5) %stack %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx %val = load i8, ptr addrspace(5) %ptr, align 1 - call void @clobber_i8(i8 %val) - ret void + ret i8 %val } -define void @test_a2a4i8(i64 %idx) { -; CHECK-LABEL: define void @test_a2a4i8( -; CHECK-SAME: i64 [[IDX:%.*]]) { +define i8 @test_a2a4i8(i64 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_a2a4i8( +; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]] -; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) -; CHECK-NEXT: ret void +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] ; %stack = alloca [2 x [4 x i8]], align 4, addrspace(5) + store i64 %bits, ptr addrspace(5) %stack %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx %val = load i8, ptr addrspace(5) %ptr, align 1 - call void @clobber_i8(i8 %val) - ret void + ret i8 %val } -define void @test_a2a3i8(i64 %idx) { -; CHECK-LABEL: define void @test_a2a3i8( -; CHECK-SAME: i64 [[IDX:%.*]]) { +define i8 @test_a2a3i8(i48 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_a2a3i8( +; CHECK-SAME: i48 [[BITS:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: [[STACK:%.*]] = freeze <6 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <6 x i8> [[STACK]], i64 [[IDX]] -; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) -; CHECK-NEXT: ret void +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i48 [[BITS]] to <6 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <6 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] ; %stack = alloca [2 x [3 x i8]], align 4, addrspace(5) + store i48 %bits, ptr addrspace(5) %stack %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx %val = load i8, ptr addrspace(5) %ptr, align 1 - call void @clobber_i8(i8 %val) - ret void + ret i8 %val } -define void @test_s1v4i8(i64 %idx) { -; CHECK-LABEL: define void @test_s1v4i8( -; CHECK-SAME: i64 [[IDX:%.*]]) { +define i8 @test_s1v4i8(i32 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_s1v4i8( +; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]] -; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) -; CHECK-NEXT: ret void +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] ; %stack = alloca {<4 x i8>}, align 4, addrspace(5) + store i32 %bits, ptr addrspace(5) %stack %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx %val = load i8, ptr addrspace(5) %ptr, align 1 - call void @clobber_i8(i8 %val) - ret void + ret i8 %val } -define void @test_s1a4i8(i64 %idx) { -; CHECK-LABEL: define void @test_s1a4i8( -; CHECK-SAME: i64 [[IDX:%.*]]) { +define i8 @test_s1a4i8(i32 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_s1a4i8( +; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]] -; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) -; CHECK-NEXT: ret void +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] ; %stack = alloca {[4 x i8]}, align 4, addrspace(5) + store i32 %bits, ptr addrspace(5) %stack %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx %val = load i8, ptr addrspace(5) %ptr, align 1 - call void @clobber_i8(i8 %val) - ret void + ret i8 %val } -define void @test_s4i8(i64 %idx) { -; CHECK-LABEL: define void @test_s4i8( -; CHECK-SAME: i64 [[IDX:%.*]]) { +define i8 @test_s4i8(i32 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_s4i8( +; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]] -; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) -; CHECK-NEXT: ret void +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] ; %stack = alloca {i8, i8, i8, i8}, align 4, addrspace(5) + store i32 %bits, ptr addrspace(5) %stack %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx %val = load i8, ptr addrspace(5) %ptr, align 1 - call void @clobber_i8(i8 %val) - ret void + ret i8 %val } -define void @test_s2v4i8(i64 %idx) { -; CHECK-LABEL: define void @test_s2v4i8( -; CHECK-SAME: i64 [[IDX:%.*]]) { +define i8 @test_s2v4i8(i64 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_s2v4i8( +; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]] -; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) -; CHECK-NEXT: ret void +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] ; %stack = alloca {<4 x i8>, <4 x i8>}, align 4, addrspace(5) + store i64 %bits, ptr addrspace(5) %stack %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx %val = load i8, ptr addrspace(5) %ptr, align 1 - call void @clobber_i8(i8 %val) - ret void + ret i8 %val } -define void @test_s2v2i8v4i8(i64 %idx) { -; CHECK-LABEL: define void @test_s2v2i8v4i8( -; CHECK-SAME: i64 [[IDX:%.*]]) { +define i8 @test_s2v2i8v4i8(i64 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_s2v2i8v4i8( +; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]] -; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) -; CHECK-NEXT: ret void +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] ; %stack = alloca {<2 x i8>, <4 x i8>}, align 4, addrspace(5) + store i64 %bits, ptr addrspace(5) %stack %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx %val = load i8, ptr addrspace(5) %ptr, align 1 - call void @clobber_i8(i8 %val) - ret void + ret i8 %val } -define void @test_s2v2i8v3i8(i64 %idx) { -; CHECK-LABEL: define void @test_s2v2i8v3i8( -; CHECK-SAME: i64 [[IDX:%.*]]) { +define i8 @test_s2v2i8v3i8(i64 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_s2v2i8v3i8( +; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]] -; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) -; CHECK-NEXT: ret void +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] ; %stack = alloca {<2 x i8>, <3 x i8>}, align 4, addrspace(5) + store i64 %bits, ptr addrspace(5) %stack %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx %val = load i8, ptr addrspace(5) %ptr, align 1 - call void @clobber_i8(i8 %val) - ret void + ret i8 %val } -define void @test_s2s2i8s4i8(i64 %idx) { -; CHECK-LABEL: define void @test_s2s2i8s4i8( -; CHECK-SAME: i64 [[IDX:%.*]]) { +define i8 @test_s2s2i8s4i8(i48 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_s2s2i8s4i8( +; CHECK-SAME: i48 [[BITS:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: [[STACK:%.*]] = freeze <6 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <6 x i8> [[STACK]], i64 [[IDX]] -; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) -; CHECK-NEXT: ret void +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i48 [[BITS]] to <6 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <6 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] ; %stack = alloca {{i8, i8}, {i8, i8, i8, i8}}, align 4, addrspace(5) + store i48 %bits, ptr addrspace(5) %stack %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx %val = load i8, ptr addrspace(5) %ptr, align 1 - call void @clobber_i8(i8 %val) - ret void + ret i8 %val } -define void @test_s2s2i8s3i8(i64 %idx) { -; CHECK-LABEL: define void @test_s2s2i8s3i8( -; CHECK-SAME: i64 [[IDX:%.*]]) { +define i8 @test_s2s2i8s3i8(i40 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_s2s2i8s3i8( +; CHECK-SAME: i40 [[BITS:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: [[STACK:%.*]] = freeze <5 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <5 x i8> [[STACK]], i64 [[IDX]] -; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) -; CHECK-NEXT: ret void +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i40 [[BITS]] to <5 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <5 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] ; %stack = alloca {{i8, i8}, {i8, i8, i8}}, align 4, addrspace(5) + store i40 %bits, ptr addrspace(5) %stack %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx %val = load i8, ptr addrspace(5) %ptr, align 1 - call void @clobber_i8(i8 %val) - ret void + ret i8 %val } -define void @test_s3i8s1i8v2i8(i64 %idx) { -; CHECK-LABEL: define void @test_s3i8s1i8v2i8( -; CHECK-SAME: i64 [[IDX:%.*]]) { +define i8 @test_s3i8s1i8v2i8(i32 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_s3i8s1i8v2i8( +; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]] -; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) -; CHECK-NEXT: ret void +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] ; %stack = alloca {i8, {i8}, <2 x i8>}, align 4, addrspace(5) + store i32 %bits, ptr addrspace(5) %stack %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx %val = load i8, ptr addrspace(5) %ptr, align 1 - call void @clobber_i8(i8 %val) - ret void + ret i8 %val } -define void @test_s3i8i8s0(i64 %idx) { -; CHECK-LABEL: define void @test_s3i8i8s0( -; CHECK-SAME: i64 [[IDX:%.*]]) { +define i8 @test_s3i8i8s0(i16 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_s3i8i8s0( +; CHECK-SAME: i16 [[BITS:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: [[STACK:%.*]] = freeze <2 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i8> [[STACK]], i64 [[IDX]] -; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]]) -; CHECK-NEXT: ret void +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[BITS]] to <2 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] ; %stack = alloca {i8, i8, {}}, align 4, addrspace(5) + store i16 %bits, ptr addrspace(5) %stack %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx %val = load i8, ptr addrspace(5) %ptr, align 1 - call void @clobber_i8(i8 %val) - ret void + ret i8 %val } ; heterogeneous element types are not supported -define void @test_heterogeneous(i64 %idx) { -; CHECK-LABEL: define void @test_heterogeneous( -; CHECK-SAME: i64 [[IDX:%.*]]) { +define i8 @test_heterogeneous(i32 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_heterogeneous( +; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: [[STACK:%.*]] = alloca { i8, i8, i16 }, align 4, addrspace(5) +; CHECK-NEXT: store i32 [[BITS]], ptr addrspace(5) [[STACK]], align 4 ; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]] ; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1 -; CHECK-NEXT: call void @clobber_i8(i8 [[VAL]]) -; CHECK-NEXT: ret void +; CHECK-NEXT: ret i8 [[VAL]] ; %stack = alloca {i8, i8, i16}, align 4, addrspace(5) + store i32 %bits, ptr addrspace(5) %stack %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx %val = load i8, ptr addrspace(5) %ptr, align 1 - call void @clobber_i8(i8 %val) - ret void + ret i8 %val } ; empty types are not supported -define void @test_empty(i64 %idx) { -; CHECK-LABEL: define void @test_empty( -; CHECK-SAME: i64 [[IDX:%.*]]) { +define void @test_empty() { +; CHECK-LABEL: define void @test_empty() { ; CHECK-NEXT: [[STACK:%.*]] = alloca {}, align 4, addrspace(5) -; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]] -; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1 -; CHECK-NEXT: call void @clobber_i8(i8 [[VAL]]) ; CHECK-NEXT: ret void ; %stack = alloca {}, align 4, addrspace(5) - %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx - %val = load i8, ptr addrspace(5) %ptr, align 1 - call void @clobber_i8(i8 %val) ret void } ; singleton types are not supported -define void @test_singleton(i64 %idx) { -; CHECK-LABEL: define void @test_singleton( -; CHECK-SAME: i64 [[IDX:%.*]]) { +define i8 @test_singleton(i8 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_singleton( +; CHECK-SAME: i8 [[BITS:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: [[STACK:%.*]] = alloca { i8, {} }, align 4, addrspace(5) +; CHECK-NEXT: store i8 [[BITS]], ptr addrspace(5) [[STACK]], align 1 ; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]] ; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1 -; CHECK-NEXT: call void @clobber_i8(i8 [[VAL]]) -; CHECK-NEXT: ret void +; CHECK-NEXT: ret i8 [[VAL]] ; %stack = alloca {i8, {}}, align 4, addrspace(5) + store i8 %bits, ptr addrspace(5) %stack %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx %val = load i8, ptr addrspace(5) %ptr, align 1 - call void @clobber_i8(i8 %val) - ret void + ret i8 %val }