Skip to content

Commit df9c8fd

Browse files
michalpaszkowskiigcbot
authored andcommitted
Fix scalarized GEP index for reinterpreted vectors in LowerGEPForPrivMem
The old scalarization advanced the GEP scalarized index by the number of smaller vector elements when a GEP indexed through a reinterpreted vector whose lane size differed from the promoted lane. This over-advanced the index (e.g. using 8 for <8 x i32> over double lanes instead of 4), producing incorrect accesses. The fix: - Track the promoted lane byte size (m_promotedLaneBytes) in TransposeHelper and set it in TransposeHelperPromote's constructor. - In TransposeHelper::getArrSizeAndEltType, when a vector is a reinterpret of the promoted storage, compute the increment as vector_byte_size / m_promotedLaneBytes instead of vector_byte_size / small_element_size.
1 parent 36cd3f0 commit df9c8fd

File tree

4 files changed

+81
-4
lines changed

4 files changed

+81
-4
lines changed

IGC/Compiler/CISACodeGen/LowerGEPForPrivMem.cpp

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -762,7 +762,15 @@ class TransposeHelperPromote : public TransposeHelper {
762762
AllocaInst *pVecAlloca;
763763
// location of lifetime starts
764764
llvm::SmallPtrSet<Instruction *, 4> pStartPoints;
765-
TransposeHelperPromote(AllocaInst *pAI, const DataLayout &DL) : TransposeHelper(DL, false) { pVecAlloca = pAI; }
765+
TransposeHelperPromote(AllocaInst *pAI, const DataLayout &DL) : TransposeHelper(DL, false) {
766+
pVecAlloca = pAI;
767+
// Determine promoted lane scalar type and record its size in bytes so that
768+
// GEP indexing through reinterpreted vectors can advance the scalarized
769+
// index in units of promoted lanes rather than the smaller vector elements.
770+
llvm::Type *AllocTy = pAI->getAllocatedType();
771+
llvm::Type *LaneTy = AllocTy->isVectorTy() ? cast<IGCLLVM::FixedVectorType>(AllocTy)->getElementType() : AllocTy;
772+
m_promotedLaneBytes = (uint32_t)DL.getTypeAllocSize(LaneTy);
773+
}
766774
};
767775

768776
void LowerGEPForPrivMem::handleAllocaInst(llvm::AllocaInst *pAlloca) {
@@ -821,7 +829,15 @@ std::pair<unsigned int, Type *> TransposeHelper::getArrSizeAndEltType(Type *T) {
821829
auto *vTy = cast<IGCLLVM::FixedVectorType>(T);
822830
unsigned int vector_size_in_bytes = int_cast<unsigned int>(m_DL.getTypeAllocSize(T));
823831
unsigned int elt_size_in_bytes = int_cast<unsigned int>(m_DL.getTypeAllocSize(vTy->getElementType()));
824-
arr_sz = vector_size_in_bytes / elt_size_in_bytes;
832+
// If the vector is a reinterpret of the promoted storage (its element size
833+
// differs from the promoted lane size), advance the scalarized index in
834+
// units of promoted lanes, not in units of the smaller vector elements.
835+
if (m_promotedLaneBytes != 0 && m_promotedLaneBytes != elt_size_in_bytes &&
836+
(vector_size_in_bytes % m_promotedLaneBytes) == 0) {
837+
arr_sz = vector_size_in_bytes / m_promotedLaneBytes;
838+
} else {
839+
arr_sz = vector_size_in_bytes / elt_size_in_bytes;
840+
}
825841
}
826842
retTy = cast<VectorType>(T)->getElementType();
827843
} else {

IGC/Compiler/CISACodeGen/LowerGEPForPrivMem.hpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,8 @@ class SOALayoutChecker : public llvm::InstVisitor<SOALayoutChecker, bool> {
137137

138138
class TransposeHelper {
139139
public:
140-
TransposeHelper(const llvm::DataLayout &DL, bool vectorIndex) : m_vectorIndex(vectorIndex), m_DL(DL) {}
140+
TransposeHelper(const llvm::DataLayout &DL, bool vectorIndex)
141+
: m_vectorIndex(vectorIndex), m_DL(DL), m_promotedLaneBytes(0) {}
141142
void HandleAllocaSources(llvm::Instruction *v, llvm::Value *idx);
142143
void handleGEPInst(llvm::GetElementPtrInst *pGEP, llvm::Value *idx);
143144
// Temporary, this is to replace HandleGEPInst
@@ -152,6 +153,10 @@ class TransposeHelper {
152153
protected:
153154
const llvm::DataLayout &m_DL;
154155
std::vector<llvm::Instruction *> m_toBeRemovedGEP;
156+
// Size in bytes of one promoted lane element (base scalar chosen for the promoted alloca).
157+
// Used to correctly scale indices when a use reinterprets memory as a vector
158+
// of smaller element size (e.g. <8 x i32> over double lanes).
159+
uint32_t m_promotedLaneBytes;
155160

156161
private:
157162
bool m_vectorIndex;
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2025 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
9+
; RUN: igc_opt --opaque-pointers -igc-priv-mem-to-reg -S < %s 2>&1 | FileCheck %s
10+
11+
; Verify correct scalarized GEP index advancement for reinterpreted vectors.
12+
; Before the fix, when storing a <8 x i32> vector into a promoted private alloca
13+
; whose base lane type is double (8 bytes), the legacy scalarization advanced
14+
; the linear index by 8 (number of i32 elements) instead of 4 (bytesCovered / promotedLaneBytes),
15+
; creating gaps (0..3, 8..11, ...) and skipping indices 4..7.
16+
; After the fix, the second vector store must populate lanes 4..7.
17+
18+
; CHECK-LABEL: @test(
19+
; CHECK: alloca <16 x double>
20+
; First vector packet populates lanes 0..3
21+
; CHECK: insertelement <16 x double> {{.*}}, double {{.*}}, i32 0
22+
; CHECK: insertelement <16 x double> {{.*}}, double {{.*}}, i32 1
23+
; CHECK: insertelement <16 x double> {{.*}}, double {{.*}}, i32 2
24+
; CHECK: insertelement <16 x double> {{.*}}, double {{.*}}, i32 3
25+
; CHECK: store <16 x double>
26+
; Second vector packet must immediately fill lanes 4..7 (no gap / no i32 8 yet)
27+
; CHECK: insertelement <16 x double> {{.*}}, double {{.*}}, i32 4
28+
; CHECK: insertelement <16 x double> {{.*}}, double {{.*}}, i32 5
29+
; CHECK: insertelement <16 x double> {{.*}}, double {{.*}}, i32 6
30+
; CHECK: insertelement <16 x double> {{.*}}, double {{.*}}, i32 7
31+
32+
define spir_kernel void @test(ptr addrspace(1) %src) {
33+
entry:
34+
; Private array to be promoted: 16 doubles (128 bytes). Each <8 x i32> (32 bytes) spans 4 double lanes.
35+
%priv = alloca [16 x double], align 8
36+
37+
; Base pointer to first double.
38+
%base = getelementptr inbounds [16 x double], ptr %priv, i32 0, i32 0
39+
40+
; Load first packet and store.
41+
%src.vec0 = load <8 x i32>, ptr addrspace(1) %src, align 32
42+
store <8 x i32> %src.vec0, ptr %base, align 32
43+
44+
; Second packet (should map immediately after first: lanes 4..7 post-promotion).
45+
%src.vec1.ptr = getelementptr <8 x i32>, ptr addrspace(1) %src, i32 1
46+
%src.vec1 = load <8 x i32>, ptr addrspace(1) %src.vec1.ptr, align 32
47+
%vec.cast.1 = getelementptr <8 x i32>, ptr %base, i32 1
48+
store <8 x i32> %src.vec1, ptr %vec.cast.1, align 32
49+
50+
ret void
51+
}
52+
53+
!igc.functions = !{!1}
54+
!1 = !{ptr @test, !2}
55+
!2 = !{!3}
56+
!3 = !{!"function_type", i32 0}

IGC/Compiler/tests/LowerGEPForPrivMem/skip_i8_gep_non_constant_offset.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ entry:
2929
%dst = alloca [64 x i32], align 4
3030
br label %for.body
3131

32-
for.body: ; preds = %entry, %for.body
32+
for.body:
3333
%idx = phi i64 [ 0, %entry ], [ %idx.next, %for.body ]
3434
%src.gep = getelementptr i8, ptr %src, i64 %idx
3535
%dst.gep = getelementptr i8, ptr %dst, i64 %idx

0 commit comments

Comments
 (0)