
Commit 7249d00

michalpaszkowski authored and igcbot committed
Support width and pointer type agnostic loads/stores for private memory allocas in LowerGEPForPrivMem
The old handleStoreInst/loadEltsFromVecAlloca assumed a 1:1 lane mapping and equal sizes between the user value and the promoted vector element type. This is insufficient for mixed widths (e.g. <4 x i8> and <... x i32>), for cross-lane accesses created by the new byte-offset GEP lowering, and for pointers under opaque pointers (bitcasts between pointers and non-pointers are illegal). With the changes:

1) Stores (handleStoreInst and storeEltsToVecAlloca) normalize the source (scalar or vector) to a single integer of NeedBits = N * DstBits using ptrtoint/bitcast, split that integer into K = ceil(NeedBits / SrcBits) chunks, bitcast/inttoptr each chunk back to the promoted lane type, and insert the chunks into K consecutive lanes starting at the scalarized index.

2) Loads (handleLoadInst and loadEltsFromVecAlloca) read K promoted lanes starting at the scalarized index, convert each lane to iSrcBits, pack the lanes into i(K*SrcBits), truncate to i(NeedBits), and then expand to the requested scalar or <N x DstScalarTy>, using inttoptr for pointer results.

There is also still a simple (old) path: if SrcBits == DstBits, just emit extractelement with casts where needed. All paths perform a single load of the promoted vector plus extractelement/insertelement, and stores perform only a single store back.

With these changes, the LLVM IR emitted from LowerGEPForPrivMem will look different. Instead of plain bitcasts, there are now ptrtoint/inttoptr instructions and additional packing/splitting logic. For the simple (old) load path, the new implementation should emit essentially the same pattern (potentially skipping bitcasts). The additional integer/bitcast instruction sequences should be easily foldable. Memory traffic is unchanged (still one vector load/store), overall register pressure should be similar, and the pass still eliminates GEPs and avoids private/scratch accesses.
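
As a worked example of the new store path (a minimal sketch, not taken from the commit: the value names, the <8 x i32> promoted alloca, and the %idx lane index are illustrative, and trivially foldable index adds and casts are omitted), storing a <4 x i8> into an alloca promoted to i32 lanes gives NeedBits = 4 * 8 = 32, SrcBits = 32, and K = ceil(32 / 32) = 1, so the payload is packed into one i32 chunk and written to a single lane:

  ; assumed input: store <4 x i8> %v into the promoted private %priv at lane %idx
  %b0 = extractelement <4 x i8> %v, i32 0
  %p0 = insertelement <4 x i8> poison, i8 %b0, i32 0
  %b1 = extractelement <4 x i8> %v, i32 1
  %p1 = insertelement <4 x i8> %p0, i8 %b1, i32 1
  %b2 = extractelement <4 x i8> %v, i32 2
  %p2 = insertelement <4 x i8> %p1, i8 %b2, i32 2
  %b3 = extractelement <4 x i8> %v, i32 3
  %p3 = insertelement <4 x i8> %p2, i8 %b3, i32 3
  %need = bitcast <4 x i8> %p3 to i32               ; payload normalized to NeedBits = 32
  %pack = bitcast i32 %need to <1 x i32>            ; split into K = 1 chunk of SrcBits = 32
  %chunk = extractelement <1 x i32> %pack, i32 0
  %whole = load <8 x i32>, ptr %priv                ; single load of the promoted vector
  %upd = insertelement <8 x i32> %whole, i32 %chunk, i32 %idx
  store <8 x i32> %upd, ptr %priv                   ; single store back
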
1 parent 9e80e06 commit 7249d00

6 files changed, +360 -56 lines changed


IGC/Compiler/CISACodeGen/LowerGEPForPrivMem.cpp

Lines changed: 237 additions & 52 deletions
@@ -970,34 +970,149 @@ void TransposeHelper::handleGEPInstNew(llvm::GetElementPtrInst *pGEP, llvm::Valu
 HandleAllocaSources(pGEP, linearOffset);
 }

-// Load N elements from a vector alloca, Idx, ... Idx + N - 1. Return a scalar
-// or a vector value depending on N.
+// Load a value scalar/vector <N * DstScalarTy> out of a private alloca that has been promoted to a flat vector <M *
+// SrcScalarTy>. This assumes the GEP lowering has already produced a linear lane index pScalarizedIdx into that
+// promoted vector.
+//
+// The implementation supports two scenarios:
+// 1. Simple path (SrcBits == DstBits). In this case the destination element width matches the promoted lane width, so
+// only need to extract N consecutive lanes and legalize type differences.
+// 2. General path (SrcBits != DstBits). In this case, the destination element width differs from the promoted lane
+// width. So read K = ceil(N * DstBits / SrcBits) promoted lanes, pack them into a big integer of K*SrcBits bits, trim
+// to exactly NeedBits = N * DstBits, then expand to either a scalar or <N * DstScalarType>.
+//
+// Note, "lane" here means one element of the promoted vecotr <M * SrcScalarTy>.
 static Value *loadEltsFromVecAlloca(unsigned N, AllocaInst *pVecAlloca, Value *pScalarizedIdx,
-IGCLLVM::IRBuilder<> &IRB, Type *scalarType) {
-Value *pLoadVecAlloca = IRB.CreateLoad(pVecAlloca->getAllocatedType(), pVecAlloca);
+IGCLLVM::IRBuilder<> &IRB, Type *DstScalarTy) {
+IGC_ASSERT(pVecAlloca && pScalarizedIdx && DstScalarTy);
+IGC_ASSERT(N >= 1);
+
+// Promoted vector type and its scalar element type (destination lanes are of this type).
+auto *PromotedVecTy = cast<IGCLLVM::FixedVectorType>(pVecAlloca->getAllocatedType());
+Type *SrcScalarTy = PromotedVecTy->getElementType();
+const DataLayout &DL = pVecAlloca->getModule()->getDataLayout();
+
+const uint64_t SrcBits = DL.getTypeStoreSizeInBits(SrcScalarTy);
+const uint64_t DstBits = DL.getTypeStoreSizeInBits(DstScalarTy);
+const uint64_t NeedBits = DstBits * (uint64_t)N;
+
+IGC_ASSERT_MESSAGE(SrcBits > 0 && DstBits > 0, "Unexpected zero-sized scalar type");
+
+// Load the entire promoted vector only once, all reads will extract from this value.
+Value *Whole = IRB.CreateLoad(PromotedVecTy, pVecAlloca);
+
+// Helper for casting a sinlge scalar to an integer with exactly Bits width.
+auto toIntOfWidth = [&](Value *V, uint64_t Bits) -> Value * {
+Type *Ty = V->getType();
+Type *IntTy = IRB.getIntNTy((unsigned)Bits);
+if (Ty->isPointerTy())
+return IRB.CreatePtrToInt(V, IntTy);
+if (Ty->isIntegerTy((unsigned)Bits))
+return V;
+return IRB.CreateBitCast(V, IntTy);
+};
+
+// Helper for casting an integer back to the requested destination scalar type.
+auto intToDstScalar = [&](Value *VInt) -> Value * {
+if (DstScalarTy->isPointerTy())
+return IRB.CreateIntToPtr(VInt, DstScalarTy);
+if (DstScalarTy->isIntegerTy((unsigned)DstBits))
+return VInt;
+return IRB.CreateBitCast(VInt, DstScalarTy);
+};
+
+// 1. Fast path
+if (SrcBits == DstBits) {
+if (N == 1) {
+// This is scalar read: extract one lane, legalize type, and return.
+Value *Elem = IRB.CreateExtractElement(Whole, pScalarizedIdx);
+Type *Ty = Elem->getType();
+if (Ty == DstScalarTy)
+return Elem;
+if (Ty->isPointerTy() && DstScalarTy->isIntegerTy((unsigned)DstBits))
+return IRB.CreatePtrToInt(Elem, DstScalarTy);
+if (Ty->isIntegerTy((unsigned)SrcBits) && DstScalarTy->isPointerTy())
+return IRB.CreateIntToPtr(Elem, DstScalarTy);
+return IRB.CreateBitCast(Elem, DstScalarTy);
+}
+
+// This is vector read: extract N lanes, legalize per lane type, build <N * DstScalarTy>.
+Type *RetVecTy = IGCLLVM::FixedVectorType::get(DstScalarTy, N);
+Value *Result = PoisonValue::get(RetVecTy);
+for (unsigned i = 0; i < N; ++i) {
+Value *Off = ConstantInt::get(pScalarizedIdx->getType(), i);
+Value *Idx = IRB.CreateAdd(pScalarizedIdx, Off);
+Value *Elem = IRB.CreateExtractElement(Whole, Idx);
+if (Elem->getType() != DstScalarTy) {
+if (Elem->getType()->isPointerTy() && DstScalarTy->isIntegerTy((unsigned)DstBits))
+Elem = IRB.CreatePtrToInt(Elem, DstScalarTy);
+else if (Elem->getType()->isIntegerTy((unsigned)SrcBits) && DstScalarTy->isPointerTy())
+Elem = IRB.CreateIntToPtr(Elem, DstScalarTy);
+else
+Elem = IRB.CreateBitCast(Elem, DstScalarTy);
+}
+Result = IRB.CreateInsertElement(Result, Elem, Off);
+}
+return Result;
+}
+
+// 2. General path
+//
+// Algorithm:
+// 1. Determine how many promoted lanes are needed, K = ceil(NeedBits / SrcBits).
+// 2. Extract those K lanes as integers i{SrcBits} and pack into a <K * iSrcBits>.
+// 3. Bitcast <K * iSrcBits> to a single i{K*SrcBits} integer.
+// 4. Truncate to exactly NeedBits.
+// 5. Expand to the requested result type: For scalars, intToDstScalar(i{DstBits}). For vectors, bitcast to <N *
+// iDstBits>, then per lane inttoptr if pointer elements, otherwise just bitcast to <N * DstScalarTy>.
+const uint64_t K = (NeedBits + SrcBits - 1) / SrcBits; // ceil
+
+// Build <K * iSrcBits> by extracting K consecutive promoted lanes starting at pScalarizedIdx.
+Type *IntSrcTy = IRB.getIntNTy((unsigned)SrcBits);
+IGCLLVM::FixedVectorType *PackVecIntTy = IGCLLVM::FixedVectorType::get(IntSrcTy, (unsigned)K);
+Value *PackVecInt = PoisonValue::get(PackVecIntTy);
+
+for (uint64_t i = 0; i < K; ++i) {
+Value *Off = ConstantInt::get(pScalarizedIdx->getType(), (uint64_t)i);
+Value *Idx = IRB.CreateAdd(pScalarizedIdx, Off);
+Value *SrcElem = IRB.CreateExtractElement(Whole, Idx);
+Value *SrcElemInt = toIntOfWidth(SrcElem, SrcBits);
+PackVecInt = IRB.CreateInsertElement(PackVecInt, SrcElemInt, Off);
+}
+
+// Concatenate the K integers into a single i{K*SrcBits} by bitcasting the vector.
+Type *BigIntTy = IRB.getIntNTy((unsigned)(K * SrcBits));
+Value *BigInt = IRB.CreateBitCast(PackVecInt, BigIntTy);
+
+// Trim to the requested width.
+Value *NeedInt = (NeedBits == K * SrcBits) ? BigInt : IRB.CreateTrunc(BigInt, IRB.getIntNTy((unsigned)NeedBits));
+
+// Expand to the user-visible return type.
 if (N == 1) {
-return IRB.CreateBitCast(IRB.CreateExtractElement(pLoadVecAlloca, pScalarizedIdx), scalarType);
+// For scalars, i{DstBits} into DstScalarTy.
+return intToDstScalar(NeedInt);
 }

-// A vector load
-// %v = load <2 x float>* %ptr
-// becomes
-// %w = load <32 x float>* %ptr1
-// %v0 = extractelement <32 x float> %w, i32 %idx
-// %v1 = extractelement <32 x float> %w, i32 %idx+1
-// replace all uses of %v with <%v0, %v1>
-IGC_ASSERT_MESSAGE((N > 1), "out of sync");
-Type *Ty = IGCLLVM::FixedVectorType::get(scalarType, N);
-Value *Result = UndefValue::get(Ty);
+// For vectors, bitcast to <N * iDstBits>, then change element type if needed.
+IGCLLVM::FixedVectorType *VecIntTy = IGCLLVM::FixedVectorType::get(IRB.getIntNTy((unsigned)DstBits), N);
+Value *AsIntVec = (NeedBits == (uint64_t)DL.getTypeStoreSizeInBits(VecIntTy)) ? IRB.CreateBitCast(NeedInt, VecIntTy)
+: (Value *)nullptr;

+if (!DstScalarTy->isPointerTy()) {
+if (DstScalarTy->isIntegerTy((unsigned)DstBits))
+return AsIntVec;
+return IRB.CreateBitCast(AsIntVec, IGCLLVM::FixedVectorType::get(DstScalarTy, N));
+}
+
+IGCLLVM::FixedVectorType *RetVecTy = IGCLLVM::FixedVectorType::get(DstScalarTy, N);
+Value *Ret = PoisonValue::get(RetVecTy);
 for (unsigned i = 0; i < N; ++i) {
-Value *VectorIdx = ConstantInt::get(pScalarizedIdx->getType(), i);
-auto Idx = IRB.CreateAdd(pScalarizedIdx, VectorIdx);
-auto Val = IRB.CreateExtractElement(pLoadVecAlloca, Idx);
-Val = IRB.CreateBitCast(Val, scalarType);
-Result = IRB.CreateInsertElement(Result, Val, VectorIdx);
+Value *LaneIdx = ConstantInt::get(IRB.getInt32Ty(), i);
+Value *LaneInt = IRB.CreateExtractElement(AsIntVec, LaneIdx);
+Value *LanePtr = IRB.CreateIntToPtr(LaneInt, DstScalarTy);
+Ret = IRB.CreateInsertElement(Ret, LanePtr, LaneIdx);
 }
-return Result;
+return Ret;
 }

 void TransposeHelperPromote::handleLoadInst(LoadInst *pLoad, Value *pScalarizedIdx) {
@@ -1012,40 +1127,110 @@ void TransposeHelperPromote::handleLoadInst(LoadInst *pLoad, Value *pScalarizedI
 pLoad->eraseFromParent();
 }

-void TransposeHelperPromote::handleStoreInst(llvm::StoreInst *pStore, llvm::Value *pScalarizedIdx) {
-// Add Store instruction to remove list
-IGC_ASSERT(nullptr != pStore);
-IGC_ASSERT(pStore->isSimple());
+// Store a scalar/vector value into a promoted private alloca that is represented as a flat vector.
+//
+// The implementation is built on the following assumptions:
+// - The promoted destination is a vector <M * SrcScalarTy> where SrcScalarTy is the element type chosen for the
+// promoted alloca.
+// - The source to store (StoreVal) is either a) a scalar of arbitrary type/width or b) a vector <N * DstScalarTy> of
+// arbitrary lane type/width.
+// - The GEP lowering has already produced a linear element index pScalarizedIdx into the promoted vector. This
+// implementation writes the source bytes begining at that index but can possibly span multiple lanes.
+//
+// Overview of the algorithm:
+// 1. Normalize the source scalar/vector into a single integer of NeedBits (the exact size of the payload to store)
+// 2. Split the integer into K parts, where each chunk has the bit-width of a promoted lane (SrcBits).
+// 3. Bitcast/convert through inttoptr each chunk back to SrcScalarTy (if needed) and insert the chunks into consecutive
+// promoted lanes.
+// 4. Store the promoted vector back to the alloca.
+static void storeEltsToVecAlloca(Value *StoreVal, AllocaInst *pVecAlloca, Value *pScalarizedIdx,
+IGCLLVM::IRBuilder<> &IRB) {
+IGC_ASSERT(StoreVal && pVecAlloca && pScalarizedIdx);
+
+// Destination (the promoted private is a vector <M * SrcScalarTy>)
+auto *PromotedVecTy = cast<IGCLLVM::FixedVectorType>(pVecAlloca->getAllocatedType());
+Type *SrcScalarTy = PromotedVecTy->getElementType();
+const DataLayout &DL = pVecAlloca->getModule()->getDataLayout();
+
+// Calculate lane count (N) and lane scalar type from the store source value.
+Type *ValTy = StoreVal->getType();
+const unsigned N = ValTy->isVectorTy() ? (unsigned)cast<IGCLLVM::FixedVectorType>(ValTy)->getNumElements() : 1;
+Type *DstScalarTy = ValTy->isVectorTy() ? cast<VectorType>(ValTy)->getElementType() : ValTy;
+
+// Calculate sizes of the source promoted lane, destination lane, and the total payload to store.
+const uint64_t SrcBits = DL.getTypeStoreSizeInBits(SrcScalarTy);
+const uint64_t DstBits = DL.getTypeStoreSizeInBits(DstScalarTy);
+const uint64_t NeedBits = DstBits * (uint64_t)N;
+
+// Convert a lane value of arbitrary-type to an integer of the exact bit width (DstBits).
+auto toIntOfWidth = [&](Value *V, uint64_t Bits) -> Value * {
+Type *IntTy = IRB.getIntNTy((unsigned)Bits);
+Type *Ty = V->getType();
+if (Ty->isPointerTy())
+return IRB.CreatePtrToInt(V, IntTy);
+if (Ty->isIntegerTy((unsigned)Bits))
+return V;
+return IRB.CreateBitCast(V, IntTy); // float-like
+};

-IGCLLVM::IRBuilder<> IRB(pStore);
-llvm::Value *pStoreVal = pStore->getValueOperand();
-llvm::Value *pLoadVecAlloca = IRB.CreateLoad(pVecAlloca->getAllocatedType(), pVecAlloca);
-llvm::Value *pIns = pLoadVecAlloca;
-IGC_ASSERT(nullptr != pStoreVal);
-IGC_ASSERT(nullptr != pStoreVal->getType());
-if (pStoreVal->getType()->isVectorTy()) {
-// A vector store
-// store <2 x float> %v, <2 x float>* %ptr
-// becomes
-// %w = load <32 x float> *%ptr1
-// %v0 = extractelement <2 x float> %v, i32 0
-// %w0 = insertelement <32 x float> %w, float %v0, i32 %idx
-// %v1 = extractelement <2 x float> %v, i32 1
-// %w1 = insertelement <32 x float> %w0, float %v1, i32 %idx+1
-// store <32 x float> %w1, <32 x float>* %ptr1
-for (unsigned i = 0, e = (unsigned)cast<IGCLLVM::FixedVectorType>(pStoreVal->getType())->getNumElements(); i < e;
-++i) {
-Value *VectorIdx = ConstantInt::get(pScalarizedIdx->getType(), i);
-auto Val = IRB.CreateExtractElement(pStoreVal, VectorIdx);
-Val = IRB.CreateBitCast(Val, pLoadVecAlloca->getType()->getScalarType());
-auto Idx = IRB.CreateAdd(pScalarizedIdx, VectorIdx);
-pIns = IRB.CreateInsertElement(pIns, Val, Idx);
-}
+// Convert an integer chunk of SrcBits back to the promoted lane scalar type (SrcScalarTy).
+auto intToSrcScalar = [&](Value *VInt) -> Value * {
+if (SrcScalarTy->isPointerTy())
+return IRB.CreateIntToPtr(VInt, SrcScalarTy);
+if (SrcScalarTy->isIntegerTy((unsigned)SrcBits))
+return VInt;
+return IRB.CreateBitCast(VInt, SrcScalarTy); // float-like
+};
+
+// Pack the entire store payload into a single integer of NeedBits.
+// If N == 1, just normalize the scalar. If N > 1, create a vector of lane-sized integers and then bitcast it into one
+// bit integer.
+Value *NeedInt = nullptr;
+if (N == 1) {
+NeedInt = toIntOfWidth(StoreVal, DstBits);
 } else {
-pStoreVal = IRB.CreateBitCast(pStoreVal, pLoadVecAlloca->getType()->getScalarType());
-pIns = IRB.CreateInsertElement(pLoadVecAlloca, pStoreVal, pScalarizedIdx);
+Type *LaneIntTy = IRB.getIntNTy((unsigned)DstBits);
+auto *VecIntTy = IGCLLVM::FixedVectorType::get(LaneIntTy, N);
+Value *AsIntVec = PoisonValue::get(VecIntTy);
+for (unsigned i = 0; i < N; ++i) {
+Value *Lane = IRB.CreateExtractElement(StoreVal, IRB.getInt32(i));
+Value *LaneInt = toIntOfWidth(Lane, DstBits);
+AsIntVec = IRB.CreateInsertElement(AsIntVec, LaneInt, IRB.getInt32(i));
+}
+NeedInt = IRB.CreateBitCast(AsIntVec, IRB.getIntNTy((unsigned)NeedBits));
+}
+
+// Calculate how many promoted lanes (K) are needed to hold NeedBits. BigBits is the total bits occupied by those
+// lanes.
+const uint64_t K = (NeedBits + SrcBits - 1) / SrcBits;
+const uint64_t BigBits = K * SrcBits;
+
+// If the store payload does not fill the last lane, zero-extend to K * SrcBits.
+if (NeedBits < BigBits)
+NeedInt = IRB.CreateZExt(NeedInt, IRB.getIntNTy((unsigned)BigBits));
+
+// Bitcast i(BigBits) into <K x iSrcBits> or in other words split into K chunks.
+auto *IntSrcTy = IRB.getIntNTy((unsigned)SrcBits);
+auto *PackVecIntTy = IGCLLVM::FixedVectorType::get(IntSrcTy, (unsigned)K);
+Value *PackVecInt = IRB.CreateBitCast(NeedInt, PackVecIntTy);
+
+// Load the current promoted vector, overwrite K consecutive lanes atarting at Idx, and then store the updated vector
+// back.
+Value *Whole = IRB.CreateLoad(PromotedVecTy, pVecAlloca);
+for (unsigned i = 0; i < K; ++i) {
+Value *LaneInt = IRB.CreateExtractElement(PackVecInt, IRB.getInt32(i));
+Value *LaneVal = intToSrcScalar(LaneInt);
+Value *Off = ConstantInt::get(pScalarizedIdx->getType(), i);
+Value *Idx = IRB.CreateAdd(pScalarizedIdx, Off);
+Whole = IRB.CreateInsertElement(Whole, LaneVal, Idx);
 }
-IRB.CreateStore(pIns, pVecAlloca);
+IRB.CreateStore(Whole, pVecAlloca);
+}
+
+void TransposeHelperPromote::handleStoreInst(llvm::StoreInst *pStore, llvm::Value *pScalarizedIdx) {
+IGC_ASSERT(pStore && pStore->isSimple());
+IGCLLVM::IRBuilder<> IRB(pStore);
+storeEltsToVecAlloca(pStore->getValueOperand(), pVecAlloca, pScalarizedIdx, IRB);
 pStore->eraseFromParent();
 }

IGC/Compiler/tests/LowerGEPForPrivMem/basic-typed-pointers.ll

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 ;=========================== begin_copyright_notice ============================
 ;
-; Copyright (C) 2022-2024 Intel Corporation
+; Copyright (C) 2022-2025 Intel Corporation
 ;
 ; SPDX-License-Identifier: MIT
 ;
@@ -32,7 +32,7 @@ define void @test(<4 x i32> %a, <4 x i32>* %b) {
 ; CHECK: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP1]]
 ; CHECK: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]]
 ; CHECK: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP11]], i32 0
-; CHECK: [[TMP13:%.*]] = insertelement <4 x i32> undef, i32 [[TMP12]], i32 0
+; CHECK: [[TMP13:%.*]] = insertelement <4 x i32> poison, i32 [[TMP12]], i32 0
 ; CHECK: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP11]], i32 1
 ; CHECK: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP14]], i32 1
 ; CHECK: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP11]], i32 2

IGC/Compiler/tests/LowerGEPForPrivMem/basic.ll

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 ;=========================== begin_copyright_notice ============================
 ;
-; Copyright (C) 2022-2024 Intel Corporation
+; Copyright (C) 2022-2025 Intel Corporation
 ;
 ; SPDX-License-Identifier: MIT
 ;
@@ -33,7 +33,7 @@ define void @test(<4 x i32> %a, ptr %b) {
 ; CHECK: store <4 x i32> [[TMP10]], ptr [[TMP1]]
 ; CHECK: [[TMP11:%.*]] = load <4 x i32>, ptr [[TMP1]]
 ; CHECK: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP11]], i32 0
-; CHECK: [[TMP13:%.*]] = insertelement <4 x i32> undef, i32 [[TMP12]], i32 0
+; CHECK: [[TMP13:%.*]] = insertelement <4 x i32> poison, i32 [[TMP12]], i32 0
 ; CHECK: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP11]], i32 1
 ; CHECK: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP14]], i32 1
 ; CHECK: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP11]], i32 2
Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
+;=========================== begin_copyright_notice ============================
+;
+; Copyright (C) 2025 Intel Corporation
+;
+; SPDX-License-Identifier: MIT
+;
+;============================ end_copyright_notice =============================
+
+; REQUIRES: llvm-14-plus
+; RUN: igc_opt --opaque-pointers -igc-priv-mem-to-reg -S < %s 2>&1 | FileCheck %s
+
+; This test verifies that the pass emits a bitcast with the expected load type.
+
+%g = type { [17 x double] }
+
+define void @test() {
+; CHECK-LABEL: @test(
+; CHECK: alloca <17 x double>
+; CHECK: bitcast {{.*}} {{%.*}} to <2 x i32>
+; CHECK: ret void
+
+%a = alloca %g, align 8, !uniform !4
+%b = getelementptr i8, ptr %a, i32 128
+%c = load <2 x i32>, ptr %b, align 8
+ret void
+}
+
+!igc.functions = !{!0}
+
+!0 = !{ptr @test, !1}
+!1 = !{!2, !3}
+!2 = !{!"function_type", i32 0}
+!3 = !{!"implicit_arg_desc"}
+!4 = !{i1 true}
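
For the new opaque-pointer test above (load <2 x i32> at byte offset 128 from an alloca promoted to <17 x double>), the general load path would emit roughly the following sequence. This is a sketch with illustrative value names, assuming the byte-offset GEP lowering maps offset 128 to promoted lane 16; the actual output may contain extra, trivially foldable index arithmetic:

  %whole = load <17 x double>, ptr %a
  %lane = extractelement <17 x double> %whole, i32 16    ; K = ceil(64 / 64) = 1 lane read
  %lane.i = bitcast double %lane to i64                  ; lane normalized to iSrcBits
  %pack = insertelement <1 x i64> poison, i64 %lane.i, i32 0
  %big = bitcast <1 x i64> %pack to i64                  ; NeedBits = 64, no trunc needed
  %c = bitcast i64 %big to <2 x i32>                     ; expand to the requested <2 x i32>

The final bitcast to <2 x i32> is what the test's CHECK line matches.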
