@@ -970,34 +970,149 @@ void TransposeHelper::handleGEPInstNew(llvm::GetElementPtrInst *pGEP, llvm::Valu
970970 HandleAllocaSources (pGEP, linearOffset);
971971}
972972
973- // Load N elements from a vector alloca, Idx, ... Idx + N - 1. Return a scalar
974- // or a vector value depending on N.
973+ // Load a value scalar/vector <N * DstScalarTy> out of a private alloca that has been promoted to a flat vector <M *
974+ // SrcScalarTy>. This assumes the GEP lowering has already produced a linear lane index pScalarizedIdx into that
975+ // promoted vector.
976+ //
977+ // The implementation supports two scenarios:
978+ // 1. Simple path (SrcBits == DstBits). In this case the destination element width matches the promoted lane width, so
979+ // only need to extract N consecutive lanes and legalize type differences.
980+ // 2. General path (SrcBits != DstBits). In this case, the destination element width differs from the promoted lane
981+ // width. So read K = ceil(N * DstBits / SrcBits) promoted lanes, pack them into a big integer of K*SrcBits bits, trim
982+ // to exactly NeedBits = N * DstBits, then expand to either a scalar or <N * DstScalarType>.
983+ //
984+ // Note, "lane" here means one element of the promoted vecotr <M * SrcScalarTy>.
975985static Value *loadEltsFromVecAlloca (unsigned N, AllocaInst *pVecAlloca, Value *pScalarizedIdx,
976- IGCLLVM::IRBuilder<> &IRB, Type *scalarType) {
977- Value *pLoadVecAlloca = IRB.CreateLoad (pVecAlloca->getAllocatedType (), pVecAlloca);
986+ IGCLLVM::IRBuilder<> &IRB, Type *DstScalarTy) {
987+ IGC_ASSERT (pVecAlloca && pScalarizedIdx && DstScalarTy);
988+ IGC_ASSERT (N >= 1 );
989+
990+ // Promoted vector type and its scalar element type (destination lanes are of this type).
991+ auto *PromotedVecTy = cast<IGCLLVM::FixedVectorType>(pVecAlloca->getAllocatedType ());
992+ Type *SrcScalarTy = PromotedVecTy->getElementType ();
993+ const DataLayout &DL = pVecAlloca->getModule ()->getDataLayout ();
994+
995+ const uint64_t SrcBits = DL.getTypeStoreSizeInBits (SrcScalarTy);
996+ const uint64_t DstBits = DL.getTypeStoreSizeInBits (DstScalarTy);
997+ const uint64_t NeedBits = DstBits * (uint64_t )N;
998+
999+ IGC_ASSERT_MESSAGE (SrcBits > 0 && DstBits > 0 , " Unexpected zero-sized scalar type" );
1000+
1001+ // Load the entire promoted vector only once, all reads will extract from this value.
1002+ Value *Whole = IRB.CreateLoad (PromotedVecTy, pVecAlloca);
1003+
1004+ // Helper for casting a sinlge scalar to an integer with exactly Bits width.
1005+ auto toIntOfWidth = [&](Value *V, uint64_t Bits) -> Value * {
1006+ Type *Ty = V->getType ();
1007+ Type *IntTy = IRB.getIntNTy ((unsigned )Bits);
1008+ if (Ty->isPointerTy ())
1009+ return IRB.CreatePtrToInt (V, IntTy);
1010+ if (Ty->isIntegerTy ((unsigned )Bits))
1011+ return V;
1012+ return IRB.CreateBitCast (V, IntTy);
1013+ };
1014+
1015+ // Helper for casting an integer back to the requested destination scalar type.
1016+ auto intToDstScalar = [&](Value *VInt) -> Value * {
1017+ if (DstScalarTy->isPointerTy ())
1018+ return IRB.CreateIntToPtr (VInt, DstScalarTy);
1019+ if (DstScalarTy->isIntegerTy ((unsigned )DstBits))
1020+ return VInt;
1021+ return IRB.CreateBitCast (VInt, DstScalarTy);
1022+ };
1023+
1024+ // 1. Fast path
1025+ if (SrcBits == DstBits) {
1026+ if (N == 1 ) {
1027+ // This is scalar read: extract one lane, legalize type, and return.
1028+ Value *Elem = IRB.CreateExtractElement (Whole, pScalarizedIdx);
1029+ Type *Ty = Elem->getType ();
1030+ if (Ty == DstScalarTy)
1031+ return Elem;
1032+ if (Ty->isPointerTy () && DstScalarTy->isIntegerTy ((unsigned )DstBits))
1033+ return IRB.CreatePtrToInt (Elem, DstScalarTy);
1034+ if (Ty->isIntegerTy ((unsigned )SrcBits) && DstScalarTy->isPointerTy ())
1035+ return IRB.CreateIntToPtr (Elem, DstScalarTy);
1036+ return IRB.CreateBitCast (Elem, DstScalarTy);
1037+ }
1038+
1039+ // This is vector read: extract N lanes, legalize per lane type, build <N * DstScalarTy>.
1040+ Type *RetVecTy = IGCLLVM::FixedVectorType::get (DstScalarTy, N);
1041+ Value *Result = PoisonValue::get (RetVecTy);
1042+ for (unsigned i = 0 ; i < N; ++i) {
1043+ Value *Off = ConstantInt::get (pScalarizedIdx->getType (), i);
1044+ Value *Idx = IRB.CreateAdd (pScalarizedIdx, Off);
1045+ Value *Elem = IRB.CreateExtractElement (Whole, Idx);
1046+ if (Elem->getType () != DstScalarTy) {
1047+ if (Elem->getType ()->isPointerTy () && DstScalarTy->isIntegerTy ((unsigned )DstBits))
1048+ Elem = IRB.CreatePtrToInt (Elem, DstScalarTy);
1049+ else if (Elem->getType ()->isIntegerTy ((unsigned )SrcBits) && DstScalarTy->isPointerTy ())
1050+ Elem = IRB.CreateIntToPtr (Elem, DstScalarTy);
1051+ else
1052+ Elem = IRB.CreateBitCast (Elem, DstScalarTy);
1053+ }
1054+ Result = IRB.CreateInsertElement (Result, Elem, Off);
1055+ }
1056+ return Result;
1057+ }
1058+
1059+ // 2. General path
1060+ //
1061+ // Algorithm:
1062+ // 1. Determine how many promoted lanes are needed, K = ceil(NeedBits / SrcBits).
1063+ // 2. Extract those K lanes as integers i{SrcBits} and pack into a <K * iSrcBits>.
1064+ // 3. Bitcast <K * iSrcBits> to a single i{K*SrcBits} integer.
1065+ // 4. Truncate to exactly NeedBits.
1066+ // 5. Expand to the requested result type: For scalars, intToDstScalar(i{DstBits}). For vectors, bitcast to <N *
1067+ // iDstBits>, then per lane inttoptr if pointer elements, otherwise just bitcast to <N * DstScalarTy>.
1068+ const uint64_t K = (NeedBits + SrcBits - 1 ) / SrcBits; // ceil
1069+
1070+ // Build <K * iSrcBits> by extracting K consecutive promoted lanes starting at pScalarizedIdx.
1071+ Type *IntSrcTy = IRB.getIntNTy ((unsigned )SrcBits);
1072+ IGCLLVM::FixedVectorType *PackVecIntTy = IGCLLVM::FixedVectorType::get (IntSrcTy, (unsigned )K);
1073+ Value *PackVecInt = PoisonValue::get (PackVecIntTy);
1074+
1075+ for (uint64_t i = 0 ; i < K; ++i) {
1076+ Value *Off = ConstantInt::get (pScalarizedIdx->getType (), (uint64_t )i);
1077+ Value *Idx = IRB.CreateAdd (pScalarizedIdx, Off);
1078+ Value *SrcElem = IRB.CreateExtractElement (Whole, Idx);
1079+ Value *SrcElemInt = toIntOfWidth (SrcElem, SrcBits);
1080+ PackVecInt = IRB.CreateInsertElement (PackVecInt, SrcElemInt, Off);
1081+ }
1082+
1083+ // Concatenate the K integers into a single i{K*SrcBits} by bitcasting the vector.
1084+ Type *BigIntTy = IRB.getIntNTy ((unsigned )(K * SrcBits));
1085+ Value *BigInt = IRB.CreateBitCast (PackVecInt, BigIntTy);
1086+
1087+ // Trim to the requested width.
1088+ Value *NeedInt = (NeedBits == K * SrcBits) ? BigInt : IRB.CreateTrunc (BigInt, IRB.getIntNTy ((unsigned )NeedBits));
1089+
1090+ // Expand to the user-visible return type.
9781091 if (N == 1 ) {
979- return IRB.CreateBitCast (IRB.CreateExtractElement (pLoadVecAlloca, pScalarizedIdx), scalarType);
1092+ // For scalars, i{DstBits} into DstScalarTy.
1093+ return intToDstScalar (NeedInt);
9801094 }
9811095
982- // A vector load
983- // %v = load <2 x float>* %ptr
984- // becomes
985- // %w = load <32 x float>* %ptr1
986- // %v0 = extractelement <32 x float> %w, i32 %idx
987- // %v1 = extractelement <32 x float> %w, i32 %idx+1
988- // replace all uses of %v with <%v0, %v1>
989- IGC_ASSERT_MESSAGE ((N > 1 ), " out of sync" );
990- Type *Ty = IGCLLVM::FixedVectorType::get (scalarType, N);
991- Value *Result = UndefValue::get (Ty);
1096+ // For vectors, bitcast to <N * iDstBits>, then change element type if needed.
1097+ IGCLLVM::FixedVectorType *VecIntTy = IGCLLVM::FixedVectorType::get (IRB.getIntNTy ((unsigned )DstBits), N);
1098+ Value *AsIntVec = (NeedBits == (uint64_t )DL.getTypeStoreSizeInBits (VecIntTy)) ? IRB.CreateBitCast (NeedInt, VecIntTy)
1099+ : (Value *)nullptr ;
9921100
1101+ if (!DstScalarTy->isPointerTy ()) {
1102+ if (DstScalarTy->isIntegerTy ((unsigned )DstBits))
1103+ return AsIntVec;
1104+ return IRB.CreateBitCast (AsIntVec, IGCLLVM::FixedVectorType::get (DstScalarTy, N));
1105+ }
1106+
1107+ IGCLLVM::FixedVectorType *RetVecTy = IGCLLVM::FixedVectorType::get (DstScalarTy, N);
1108+ Value *Ret = PoisonValue::get (RetVecTy);
9931109 for (unsigned i = 0 ; i < N; ++i) {
994- Value *VectorIdx = ConstantInt::get (pScalarizedIdx->getType (), i);
995- auto Idx = IRB.CreateAdd (pScalarizedIdx, VectorIdx);
996- auto Val = IRB.CreateExtractElement (pLoadVecAlloca, Idx);
997- Val = IRB.CreateBitCast (Val, scalarType);
998- Result = IRB.CreateInsertElement (Result, Val, VectorIdx);
1110+ Value *LaneIdx = ConstantInt::get (IRB.getInt32Ty (), i);
1111+ Value *LaneInt = IRB.CreateExtractElement (AsIntVec, LaneIdx);
1112+ Value *LanePtr = IRB.CreateIntToPtr (LaneInt, DstScalarTy);
1113+ Ret = IRB.CreateInsertElement (Ret, LanePtr, LaneIdx);
9991114 }
1000- return Result ;
1115+ return Ret ;
10011116}
10021117
10031118void TransposeHelperPromote::handleLoadInst (LoadInst *pLoad, Value *pScalarizedIdx) {
@@ -1012,40 +1127,110 @@ void TransposeHelperPromote::handleLoadInst(LoadInst *pLoad, Value *pScalarizedI
10121127 pLoad->eraseFromParent ();
10131128}
10141129
1015- void TransposeHelperPromote::handleStoreInst (llvm::StoreInst *pStore, llvm::Value *pScalarizedIdx) {
1016- // Add Store instruction to remove list
1017- IGC_ASSERT (nullptr != pStore);
1018- IGC_ASSERT (pStore->isSimple ());
1130+ // Store a scalar/vector value into a promoted private alloca that is represented as a flat vector.
1131+ //
1132+ // The implementation is built on the following assumptions:
1133+ // - The promoted destination is a vector <M * SrcScalarTy> where SrcScalarTy is the element type chosen for the
1134+ // promoted alloca.
1135+ // - The source to store (StoreVal) is either a) a scalar of arbitrary type/width or b) a vector <N * DstScalarTy> of
1136+ // arbitrary lane type/width.
1137+ // - The GEP lowering has already produced a linear element index pScalarizedIdx into the promoted vector. This
1138+ // implementation writes the source bytes begining at that index but can possibly span multiple lanes.
1139+ //
1140+ // Overview of the algorithm:
1141+ // 1. Normalize the source scalar/vector into a single integer of NeedBits (the exact size of the payload to store)
1142+ // 2. Split the integer into K parts, where each chunk has the bit-width of a promoted lane (SrcBits).
1143+ // 3. Bitcast/convert through inttoptr each chunk back to SrcScalarTy (if needed) and insert the chunks into consecutive
1144+ // promoted lanes.
1145+ // 4. Store the promoted vector back to the alloca.
1146+ static void storeEltsToVecAlloca (Value *StoreVal, AllocaInst *pVecAlloca, Value *pScalarizedIdx,
1147+ IGCLLVM::IRBuilder<> &IRB) {
1148+ IGC_ASSERT (StoreVal && pVecAlloca && pScalarizedIdx);
1149+
1150+ // Destination (the promoted private is a vector <M * SrcScalarTy>)
1151+ auto *PromotedVecTy = cast<IGCLLVM::FixedVectorType>(pVecAlloca->getAllocatedType ());
1152+ Type *SrcScalarTy = PromotedVecTy->getElementType ();
1153+ const DataLayout &DL = pVecAlloca->getModule ()->getDataLayout ();
1154+
1155+ // Calculate lane count (N) and lane scalar type from the store source value.
1156+ Type *ValTy = StoreVal->getType ();
1157+ const unsigned N = ValTy->isVectorTy () ? (unsigned )cast<IGCLLVM::FixedVectorType>(ValTy)->getNumElements () : 1 ;
1158+ Type *DstScalarTy = ValTy->isVectorTy () ? cast<VectorType>(ValTy)->getElementType () : ValTy;
1159+
1160+ // Calculate sizes of the source promoted lane, destination lane, and the total payload to store.
1161+ const uint64_t SrcBits = DL.getTypeStoreSizeInBits (SrcScalarTy);
1162+ const uint64_t DstBits = DL.getTypeStoreSizeInBits (DstScalarTy);
1163+ const uint64_t NeedBits = DstBits * (uint64_t )N;
1164+
1165+ // Convert a lane value of arbitrary-type to an integer of the exact bit width (DstBits).
1166+ auto toIntOfWidth = [&](Value *V, uint64_t Bits) -> Value * {
1167+ Type *IntTy = IRB.getIntNTy ((unsigned )Bits);
1168+ Type *Ty = V->getType ();
1169+ if (Ty->isPointerTy ())
1170+ return IRB.CreatePtrToInt (V, IntTy);
1171+ if (Ty->isIntegerTy ((unsigned )Bits))
1172+ return V;
1173+ return IRB.CreateBitCast (V, IntTy); // float-like
1174+ };
10191175
1020- IGCLLVM::IRBuilder<> IRB (pStore);
1021- llvm::Value *pStoreVal = pStore->getValueOperand ();
1022- llvm::Value *pLoadVecAlloca = IRB.CreateLoad (pVecAlloca->getAllocatedType (), pVecAlloca);
1023- llvm::Value *pIns = pLoadVecAlloca;
1024- IGC_ASSERT (nullptr != pStoreVal);
1025- IGC_ASSERT (nullptr != pStoreVal->getType ());
1026- if (pStoreVal->getType ()->isVectorTy ()) {
1027- // A vector store
1028- // store <2 x float> %v, <2 x float>* %ptr
1029- // becomes
1030- // %w = load <32 x float> *%ptr1
1031- // %v0 = extractelement <2 x float> %v, i32 0
1032- // %w0 = insertelement <32 x float> %w, float %v0, i32 %idx
1033- // %v1 = extractelement <2 x float> %v, i32 1
1034- // %w1 = insertelement <32 x float> %w0, float %v1, i32 %idx+1
1035- // store <32 x float> %w1, <32 x float>* %ptr1
1036- for (unsigned i = 0 , e = (unsigned )cast<IGCLLVM::FixedVectorType>(pStoreVal->getType ())->getNumElements (); i < e;
1037- ++i) {
1038- Value *VectorIdx = ConstantInt::get (pScalarizedIdx->getType (), i);
1039- auto Val = IRB.CreateExtractElement (pStoreVal, VectorIdx);
1040- Val = IRB.CreateBitCast (Val, pLoadVecAlloca->getType ()->getScalarType ());
1041- auto Idx = IRB.CreateAdd (pScalarizedIdx, VectorIdx);
1042- pIns = IRB.CreateInsertElement (pIns, Val, Idx);
1043- }
1176+ // Convert an integer chunk of SrcBits back to the promoted lane scalar type (SrcScalarTy).
1177+ auto intToSrcScalar = [&](Value *VInt) -> Value * {
1178+ if (SrcScalarTy->isPointerTy ())
1179+ return IRB.CreateIntToPtr (VInt, SrcScalarTy);
1180+ if (SrcScalarTy->isIntegerTy ((unsigned )SrcBits))
1181+ return VInt;
1182+ return IRB.CreateBitCast (VInt, SrcScalarTy); // float-like
1183+ };
1184+
1185+ // Pack the entire store payload into a single integer of NeedBits.
1186+ // If N == 1, just normalize the scalar. If N > 1, create a vector of lane-sized integers and then bitcast it into one
1187+ // bit integer.
1188+ Value *NeedInt = nullptr ;
1189+ if (N == 1 ) {
1190+ NeedInt = toIntOfWidth (StoreVal, DstBits);
10441191 } else {
1045- pStoreVal = IRB.CreateBitCast (pStoreVal, pLoadVecAlloca->getType ()->getScalarType ());
1046- pIns = IRB.CreateInsertElement (pLoadVecAlloca, pStoreVal, pScalarizedIdx);
1192+ Type *LaneIntTy = IRB.getIntNTy ((unsigned )DstBits);
1193+ auto *VecIntTy = IGCLLVM::FixedVectorType::get (LaneIntTy, N);
1194+ Value *AsIntVec = PoisonValue::get (VecIntTy);
1195+ for (unsigned i = 0 ; i < N; ++i) {
1196+ Value *Lane = IRB.CreateExtractElement (StoreVal, IRB.getInt32 (i));
1197+ Value *LaneInt = toIntOfWidth (Lane, DstBits);
1198+ AsIntVec = IRB.CreateInsertElement (AsIntVec, LaneInt, IRB.getInt32 (i));
1199+ }
1200+ NeedInt = IRB.CreateBitCast (AsIntVec, IRB.getIntNTy ((unsigned )NeedBits));
1201+ }
1202+
1203+ // Calculate how many promoted lanes (K) are needed to hold NeedBits. BigBits is the total bits occupied by those
1204+ // lanes.
1205+ const uint64_t K = (NeedBits + SrcBits - 1 ) / SrcBits;
1206+ const uint64_t BigBits = K * SrcBits;
1207+
1208+ // If the store payload does not fill the last lane, zero-extend to K * SrcBits.
1209+ if (NeedBits < BigBits)
1210+ NeedInt = IRB.CreateZExt (NeedInt, IRB.getIntNTy ((unsigned )BigBits));
1211+
1212+ // Bitcast i(BigBits) into <K x iSrcBits> or in other words split into K chunks.
1213+ auto *IntSrcTy = IRB.getIntNTy ((unsigned )SrcBits);
1214+ auto *PackVecIntTy = IGCLLVM::FixedVectorType::get (IntSrcTy, (unsigned )K);
1215+ Value *PackVecInt = IRB.CreateBitCast (NeedInt, PackVecIntTy);
1216+
1217+ // Load the current promoted vector, overwrite K consecutive lanes atarting at Idx, and then store the updated vector
1218+ // back.
1219+ Value *Whole = IRB.CreateLoad (PromotedVecTy, pVecAlloca);
1220+ for (unsigned i = 0 ; i < K; ++i) {
1221+ Value *LaneInt = IRB.CreateExtractElement (PackVecInt, IRB.getInt32 (i));
1222+ Value *LaneVal = intToSrcScalar (LaneInt);
1223+ Value *Off = ConstantInt::get (pScalarizedIdx->getType (), i);
1224+ Value *Idx = IRB.CreateAdd (pScalarizedIdx, Off);
1225+ Whole = IRB.CreateInsertElement (Whole, LaneVal, Idx);
10471226 }
1048- IRB.CreateStore (pIns, pVecAlloca);
1227+ IRB.CreateStore (Whole, pVecAlloca);
1228+ }
1229+
1230+ void TransposeHelperPromote::handleStoreInst (llvm::StoreInst *pStore, llvm::Value *pScalarizedIdx) {
1231+ IGC_ASSERT (pStore && pStore->isSimple ());
1232+ IGCLLVM::IRBuilder<> IRB (pStore);
1233+ storeEltsToVecAlloca (pStore->getValueOperand (), pVecAlloca, pScalarizedIdx, IRB);
10491234 pStore->eraseFromParent ();
10501235}
10511236
0 commit comments