@@ -970,34 +970,149 @@ void TransposeHelper::handleGEPInstNew(llvm::GetElementPtrInst *pGEP, llvm::Valu
970
970
HandleAllocaSources (pGEP, linearOffset);
971
971
}
972
972
973
- // Load N elements from a vector alloca, Idx, ... Idx + N - 1. Return a scalar
974
- // or a vector value depending on N.
973
+ // Load a value scalar/vector <N * DstScalarTy> out of a private alloca that has been promoted to a flat vector <M *
974
+ // SrcScalarTy>. This assumes the GEP lowering has already produced a linear lane index pScalarizedIdx into that
975
+ // promoted vector.
976
+ //
977
+ // The implementation supports two scenarios:
978
+ // 1. Simple path (SrcBits == DstBits). In this case the destination element width matches the promoted lane width, so
979
+ // we only need to extract N consecutive lanes and legalize type differences.
980
+ // 2. General path (SrcBits != DstBits). In this case, the destination element width differs from the promoted lane
981
+ // width. So read K = ceil(N * DstBits / SrcBits) promoted lanes, pack them into a big integer of K*SrcBits bits, trim
982
+ // to exactly NeedBits = N * DstBits, then expand to either a scalar or <N * DstScalarType>.
983
+ //
984
+ // Note: "lane" here means one element of the promoted vector <M * SrcScalarTy>.
975
985
static Value *loadEltsFromVecAlloca (unsigned N, AllocaInst *pVecAlloca, Value *pScalarizedIdx,
976
- IGCLLVM::IRBuilder<> &IRB, Type *scalarType) {
977
- Value *pLoadVecAlloca = IRB.CreateLoad (pVecAlloca->getAllocatedType (), pVecAlloca);
986
+ IGCLLVM::IRBuilder<> &IRB, Type *DstScalarTy) {
987
+ IGC_ASSERT (pVecAlloca && pScalarizedIdx && DstScalarTy);
988
+ IGC_ASSERT (N >= 1 );
989
+
990
+ // Promoted vector type and its scalar element type (destination lanes are of this type).
991
+ auto *PromotedVecTy = cast<IGCLLVM::FixedVectorType>(pVecAlloca->getAllocatedType ());
992
+ Type *SrcScalarTy = PromotedVecTy->getElementType ();
993
+ const DataLayout &DL = pVecAlloca->getModule ()->getDataLayout ();
994
+
995
+ const uint64_t SrcBits = DL.getTypeStoreSizeInBits (SrcScalarTy);
996
+ const uint64_t DstBits = DL.getTypeStoreSizeInBits (DstScalarTy);
997
+ const uint64_t NeedBits = DstBits * (uint64_t )N;
998
+
999
+ IGC_ASSERT_MESSAGE (SrcBits > 0 && DstBits > 0 , " Unexpected zero-sized scalar type" );
1000
+
1001
+ // Load the entire promoted vector only once, all reads will extract from this value.
1002
+ Value *Whole = IRB.CreateLoad (PromotedVecTy, pVecAlloca);
1003
+
1004
+ // Helper for casting a single scalar to an integer with exactly Bits width.
1005
+ auto toIntOfWidth = [&](Value *V, uint64_t Bits) -> Value * {
1006
+ Type *Ty = V->getType ();
1007
+ Type *IntTy = IRB.getIntNTy ((unsigned )Bits);
1008
+ if (Ty->isPointerTy ())
1009
+ return IRB.CreatePtrToInt (V, IntTy);
1010
+ if (Ty->isIntegerTy ((unsigned )Bits))
1011
+ return V;
1012
+ return IRB.CreateBitCast (V, IntTy);
1013
+ };
1014
+
1015
+ // Helper for casting an integer back to the requested destination scalar type.
1016
+ auto intToDstScalar = [&](Value *VInt) -> Value * {
1017
+ if (DstScalarTy->isPointerTy ())
1018
+ return IRB.CreateIntToPtr (VInt, DstScalarTy);
1019
+ if (DstScalarTy->isIntegerTy ((unsigned )DstBits))
1020
+ return VInt;
1021
+ return IRB.CreateBitCast (VInt, DstScalarTy);
1022
+ };
1023
+
1024
+ // 1. Fast path
1025
+ if (SrcBits == DstBits) {
1026
+ if (N == 1 ) {
1027
+ // This is scalar read: extract one lane, legalize type, and return.
1028
+ Value *Elem = IRB.CreateExtractElement (Whole, pScalarizedIdx);
1029
+ Type *Ty = Elem->getType ();
1030
+ if (Ty == DstScalarTy)
1031
+ return Elem;
1032
+ if (Ty->isPointerTy () && DstScalarTy->isIntegerTy ((unsigned )DstBits))
1033
+ return IRB.CreatePtrToInt (Elem, DstScalarTy);
1034
+ if (Ty->isIntegerTy ((unsigned )SrcBits) && DstScalarTy->isPointerTy ())
1035
+ return IRB.CreateIntToPtr (Elem, DstScalarTy);
1036
+ return IRB.CreateBitCast (Elem, DstScalarTy);
1037
+ }
1038
+
1039
+ // This is vector read: extract N lanes, legalize per lane type, build <N * DstScalarTy>.
1040
+ Type *RetVecTy = IGCLLVM::FixedVectorType::get (DstScalarTy, N);
1041
+ Value *Result = PoisonValue::get (RetVecTy);
1042
+ for (unsigned i = 0 ; i < N; ++i) {
1043
+ Value *Off = ConstantInt::get (pScalarizedIdx->getType (), i);
1044
+ Value *Idx = IRB.CreateAdd (pScalarizedIdx, Off);
1045
+ Value *Elem = IRB.CreateExtractElement (Whole, Idx);
1046
+ if (Elem->getType () != DstScalarTy) {
1047
+ if (Elem->getType ()->isPointerTy () && DstScalarTy->isIntegerTy ((unsigned )DstBits))
1048
+ Elem = IRB.CreatePtrToInt (Elem, DstScalarTy);
1049
+ else if (Elem->getType ()->isIntegerTy ((unsigned )SrcBits) && DstScalarTy->isPointerTy ())
1050
+ Elem = IRB.CreateIntToPtr (Elem, DstScalarTy);
1051
+ else
1052
+ Elem = IRB.CreateBitCast (Elem, DstScalarTy);
1053
+ }
1054
+ Result = IRB.CreateInsertElement (Result, Elem, Off);
1055
+ }
1056
+ return Result;
1057
+ }
1058
+
1059
+ // 2. General path
1060
+ //
1061
+ // Algorithm:
1062
+ // 1. Determine how many promoted lanes are needed, K = ceil(NeedBits / SrcBits).
1063
+ // 2. Extract those K lanes as integers i{SrcBits} and pack into a <K * iSrcBits>.
1064
+ // 3. Bitcast <K * iSrcBits> to a single i{K*SrcBits} integer.
1065
+ // 4. Truncate to exactly NeedBits.
1066
+ // 5. Expand to the requested result type: For scalars, intToDstScalar(i{DstBits}). For vectors, bitcast to <N *
1067
+ // iDstBits>, then per lane inttoptr if pointer elements, otherwise just bitcast to <N * DstScalarTy>.
1068
+ const uint64_t K = (NeedBits + SrcBits - 1 ) / SrcBits; // ceil
1069
+
1070
+ // Build <K * iSrcBits> by extracting K consecutive promoted lanes starting at pScalarizedIdx.
1071
+ Type *IntSrcTy = IRB.getIntNTy ((unsigned )SrcBits);
1072
+ IGCLLVM::FixedVectorType *PackVecIntTy = IGCLLVM::FixedVectorType::get (IntSrcTy, (unsigned )K);
1073
+ Value *PackVecInt = PoisonValue::get (PackVecIntTy);
1074
+
1075
+ for (uint64_t i = 0 ; i < K; ++i) {
1076
+ Value *Off = ConstantInt::get (pScalarizedIdx->getType (), (uint64_t )i);
1077
+ Value *Idx = IRB.CreateAdd (pScalarizedIdx, Off);
1078
+ Value *SrcElem = IRB.CreateExtractElement (Whole, Idx);
1079
+ Value *SrcElemInt = toIntOfWidth (SrcElem, SrcBits);
1080
+ PackVecInt = IRB.CreateInsertElement (PackVecInt, SrcElemInt, Off);
1081
+ }
1082
+
1083
+ // Concatenate the K integers into a single i{K*SrcBits} by bitcasting the vector.
1084
+ Type *BigIntTy = IRB.getIntNTy ((unsigned )(K * SrcBits));
1085
+ Value *BigInt = IRB.CreateBitCast (PackVecInt, BigIntTy);
1086
+
1087
+ // Trim to the requested width.
1088
+ Value *NeedInt = (NeedBits == K * SrcBits) ? BigInt : IRB.CreateTrunc (BigInt, IRB.getIntNTy ((unsigned )NeedBits));
1089
+
1090
+ // Expand to the user-visible return type.
978
1091
if (N == 1 ) {
979
- return IRB.CreateBitCast (IRB.CreateExtractElement (pLoadVecAlloca, pScalarizedIdx), scalarType);
1092
+ // For scalars, i{DstBits} into DstScalarTy.
1093
+ return intToDstScalar (NeedInt);
980
1094
}
981
1095
982
- // A vector load
983
- // %v = load <2 x float>* %ptr
984
- // becomes
985
- // %w = load <32 x float>* %ptr1
986
- // %v0 = extractelement <32 x float> %w, i32 %idx
987
- // %v1 = extractelement <32 x float> %w, i32 %idx+1
988
- // replace all uses of %v with <%v0, %v1>
989
- IGC_ASSERT_MESSAGE ((N > 1 ), " out of sync" );
990
- Type *Ty = IGCLLVM::FixedVectorType::get (scalarType, N);
991
- Value *Result = UndefValue::get (Ty);
1096
+ // For vectors, bitcast to <N * iDstBits>, then change element type if needed.
1097
+ IGCLLVM::FixedVectorType *VecIntTy = IGCLLVM::FixedVectorType::get (IRB.getIntNTy ((unsigned )DstBits), N);
1098
+ Value *AsIntVec = (NeedBits == (uint64_t )DL.getTypeStoreSizeInBits (VecIntTy)) ? IRB.CreateBitCast (NeedInt, VecIntTy)
1099
+ : (Value *)nullptr ;
992
1100
1101
+ if (!DstScalarTy->isPointerTy ()) {
1102
+ if (DstScalarTy->isIntegerTy ((unsigned )DstBits))
1103
+ return AsIntVec;
1104
+ return IRB.CreateBitCast (AsIntVec, IGCLLVM::FixedVectorType::get (DstScalarTy, N));
1105
+ }
1106
+
1107
+ IGCLLVM::FixedVectorType *RetVecTy = IGCLLVM::FixedVectorType::get (DstScalarTy, N);
1108
+ Value *Ret = PoisonValue::get (RetVecTy);
993
1109
for (unsigned i = 0 ; i < N; ++i) {
994
- Value *VectorIdx = ConstantInt::get (pScalarizedIdx->getType (), i);
995
- auto Idx = IRB.CreateAdd (pScalarizedIdx, VectorIdx);
996
- auto Val = IRB.CreateExtractElement (pLoadVecAlloca, Idx);
997
- Val = IRB.CreateBitCast (Val, scalarType);
998
- Result = IRB.CreateInsertElement (Result, Val, VectorIdx);
1110
+ Value *LaneIdx = ConstantInt::get (IRB.getInt32Ty (), i);
1111
+ Value *LaneInt = IRB.CreateExtractElement (AsIntVec, LaneIdx);
1112
+ Value *LanePtr = IRB.CreateIntToPtr (LaneInt, DstScalarTy);
1113
+ Ret = IRB.CreateInsertElement (Ret, LanePtr, LaneIdx);
999
1114
}
1000
- return Result ;
1115
+ return Ret ;
1001
1116
}
1002
1117
1003
1118
void TransposeHelperPromote::handleLoadInst (LoadInst *pLoad, Value *pScalarizedIdx) {
@@ -1012,40 +1127,110 @@ void TransposeHelperPromote::handleLoadInst(LoadInst *pLoad, Value *pScalarizedI
1012
1127
pLoad->eraseFromParent ();
1013
1128
}
1014
1129
1015
- void TransposeHelperPromote::handleStoreInst (llvm::StoreInst *pStore, llvm::Value *pScalarizedIdx) {
1016
- // Add Store instruction to remove list
1017
- IGC_ASSERT (nullptr != pStore);
1018
- IGC_ASSERT (pStore->isSimple ());
1130
+ // Store a scalar/vector value into a promoted private alloca that is represented as a flat vector.
1131
+ //
1132
+ // The implementation is built on the following assumptions:
1133
+ // - The promoted destination is a vector <M * SrcScalarTy> where SrcScalarTy is the element type chosen for the
1134
+ // promoted alloca.
1135
+ // - The source to store (StoreVal) is either a) a scalar of arbitrary type/width or b) a vector <N * DstScalarTy> of
1136
+ // arbitrary lane type/width.
1137
+ // - The GEP lowering has already produced a linear element index pScalarizedIdx into the promoted vector. This
1138
+ // implementation writes the source bytes beginning at that index, but they can possibly span multiple lanes.
1139
+ //
1140
+ // Overview of the algorithm:
1141
+ // 1. Normalize the source scalar/vector into a single integer of NeedBits (the exact size of the payload to store)
1142
+ // 2. Split the integer into K parts, where each chunk has the bit-width of a promoted lane (SrcBits).
1143
+ // 3. Bitcast/convert through inttoptr each chunk back to SrcScalarTy (if needed) and insert the chunks into consecutive
1144
+ // promoted lanes.
1145
+ // 4. Store the promoted vector back to the alloca.
1146
+ static void storeEltsToVecAlloca (Value *StoreVal, AllocaInst *pVecAlloca, Value *pScalarizedIdx,
1147
+ IGCLLVM::IRBuilder<> &IRB) {
1148
+ IGC_ASSERT (StoreVal && pVecAlloca && pScalarizedIdx);
1149
+
1150
+ // Destination (the promoted private is a vector <M * SrcScalarTy>)
1151
+ auto *PromotedVecTy = cast<IGCLLVM::FixedVectorType>(pVecAlloca->getAllocatedType ());
1152
+ Type *SrcScalarTy = PromotedVecTy->getElementType ();
1153
+ const DataLayout &DL = pVecAlloca->getModule ()->getDataLayout ();
1154
+
1155
+ // Calculate lane count (N) and lane scalar type from the store source value.
1156
+ Type *ValTy = StoreVal->getType ();
1157
+ const unsigned N = ValTy->isVectorTy () ? (unsigned )cast<IGCLLVM::FixedVectorType>(ValTy)->getNumElements () : 1 ;
1158
+ Type *DstScalarTy = ValTy->isVectorTy () ? cast<VectorType>(ValTy)->getElementType () : ValTy;
1159
+
1160
+ // Calculate sizes of the source promoted lane, destination lane, and the total payload to store.
1161
+ const uint64_t SrcBits = DL.getTypeStoreSizeInBits (SrcScalarTy);
1162
+ const uint64_t DstBits = DL.getTypeStoreSizeInBits (DstScalarTy);
1163
+ const uint64_t NeedBits = DstBits * (uint64_t )N;
1164
+
1165
+ // Convert a lane value of arbitrary-type to an integer of the exact bit width (DstBits).
1166
+ auto toIntOfWidth = [&](Value *V, uint64_t Bits) -> Value * {
1167
+ Type *IntTy = IRB.getIntNTy ((unsigned )Bits);
1168
+ Type *Ty = V->getType ();
1169
+ if (Ty->isPointerTy ())
1170
+ return IRB.CreatePtrToInt (V, IntTy);
1171
+ if (Ty->isIntegerTy ((unsigned )Bits))
1172
+ return V;
1173
+ return IRB.CreateBitCast (V, IntTy); // float-like
1174
+ };
1019
1175
1020
- IGCLLVM::IRBuilder<> IRB (pStore);
1021
- llvm::Value *pStoreVal = pStore->getValueOperand ();
1022
- llvm::Value *pLoadVecAlloca = IRB.CreateLoad (pVecAlloca->getAllocatedType (), pVecAlloca);
1023
- llvm::Value *pIns = pLoadVecAlloca;
1024
- IGC_ASSERT (nullptr != pStoreVal);
1025
- IGC_ASSERT (nullptr != pStoreVal->getType ());
1026
- if (pStoreVal->getType ()->isVectorTy ()) {
1027
- // A vector store
1028
- // store <2 x float> %v, <2 x float>* %ptr
1029
- // becomes
1030
- // %w = load <32 x float> *%ptr1
1031
- // %v0 = extractelement <2 x float> %v, i32 0
1032
- // %w0 = insertelement <32 x float> %w, float %v0, i32 %idx
1033
- // %v1 = extractelement <2 x float> %v, i32 1
1034
- // %w1 = insertelement <32 x float> %w0, float %v1, i32 %idx+1
1035
- // store <32 x float> %w1, <32 x float>* %ptr1
1036
- for (unsigned i = 0 , e = (unsigned )cast<IGCLLVM::FixedVectorType>(pStoreVal->getType ())->getNumElements (); i < e;
1037
- ++i) {
1038
- Value *VectorIdx = ConstantInt::get (pScalarizedIdx->getType (), i);
1039
- auto Val = IRB.CreateExtractElement (pStoreVal, VectorIdx);
1040
- Val = IRB.CreateBitCast (Val, pLoadVecAlloca->getType ()->getScalarType ());
1041
- auto Idx = IRB.CreateAdd (pScalarizedIdx, VectorIdx);
1042
- pIns = IRB.CreateInsertElement (pIns, Val, Idx);
1043
- }
1176
+ // Convert an integer chunk of SrcBits back to the promoted lane scalar type (SrcScalarTy).
1177
+ auto intToSrcScalar = [&](Value *VInt) -> Value * {
1178
+ if (SrcScalarTy->isPointerTy ())
1179
+ return IRB.CreateIntToPtr (VInt, SrcScalarTy);
1180
+ if (SrcScalarTy->isIntegerTy ((unsigned )SrcBits))
1181
+ return VInt;
1182
+ return IRB.CreateBitCast (VInt, SrcScalarTy); // float-like
1183
+ };
1184
+
1185
+ // Pack the entire store payload into a single integer of NeedBits.
1186
+ // If N == 1, just normalize the scalar. If N > 1, create a vector of lane-sized integers and then bitcast it into one
1187
+ // big integer.
1188
+ Value *NeedInt = nullptr ;
1189
+ if (N == 1 ) {
1190
+ NeedInt = toIntOfWidth (StoreVal, DstBits);
1044
1191
} else {
1045
- pStoreVal = IRB.CreateBitCast (pStoreVal, pLoadVecAlloca->getType ()->getScalarType ());
1046
- pIns = IRB.CreateInsertElement (pLoadVecAlloca, pStoreVal, pScalarizedIdx);
1192
+ Type *LaneIntTy = IRB.getIntNTy ((unsigned )DstBits);
1193
+ auto *VecIntTy = IGCLLVM::FixedVectorType::get (LaneIntTy, N);
1194
+ Value *AsIntVec = PoisonValue::get (VecIntTy);
1195
+ for (unsigned i = 0 ; i < N; ++i) {
1196
+ Value *Lane = IRB.CreateExtractElement (StoreVal, IRB.getInt32 (i));
1197
+ Value *LaneInt = toIntOfWidth (Lane, DstBits);
1198
+ AsIntVec = IRB.CreateInsertElement (AsIntVec, LaneInt, IRB.getInt32 (i));
1199
+ }
1200
+ NeedInt = IRB.CreateBitCast (AsIntVec, IRB.getIntNTy ((unsigned )NeedBits));
1201
+ }
1202
+
1203
+ // Calculate how many promoted lanes (K) are needed to hold NeedBits. BigBits is the total bits occupied by those
1204
+ // lanes.
1205
+ const uint64_t K = (NeedBits + SrcBits - 1 ) / SrcBits;
1206
+ const uint64_t BigBits = K * SrcBits;
1207
+
1208
+ // If the store payload does not fill the last lane, zero-extend to K * SrcBits.
1209
+ if (NeedBits < BigBits)
1210
+ NeedInt = IRB.CreateZExt (NeedInt, IRB.getIntNTy ((unsigned )BigBits));
1211
+
1212
+ // Bitcast i(BigBits) into <K x iSrcBits> or in other words split into K chunks.
1213
+ auto *IntSrcTy = IRB.getIntNTy ((unsigned )SrcBits);
1214
+ auto *PackVecIntTy = IGCLLVM::FixedVectorType::get (IntSrcTy, (unsigned )K);
1215
+ Value *PackVecInt = IRB.CreateBitCast (NeedInt, PackVecIntTy);
1216
+
1217
+ // Load the current promoted vector, overwrite K consecutive lanes starting at Idx, and then store the updated vector
1218
+ // back.
1219
+ Value *Whole = IRB.CreateLoad (PromotedVecTy, pVecAlloca);
1220
+ for (unsigned i = 0 ; i < K; ++i) {
1221
+ Value *LaneInt = IRB.CreateExtractElement (PackVecInt, IRB.getInt32 (i));
1222
+ Value *LaneVal = intToSrcScalar (LaneInt);
1223
+ Value *Off = ConstantInt::get (pScalarizedIdx->getType (), i);
1224
+ Value *Idx = IRB.CreateAdd (pScalarizedIdx, Off);
1225
+ Whole = IRB.CreateInsertElement (Whole, LaneVal, Idx);
1047
1226
}
1048
- IRB.CreateStore (pIns, pVecAlloca);
1227
+ IRB.CreateStore (Whole, pVecAlloca);
1228
+ }
1229
+
1230
+ void TransposeHelperPromote::handleStoreInst (llvm::StoreInst *pStore, llvm::Value *pScalarizedIdx) {
1231
+ IGC_ASSERT (pStore && pStore->isSimple ());
1232
+ IGCLLVM::IRBuilder<> IRB (pStore);
1233
+ storeEltsToVecAlloca (pStore->getValueOperand (), pVecAlloca, pScalarizedIdx, IRB);
1049
1234
pStore->eraseFromParent ();
1050
1235
}
1051
1236
0 commit comments