@@ -125,27 +125,31 @@ void CComputeShader::ParseShaderSpecificOpcode(llvm::Instruction* inst)
125125 }
126126}
127127
128- void CComputeShader::CreateThreadPayloadData (void * & pThreadPayload, uint& threadPayloadSize )
128+ void CComputeShader::CreateThreadPayloadData (void * & pThreadPayload, uint& curbeTotalDataLength, uint& curbeReadLength )
129129{
130+ typedef uint16_t ThreadPayloadEntry;
131+
130132 // Find the max thread group dimension
131133 const OctEltUnit SIZE_OF_DQWORD = OctEltUnit (2 );
134+ const OctEltUnit SIZE_OF_OWORD = OctEltUnit (1 );
132135 uint numberOfId = GetNumberOfId ();
133136 uint dimX = numLanes (m_dispatchSize);
134- uint dimY = ( iSTD::Align (m_threadGroupSize, dimX)/dimX) * numberOfId;
135-
136- typedef uint ThreadPayloadEntry;
137-
138- uint alignedVal = EltUnit (SIZE_OF_DQWORD). Count () * sizeof (DWORD); // Oct Element is 8 DWORDS
137+ // dimX must align to alignment_X bytes (one GRF)
138+ uint alignment_X = EltUnit (SIZE_OF_OWORD). Count () * sizeof (DWORD);
139+ uint dimX_aligned = iSTD::Align (dimX * sizeof ( ThreadPayloadEntry), alignment_X) / sizeof (ThreadPayloadEntry) ;
140+ uint dimY = ( iSTD::Align (m_threadGroupSize, dimX) / dimX) * numberOfId;
141+ curbeReadLength = dimX_aligned * numberOfId * sizeof (ThreadPayloadEntry) / alignment_X;
139142
143+ uint alignedVal = EltUnit (SIZE_OF_DQWORD).Count () * sizeof (ThreadPayloadEntry); // Oct Element is 8 Entries
140144 // m_NOSBufferSize is the additional space for cross-thread constant data (constants set by driver).
141- threadPayloadSize = iSTD::Align ( dimX * dimY * sizeof ( ThreadPayloadEntry ) + m_NOSBufferSize, alignedVal );
145+ curbeTotalDataLength = iSTD::Align (dimX_aligned * dimY * sizeof (ThreadPayloadEntry) + m_NOSBufferSize, alignedVal);
142146
143147 assert (pThreadPayload == nullptr && " Thread payload should be a null variable" );
144148
145- unsigned threadPayloadEntries = threadPayloadSize / sizeof (ThreadPayloadEntry);
149+ unsigned threadPayloadEntries = curbeTotalDataLength / sizeof (ThreadPayloadEntry);
146150
147151 ThreadPayloadEntry* pThreadPayloadMem =
148- (ThreadPayloadEntry*)IGC::aligned_malloc (threadPayloadEntries* sizeof (ThreadPayloadEntry), 16 );
152+ (ThreadPayloadEntry*)IGC::aligned_malloc (threadPayloadEntries * sizeof (ThreadPayloadEntry), 16 );
149153 std::fill (pThreadPayloadMem, pThreadPayloadMem + threadPayloadEntries, 0 );
150154
151155 pThreadPayload = pThreadPayloadMem;
@@ -169,17 +173,17 @@ void CComputeShader::CreateThreadPayloadData(void* & pThreadPayload, uint& threa
169173 uint lane = 0 ;
170174 if (m_pThread_ID_in_Group_X)
171175 {
172- pThreadPayloadMem[(y + lane) * dimX + x] = currThreadX;
176+ pThreadPayloadMem[(y + lane) * dimX_aligned + x] = currThreadX;
173177 lane++;
174178 }
175179 if (m_pThread_ID_in_Group_Y)
176180 {
177- pThreadPayloadMem[(y + lane) * dimX + x] = currThreadY;
181+ pThreadPayloadMem[(y + lane) * dimX_aligned + x] = currThreadY;
178182 lane++;
179183 }
180184 if (m_pThread_ID_in_Group_Z)
181185 {
182- pThreadPayloadMem[(y + lane) * dimX + x] = currThreadZ;
186+ pThreadPayloadMem[(y + lane) * dimX_aligned + x] = currThreadZ;
183187 lane++;
184188 }
185189
@@ -259,19 +263,19 @@ CVariable* CComputeShader::CreateThreadIDinGroup(uint channelNum)
259263 case 0 :
260264 if (m_pThread_ID_in_Group_X == nullptr )
261265 {
262- m_pThread_ID_in_Group_X = GetNewVariable (numLanes (m_SIMDSize), ISA_TYPE_D , EALIGN_GRF, false , m_numberInstance);
266+ m_pThread_ID_in_Group_X = GetNewVariable (numLanes (m_SIMDSize), ISA_TYPE_W , EALIGN_GRF, false , m_numberInstance);
263267 }
264268 return m_pThread_ID_in_Group_X;
265269 case 1 :
266270 if (m_pThread_ID_in_Group_Y == nullptr )
267271 {
268- m_pThread_ID_in_Group_Y = GetNewVariable (numLanes (m_SIMDSize), ISA_TYPE_D , EALIGN_GRF, false , m_numberInstance);
272+ m_pThread_ID_in_Group_Y = GetNewVariable (numLanes (m_SIMDSize), ISA_TYPE_W , EALIGN_GRF, false , m_numberInstance);
269273 }
270274 return m_pThread_ID_in_Group_Y;
271275 case 2 :
272276 if (m_pThread_ID_in_Group_Z == nullptr )
273277 {
274- m_pThread_ID_in_Group_Z = GetNewVariable (numLanes (m_SIMDSize), ISA_TYPE_D , EALIGN_GRF, false , m_numberInstance);
278+ m_pThread_ID_in_Group_Z = GetNewVariable (numLanes (m_SIMDSize), ISA_TYPE_W , EALIGN_GRF, false , m_numberInstance);
275279 }
276280 return m_pThread_ID_in_Group_Z;
277281 default :
@@ -335,6 +339,7 @@ void CComputeShader::AllocatePayload()
335339 {
336340 AllocateInput (m_pThread_ID_in_Group_X, offset, i);
337341 offset += m_pThread_ID_in_Group_X->GetSize ();
342+ offset = iSTD::Round (offset, alignmentSize[m_pThread_ID_in_Group_X->GetAlign ()]);
338343 }
339344 }
340345
@@ -344,6 +349,7 @@ void CComputeShader::AllocatePayload()
344349 {
345350 AllocateInput (m_pThread_ID_in_Group_Y, offset, i);
346351 offset += m_pThread_ID_in_Group_Y->GetSize ();
352+ offset = iSTD::Round (offset, alignmentSize[m_pThread_ID_in_Group_Y->GetAlign ()]);
347353 }
348354 }
349355
@@ -353,6 +359,7 @@ void CComputeShader::AllocatePayload()
353359 {
354360 AllocateInput (m_pThread_ID_in_Group_Z, offset, i);
355361 offset += m_pThread_ID_in_Group_Z->GetSize ();
362+ offset = iSTD::Round (offset, alignmentSize[m_pThread_ID_in_Group_Z->GetAlign ()]);
356363 }
357364 }
358365
@@ -466,8 +473,6 @@ void CComputeShader::FillProgram(SComputeShaderKernelProgram* pKernelProgram)
466473 pKernelProgram->FloatingPointMode = USC::GFX3DSTATE_FLOATING_POINT_IEEE_754;
467474 pKernelProgram->SingleProgramFlow = USC::GFX3DSTATE_PROGRAM_FLOW_MULTIPLE;
468475 pKernelProgram->CurbeReadOffset = 0 ;
469- pKernelProgram->CurbeReadLength = GetNumberOfId () * (numLanes (m_dispatchSize) / numLanes (SIMDMode::SIMD8));
470-
471476 pKernelProgram->PhysicalThreadsInGroup = static_cast <int >(
472477 std::ceil ((static_cast <float >(m_threadGroupSize) /
473478 static_cast <float >((numLanes (m_dispatchSize))))));
@@ -487,7 +492,8 @@ void CComputeShader::FillProgram(SComputeShaderKernelProgram* pKernelProgram)
487492 pKernelProgram->ThreadPayloadData = nullptr ;
488493 CreateThreadPayloadData (
489494 pKernelProgram->ThreadPayloadData ,
490- pKernelProgram->CurbeTotalDataLength );
495+ pKernelProgram->CurbeTotalDataLength ,
496+ pKernelProgram->CurbeReadLength );
491497
492498 pKernelProgram->ThreadGroupSize = m_threadGroupSize;
493499
0 commit comments