@@ -42,256 +42,36 @@ namespace IGC
4242
4343 CComputeShaderBase::~CComputeShaderBase () {}
4444
45- void CComputeShaderBase::CreateThreadPayloadData (
46- void *& pThreadPayload,
47- uint& curbeTotalDataLength,
48- uint& curbeReadLength,
49- ThreadIDLayout layout) const
45+ void CComputeShaderBase::selectWalkOrder (
46+ bool useLinearWalk,
47+ uint numberOfTypedAccess,
48+ uint numberOfUntypedAccess,
49+ uint threadGroupSize_X,
50+ uint threadGroupSize_Y,
51+ uint threadGroupSize_Z)
5052 {
51- typedef uint16_t ThreadPayloadEntry ;
53+ const CodeGenContext* pCtx = GetContext () ;
5254
53- // Find the max thread group dimension
54- const OctEltUnit SIZE_OF_DQWORD = OctEltUnit (2 );
55- const OctEltUnit SIZE_OF_OWORD = OctEltUnit (1 );
56- uint numberOfId = GetNumberOfId ();
57- uint dimX = numLanes (m_dispatchSize);
58- // dimX must align to alignment_X bytes (one GRF)
59- uint alignment_X = EltUnit (SIZE_OF_OWORD).Count () * sizeof (DWORD);
60- uint dimX_aligned = iSTD::Align (dimX * sizeof (ThreadPayloadEntry), alignment_X) / sizeof (ThreadPayloadEntry);
61- uint dimY = (iSTD::Align (m_threadGroupSize, dimX) / dimX) * numberOfId;
62- curbeReadLength = dimX_aligned * numberOfId * sizeof (ThreadPayloadEntry) / alignment_X;
63-
64- uint alignedVal = EltUnit (SIZE_OF_DQWORD).Count () * sizeof (ThreadPayloadEntry); // Oct Element is 8 Entries
65- // m_NOSBufferSize is the additional space for cross-thread constant data (constants set by driver).
66- curbeTotalDataLength = iSTD::Align (dimX_aligned * dimY * sizeof (ThreadPayloadEntry) + m_NOSBufferSize, alignedVal);
67-
68- IGC_ASSERT_MESSAGE ((pThreadPayload == nullptr ), " Thread payload should be a null variable" );
69-
70- unsigned threadPayloadEntries = curbeTotalDataLength / sizeof (ThreadPayloadEntry);
71-
72- ThreadPayloadEntry* pThreadPayloadMem = (ThreadPayloadEntry*)IGC::aligned_malloc (threadPayloadEntries * sizeof (ThreadPayloadEntry), 16 );
73- IGC_ASSERT (nullptr != pThreadPayloadMem);
74- std::fill (pThreadPayloadMem, pThreadPayloadMem + threadPayloadEntries, 0 );
75-
76- pThreadPayload = pThreadPayloadMem;
77-
78- // Increase the pointer to per-thread constant data by the number of allocated
79- // cross-thread constants.
80- pThreadPayloadMem += (m_NOSBufferSize / sizeof (ThreadPayloadEntry));
81-
82- uint currThreadX = 0 ;
83- uint currThreadY = 0 ;
84- uint currThreadZ = 0 ;
85-
86- // Current heuristic is trivial, if there are more typed access than untyped access we walk in tile
87- // otherwise we walk linearly
88-
89- for (uint y = 0 ; y < dimY; y += numberOfId)
90- {
91- for (uint x = 0 ; x < dimX; ++x)
92- {
93- uint lane = 0 ;
94- if (m_pThread_ID_in_Group_X)
95- {
96- pThreadPayloadMem[(y + lane) * dimX_aligned + x] = currThreadX;
97- lane++;
98- }
99- if (m_pThread_ID_in_Group_Y)
100- {
101- pThreadPayloadMem[(y + lane) * dimX_aligned + x] = currThreadY;
102- lane++;
103- }
104- if (m_pThread_ID_in_Group_Z)
105- {
106- pThreadPayloadMem[(y + lane) * dimX_aligned + x] = currThreadZ;
107- lane++;
108- }
109-
110- if (layout == ThreadIDLayout::TileY)
111- {
112- const unsigned int tileSizeY = 4 ;
113- ++currThreadY;
114-
115- if (currThreadY % tileSizeY == 0 )
116- {
117- currThreadY -= tileSizeY;
118- ++currThreadX;
119- }
120-
121- if (currThreadX >= m_threadGroupSize_X)
122- {
123- currThreadX = 0 ;
124- currThreadY += tileSizeY;
125- }
126-
127- if (currThreadY >= m_threadGroupSize_Y)
128- {
129- currThreadY = 0 ;
130- ++currThreadZ;
131- }
132-
133- if (currThreadZ >= m_threadGroupSize_Z)
134- {
135- currThreadZ = 0 ;
136- }
137- }
138- else if (layout == ThreadIDLayout::X)
139- {
140- ++currThreadX;
141-
142- if (currThreadX >= m_threadGroupSize_X)
143- {
144- currThreadX = 0 ;
145- ++currThreadY;
146- }
147-
148- if (currThreadY >= m_threadGroupSize_Y)
149- {
150- currThreadY = 0 ;
151- ++currThreadZ;
152- }
153-
154- if (currThreadZ >= m_threadGroupSize_Z)
155- {
156- currThreadZ = 0 ;
157- }
158- }
159- else if (layout == ThreadIDLayout::QuadTile)
160- {
161- const unsigned int tileSizeX = 2 ;
162- const unsigned int tileSizeY = 2 ;
163- ++currThreadX;
164-
165- if (currThreadX % tileSizeX == 0 )
166- {
167- ++currThreadY;
168- }
169-
170- if ((currThreadX % tileSizeX == 0 ) &&
171- (currThreadY % tileSizeY == 0 ))
172- {
173- currThreadY -= tileSizeY;
174- }
175- else if (currThreadX % tileSizeX == 0 )
176- {
177- currThreadX -= tileSizeX;
178- }
179-
180- if (currThreadX >= m_threadGroupSize_X)
181- {
182- currThreadX = 0 ;
183- currThreadY += tileSizeY;
184- }
185-
186- if (currThreadY >= m_threadGroupSize_Y)
187- {
188- currThreadX = 0 ;
189- currThreadY = 0 ;
190- ++currThreadZ;
191- }
192-
193- if (currThreadZ >= m_threadGroupSize_Z)
194- {
195- currThreadZ = 0 ;
196- }
197- }
198- else
199- {
200- IGC_ASSERT_MESSAGE (0 , " unhandled layout!" );
201- }
202- }
203- }
204- }
205-
206- CVariable* CComputeShaderBase::CreateThreadIDinGroup (SGVUsage channelNum)
207- {
208- IGC_ASSERT_MESSAGE ((channelNum <= THREAD_ID_IN_GROUP_Z), " Thread id's are in 3 dimensions only" );
209- IGC_ASSERT_MESSAGE ((channelNum >= THREAD_ID_IN_GROUP_X), " Thread id's are in 3 dimensions only" );
210-
211- switch (channelNum)
212- {
213- case THREAD_ID_IN_GROUP_X:
214- if (m_pThread_ID_in_Group_X == nullptr )
215- {
216- m_pThread_ID_in_Group_X = GetNewVariable (
217- numLanes (m_SIMDSize), ISA_TYPE_W, getGRFAlignment (), false , m_numberInstance, " threadIdInGroupX" );
218- }
219- return m_pThread_ID_in_Group_X;
220- case THREAD_ID_IN_GROUP_Y:
221- if (m_pThread_ID_in_Group_Y == nullptr )
222- {
223- m_pThread_ID_in_Group_Y = GetNewVariable (
224- numLanes (m_SIMDSize), ISA_TYPE_W, getGRFAlignment (), false , m_numberInstance, " threadIdInGroupY" );
225- }
226- return m_pThread_ID_in_Group_Y;
227- case THREAD_ID_IN_GROUP_Z:
228- if (m_pThread_ID_in_Group_Z == nullptr )
229- {
230- m_pThread_ID_in_Group_Z = GetNewVariable (
231- numLanes (m_SIMDSize), ISA_TYPE_W, getGRFAlignment (), false , m_numberInstance, " threadIdInGroupZ" );
232- }
233- return m_pThread_ID_in_Group_Z;
234- default :
235- IGC_ASSERT_MESSAGE (0 , " Invalid channel number" );
236- break ;
237- }
238-
239- return nullptr ;
240- }
241-
242- void CComputeShaderBase::AllocatePerThreadConstantData (uint32_t &offset)
243- {
244- // Per-thread constant data.
245- if (m_pThread_ID_in_Group_X)
246- {
247- for (uint i = 0 ; i < m_pThread_ID_in_Group_X->GetNumberInstance (); i++)
248- {
249- AllocateInput (m_pThread_ID_in_Group_X, offset, i);
250- offset += m_pThread_ID_in_Group_X->GetSize ();
251- offset = iSTD::Round (offset, alignmentSize[m_pThread_ID_in_Group_X->GetAlign ()]);
252- }
253- }
254-
255- if (m_pThread_ID_in_Group_Y)
256- {
257- for (uint i = 0 ; i < m_pThread_ID_in_Group_Y->GetNumberInstance (); i++)
258- {
259- AllocateInput (m_pThread_ID_in_Group_Y, offset, i);
260- offset += m_pThread_ID_in_Group_Y->GetSize ();
261- offset = iSTD::Round (offset, alignmentSize[m_pThread_ID_in_Group_Y->GetAlign ()]);
262- }
263- }
264-
265- if (m_pThread_ID_in_Group_Z)
266- {
267- for (uint i = 0 ; i < m_pThread_ID_in_Group_Z->GetNumberInstance (); i++)
268- {
269- AllocateInput (m_pThread_ID_in_Group_Z, offset, i);
270- offset += m_pThread_ID_in_Group_Z->GetSize ();
271- offset = iSTD::Round (offset, alignmentSize[m_pThread_ID_in_Group_Z->GetAlign ()]);
272- }
273- }
274- }
275-
276- uint CComputeShaderBase::GetNumberOfId () const
277- {
278- uint numberIdPushed = 0 ;
279-
280- if (m_pThread_ID_in_Group_X)
55+ if (pCtx->getModuleMetaData ()->csInfo .neededThreadIdLayout == ThreadIDLayout::QuadTile)
28156 {
282- ++numberIdPushed;
57+ m_ThreadIDLayout = ThreadIDLayout::QuadTile;
58+ return ;
28359 }
28460
285- if (m_pThread_ID_in_Group_Y)
286- {
287- ++numberIdPushed;
61+ if ((numberOfTypedAccess >= numberOfUntypedAccess) &&
62+ threadGroupSize_Y % 4 == 0 &&
63+ !pCtx->getModuleMetaData ()->csInfo .disableLocalIdOrderOptimizations &&
64+ IGC_IS_FLAG_ENABLED (UseTiledCSThreadOrder)) {
65+ m_ThreadIDLayout = ThreadIDLayout::TileY;
66+ m_walkOrder = WO_YXZ;
28867 }
28968
290- if (m_pThread_ID_in_Group_Z)
69+ bool needsLinearWalk =
70+ pCtx->getModuleMetaData ()->csInfo .neededThreadIdLayout == ThreadIDLayout::X;
71+ if (needsLinearWalk)
29172 {
292- ++numberIdPushed;
73+ m_ThreadIDLayout = ThreadIDLayout::X;
74+ m_walkOrder = WO_XYZ;
29375 }
294-
295- return numberIdPushed;
29676 }
29777}
0 commit comments