Skip to content

Commit 50d82fc

Browse files
scottp101 authored and igcbot committed
Move walk order determination to base.
1 parent 9d35196 commit 50d82fc

File tree

7 files changed

+402
-311
lines changed

7 files changed

+402
-311
lines changed

IGC/Compiler/CISACodeGen/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,7 @@ set(IGC_BUILD__SRC__CISACodeGen_Common
3737
"${CMAKE_CURRENT_SOURCE_DIR}/CodeHoisting.cpp"
3838
"${CMAKE_CURRENT_SOURCE_DIR}/CollectGeometryShaderProperties.cpp"
3939
"${CMAKE_CURRENT_SOURCE_DIR}/ComputeShaderBase.cpp"
40+
"${CMAKE_CURRENT_SOURCE_DIR}/ComputeShaderCommon.cpp"
4041
"${CMAKE_CURRENT_SOURCE_DIR}/ComputeShaderCodeGen.cpp"
4142
"${CMAKE_CURRENT_SOURCE_DIR}/ConstantCoalescing.cpp"
4243
"${CMAKE_CURRENT_SOURCE_DIR}/CShader.cpp"
@@ -130,6 +131,7 @@ set(IGC_BUILD__HDR__CISACodeGen_Common
130131
"${CMAKE_CURRENT_SOURCE_DIR}/CodeHoisting.hpp"
131132
"${CMAKE_CURRENT_SOURCE_DIR}/CollectGeometryShaderProperties.hpp"
132133
"${CMAKE_CURRENT_SOURCE_DIR}/ComputeShaderBase.hpp"
134+
"${CMAKE_CURRENT_SOURCE_DIR}/ComputeShaderCommon.hpp"
133135
"${CMAKE_CURRENT_SOURCE_DIR}/ComputeShaderCodeGen.hpp"
134136
"${CMAKE_CURRENT_SOURCE_DIR}/ConstantCoalescing.hpp"
135137
"${CMAKE_CURRENT_SOURCE_DIR}/CVariable.hpp"

IGC/Compiler/CISACodeGen/ComputeShaderBase.cpp

Lines changed: 22 additions & 242 deletions
Original file line number | Diff line number | Diff line change
@@ -42,256 +42,36 @@ namespace IGC
4242

4343
CComputeShaderBase::~CComputeShaderBase() {}
4444

45-
void CComputeShaderBase::CreateThreadPayloadData(
46-
void*& pThreadPayload,
47-
uint& curbeTotalDataLength,
48-
uint& curbeReadLength,
49-
ThreadIDLayout layout) const
45+
void CComputeShaderBase::selectWalkOrder(
46+
bool useLinearWalk,
47+
uint numberOfTypedAccess,
48+
uint numberOfUntypedAccess,
49+
uint threadGroupSize_X,
50+
uint threadGroupSize_Y,
51+
uint threadGroupSize_Z)
5052
{
51-
typedef uint16_t ThreadPayloadEntry;
53+
const CodeGenContext* pCtx = GetContext();
5254

53-
// Find the max thread group dimension
54-
const OctEltUnit SIZE_OF_DQWORD = OctEltUnit(2);
55-
const OctEltUnit SIZE_OF_OWORD = OctEltUnit(1);
56-
uint numberOfId = GetNumberOfId();
57-
uint dimX = numLanes(m_dispatchSize);
58-
// dimX must align to alignment_X bytes (one GRF)
59-
uint alignment_X = EltUnit(SIZE_OF_OWORD).Count() * sizeof(DWORD);
60-
uint dimX_aligned = iSTD::Align(dimX * sizeof(ThreadPayloadEntry), alignment_X) / sizeof(ThreadPayloadEntry);
61-
uint dimY = (iSTD::Align(m_threadGroupSize, dimX) / dimX) * numberOfId;
62-
curbeReadLength = dimX_aligned * numberOfId * sizeof(ThreadPayloadEntry) / alignment_X;
63-
64-
uint alignedVal = EltUnit(SIZE_OF_DQWORD).Count() * sizeof(ThreadPayloadEntry); // Oct Element is 8 Entries
65-
// m_NOSBufferSize is the additional space for cross-thread constant data (constants set by driver).
66-
curbeTotalDataLength = iSTD::Align(dimX_aligned * dimY * sizeof(ThreadPayloadEntry) + m_NOSBufferSize, alignedVal);
67-
68-
IGC_ASSERT_MESSAGE((pThreadPayload == nullptr), "Thread payload should be a null variable");
69-
70-
unsigned threadPayloadEntries = curbeTotalDataLength / sizeof(ThreadPayloadEntry);
71-
72-
ThreadPayloadEntry* pThreadPayloadMem = (ThreadPayloadEntry*)IGC::aligned_malloc(threadPayloadEntries * sizeof(ThreadPayloadEntry), 16);
73-
IGC_ASSERT(nullptr != pThreadPayloadMem);
74-
std::fill(pThreadPayloadMem, pThreadPayloadMem + threadPayloadEntries, 0);
75-
76-
pThreadPayload = pThreadPayloadMem;
77-
78-
// Increase the pointer to per-thread constant data by the number of allocated
79-
// cross-thread constants.
80-
pThreadPayloadMem += (m_NOSBufferSize / sizeof(ThreadPayloadEntry));
81-
82-
uint currThreadX = 0;
83-
uint currThreadY = 0;
84-
uint currThreadZ = 0;
85-
86-
// Current heuristic is trivial, if there are more typed access than untyped access we walk in tile
87-
// otherwise we walk linearly
88-
89-
for (uint y = 0; y < dimY; y += numberOfId)
90-
{
91-
for (uint x = 0; x < dimX; ++x)
92-
{
93-
uint lane = 0;
94-
if (m_pThread_ID_in_Group_X)
95-
{
96-
pThreadPayloadMem[(y + lane) * dimX_aligned + x] = currThreadX;
97-
lane++;
98-
}
99-
if (m_pThread_ID_in_Group_Y)
100-
{
101-
pThreadPayloadMem[(y + lane) * dimX_aligned + x] = currThreadY;
102-
lane++;
103-
}
104-
if (m_pThread_ID_in_Group_Z)
105-
{
106-
pThreadPayloadMem[(y + lane) * dimX_aligned + x] = currThreadZ;
107-
lane++;
108-
}
109-
110-
if(layout == ThreadIDLayout::TileY)
111-
{
112-
const unsigned int tileSizeY = 4;
113-
++currThreadY;
114-
115-
if (currThreadY % tileSizeY == 0)
116-
{
117-
currThreadY -= tileSizeY;
118-
++currThreadX;
119-
}
120-
121-
if (currThreadX >= m_threadGroupSize_X)
122-
{
123-
currThreadX = 0;
124-
currThreadY += tileSizeY;
125-
}
126-
127-
if (currThreadY >= m_threadGroupSize_Y)
128-
{
129-
currThreadY = 0;
130-
++currThreadZ;
131-
}
132-
133-
if (currThreadZ >= m_threadGroupSize_Z)
134-
{
135-
currThreadZ = 0;
136-
}
137-
}
138-
else if (layout == ThreadIDLayout::X)
139-
{
140-
++currThreadX;
141-
142-
if (currThreadX >= m_threadGroupSize_X)
143-
{
144-
currThreadX = 0;
145-
++currThreadY;
146-
}
147-
148-
if (currThreadY >= m_threadGroupSize_Y)
149-
{
150-
currThreadY = 0;
151-
++currThreadZ;
152-
}
153-
154-
if (currThreadZ >= m_threadGroupSize_Z)
155-
{
156-
currThreadZ = 0;
157-
}
158-
}
159-
else if (layout == ThreadIDLayout::QuadTile)
160-
{
161-
const unsigned int tileSizeX = 2;
162-
const unsigned int tileSizeY = 2;
163-
++currThreadX;
164-
165-
if (currThreadX % tileSizeX == 0)
166-
{
167-
++currThreadY;
168-
}
169-
170-
if ((currThreadX % tileSizeX == 0) &&
171-
(currThreadY % tileSizeY == 0))
172-
{
173-
currThreadY -= tileSizeY;
174-
}
175-
else if (currThreadX % tileSizeX == 0)
176-
{
177-
currThreadX -= tileSizeX;
178-
}
179-
180-
if (currThreadX >= m_threadGroupSize_X)
181-
{
182-
currThreadX = 0;
183-
currThreadY += tileSizeY;
184-
}
185-
186-
if (currThreadY >= m_threadGroupSize_Y)
187-
{
188-
currThreadX = 0;
189-
currThreadY = 0;
190-
++currThreadZ;
191-
}
192-
193-
if (currThreadZ >= m_threadGroupSize_Z)
194-
{
195-
currThreadZ = 0;
196-
}
197-
}
198-
else
199-
{
200-
IGC_ASSERT_MESSAGE(0, "unhandled layout!");
201-
}
202-
}
203-
}
204-
}
205-
206-
CVariable* CComputeShaderBase::CreateThreadIDinGroup(SGVUsage channelNum)
207-
{
208-
IGC_ASSERT_MESSAGE((channelNum <= THREAD_ID_IN_GROUP_Z), "Thread id's are in 3 dimensions only");
209-
IGC_ASSERT_MESSAGE((channelNum >= THREAD_ID_IN_GROUP_X), "Thread id's are in 3 dimensions only");
210-
211-
switch(channelNum)
212-
{
213-
case THREAD_ID_IN_GROUP_X:
214-
if(m_pThread_ID_in_Group_X == nullptr)
215-
{
216-
m_pThread_ID_in_Group_X = GetNewVariable(
217-
numLanes(m_SIMDSize), ISA_TYPE_W, getGRFAlignment(), false, m_numberInstance, "threadIdInGroupX");
218-
}
219-
return m_pThread_ID_in_Group_X;
220-
case THREAD_ID_IN_GROUP_Y:
221-
if(m_pThread_ID_in_Group_Y == nullptr)
222-
{
223-
m_pThread_ID_in_Group_Y = GetNewVariable(
224-
numLanes(m_SIMDSize), ISA_TYPE_W, getGRFAlignment(), false, m_numberInstance, "threadIdInGroupY");
225-
}
226-
return m_pThread_ID_in_Group_Y;
227-
case THREAD_ID_IN_GROUP_Z:
228-
if(m_pThread_ID_in_Group_Z == nullptr)
229-
{
230-
m_pThread_ID_in_Group_Z = GetNewVariable(
231-
numLanes(m_SIMDSize), ISA_TYPE_W, getGRFAlignment(), false, m_numberInstance, "threadIdInGroupZ");
232-
}
233-
return m_pThread_ID_in_Group_Z;
234-
default:
235-
IGC_ASSERT_MESSAGE(0, "Invalid channel number");
236-
break;
237-
}
238-
239-
return nullptr;
240-
}
241-
242-
void CComputeShaderBase::AllocatePerThreadConstantData(uint32_t &offset)
243-
{
244-
// Per-thread constant data.
245-
if (m_pThread_ID_in_Group_X)
246-
{
247-
for (uint i = 0; i < m_pThread_ID_in_Group_X->GetNumberInstance(); i++)
248-
{
249-
AllocateInput(m_pThread_ID_in_Group_X, offset, i);
250-
offset += m_pThread_ID_in_Group_X->GetSize();
251-
offset = iSTD::Round(offset, alignmentSize[m_pThread_ID_in_Group_X->GetAlign()]);
252-
}
253-
}
254-
255-
if (m_pThread_ID_in_Group_Y)
256-
{
257-
for (uint i = 0; i < m_pThread_ID_in_Group_Y->GetNumberInstance(); i++)
258-
{
259-
AllocateInput(m_pThread_ID_in_Group_Y, offset, i);
260-
offset += m_pThread_ID_in_Group_Y->GetSize();
261-
offset = iSTD::Round(offset, alignmentSize[m_pThread_ID_in_Group_Y->GetAlign()]);
262-
}
263-
}
264-
265-
if (m_pThread_ID_in_Group_Z)
266-
{
267-
for (uint i = 0; i < m_pThread_ID_in_Group_Z->GetNumberInstance(); i++)
268-
{
269-
AllocateInput(m_pThread_ID_in_Group_Z, offset, i);
270-
offset += m_pThread_ID_in_Group_Z->GetSize();
271-
offset = iSTD::Round(offset, alignmentSize[m_pThread_ID_in_Group_Z->GetAlign()]);
272-
}
273-
}
274-
}
275-
276-
uint CComputeShaderBase::GetNumberOfId() const
277-
{
278-
uint numberIdPushed = 0;
279-
280-
if (m_pThread_ID_in_Group_X)
55+
if (pCtx->getModuleMetaData()->csInfo.neededThreadIdLayout == ThreadIDLayout::QuadTile)
28156
{
282-
++numberIdPushed;
57+
m_ThreadIDLayout = ThreadIDLayout::QuadTile;
58+
return;
28359
}
28460

285-
if (m_pThread_ID_in_Group_Y)
286-
{
287-
++numberIdPushed;
61+
if ((numberOfTypedAccess >= numberOfUntypedAccess) &&
62+
threadGroupSize_Y % 4 == 0 &&
63+
!pCtx->getModuleMetaData()->csInfo.disableLocalIdOrderOptimizations &&
64+
IGC_IS_FLAG_ENABLED(UseTiledCSThreadOrder)) {
65+
m_ThreadIDLayout = ThreadIDLayout::TileY;
66+
m_walkOrder = WO_YXZ;
28867
}
28968

290-
if (m_pThread_ID_in_Group_Z)
69+
bool needsLinearWalk =
70+
pCtx->getModuleMetaData()->csInfo.neededThreadIdLayout == ThreadIDLayout::X;
71+
if (needsLinearWalk)
29172
{
292-
++numberIdPushed;
73+
m_ThreadIDLayout = ThreadIDLayout::X;
74+
m_walkOrder = WO_XYZ;
29375
}
294-
295-
return numberIdPushed;
29676
}
29777
}

IGC/Compiler/CISACodeGen/ComputeShaderBase.hpp

Lines changed: 19 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -33,28 +33,25 @@ namespace IGC
3333
public:
3434
CComputeShaderBase(llvm::Function* pFunc, CShaderProgram* pProgram);
3535
virtual ~CComputeShaderBase();
36-
37-
void CreateThreadPayloadData(
38-
void*& pThreadPayload,
39-
uint& curbeTotalDataLength,
40-
uint& curbeReadLength,
41-
ThreadIDLayout layout) const;
42-
void AllocatePerThreadConstantData(uint32_t &offset);
43-
uint GetNumberOfId() const;
44-
45-
/// Get the Thread ID's in Group
46-
CVariable* CreateThreadIDinGroup(SGVUsage channelNum);
47-
uint GetThreadGroupSize() const { return m_threadGroupSize; }
4836
protected:
49-
/// Size of a thread group (X x Y x Z) provided by the front-end.
50-
uint m_threadGroupSize = 0;
51-
uint m_threadGroupSize_X = 0;
52-
uint m_threadGroupSize_Y = 0;
53-
uint m_threadGroupSize_Z = 0;
54-
55-
/// The set of X/Y/Z that form the local thread ID for each channel.
56-
CVariable* m_pThread_ID_in_Group_X = nullptr;
57-
CVariable* m_pThread_ID_in_Group_Y = nullptr;
58-
CVariable* m_pThread_ID_in_Group_Z = nullptr;
37+
void selectWalkOrder(
38+
bool useLinearWalk,
39+
uint numberOfTypedAccess,
40+
uint numberOfUntypedAccess,
41+
uint threadGroupSize_X,
42+
uint threadGroupSize_Y,
43+
uint threadGroupSize_Z);
44+
45+
ThreadIDLayout m_ThreadIDLayout = ThreadIDLayout::X;
46+
47+
enum WALK_ORDER {
48+
WO_XYZ,
49+
WO_XZY,
50+
WO_YXZ,
51+
WO_ZXY,
52+
WO_YZX,
53+
WO_ZYX
54+
};
55+
WALK_ORDER m_walkOrder = WALK_ORDER::WO_XYZ;
5956
};
6057
}

0 commit comments

Comments
 (0)