Skip to content

Commit 608ec93

Browse files
Extract GpgpuWalker related functions to bdw_plus files
Change-Id: I3b2081af8e350d4072da5e1482a4bfc50e06fb6d Related-To: NEO-3016 Signed-off-by: Maciej Dziuban <[email protected]>
1 parent 7218bdb commit 608ec93

12 files changed

+898
-914
lines changed

manifests/manifest.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ components:
1717
branch: infra
1818
clean_on_sync: true
1919
dest_dir: infra
20-
revision: ebe0b77203ce767148d3bf64cfc96a9ab83fa53b
20+
revision: d8e410fed8397e7615cedfa358b0ead060b144ab
2121
type: git
2222
internal:
2323
branch: master

runtime/command_queue/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,11 @@ set(RUNTIME_SRCS_COMMAND_QUEUE
3535
${CMAKE_CURRENT_SOURCE_DIR}/finish.h
3636
${CMAKE_CURRENT_SOURCE_DIR}/flush.h
3737
${CMAKE_CURRENT_SOURCE_DIR}/gpgpu_walker.h
38-
${CMAKE_CURRENT_SOURCE_DIR}/gpgpu_walker.inl
3938
${CMAKE_CURRENT_SOURCE_DIR}/gpgpu_walker_base.inl
39+
${CMAKE_CURRENT_SOURCE_DIR}/gpgpu_walker_bdw_plus.inl
4040
${CMAKE_CURRENT_SOURCE_DIR}/hardware_interface.h
41-
${CMAKE_CURRENT_SOURCE_DIR}/hardware_interface.inl
4241
${CMAKE_CURRENT_SOURCE_DIR}/hardware_interface_base.inl
42+
${CMAKE_CURRENT_SOURCE_DIR}/hardware_interface_bdw_plus.inl
4343
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.cpp
4444
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.h
4545
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.inl

runtime/command_queue/gpgpu_walker.inl

Lines changed: 0 additions & 393 deletions
This file was deleted.

runtime/command_queue/gpgpu_walker_base.inl

Lines changed: 324 additions & 160 deletions
Large diffs are not rendered by default.
Lines changed: 229 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,229 @@
1+
/*
2+
* Copyright (C) 2018-2019 Intel Corporation
3+
*
4+
* SPDX-License-Identifier: MIT
5+
*
6+
*/
7+
8+
#pragma once
9+
#include "runtime/command_queue/gpgpu_walker_base.inl"
10+
11+
namespace NEO {
12+
13+
template <typename GfxFamily>
14+
inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
15+
WALKER_TYPE<GfxFamily> *walkerCmd,
16+
const size_t globalOffsets[3],
17+
const size_t startWorkGroups[3],
18+
const size_t numWorkGroups[3],
19+
const size_t localWorkSizesIn[3],
20+
uint32_t simd,
21+
uint32_t workDim,
22+
bool localIdsGenerationByRuntime,
23+
bool inlineDataProgrammingRequired,
24+
const iOpenCL::SPatchThreadPayload &threadPayload) {
25+
auto localWorkSize = localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2];
26+
27+
auto threadsPerWorkGroup = getThreadsPerWG(simd, localWorkSize);
28+
walkerCmd->setThreadWidthCounterMaximum(static_cast<uint32_t>(threadsPerWorkGroup));
29+
30+
walkerCmd->setThreadGroupIdXDimension(static_cast<uint32_t>(numWorkGroups[0]));
31+
walkerCmd->setThreadGroupIdYDimension(static_cast<uint32_t>(numWorkGroups[1]));
32+
walkerCmd->setThreadGroupIdZDimension(static_cast<uint32_t>(numWorkGroups[2]));
33+
34+
// compute executionMask - to tell which SIMD lines are active within thread
35+
auto remainderSimdLanes = localWorkSize & (simd - 1);
36+
uint64_t executionMask = (1ull << remainderSimdLanes) - 1;
37+
if (!executionMask)
38+
executionMask = ~executionMask;
39+
40+
using SIMD_SIZE = typename WALKER_TYPE<GfxFamily>::SIMD_SIZE;
41+
42+
walkerCmd->setRightExecutionMask(static_cast<uint32_t>(executionMask));
43+
walkerCmd->setBottomExecutionMask(static_cast<uint32_t>(0xffffffff));
44+
walkerCmd->setSimdSize(static_cast<SIMD_SIZE>(simd >> 4));
45+
46+
walkerCmd->setThreadGroupIdStartingX(static_cast<uint32_t>(startWorkGroups[0]));
47+
walkerCmd->setThreadGroupIdStartingY(static_cast<uint32_t>(startWorkGroups[1]));
48+
walkerCmd->setThreadGroupIdStartingResumeZ(static_cast<uint32_t>(startWorkGroups[2]));
49+
50+
return localWorkSize;
51+
}
52+
53+
// Programs into commandStream the commands that launch the scheduler kernel.
//
// Sequence emitted by this function:
//   1. a pipe control (dcFlush = false),
//   2. a media interface descriptor load,
//   3. patching of the scheduler kernel constants (offsets, sizes, workDim),
//   4. indirect state for the scheduler and a GPGPU_WALKER command,
//      bracketed by the DisableLSQCROPERFforOCL workaround enable/disable,
//   5. unless getSchedulerReturnInstance() == 1: a pipe control followed by an
//      MI_BATCH_BUFFER_START pointing at the device queue's SLB buffer.
//
// commandStream  - stream that receives all commands.
// devQueueHw     - supplies schedulerIDIndex, colorCalcStateSize,
//                  interfaceDescriptorEntries, the scheduler cross-thread data
//                  offset and the SLB buffer address.
// preemptionMode - forwarded to sendIndirectState.
// scheduler      - scheduler kernel; its patched constants are written here.
// ssh            - forwarded to sendIndirectState.
// dsh            - forwarded to sendIndirectState; its CPU base and max
//                  available space also back the temporary IOH heap.
template <typename GfxFamily>
54+
void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
55+
LinearStream &commandStream,
56+
DeviceQueueHw<GfxFamily> &devQueueHw,
57+
PreemptionMode preemptionMode,
58+
SchedulerKernel &scheduler,
59+
IndirectHeap *ssh,
60+
IndirectHeap *dsh) {
61+
62+
using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
63+
using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
64+
using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
65+
66+
bool dcFlush = false;
67+
PipeControlHelper<GfxFamily>::addPipeControl(commandStream, dcFlush);
68+
69+
uint32_t interfaceDescriptorIndex = devQueueHw.schedulerIDIndex;
70+
// The interface descriptor table offset is taken from colorCalcStateSize.
const size_t offsetInterfaceDescriptorTable = devQueueHw.colorCalcStateSize;
71+
const size_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable;
72+
const size_t totalInterfaceDescriptorTableSize = devQueueHw.interfaceDescriptorEntries * sizeof(INTERFACE_DESCRIPTOR_DATA);
73+
74+
// Program media interface descriptor load
75+
KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
76+
commandStream,
77+
offsetInterfaceDescriptor,
78+
totalInterfaceDescriptorTableSize);
79+
80+
// Debug-only check that the table offset is a multiple of 64.
DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);
81+
82+
// Determine SIMD size
83+
uint32_t simd = scheduler.getKernelInfo().getMaxSimdSize();
84+
DEBUG_BREAK_IF(simd != PARALLEL_SCHEDULER_COMPILATION_SIZE_20);
85+
86+
// Patch our kernel constants
87+
*scheduler.globalWorkOffsetX = 0;
88+
*scheduler.globalWorkOffsetY = 0;
89+
*scheduler.globalWorkOffsetZ = 0;
90+
91+
*scheduler.globalWorkSizeX = (uint32_t)scheduler.getGws();
92+
*scheduler.globalWorkSizeY = 1;
93+
*scheduler.globalWorkSizeZ = 1;
94+
95+
*scheduler.localWorkSizeX = (uint32_t)scheduler.getLws();
96+
*scheduler.localWorkSizeY = 1;
97+
*scheduler.localWorkSizeZ = 1;
98+
99+
*scheduler.localWorkSizeX2 = (uint32_t)scheduler.getLws();
100+
*scheduler.localWorkSizeY2 = 1;
101+
*scheduler.localWorkSizeZ2 = 1;
102+
103+
*scheduler.enqueuedLocalWorkSizeX = (uint32_t)scheduler.getLws();
104+
*scheduler.enqueuedLocalWorkSizeY = 1;
105+
*scheduler.enqueuedLocalWorkSizeZ = 1;
106+
107+
*scheduler.numWorkGroupsX = (uint32_t)(scheduler.getGws() / scheduler.getLws());
108+
// NOTE(review): Y/Z work-group counts are patched as 0 here, while the walker
// below is programmed with {gws / lws, 1, 1} — confirm this is intentional.
*scheduler.numWorkGroupsY = 0;
109+
*scheduler.numWorkGroupsZ = 0;
110+
111+
*scheduler.workDim = 1;
112+
113+
// Send our indirect object data
114+
size_t localWorkSizes[3] = {scheduler.getLws(), 1, 1};
115+
size_t globalWorkSizes[3] = {scheduler.getGws(), 1, 1};
116+
117+
// Create indirectHeap for IOH that is located at the end of device enqueue DSH
118+
size_t curbeOffset = devQueueHw.setSchedulerCrossThreadData(scheduler);
119+
// The temporary heap wraps the same CPU memory as dsh (same base and size).
IndirectHeap indirectObjectHeap(dsh->getCpuBase(), dsh->getMaxAvailableSpace());
120+
// Presumably consumes the first curbeOffset bytes so later allocations from
// this heap start after them — confirm against IndirectHeap::getSpace.
indirectObjectHeap.getSpace(curbeOffset);
121+
IndirectHeap *ioh = &indirectObjectHeap;
122+
123+
// Program the walker. Invokes execution so all state should already be programmed
124+
auto pGpGpuWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
125+
// Start from the family's default walker contents before fields are filled in.
*pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
126+
127+
bool localIdsGenerationByRuntime = KernelCommandsHelper<GfxFamily>::isRuntimeLocalIdsGenerationRequired(1, globalWorkSizes, localWorkSizes);
128+
bool inlineDataProgrammingRequired = KernelCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(scheduler);
129+
KernelCommandsHelper<GfxFamily>::sendIndirectState(
130+
commandStream,
131+
*dsh,
132+
*ioh,
133+
*ssh,
134+
scheduler,
135+
simd,
136+
localWorkSizes,
137+
offsetInterfaceDescriptorTable,
138+
interfaceDescriptorIndex,
139+
preemptionMode,
140+
pGpGpuWalkerCmd,
141+
nullptr,
142+
localIdsGenerationByRuntime);
143+
144+
// Implement enabling special WA DisableLSQCROPERFforOCL if needed
145+
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(&commandStream, scheduler, true);
146+
147+
size_t globalOffsets[3] = {0, 0, 0};
148+
size_t workGroups[3] = {(scheduler.getGws() / scheduler.getLws()), 1, 1};
149+
// The all-zero globalOffsets array is passed twice: as the global offsets and
// as the starting work-group IDs. workDim is passed as 1.
GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes,
150+
simd, 1, localIdsGenerationByRuntime, inlineDataProgrammingRequired,
151+
*scheduler.getKernelInfo().patchInfo.threadPayload);
152+
153+
// Implement disabling special WA DisableLSQCROPERFforOCL if needed
154+
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(&commandStream, scheduler, false);
155+
156+
// Do not put BB_START only when returning in first Scheduler run
157+
if (devQueueHw.getSchedulerReturnInstance() != 1) {
158+
159+
PipeControlHelper<GfxFamily>::addPipeControl(commandStream, true);
160+
161+
// Add BB Start Cmd to the SLB in the Primary Batch Buffer
162+
auto *bbStart = static_cast<MI_BATCH_BUFFER_START *>(commandStream.getSpace(sizeof(MI_BATCH_BUFFER_START)));
163+
*bbStart = GfxFamily::cmdInitBatchBufferStart;
164+
bbStart->setSecondLevelBatchBuffer(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH);
165+
// Jump target is the GPU address of the device queue's SLB buffer.
uint64_t slbAddress = devQueueHw.getSlbBuffer()->getGpuAddress();
166+
bbStart->setBatchBufferStartAddressGraphicsaddress472(slbAddress);
167+
}
168+
}
169+
170+
template <typename GfxFamily>
171+
void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(
172+
LinearStream *cmdStream,
173+
WALKER_TYPE<GfxFamily> *walkerCmd,
174+
TagNode<TimestampPacketStorage> *timestampPacketNode,
175+
TimestampPacketStorage::WriteOperationType writeOperationType) {
176+
177+
if (TimestampPacketStorage::WriteOperationType::AfterWalker == writeOperationType) {
178+
uint64_t address = timestampPacketNode->getGpuAddress() + offsetof(TimestampPacketStorage, packets[0].contextEnd);
179+
PipeControlHelper<GfxFamily>::obtainPipeControlAndProgramPostSyncOperation(cmdStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, address, 0, false);
180+
}
181+
}
182+
183+
template <typename GfxFamily>
184+
size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) {
185+
size_t size = sizeof(typename GfxFamily::GPGPU_WALKER) + KernelCommandsHelper<GfxFamily>::getSizeRequiredCS(pKernel) +
186+
sizeof(PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired() ? 2 : 1);
187+
size += KernelCommandsHelper<GfxFamily>::getSizeRequiredForCacheFlush(commandQueue, pKernel, 0U, 0U);
188+
size += PreemptionHelper::getPreemptionWaCsSize<GfxFamily>(commandQueue.getDevice());
189+
if (reserveProfilingCmdsSpace) {
190+
size += 2 * sizeof(PIPE_CONTROL) + 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
191+
}
192+
if (reservePerfCounters) {
193+
//start cmds
194+
//P_C: flush CS & TimeStamp BEGIN
195+
size += 2 * sizeof(PIPE_CONTROL);
196+
//SRM NOOPID & Frequency
197+
size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
198+
//gp registers
199+
size += NEO::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
200+
//report perf count
201+
size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
202+
//user registers
203+
size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
204+
205+
//end cmds
206+
//P_C: flush CS & TimeStamp END;
207+
size += 2 * sizeof(PIPE_CONTROL);
208+
//OA buffer (status head, tail)
209+
size += 3 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
210+
//report perf count
211+
size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
212+
//gp registers
213+
size += NEO::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
214+
//SRM NOOPID & Frequency
215+
size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
216+
//user registers
217+
size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
218+
}
219+
size += GpgpuWalkerHelper<GfxFamily>::getSizeForWADisableLSQCROPERFforOCL(pKernel);
220+
221+
return size;
222+
}
223+
224+
template <typename GfxFamily>
225+
size_t EnqueueOperation<GfxFamily>::getSizeRequiredForTimestampPacketWrite() {
226+
return sizeof(PIPE_CONTROL);
227+
}
228+
229+
} // namespace NEO

0 commit comments

Comments
 (0)