Skip to content

Commit 3fdb17b

Browse files
Move hw specific GpgpuWalkerHelper functions to separate file
Change-Id: If2e793d0c3de1a5245bbdee065111a504807b134 Signed-off-by: Filip Hazubski <filip.hazubski@intel.com>
1 parent ce29770 commit 3fdb17b

File tree

6 files changed

+193
-176
lines changed

6 files changed

+193
-176
lines changed

runtime/command_queue/gpgpu_walker.inl

Lines changed: 0 additions & 176 deletions
Original file line numberDiff line numberDiff line change
@@ -99,44 +99,6 @@ void GpgpuWalkerHelper<GfxFamily>::addAluReadModifyWriteRegister(
9999
pCmd5->setStateCacheInvalidationEnable(true);
100100
}
101101

102-
template <typename GfxFamily>
103-
inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
104-
WALKER_TYPE<GfxFamily> *walkerCmd,
105-
const size_t globalOffsets[3],
106-
const size_t startWorkGroups[3],
107-
const size_t numWorkGroups[3],
108-
const size_t localWorkSizesIn[3],
109-
uint32_t simd,
110-
uint32_t workDim,
111-
bool localIdsGeneration) {
112-
auto localWorkSize = localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2];
113-
114-
auto threadsPerWorkGroup = getThreadsPerWG(simd, localWorkSize);
115-
walkerCmd->setThreadWidthCounterMaximum(static_cast<uint32_t>(threadsPerWorkGroup));
116-
117-
walkerCmd->setThreadGroupIdXDimension(static_cast<uint32_t>(numWorkGroups[0]));
118-
walkerCmd->setThreadGroupIdYDimension(static_cast<uint32_t>(numWorkGroups[1]));
119-
walkerCmd->setThreadGroupIdZDimension(static_cast<uint32_t>(numWorkGroups[2]));
120-
121-
// compute executionMask - to tell which SIMD lines are active within thread
122-
auto remainderSimdLanes = localWorkSize & (simd - 1);
123-
uint64_t executionMask = (1ull << remainderSimdLanes) - 1;
124-
if (!executionMask)
125-
executionMask = ~executionMask;
126-
127-
using SIMD_SIZE = typename WALKER_TYPE<GfxFamily>::SIMD_SIZE;
128-
129-
walkerCmd->setRightExecutionMask(static_cast<uint32_t>(executionMask));
130-
walkerCmd->setBottomExecutionMask(static_cast<uint32_t>(0xffffffff));
131-
walkerCmd->setSimdSize(static_cast<SIMD_SIZE>(simd >> 4));
132-
133-
walkerCmd->setThreadGroupIdStartingX(static_cast<uint32_t>(startWorkGroups[0]));
134-
walkerCmd->setThreadGroupIdStartingY(static_cast<uint32_t>(startWorkGroups[1]));
135-
walkerCmd->setThreadGroupIdStartingResumeZ(static_cast<uint32_t>(startWorkGroups[2]));
136-
137-
return localWorkSize;
138-
}
139-
140102
template <typename GfxFamily>
141103
void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(
142104
HwTimeStamps &hwTimeStamps,
@@ -427,144 +389,6 @@ inline void GpgpuWalkerHelper<GfxFamily>::dispatchOnDeviceWaitlistSemaphores(Lin
427389
}
428390
}
429391

430-
template <typename GfxFamily>
431-
void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(
432-
LinearStream *cmdStream,
433-
WALKER_TYPE<GfxFamily> *walkerCmd,
434-
TimestampPacket *timestampPacket,
435-
TimestampPacket::WriteOperationType writeOperationType) {
436-
437-
if (TimestampPacket::WriteOperationType::AfterWalker == writeOperationType) {
438-
uint64_t address = timestampPacket->pickAddressForDataWrite(TimestampPacket::DataIndex::ContextEnd);
439-
auto pipeControlCmd = cmdStream->getSpaceForCmd<PIPE_CONTROL>();
440-
*pipeControlCmd = PIPE_CONTROL::sInit();
441-
pipeControlCmd->setCommandStreamerStallEnable(true);
442-
pipeControlCmd->setPostSyncOperation(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA);
443-
pipeControlCmd->setAddress(static_cast<uint32_t>(address & 0x0000FFFFFFFFULL));
444-
pipeControlCmd->setAddressHigh(static_cast<uint32_t>(address >> 32));
445-
pipeControlCmd->setImmediateData(0);
446-
}
447-
}
448-
449-
template <typename GfxFamily>
450-
void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
451-
CommandQueue &commandQueue,
452-
DeviceQueueHw<GfxFamily> &devQueueHw,
453-
PreemptionMode preemptionMode,
454-
SchedulerKernel &scheduler,
455-
IndirectHeap *ssh,
456-
IndirectHeap *dsh) {
457-
458-
using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
459-
using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
460-
using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
461-
462-
OCLRT::LinearStream *commandStream = nullptr;
463-
OCLRT::IndirectHeap *ioh = nullptr;
464-
465-
commandStream = &commandQueue.getCS(0);
466-
467-
bool dcFlush = false;
468-
commandQueue.getDevice().getCommandStreamReceiver().addPipeControl(*commandStream, dcFlush);
469-
470-
uint32_t interfaceDescriptorIndex = devQueueHw.schedulerIDIndex;
471-
const size_t offsetInterfaceDescriptorTable = devQueueHw.colorCalcStateSize;
472-
const size_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable;
473-
const size_t totalInterfaceDescriptorTableSize = devQueueHw.interfaceDescriptorEntries * sizeof(INTERFACE_DESCRIPTOR_DATA);
474-
475-
// Program media interface descriptor load
476-
KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
477-
*commandStream,
478-
offsetInterfaceDescriptor,
479-
totalInterfaceDescriptorTableSize);
480-
481-
DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);
482-
483-
// Determine SIMD size
484-
uint32_t simd = scheduler.getKernelInfo().getMaxSimdSize();
485-
DEBUG_BREAK_IF(simd != PARALLEL_SCHEDULER_COMPILATION_SIZE_20);
486-
487-
// Patch our kernel constants
488-
*scheduler.globalWorkOffsetX = 0;
489-
*scheduler.globalWorkOffsetY = 0;
490-
*scheduler.globalWorkOffsetZ = 0;
491-
492-
*scheduler.globalWorkSizeX = (uint32_t)scheduler.getGws();
493-
*scheduler.globalWorkSizeY = 1;
494-
*scheduler.globalWorkSizeZ = 1;
495-
496-
*scheduler.localWorkSizeX = (uint32_t)scheduler.getLws();
497-
*scheduler.localWorkSizeY = 1;
498-
*scheduler.localWorkSizeZ = 1;
499-
500-
*scheduler.localWorkSizeX2 = (uint32_t)scheduler.getLws();
501-
*scheduler.localWorkSizeY2 = 1;
502-
*scheduler.localWorkSizeZ2 = 1;
503-
504-
*scheduler.enqueuedLocalWorkSizeX = (uint32_t)scheduler.getLws();
505-
*scheduler.enqueuedLocalWorkSizeY = 1;
506-
*scheduler.enqueuedLocalWorkSizeZ = 1;
507-
508-
*scheduler.numWorkGroupsX = (uint32_t)(scheduler.getGws() / scheduler.getLws());
509-
*scheduler.numWorkGroupsY = 0;
510-
*scheduler.numWorkGroupsZ = 0;
511-
512-
*scheduler.workDim = 1;
513-
514-
// Send our indirect object data
515-
size_t localWorkSizes[3] = {scheduler.getLws(), 1, 1};
516-
size_t globalWorkSizes[3] = {scheduler.getGws(), 1, 1};
517-
518-
// Create indirectHeap for IOH that is located at the end of device enqueue DSH
519-
size_t curbeOffset = devQueueHw.setSchedulerCrossThreadData(scheduler);
520-
IndirectHeap indirectObjectHeap(dsh->getCpuBase(), dsh->getMaxAvailableSpace());
521-
indirectObjectHeap.getSpace(curbeOffset);
522-
ioh = &indirectObjectHeap;
523-
524-
// Program the walker. Invokes execution so all state should already be programmed
525-
auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace(sizeof(GPGPU_WALKER));
526-
*pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
527-
528-
bool localIdsGeneration = KernelCommandsHelper<GfxFamily>::isRuntimeLocalIdsGenerationRequired(1, globalWorkSizes, localWorkSizes);
529-
KernelCommandsHelper<GfxFamily>::sendIndirectState(
530-
*commandStream,
531-
*dsh,
532-
*ioh,
533-
*ssh,
534-
scheduler,
535-
simd,
536-
localWorkSizes,
537-
offsetInterfaceDescriptorTable,
538-
interfaceDescriptorIndex,
539-
preemptionMode,
540-
pGpGpuWalkerCmd,
541-
nullptr,
542-
localIdsGeneration);
543-
544-
// Implement enabling special WA DisableLSQCROPERFforOCL if needed
545-
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, scheduler, true);
546-
547-
size_t globalOffsets[3] = {0, 0, 0};
548-
size_t workGroups[3] = {(scheduler.getGws() / scheduler.getLws()), 1, 1};
549-
GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd, 1, localIdsGeneration);
550-
551-
// Implement disabling special WA DisableLSQCROPERFforOCL if needed
552-
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, scheduler, false);
553-
554-
// Do not put BB_START only when returning in first Scheduler run
555-
if (devQueueHw.getSchedulerReturnInstance() != 1) {
556-
557-
commandQueue.getDevice().getCommandStreamReceiver().addPipeControl(*commandStream, true);
558-
559-
// Add BB Start Cmd to the SLB in the Primary Batch Buffer
560-
auto *bbStart = (MI_BATCH_BUFFER_START *)commandStream->getSpace(sizeof(MI_BATCH_BUFFER_START));
561-
*bbStart = MI_BATCH_BUFFER_START::sInit();
562-
bbStart->setSecondLevelBatchBuffer(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH);
563-
uint64_t slbAddress = devQueueHw.getSlbBuffer()->getGpuAddress();
564-
bbStart->setBatchBufferStartAddressGraphicsaddress472(slbAddress);
565-
}
566-
}
567-
568392
template <typename GfxFamily>
569393
void GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
570394
}
Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
/*
2+
* Copyright (C) 2018 Intel Corporation
3+
*
4+
* SPDX-License-Identifier: MIT
5+
*
6+
*/
7+
8+
#pragma once
9+
#include "runtime/command_queue/gpgpu_walker.h"
10+
11+
namespace OCLRT {
12+
13+
template <typename GfxFamily>
14+
inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
15+
WALKER_TYPE<GfxFamily> *walkerCmd,
16+
const size_t globalOffsets[3],
17+
const size_t startWorkGroups[3],
18+
const size_t numWorkGroups[3],
19+
const size_t localWorkSizesIn[3],
20+
uint32_t simd,
21+
uint32_t workDim,
22+
bool localIdsGeneration) {
23+
auto localWorkSize = localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2];
24+
25+
auto threadsPerWorkGroup = getThreadsPerWG(simd, localWorkSize);
26+
walkerCmd->setThreadWidthCounterMaximum(static_cast<uint32_t>(threadsPerWorkGroup));
27+
28+
walkerCmd->setThreadGroupIdXDimension(static_cast<uint32_t>(numWorkGroups[0]));
29+
walkerCmd->setThreadGroupIdYDimension(static_cast<uint32_t>(numWorkGroups[1]));
30+
walkerCmd->setThreadGroupIdZDimension(static_cast<uint32_t>(numWorkGroups[2]));
31+
32+
// compute executionMask - to tell which SIMD lines are active within thread
33+
auto remainderSimdLanes = localWorkSize & (simd - 1);
34+
uint64_t executionMask = (1ull << remainderSimdLanes) - 1;
35+
if (!executionMask)
36+
executionMask = ~executionMask;
37+
38+
using SIMD_SIZE = typename WALKER_TYPE<GfxFamily>::SIMD_SIZE;
39+
40+
walkerCmd->setRightExecutionMask(static_cast<uint32_t>(executionMask));
41+
walkerCmd->setBottomExecutionMask(static_cast<uint32_t>(0xffffffff));
42+
walkerCmd->setSimdSize(static_cast<SIMD_SIZE>(simd >> 4));
43+
44+
walkerCmd->setThreadGroupIdStartingX(static_cast<uint32_t>(startWorkGroups[0]));
45+
walkerCmd->setThreadGroupIdStartingY(static_cast<uint32_t>(startWorkGroups[1]));
46+
walkerCmd->setThreadGroupIdStartingResumeZ(static_cast<uint32_t>(startWorkGroups[2]));
47+
48+
return localWorkSize;
49+
}
50+
51+
template <typename GfxFamily>
52+
void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
53+
CommandQueue &commandQueue,
54+
DeviceQueueHw<GfxFamily> &devQueueHw,
55+
PreemptionMode preemptionMode,
56+
SchedulerKernel &scheduler,
57+
IndirectHeap *ssh,
58+
IndirectHeap *dsh) {
59+
60+
using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
61+
using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
62+
using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
63+
64+
OCLRT::LinearStream *commandStream = nullptr;
65+
OCLRT::IndirectHeap *ioh = nullptr;
66+
67+
commandStream = &commandQueue.getCS(0);
68+
69+
bool dcFlush = false;
70+
commandQueue.getDevice().getCommandStreamReceiver().addPipeControl(*commandStream, dcFlush);
71+
72+
uint32_t interfaceDescriptorIndex = devQueueHw.schedulerIDIndex;
73+
const size_t offsetInterfaceDescriptorTable = devQueueHw.colorCalcStateSize;
74+
const size_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable;
75+
const size_t totalInterfaceDescriptorTableSize = devQueueHw.interfaceDescriptorEntries * sizeof(INTERFACE_DESCRIPTOR_DATA);
76+
77+
// Program media interface descriptor load
78+
KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
79+
*commandStream,
80+
offsetInterfaceDescriptor,
81+
totalInterfaceDescriptorTableSize);
82+
83+
DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);
84+
85+
// Determine SIMD size
86+
uint32_t simd = scheduler.getKernelInfo().getMaxSimdSize();
87+
DEBUG_BREAK_IF(simd != PARALLEL_SCHEDULER_COMPILATION_SIZE_20);
88+
89+
// Patch our kernel constants
90+
*scheduler.globalWorkOffsetX = 0;
91+
*scheduler.globalWorkOffsetY = 0;
92+
*scheduler.globalWorkOffsetZ = 0;
93+
94+
*scheduler.globalWorkSizeX = (uint32_t)scheduler.getGws();
95+
*scheduler.globalWorkSizeY = 1;
96+
*scheduler.globalWorkSizeZ = 1;
97+
98+
*scheduler.localWorkSizeX = (uint32_t)scheduler.getLws();
99+
*scheduler.localWorkSizeY = 1;
100+
*scheduler.localWorkSizeZ = 1;
101+
102+
*scheduler.localWorkSizeX2 = (uint32_t)scheduler.getLws();
103+
*scheduler.localWorkSizeY2 = 1;
104+
*scheduler.localWorkSizeZ2 = 1;
105+
106+
*scheduler.enqueuedLocalWorkSizeX = (uint32_t)scheduler.getLws();
107+
*scheduler.enqueuedLocalWorkSizeY = 1;
108+
*scheduler.enqueuedLocalWorkSizeZ = 1;
109+
110+
*scheduler.numWorkGroupsX = (uint32_t)(scheduler.getGws() / scheduler.getLws());
111+
*scheduler.numWorkGroupsY = 0;
112+
*scheduler.numWorkGroupsZ = 0;
113+
114+
*scheduler.workDim = 1;
115+
116+
// Send our indirect object data
117+
size_t localWorkSizes[3] = {scheduler.getLws(), 1, 1};
118+
size_t globalWorkSizes[3] = {scheduler.getGws(), 1, 1};
119+
120+
// Create indirectHeap for IOH that is located at the end of device enqueue DSH
121+
size_t curbeOffset = devQueueHw.setSchedulerCrossThreadData(scheduler);
122+
IndirectHeap indirectObjectHeap(dsh->getCpuBase(), dsh->getMaxAvailableSpace());
123+
indirectObjectHeap.getSpace(curbeOffset);
124+
ioh = &indirectObjectHeap;
125+
126+
// Program the walker. Invokes execution so all state should already be programmed
127+
auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace(sizeof(GPGPU_WALKER));
128+
*pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
129+
130+
bool localIdsGeneration = KernelCommandsHelper<GfxFamily>::isRuntimeLocalIdsGenerationRequired(1, globalWorkSizes, localWorkSizes);
131+
KernelCommandsHelper<GfxFamily>::sendIndirectState(
132+
*commandStream,
133+
*dsh,
134+
*ioh,
135+
*ssh,
136+
scheduler,
137+
simd,
138+
localWorkSizes,
139+
offsetInterfaceDescriptorTable,
140+
interfaceDescriptorIndex,
141+
preemptionMode,
142+
pGpGpuWalkerCmd,
143+
nullptr,
144+
localIdsGeneration);
145+
146+
// Implement enabling special WA DisableLSQCROPERFforOCL if needed
147+
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, scheduler, true);
148+
149+
size_t globalOffsets[3] = {0, 0, 0};
150+
size_t workGroups[3] = {(scheduler.getGws() / scheduler.getLws()), 1, 1};
151+
GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd, 1, localIdsGeneration);
152+
153+
// Implement disabling special WA DisableLSQCROPERFforOCL if needed
154+
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, scheduler, false);
155+
156+
// Do not put BB_START only when returning in first Scheduler run
157+
if (devQueueHw.getSchedulerReturnInstance() != 1) {
158+
159+
commandQueue.getDevice().getCommandStreamReceiver().addPipeControl(*commandStream, true);
160+
161+
// Add BB Start Cmd to the SLB in the Primary Batch Buffer
162+
auto *bbStart = (MI_BATCH_BUFFER_START *)commandStream->getSpace(sizeof(MI_BATCH_BUFFER_START));
163+
*bbStart = MI_BATCH_BUFFER_START::sInit();
164+
bbStart->setSecondLevelBatchBuffer(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH);
165+
uint64_t slbAddress = devQueueHw.getSlbBuffer()->getGpuAddress();
166+
bbStart->setBatchBufferStartAddressGraphicsaddress472(slbAddress);
167+
}
168+
}
169+
170+
template <typename GfxFamily>
171+
void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(
172+
LinearStream *cmdStream,
173+
WALKER_TYPE<GfxFamily> *walkerCmd,
174+
TimestampPacket *timestampPacket,
175+
TimestampPacket::WriteOperationType writeOperationType) {
176+
177+
if (TimestampPacket::WriteOperationType::AfterWalker == writeOperationType) {
178+
uint64_t address = timestampPacket->pickAddressForDataWrite(TimestampPacket::DataIndex::ContextEnd);
179+
auto pipeControlCmd = cmdStream->getSpaceForCmd<PIPE_CONTROL>();
180+
*pipeControlCmd = PIPE_CONTROL::sInit();
181+
pipeControlCmd->setCommandStreamerStallEnable(true);
182+
pipeControlCmd->setPostSyncOperation(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA);
183+
pipeControlCmd->setAddress(static_cast<uint32_t>(address & 0x0000FFFFFFFFULL));
184+
pipeControlCmd->setAddressHigh(static_cast<uint32_t>(address >> 32));
185+
pipeControlCmd->setImmediateData(0);
186+
}
187+
}
188+
189+
} // namespace OCLRT

runtime/gen10/gpgpu_walker_gen10.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include "runtime/gen10/hw_info.h"
99
#include "runtime/command_queue/gpgpu_walker.h"
1010
#include "runtime/command_queue/gpgpu_walker.inl"
11+
#include "runtime/command_queue/gpgpu_walker_base.inl"
1112
#include "runtime/command_queue/hardware_interface.h"
1213
#include "runtime/command_queue/hardware_interface.inl"
1314
#include "runtime/command_queue/hardware_interface_base.inl"

runtime/gen8/gpgpu_walker_gen8.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include "runtime/gen8/hw_info.h"
99
#include "runtime/command_queue/gpgpu_walker.h"
1010
#include "runtime/command_queue/gpgpu_walker.inl"
11+
#include "runtime/command_queue/gpgpu_walker_base.inl"
1112
#include "runtime/command_queue/hardware_interface.h"
1213
#include "runtime/command_queue/hardware_interface.inl"
1314
#include "runtime/command_queue/hardware_interface_base.inl"

0 commit comments

Comments
 (0)