@@ -99,44 +99,6 @@ void GpgpuWalkerHelper<GfxFamily>::addAluReadModifyWriteRegister(
99
99
pCmd5->setStateCacheInvalidationEnable (true );
100
100
}
101
101
102
- template <typename GfxFamily>
103
- inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
104
- WALKER_TYPE<GfxFamily> *walkerCmd,
105
- const size_t globalOffsets[3 ],
106
- const size_t startWorkGroups[3 ],
107
- const size_t numWorkGroups[3 ],
108
- const size_t localWorkSizesIn[3 ],
109
- uint32_t simd,
110
- uint32_t workDim,
111
- bool localIdsGeneration) {
112
- auto localWorkSize = localWorkSizesIn[0 ] * localWorkSizesIn[1 ] * localWorkSizesIn[2 ];
113
-
114
- auto threadsPerWorkGroup = getThreadsPerWG (simd, localWorkSize);
115
- walkerCmd->setThreadWidthCounterMaximum (static_cast <uint32_t >(threadsPerWorkGroup));
116
-
117
- walkerCmd->setThreadGroupIdXDimension (static_cast <uint32_t >(numWorkGroups[0 ]));
118
- walkerCmd->setThreadGroupIdYDimension (static_cast <uint32_t >(numWorkGroups[1 ]));
119
- walkerCmd->setThreadGroupIdZDimension (static_cast <uint32_t >(numWorkGroups[2 ]));
120
-
121
- // compute executionMask - to tell which SIMD lines are active within thread
122
- auto remainderSimdLanes = localWorkSize & (simd - 1 );
123
- uint64_t executionMask = (1ull << remainderSimdLanes) - 1 ;
124
- if (!executionMask)
125
- executionMask = ~executionMask;
126
-
127
- using SIMD_SIZE = typename WALKER_TYPE<GfxFamily>::SIMD_SIZE;
128
-
129
- walkerCmd->setRightExecutionMask (static_cast <uint32_t >(executionMask));
130
- walkerCmd->setBottomExecutionMask (static_cast <uint32_t >(0xffffffff ));
131
- walkerCmd->setSimdSize (static_cast <SIMD_SIZE>(simd >> 4 ));
132
-
133
- walkerCmd->setThreadGroupIdStartingX (static_cast <uint32_t >(startWorkGroups[0 ]));
134
- walkerCmd->setThreadGroupIdStartingY (static_cast <uint32_t >(startWorkGroups[1 ]));
135
- walkerCmd->setThreadGroupIdStartingResumeZ (static_cast <uint32_t >(startWorkGroups[2 ]));
136
-
137
- return localWorkSize;
138
- }
139
-
140
102
template <typename GfxFamily>
141
103
void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(
142
104
HwTimeStamps &hwTimeStamps,
@@ -427,144 +389,6 @@ inline void GpgpuWalkerHelper<GfxFamily>::dispatchOnDeviceWaitlistSemaphores(Lin
427
389
}
428
390
}
429
391
430
- template <typename GfxFamily>
431
- void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(
432
- LinearStream *cmdStream,
433
- WALKER_TYPE<GfxFamily> *walkerCmd,
434
- TimestampPacket *timestampPacket,
435
- TimestampPacket::WriteOperationType writeOperationType) {
436
-
437
- if (TimestampPacket::WriteOperationType::AfterWalker == writeOperationType) {
438
- uint64_t address = timestampPacket->pickAddressForDataWrite (TimestampPacket::DataIndex::ContextEnd);
439
- auto pipeControlCmd = cmdStream->getSpaceForCmd <PIPE_CONTROL>();
440
- *pipeControlCmd = PIPE_CONTROL::sInit ();
441
- pipeControlCmd->setCommandStreamerStallEnable (true );
442
- pipeControlCmd->setPostSyncOperation (PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA);
443
- pipeControlCmd->setAddress (static_cast <uint32_t >(address & 0x0000FFFFFFFFULL ));
444
- pipeControlCmd->setAddressHigh (static_cast <uint32_t >(address >> 32 ));
445
- pipeControlCmd->setImmediateData (0 );
446
- }
447
- }
448
-
449
- template <typename GfxFamily>
450
- void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
451
- CommandQueue &commandQueue,
452
- DeviceQueueHw<GfxFamily> &devQueueHw,
453
- PreemptionMode preemptionMode,
454
- SchedulerKernel &scheduler,
455
- IndirectHeap *ssh,
456
- IndirectHeap *dsh) {
457
-
458
- using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
459
- using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
460
- using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
461
-
462
- OCLRT::LinearStream *commandStream = nullptr ;
463
- OCLRT::IndirectHeap *ioh = nullptr ;
464
-
465
- commandStream = &commandQueue.getCS (0 );
466
-
467
- bool dcFlush = false ;
468
- commandQueue.getDevice ().getCommandStreamReceiver ().addPipeControl (*commandStream, dcFlush);
469
-
470
- uint32_t interfaceDescriptorIndex = devQueueHw.schedulerIDIndex ;
471
- const size_t offsetInterfaceDescriptorTable = devQueueHw.colorCalcStateSize ;
472
- const size_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable;
473
- const size_t totalInterfaceDescriptorTableSize = devQueueHw.interfaceDescriptorEntries * sizeof (INTERFACE_DESCRIPTOR_DATA);
474
-
475
- // Program media interface descriptor load
476
- KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad (
477
- *commandStream,
478
- offsetInterfaceDescriptor,
479
- totalInterfaceDescriptorTableSize);
480
-
481
- DEBUG_BREAK_IF (offsetInterfaceDescriptorTable % 64 != 0 );
482
-
483
- // Determine SIMD size
484
- uint32_t simd = scheduler.getKernelInfo ().getMaxSimdSize ();
485
- DEBUG_BREAK_IF (simd != PARALLEL_SCHEDULER_COMPILATION_SIZE_20);
486
-
487
- // Patch our kernel constants
488
- *scheduler.globalWorkOffsetX = 0 ;
489
- *scheduler.globalWorkOffsetY = 0 ;
490
- *scheduler.globalWorkOffsetZ = 0 ;
491
-
492
- *scheduler.globalWorkSizeX = (uint32_t )scheduler.getGws ();
493
- *scheduler.globalWorkSizeY = 1 ;
494
- *scheduler.globalWorkSizeZ = 1 ;
495
-
496
- *scheduler.localWorkSizeX = (uint32_t )scheduler.getLws ();
497
- *scheduler.localWorkSizeY = 1 ;
498
- *scheduler.localWorkSizeZ = 1 ;
499
-
500
- *scheduler.localWorkSizeX2 = (uint32_t )scheduler.getLws ();
501
- *scheduler.localWorkSizeY2 = 1 ;
502
- *scheduler.localWorkSizeZ2 = 1 ;
503
-
504
- *scheduler.enqueuedLocalWorkSizeX = (uint32_t )scheduler.getLws ();
505
- *scheduler.enqueuedLocalWorkSizeY = 1 ;
506
- *scheduler.enqueuedLocalWorkSizeZ = 1 ;
507
-
508
- *scheduler.numWorkGroupsX = (uint32_t )(scheduler.getGws () / scheduler.getLws ());
509
- *scheduler.numWorkGroupsY = 0 ;
510
- *scheduler.numWorkGroupsZ = 0 ;
511
-
512
- *scheduler.workDim = 1 ;
513
-
514
- // Send our indirect object data
515
- size_t localWorkSizes[3 ] = {scheduler.getLws (), 1 , 1 };
516
- size_t globalWorkSizes[3 ] = {scheduler.getGws (), 1 , 1 };
517
-
518
- // Create indirectHeap for IOH that is located at the end of device enqueue DSH
519
- size_t curbeOffset = devQueueHw.setSchedulerCrossThreadData (scheduler);
520
- IndirectHeap indirectObjectHeap (dsh->getCpuBase (), dsh->getMaxAvailableSpace ());
521
- indirectObjectHeap.getSpace (curbeOffset);
522
- ioh = &indirectObjectHeap;
523
-
524
- // Program the walker. Invokes execution so all state should already be programmed
525
- auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace (sizeof (GPGPU_WALKER));
526
- *pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
527
-
528
- bool localIdsGeneration = KernelCommandsHelper<GfxFamily>::isRuntimeLocalIdsGenerationRequired (1 , globalWorkSizes, localWorkSizes);
529
- KernelCommandsHelper<GfxFamily>::sendIndirectState (
530
- *commandStream,
531
- *dsh,
532
- *ioh,
533
- *ssh,
534
- scheduler,
535
- simd,
536
- localWorkSizes,
537
- offsetInterfaceDescriptorTable,
538
- interfaceDescriptorIndex,
539
- preemptionMode,
540
- pGpGpuWalkerCmd,
541
- nullptr ,
542
- localIdsGeneration);
543
-
544
- // Implement enabling special WA DisableLSQCROPERFforOCL if needed
545
- GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL (commandStream, scheduler, true );
546
-
547
- size_t globalOffsets[3 ] = {0 , 0 , 0 };
548
- size_t workGroups[3 ] = {(scheduler.getGws () / scheduler.getLws ()), 1 , 1 };
549
- GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData (pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd, 1 , localIdsGeneration);
550
-
551
- // Implement disabling special WA DisableLSQCROPERFforOCL if needed
552
- GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL (commandStream, scheduler, false );
553
-
554
- // Do not put BB_START only when returning in first Scheduler run
555
- if (devQueueHw.getSchedulerReturnInstance () != 1 ) {
556
-
557
- commandQueue.getDevice ().getCommandStreamReceiver ().addPipeControl (*commandStream, true );
558
-
559
- // Add BB Start Cmd to the SLB in the Primary Batch Buffer
560
- auto *bbStart = (MI_BATCH_BUFFER_START *)commandStream->getSpace (sizeof (MI_BATCH_BUFFER_START));
561
- *bbStart = MI_BATCH_BUFFER_START::sInit ();
562
- bbStart->setSecondLevelBatchBuffer (MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH);
563
- uint64_t slbAddress = devQueueHw.getSlbBuffer ()->getGpuAddress ();
564
- bbStart->setBatchBufferStartAddressGraphicsaddress472 (slbAddress);
565
- }
566
- }
567
-
568
392
template <typename GfxFamily>
569
393
void GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
570
394
}
0 commit comments