From ab8b3551616ee760eb16730e60996aa8133e54e3 Mon Sep 17 00:00:00 2001 From: "Hu, Peisen" Date: Thu, 28 May 2026 07:36:50 -0700 Subject: [PATCH 1/4] [SYCL] Add khr_flush and respective UR code Signed-off-by: Hu, Peisen --- sycl/include/sycl/queue.hpp | 5 +++++ sycl/source/detail/queue_impl.cpp | 9 +++++++++ sycl/source/detail/queue_impl.hpp | 2 ++ sycl/source/queue.cpp | 2 ++ unified-runtime/include/unified-runtime/ur_api.h | 14 ++++++++++++++ .../include/unified-runtime/ur_api_funcs.def | 1 + unified-runtime/include/unified-runtime/ur_ddi.h | 5 +++++ unified-runtime/source/adapters/cuda/queue.cpp | 4 ++++ .../source/adapters/cuda/ur_interface_loader.cpp | 1 + unified-runtime/source/adapters/hip/queue.cpp | 4 ++++ .../source/adapters/hip/ur_interface_loader.cpp | 1 + .../adapters/level_zero/ur_interface_loader.cpp | 1 + .../source/adapters/native_cpu/queue.cpp | 5 +++++ .../adapters/native_cpu/ur_interface_loader.cpp | 1 + unified-runtime/source/adapters/offload/queue.cpp | 4 ++++ .../adapters/offload/ur_interface_loader.cpp | 1 + unified-runtime/source/adapters/opencl/queue.cpp | 6 ++++++ .../source/adapters/opencl/ur_interface_loader.cpp | 1 + unified-runtime/source/ur_api.cpp | 7 +++++++ 19 files changed, 74 insertions(+) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 16068d0c81d84..ea03529d4d95f 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3611,6 +3611,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// completed, otherwise returns false. bool khr_empty() const; + /// Flushes all commands in the queue to the device, but doesn't wait for them + /// to complete unlike wait(). + /// + void khr_flush() const; + std::optional ext_oneapi_get_last_event() const { return static_cast>(ext_oneapi_get_last_event_impl()); } diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 96ac7fbd17eeb..7c0f44688b741 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -1268,6 +1268,15 @@ bool queue_impl::queue_empty() const { return IsReady; } +void queue_impl::queue_flush() const { + if (MGraph.lock()) { + throw sycl::exception(make_error_code(errc::invalid), + "flush cannot be called for a queue which is " + "recording to a command graph."); + } + getAdapter().call(MQueue); +} + void queue_impl::revisitUnenqueuedCommandsState( const EventImplPtr &CompletedHostTask) { if (MIsInorder) diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index e5716a2a57775..954ed744ba831 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -613,6 +613,8 @@ class queue_impl : public std::enable_shared_from_this { bool queue_empty() const; + void queue_flush() const; + EventImplPtr memcpyToDeviceGlobal(void *DeviceGlobalPtr, const void *Src, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index a8588c3d3d1ce..c6f6dbc34f2ca 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -247,6 +247,8 @@ bool queue::ext_oneapi_empty() const { return impl->queue_empty(); } bool queue::khr_empty() const { return impl->queue_empty(); } +void queue::khr_flush() const { return impl->queue_flush(); } + void queue::ext_oneapi_prod() { impl->flush(); } ur_native_handle_t queue::getNative(int32_t &NativeHandleDesc) const { diff --git a/unified-runtime/include/unified-runtime/ur_api.h b/unified-runtime/include/unified-runtime/ur_api.h index 798423b6ae55a..6857958c88526 100644 --- a/unified-runtime/include/unified-runtime/ur_api.h +++ b/unified-runtime/include/unified-runtime/ur_api.h @@ -512,6 +512,8 @@ typedef enum ur_function_t { UR_FUNCTION_QUEUE_GET_GRAPH_EXP = 314, /// Enumerator for ::urGraphSetDestructionCallbackExp UR_FUNCTION_GRAPH_SET_DESTRUCTION_CALLBACK_EXP = 315, + /// Enumerator for ::urKhrFlush + UR_FUNCTION_KHR_FLUSH = 316, /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -6607,6 +6609,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush( /// [in] handle of the queue to be flushed. ur_queue_handle_t hQueue); +UR_APIEXPORT ur_result_t UR_APICALL urKhrFlush( + /// [in] handle of the queue to be flushed. + ur_queue_handle_t hQueue); + #if !defined(__GNUC__) #pragma endregion #endif @@ -14700,6 +14706,14 @@ typedef struct ur_queue_flush_params_t { ur_queue_handle_t *phQueue; } ur_queue_flush_params_t; +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function parameters for urKhrFlush +/// @details Each entry is a pointer to the parameter passed to the function; +/// allowing the callback the ability to modify the parameter's value +typedef struct ur_khr_flush_params_t { + ur_queue_handle_t *phQueue; +} ur_khr_flush_params_t; + /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urQueueBeginGraphCaptureExp /// @details Each entry is a pointer to the parameter passed to the function; diff --git a/unified-runtime/include/unified-runtime/ur_api_funcs.def b/unified-runtime/include/unified-runtime/ur_api_funcs.def index fc481d947a8c7..5b6b67676227a 100644 --- a/unified-runtime/include/unified-runtime/ur_api_funcs.def +++ b/unified-runtime/include/unified-runtime/ur_api_funcs.def @@ -75,6 +75,7 @@ _UR_API(urKernelGetSuggestedLocalWorkSizeWithArgs) _UR_API(urKernelSetExecInfo) _UR_API(urKernelSetSpecializationConstants) _UR_API(urKernelSuggestMaxCooperativeGroupCount) +_UR_API(urKhrFlush) _UR_API(urQueueGetInfo) _UR_API(urQueueCreate) _UR_API(urQueueRetain) diff --git a/unified-runtime/include/unified-runtime/ur_ddi.h b/unified-runtime/include/unified-runtime/ur_ddi.h index cc734e6194d92..1f19ae132a2ec 100644 --- a/unified-runtime/include/unified-runtime/ur_ddi.h +++ b/unified-runtime/include/unified-runtime/ur_ddi.h @@ -623,6 +623,10 @@ typedef ur_result_t(UR_APICALL *ur_pfnQueueFinish_t)(ur_queue_handle_t); /// @brief Function-pointer for urQueueFlush typedef ur_result_t(UR_APICALL *ur_pfnQueueFlush_t)(ur_queue_handle_t); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urQueueFlush +typedef ur_result_t(UR_APICALL *ur_pfnKhrFlush_t)(ur_queue_handle_t); + /////////////////////////////////////////////////////////////////////////////// /// @brief Table of Queue functions pointers typedef struct ur_queue_dditable_t { @@ -634,6 +638,7 @@ typedef struct ur_queue_dditable_t { ur_pfnQueueCreateWithNativeHandle_t pfnCreateWithNativeHandle; ur_pfnQueueFinish_t pfnFinish; ur_pfnQueueFlush_t pfnFlush; + ur_pfnKhrFlush_t pfnKhrFlush; } ur_queue_dditable_t; /////////////////////////////////////////////////////////////////////////////// diff --git a/unified-runtime/source/adapters/cuda/queue.cpp b/unified-runtime/source/adapters/cuda/queue.cpp index f42f59cc8e516..420c845a90cec 100644 --- a/unified-runtime/source/adapters/cuda/queue.cpp +++ b/unified-runtime/source/adapters/cuda/queue.cpp @@ -171,6 +171,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t /*hQueue*/) { return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urKhrFlush(ur_queue_handle_t /*hQueue*/) { + return UR_RESULT_SUCCESS; +} + UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( ur_queue_handle_t hQueue, ur_queue_native_desc_t * /*pDesc*/, ur_native_handle_t *phNativeQueue) { diff --git a/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp b/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp index 1f6434b2c140f..be438df40c560 100644 --- a/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp @@ -239,6 +239,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( pDdiTable->pfnCreateWithNativeHandle = urQueueCreateWithNativeHandle; pDdiTable->pfnFinish = urQueueFinish; pDdiTable->pfnFlush = urQueueFlush; + pDdiTable->pfnKhrFlush = urKhrFlush; pDdiTable->pfnGetInfo = urQueueGetInfo; pDdiTable->pfnGetNativeHandle = urQueueGetNativeHandle; pDdiTable->pfnRelease = urQueueRelease; diff --git a/unified-runtime/source/adapters/hip/queue.cpp b/unified-runtime/source/adapters/hip/queue.cpp index f9b58b6989a41..3736d2e5068f4 100644 --- a/unified-runtime/source/adapters/hip/queue.cpp +++ b/unified-runtime/source/adapters/hip/queue.cpp @@ -196,6 +196,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t) { return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urKhrFlush(ur_queue_handle_t) { + return UR_RESULT_SUCCESS; +} + /// Gets the native HIP handle of a UR queue object /// /// \param[in] hQueue The UR queue to get the native HIP object of. diff --git a/unified-runtime/source/adapters/hip/ur_interface_loader.cpp b/unified-runtime/source/adapters/hip/ur_interface_loader.cpp index 1d6fbdf7a9c0c..af2da615ca5b2 100644 --- a/unified-runtime/source/adapters/hip/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/hip/ur_interface_loader.cpp @@ -239,6 +239,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( pDdiTable->pfnCreateWithNativeHandle = urQueueCreateWithNativeHandle; pDdiTable->pfnFinish = urQueueFinish; pDdiTable->pfnFlush = urQueueFlush; + pDdiTable->pfnKhrFlush = urKhrFlush; pDdiTable->pfnGetInfo = urQueueGetInfo; pDdiTable->pfnGetNativeHandle = urQueueGetNativeHandle; pDdiTable->pfnRelease = urQueueRelease; diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp b/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp index dc76f7729f937..e3ed8ea67eed0 100644 --- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp @@ -459,6 +459,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( ur::level_zero::urQueueCreateWithNativeHandle; pDdiTable->pfnFinish = ur::level_zero::urQueueFinish; pDdiTable->pfnFlush = ur::level_zero::urQueueFlush; + pDdiTable->pfnKhrFlush = ur::level_zero::urKhrFlush; return result; } diff --git a/unified-runtime/source/adapters/native_cpu/queue.cpp b/unified-runtime/source/adapters/native_cpu/queue.cpp index fb3a8c74cb15a..1026a32163673 100644 --- a/unified-runtime/source/adapters/native_cpu/queue.cpp +++ b/unified-runtime/source/adapters/native_cpu/queue.cpp @@ -94,6 +94,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t /*hQueue*/) { DIE_NO_IMPLEMENTATION; } +UR_APIEXPORT ur_result_t UR_APICALL urKhrFlush(ur_queue_handle_t /*hQueue*/) { + + DIE_NO_IMPLEMENTATION; +} + UR_APIEXPORT ur_result_t urQueueBeginGraphCaptureExp(ur_queue_handle_t /* hQueue */) { diff --git a/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp b/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp index c3b121bfb0cdb..e32ce43ea0be2 100644 --- a/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp @@ -239,6 +239,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( pDdiTable->pfnCreateWithNativeHandle = urQueueCreateWithNativeHandle; pDdiTable->pfnFinish = urQueueFinish; pDdiTable->pfnFlush = urQueueFlush; + pDdiTable->pfnKhrFlush = urKhrFlush; pDdiTable->pfnGetInfo = urQueueGetInfo; pDdiTable->pfnGetNativeHandle = urQueueGetNativeHandle; pDdiTable->pfnRelease = urQueueRelease; diff --git a/unified-runtime/source/adapters/offload/queue.cpp b/unified-runtime/source/adapters/offload/queue.cpp index 6c213e3bc6450..f8cffdf3b7370 100644 --- a/unified-runtime/source/adapters/offload/queue.cpp +++ b/unified-runtime/source/adapters/offload/queue.cpp @@ -124,6 +124,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t) { return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urKhrFlush(ur_queue_handle_t) { + return UR_RESULT_SUCCESS; +} + UR_APIEXPORT ur_result_t UR_APICALL urQueueBeginGraphCaptureExp(ur_queue_handle_t /* hQueue */) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; diff --git a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp index ce4ab5351a527..86531a8410e21 100644 --- a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp @@ -249,6 +249,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( pDdiTable->pfnCreateWithNativeHandle = urQueueCreateWithNativeHandle; pDdiTable->pfnFinish = urQueueFinish; pDdiTable->pfnFlush = urQueueFlush; + pDdiTable->pfnKhrFlush = urKhrFlush; pDdiTable->pfnGetInfo = urQueueGetInfo; pDdiTable->pfnGetNativeHandle = urQueueGetNativeHandle; pDdiTable->pfnRelease = urQueueRelease; diff --git a/unified-runtime/source/adapters/opencl/queue.cpp b/unified-runtime/source/adapters/opencl/queue.cpp index 35d56df08e539..cdcc7ed36da5d 100644 --- a/unified-runtime/source/adapters/opencl/queue.cpp +++ b/unified-runtime/source/adapters/opencl/queue.cpp @@ -300,6 +300,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t hQueue) { return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urKhrFlush(ur_queue_handle_t hQueue) { + cl_int RetErr = clFlush(hQueue->CLQueue); + CL_RETURN_ON_FAILURE(RetErr); + return UR_RESULT_SUCCESS; +} + UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) { hQueue->RefCount.retain(); return UR_RESULT_SUCCESS; diff --git a/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp b/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp index 3205af2df209e..cbd2cc7a302ff 100644 --- a/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp @@ -223,6 +223,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( pDdiTable->pfnCreateWithNativeHandle = urQueueCreateWithNativeHandle; pDdiTable->pfnFinish = urQueueFinish; pDdiTable->pfnFlush = urQueueFlush; + pDdiTable->pfnKhrFlush = urKhrFlush; pDdiTable->pfnGetInfo = urQueueGetInfo; pDdiTable->pfnGetNativeHandle = urQueueGetNativeHandle; pDdiTable->pfnRelease = urQueueRelease; diff --git a/unified-runtime/source/ur_api.cpp b/unified-runtime/source/ur_api.cpp index d6b7be91aa80d..1ed74e28aa91b 100644 --- a/unified-runtime/source/ur_api.cpp +++ b/unified-runtime/source/ur_api.cpp @@ -3582,6 +3582,13 @@ ur_result_t UR_APICALL urQueueFlush( return result; } +ur_result_t UR_APICALL urKhrFlush( + /// [in] handle of the queue to be flushed. + ur_queue_handle_t hQueue) { + ur_result_t result = UR_RESULT_SUCCESS; + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Get event object information /// From 2d1e0784692a6a2abdfad9280c50728721120928 Mon Sep 17 00:00:00 2001 From: "Hu, Peisen" Date: Thu, 28 May 2026 07:44:00 -0700 Subject: [PATCH 2/4] [SYCL] Add L0 code Signed-off-by: Hu, Peisen --- unified-runtime/source/adapters/level_zero/queue.cpp | 6 ++++++ .../source/adapters/level_zero/ur_interface_loader.hpp | 1 + unified-runtime/source/adapters/level_zero/v2/queue_api.cpp | 5 +++++ 3 files changed, 12 insertions(+) diff --git a/unified-runtime/source/adapters/level_zero/queue.cpp b/unified-runtime/source/adapters/level_zero/queue.cpp index 2afab55a99c6d..4378cbb3ee908 100644 --- a/unified-runtime/source/adapters/level_zero/queue.cpp +++ b/unified-runtime/source/adapters/level_zero/queue.cpp @@ -929,6 +929,12 @@ ur_result_t urQueueFlush( return Queue->executeAllOpenCommandLists(); } +ur_result_t urKhrFlush( + /// [in] handle of the queue to be flushed. + ur_queue_handle_t Queue) { + return Queue->executeAllOpenCommandLists(); +} + ur_result_t urQueueBeginGraphCaptureExp(ur_queue_handle_t /* hQueue */) { UR_LOG_LEGACY(ERR, logger::LegacyMessage("[UR][L0] {} function not implemented!"), diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp index b56cc5e803c5d..11c7d2ca448b1 100644 --- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp +++ b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp @@ -254,6 +254,7 @@ ur_result_t urQueueCreateWithNativeHandle( ur_queue_handle_t *phQueue); ur_result_t urQueueFinish(ur_queue_handle_t hQueue); ur_result_t urQueueFlush(ur_queue_handle_t hQueue); +ur_result_t urKhrFlush(ur_queue_handle_t hQueue); ur_result_t urEventGetInfo(ur_event_handle_t hEvent, ur_event_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet); diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp index 0c501870bbf1f..c94b9bd5b16f6 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp @@ -55,6 +55,11 @@ ur_result_t urQueueFlush(ur_queue_handle_t hQueue) try { } catch (...) { return exceptionToResult(std::current_exception()); } +ur_result_t urKhrFlush(ur_queue_handle_t hQueue) try { + return hQueue->get().queueFlush(); +} catch (...) { + return exceptionToResult(std::current_exception()); +} ur_result_t urEnqueueEventsWait(ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, From b51972c5b281fc955a2d010763bba16042804ae2 Mon Sep 17 00:00:00 2001 From: "Hu, Peisen" Date: Tue, 2 Jun 2026 07:10:49 -0700 Subject: [PATCH 3/4] [SYCL] Add Intercept Funcs Signed-off-by: Hu, Peisen --- .../include/unified-runtime/ur_ddi.h | 2 +- .../include/unified-runtime/ur_print.hpp | 6 ++++ .../loader/layers/tracing/ur_trcddi.cpp | 35 +++++++++++++++++++ .../loader/layers/validation/ur_valddi.cpp | 29 +++++++++++++++ unified-runtime/source/loader/loader.def.in | 1 + unified-runtime/source/loader/loader.map.in | 1 + unified-runtime/source/loader/ur_ldrddi.cpp | 17 +++++++++ unified-runtime/source/loader/ur_libapi.cpp | 12 +++++++ unified-runtime/source/ur_api.cpp | 23 ++++++++++++ 9 files changed, 125 insertions(+), 1 deletion(-) diff --git a/unified-runtime/include/unified-runtime/ur_ddi.h b/unified-runtime/include/unified-runtime/ur_ddi.h index 1f19ae132a2ec..4a40098f8e244 100644 --- a/unified-runtime/include/unified-runtime/ur_ddi.h +++ b/unified-runtime/include/unified-runtime/ur_ddi.h @@ -624,7 +624,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnQueueFinish_t)(ur_queue_handle_t); typedef ur_result_t(UR_APICALL *ur_pfnQueueFlush_t)(ur_queue_handle_t); /////////////////////////////////////////////////////////////////////////////// -/// @brief Function-pointer for urQueueFlush +/// @brief Function-pointer for urKhrFlush typedef ur_result_t(UR_APICALL *ur_pfnKhrFlush_t)(ur_queue_handle_t); /////////////////////////////////////////////////////////////////////////////// diff --git a/unified-runtime/include/unified-runtime/ur_print.hpp b/unified-runtime/include/unified-runtime/ur_print.hpp index c29373a7d162a..a90df8f625740 100644 --- a/unified-runtime/include/unified-runtime/ur_print.hpp +++ b/unified-runtime/include/unified-runtime/ur_print.hpp @@ -926,6 +926,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_QUEUE_FLUSH: os << "UR_FUNCTION_QUEUE_FLUSH"; break; + case UR_FUNCTION_KHR_FLUSH: + os << "UR_FUNCTION_KHR_FLUSH"; + break; case UR_FUNCTION_SAMPLER_CREATE: os << "UR_FUNCTION_SAMPLER_CREATE"; break; @@ -22811,6 +22814,9 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, case UR_FUNCTION_QUEUE_FLUSH: { os << (const struct ur_queue_flush_params_t *)params; } break; + case UR_FUNCTION_KHR_FLUSH: { + os << (const struct ur_khr_flush_params_t *)params; + } break; case UR_FUNCTION_QUEUE_BEGIN_GRAPH_CAPTURE_EXP: { os << (const struct ur_queue_begin_graph_capture_exp_params_t *)params; } break; diff --git a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp index 5f8bfca375b61..8a385251d1ee8 100644 --- a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp +++ b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp @@ -3603,6 +3603,38 @@ __urdlllocal ur_result_t UR_APICALL urQueueFlush( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKhrFlush +__urdlllocal ur_result_t UR_APICALL urKhrFlush( + /// [in] handle of the queue to be flushed. + ur_queue_handle_t hQueue) { + auto pfnKhrFlush = getContext()->urDdiTable.Queue.pfnKhrFlush; + + if (nullptr == pfnKhrFlush) + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + + ur_khr_flush_params_t params = {&hQueue}; + uint64_t instance = getContext()->notify_begin(UR_FUNCTION_KHR_FLUSH, + "urKhrFlush", ¶ms); + + auto &logger = getContext()->logger; + UR_LOG_L(logger, INFO, " ---> urKhrFlush\n"); + + ur_result_t result = pfnKhrFlush(hQueue); + + getContext()->notify_end(UR_FUNCTION_KHR_FLUSH, "urKhrFlush", ¶ms, + &result, instance); + + if (logger.getLevel() <= UR_LOGGER_LEVEL_INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_KHR_FLUSH, ¶ms); + UR_LOG_L(logger, INFO, " <--- urKhrFlush({}) -> {};\n", args_str.str(), + result); + } + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEventGetInfo __urdlllocal ur_result_t UR_APICALL urEventGetInfo( @@ -12164,6 +12196,9 @@ __urdlllocal ur_result_t UR_APICALL urGetQueueProcAddrTable( dditable.pfnFlush = pDdiTable->pfnFlush; pDdiTable->pfnFlush = ur_tracing_layer::urQueueFlush; + dditable.pfnKhrFlush = pDdiTable->pfnKhrFlush; + pDdiTable->pfnKhrFlush = ur_tracing_layer::urKhrFlush; + return result; } /////////////////////////////////////////////////////////////////////////////// diff --git a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp index 077e6b736846a..c5e2ab6a78a7c 100644 --- a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp +++ b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp @@ -3541,6 +3541,32 @@ __urdlllocal ur_result_t UR_APICALL urQueueFlush( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKhrFlush +__urdlllocal ur_result_t UR_APICALL urKhrFlush( + /// [in] handle of the queue to be flushed. + ur_queue_handle_t hQueue) { + auto pfnKhrFlush = getContext()->urDdiTable.Queue.pfnKhrFlush; + + if (nullptr == pfnKhrFlush) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + if (getContext()->enableParameterValidation) { + if (NULL == hQueue) + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hQueue)) { + URLOG_CTX_INVALID_REFERENCE(hQueue); + } + + ur_result_t result = pfnKhrFlush(hQueue); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEventGetInfo __urdlllocal ur_result_t UR_APICALL urEventGetInfo( @@ -12947,6 +12973,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( dditable.pfnFlush = pDdiTable->pfnFlush; pDdiTable->pfnFlush = ur_validation_layer::urQueueFlush; + dditable.pfnKhrFlush = pDdiTable->pfnKhrFlush; + pDdiTable->pfnKhrFlush = ur_validation_layer::urKhrFlush; + return result; } diff --git a/unified-runtime/source/loader/loader.def.in b/unified-runtime/source/loader/loader.def.in index cc6ec03371edb..bcd7b19eda4df 100644 --- a/unified-runtime/source/loader/loader.def.in +++ b/unified-runtime/source/loader/loader.def.in @@ -163,6 +163,7 @@ EXPORTS urKernelSetExecInfo urKernelSetSpecializationConstants urKernelSuggestMaxCooperativeGroupCount + urKhrFlush urLoaderConfigCreate urLoaderConfigEnableLayer urLoaderConfigGetInfo diff --git a/unified-runtime/source/loader/loader.map.in b/unified-runtime/source/loader/loader.map.in index 9ffb00211b6dc..f347dc6c4113f 100644 --- a/unified-runtime/source/loader/loader.map.in +++ b/unified-runtime/source/loader/loader.map.in @@ -163,6 +163,7 @@ urKernelSetExecInfo; urKernelSetSpecializationConstants; urKernelSuggestMaxCooperativeGroupCount; + urKhrFlush; urLoaderConfigCreate; urLoaderConfigEnableLayer; urLoaderConfigGetInfo; diff --git a/unified-runtime/source/loader/ur_ldrddi.cpp b/unified-runtime/source/loader/ur_ldrddi.cpp index a68a92742892e..1f8eb2ff6ff9c 100644 --- a/unified-runtime/source/loader/ur_ldrddi.cpp +++ b/unified-runtime/source/loader/ur_ldrddi.cpp @@ -1988,6 +1988,22 @@ __urdlllocal ur_result_t UR_APICALL urQueueFlush( return pfnFlush(hQueue); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKhrFlush +__urdlllocal ur_result_t UR_APICALL urKhrFlush( + /// [in] handle of the queue to be flushed. + ur_queue_handle_t hQueue) { + + auto *dditable = *reinterpret_cast(hQueue); + + auto *pfnKhrFlush = dditable->Queue.pfnKhrFlush; + if (nullptr == pfnKhrFlush) + return UR_RESULT_ERROR_UNINITIALIZED; + + // forward to device-platform + return pfnKhrFlush(hQueue); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEventGetInfo __urdlllocal ur_result_t UR_APICALL urEventGetInfo( @@ -7395,6 +7411,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( ur_loader::urQueueCreateWithNativeHandle; pDdiTable->pfnFinish = ur_loader::urQueueFinish; pDdiTable->pfnFlush = ur_loader::urQueueFlush; + pDdiTable->pfnKhrFlush = ur_loader::urKhrFlush; } else { // return pointers directly to platform's DDIs *pDdiTable = ur_loader::getContext()->platforms.front().dditable.Queue; diff --git a/unified-runtime/source/loader/ur_libapi.cpp b/unified-runtime/source/loader/ur_libapi.cpp index 1188978674036..261817c8b415d 100644 --- a/unified-runtime/source/loader/ur_libapi.cpp +++ b/unified-runtime/source/loader/ur_libapi.cpp @@ -4085,6 +4085,18 @@ ur_result_t UR_APICALL urQueueFlush( return exceptionToResult(std::current_exception()); } +ur_result_t UR_APICALL urKhrFlush( + /// [in] handle of the queue to be flushed. + ur_queue_handle_t hQueue) try { + auto pfnKhrFlush = ur_lib::getContext()->urDdiTable.Queue.pfnKhrFlush; + if (nullptr == pfnKhrFlush) + return UR_RESULT_ERROR_UNINITIALIZED; + + return pfnKhrFlush(hQueue); +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Get event object information /// diff --git a/unified-runtime/source/ur_api.cpp b/unified-runtime/source/ur_api.cpp index 1ed74e28aa91b..0860683da0230 100644 --- a/unified-runtime/source/ur_api.cpp +++ b/unified-runtime/source/ur_api.cpp @@ -3582,6 +3582,29 @@ ur_result_t UR_APICALL urQueueFlush( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Issues all previously enqueued commands in a command queue to the +/// device. +/// +/// @details +/// - Guarantees that all enqueued commands will be issued to the +/// appropriate device. +/// - There is no guarantee that they will be completed after ::urKhrFlush +/// returns. +/// +/// @remarks +/// _Analogues_ +/// - **clFlush** +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_QUEUE +/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY ur_result_t UR_APICALL urKhrFlush( /// [in] handle of the queue to be flushed. ur_queue_handle_t hQueue) { From 3f03a26a30713daeed35cdd54674e0a8d6eb1c6e Mon Sep 17 00:00:00 2001 From: "Hu, Peisen" Date: Tue, 2 Jun 2026 07:20:39 -0700 Subject: [PATCH 4/4] [SYCL] Squash all commits and format Signed-off-by: Hu, Peisen --- sycl/include/sycl/queue.hpp | 5 +++ sycl/source/detail/queue_impl.cpp | 9 +++++ sycl/source/detail/queue_impl.hpp | 2 ++ sycl/source/queue.cpp | 2 ++ .../include/unified-runtime/ur_api.h | 14 ++++++++ .../include/unified-runtime/ur_api_funcs.def | 1 + .../include/unified-runtime/ur_ddi.h | 5 +++ .../include/unified-runtime/ur_print.hpp | 6 ++++ .../source/adapters/cuda/queue.cpp | 4 +++ .../adapters/cuda/ur_interface_loader.cpp | 1 + unified-runtime/source/adapters/hip/queue.cpp | 4 +++ .../adapters/hip/ur_interface_loader.cpp | 1 + .../source/adapters/level_zero/queue.cpp | 7 ++++ .../level_zero/ur_interface_loader.cpp | 1 + .../level_zero/ur_interface_loader.hpp | 1 + .../adapters/level_zero/v2/queue_api.cpp | 5 +++ .../source/adapters/native_cpu/queue.cpp | 5 +++ .../native_cpu/ur_interface_loader.cpp | 1 + .../source/adapters/offload/queue.cpp | 4 +++ .../adapters/offload/ur_interface_loader.cpp | 1 + .../source/adapters/opencl/queue.cpp | 6 ++++ .../adapters/opencl/ur_interface_loader.cpp | 1 + .../loader/layers/tracing/ur_trcddi.cpp | 35 +++++++++++++++++++ .../loader/layers/validation/ur_valddi.cpp | 29 +++++++++++++++ unified-runtime/source/loader/loader.def.in | 1 + unified-runtime/source/loader/loader.map.in | 1 + unified-runtime/source/loader/ur_ldrddi.cpp | 17 +++++++++ unified-runtime/source/loader/ur_libapi.cpp | 12 +++++++ unified-runtime/source/ur_api.cpp | 30 ++++++++++++++++ 29 files changed, 211 insertions(+) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 16068d0c81d84..ea03529d4d95f 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3611,6 +3611,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// completed, otherwise returns false. bool khr_empty() const; + /// Flushes all commands in the queue to the device, but doesn't wait for them + /// to complete unlike wait(). + /// + void khr_flush() const; + std::optional ext_oneapi_get_last_event() const { return static_cast>(ext_oneapi_get_last_event_impl()); } diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 96ac7fbd17eeb..7c0f44688b741 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -1268,6 +1268,15 @@ bool queue_impl::queue_empty() const { return IsReady; } +void queue_impl::queue_flush() const { + if (MGraph.lock()) { + throw sycl::exception(make_error_code(errc::invalid), + "flush cannot be called for a queue which is " + "recording to a command graph."); + } + getAdapter().call(MQueue); +} + void queue_impl::revisitUnenqueuedCommandsState( const EventImplPtr &CompletedHostTask) { if (MIsInorder) diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index e5716a2a57775..954ed744ba831 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -613,6 +613,8 @@ class queue_impl : public std::enable_shared_from_this { bool queue_empty() const; + void queue_flush() const; + EventImplPtr memcpyToDeviceGlobal(void *DeviceGlobalPtr, const void *Src, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index a8588c3d3d1ce..c6f6dbc34f2ca 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -247,6 +247,8 @@ bool queue::ext_oneapi_empty() const { return impl->queue_empty(); } bool queue::khr_empty() const { return impl->queue_empty(); } +void queue::khr_flush() const { return impl->queue_flush(); } + void queue::ext_oneapi_prod() { impl->flush(); } ur_native_handle_t queue::getNative(int32_t &NativeHandleDesc) const { diff --git a/unified-runtime/include/unified-runtime/ur_api.h b/unified-runtime/include/unified-runtime/ur_api.h index 798423b6ae55a..6857958c88526 100644 --- a/unified-runtime/include/unified-runtime/ur_api.h +++ b/unified-runtime/include/unified-runtime/ur_api.h @@ -512,6 +512,8 @@ typedef enum ur_function_t { UR_FUNCTION_QUEUE_GET_GRAPH_EXP = 314, /// Enumerator for ::urGraphSetDestructionCallbackExp UR_FUNCTION_GRAPH_SET_DESTRUCTION_CALLBACK_EXP = 315, + /// Enumerator for ::urKhrFlush + UR_FUNCTION_KHR_FLUSH = 316, /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -6607,6 +6609,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush( /// [in] handle of the queue to be flushed. ur_queue_handle_t hQueue); +UR_APIEXPORT ur_result_t UR_APICALL urKhrFlush( + /// [in] handle of the queue to be flushed. + ur_queue_handle_t hQueue); + #if !defined(__GNUC__) #pragma endregion #endif @@ -14700,6 +14706,14 @@ typedef struct ur_queue_flush_params_t { ur_queue_handle_t *phQueue; } ur_queue_flush_params_t; +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function parameters for urKhrFlush +/// @details Each entry is a pointer to the parameter passed to the function; +/// allowing the callback the ability to modify the parameter's value +typedef struct ur_khr_flush_params_t { + ur_queue_handle_t *phQueue; +} ur_khr_flush_params_t; + /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urQueueBeginGraphCaptureExp /// @details Each entry is a pointer to the parameter passed to the function; diff --git a/unified-runtime/include/unified-runtime/ur_api_funcs.def b/unified-runtime/include/unified-runtime/ur_api_funcs.def index fc481d947a8c7..5b6b67676227a 100644 --- a/unified-runtime/include/unified-runtime/ur_api_funcs.def +++ b/unified-runtime/include/unified-runtime/ur_api_funcs.def @@ -75,6 +75,7 @@ _UR_API(urKernelGetSuggestedLocalWorkSizeWithArgs) _UR_API(urKernelSetExecInfo) _UR_API(urKernelSetSpecializationConstants) _UR_API(urKernelSuggestMaxCooperativeGroupCount) +_UR_API(urKhrFlush) _UR_API(urQueueGetInfo) _UR_API(urQueueCreate) _UR_API(urQueueRetain) diff --git a/unified-runtime/include/unified-runtime/ur_ddi.h b/unified-runtime/include/unified-runtime/ur_ddi.h index cc734e6194d92..4a40098f8e244 100644 --- a/unified-runtime/include/unified-runtime/ur_ddi.h +++ b/unified-runtime/include/unified-runtime/ur_ddi.h @@ -623,6 +623,10 @@ typedef ur_result_t(UR_APICALL *ur_pfnQueueFinish_t)(ur_queue_handle_t); /// @brief Function-pointer for urQueueFlush typedef ur_result_t(UR_APICALL *ur_pfnQueueFlush_t)(ur_queue_handle_t); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urKhrFlush +typedef ur_result_t(UR_APICALL *ur_pfnKhrFlush_t)(ur_queue_handle_t); + /////////////////////////////////////////////////////////////////////////////// /// @brief Table of Queue functions pointers typedef struct ur_queue_dditable_t { @@ -634,6 +638,7 @@ typedef struct ur_queue_dditable_t { ur_pfnQueueCreateWithNativeHandle_t pfnCreateWithNativeHandle; ur_pfnQueueFinish_t pfnFinish; ur_pfnQueueFlush_t pfnFlush; + ur_pfnKhrFlush_t pfnKhrFlush; } ur_queue_dditable_t; /////////////////////////////////////////////////////////////////////////////// diff --git a/unified-runtime/include/unified-runtime/ur_print.hpp b/unified-runtime/include/unified-runtime/ur_print.hpp index c29373a7d162a..a90df8f625740 100644 --- a/unified-runtime/include/unified-runtime/ur_print.hpp +++ b/unified-runtime/include/unified-runtime/ur_print.hpp @@ -926,6 +926,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_QUEUE_FLUSH: os << "UR_FUNCTION_QUEUE_FLUSH"; break; + case UR_FUNCTION_KHR_FLUSH: + os << "UR_FUNCTION_KHR_FLUSH"; + break; case UR_FUNCTION_SAMPLER_CREATE: os << "UR_FUNCTION_SAMPLER_CREATE"; break; @@ -22811,6 +22814,9 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, case UR_FUNCTION_QUEUE_FLUSH: { os << (const struct ur_queue_flush_params_t *)params; } break; + case UR_FUNCTION_KHR_FLUSH: { + os << (const struct ur_khr_flush_params_t *)params; + } break; case UR_FUNCTION_QUEUE_BEGIN_GRAPH_CAPTURE_EXP: { os << (const struct ur_queue_begin_graph_capture_exp_params_t *)params; } break; diff --git a/unified-runtime/source/adapters/cuda/queue.cpp b/unified-runtime/source/adapters/cuda/queue.cpp index f42f59cc8e516..420c845a90cec 100644 --- a/unified-runtime/source/adapters/cuda/queue.cpp +++ b/unified-runtime/source/adapters/cuda/queue.cpp @@ -171,6 +171,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t /*hQueue*/) { return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urKhrFlush(ur_queue_handle_t /*hQueue*/) { + return UR_RESULT_SUCCESS; +} + UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( ur_queue_handle_t hQueue, ur_queue_native_desc_t * /*pDesc*/, ur_native_handle_t *phNativeQueue) { diff --git a/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp b/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp index 1f6434b2c140f..be438df40c560 100644 --- a/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp @@ -239,6 +239,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( pDdiTable->pfnCreateWithNativeHandle = urQueueCreateWithNativeHandle; pDdiTable->pfnFinish = urQueueFinish; pDdiTable->pfnFlush = urQueueFlush; + pDdiTable->pfnKhrFlush = urKhrFlush; pDdiTable->pfnGetInfo = urQueueGetInfo; pDdiTable->pfnGetNativeHandle = urQueueGetNativeHandle; pDdiTable->pfnRelease = urQueueRelease; diff --git a/unified-runtime/source/adapters/hip/queue.cpp b/unified-runtime/source/adapters/hip/queue.cpp index f9b58b6989a41..3736d2e5068f4 100644 --- a/unified-runtime/source/adapters/hip/queue.cpp +++ b/unified-runtime/source/adapters/hip/queue.cpp @@ -196,6 +196,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t) { return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urKhrFlush(ur_queue_handle_t) { + return UR_RESULT_SUCCESS; +} + /// Gets the native HIP handle of a UR queue object /// /// \param[in] hQueue The UR queue to get the native HIP object of. diff --git a/unified-runtime/source/adapters/hip/ur_interface_loader.cpp b/unified-runtime/source/adapters/hip/ur_interface_loader.cpp index 1d6fbdf7a9c0c..af2da615ca5b2 100644 --- a/unified-runtime/source/adapters/hip/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/hip/ur_interface_loader.cpp @@ -239,6 +239,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( pDdiTable->pfnCreateWithNativeHandle = urQueueCreateWithNativeHandle; pDdiTable->pfnFinish = urQueueFinish; pDdiTable->pfnFlush = urQueueFlush; + pDdiTable->pfnKhrFlush = urKhrFlush; pDdiTable->pfnGetInfo = urQueueGetInfo; pDdiTable->pfnGetNativeHandle = urQueueGetNativeHandle; pDdiTable->pfnRelease = urQueueRelease; diff --git a/unified-runtime/source/adapters/level_zero/queue.cpp b/unified-runtime/source/adapters/level_zero/queue.cpp index 2afab55a99c6d..437c988d5613e 100644 --- a/unified-runtime/source/adapters/level_zero/queue.cpp +++ b/unified-runtime/source/adapters/level_zero/queue.cpp @@ -929,6 +929,13 @@ ur_result_t urQueueFlush( return Queue->executeAllOpenCommandLists(); } +ur_result_t urKhrFlush( + /// [in] handle of the queue to be flushed. + ur_queue_handle_t Queue) { + std::scoped_lock Lock(Queue->Mutex); + return Queue->executeAllOpenCommandLists(); +} + ur_result_t urQueueBeginGraphCaptureExp(ur_queue_handle_t /* hQueue */) { UR_LOG_LEGACY(ERR, logger::LegacyMessage("[UR][L0] {} function not implemented!"), diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp b/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp index dc76f7729f937..e3ed8ea67eed0 100644 --- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp @@ -459,6 +459,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( ur::level_zero::urQueueCreateWithNativeHandle; pDdiTable->pfnFinish = ur::level_zero::urQueueFinish; pDdiTable->pfnFlush = ur::level_zero::urQueueFlush; + pDdiTable->pfnKhrFlush = ur::level_zero::urKhrFlush; return result; } diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp index b56cc5e803c5d..11c7d2ca448b1 100644 --- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp +++ b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp @@ -254,6 +254,7 @@ ur_result_t urQueueCreateWithNativeHandle( ur_queue_handle_t *phQueue); ur_result_t urQueueFinish(ur_queue_handle_t hQueue); ur_result_t urQueueFlush(ur_queue_handle_t hQueue); +ur_result_t urKhrFlush(ur_queue_handle_t hQueue); ur_result_t urEventGetInfo(ur_event_handle_t hEvent, ur_event_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet); diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp index 0c501870bbf1f..c94b9bd5b16f6 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp @@ -55,6 +55,11 @@ ur_result_t urQueueFlush(ur_queue_handle_t hQueue) try { } catch (...) { return exceptionToResult(std::current_exception()); } +ur_result_t urKhrFlush(ur_queue_handle_t hQueue) try { + return hQueue->get().queueFlush(); +} catch (...) { + return exceptionToResult(std::current_exception()); +} ur_result_t urEnqueueEventsWait(ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, diff --git a/unified-runtime/source/adapters/native_cpu/queue.cpp b/unified-runtime/source/adapters/native_cpu/queue.cpp index fb3a8c74cb15a..1026a32163673 100644 --- a/unified-runtime/source/adapters/native_cpu/queue.cpp +++ b/unified-runtime/source/adapters/native_cpu/queue.cpp @@ -94,6 +94,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t /*hQueue*/) { DIE_NO_IMPLEMENTATION; } +UR_APIEXPORT ur_result_t UR_APICALL urKhrFlush(ur_queue_handle_t /*hQueue*/) { + + DIE_NO_IMPLEMENTATION; +} + UR_APIEXPORT ur_result_t urQueueBeginGraphCaptureExp(ur_queue_handle_t /* hQueue */) { diff --git a/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp b/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp index c3b121bfb0cdb..e32ce43ea0be2 100644 --- a/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp @@ -239,6 +239,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( pDdiTable->pfnCreateWithNativeHandle = urQueueCreateWithNativeHandle; pDdiTable->pfnFinish = urQueueFinish; pDdiTable->pfnFlush = urQueueFlush; + pDdiTable->pfnKhrFlush = urKhrFlush; pDdiTable->pfnGetInfo = urQueueGetInfo; pDdiTable->pfnGetNativeHandle = urQueueGetNativeHandle; pDdiTable->pfnRelease = urQueueRelease; diff --git a/unified-runtime/source/adapters/offload/queue.cpp b/unified-runtime/source/adapters/offload/queue.cpp index 6c213e3bc6450..f8cffdf3b7370 100644 --- a/unified-runtime/source/adapters/offload/queue.cpp +++ b/unified-runtime/source/adapters/offload/queue.cpp @@ -124,6 +124,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t) { return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urKhrFlush(ur_queue_handle_t) { + return UR_RESULT_SUCCESS; +} + UR_APIEXPORT ur_result_t UR_APICALL urQueueBeginGraphCaptureExp(ur_queue_handle_t /* hQueue */) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; diff --git a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp index ce4ab5351a527..86531a8410e21 100644 --- a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp @@ -249,6 +249,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( pDdiTable->pfnCreateWithNativeHandle = urQueueCreateWithNativeHandle; pDdiTable->pfnFinish = urQueueFinish; pDdiTable->pfnFlush = urQueueFlush; + pDdiTable->pfnKhrFlush = urKhrFlush; pDdiTable->pfnGetInfo = urQueueGetInfo; pDdiTable->pfnGetNativeHandle = urQueueGetNativeHandle; pDdiTable->pfnRelease = urQueueRelease; diff --git a/unified-runtime/source/adapters/opencl/queue.cpp b/unified-runtime/source/adapters/opencl/queue.cpp index 35d56df08e539..cdcc7ed36da5d 100644 --- a/unified-runtime/source/adapters/opencl/queue.cpp +++ b/unified-runtime/source/adapters/opencl/queue.cpp @@ -300,6 +300,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t hQueue) { return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urKhrFlush(ur_queue_handle_t hQueue) { + cl_int RetErr = clFlush(hQueue->CLQueue); + CL_RETURN_ON_FAILURE(RetErr); + return UR_RESULT_SUCCESS; +} + UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) { hQueue->RefCount.retain(); return UR_RESULT_SUCCESS; diff --git a/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp b/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp index 3205af2df209e..cbd2cc7a302ff 100644 --- a/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp @@ -223,6 +223,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( pDdiTable->pfnCreateWithNativeHandle = urQueueCreateWithNativeHandle; pDdiTable->pfnFinish = urQueueFinish; pDdiTable->pfnFlush = urQueueFlush; + pDdiTable->pfnKhrFlush = urKhrFlush; pDdiTable->pfnGetInfo = urQueueGetInfo; pDdiTable->pfnGetNativeHandle = urQueueGetNativeHandle; pDdiTable->pfnRelease = urQueueRelease; diff --git a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp index 5f8bfca375b61..4b73fb4760efb 100644 --- a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp +++ b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp @@ -3603,6 +3603,38 @@ __urdlllocal ur_result_t UR_APICALL urQueueFlush( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKhrFlush +__urdlllocal ur_result_t UR_APICALL urKhrFlush( + /// [in] handle of the queue to be flushed. + ur_queue_handle_t hQueue) { + auto pfnKhrFlush = getContext()->urDdiTable.Queue.pfnKhrFlush; + + if (nullptr == pfnKhrFlush) + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + + ur_khr_flush_params_t params = {&hQueue}; + uint64_t instance = + getContext()->notify_begin(UR_FUNCTION_KHR_FLUSH, "urKhrFlush", ¶ms); + + auto &logger = getContext()->logger; + UR_LOG_L(logger, INFO, " ---> urKhrFlush\n"); + + ur_result_t result = pfnKhrFlush(hQueue); + + getContext()->notify_end(UR_FUNCTION_KHR_FLUSH, "urKhrFlush", ¶ms, + &result, instance); + + if (logger.getLevel() <= UR_LOGGER_LEVEL_INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_KHR_FLUSH, ¶ms); + UR_LOG_L(logger, INFO, " <--- urKhrFlush({}) -> {};\n", args_str.str(), + result); + } + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEventGetInfo __urdlllocal ur_result_t UR_APICALL urEventGetInfo( @@ -12164,6 +12196,9 @@ __urdlllocal ur_result_t UR_APICALL urGetQueueProcAddrTable( dditable.pfnFlush = pDdiTable->pfnFlush; pDdiTable->pfnFlush = ur_tracing_layer::urQueueFlush; + dditable.pfnKhrFlush = pDdiTable->pfnKhrFlush; + pDdiTable->pfnKhrFlush = ur_tracing_layer::urKhrFlush; + return result; } /////////////////////////////////////////////////////////////////////////////// diff --git a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp index 077e6b736846a..c5e2ab6a78a7c 100644 --- a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp +++ b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp @@ -3541,6 +3541,32 @@ __urdlllocal ur_result_t UR_APICALL urQueueFlush( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKhrFlush +__urdlllocal ur_result_t UR_APICALL urKhrFlush( + /// [in] handle of the queue to be flushed. + ur_queue_handle_t hQueue) { + auto pfnKhrFlush = getContext()->urDdiTable.Queue.pfnKhrFlush; + + if (nullptr == pfnKhrFlush) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + if (getContext()->enableParameterValidation) { + if (NULL == hQueue) + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hQueue)) { + URLOG_CTX_INVALID_REFERENCE(hQueue); + } + + ur_result_t result = pfnKhrFlush(hQueue); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEventGetInfo __urdlllocal ur_result_t UR_APICALL urEventGetInfo( @@ -12947,6 +12973,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( dditable.pfnFlush = pDdiTable->pfnFlush; pDdiTable->pfnFlush = ur_validation_layer::urQueueFlush; + dditable.pfnKhrFlush = pDdiTable->pfnKhrFlush; + pDdiTable->pfnKhrFlush = ur_validation_layer::urKhrFlush; + return result; } diff --git a/unified-runtime/source/loader/loader.def.in b/unified-runtime/source/loader/loader.def.in index cc6ec03371edb..bcd7b19eda4df 100644 --- a/unified-runtime/source/loader/loader.def.in +++ b/unified-runtime/source/loader/loader.def.in @@ -163,6 +163,7 @@ EXPORTS urKernelSetExecInfo urKernelSetSpecializationConstants urKernelSuggestMaxCooperativeGroupCount + urKhrFlush urLoaderConfigCreate urLoaderConfigEnableLayer urLoaderConfigGetInfo diff --git a/unified-runtime/source/loader/loader.map.in b/unified-runtime/source/loader/loader.map.in index 9ffb00211b6dc..f347dc6c4113f 100644 --- a/unified-runtime/source/loader/loader.map.in +++ b/unified-runtime/source/loader/loader.map.in @@ -163,6 +163,7 @@ urKernelSetExecInfo; urKernelSetSpecializationConstants; urKernelSuggestMaxCooperativeGroupCount; + urKhrFlush; urLoaderConfigCreate; urLoaderConfigEnableLayer; urLoaderConfigGetInfo; diff --git a/unified-runtime/source/loader/ur_ldrddi.cpp b/unified-runtime/source/loader/ur_ldrddi.cpp index a68a92742892e..1f8eb2ff6ff9c 100644 --- a/unified-runtime/source/loader/ur_ldrddi.cpp +++ b/unified-runtime/source/loader/ur_ldrddi.cpp @@ -1988,6 +1988,22 @@ __urdlllocal ur_result_t UR_APICALL urQueueFlush( return pfnFlush(hQueue); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKhrFlush +__urdlllocal ur_result_t UR_APICALL urKhrFlush( + /// [in] handle of the queue to be flushed. + ur_queue_handle_t hQueue) { + + auto *dditable = *reinterpret_cast(hQueue); + + auto *pfnKhrFlush = dditable->Queue.pfnKhrFlush; + if (nullptr == pfnKhrFlush) + return UR_RESULT_ERROR_UNINITIALIZED; + + // forward to device-platform + return pfnKhrFlush(hQueue); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEventGetInfo __urdlllocal ur_result_t UR_APICALL urEventGetInfo( @@ -7395,6 +7411,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( ur_loader::urQueueCreateWithNativeHandle; pDdiTable->pfnFinish = ur_loader::urQueueFinish; pDdiTable->pfnFlush = ur_loader::urQueueFlush; + pDdiTable->pfnKhrFlush = ur_loader::urKhrFlush; } else { // return pointers directly to platform's DDIs *pDdiTable = ur_loader::getContext()->platforms.front().dditable.Queue; diff --git a/unified-runtime/source/loader/ur_libapi.cpp b/unified-runtime/source/loader/ur_libapi.cpp index 1188978674036..261817c8b415d 100644 --- a/unified-runtime/source/loader/ur_libapi.cpp +++ b/unified-runtime/source/loader/ur_libapi.cpp @@ -4085,6 +4085,18 @@ ur_result_t UR_APICALL urQueueFlush( return exceptionToResult(std::current_exception()); } +ur_result_t UR_APICALL urKhrFlush( + /// [in] handle of the queue to be flushed. + ur_queue_handle_t hQueue) try { + auto pfnKhrFlush = ur_lib::getContext()->urDdiTable.Queue.pfnKhrFlush; + if (nullptr == pfnKhrFlush) + return UR_RESULT_ERROR_UNINITIALIZED; + + return pfnKhrFlush(hQueue); +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Get event object information /// diff --git a/unified-runtime/source/ur_api.cpp b/unified-runtime/source/ur_api.cpp index d6b7be91aa80d..0860683da0230 100644 --- a/unified-runtime/source/ur_api.cpp +++ b/unified-runtime/source/ur_api.cpp @@ -3582,6 +3582,36 @@ ur_result_t UR_APICALL urQueueFlush( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Issues all previously enqueued commands in a command queue to the +/// device. +/// +/// @details +/// - Guarantees that all enqueued commands will be issued to the +/// appropriate device. +/// - There is no guarantee that they will be completed after ::urKhrFlush +/// returns. +/// +/// @remarks +/// _Analogues_ +/// - **clFlush** +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_QUEUE +/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +ur_result_t UR_APICALL urKhrFlush( + /// [in] handle of the queue to be flushed. + ur_queue_handle_t hQueue) { + ur_result_t result = UR_RESULT_SUCCESS; + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Get event object information ///