diff --git a/.lintrunner.toml b/.lintrunner.toml index 4289239e46c..8ae656c0903 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -173,6 +173,7 @@ exclude_patterns = [ 'extension/asr/runner/transducer_runner.h', 'extension/aten_util/**', 'extension/benchmark/apple/**', + 'extension/cuda/**', 'extension/data_loader/**', 'extension/evalue_util/**', 'extension/flat_tensor/**', diff --git a/CMakeLists.txt b/CMakeLists.txt index b6bae68b0c5..bf6701123df 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -764,6 +764,20 @@ if(EXECUTORCH_BUILD_CUDA find_package_torch() endif() +# Backend-neutral caller-stream guard consumed by the CUDA AOTI backend (and the +# vendored torch-tensorrt delegate). Built before backends/aoti and +# backends/cuda, which link it. +if(EXECUTORCH_BUILD_CUDA) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/cuda) + install( + DIRECTORY extension/cuda/ + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/executorch/extension/cuda + FILES_MATCHING + PATTERN "*.h" + ) + list(APPEND _executorch_extensions extension_cuda) +endif() + # Build common AOTI functionality if needed by CUDA or Metal backends if(EXECUTORCH_BUILD_CUDA OR EXECUTORCH_BUILD_METAL) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti) diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt index 667bf4f2695..4634f36eb9d 100644 --- a/backends/aoti/CMakeLists.txt +++ b/backends/aoti/CMakeLists.txt @@ -87,7 +87,7 @@ target_compile_definitions( if(EXECUTORCH_BUILD_CUDA) find_package(CUDAToolkit REQUIRED) target_include_directories(slimtensor INTERFACE ${CUDAToolkit_INCLUDE_DIRS}) - target_link_libraries(slimtensor INTERFACE CUDA::cudart) + target_link_libraries(slimtensor INTERFACE CUDA::cudart extension_cuda) endif() install( diff --git a/backends/aoti/slim/core/storage.h b/backends/aoti/slim/core/storage.h index 5e08011d3bd..a9d2ada675b 100644 --- a/backends/aoti/slim/core/storage.h +++ b/backends/aoti/slim/core/storage.h @@ -14,6 +14,7 @@ #include #include #include +#include #endif #include @@ -181,7 +182,7 @@ struct DeviceTraits { // green context would not confine. When a caller stream is active, copy // on it asynchronously and synchronize it to preserve blocking // semantics; otherwise fall back to the plain synchronous copy. - const auto caller_stream = executorch::backends::cuda::getCallerStream(); + const auto caller_stream = executorch::extension::cuda::getCallerStream(); if (caller_stream) { ET_CUDA_CHECK( cudaMemcpyAsync(dst, src, nbytes, direction, *caller_stream)); diff --git a/backends/aoti/slim/core/targets.bzl b/backends/aoti/slim/core/targets.bzl index 42a7b79da6e..616faa3e927 100644 --- a/backends/aoti/slim/core/targets.bzl +++ b/backends/aoti/slim/core/targets.bzl @@ -20,6 +20,7 @@ def define_common_targets(): "//executorch/backends/aoti/slim/c10/cuda:exception", "//executorch/backends/aoti/slim/cuda:guard", "//executorch/backends/cuda/runtime:cuda_allocator", + "//executorch/extension/cuda:caller_stream", ], ) diff --git a/backends/aoti/slim/cuda/guard.cpp b/backends/aoti/slim/cuda/guard.cpp index 8f1ec44d6b6..0d73b414c2d 100644 --- a/backends/aoti/slim/cuda/guard.cpp +++ b/backends/aoti/slim/cuda/guard.cpp @@ -17,7 +17,6 @@ namespace executorch::backends::cuda { namespace { // Thread-local stream storage (private to this file) thread_local std::unordered_map current_streams_; -thread_local std::optional caller_stream_; } // namespace Error setCurrentCUDAStream(cudaStream_t stream, DeviceIndex device_index) { @@ -81,19 +80,6 @@ void clearCurrentCUDAStream(DeviceIndex device_index) { current_streams_.erase(device_index); } -std::optional getCallerStream() { - return caller_stream_; -} - -CallerStreamGuard::CallerStreamGuard(cudaStream_t stream) - : previous_(caller_stream_) { - caller_stream_ = stream; -} - -CallerStreamGuard::~CallerStreamGuard() { - caller_stream_ = previous_; -} - CUDAGuard::CUDAGuard(CUDAGuard&& other) noexcept : original_device_index_(other.original_device_index_), current_device_index_(other.current_device_index_) { diff --git a/backends/aoti/slim/cuda/guard.h b/backends/aoti/slim/cuda/guard.h index 8b51edbbbda..31ea70705ac 100644 --- a/backends/aoti/slim/cuda/guard.h +++ b/backends/aoti/slim/cuda/guard.h @@ -66,33 +66,6 @@ std::optional peekCurrentCUDAStream( */ void clearCurrentCUDAStream(DeviceIndex device_index = -1); -/** - * The CUDA stream the caller selected for this thread (via CallerStreamGuard), - * or std::nullopt if none. The CUDA backend runs on it when set, otherwise it - * uses its own stream. Kept separate from getCurrentCUDAStream so an explicit - * caller choice is distinguishable from a lazily-created stream. - */ -std::optional getCallerStream(); - -/** - * Scopes the CUDA stream the backend should run on for the calling thread, and - * restores the previous selection on destruction. One value per thread; a - * cuGreenCtxStreamCreate stream confines work to that green context's SM - * partition. - */ -class CallerStreamGuard { - public: - explicit CallerStreamGuard(cudaStream_t stream); - ~CallerStreamGuard(); - CallerStreamGuard(const CallerStreamGuard&) = delete; - CallerStreamGuard& operator=(const CallerStreamGuard&) = delete; - CallerStreamGuard(CallerStreamGuard&&) = delete; - CallerStreamGuard& operator=(CallerStreamGuard&&) = delete; - - private: - std::optional previous_; -}; - /** * RAII guard that sets the current CUDA device and restores it on destruction. * This ensures that the device is properly restored even if an exception diff --git a/backends/aoti/slim/cuda/test/targets.bzl b/backends/aoti/slim/cuda/test/targets.bzl index 079f769a509..aef540f7be3 100644 --- a/backends/aoti/slim/cuda/test/targets.bzl +++ b/backends/aoti/slim/cuda/test/targets.bzl @@ -9,6 +9,7 @@ def cuda_slim_cpp_unittest(name): ], deps = [ "//executorch/backends/aoti/slim/cuda:guard", + "//executorch/extension/cuda:caller_stream", "//executorch/runtime/core:core", "//executorch/runtime/core/exec_aten:lib", "//executorch/runtime/platform:platform", diff --git a/backends/aoti/slim/cuda/test/test_cuda_stream_guard.cpp b/backends/aoti/slim/cuda/test/test_cuda_stream_guard.cpp index 0624aaf232d..df618a7b8e9 100644 --- a/backends/aoti/slim/cuda/test/test_cuda_stream_guard.cpp +++ b/backends/aoti/slim/cuda/test/test_cuda_stream_guard.cpp @@ -8,12 +8,14 @@ #include #include +#include #include #include #include using namespace executorch::backends::cuda; +using namespace executorch::extension::cuda; using namespace executorch::runtime; // TODO(gasoonjia): Multiple device tests were not included due to test diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index e5929bc8174..0ce48d85e92 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -213,7 +213,7 @@ endif() # consumers. target_link_libraries( aoti_cuda_backend PUBLIC cuda_platform extension_tensor CUDA::cudart - ${CMAKE_DL_LIBS} + extension_cuda ${CMAKE_DL_LIBS} ) if(_cuda_is_msvc_toolchain) diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS index c8449a95718..f62780b29c2 100644 --- a/backends/cuda/runtime/TARGETS +++ b/backends/cuda/runtime/TARGETS @@ -126,6 +126,7 @@ runtime.cxx_library( "//executorch/backends/aoti/slim/factory:empty", "//executorch/backends/aoti/slim/factory:from_blob", "//executorch/backends/aoti/slim/factory:from_etensor", + "//executorch/extension/cuda:caller_stream", "//executorch/extension/tensor:tensor", "//executorch/runtime/backend:interface", "//executorch/runtime/core/exec_aten/util:tensor_util", diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index a77ce7b357b..2c11fa57b82 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -38,6 +38,7 @@ #include #include #include +#include // Include our shim layer headers #include @@ -490,7 +491,7 @@ class ET_EXPERIMENTAL CudaBackend final // choice here routes the whole execution; restore the prior selection on // return so a caller stream does not linger for later work on this thread. const std::optional caller_stream = - executorch::backends::cuda::getCallerStream(); + executorch::extension::cuda::getCallerStream(); // A captured CUDA graph is bound to its capture stream and cannot be safely // replayed on a different, caller-provided stream. diff --git a/extension/cuda/BUCK b/extension/cuda/BUCK new file mode 100644 index 00000000000..1e8cc179228 --- /dev/null +++ b/extension/cuda/BUCK @@ -0,0 +1,8 @@ +# Any targets that should be shared between fbcode and xplat must be defined in +# targets.bzl. This file can contain xplat-only targets. + +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/extension/cuda/CMakeLists.txt b/extension/cuda/CMakeLists.txt new file mode 100644 index 00000000000..dbd74ec7596 --- /dev/null +++ b/extension/cuda/CMakeLists.txt @@ -0,0 +1,41 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Please keep this file formatted by running: +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ + +cmake_minimum_required(VERSION 3.19) + +# Source root directory for executorch. +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) +endif() + +find_package(CUDAToolkit REQUIRED) + +# SHARED on purpose: the caller-stream thread-local must have a single +# definition across every shared object in the process (see export.h). A static +# copy linked into multiple shared libraries would create multiple thread-locals +# and silently break the caller-stream handshake. +add_library(extension_cuda SHARED caller_stream.cpp) +target_link_libraries(extension_cuda PUBLIC CUDA::cudart) +target_include_directories(extension_cuda PUBLIC ${_common_include_directories}) +target_compile_options(extension_cuda PUBLIC ${_common_compile_options}) +target_compile_definitions( + extension_cuda PRIVATE EXECUTORCH_EXTENSION_CUDA_BUILDING +) + +install( + TARGETS extension_cuda + EXPORT ExecuTorchTargets + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + INCLUDES + DESTINATION ${_common_include_directories} +) diff --git a/extension/cuda/TARGETS b/extension/cuda/TARGETS new file mode 100644 index 00000000000..2341af9282f --- /dev/null +++ b/extension/cuda/TARGETS @@ -0,0 +1,8 @@ +# Any targets that should be shared between fbcode and xplat must be defined in +# targets.bzl. This file can contain fbcode-only targets. + +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/extension/cuda/caller_stream.cpp b/extension/cuda/caller_stream.cpp new file mode 100644 index 00000000000..b7ec0b19e58 --- /dev/null +++ b/extension/cuda/caller_stream.cpp @@ -0,0 +1,30 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace executorch::extension::cuda { + +namespace { +thread_local std::optional caller_stream_; +} // namespace + +std::optional getCallerStream() { + return caller_stream_; +} + +CallerStreamGuard::CallerStreamGuard(cudaStream_t stream) + : previous_(caller_stream_) { + caller_stream_ = stream; +} + +CallerStreamGuard::~CallerStreamGuard() { + caller_stream_ = previous_; +} + +} // namespace executorch::extension::cuda diff --git a/extension/cuda/caller_stream.h b/extension/cuda/caller_stream.h new file mode 100644 index 00000000000..a2341d380cf --- /dev/null +++ b/extension/cuda/caller_stream.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +#include + +namespace executorch::extension::cuda { + +/** + * The CUDA stream selected by the innermost CallerStreamGuard active on this + * thread, or std::nullopt if none is active. + * + * This reports only a stream the caller explicitly selected, so a backend can + * honor that choice or fall back to its own default. It is backend-neutral: any + * CUDA backend (e.g. the CUDA/AOTI delegate and the TensorRT delegate) can + * consult it, so a single caller-provided stream -- including a CUDA + * green-context stream -- can drive several delegates in one program. + */ +EXECUTORCH_EXTENSION_CUDA_API std::optional getCallerStream(); + +/** + * Scopes, for the calling thread, the CUDA stream a backend should run on, and + * restores the previous selection on destruction. Scope it on the thread that + * runs the call; the selection is one value per thread. + * + * A stream created with cuGreenCtxStreamCreate confines work to that green + * context's SM partition; the confinement rides the stream, so the green + * context need not be made current. The caller owns the stream for the guard's + * lifetime. + */ +class EXECUTORCH_EXTENSION_CUDA_API CallerStreamGuard { + public: + explicit CallerStreamGuard(cudaStream_t stream); + ~CallerStreamGuard(); + CallerStreamGuard(const CallerStreamGuard&) = delete; + CallerStreamGuard& operator=(const CallerStreamGuard&) = delete; + CallerStreamGuard(CallerStreamGuard&&) = delete; + CallerStreamGuard& operator=(CallerStreamGuard&&) = delete; + + private: + std::optional previous_; +}; + +// std::optional is trivially copyable (asserted below), so it +// crosses the shared-library boundary unaffected by the libstdc++ CXX11 ABI, +// which only changes the layout of types like std::string and std::list. +static_assert(std::is_trivially_copyable_v>); + +} // namespace executorch::extension::cuda diff --git a/extension/cuda/export.h b/extension/cuda/export.h new file mode 100644 index 00000000000..4d0655b665d --- /dev/null +++ b/extension/cuda/export.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +// extension_cuda is a shared library so the caller-stream thread-local has a +// single definition across every shared object in the process; a static copy +// linked into two .so's would create two thread-locals and silently break the +// handshake. These macros export the public symbols from that one library. +#if defined(_WIN32) +#if defined(EXECUTORCH_EXTENSION_CUDA_BUILDING) +#define EXECUTORCH_EXTENSION_CUDA_API __declspec(dllexport) +#else +#define EXECUTORCH_EXTENSION_CUDA_API __declspec(dllimport) +#endif +#else +#define EXECUTORCH_EXTENSION_CUDA_API __attribute__((visibility("default"))) +#endif diff --git a/extension/cuda/targets.bzl b/extension/cuda/targets.bzl new file mode 100644 index 00000000000..6152b9d4835 --- /dev/null +++ b/extension/cuda/targets.bzl @@ -0,0 +1,38 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + + # Backend-neutral: both the CUDA and TensorRT delegates can depend on it to + # share a caller's stream. The caller-stream thread-local must be one + # instance per process, so the main target stays shareable: OSS cxx_library + # defaults force_static=True, which would duplicate the thread-local into + # every dependent shared object (see export.h). The :caller_stream_static + # variant stays available for fully-static consumers. + runtime.cxx_library( + name = "caller_stream", + srcs = [ + "caller_stream.cpp", + ], + exported_headers = [ + "caller_stream.h", + "export.h", + ], + # Opt out of the OSS force_static default so consumers *can* link one + # shared instance and keep the thread-local unique (see above); the + # wrapper pins preferred_linkage="any", so this allows shared linkage + # rather than forcing it. + force_static = False, + # dllexport branch of export.h when building this lib; inert off Windows. + preprocessor_flags = [ + "-DEXECUTORCH_EXTENSION_CUDA_BUILDING", + ], + visibility = ["PUBLIC"], + external_deps = [ + ("cuda", None, "cuda-lazy"), + ], + )