Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .lintrunner.toml
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ exclude_patterns = [
'extension/asr/runner/transducer_runner.h',
'extension/aten_util/**',
'extension/benchmark/apple/**',
'extension/cuda/**',
'extension/data_loader/**',
'extension/evalue_util/**',
'extension/flat_tensor/**',
Expand Down
14 changes: 14 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -764,6 +764,20 @@ if(EXECUTORCH_BUILD_CUDA
find_package_torch()
endif()

# Backend-neutral caller-stream guard consumed by the CUDA AOTI backend (and the
# vendored torch-tensorrt delegate). Built before backends/aoti and
# backends/cuda, which link it.
if(EXECUTORCH_BUILD_CUDA)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/cuda)
install(
DIRECTORY extension/cuda/
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/executorch/extension/cuda
FILES_MATCHING
PATTERN "*.h"
)
list(APPEND _executorch_extensions extension_cuda)
endif()

# Build common AOTI functionality if needed by CUDA or Metal backends
if(EXECUTORCH_BUILD_CUDA OR EXECUTORCH_BUILD_METAL)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti)
Expand Down
2 changes: 1 addition & 1 deletion backends/aoti/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ target_compile_definitions(
if(EXECUTORCH_BUILD_CUDA)
find_package(CUDAToolkit REQUIRED)
target_include_directories(slimtensor INTERFACE ${CUDAToolkit_INCLUDE_DIRS})
target_link_libraries(slimtensor INTERFACE CUDA::cudart)
target_link_libraries(slimtensor INTERFACE CUDA::cudart extension_cuda)
endif()

install(
Expand Down
3 changes: 2 additions & 1 deletion backends/aoti/slim/core/storage.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include <executorch/backends/aoti/slim/c10/cuda/Exception.h>
#include <executorch/backends/aoti/slim/cuda/guard.h>
#include <executorch/backends/cuda/runtime/cuda_allocator.h>
#include <executorch/extension/cuda/caller_stream.h>
#endif

#include <executorch/backends/aoti/slim/c10/core/Device.h>
Expand Down Expand Up @@ -181,7 +182,7 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
// green context would not confine. When a caller stream is active, copy
// on it asynchronously and synchronize it to preserve blocking
// semantics; otherwise fall back to the plain synchronous copy.
const auto caller_stream = executorch::backends::cuda::getCallerStream();
const auto caller_stream = executorch::extension::cuda::getCallerStream();
if (caller_stream) {
ET_CUDA_CHECK(
cudaMemcpyAsync(dst, src, nbytes, direction, *caller_stream));
Expand Down
1 change: 1 addition & 0 deletions backends/aoti/slim/core/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def define_common_targets():
"//executorch/backends/aoti/slim/c10/cuda:exception",
"//executorch/backends/aoti/slim/cuda:guard",
"//executorch/backends/cuda/runtime:cuda_allocator",
"//executorch/extension/cuda:caller_stream",
],
)

Expand Down
14 changes: 0 additions & 14 deletions backends/aoti/slim/cuda/guard.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ namespace executorch::backends::cuda {
namespace {
// Thread-local stream storage (private to this file)
thread_local std::unordered_map<DeviceIndex, cudaStream_t> current_streams_;
thread_local std::optional<cudaStream_t> caller_stream_;
} // namespace

Error setCurrentCUDAStream(cudaStream_t stream, DeviceIndex device_index) {
Expand Down Expand Up @@ -81,19 +80,6 @@ void clearCurrentCUDAStream(DeviceIndex device_index) {
current_streams_.erase(device_index);
}

std::optional<cudaStream_t> getCallerStream() {
return caller_stream_;
}

CallerStreamGuard::CallerStreamGuard(cudaStream_t stream)
: previous_(caller_stream_) {
caller_stream_ = stream;
}

CallerStreamGuard::~CallerStreamGuard() {
caller_stream_ = previous_;
}

CUDAGuard::CUDAGuard(CUDAGuard&& other) noexcept
: original_device_index_(other.original_device_index_),
current_device_index_(other.current_device_index_) {
Expand Down
27 changes: 0 additions & 27 deletions backends/aoti/slim/cuda/guard.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,33 +66,6 @@ std::optional<cudaStream_t> peekCurrentCUDAStream(
*/
void clearCurrentCUDAStream(DeviceIndex device_index = -1);

/**
* The CUDA stream the caller selected for this thread (via CallerStreamGuard),
* or std::nullopt if none. The CUDA backend runs on it when set, otherwise it
* uses its own stream. Kept separate from getCurrentCUDAStream so an explicit
* caller choice is distinguishable from a lazily-created stream.
*/
std::optional<cudaStream_t> getCallerStream();

/**
* Scopes the CUDA stream the backend should run on for the calling thread, and
* restores the previous selection on destruction. One value per thread; a
* cuGreenCtxStreamCreate stream confines work to that green context's SM
* partition.
*/
class CallerStreamGuard {
public:
explicit CallerStreamGuard(cudaStream_t stream);
~CallerStreamGuard();
CallerStreamGuard(const CallerStreamGuard&) = delete;
CallerStreamGuard& operator=(const CallerStreamGuard&) = delete;
CallerStreamGuard(CallerStreamGuard&&) = delete;
CallerStreamGuard& operator=(CallerStreamGuard&&) = delete;

private:
std::optional<cudaStream_t> previous_;
};

/**
* RAII guard that sets the current CUDA device and restores it on destruction.
* This ensures that the device is properly restored even if an exception
Expand Down
1 change: 1 addition & 0 deletions backends/aoti/slim/cuda/test/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ def cuda_slim_cpp_unittest(name):
],
deps = [
"//executorch/backends/aoti/slim/cuda:guard",
"//executorch/extension/cuda:caller_stream",
"//executorch/runtime/core:core",
"//executorch/runtime/core/exec_aten:lib",
"//executorch/runtime/platform:platform",
Expand Down
2 changes: 2 additions & 0 deletions backends/aoti/slim/cuda/test/test_cuda_stream_guard.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,14 @@

#include <cuda_runtime.h>
#include <executorch/backends/aoti/slim/cuda/guard.h>
#include <executorch/extension/cuda/caller_stream.h>
#include <executorch/runtime/platform/platform.h>
#include <gtest/gtest.h>

#include <type_traits>

using namespace executorch::backends::cuda;
using namespace executorch::extension::cuda;
using namespace executorch::runtime;

// TODO(gasoonjia): Multiple device tests were not included due to test
Expand Down
2 changes: 1 addition & 1 deletion backends/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ endif()
# consumers.
target_link_libraries(
aoti_cuda_backend PUBLIC cuda_platform extension_tensor CUDA::cudart
${CMAKE_DL_LIBS}
extension_cuda ${CMAKE_DL_LIBS}
)

if(_cuda_is_msvc_toolchain)
Expand Down
1 change: 1 addition & 0 deletions backends/cuda/runtime/TARGETS
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ runtime.cxx_library(
"//executorch/backends/aoti/slim/factory:empty",
"//executorch/backends/aoti/slim/factory:from_blob",
"//executorch/backends/aoti/slim/factory:from_etensor",
"//executorch/extension/cuda:caller_stream",
"//executorch/extension/tensor:tensor",
"//executorch/runtime/backend:interface",
"//executorch/runtime/core/exec_aten/util:tensor_util",
Expand Down
3 changes: 2 additions & 1 deletion backends/cuda/runtime/cuda_backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#include <executorch/backends/aoti/slim/factory/from_blob.h>
#include <executorch/backends/aoti/slim/factory/from_etensor.h>
#include <executorch/backends/aoti/slim/util/array_ref_util.h>
#include <executorch/extension/cuda/caller_stream.h>

// Include our shim layer headers
#include <executorch/backends/aoti/aoti_delegate_handle.h>
Expand Down Expand Up @@ -490,7 +491,7 @@ class ET_EXPERIMENTAL CudaBackend final
// choice here routes the whole execution; restore the prior selection on
// return so a caller stream does not linger for later work on this thread.
const std::optional<cudaStream_t> caller_stream =
executorch::backends::cuda::getCallerStream();
executorch::extension::cuda::getCallerStream();

// A captured CUDA graph is bound to its capture stream and cannot be safely
// replayed on a different, caller-provided stream.
Expand Down
8 changes: 8 additions & 0 deletions extension/cuda/BUCK
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Any targets that should be shared between fbcode and xplat must be defined in
# targets.bzl. This file can contain xplat-only targets.

load(":targets.bzl", "define_common_targets")

oncall("executorch")

define_common_targets()
41 changes: 41 additions & 0 deletions extension/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Please keep this file formatted by running:
# ~~~
# cmake-format -i CMakeLists.txt
# ~~~

cmake_minimum_required(VERSION 3.19)

# Source root directory for executorch.
if(NOT EXECUTORCH_ROOT)
set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
endif()

find_package(CUDAToolkit REQUIRED)

# SHARED on purpose: the caller-stream thread-local must have a single
# definition across every shared object in the process (see export.h). A static
# copy linked into multiple shared libraries would create multiple thread-locals
# and silently break the caller-stream handshake.
add_library(extension_cuda SHARED caller_stream.cpp)
target_link_libraries(extension_cuda PUBLIC CUDA::cudart)
target_include_directories(extension_cuda PUBLIC ${_common_include_directories})
target_compile_options(extension_cuda PUBLIC ${_common_compile_options})
target_compile_definitions(
extension_cuda PRIVATE EXECUTORCH_EXTENSION_CUDA_BUILDING
)

install(
TARGETS extension_cuda
EXPORT ExecuTorchTargets
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
INCLUDES
DESTINATION ${_common_include_directories}
)
8 changes: 8 additions & 0 deletions extension/cuda/TARGETS
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Any targets that should be shared between fbcode and xplat must be defined in
# targets.bzl. This file can contain fbcode-only targets.

load(":targets.bzl", "define_common_targets")

oncall("executorch")

define_common_targets()
30 changes: 30 additions & 0 deletions extension/cuda/caller_stream.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/extension/cuda/caller_stream.h>

namespace executorch::extension::cuda {

namespace {
thread_local std::optional<cudaStream_t> caller_stream_;
} // namespace

std::optional<cudaStream_t> getCallerStream() {
return caller_stream_;
}

CallerStreamGuard::CallerStreamGuard(cudaStream_t stream)
: previous_(caller_stream_) {
caller_stream_ = stream;
}

CallerStreamGuard::~CallerStreamGuard() {
caller_stream_ = previous_;
}

} // namespace executorch::extension::cuda
59 changes: 59 additions & 0 deletions extension/cuda/caller_stream.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <cuda_runtime.h>
#include <optional>
#include <type_traits>

#include <executorch/extension/cuda/export.h>

namespace executorch::extension::cuda {

/**
* The CUDA stream selected by the innermost CallerStreamGuard active on this
* thread, or std::nullopt if none is active.
*
* This reports only a stream the caller explicitly selected, so a backend can
* honor that choice or fall back to its own default. It is backend-neutral: any
* CUDA backend (e.g. the CUDA/AOTI delegate and the TensorRT delegate) can
* consult it, so a single caller-provided stream -- including a CUDA
* green-context stream -- can drive several delegates in one program.
*/
EXECUTORCH_EXTENSION_CUDA_API std::optional<cudaStream_t> getCallerStream();

/**
* Scopes, for the calling thread, the CUDA stream a backend should run on, and
* restores the previous selection on destruction. Scope it on the thread that
* runs the call; the selection is one value per thread.
*
* A stream created with cuGreenCtxStreamCreate confines work to that green
* context's SM partition; the confinement rides the stream, so the green
* context need not be made current. The caller owns the stream for the guard's
* lifetime.
*/
class EXECUTORCH_EXTENSION_CUDA_API CallerStreamGuard {
public:
explicit CallerStreamGuard(cudaStream_t stream);
~CallerStreamGuard();
CallerStreamGuard(const CallerStreamGuard&) = delete;
CallerStreamGuard& operator=(const CallerStreamGuard&) = delete;
CallerStreamGuard(CallerStreamGuard&&) = delete;
CallerStreamGuard& operator=(CallerStreamGuard&&) = delete;

private:
std::optional<cudaStream_t> previous_;
};

// std::optional<cudaStream_t> is trivially copyable (asserted below), so it
// crosses the shared-library boundary unaffected by the libstdc++ CXX11 ABI,
// which only changes the layout of types like std::string and std::list.
static_assert(std::is_trivially_copyable_v<std::optional<cudaStream_t>>);

} // namespace executorch::extension::cuda
23 changes: 23 additions & 0 deletions extension/cuda/export.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

// extension_cuda is a shared library so the caller-stream thread-local has a
// single definition across every shared object in the process; a static copy
// linked into two .so's would create two thread-locals and silently break the
// handshake. These macros export the public symbols from that one library.
#if defined(_WIN32)
#if defined(EXECUTORCH_EXTENSION_CUDA_BUILDING)
#define EXECUTORCH_EXTENSION_CUDA_API __declspec(dllexport)
#else
#define EXECUTORCH_EXTENSION_CUDA_API __declspec(dllimport)
#endif
#else
#define EXECUTORCH_EXTENSION_CUDA_API __attribute__((visibility("default")))
#endif
38 changes: 38 additions & 0 deletions extension/cuda/targets.bzl
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")

def define_common_targets():
"""Defines targets that should be shared between fbcode and xplat.

The directory containing this targets.bzl file should also contain both
TARGETS and BUCK files that call this function.
"""

# Backend-neutral: both the CUDA and TensorRT delegates can depend on it to
# share a caller's stream. The caller-stream thread-local must be one
# instance per process, so the main target stays shareable: OSS cxx_library
# defaults force_static=True, which would duplicate the thread-local into
# every dependent shared object (see export.h). The :caller_stream_static
# variant stays available for fully-static consumers.
runtime.cxx_library(
name = "caller_stream",
srcs = [
"caller_stream.cpp",
],
exported_headers = [
"caller_stream.h",
"export.h",
],
# Opt out of the OSS force_static default so consumers *can* link one
# shared instance and keep the thread-local unique (see above); the
# wrapper pins preferred_linkage="any", so this allows shared linkage
# rather than forcing it.
force_static = False,
# dllexport branch of export.h when building this lib; inert off Windows.
preprocessor_flags = [
"-DEXECUTORCH_EXTENSION_CUDA_BUILDING",
],
visibility = ["PUBLIC"],
external_deps = [
("cuda", None, "cuda-lazy"),
],
)
Loading