From 833360e5094dd65e598a2f0aed568d685b0768de Mon Sep 17 00:00:00 2001
From: Anthony Shoumikhin <shoumikhin@meta.com>
Date: Tue, 9 Jun 2026 10:49:14 -0700
Subject: [PATCH] Make the CUDA caller-stream guard a shared extension/cuda
 library (#20158)

Summary:

Move the caller-stream handshake (`CallerStreamGuard` + `getCallerStream()`) out of the CUDA backend's `backends/aoti/slim/cuda/guard` into a standalone `extension/cuda` library, and build that library as SHARED so several CUDA backends can share one caller-selected stream.

The handshake is a process-wide thread-local: the caller records the stream it wants, and each backend reads it. That only works if there is exactly one copy of the thread-local in the process. If the library were static and linked into two shared objects (for example the CUDA backend and a TensorRT delegate, each whole-archived for backend registration), each shared object would get its own copy, so the caller would write one and the backend would read the other and silently ignore the caller's stream. Building `extension_cuda` as SHARED gives one definition that every consumer references. It must be linked PUBLIC and never whole-archived.

The two public functions are exported through a visibility macro (`extension/cuda/export.h`, mirroring `backends/aoti/export.h`) while the thread-local stays internal to the library. The C++ API is used directly: `getCallerStream()` returns `std::optional<cudaStream_t>`, a trivially copyable pointer and bool that does not depend on the libstdc++ CXX11 ABI, so no C ABI is needed. The header is installed so an external project (such as a TensorRT delegate) can include it.

Differential Revision: D108023495
---
 .lintrunner.toml                              |  1 +
 CMakeLists.txt                                | 14 +++++
 backends/aoti/CMakeLists.txt                  |  2 +-
 backends/aoti/slim/core/storage.h             |  3 +-
 backends/aoti/slim/core/targets.bzl           |  1 +
 backends/aoti/slim/cuda/guard.cpp             | 14 -----
 backends/aoti/slim/cuda/guard.h               | 27 ---------
 backends/aoti/slim/cuda/test/targets.bzl      |  1 +
 .../slim/cuda/test/test_cuda_stream_guard.cpp |  2 +
 backends/cuda/CMakeLists.txt                  |  2 +-
 backends/cuda/runtime/TARGETS                 |  1 +
 backends/cuda/runtime/cuda_backend.cpp        |  3 +-
 extension/cuda/BUCK                           |  8 +++
 extension/cuda/CMakeLists.txt                 | 41 +++++++++++++
 extension/cuda/TARGETS                        |  8 +++
 extension/cuda/caller_stream.cpp              | 30 ++++++++++
 extension/cuda/caller_stream.h                | 59 +++++++++++++++++++
 extension/cuda/export.h                       | 23 ++++++++
 extension/cuda/targets.bzl                    | 38 ++++++++++++
 19 files changed, 233 insertions(+), 45 deletions(-)
 create mode 100644 extension/cuda/BUCK
 create mode 100644 extension/cuda/CMakeLists.txt
 create mode 100644 extension/cuda/TARGETS
 create mode 100644 extension/cuda/caller_stream.cpp
 create mode 100644 extension/cuda/caller_stream.h
 create mode 100644 extension/cuda/export.h
 create mode 100644 extension/cuda/targets.bzl

diff --git a/.lintrunner.toml b/.lintrunner.toml
index 4289239e46c..8ae656c0903 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -173,6 +173,7 @@ exclude_patterns = [
     'extension/asr/runner/transducer_runner.h',
     'extension/aten_util/**',
     'extension/benchmark/apple/**',
+    'extension/cuda/**',
     'extension/data_loader/**',
     'extension/evalue_util/**',
     'extension/flat_tensor/**',
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b6bae68b0c5..bf6701123df 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -764,6 +764,20 @@ if(EXECUTORCH_BUILD_CUDA
   find_package_torch()
 endif()
 
+# Backend-neutral caller-stream guard consumed by the CUDA AOTI backend (and the
+# vendored torch-tensorrt delegate). Built before backends/aoti and
+# backends/cuda, which link it.
+if(EXECUTORCH_BUILD_CUDA)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/cuda)
+  install(
+    DIRECTORY extension/cuda/
+    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/executorch/extension/cuda
+    FILES_MATCHING
+    PATTERN "*.h"
+  )
+  list(APPEND _executorch_extensions extension_cuda)
+endif()
+
 # Build common AOTI functionality if needed by CUDA or Metal backends
 if(EXECUTORCH_BUILD_CUDA OR EXECUTORCH_BUILD_METAL)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti)
diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt
index 667bf4f2695..4634f36eb9d 100644
--- a/backends/aoti/CMakeLists.txt
+++ b/backends/aoti/CMakeLists.txt
@@ -87,7 +87,7 @@ target_compile_definitions(
 if(EXECUTORCH_BUILD_CUDA)
   find_package(CUDAToolkit REQUIRED)
   target_include_directories(slimtensor INTERFACE ${CUDAToolkit_INCLUDE_DIRS})
-  target_link_libraries(slimtensor INTERFACE CUDA::cudart)
+  target_link_libraries(slimtensor INTERFACE CUDA::cudart extension_cuda)
 endif()
 
 install(
diff --git a/backends/aoti/slim/core/storage.h b/backends/aoti/slim/core/storage.h
index 5e08011d3bd..a9d2ada675b 100644
--- a/backends/aoti/slim/core/storage.h
+++ b/backends/aoti/slim/core/storage.h
@@ -14,6 +14,7 @@
 #include <executorch/backends/aoti/slim/c10/cuda/Exception.h>
 #include <executorch/backends/aoti/slim/cuda/guard.h>
 #include <executorch/backends/cuda/runtime/cuda_allocator.h>
+#include <executorch/extension/cuda/caller_stream.h>
 #endif
 
 #include <executorch/backends/aoti/slim/c10/core/Device.h>
@@ -181,7 +182,7 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
     // green context would not confine. When a caller stream is active, copy
     // on it asynchronously and synchronize it to preserve blocking
     // semantics; otherwise fall back to the plain synchronous copy.
-    const auto caller_stream = executorch::backends::cuda::getCallerStream();
+    const auto caller_stream = executorch::extension::cuda::getCallerStream();
     if (caller_stream) {
       ET_CUDA_CHECK(
           cudaMemcpyAsync(dst, src, nbytes, direction, *caller_stream));
diff --git a/backends/aoti/slim/core/targets.bzl b/backends/aoti/slim/core/targets.bzl
index 42a7b79da6e..616faa3e927 100644
--- a/backends/aoti/slim/core/targets.bzl
+++ b/backends/aoti/slim/core/targets.bzl
@@ -20,6 +20,7 @@ def define_common_targets():
             "//executorch/backends/aoti/slim/c10/cuda:exception",
             "//executorch/backends/aoti/slim/cuda:guard",
             "//executorch/backends/cuda/runtime:cuda_allocator",
+            "//executorch/extension/cuda:caller_stream",
         ],
     )
 
diff --git a/backends/aoti/slim/cuda/guard.cpp b/backends/aoti/slim/cuda/guard.cpp
index 8f1ec44d6b6..0d73b414c2d 100644
--- a/backends/aoti/slim/cuda/guard.cpp
+++ b/backends/aoti/slim/cuda/guard.cpp
@@ -17,7 +17,6 @@ namespace executorch::backends::cuda {
 namespace {
 // Thread-local stream storage (private to this file)
 thread_local std::unordered_map<DeviceIndex, cudaStream_t> current_streams_;
-thread_local std::optional<cudaStream_t> caller_stream_;
 } // namespace
 
 Error setCurrentCUDAStream(cudaStream_t stream, DeviceIndex device_index) {
@@ -81,19 +80,6 @@ void clearCurrentCUDAStream(DeviceIndex device_index) {
   current_streams_.erase(device_index);
 }
 
-std::optional<cudaStream_t> getCallerStream() {
-  return caller_stream_;
-}
-
-CallerStreamGuard::CallerStreamGuard(cudaStream_t stream)
-    : previous_(caller_stream_) {
-  caller_stream_ = stream;
-}
-
-CallerStreamGuard::~CallerStreamGuard() {
-  caller_stream_ = previous_;
-}
-
 CUDAGuard::CUDAGuard(CUDAGuard&& other) noexcept
     : original_device_index_(other.original_device_index_),
       current_device_index_(other.current_device_index_) {
diff --git a/backends/aoti/slim/cuda/guard.h b/backends/aoti/slim/cuda/guard.h
index 8b51edbbbda..31ea70705ac 100644
--- a/backends/aoti/slim/cuda/guard.h
+++ b/backends/aoti/slim/cuda/guard.h
@@ -66,33 +66,6 @@ std::optional<cudaStream_t> peekCurrentCUDAStream(
  */
 void clearCurrentCUDAStream(DeviceIndex device_index = -1);
 
-/**
- * The CUDA stream the caller selected for this thread (via CallerStreamGuard),
- * or std::nullopt if none. The CUDA backend runs on it when set, otherwise it
- * uses its own stream. Kept separate from getCurrentCUDAStream so an explicit
- * caller choice is distinguishable from a lazily-created stream.
- */
-std::optional<cudaStream_t> getCallerStream();
-
-/**
- * Scopes the CUDA stream the backend should run on for the calling thread, and
- * restores the previous selection on destruction. One value per thread; a
- * cuGreenCtxStreamCreate stream confines work to that green context's SM
- * partition.
- */
-class CallerStreamGuard {
- public:
-  explicit CallerStreamGuard(cudaStream_t stream);
-  ~CallerStreamGuard();
-  CallerStreamGuard(const CallerStreamGuard&) = delete;
-  CallerStreamGuard& operator=(const CallerStreamGuard&) = delete;
-  CallerStreamGuard(CallerStreamGuard&&) = delete;
-  CallerStreamGuard& operator=(CallerStreamGuard&&) = delete;
-
- private:
-  std::optional<cudaStream_t> previous_;
-};
-
 /**
  * RAII guard that sets the current CUDA device and restores it on destruction.
  * This ensures that the device is properly restored even if an exception
diff --git a/backends/aoti/slim/cuda/test/targets.bzl b/backends/aoti/slim/cuda/test/targets.bzl
index 079f769a509..aef540f7be3 100644
--- a/backends/aoti/slim/cuda/test/targets.bzl
+++ b/backends/aoti/slim/cuda/test/targets.bzl
@@ -9,6 +9,7 @@ def cuda_slim_cpp_unittest(name):
         ],
         deps = [
             "//executorch/backends/aoti/slim/cuda:guard",
+            "//executorch/extension/cuda:caller_stream",
             "//executorch/runtime/core:core",
             "//executorch/runtime/core/exec_aten:lib",
             "//executorch/runtime/platform:platform",
diff --git a/backends/aoti/slim/cuda/test/test_cuda_stream_guard.cpp b/backends/aoti/slim/cuda/test/test_cuda_stream_guard.cpp
index 0624aaf232d..df618a7b8e9 100644
--- a/backends/aoti/slim/cuda/test/test_cuda_stream_guard.cpp
+++ b/backends/aoti/slim/cuda/test/test_cuda_stream_guard.cpp
@@ -8,12 +8,14 @@
 
 #include <cuda_runtime.h>
 #include <executorch/backends/aoti/slim/cuda/guard.h>
+#include <executorch/extension/cuda/caller_stream.h>
 #include <executorch/runtime/platform/platform.h>
 #include <gtest/gtest.h>
 
 #include <type_traits>
 
 using namespace executorch::backends::cuda;
+using namespace executorch::extension::cuda;
 using namespace executorch::runtime;
 
 // TODO(gasoonjia): Multiple device tests were not included due to test
diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
index e5929bc8174..0ce48d85e92 100644
--- a/backends/cuda/CMakeLists.txt
+++ b/backends/cuda/CMakeLists.txt
@@ -213,7 +213,7 @@ endif()
 # consumers.
 target_link_libraries(
   aoti_cuda_backend PUBLIC cuda_platform extension_tensor CUDA::cudart
-                           ${CMAKE_DL_LIBS}
+                           extension_cuda ${CMAKE_DL_LIBS}
 )
 
 if(_cuda_is_msvc_toolchain)
diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS
index c8449a95718..f62780b29c2 100644
--- a/backends/cuda/runtime/TARGETS
+++ b/backends/cuda/runtime/TARGETS
@@ -126,6 +126,7 @@ runtime.cxx_library(
         "//executorch/backends/aoti/slim/factory:empty",
         "//executorch/backends/aoti/slim/factory:from_blob",
         "//executorch/backends/aoti/slim/factory:from_etensor",
+        "//executorch/extension/cuda:caller_stream",
         "//executorch/extension/tensor:tensor",
         "//executorch/runtime/backend:interface",
         "//executorch/runtime/core/exec_aten/util:tensor_util",
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index a77ce7b357b..2c11fa57b82 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -38,6 +38,7 @@
 #include <executorch/backends/aoti/slim/factory/from_blob.h>
 #include <executorch/backends/aoti/slim/factory/from_etensor.h>
 #include <executorch/backends/aoti/slim/util/array_ref_util.h>
+#include <executorch/extension/cuda/caller_stream.h>
 
 // Include our shim layer headers
 #include <executorch/backends/aoti/aoti_delegate_handle.h>
@@ -490,7 +491,7 @@ class ET_EXPERIMENTAL CudaBackend final
     // choice here routes the whole execution; restore the prior selection on
     // return so a caller stream does not linger for later work on this thread.
     const std::optional<cudaStream_t> caller_stream =
-        executorch::backends::cuda::getCallerStream();
+        executorch::extension::cuda::getCallerStream();
 
     // A captured CUDA graph is bound to its capture stream and cannot be safely
     // replayed on a different, caller-provided stream.
diff --git a/extension/cuda/BUCK b/extension/cuda/BUCK
new file mode 100644
index 00000000000..1e8cc179228
--- /dev/null
+++ b/extension/cuda/BUCK
@@ -0,0 +1,8 @@
+# Any targets that should be shared between fbcode and xplat must be defined in
+# targets.bzl. This file can contain xplat-only targets.
+
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()
diff --git a/extension/cuda/CMakeLists.txt b/extension/cuda/CMakeLists.txt
new file mode 100644
index 00000000000..dbd74ec7596
--- /dev/null
+++ b/extension/cuda/CMakeLists.txt
@@ -0,0 +1,41 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Please keep this file formatted by running:
+# ~~~
+# cmake-format -i CMakeLists.txt
+# ~~~
+
+cmake_minimum_required(VERSION 3.19)
+
+# Source root directory for executorch.
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
+endif()
+
+find_package(CUDAToolkit REQUIRED)
+
+# SHARED on purpose: the caller-stream thread-local must have a single
+# definition across every shared object in the process (see export.h). A static
+# copy linked into multiple shared libraries would create multiple thread-locals
+# and silently break the caller-stream handshake.
+add_library(extension_cuda SHARED caller_stream.cpp)
+target_link_libraries(extension_cuda PUBLIC CUDA::cudart)
+target_include_directories(extension_cuda PUBLIC ${_common_include_directories})
+target_compile_options(extension_cuda PUBLIC ${_common_compile_options})
+target_compile_definitions(
+  extension_cuda PRIVATE EXECUTORCH_EXTENSION_CUDA_BUILDING
+)
+
+install(
+  TARGETS extension_cuda
+  EXPORT ExecuTorchTargets
+  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+  INCLUDES
+  DESTINATION ${_common_include_directories}
+)
diff --git a/extension/cuda/TARGETS b/extension/cuda/TARGETS
new file mode 100644
index 00000000000..2341af9282f
--- /dev/null
+++ b/extension/cuda/TARGETS
@@ -0,0 +1,8 @@
+# Any targets that should be shared between fbcode and xplat must be defined in
+# targets.bzl. This file can contain fbcode-only targets.
+
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()
diff --git a/extension/cuda/caller_stream.cpp b/extension/cuda/caller_stream.cpp
new file mode 100644
index 00000000000..b7ec0b19e58
--- /dev/null
+++ b/extension/cuda/caller_stream.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/extension/cuda/caller_stream.h>
+
+namespace executorch::extension::cuda {
+
+namespace {
+thread_local std::optional<cudaStream_t> caller_stream_;
+} // namespace
+
+std::optional<cudaStream_t> getCallerStream() {
+  return caller_stream_;
+}
+
+CallerStreamGuard::CallerStreamGuard(cudaStream_t stream)
+    : previous_(caller_stream_) {
+  caller_stream_ = stream;
+}
+
+CallerStreamGuard::~CallerStreamGuard() {
+  caller_stream_ = previous_;
+}
+
+} // namespace executorch::extension::cuda
diff --git a/extension/cuda/caller_stream.h b/extension/cuda/caller_stream.h
new file mode 100644
index 00000000000..a2341d380cf
--- /dev/null
+++ b/extension/cuda/caller_stream.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cuda_runtime.h>
+#include <optional>
+#include <type_traits>
+
+#include <executorch/extension/cuda/export.h>
+
+namespace executorch::extension::cuda {
+
+/**
+ * The CUDA stream selected by the innermost CallerStreamGuard active on this
+ * thread, or std::nullopt if none is active.
+ *
+ * This reports only a stream the caller explicitly selected, so a backend can
+ * honor that choice or fall back to its own default. It is backend-neutral: any
+ * CUDA backend (e.g. the CUDA/AOTI delegate and the TensorRT delegate) can
+ * consult it, so a single caller-provided stream -- including a CUDA
+ * green-context stream -- can drive several delegates in one program.
+ */
+EXECUTORCH_EXTENSION_CUDA_API std::optional<cudaStream_t> getCallerStream();
+
+/**
+ * Scopes, for the calling thread, the CUDA stream a backend should run on, and
+ * restores the previous selection on destruction. Scope it on the thread that
+ * runs the call; the selection is one value per thread.
+ *
+ * A stream created with cuGreenCtxStreamCreate confines work to that green
+ * context's SM partition; the confinement rides the stream, so the green
+ * context need not be made current. The caller owns the stream for the guard's
+ * lifetime.
+ */
+class EXECUTORCH_EXTENSION_CUDA_API CallerStreamGuard {
+ public:
+  explicit CallerStreamGuard(cudaStream_t stream);
+  ~CallerStreamGuard();
+  CallerStreamGuard(const CallerStreamGuard&) = delete;
+  CallerStreamGuard& operator=(const CallerStreamGuard&) = delete;
+  CallerStreamGuard(CallerStreamGuard&&) = delete;
+  CallerStreamGuard& operator=(CallerStreamGuard&&) = delete;
+
+ private:
+  std::optional<cudaStream_t> previous_;
+};
+
+// std::optional<cudaStream_t> is trivially copyable (asserted below), so it
+// crosses the shared-library boundary unaffected by the libstdc++ CXX11 ABI,
+// which only changes the layout of types like std::string and std::list.
+static_assert(std::is_trivially_copyable_v<std::optional<cudaStream_t>>);
+
+} // namespace executorch::extension::cuda
diff --git a/extension/cuda/export.h b/extension/cuda/export.h
new file mode 100644
index 00000000000..4d0655b665d
--- /dev/null
+++ b/extension/cuda/export.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+// extension_cuda is a shared library so the caller-stream thread-local has a
+// single definition across every shared object in the process; a static copy
+// linked into two .so's would create two thread-locals and silently break the
+// handshake. These macros export the public symbols from that one library.
+#if defined(_WIN32)
+#if defined(EXECUTORCH_EXTENSION_CUDA_BUILDING)
+#define EXECUTORCH_EXTENSION_CUDA_API __declspec(dllexport)
+#else
+#define EXECUTORCH_EXTENSION_CUDA_API __declspec(dllimport)
+#endif
+#else
+#define EXECUTORCH_EXTENSION_CUDA_API __attribute__((visibility("default")))
+#endif
diff --git a/extension/cuda/targets.bzl b/extension/cuda/targets.bzl
new file mode 100644
index 00000000000..6152b9d4835
--- /dev/null
+++ b/extension/cuda/targets.bzl
@@ -0,0 +1,38 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    """Defines targets that should be shared between fbcode and xplat.
+
+    The directory containing this targets.bzl file should also contain both
+    TARGETS and BUCK files that call this function.
+    """
+
+    # Backend-neutral: both the CUDA and TensorRT delegates can depend on it to
+    # share a caller's stream. The caller-stream thread-local must be one
+    # instance per process, so the main target stays shareable: OSS cxx_library
+    # defaults force_static=True, which would duplicate the thread-local into
+    # every dependent shared object (see export.h). The :caller_stream_static
+    # variant stays available for fully-static consumers.
+    runtime.cxx_library(
+        name = "caller_stream",
+        srcs = [
+            "caller_stream.cpp",
+        ],
+        exported_headers = [
+            "caller_stream.h",
+            "export.h",
+        ],
+        # Opt out of the OSS force_static default so consumers *can* link one
+        # shared instance and keep the thread-local unique (see above); the
+        # wrapper pins preferred_linkage="any", so this allows shared linkage
+        # rather than forcing it.
+        force_static = False,
+        # dllexport branch of export.h when building this lib; inert off Windows.
+        preprocessor_flags = [
+            "-DEXECUTORCH_EXTENSION_CUDA_BUILDING",
+        ],
+        visibility = ["PUBLIC"],
+        external_deps = [
+            ("cuda", None, "cuda-lazy"),
+        ],
+    )