pytorch · shoumikhin · Jun 10, 2026 · Jun 9, 2026
diff --git a/.lintrunner.toml b/.lintrunner.toml
@@ -173,6 +173,7 @@ exclude_patterns = [
     'extension/asr/runner/transducer_runner.h',
     'extension/aten_util/**',
     'extension/benchmark/apple/**',
+    'extension/cuda/**',
     'extension/data_loader/**',
     'extension/evalue_util/**',
     'extension/flat_tensor/**',

@@ -764,6 +764,20 @@ if(EXECUTORCH_BUILD_CUDA
   find_package_torch()
 endif()
 
+# Backend-neutral caller-stream guard consumed by the CUDA AOTI backend (and the
+# vendored torch-tensorrt delegate). Built before backends/aoti and
+# backends/cuda, which link it.
+if(EXECUTORCH_BUILD_CUDA)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/cuda)
+  install(
+    DIRECTORY extension/cuda/
+    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/executorch/extension/cuda
+    FILES_MATCHING
+    PATTERN "*.h"
+  )
+  list(APPEND _executorch_extensions extension_cuda)
+endif()
+
 # Build common AOTI functionality if needed by CUDA or Metal backends
 if(EXECUTORCH_BUILD_CUDA OR EXECUTORCH_BUILD_METAL)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti)

@@ -87,7 +87,7 @@ target_compile_definitions(
 if(EXECUTORCH_BUILD_CUDA)
   find_package(CUDAToolkit REQUIRED)
   target_include_directories(slimtensor INTERFACE ${CUDAToolkit_INCLUDE_DIRS})
-  target_link_libraries(slimtensor INTERFACE CUDA::cudart)
+  target_link_libraries(slimtensor INTERFACE CUDA::cudart extension_cuda)
 endif()
 
 install(

diff --git a/backends/aoti/slim/core/storage.h b/backends/aoti/slim/core/storage.h
@@ -14,6 +14,7 @@
 #include <executorch/backends/aoti/slim/c10/cuda/Exception.h>
 #include <executorch/backends/aoti/slim/cuda/guard.h>
 #include <executorch/backends/cuda/runtime/cuda_allocator.h>
+#include <executorch/extension/cuda/caller_stream.h>
 #endif
 
 #include <executorch/backends/aoti/slim/c10/core/Device.h>
@@ -181,7 +182,7 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
     // green context would not confine. When a caller stream is active, copy
     // on it asynchronously and synchronize it to preserve blocking
     // semantics; otherwise fall back to the plain synchronous copy.
-    const auto caller_stream = executorch::backends::cuda::getCallerStream();
+    const auto caller_stream = executorch::extension::cuda::getCallerStream();
     if (caller_stream) {
       ET_CUDA_CHECK(
           cudaMemcpyAsync(dst, src, nbytes, direction, *caller_stream));

diff --git a/backends/aoti/slim/core/targets.bzl b/backends/aoti/slim/core/targets.bzl
@@ -20,6 +20,7 @@ def define_common_targets():
             "//executorch/backends/aoti/slim/c10/cuda:exception",
             "//executorch/backends/aoti/slim/cuda:guard",
             "//executorch/backends/cuda/runtime:cuda_allocator",
+            "//executorch/extension/cuda:caller_stream",
         ],
     )
 

diff --git a/backends/aoti/slim/cuda/guard.cpp b/backends/aoti/slim/cuda/guard.cpp
@@ -17,7 +17,6 @@ namespace executorch::backends::cuda {
 namespace {
 // Thread-local stream storage (private to this file)
 thread_local std::unordered_map<DeviceIndex, cudaStream_t> current_streams_;
-thread_local std::optional<cudaStream_t> caller_stream_;
 } // namespace
 
 Error setCurrentCUDAStream(cudaStream_t stream, DeviceIndex device_index) {
@@ -81,19 +80,6 @@ void clearCurrentCUDAStream(DeviceIndex device_index) {
   current_streams_.erase(device_index);
 }
 
-std::optional<cudaStream_t> getCallerStream() {
-  return caller_stream_;
-}
-
-CallerStreamGuard::CallerStreamGuard(cudaStream_t stream)
-    : previous_(caller_stream_) {
-  caller_stream_ = stream;
-}
-
-CallerStreamGuard::~CallerStreamGuard() {
-  caller_stream_ = previous_;
-}
-
 CUDAGuard::CUDAGuard(CUDAGuard&& other) noexcept
     : original_device_index_(other.original_device_index_),
       current_device_index_(other.current_device_index_) {

diff --git a/backends/aoti/slim/cuda/guard.h b/backends/aoti/slim/cuda/guard.h
@@ -66,33 +66,6 @@ std::optional<cudaStream_t> peekCurrentCUDAStream(
  */
 void clearCurrentCUDAStream(DeviceIndex device_index = -1);
 
-/**
- * The CUDA stream the caller selected for this thread (via CallerStreamGuard),
- * or std::nullopt if none. The CUDA backend runs on it when set, otherwise it
- * uses its own stream. Kept separate from getCurrentCUDAStream so an explicit
- * caller choice is distinguishable from a lazily-created stream.
- */
-std::optional<cudaStream_t> getCallerStream();
-
-/**
- * Scopes the CUDA stream the backend should run on for the calling thread, and
- * restores the previous selection on destruction. One value per thread; a
- * cuGreenCtxStreamCreate stream confines work to that green context's SM
- * partition.
- */
-class CallerStreamGuard {
- public:
-  explicit CallerStreamGuard(cudaStream_t stream);
-  ~CallerStreamGuard();
-  CallerStreamGuard(const CallerStreamGuard&) = delete;
-  CallerStreamGuard& operator=(const CallerStreamGuard&) = delete;
-  CallerStreamGuard(CallerStreamGuard&&) = delete;
-  CallerStreamGuard& operator=(CallerStreamGuard&&) = delete;
-
- private:
-  std::optional<cudaStream_t> previous_;
-};
-
 /**
  * RAII guard that sets the current CUDA device and restores it on destruction.
  * This ensures that the device is properly restored even if an exception

diff --git a/backends/aoti/slim/cuda/test/targets.bzl b/backends/aoti/slim/cuda/test/targets.bzl
@@ -9,6 +9,7 @@ def cuda_slim_cpp_unittest(name):
         ],
         deps = [
             "//executorch/backends/aoti/slim/cuda:guard",
+            "//executorch/extension/cuda:caller_stream",
             "//executorch/runtime/core:core",
             "//executorch/runtime/core/exec_aten:lib",
             "//executorch/runtime/platform:platform",

diff --git a/backends/aoti/slim/cuda/test/test_cuda_stream_guard.cpp b/backends/aoti/slim/cuda/test/test_cuda_stream_guard.cpp
@@ -8,12 +8,14 @@
 
 #include <cuda_runtime.h>
 #include <executorch/backends/aoti/slim/cuda/guard.h>
+#include <executorch/extension/cuda/caller_stream.h>
 #include <executorch/runtime/platform/platform.h>
 #include <gtest/gtest.h>
 
 #include <type_traits>
 
 using namespace executorch::backends::cuda;
+using namespace executorch::extension::cuda;
 using namespace executorch::runtime;
 
 // TODO(gasoonjia): Multiple device tests were not included due to test

@@ -213,7 +213,7 @@ endif()
 # consumers.
 target_link_libraries(
   aoti_cuda_backend PUBLIC cuda_platform extension_tensor CUDA::cudart
-                           ${CMAKE_DL_LIBS}
+                           extension_cuda ${CMAKE_DL_LIBS}
 )
 
 if(_cuda_is_msvc_toolchain)

diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS
@@ -126,6 +126,7 @@ runtime.cxx_library(
         "//executorch/backends/aoti/slim/factory:empty",
         "//executorch/backends/aoti/slim/factory:from_blob",
         "//executorch/backends/aoti/slim/factory:from_etensor",
+        "//executorch/extension/cuda:caller_stream",
         "//executorch/extension/tensor:tensor",
         "//executorch/runtime/backend:interface",
         "//executorch/runtime/core/exec_aten/util:tensor_util",

diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
@@ -38,6 +38,7 @@
 #include <executorch/backends/aoti/slim/factory/from_blob.h>
 #include <executorch/backends/aoti/slim/factory/from_etensor.h>
 #include <executorch/backends/aoti/slim/util/array_ref_util.h>
+#include <executorch/extension/cuda/caller_stream.h>
 
 // Include our shim layer headers
 #include <executorch/backends/aoti/aoti_delegate_handle.h>
@@ -490,7 +491,7 @@ class ET_EXPERIMENTAL CudaBackend final
     // choice here routes the whole execution; restore the prior selection on
     // return so a caller stream does not linger for later work on this thread.
     const std::optional<cudaStream_t> caller_stream =
-        executorch::backends::cuda::getCallerStream();
+        executorch::extension::cuda::getCallerStream();
 
     // A captured CUDA graph is bound to its capture stream and cannot be safely
     // replayed on a different, caller-provided stream.

diff --git a/extension/cuda/BUCK b/extension/cuda/BUCK
@@ -0,0 +1,8 @@
+# Any targets that should be shared between fbcode and xplat must be defined in
+# targets.bzl. This file can contain xplat-only targets.
+
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()
@@ -0,0 +1,41 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Please keep this file formatted by running:
+# ~~~
+# cmake-format -i CMakeLists.txt
+# ~~~
+
+cmake_minimum_required(VERSION 3.19)
+
+# Source root directory for executorch.
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
+endif()
+
+find_package(CUDAToolkit REQUIRED)
+
+# SHARED on purpose: the caller-stream thread-local must have a single
+# definition across every shared object in the process (see export.h). A static
+# copy linked into multiple shared libraries would create multiple thread-locals
+# and silently break the caller-stream handshake.
+add_library(extension_cuda SHARED caller_stream.cpp)
+target_link_libraries(extension_cuda PUBLIC CUDA::cudart)
+target_include_directories(extension_cuda PUBLIC ${_common_include_directories})
+target_compile_options(extension_cuda PUBLIC ${_common_compile_options})
+target_compile_definitions(
+  extension_cuda PRIVATE EXECUTORCH_EXTENSION_CUDA_BUILDING
+)
+
+install(
+  TARGETS extension_cuda
+  EXPORT ExecuTorchTargets
+  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+  INCLUDES
+  DESTINATION ${_common_include_directories}
+)
diff --git a/extension/cuda/TARGETS b/extension/cuda/TARGETS
@@ -0,0 +1,8 @@
+# Any targets that should be shared between fbcode and xplat must be defined in
+# targets.bzl. This file can contain fbcode-only targets.
+
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()
diff --git a/extension/cuda/caller_stream.cpp b/extension/cuda/caller_stream.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/extension/cuda/caller_stream.h>
+
+namespace executorch::extension::cuda {
+
+namespace {
+thread_local std::optional<cudaStream_t> caller_stream_;
+} // namespace
+
+std::optional<cudaStream_t> getCallerStream() {
+  return caller_stream_;
+}
+
+CallerStreamGuard::CallerStreamGuard(cudaStream_t stream)
+    : previous_(caller_stream_) {
+  caller_stream_ = stream;
+}
+
+CallerStreamGuard::~CallerStreamGuard() {
+  caller_stream_ = previous_;
+}
+
+} // namespace executorch::extension::cuda
diff --git a/extension/cuda/caller_stream.h b/extension/cuda/caller_stream.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cuda_runtime.h>
+#include <optional>
+#include <type_traits>
+
+#include <executorch/extension/cuda/export.h>
+
+namespace executorch::extension::cuda {
+
+/**
+ * The CUDA stream selected by the innermost CallerStreamGuard active on this
+ * thread, or std::nullopt if none is active.
+ *
+ * This reports only a stream the caller explicitly selected, so a backend can
+ * honor that choice or fall back to its own default. It is backend-neutral: any
+ * CUDA backend (e.g. the CUDA/AOTI delegate and the TensorRT delegate) can
+ * consult it, so a single caller-provided stream -- including a CUDA
+ * green-context stream -- can drive several delegates in one program.
+ */
+EXECUTORCH_EXTENSION_CUDA_API std::optional<cudaStream_t> getCallerStream();
+
+/**
+ * Scopes, for the calling thread, the CUDA stream a backend should run on, and
+ * restores the previous selection on destruction. Scope it on the thread that
+ * runs the call; the selection is one value per thread.
+ *
+ * A stream created with cuGreenCtxStreamCreate confines work to that green
+ * context's SM partition; the confinement rides the stream, so the green
+ * context need not be made current. The caller owns the stream for the guard's
+ * lifetime.
+ */
+class EXECUTORCH_EXTENSION_CUDA_API CallerStreamGuard {
+ public:
+  explicit CallerStreamGuard(cudaStream_t stream);
+  ~CallerStreamGuard();
+  CallerStreamGuard(const CallerStreamGuard&) = delete;
+  CallerStreamGuard& operator=(const CallerStreamGuard&) = delete;
+  CallerStreamGuard(CallerStreamGuard&&) = delete;
+  CallerStreamGuard& operator=(CallerStreamGuard&&) = delete;
+
+ private:
+  std::optional<cudaStream_t> previous_;
+};
+
+// std::optional<cudaStream_t> is trivially copyable (asserted below), so it
+// crosses the shared-library boundary unaffected by the libstdc++ CXX11 ABI,
+// which only changes the layout of types like std::string and std::list.
+static_assert(std::is_trivially_copyable_v<std::optional<cudaStream_t>>);
+
+} // namespace executorch::extension::cuda
diff --git a/extension/cuda/export.h b/extension/cuda/export.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+// extension_cuda is a shared library so the caller-stream thread-local has a
+// single definition across every shared object in the process; a static copy
+// linked into two .so's would create two thread-locals and silently break the
+// handshake. These macros export the public symbols from that one library.
+#if defined(_WIN32)
+#if defined(EXECUTORCH_EXTENSION_CUDA_BUILDING)
+#define EXECUTORCH_EXTENSION_CUDA_API __declspec(dllexport)
+#else
+#define EXECUTORCH_EXTENSION_CUDA_API __declspec(dllimport)
+#endif
+#else
+#define EXECUTORCH_EXTENSION_CUDA_API __attribute__((visibility("default")))
+#endif
diff --git a/extension/cuda/targets.bzl b/extension/cuda/targets.bzl
@@ -0,0 +1,38 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    """Defines targets that should be shared between fbcode and xplat.
+
+    The directory containing this targets.bzl file should also contain both
+    TARGETS and BUCK files that call this function.
+    """
+
+    # Backend-neutral: both the CUDA and TensorRT delegates can depend on it to
+    # share a caller's stream. The caller-stream thread-local must be one
+    # instance per process, so the main target stays shareable: OSS cxx_library
+    # defaults force_static=True, which would duplicate the thread-local into
+    # every dependent shared object (see export.h). The :caller_stream_static
+    # variant stays available for fully-static consumers.
+    runtime.cxx_library(
+        name = "caller_stream",
+        srcs = [
+            "caller_stream.cpp",
+        ],
+        exported_headers = [
+            "caller_stream.h",
+            "export.h",
+        ],
+        # Opt out of the OSS force_static default so consumers *can* link one
+        # shared instance and keep the thread-local unique (see above); the
+        # wrapper pins preferred_linkage="any", so this allows shared linkage
+        # rather than forcing it.
+        force_static = False,
+        # dllexport branch of export.h when building this lib; inert off Windows.
+        preprocessor_flags = [
+            "-DEXECUTORCH_EXTENSION_CUDA_BUILDING",
+        ],
+        visibility = ["PUBLIC"],
+        external_deps = [
+            ("cuda", None, "cuda-lazy"),
+        ],
+    )