Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ cmake-android-out/
cmake-ios-out/
cmake-out*
cmake-out-android/
backends/webgpu/third-party/
build-android/
build-x86/
build-hexagon/
Expand Down
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1154,6 +1154,11 @@ if(EXECUTORCH_BUILD_VULKAN)
list(APPEND _executorch_backends vulkan_backend vulkan_schema)
endif()

# WebGPU delegate (prototype). Reuses the Vulkan FlatBuffer serialization
# format; see backends/webgpu/README.md for status and limitations.
if(EXECUTORCH_BUILD_WEBGPU)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/webgpu)
list(APPEND _executorch_backends webgpu_backend)
endif()

if(EXECUTORCH_BUILD_VGF)
list(APPEND _executorch_backends vgf_backend)
endif()
Expand Down
2 changes: 1 addition & 1 deletion backends/vulkan/cmake/ShaderLibrary.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ endif()

find_program(GLSLC_PATH glslc PATHS $ENV{PATH})

if(NOT GLSLC_PATH)
if(NOT GLSLC_PATH AND EXECUTORCH_BUILD_VULKAN)
message(
FATAL_ERROR
"glslc from the Vulkan SDK must be installed to build the Vulkan backend. "
Expand Down
124 changes: 124 additions & 0 deletions backends/webgpu/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

cmake_minimum_required(VERSION 3.19)

# Support standalone configuration of this directory: default EXECUTORCH_ROOT
# to the repository root (two levels up) when the parent did not set it.
if(NOT EXECUTORCH_ROOT)
set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
endif()

include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)

# Ensure vulkan_schema is available even when EXECUTORCH_BUILD_VULKAN is OFF.
# The WebGPU backend reuses the Vulkan FlatBuffer serialization format.
# NOTE(review): this pulls in the full backends/vulkan CMakeLists.txt, not
# just the schema target — confirm that the Vulkan build gates its shader
# tooling (e.g. glslc) and vulkan_backend behind EXECUTORCH_BUILD_VULKAN so
# this include stays schema-only in effect.
if(NOT TARGET vulkan_schema)
# We need the schema generation from the Vulkan backend. Build only the
# schema target by including the Vulkan CMakeLists.txt. The full Vulkan
# backend will only build if EXECUTORCH_BUILD_VULKAN is ON (which gates the
# vulkan_backend target), but vulkan_schema is unconditionally defined.
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../vulkan
${CMAKE_CURRENT_BINARY_DIR}/_vulkan_schema
)
Comment on lines +15 to +25
Copy link

Copilot AI Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add_subdirectory(../vulkan ...) pulls in the full Vulkan CMakeLists.txt, which currently unconditionally builds shader libraries and the vulkan_backend target (not just vulkan_schema). This makes EXECUTORCH_BUILD_WEBGPU=ON effectively require the full Vulkan toolchain (e.g., glslc) and can also introduce duplicate backend registration. Consider factoring vulkan_schema into a standalone CMake include, or adding a schema-only mode/guards in backends/vulkan/CMakeLists.txt so including it here does not build the full Vulkan backend.

Suggested change
# Ensure vulkan_schema is available even when EXECUTORCH_BUILD_VULKAN is OFF.
# The WebGPU backend reuses the Vulkan FlatBuffer serialization format.
if(NOT TARGET vulkan_schema)
# We need the schema generation from the Vulkan backend. Build only the
# schema target by including the Vulkan CMakeLists.txt. The full Vulkan
# backend will only build if EXECUTORCH_BUILD_VULKAN is ON (which gates the
# vulkan_backend target), but vulkan_schema is unconditionally defined.
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../vulkan
${CMAKE_CURRENT_BINARY_DIR}/_vulkan_schema
)
# WebGPU reuses the Vulkan FlatBuffer serialization format and therefore
# requires the vulkan_schema target to be defined before this file is
# processed. Do not pull in ../vulkan here with add_subdirectory(), because
# that imports the full Vulkan backend build and can introduce extra
# toolchain requirements (for example shader compilation tools) as well as
# duplicate backend registration side effects.
if(NOT TARGET vulkan_schema)
message(FATAL_ERROR
"webgpu_backend requires the vulkan_schema target, but it is not "
"available. Provide vulkan_schema before including "
"backends/webgpu/CMakeLists.txt. Do not use add_subdirectory(../vulkan) "
"from here; instead expose vulkan_schema via a schema-only Vulkan CMake "
"include or define vulkan_schema earlier in the build.")

Copilot uses AI. Check for mistakes.
endif()

# WebGPU delegate runtime library. Sources are listed explicitly (no GLOB)
# and directly in add_library — the list is used nowhere else, so an
# intermediate WEBGPU_SRCS variable only added indirection.
add_library(webgpu_backend
    runtime/WebGPUBackend.cpp
    runtime/WebGPUGraph.cpp
    runtime/WebGPUDelegateHeader.cpp
    runtime/WebGPUDevice.cpp
    runtime/ops/OperatorRegistry.cpp
    runtime/ops/add/BinaryOp.cpp
)

# Build-tree include path rooted one level above EXECUTORCH_ROOT
# (presumably so sources can #include <executorch/...> — confirm).
target_include_directories(
    webgpu_backend
    PRIVATE $<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
)

# vulkan_schema supplies the generated FlatBuffer headers for the delegate
# blob; executorch_core supplies the runtime backend interface.
target_link_libraries(webgpu_backend PRIVATE vulkan_schema executorch_core)

# Native build: link against a prebuilt wgpu-native installation. The location
# is a cache variable so users can point at a custom build.
set(WGPU_NATIVE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/third-party/wgpu-native"
    CACHE PATH "Path to wgpu-native installation")

# Fail fast with an actionable message when the prebuilt archive is absent.
if(NOT EXISTS "${WGPU_NATIVE_DIR}/lib/libwgpu_native.a")
  message(FATAL_ERROR
      "wgpu-native not found at ${WGPU_NATIVE_DIR}. "
      "Run: bash backends/webgpu/scripts/setup-wgpu-native.sh")
endif()

# Wrap the prebuilt static archive in an imported target so it can be linked
# by name instead of by raw path/-l flags.
add_library(wgpu_native STATIC IMPORTED)
set_target_properties(wgpu_native PROPERTIES
    IMPORTED_LOCATION "${WGPU_NATIVE_DIR}/lib/libwgpu_native.a"
)

# Expose the wgpu-native headers to build-tree consumers of webgpu_backend
# (BUILD_INTERFACE keeps the absolute path out of the install export).
target_include_directories(webgpu_backend
    PUBLIC $<BUILD_INTERFACE:${WGPU_NATIVE_DIR}/include>
)
target_link_libraries(webgpu_backend PRIVATE wgpu_native)

if(APPLE)
  # System frameworks required by wgpu-native's Metal path on macOS.
  target_link_libraries(webgpu_backend PRIVATE
      "-framework Metal"
      "-framework QuartzCore"
      "-framework CoreGraphics"
      "-framework Foundation"
  )
else()
  # Prefer CMake abstractions over raw library names: Threads::Threads for
  # pthread and CMAKE_DL_LIBS for dlopen support (libm has no such target).
  set(THREADS_PREFER_PTHREAD_FLAG ON)
  find_package(Threads REQUIRED)
  target_link_libraries(webgpu_backend PRIVATE ${CMAKE_DL_LIBS} m Threads::Threads)
endif()

# Build with exception support enabled.
target_compile_options(webgpu_backend PRIVATE -fexceptions)

# Link with --whole-archive for static registration of backend + ops.
executorch_target_link_options_shared_lib(webgpu_backend)

set_property(TARGET webgpu_backend PROPERTY CXX_STANDARD 17)

# Install the backend library and add it to the exported target set so
# downstream CMake packages can link against it.
install(
TARGETS webgpu_backend
EXPORT ExecuTorchTargets
DESTINATION ${CMAKE_INSTALL_LIBDIR}
)

# Native test target: standalone executable exercising the WebGPU backend
# (see test/test_webgpu_native.cpp). Gated behind its own option.
if(EXECUTORCH_BUILD_WEBGPU_TEST)
  add_executable(webgpu_native_test test/test_webgpu_native.cpp)

  target_include_directories(webgpu_native_test
      PRIVATE
      $<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
      "${WGPU_NATIVE_DIR}/include"
  )

  # Runtime extensions and portable kernel libraries the test links against.
  target_link_libraries(webgpu_native_test
      PRIVATE
      webgpu_backend
      wgpu_native
      executorch_core
      extension_module_static
      extension_data_loader
      extension_tensor
      portable_kernels
      portable_ops_lib
  )

  if(APPLE)
    # System frameworks required on macOS (Metal path of wgpu-native).
    target_link_libraries(webgpu_native_test PRIVATE
        "-framework Metal"
        "-framework QuartzCore"
        "-framework CoreGraphics"
    )
  else()
    # Prefer CMake abstractions over raw -l names: Threads::Threads for
    # pthread and CMAKE_DL_LIBS for dlopen support (libm has no such target).
    set(THREADS_PREFER_PTHREAD_FLAG ON)
    find_package(Threads REQUIRED)
    target_link_libraries(webgpu_native_test PRIVATE ${CMAKE_DL_LIBS} m Threads::Threads)
  endif()

  # Build with exception support enabled, matching webgpu_backend.
  target_compile_options(webgpu_native_test PRIVATE -fexceptions)
  set_property(TARGET webgpu_native_test PROPERTY CXX_STANDARD 17)
endif()
113 changes: 113 additions & 0 deletions backends/webgpu/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# WebGPU Backend

Run ExecuTorch models on the GPU via [WebGPU](https://www.w3.org/TR/webgpu/). The backend compiles delegated subgraphs into WGSL compute shaders executed natively through [wgpu-native](https://github.com/gfx-rs/wgpu-native) (Metal on macOS, Vulkan on Linux). Windows is not supported yet in this prototype.
Copy link

Copilot AI Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The README states wgpu-native runs on "Vulkan on Linux/Windows", but both the setup script (setup-wgpu-native.sh) and the CMake link logic only handle macOS/Linux (no Windows zip selection, and links dl m pthread in the non-APPLE branch). Either add Windows support or clarify in the README that Windows is not supported yet for this prototype.

Suggested change
Run ExecuTorch models on the GPU via [WebGPU](https://www.w3.org/TR/webgpu/). The backend compiles delegated subgraphs into WGSL compute shaders executed natively through [wgpu-native](https://github.com/gfx-rs/wgpu-native) (Metal on macOS, Vulkan on Linux/Windows).
Run ExecuTorch models on the GPU via [WebGPU](https://www.w3.org/TR/webgpu/). The backend compiles delegated subgraphs into WGSL compute shaders executed natively through [wgpu-native](https://github.com/gfx-rs/wgpu-native) (Metal on macOS, Vulkan on Linux). Windows is not supported yet in this prototype.

Copilot uses AI. Check for mistakes.

> **Status: Prototype.** The backend supports a single operator today and is under active development. See [TODO.md](TODO.md) for the roadmap.

## Architecture

```
PyTorch model
│ torch.export
Exported Program
│ VulkanPartitioner (tags supported fp32 ops)
Edge Dialect IR
│ VulkanBackend.preprocess (builds Vulkan FlatBuffer, buffer-only storage)
.pte file (with VH00/VK00 delegate blob)
Native runtime (wgpu-native → Metal / Vulkan)
│ WebGPUGraph::build → creates GPU buffers, pipelines, bind groups
│ WebGPUGraph::execute → encodes + submits compute passes
GPU output (mapped back to CPU via wgpuDevicePoll)
```

Key design choices:
- **Reuses Vulkan serialization** — the delegate blob is a Vulkan FlatBuffer (`VK00`) with a `VH00` header. All tensor storage is forced to `BUFFER` (WebGPU has no 3D storage textures).
- **Built-in WGSL shaders** — shader source is compiled as C++ string constants. Future work will embed fused shaders in the FlatBuffer for compile-time mega-kernel fusion.
- **No Python AOT code** — directly consumes .pte files exported via `VulkanPartitioner`.

## Operator Support

| Operator | WGSL Shader | Notes |
|---|---|---|
| `aten.add.Tensor` | `binary_add.wgsl` | Element-wise with alpha: `out = in1 + alpha * in2` |

**Planned:** `sub`, `mul`, `relu`, `linear` (matmul), `softmax`, `layer_norm`

## Quick Start

### 1. Setup

```bash
bash backends/webgpu/scripts/setup-wgpu-native.sh
```

This downloads prebuilt wgpu-native binaries for your platform.

### 2. Export a model

```python
import torch
from executorch.backends.vulkan import VulkanPartitioner
from executorch.exir import to_edge_transform_and_lower

class AddModule(torch.nn.Module):
def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
return a + b

ep = torch.export.export(AddModule(), (torch.randn(4, 4), torch.randn(4, 4)))
et_program = to_edge_transform_and_lower(
ep, partitioner=[VulkanPartitioner()]
).to_executorch()

with open("add.pte", "wb") as f:
f.write(et_program.buffer)
```

### 3. Build and run

```bash
bash backends/webgpu/test/test_build_webgpu.sh
```

This runs Python export tests, exports a .pte, builds the native runtime, and validates GPU output.

## Directory Structure

```
backends/webgpu/
├── CMakeLists.txt
├── README.md
├── TODO.md
├── runtime/
│ ├── WebGPUBackend.h/cpp # BackendInterface (init/execute)
│ ├── WebGPUGraph.h/cpp # GPU graph: buffers, pipelines, dispatch
│ ├── WebGPUDelegateHeader.h/cpp # VH00 header parser
│ ├── WebGPUDevice.h/cpp # wgpu-native device abstraction
│ └── ops/
│ ├── OperatorRegistry.h/cpp # Op dispatch table
│ └── add/
│ ├── BinaryOp.cpp # aten.add.Tensor implementation
│ ├── binary_add.wgsl # WGSL shader source
│ └── binary_add_wgsl.h # Shader as C++ string constant
├── scripts/
│ └── setup-wgpu-native.sh # Download wgpu-native binaries
└── test/
├── conftest.py
├── test_build_webgpu.sh # End-to-end build + test
├── test_webgpu_native.cpp # C++ native test runner
└── ops/
└── add/
└── test_add.py # Python export tests
```

## Requirements

- **macOS**: Metal-capable GPU
- **Linux**: Vulkan-capable GPU + drivers
- **Build**: CMake 3.19+, conda environment with ExecuTorch installed
39 changes: 39 additions & 0 deletions backends/webgpu/TODO.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# WebGPU Backend — TODO

## Current State (Prototype)
- Single op: `aten.add.Tensor` (fp32, buffer storage)
- No Python AOT code — directly consumes Vulkan delegate (.pte exported via VulkanPartitioner)
- Reuses Vulkan FlatBuffer format (VH00 header + VK00 payload)
- Registers as `"VulkanBackend"` at runtime — mutually exclusive with Vulkan backend at link time
- Built-in WGSL shaders (not embedded in .pte)

## Architecture
```
VulkanPartitioner (Python) → VkGraphBuilder → VK00 FlatBuffer → .pte
→ WebGPU Runtime: registers as "VulkanBackend", parses VH00/VK00
→ WebGPUGraph::build → GPU buffers/pipelines/bind groups
→ WebGPUGraph::execute → encode + submit compute passes
```

Adding a new op requires only C++ runtime work:
1. WGSL shader + header
2. C++ op implementation (read args from VkGraph, create pipeline, record dispatch)
3. Register in CMakeLists.txt
4. Test with VulkanPartitioner export

## Performance: Command Encoding Overhead
WebGPU `GPUCommandBuffer` is single-use (no equivalent to Vulkan's cached command lists).
Per-dispatch API call cost adds up for large graphs.

**Primary mitigation: mega-kernel fusion.** Generate fused WGSL shaders for chains of
element-wise ops (add→relu→mul→clamp) at compile time. Embed via the existing
`shaders: [VkBytes]` field in schema.fbs.

## Next Steps
1. **More ops**: sub, mul, relu, linear (matmul), softmax, layer_norm
2. **fp16 support**: Feature-detect `shader-f16`, fallback to fp32
3. **Buffer pooling**: Reuse GPU buffers to avoid OOM at scale
4. **Pipeline caching**: Cache compiled pipelines across runs
5. **Profiling**: Wire WebGPU timestamp queries into ETDump/EventTracer
6. **LLM support**: KV cache management, Flash Attention in WGSL, quantized ops (int4/int8)
7. **Browser/JS runtime**: Emscripten build, JS harness, browser test page
Loading
Loading