Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ cmake-android-out/
cmake-ios-out/
cmake-out*
cmake-out-android/
backends/webgpu/third-party/
build-android/
build-x86/
build-hexagon/
Expand Down
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1154,6 +1154,11 @@ if(EXECUTORCH_BUILD_VULKAN)
list(APPEND _executorch_backends vulkan_backend vulkan_schema)
endif()

# WebGPU delegate (prototype). Reuses the Vulkan FlatBuffer serialization
# format; see backends/webgpu/README.md for status and limitations.
if(EXECUTORCH_BUILD_WEBGPU)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/webgpu)
list(APPEND _executorch_backends webgpu_backend)
endif()

if(EXECUTORCH_BUILD_VGF)
list(APPEND _executorch_backends vgf_backend)
endif()
Expand Down
2 changes: 1 addition & 1 deletion backends/vulkan/cmake/ShaderLibrary.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ endif()

find_program(GLSLC_PATH glslc PATHS $ENV{PATH})

if(NOT GLSLC_PATH)
if(NOT GLSLC_PATH AND EXECUTORCH_BUILD_VULKAN)
message(
FATAL_ERROR
"glslc from the Vulkan SDK must be installed to build the Vulkan backend. "
Expand Down
124 changes: 124 additions & 0 deletions backends/webgpu/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

cmake_minimum_required(VERSION 3.19)

# Support standalone configuration of this directory: default EXECUTORCH_ROOT
# to the repository root (two levels up) when the parent did not set it.
if(NOT EXECUTORCH_ROOT)
set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
endif()

include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)

# Ensure vulkan_schema is available even when EXECUTORCH_BUILD_VULKAN is OFF.
# The WebGPU backend reuses the Vulkan FlatBuffer serialization format.
# NOTE(review): this pulls in the full backends/vulkan CMakeLists.txt, not
# just the schema target — confirm that the Vulkan build gates its shader
# tooling (e.g. glslc) and vulkan_backend behind EXECUTORCH_BUILD_VULKAN so
# this include stays schema-only in effect.
if(NOT TARGET vulkan_schema)
# We need the schema generation from the Vulkan backend. Build only the
# schema target by including the Vulkan CMakeLists.txt. The full Vulkan
# backend will only build if EXECUTORCH_BUILD_VULKAN is ON (which gates the
# vulkan_backend target), but vulkan_schema is unconditionally defined.
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../vulkan
${CMAKE_CURRENT_BINARY_DIR}/_vulkan_schema
)
Comment on lines +15 to +25
Copy link

Copilot AI Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add_subdirectory(../vulkan ...) pulls in the full Vulkan CMakeLists.txt, which currently unconditionally builds shader libraries and the vulkan_backend target (not just vulkan_schema). This makes EXECUTORCH_BUILD_WEBGPU=ON effectively require the full Vulkan toolchain (e.g., glslc) and can also introduce duplicate backend registration. Consider factoring vulkan_schema into a standalone CMake include, or adding a schema-only mode/guards in backends/vulkan/CMakeLists.txt so including it here does not build the full Vulkan backend.

Suggested change
# Ensure vulkan_schema is available even when EXECUTORCH_BUILD_VULKAN is OFF.
# The WebGPU backend reuses the Vulkan FlatBuffer serialization format.
if(NOT TARGET vulkan_schema)
# We need the schema generation from the Vulkan backend. Build only the
# schema target by including the Vulkan CMakeLists.txt. The full Vulkan
# backend will only build if EXECUTORCH_BUILD_VULKAN is ON (which gates the
# vulkan_backend target), but vulkan_schema is unconditionally defined.
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../vulkan
${CMAKE_CURRENT_BINARY_DIR}/_vulkan_schema
)
# WebGPU reuses the Vulkan FlatBuffer serialization format and therefore
# requires the vulkan_schema target to be defined before this file is
# processed. Do not pull in ../vulkan here with add_subdirectory(), because
# that imports the full Vulkan backend build and can introduce extra
# toolchain requirements (for example shader compilation tools) as well as
# duplicate backend registration side effects.
if(NOT TARGET vulkan_schema)
message(FATAL_ERROR
"webgpu_backend requires the vulkan_schema target, but it is not "
"available. Provide vulkan_schema before including "
"backends/webgpu/CMakeLists.txt. Do not use add_subdirectory(../vulkan) "
"from here; instead expose vulkan_schema via a schema-only Vulkan CMake "
"include or define vulkan_schema earlier in the build.")

Copilot uses AI. Check for mistakes.
endif()

# WebGPU delegate runtime library. Sources are listed explicitly (no GLOB)
# and directly in add_library — the list is used nowhere else, so an
# intermediate WEBGPU_SRCS variable only added indirection.
add_library(webgpu_backend
    runtime/WebGPUBackend.cpp
    runtime/WebGPUGraph.cpp
    runtime/WebGPUDelegateHeader.cpp
    runtime/WebGPUDevice.cpp
    runtime/ops/OperatorRegistry.cpp
    runtime/ops/add/BinaryOp.cpp
)

# Build-tree include path rooted one level above EXECUTORCH_ROOT
# (presumably so sources can #include <executorch/...> — confirm).
target_include_directories(
    webgpu_backend
    PRIVATE $<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
)

# vulkan_schema supplies the generated FlatBuffer headers for the delegate
# blob; executorch_core supplies the runtime backend interface.
target_link_libraries(webgpu_backend PRIVATE vulkan_schema executorch_core)

# Native build: link against a prebuilt wgpu-native installation. The location
# is a cache variable so users can point at a custom build.
set(WGPU_NATIVE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/third-party/wgpu-native"
    CACHE PATH "Path to wgpu-native installation")

# Fail fast with an actionable message when the prebuilt archive is absent.
if(NOT EXISTS "${WGPU_NATIVE_DIR}/lib/libwgpu_native.a")
  message(FATAL_ERROR
      "wgpu-native not found at ${WGPU_NATIVE_DIR}. "
      "Run: bash backends/webgpu/scripts/setup-wgpu-native.sh")
endif()

# Wrap the prebuilt static archive in an imported target so it can be linked
# by name instead of by raw path/-l flags.
add_library(wgpu_native STATIC IMPORTED)
set_target_properties(wgpu_native PROPERTIES
    IMPORTED_LOCATION "${WGPU_NATIVE_DIR}/lib/libwgpu_native.a"
)

# Expose the wgpu-native headers to build-tree consumers of webgpu_backend
# (BUILD_INTERFACE keeps the absolute path out of the install export).
target_include_directories(webgpu_backend
    PUBLIC $<BUILD_INTERFACE:${WGPU_NATIVE_DIR}/include>
)
target_link_libraries(webgpu_backend PRIVATE wgpu_native)

if(APPLE)
  # System frameworks required by wgpu-native's Metal path on macOS.
  target_link_libraries(webgpu_backend PRIVATE
      "-framework Metal"
      "-framework QuartzCore"
      "-framework CoreGraphics"
      "-framework Foundation"
  )
else()
  # Prefer CMake abstractions over raw library names: Threads::Threads for
  # pthread and CMAKE_DL_LIBS for dlopen support (libm has no such target).
  set(THREADS_PREFER_PTHREAD_FLAG ON)
  find_package(Threads REQUIRED)
  target_link_libraries(webgpu_backend PRIVATE ${CMAKE_DL_LIBS} m Threads::Threads)
endif()

# Build with exception support enabled.
target_compile_options(webgpu_backend PRIVATE -fexceptions)

# Link with --whole-archive for static registration of backend + ops.
executorch_target_link_options_shared_lib(webgpu_backend)

set_property(TARGET webgpu_backend PROPERTY CXX_STANDARD 17)

# Install the backend library and add it to the exported target set so
# downstream CMake packages can link against it.
install(
TARGETS webgpu_backend
EXPORT ExecuTorchTargets
DESTINATION ${CMAKE_INSTALL_LIBDIR}
)

# Native test target: standalone executable exercising the WebGPU backend
# (see test/test_webgpu_native.cpp). Gated behind its own option.
if(EXECUTORCH_BUILD_WEBGPU_TEST)
  add_executable(webgpu_native_test test/test_webgpu_native.cpp)

  target_include_directories(webgpu_native_test
      PRIVATE
      $<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
      "${WGPU_NATIVE_DIR}/include"
  )

  # Runtime extensions and portable kernel libraries the test links against.
  target_link_libraries(webgpu_native_test
      PRIVATE
      webgpu_backend
      wgpu_native
      executorch_core
      extension_module_static
      extension_data_loader
      extension_tensor
      portable_kernels
      portable_ops_lib
  )

  if(APPLE)
    # System frameworks required on macOS (Metal path of wgpu-native).
    target_link_libraries(webgpu_native_test PRIVATE
        "-framework Metal"
        "-framework QuartzCore"
        "-framework CoreGraphics"
    )
  else()
    # Prefer CMake abstractions over raw -l names: Threads::Threads for
    # pthread and CMAKE_DL_LIBS for dlopen support (libm has no such target).
    set(THREADS_PREFER_PTHREAD_FLAG ON)
    find_package(Threads REQUIRED)
    target_link_libraries(webgpu_native_test PRIVATE ${CMAKE_DL_LIBS} m Threads::Threads)
  endif()

  # Build with exception support enabled, matching webgpu_backend.
  target_compile_options(webgpu_native_test PRIVATE -fexceptions)
  set_property(TARGET webgpu_native_test PROPERTY CXX_STANDARD 17)
endif()
113 changes: 113 additions & 0 deletions backends/webgpu/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# WebGPU Backend

Run ExecuTorch models on the GPU via [WebGPU](https://www.w3.org/TR/webgpu/). The backend compiles delegated subgraphs into WGSL compute shaders executed natively through [wgpu-native](https://github.com/gfx-rs/wgpu-native) (Metal on macOS, Vulkan on Linux). Windows is not supported yet in this prototype.
Copy link

Copilot AI Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The README states wgpu-native runs on "Vulkan on Linux/Windows", but both the setup script (setup-wgpu-native.sh) and the CMake link logic only handle macOS/Linux (no Windows zip selection, and links dl m pthread in the non-APPLE branch). Either add Windows support or clarify in the README that Windows is not supported yet for this prototype.

Suggested change
Run ExecuTorch models on the GPU via [WebGPU](https://www.w3.org/TR/webgpu/). The backend compiles delegated subgraphs into WGSL compute shaders executed natively through [wgpu-native](https://github.com/gfx-rs/wgpu-native) (Metal on macOS, Vulkan on Linux/Windows).
Run ExecuTorch models on the GPU via [WebGPU](https://www.w3.org/TR/webgpu/). The backend compiles delegated subgraphs into WGSL compute shaders executed natively through [wgpu-native](https://github.com/gfx-rs/wgpu-native) (Metal on macOS, Vulkan on Linux). Windows is not supported yet in this prototype.

Copilot uses AI. Check for mistakes.

> **Status: Prototype.** The backend supports a single operator today and is under active development. See [TODO.md](TODO.md) for the roadmap.

## Architecture

```
PyTorch model
│ torch.export
Exported Program
│ VulkanPartitioner (tags supported fp32 ops)
Edge Dialect IR
│ VulkanBackend.preprocess (builds Vulkan FlatBuffer, buffer-only storage)
.pte file (with VH00/VK00 delegate blob)
Native runtime (wgpu-native → Metal / Vulkan)
│ WebGPUGraph::build → creates GPU buffers, pipelines, bind groups
│ WebGPUGraph::execute → encodes + submits compute passes
GPU output (mapped back to CPU via wgpuDevicePoll)
```

Key design choices:
- **Reuses Vulkan serialization** — the delegate blob is a Vulkan FlatBuffer (`VK00`) with a `VH00` header. All tensor storage is forced to `BUFFER` (WebGPU has no 3D storage textures).
- **Built-in WGSL shaders** — shader source is compiled as C++ string constants. Future work will embed fused shaders in the FlatBuffer for compile-time mega-kernel fusion.
- **No Python AOT code** — directly consumes .pte files exported via `VulkanPartitioner`.

## Operator Support

| Operator | WGSL Shader | Notes |
|---|---|---|
| `aten.add.Tensor` | `binary_add.wgsl` | Element-wise with alpha: `out = in1 + alpha * in2` |

**Planned:** `sub`, `mul`, `relu`, `linear` (matmul), `softmax`, `layer_norm`

## Quick Start

### 1. Setup

```bash
bash backends/webgpu/scripts/setup-wgpu-native.sh
```

This downloads prebuilt wgpu-native binaries for your platform.

### 2. Export a model

```python
import torch
from executorch.backends.vulkan import VulkanPartitioner
from executorch.exir import to_edge_transform_and_lower

class AddModule(torch.nn.Module):
def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
return a + b

ep = torch.export.export(AddModule(), (torch.randn(4, 4), torch.randn(4, 4)))
et_program = to_edge_transform_and_lower(
ep, partitioner=[VulkanPartitioner()]
).to_executorch()

with open("add.pte", "wb") as f:
f.write(et_program.buffer)
```

### 3. Build and run

```bash
bash backends/webgpu/test/test_build_webgpu.sh
```

This runs Python export tests, exports a .pte, builds the native runtime, and validates GPU output.

## Directory Structure

```
backends/webgpu/
├── CMakeLists.txt
├── README.md
├── TODO.md
├── runtime/
│ ├── WebGPUBackend.h/cpp # BackendInterface (init/execute)
│ ├── WebGPUGraph.h/cpp # GPU graph: buffers, pipelines, dispatch
│ ├── WebGPUDelegateHeader.h/cpp # VH00 header parser
│ ├── WebGPUDevice.h/cpp # wgpu-native device abstraction
│ └── ops/
│ ├── OperatorRegistry.h/cpp # Op dispatch table
│ └── add/
│ ├── BinaryOp.cpp # aten.add.Tensor implementation
│ ├── binary_add.wgsl # WGSL shader source
│ └── binary_add_wgsl.h # Shader as C++ string constant
├── scripts/
│ └── setup-wgpu-native.sh # Download wgpu-native binaries
└── test/
├── conftest.py
├── test_build_webgpu.sh # End-to-end build + test
├── test_webgpu_native.cpp # C++ native test runner
└── ops/
└── add/
└── test_add.py # Python export tests
```

## Requirements

- **macOS**: Metal-capable GPU
- **Linux**: Vulkan-capable GPU + drivers
- **Build**: CMake 3.19+, conda environment with ExecuTorch installed
39 changes: 39 additions & 0 deletions backends/webgpu/TODO.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# WebGPU Backend — TODO

## Current State (Prototype)
- Single op: `aten.add.Tensor` (fp32, buffer storage)
- No Python AOT code — directly consumes Vulkan delegate (.pte exported via VulkanPartitioner)
- Reuses Vulkan FlatBuffer format (VH00 header + VK00 payload)
- Registers as `"VulkanBackend"` at runtime — mutually exclusive with Vulkan backend at link time
- Built-in WGSL shaders (not embedded in .pte)

## Architecture
```
VulkanPartitioner (Python) → VkGraphBuilder → VK00 FlatBuffer → .pte
→ WebGPU Runtime: registers as "VulkanBackend", parses VH00/VK00
→ WebGPUGraph::build → GPU buffers/pipelines/bind groups
→ WebGPUGraph::execute → encode + submit compute passes
```

Adding a new op requires only C++ runtime work:
1. WGSL shader + header
2. C++ op implementation (read args from VkGraph, create pipeline, record dispatch)
3. Register in CMakeLists.txt
4. Test with VulkanPartitioner export

## Performance: Command Encoding Overhead
WebGPU `GPUCommandBuffer` is single-use (no equivalent to Vulkan's cached command lists).
Per-dispatch API call cost adds up for large graphs.

**Primary mitigation: mega-kernel fusion.** Generate fused WGSL shaders for chains of
element-wise ops (add→relu→mul→clamp) at compile time. Embed via the existing
`shaders: [VkBytes]` field in schema.fbs.

## Next Steps
1. **More ops**: sub, mul, relu, linear (matmul), softmax, layer_norm
2. **fp16 support**: Feature-detect `shader-f16`, fallback to fp32
3. **Buffer pooling**: Reuse GPU buffers to avoid OOM at scale
4. **Pipeline caching**: Cache compiled pipelines across runs
5. **Profiling**: Wire WebGPU timestamp queries into ETDump/EventTracer
6. **LLM support**: KV cache management, Flash Attention in WGSL, quantized ops (int4/int8)
7. **Browser/JS runtime**: Emscripten build, JS harness, browser test page
Loading
Loading