From ae9482df056036247411db88e80eac9cd50b775e Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Tue, 7 Apr 2026 10:28:02 -0500 Subject: [PATCH 1/9] [WebGPU] Add delegate header parser Parses the VH00/VK00 FlatBuffer envelope from the Vulkan partitioner to extract the serialized graph payload. --- .../webgpu/runtime/WebGPUDelegateHeader.cpp | 99 +++++++++++++++++++ .../webgpu/runtime/WebGPUDelegateHeader.h | 32 ++++++ 2 files changed, 131 insertions(+) create mode 100644 backends/webgpu/runtime/WebGPUDelegateHeader.cpp create mode 100644 backends/webgpu/runtime/WebGPUDelegateHeader.h diff --git a/backends/webgpu/runtime/WebGPUDelegateHeader.cpp b/backends/webgpu/runtime/WebGPUDelegateHeader.cpp new file mode 100644 index 00000000000..d1e8b2110a7 --- /dev/null +++ b/backends/webgpu/runtime/WebGPUDelegateHeader.cpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +#include +#include + +namespace executorch { +namespace backends { +namespace webgpu { + +using executorch::runtime::Error; +using executorch::runtime::Result; + +namespace { + +struct ByteSlice { + size_t offset; + size_t size; +}; + +constexpr size_t kExpectedSize = 30; +constexpr char kExpectedMagic[4] = {'V', 'H', '0', '0'}; + +constexpr ByteSlice kMagic = {4, 4}; +constexpr ByteSlice kHeaderSize = {8, 2}; +constexpr ByteSlice kFlatbufferOffset = {10, 4}; +constexpr ByteSlice kFlatbufferSize = {14, 4}; +constexpr ByteSlice kBytesOffset = {18, 4}; +constexpr ByteSlice kBytesSize = {22, 8}; + +uint64_t getUInt64LE(const uint8_t* data) { + return (uint64_t)data[0] | ((uint64_t)data[1] << 8) | + ((uint64_t)data[2] << 16) | ((uint64_t)data[3] << 24) | + ((uint64_t)data[4] << 32) | ((uint64_t)data[5] << 40) | + ((uint64_t)data[6] << 48) | ((uint64_t)data[7] << 56); +} + +uint32_t getUInt32LE(const uint8_t* data) { + return (uint32_t)data[0] | ((uint32_t)data[1] << 8) | + ((uint32_t)data[2] << 16) | ((uint32_t)data[3] << 24); +} + +uint32_t getUInt16LE(const uint8_t* data) { + return (uint32_t)data[0] | ((uint32_t)data[1] << 8); +} + +} // namespace + +bool WebGPUDelegateHeader::is_valid() const { + if (header_size < kExpectedSize) { + return false; + } + if (flatbuffer_offset < header_size) { + return false; + } + if (flatbuffer_size == 0) { + return false; + } + if (bytes_offset < flatbuffer_offset + flatbuffer_size) { + return false; + } + return true; +} + +Result WebGPUDelegateHeader::parse(const void* data) { + const uint8_t* header_data = (const uint8_t*)data; + + const uint8_t* magic_start = header_data + kMagic.offset; + if (std::memcmp(magic_start, kExpectedMagic, kMagic.size) != 0) { + return Error::NotFound; + } + + WebGPUDelegateHeader header = WebGPUDelegateHeader{ + getUInt16LE(header_data + kHeaderSize.offset), + getUInt32LE(header_data + kFlatbufferOffset.offset), + getUInt32LE(header_data + kFlatbufferSize.offset), + 
getUInt32LE(header_data + kBytesOffset.offset), + getUInt64LE(header_data + kBytesSize.offset), + }; + + if (!header.is_valid()) { + return Error::InvalidArgument; + } + + return header; +} + +} // namespace webgpu +} // namespace backends +} // namespace executorch diff --git a/backends/webgpu/runtime/WebGPUDelegateHeader.h b/backends/webgpu/runtime/WebGPUDelegateHeader.h new file mode 100644 index 00000000000..6f2f65130c7 --- /dev/null +++ b/backends/webgpu/runtime/WebGPUDelegateHeader.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace executorch { +namespace backends { +namespace webgpu { + +struct WebGPUDelegateHeader { + bool is_valid() const; + + static executorch::runtime::Result parse( + const void* data); + + uint32_t header_size; + uint32_t flatbuffer_offset; + uint32_t flatbuffer_size; + uint32_t bytes_offset; + uint64_t bytes_size; +}; + +} // namespace webgpu +} // namespace backends +} // namespace executorch From 576afdce78d85898edafa59bf2d59c6cb3cdae11 Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Tue, 7 Apr 2026 10:28:02 -0500 Subject: [PATCH 2/9] [WebGPU] Add operator registry and aten.add shader Operator registry with registration macros, WGSL binary-add shader (plus inline C++ header), and the aten.add.Tensor implementation that creates a compute pipeline and records dispatch. 
--- .../webgpu/runtime/ops/OperatorRegistry.cpp | 43 +++++ .../webgpu/runtime/ops/OperatorRegistry.h | 62 +++++++ backends/webgpu/runtime/ops/add/BinaryOp.cpp | 168 ++++++++++++++++++ .../webgpu/runtime/ops/add/binary_add.wgsl | 18 ++ .../webgpu/runtime/ops/add/binary_add_wgsl.h | 41 +++++ 5 files changed, 332 insertions(+) create mode 100644 backends/webgpu/runtime/ops/OperatorRegistry.cpp create mode 100644 backends/webgpu/runtime/ops/OperatorRegistry.h create mode 100644 backends/webgpu/runtime/ops/add/BinaryOp.cpp create mode 100644 backends/webgpu/runtime/ops/add/binary_add.wgsl create mode 100644 backends/webgpu/runtime/ops/add/binary_add_wgsl.h diff --git a/backends/webgpu/runtime/ops/OperatorRegistry.cpp b/backends/webgpu/runtime/ops/OperatorRegistry.cpp new file mode 100644 index 00000000000..ddb16a45b97 --- /dev/null +++ b/backends/webgpu/runtime/ops/OperatorRegistry.cpp @@ -0,0 +1,43 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +namespace executorch { +namespace backends { +namespace webgpu { + +bool OperatorRegistry::has_op(const std::string& name) { + return table_.count(name) > 0; +} + +OpFunction& OperatorRegistry::get_op_fn(const std::string& name) { + const auto it = table_.find(name); + if (it == table_.end()) { + throw std::runtime_error( + "WebGPU OperatorRegistry: could not find operator: " + name); + } + return it->second; +} + +void OperatorRegistry::register_op( + const std::string& name, + const OpFunction& fn) { + table_.insert(std::make_pair(name, fn)); +} + +OperatorRegistry& webgpu_operator_registry() { + static OperatorRegistry registry; + return registry; +} + +} // namespace webgpu +} // namespace backends +} // namespace executorch diff --git a/backends/webgpu/runtime/ops/OperatorRegistry.h b/backends/webgpu/runtime/ops/OperatorRegistry.h new file mode 100644 index 00000000000..2e09b6f8140 --- /dev/null +++ b/backends/webgpu/runtime/ops/OperatorRegistry.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace executorch { +namespace backends { +namespace webgpu { + +class WebGPUGraph; + +using OpFunction = + std::function&)>; + +class OperatorRegistry final { + using OpTable = std::unordered_map; + OpTable table_; + + public: + bool has_op(const std::string& name); + OpFunction& get_op_fn(const std::string& name); + void register_op(const std::string& name, const OpFunction& fn); +}; + +class OperatorRegisterInit final { + using InitFn = void(); + + public: + explicit OperatorRegisterInit(InitFn* init_fn) { + init_fn(); + } +}; + +OperatorRegistry& webgpu_operator_registry(); + +#define WEBGPU_REGISTER_OP(name, function) \ + ::executorch::backends::webgpu::webgpu_operator_registry() \ + .register_op( \ + #name, \ + std::bind( \ + &function, std::placeholders::_1, \ + std::placeholders::_2)) + +#define WEBGPU_REGISTER_OPERATORS \ + static void register_webgpu_ops(); \ + static const ::executorch::backends::webgpu::OperatorRegisterInit \ + webgpu_reg(®ister_webgpu_ops); \ + static void register_webgpu_ops() + +} // namespace webgpu +} // namespace backends +} // namespace executorch diff --git a/backends/webgpu/runtime/ops/add/BinaryOp.cpp b/backends/webgpu/runtime/ops/add/BinaryOp.cpp new file mode 100644 index 00000000000..0fbde4c6997 --- /dev/null +++ b/backends/webgpu/runtime/ops/add/BinaryOp.cpp @@ -0,0 +1,168 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#include + +#include +#include + +namespace executorch { +namespace backends { +namespace webgpu { + +namespace { + +// Uniform buffer layout matching the WGSL Params struct. +// Must be 16-byte aligned for WebGPU uniform buffer requirements. 
+struct AddParams { + uint32_t num_elements; + float alpha; + uint32_t _pad[2]; // pad to 16 bytes +}; + +void add_impl(WebGPUGraph& graph, const std::vector& args) { + // aten.add.Tensor args: [in1, in2, alpha, out] + const int in1_id = args.at(0); + const int in2_id = args.at(1); + const int alpha_id = args.at(2); + const int out_id = args.at(3); + + WGPUDevice device = graph.device(); + + // Get alpha value (defaults to 1.0 if not a scalar) + float alpha = 1.0f; + if (graph.get_value_type(alpha_id) == WebGPUGraph::ValueType::Int) { + alpha = static_cast(graph.get_int(alpha_id)); + } else if (graph.get_value_type(alpha_id) == WebGPUGraph::ValueType::Double) { + alpha = static_cast(graph.get_double(alpha_id)); + } + + const auto& out_tensor = graph.get_tensor(out_id); + uint32_t num_elements = + static_cast(out_tensor.nbytes / sizeof(float)); + + // Create uniform buffer for params + AddParams params = {}; + params.num_elements = num_elements; + params.alpha = alpha; + + WGPUBufferDescriptor uniform_desc = {}; + uniform_desc.size = sizeof(AddParams); + uniform_desc.usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst; + uniform_desc.mappedAtCreation = true; + WGPUBuffer uniform_buffer = wgpuDeviceCreateBuffer(device, &uniform_desc); + void* mapped = wgpuBufferGetMappedRange(uniform_buffer, 0, sizeof(AddParams)); + std::memcpy(mapped, ¶ms, sizeof(AddParams)); + wgpuBufferUnmap(uniform_buffer); + + // Create shader module from built-in WGSL source + WGPUShaderSourceWGSL wgsl_desc = {}; + wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL; + wgsl_desc.code = {kBinaryAddWGSL, WGPU_STRLEN}; + + WGPUShaderModuleDescriptor shader_desc = {}; + shader_desc.nextInChain = &wgsl_desc.chain; + WGPUShaderModule shader = wgpuDeviceCreateShaderModule(device, &shader_desc); + + // Create bind group layout: 3 storage buffers + 1 uniform + WGPUBindGroupLayoutEntry entries[4] = {}; + + // input1 - storage buffer, read-only + entries[0].binding = 0; + entries[0].visibility = 
WGPUShaderStage_Compute; + entries[0].buffer.type = WGPUBufferBindingType_ReadOnlyStorage; + + // input2 - storage buffer, read-only + entries[1].binding = 1; + entries[1].visibility = WGPUShaderStage_Compute; + entries[1].buffer.type = WGPUBufferBindingType_ReadOnlyStorage; + + // output - storage buffer, read-write + entries[2].binding = 2; + entries[2].visibility = WGPUShaderStage_Compute; + entries[2].buffer.type = WGPUBufferBindingType_Storage; + + // params - uniform buffer + entries[3].binding = 3; + entries[3].visibility = WGPUShaderStage_Compute; + entries[3].buffer.type = WGPUBufferBindingType_Uniform; + + WGPUBindGroupLayoutDescriptor bgl_desc = {}; + bgl_desc.entryCount = 4; + bgl_desc.entries = entries; + WGPUBindGroupLayout bgl = + wgpuDeviceCreateBindGroupLayout(device, &bgl_desc); + + // Create pipeline layout + WGPUPipelineLayoutDescriptor pl_desc = {}; + pl_desc.bindGroupLayoutCount = 1; + pl_desc.bindGroupLayouts = &bgl; + WGPUPipelineLayout pipeline_layout = + wgpuDeviceCreatePipelineLayout(device, &pl_desc); + + // Create compute pipeline + WGPUComputePipelineDescriptor pipeline_desc = {}; + pipeline_desc.layout = pipeline_layout; + pipeline_desc.compute.module = shader; + pipeline_desc.compute.entryPoint = {"main", WGPU_STRLEN}; + WGPUComputePipeline pipeline = + wgpuDeviceCreateComputePipeline(device, &pipeline_desc); + + // Create bind group with actual buffers + const auto& in1_tensor = graph.get_tensor(in1_id); + const auto& in2_tensor = graph.get_tensor(in2_id); + + WGPUBindGroupEntry bg_entries[4] = {}; + + bg_entries[0].binding = 0; + bg_entries[0].buffer = in1_tensor.buffer; + bg_entries[0].size = in1_tensor.nbytes; + + bg_entries[1].binding = 1; + bg_entries[1].buffer = in2_tensor.buffer; + bg_entries[1].size = in2_tensor.nbytes; + + bg_entries[2].binding = 2; + bg_entries[2].buffer = out_tensor.buffer; + bg_entries[2].size = out_tensor.nbytes; + + bg_entries[3].binding = 3; + bg_entries[3].buffer = uniform_buffer; + 
bg_entries[3].size = sizeof(AddParams); + + WGPUBindGroupDescriptor bg_desc = {}; + bg_desc.layout = bgl; + bg_desc.entryCount = 4; + bg_desc.entries = bg_entries; + WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device, &bg_desc); + + uint32_t workgroup_count = + (num_elements + kBinaryAddWorkgroupSize - 1) / kBinaryAddWorkgroupSize; + + graph.add_dispatch({pipeline, bind_group, workgroup_count}); + + // Release intermediate objects (pipeline and bind_group are kept by dispatch) + wgpuShaderModuleRelease(shader); + wgpuBindGroupLayoutRelease(bgl); + wgpuPipelineLayoutRelease(pipeline_layout); + // uniform_buffer is kept alive by the bind group +} + +} // namespace + +WEBGPU_REGISTER_OPERATORS { + WEBGPU_REGISTER_OP(aten.add.Tensor, add_impl); +} + +} // namespace webgpu +} // namespace backends +} // namespace executorch diff --git a/backends/webgpu/runtime/ops/add/binary_add.wgsl b/backends/webgpu/runtime/ops/add/binary_add.wgsl new file mode 100644 index 00000000000..4d5ec97e6d3 --- /dev/null +++ b/backends/webgpu/runtime/ops/add/binary_add.wgsl @@ -0,0 +1,18 @@ +@group(0) @binding(0) var input1: array; +@group(0) @binding(1) var input2: array; +@group(0) @binding(2) var output: array; + +struct Params { + num_elements: u32, + alpha: f32, +} +@group(0) @binding(3) var params: Params; + +@compute @workgroup_size(256) +fn main(@builtin(global_invocation_id) gid: vec3) { + let idx = gid.x; + if (idx >= params.num_elements) { + return; + } + output[idx] = input1[idx] + params.alpha * input2[idx]; +} diff --git a/backends/webgpu/runtime/ops/add/binary_add_wgsl.h b/backends/webgpu/runtime/ops/add/binary_add_wgsl.h new file mode 100644 index 00000000000..cd94625dbdf --- /dev/null +++ b/backends/webgpu/runtime/ops/add/binary_add_wgsl.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +namespace executorch { +namespace backends { +namespace webgpu { + +// WGSL shader source for element-wise add: output = input1 + alpha * input2 +inline constexpr const char* kBinaryAddWGSL = R"( +@group(0) @binding(0) var input1: array; +@group(0) @binding(1) var input2: array; +@group(0) @binding(2) var output: array; + +struct Params { + num_elements: u32, + alpha: f32, +} +@group(0) @binding(3) var params: Params; + +@compute @workgroup_size(256) +fn main(@builtin(global_invocation_id) gid: vec3) { + let idx = gid.x; + if (idx >= params.num_elements) { + return; + } + output[idx] = input1[idx] + params.alpha * input2[idx]; +} +)"; + +inline constexpr uint32_t kBinaryAddWorkgroupSize = 256; + +} // namespace webgpu +} // namespace backends +} // namespace executorch From 9548f74939e25833daa682198773b340fb0c682e Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Tue, 7 Apr 2026 10:28:02 -0500 Subject: [PATCH 3/9] [WebGPU] Add compute graph Buffer management, pipeline creation, and compute dispatch. Parses the Vulkan FlatBuffer delegate blob and builds a runnable graph of compute passes. --- backends/webgpu/runtime/WebGPUGraph.cpp | 305 ++++++++++++++++++++++++ backends/webgpu/runtime/WebGPUGraph.h | 119 +++++++++ 2 files changed, 424 insertions(+) create mode 100644 backends/webgpu/runtime/WebGPUGraph.cpp create mode 100644 backends/webgpu/runtime/WebGPUGraph.h diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp new file mode 100644 index 00000000000..99b570ad7e5 --- /dev/null +++ b/backends/webgpu/runtime/WebGPUGraph.cpp @@ -0,0 +1,305 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +#include + +#include +#include + +namespace executorch { +namespace backends { +namespace webgpu { + +// vkgraph namespace is declared at global scope in the generated FlatBuffer header + +namespace { + +size_t vk_datatype_size(vkgraph::VkDataType dtype) { + switch (dtype) { + case vkgraph::VkDataType::BOOL: + case vkgraph::VkDataType::UINT8: + case vkgraph::VkDataType::INT8: + return 1; + case vkgraph::VkDataType::FLOAT16: + return 2; + case vkgraph::VkDataType::INT32: + case vkgraph::VkDataType::FLOAT32: + return 4; + case vkgraph::VkDataType::INT64: + case vkgraph::VkDataType::FLOAT64: + return 8; + default: + return 0; + } +} + +} // namespace + +WebGPUGraph::WebGPUGraph() = default; + +WebGPUGraph::~WebGPUGraph() { + for (auto& t : tensors_) { + if (t.buffer) { + wgpuBufferRelease(t.buffer); + } + } + for (auto& buf : output_staging_buffers_) { + if (buf) { + wgpuBufferRelease(buf); + } + } + for (auto& d : dispatches_) { + if (d.pipeline) { + wgpuComputePipelineRelease(d.pipeline); + } + if (d.bind_group) { + wgpuBindGroupRelease(d.bind_group); + } + } +} + +void WebGPUGraph::build( + const void* flatbuffer_data, + const uint8_t* constant_data) { + if (!device_) { + throw std::runtime_error( + "WebGPU device not available. " + "Call set_default_webgpu_context() before loading."); + } + queue_ = wgpuDeviceGetQueue(device_); + + const auto* graph = vkgraph::GetVkGraph(flatbuffer_data); + + // Phase 1: Create all values + const auto* values = graph->values(); + const int num_vals = values ? 
values->size() : 0; + value_types_.resize(num_vals, ValueType::Null); + tensors_.resize(num_vals); + ints_.resize(num_vals, 0); + doubles_.resize(num_vals, 0.0); + bools_.resize(num_vals, false); + + for (int i = 0; i < num_vals; i++) { + const auto* val = values->Get(i); + if (!val || val->value_type() == vkgraph::GraphTypes::NONE) { + value_types_[i] = ValueType::Null; + continue; + } + + switch (val->value_type()) { + case vkgraph::GraphTypes::VkTensor: { + value_types_[i] = ValueType::Tensor; + const auto* vk_tensor = val->value_as_VkTensor(); + auto& tensor = tensors_[i]; + + const auto* dims = vk_tensor->dims(); + size_t numel = 1; + if (dims) { + for (unsigned j = 0; j < dims->size(); j++) { + tensor.dims.push_back(static_cast(dims->Get(j))); + numel *= dims->Get(j); + } + } + tensor.nbytes = numel * vk_datatype_size(vk_tensor->datatype()); + + // Create GPU buffer + WGPUBufferDescriptor buf_desc = {}; + buf_desc.size = tensor.nbytes > 0 ? tensor.nbytes : 4; + buf_desc.usage = + WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst | + WGPUBufferUsage_CopySrc; + buf_desc.mappedAtCreation = false; + tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc); + + // Upload constant data if this tensor has a constant_id + int constant_id = vk_tensor->constant_id(); + if (constant_id >= 0 && constant_data) { + const auto* constants = graph->constants(); + if (constants && + constant_id < static_cast(constants->size())) { + const auto* vk_bytes = constants->Get(constant_id); + // Only upload from embedded bytes (not named data map) + if (vk_bytes->offset() != UINT64_MAX) { + const uint8_t* src = constant_data + vk_bytes->offset(); + wgpuQueueWriteBuffer( + queue_, tensor.buffer, 0, src, tensor.nbytes); + } + } + } + break; + } + case vkgraph::GraphTypes::Int: { + value_types_[i] = ValueType::Int; + ints_[i] = val->value_as_Int()->int_val(); + break; + } + case vkgraph::GraphTypes::Double: { + value_types_[i] = ValueType::Double; + doubles_[i] = 
val->value_as_Double()->double_val(); + break; + } + case vkgraph::GraphTypes::Bool: { + value_types_[i] = ValueType::Bool; + bools_[i] = val->value_as_Bool()->bool_val(); + break; + } + default: + value_types_[i] = ValueType::Null; + break; + } + } + + // Phase 2: Record input and output IDs + const auto* fb_input_ids = graph->input_ids(); + if (fb_input_ids) { + for (unsigned i = 0; i < fb_input_ids->size(); i++) { + input_ids_.push_back(static_cast(fb_input_ids->Get(i))); + } + } + const auto* fb_output_ids = graph->output_ids(); + if (fb_output_ids) { + for (unsigned i = 0; i < fb_output_ids->size(); i++) { + int oid = static_cast(fb_output_ids->Get(i)); + output_ids_.push_back(oid); + + // Create staging buffer for output readback + WGPUBufferDescriptor staging_desc = {}; + staging_desc.size = tensors_[oid].nbytes > 0 ? tensors_[oid].nbytes : 4; + staging_desc.usage = WGPUBufferUsage_MapRead | WGPUBufferUsage_CopyDst; + staging_desc.mappedAtCreation = false; + output_staging_buffers_.push_back( + wgpuDeviceCreateBuffer(device_, &staging_desc)); + } + } + + // Phase 3: Build operator dispatch chain + const auto* chain = graph->chain(); + if (chain) { + for (unsigned i = 0; i < chain->size(); i++) { + const auto* op_call = chain->Get(i); + std::string op_name = op_call->name()->str(); + + if (!webgpu_operator_registry().has_op(op_name)) { + throw std::runtime_error( + "WebGPU backend: unsupported op: " + op_name); + } + + const auto* fb_args = op_call->args(); + std::vector args; + if (fb_args) { + for (unsigned j = 0; j < fb_args->size(); j++) { + args.push_back(static_cast(fb_args->Get(j))); + } + } + + webgpu_operator_registry().get_op_fn(op_name)(*this, args); + } + } +} + +void WebGPUGraph::copy_inputs( + const std::vector>& inputs) { + for (size_t i = 0; i < inputs.size() && i < input_ids_.size(); i++) { + int tid = input_ids_[i]; + const auto& tensor = tensors_[tid]; + wgpuQueueWriteBuffer( + queue_, tensor.buffer, 0, inputs[i].first, inputs[i].second); + 
} +} + +void WebGPUGraph::execute() { + WGPUCommandEncoderDescriptor enc_desc = {}; + WGPUCommandEncoder encoder = + wgpuDeviceCreateCommandEncoder(device_, &enc_desc); + + WGPUComputePassDescriptor pass_desc = {}; + WGPUComputePassEncoder pass = + wgpuCommandEncoderBeginComputePass(encoder, &pass_desc); + + for (const auto& dispatch : dispatches_) { + wgpuComputePassEncoderSetPipeline(pass, dispatch.pipeline); + wgpuComputePassEncoderSetBindGroup(pass, 0, dispatch.bind_group, 0, nullptr); + wgpuComputePassEncoderDispatchWorkgroups( + pass, dispatch.workgroup_count_x, 1, 1); + } + + wgpuComputePassEncoderEnd(pass); + wgpuComputePassEncoderRelease(pass); + + // Copy outputs to staging buffers + for (size_t i = 0; i < output_ids_.size(); i++) { + int oid = output_ids_[i]; + wgpuCommandEncoderCopyBufferToBuffer( + encoder, + tensors_[oid].buffer, + 0, + output_staging_buffers_[i], + 0, + tensors_[oid].nbytes); + } + + WGPUCommandBufferDescriptor cmd_desc = {}; + WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, &cmd_desc); + wgpuQueueSubmit(queue_, 1, &cmd); + + wgpuCommandBufferRelease(cmd); + wgpuCommandEncoderRelease(encoder); +} + +namespace { + +struct MapCallbackData { + bool done = false; + WGPUMapAsyncStatus status = WGPUMapAsyncStatus_Error; +}; + +void buffer_map_callback( + WGPUMapAsyncStatus status, + WGPUStringView /*message*/, + void* userdata1, + void* /*userdata2*/) { + auto* data = static_cast(userdata1); + data->status = status; + data->done = true; +} + +} // namespace + +void WebGPUGraph::copy_outputs( + std::vector>& outputs) { + for (size_t i = 0; i < outputs.size() && i < output_staging_buffers_.size(); + i++) { + MapCallbackData cb_data; + WGPUBufferMapCallbackInfo cb_info = {}; + cb_info.mode = WGPUCallbackMode_AllowSpontaneous; + cb_info.callback = buffer_map_callback; + cb_info.userdata1 = &cb_data; + wgpuBufferMapAsync( + output_staging_buffers_[i], + WGPUMapMode_Read, + 0, + outputs[i].second, + cb_info); + + if (cb_data.status == 
WGPUMapAsyncStatus_Success) { + const void* mapped = + wgpuBufferGetConstMappedRange(output_staging_buffers_[i], 0, outputs[i].second); + std::memcpy(outputs[i].first, mapped, outputs[i].second); + wgpuBufferUnmap(output_staging_buffers_[i]); + } else { + throw std::runtime_error("WebGPU buffer map failed for output"); + } + } +} + +} // namespace webgpu +} // namespace backends +} // namespace executorch diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h new file mode 100644 index 00000000000..ce9c2b3a84b --- /dev/null +++ b/backends/webgpu/runtime/WebGPUGraph.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include +#include +#include + +namespace executorch { +namespace backends { +namespace webgpu { + +struct WebGPUTensor { + WGPUBuffer buffer = nullptr; + std::vector dims; + size_t nbytes = 0; +}; + +struct WebGPUDispatch { + WGPUComputePipeline pipeline = nullptr; + WGPUBindGroup bind_group = nullptr; + uint32_t workgroup_count_x = 1; +}; + +class WebGPUGraph { + public: + WebGPUGraph(); + ~WebGPUGraph(); + + // Build the graph from a deserialized VkGraph flatbuffer and constant data. + // The flatbuffer_data pointer must remain valid during build(). + void build(const void* flatbuffer_data, const uint8_t* constant_data); + + // Copy input tensor data from host pointers into GPU buffers. + void copy_inputs(const std::vector>& inputs); + + // Execute all recorded dispatches. + void execute(); + + // Copy output tensor data from GPU buffers back to host pointers. + // Uses mapAsync + ASYNCIFY in Wasm. 
+ void copy_outputs(std::vector>& outputs); + + const std::vector& input_ids() const { + return input_ids_; + } + const std::vector& output_ids() const { + return output_ids_; + } + + // Access tensors by value ID (used by op implementations). + WebGPUTensor& get_tensor(int id) { + return tensors_[id]; + } + const WebGPUTensor& get_tensor(int id) const { + return tensors_[id]; + } + + // Access scalar values stored during graph build. + double get_double(int id) const { + return doubles_[id]; + } + int64_t get_int(int id) const { + return ints_[id]; + } + + WGPUDevice device() const { + return device_; + } + WGPUQueue queue() const { + return queue_; + } + + void add_dispatch(WebGPUDispatch dispatch) { + dispatches_.push_back(dispatch); + } + + int num_values() const { + return static_cast(value_types_.size()); + } + + enum class ValueType { Tensor, Int, Double, Bool, Null, String }; + + ValueType get_value_type(int id) const { + return value_types_[id]; + } + + private: + WGPUDevice device_ = nullptr; + WGPUQueue queue_ = nullptr; + + // Flat arrays indexed by value ID. Only the relevant one is populated + // per ID based on value_types_. + std::vector value_types_; + std::vector tensors_; + std::vector ints_; + std::vector doubles_; + std::vector bools_; + + std::vector input_ids_; + std::vector output_ids_; + + // Staging buffers for reading back outputs (MapRead | CopyDst). + std::vector output_staging_buffers_; + + std::vector dispatches_; +}; + +} // namespace webgpu +} // namespace backends +} // namespace executorch From cdb38bf73df5e7db1bd739ca141630e1c66e1e34 Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Tue, 7 Apr 2026 10:28:02 -0500 Subject: [PATCH 4/9] [WebGPU] Add backend interface BackendInterface implementation that wires init/execute into ExecuTorch. Registers as "VulkanBackend" to consume .pte files from the Vulkan partitioner directly. 
--- backends/webgpu/runtime/WebGPUBackend.cpp | 139 ++++++++++++++++++++++ backends/webgpu/runtime/WebGPUBackend.h | 40 +++++++ 2 files changed, 179 insertions(+) create mode 100644 backends/webgpu/runtime/WebGPUBackend.cpp create mode 100644 backends/webgpu/runtime/WebGPUBackend.h diff --git a/backends/webgpu/runtime/WebGPUBackend.cpp b/backends/webgpu/runtime/WebGPUBackend.cpp new file mode 100644 index 00000000000..ec349d05bd2 --- /dev/null +++ b/backends/webgpu/runtime/WebGPUBackend.cpp @@ -0,0 +1,139 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#include + +#include +#include +#include + +#include + +namespace executorch { +namespace backends { +namespace webgpu { + +// vkgraph namespace is declared at global scope in the generated FlatBuffer header + +using executorch::runtime::ArrayRef; +using executorch::runtime::Backend; +using executorch::runtime::BackendExecutionContext; +using executorch::runtime::BackendInitContext; +using executorch::runtime::CompileSpec; +using executorch::runtime::DelegateHandle; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::register_backend; +using executorch::runtime::Result; +using executorch::runtime::Span; + +bool WebGPUBackend::is_available() const { + return true; +} + +Result WebGPUBackend::init( + BackendInitContext& context, + FreeableBuffer* processed, + ArrayRef compile_specs) const { + // Allocate graph on the runtime allocator + WebGPUGraph* graph = + context.get_runtime_allocator()->allocateInstance(); + if (graph == nullptr) { + return Error::MemoryAllocationFailed; + } + new (graph) WebGPUGraph(); + + // Parse header to locate flatbuffer and constant data + Result header = + 
WebGPUDelegateHeader::parse(processed->data()); + if (!header.ok()) { + ET_LOG(Error, "WebGPUDelegateHeader may be corrupt"); + return header.error(); + } + + const uint8_t* buffer_start = + reinterpret_cast(processed->data()); + const uint8_t* flatbuffer_data = buffer_start + header->flatbuffer_offset; + const uint8_t* constant_data = buffer_start + header->bytes_offset; + + // Verify FlatBuffer identifier + if (!vkgraph::VkGraphBufferHasIdentifier(flatbuffer_data)) { + ET_LOG( + Error, + "WebGPU delegate FlatBuffer identifier mismatch (expected VK00)"); + return Error::DelegateInvalidCompatibility; + } + + try { + graph->build(flatbuffer_data, constant_data); + } catch (const std::exception& e) { + ET_LOG(Error, "WebGPU graph build failed: %s", e.what()); + graph->~WebGPUGraph(); + return Error::DelegateInvalidCompatibility; + } + + processed->Free(); + + return graph; +} + +Error WebGPUBackend::execute( + BackendExecutionContext& context, + DelegateHandle* handle, + Span args) const { + WebGPUGraph* graph = static_cast(handle); + + const size_t num_inputs = graph->input_ids().size(); + const size_t num_outputs = graph->output_ids().size(); + + // Copy inputs from EValue tensors to GPU buffers + std::vector> inputs; + inputs.reserve(num_inputs); + for (size_t i = 0; i < num_inputs; i++) { + const auto& tensor = args[i]->toTensor(); + inputs.emplace_back(tensor.const_data_ptr(), tensor.nbytes()); + } + graph->copy_inputs(inputs); + + // Execute the compute graph + graph->execute(); + + // Copy outputs from GPU staging buffers to EValue tensor data pointers + std::vector> outputs; + outputs.reserve(num_outputs); + for (size_t i = 0; i < num_outputs; i++) { + const size_t arg_idx = num_inputs + i; + auto& tensor = args[arg_idx]->toTensor(); + outputs.emplace_back(tensor.mutable_data_ptr(), tensor.nbytes()); + } + graph->copy_outputs(outputs); + + return Error::Ok; +} + +void WebGPUBackend::destroy(DelegateHandle* handle) const { + if (handle != nullptr) { + 
WebGPUGraph* graph = static_cast(handle); + graph->~WebGPUGraph(); + } +} + +namespace { +auto cls = WebGPUBackend(); +Backend backend{"VulkanBackend", &cls}; +static auto success_with_compiler = register_backend(backend); +} // namespace + +} // namespace webgpu +} // namespace backends +} // namespace executorch diff --git a/backends/webgpu/runtime/WebGPUBackend.h b/backends/webgpu/runtime/WebGPUBackend.h new file mode 100644 index 00000000000..9c20a3d53be --- /dev/null +++ b/backends/webgpu/runtime/WebGPUBackend.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace executorch { +namespace backends { +namespace webgpu { + +class WebGPUBackend final : public ::executorch::runtime::BackendInterface { + public: + ~WebGPUBackend() override = default; + + bool is_available() const override; + + executorch::runtime::Result init( + executorch::runtime::BackendInitContext& context, + executorch::runtime::FreeableBuffer* processed, + executorch::runtime::ArrayRef + compile_specs) const override; + + executorch::runtime::Error execute( + executorch::runtime::BackendExecutionContext& context, + executorch::runtime::DelegateHandle* handle, + executorch::runtime::Span args) + const override; + + void destroy(executorch::runtime::DelegateHandle* handle) const override; +}; + +} // namespace webgpu +} // namespace backends +} // namespace executorch From 2a8a3cdec8d2007d4c22c3e2155e1b8b868cf5ea Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Thu, 9 Apr 2026 23:33:43 -0500 Subject: [PATCH 5/9] [WebGPU] Add backend build system CMake integration: backend library target, Vulkan FlatBuffer schema dependency, root build flags, and glslc guard fix. 
--- .gitignore | 1 + CMakeLists.txt | 5 + backends/vulkan/cmake/ShaderLibrary.cmake | 2 +- backends/webgpu/CMakeLists.txt | 56 +++++++++++ backends/webgpu/README.md | 113 ++++++++++++++++++++++ tools/cmake/preset/default.cmake | 3 + 6 files changed, 179 insertions(+), 1 deletion(-) create mode 100644 backends/webgpu/CMakeLists.txt create mode 100644 backends/webgpu/README.md diff --git a/.gitignore b/.gitignore index aeb4aa14e93..852cb1e8445 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ cmake-android-out/ cmake-ios-out/ cmake-out* cmake-out-android/ +backends/webgpu/third-party/ build-android/ build-x86/ build-hexagon/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 1f44b650aa1..171184b2e7e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1154,6 +1154,11 @@ if(EXECUTORCH_BUILD_VULKAN) list(APPEND _executorch_backends vulkan_backend vulkan_schema) endif() +if(EXECUTORCH_BUILD_WEBGPU) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/webgpu) + list(APPEND _executorch_backends webgpu_backend) +endif() + if(EXECUTORCH_BUILD_VGF) list(APPEND _executorch_backends vgf_backend) endif() diff --git a/backends/vulkan/cmake/ShaderLibrary.cmake b/backends/vulkan/cmake/ShaderLibrary.cmake index 19fbae4b2ab..a026660dd41 100644 --- a/backends/vulkan/cmake/ShaderLibrary.cmake +++ b/backends/vulkan/cmake/ShaderLibrary.cmake @@ -26,7 +26,7 @@ endif() find_program(GLSLC_PATH glslc PATHS $ENV{PATH}) -if(NOT GLSLC_PATH) +if(NOT GLSLC_PATH AND EXECUTORCH_BUILD_VULKAN) message( FATAL_ERROR "glslc from the Vulkan SDK must be installed to build the Vulkan backend. " diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt new file mode 100644 index 00000000000..28499e46e26 --- /dev/null +++ b/backends/webgpu/CMakeLists.txt @@ -0,0 +1,56 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +cmake_minimum_required(VERSION 3.19) + +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) +endif() + +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) + +# Ensure vulkan_schema is available even when EXECUTORCH_BUILD_VULKAN is OFF. +# The WebGPU backend reuses the Vulkan FlatBuffer serialization format. +if(NOT TARGET vulkan_schema) + # We need the schema generation from the Vulkan backend. Build only the + # schema target by including the Vulkan CMakeLists.txt. The full Vulkan + # backend will only build if EXECUTORCH_BUILD_VULKAN is ON (which gates the + # vulkan_backend target), but vulkan_schema is unconditionally defined. + add_subdirectory( + ${CMAKE_CURRENT_SOURCE_DIR}/../vulkan + ${CMAKE_CURRENT_BINARY_DIR}/_vulkan_schema + ) +endif() + +set(WEBGPU_SRCS + runtime/WebGPUBackend.cpp + runtime/WebGPUGraph.cpp + runtime/WebGPUDelegateHeader.cpp + runtime/ops/OperatorRegistry.cpp + runtime/ops/add/BinaryOp.cpp +) + +add_library(webgpu_backend ${WEBGPU_SRCS}) + +target_include_directories( + webgpu_backend + PRIVATE $ +) + +target_link_libraries(webgpu_backend PRIVATE vulkan_schema executorch_core) + +target_compile_options(webgpu_backend PRIVATE -fexceptions) + +# Link with --whole-archive for static registration of backend + ops +executorch_target_link_options_shared_lib(webgpu_backend) + +set_property(TARGET webgpu_backend PROPERTY CXX_STANDARD 17) + +install( + TARGETS webgpu_backend + EXPORT ExecuTorchTargets + DESTINATION ${CMAKE_INSTALL_LIBDIR} +) diff --git a/backends/webgpu/README.md b/backends/webgpu/README.md new file mode 100644 index 00000000000..c4886bbc64c --- /dev/null +++ b/backends/webgpu/README.md @@ -0,0 +1,113 @@ +# WebGPU Backend + +Run ExecuTorch models on the GPU via [WebGPU](https://www.w3.org/TR/webgpu/). 
The backend compiles delegated subgraphs into WGSL compute shaders executed natively through [wgpu-native](https://github.com/gfx-rs/wgpu-native) (Metal on macOS, Vulkan on Linux/Windows). + +> **Status: Prototype.** The backend supports a single operator today and is under active development. See [TODO.md](TODO.md) for the roadmap. + +## Architecture + +``` +PyTorch model + │ torch.export + ▼ +Exported Program + │ VulkanPartitioner (tags supported fp32 ops) + ▼ +Edge Dialect IR + │ VulkanBackend.preprocess (builds Vulkan FlatBuffer, buffer-only storage) + ▼ +.pte file (with VH00/VK00 delegate blob) + │ + ▼ +Native runtime (wgpu-native → Metal / Vulkan) + │ WebGPUGraph::build → creates GPU buffers, pipelines, bind groups + │ WebGPUGraph::execute → encodes + submits compute passes + ▼ +GPU output (mapped back to CPU via wgpuDevicePoll) +``` + +Key design choices: +- **Reuses Vulkan serialization** — the delegate blob is a Vulkan FlatBuffer (`VK00`) with a `VH00` header. All tensor storage is forced to `BUFFER` (WebGPU has no 3D storage textures). +- **Built-in WGSL shaders** — shader source is compiled as C++ string constants. Future work will embed fused shaders in the FlatBuffer for compile-time mega-kernel fusion. +- **No Python AOT code** — directly consumes .pte files exported via `VulkanPartitioner`. + +## Operator Support + +| Operator | WGSL Shader | Notes | +|---|---|---| +| `aten.add.Tensor` | `binary_add.wgsl` | Element-wise with alpha: `out = in1 + alpha * in2` | + +**Planned:** `sub`, `mul`, `relu`, `linear` (matmul), `softmax`, `layer_norm` + +## Quick Start + +### 1. Setup + +```bash +bash backends/webgpu/scripts/setup-wgpu-native.sh +``` + +This downloads prebuilt wgpu-native binaries for your platform. + +### 2. 
Export a model + +```python +import torch +from executorch.backends.vulkan import VulkanPartitioner +from executorch.exir import to_edge_transform_and_lower + +class AddModule(torch.nn.Module): + def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: + return a + b + +ep = torch.export.export(AddModule(), (torch.randn(4, 4), torch.randn(4, 4))) +et_program = to_edge_transform_and_lower( + ep, partitioner=[VulkanPartitioner()] +).to_executorch() + +with open("add.pte", "wb") as f: + f.write(et_program.buffer) +``` + +### 3. Build and run + +```bash +bash backends/webgpu/test/test_build_webgpu.sh +``` + +This runs Python export tests, exports a .pte, builds the native runtime, and validates GPU output. + +## Directory Structure + +``` +backends/webgpu/ +├── CMakeLists.txt +├── README.md +├── TODO.md +├── runtime/ +│ ├── WebGPUBackend.h/cpp # BackendInterface (init/execute) +│ ├── WebGPUGraph.h/cpp # GPU graph: buffers, pipelines, dispatch +│ ├── WebGPUDelegateHeader.h/cpp # VH00 header parser +│ ├── WebGPUDevice.h/cpp # wgpu-native device abstraction +│ └── ops/ +│ ├── OperatorRegistry.h/cpp # Op dispatch table +│ └── add/ +│ ├── BinaryOp.cpp # aten.add.Tensor implementation +│ ├── binary_add.wgsl # WGSL shader source +│ └── binary_add_wgsl.h # Shader as C++ string constant +├── scripts/ +│ └── setup-wgpu-native.sh # Download wgpu-native binaries +└── test/ + ├── conftest.py + ├── test_build_webgpu.sh # End-to-end build + test + ├── test_webgpu_native.cpp # C++ native test runner + └── ops/ + └── add/ + └── test_add.py # Python export tests +``` + +## Requirements + +- **macOS**: Metal-capable GPU +- **Linux**: Vulkan-capable GPU + drivers +- **Build**: CMake 3.19+, conda environment with ExecuTorch installed diff --git a/tools/cmake/preset/default.cmake b/tools/cmake/preset/default.cmake index 423194776bc..2c1be2dc9da 100644 --- a/tools/cmake/preset/default.cmake +++ b/tools/cmake/preset/default.cmake @@ -168,6 +168,9 @@ define_overridable_option( 
define_overridable_option( EXECUTORCH_BUILD_VULKAN "Build the Vulkan backend" BOOL OFF ) +define_overridable_option( + EXECUTORCH_BUILD_WEBGPU "Build the WebGPU backend" BOOL OFF +) define_overridable_option( EXECUTORCH_BUILD_PORTABLE_OPS "Build portable_ops library" BOOL ON ) From 0c155b01f50cb6464d240c2fe6e0f18e2928d1ea Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Tue, 7 Apr 2026 10:28:03 -0500 Subject: [PATCH 6/9] [WebGPU] Add export test using Vulkan partitioner Export tests verify fp32 torch.add models produce a .pte with VulkanBackend delegate: 2D/3D/4D shapes, broadcasting, self-add, scalar add, and chained adds. Includes TODO with architecture notes and next steps. --- backends/webgpu/TODO.md | 39 ++++++++++ backends/webgpu/test/conftest.py | 28 +++++++ backends/webgpu/test/ops/__init__.py | 0 backends/webgpu/test/ops/add/test_add.py | 99 ++++++++++++++++++++++++ 4 files changed, 166 insertions(+) create mode 100644 backends/webgpu/TODO.md create mode 100644 backends/webgpu/test/conftest.py create mode 100644 backends/webgpu/test/ops/__init__.py create mode 100644 backends/webgpu/test/ops/add/test_add.py diff --git a/backends/webgpu/TODO.md b/backends/webgpu/TODO.md new file mode 100644 index 00000000000..02259d5c3eb --- /dev/null +++ b/backends/webgpu/TODO.md @@ -0,0 +1,39 @@ +# WebGPU Backend — TODO + +## Current State (Prototype) +- Single op: `aten.add.Tensor` (fp32, buffer storage) +- No Python AOT code — directly consumes Vulkan delegate (.pte exported via VulkanPartitioner) +- Reuses Vulkan FlatBuffer format (VH00 header + VK00 payload) +- Registers as `"VulkanBackend"` at runtime — mutually exclusive with Vulkan backend at link time +- Built-in WGSL shaders (not embedded in .pte) + +## Architecture +``` +VulkanPartitioner (Python) → VkGraphBuilder → VK00 FlatBuffer → .pte + → WebGPU Runtime: registers as "VulkanBackend", parses VH00/VK00 + → WebGPUGraph::build → GPU buffers/pipelines/bind groups + → WebGPUGraph::execute → encode + submit 
compute passes +``` + +Adding a new op requires only C++ runtime work: +1. WGSL shader + header +2. C++ op implementation (read args from VkGraph, create pipeline, record dispatch) +3. Register in CMakeLists.txt +4. Test with VulkanPartitioner export + +## Performance: Command Encoding Overhead +WebGPU `GPUCommandBuffer` is single-use (no equivalent to Vulkan's cached command lists). +Per-dispatch API call cost adds up for large graphs. + +**Primary mitigation: mega-kernel fusion.** Generate fused WGSL shaders for chains of +element-wise ops (add→relu→mul→clamp) at compile time. Embed via the existing +`shaders: [VkBytes]` field in schema.fbs. + +## Next Steps +1. **More ops**: sub, mul, relu, linear (matmul), softmax, layer_norm +2. **fp16 support**: Feature-detect `shader-f16`, fallback to fp32 +3. **Buffer pooling**: Reuse GPU buffers to avoid OOM at scale +4. **Pipeline caching**: Cache compiled pipelines across runs +5. **Profiling**: Wire WebGPU timestamp queries into ETDump/EventTracer +6. **LLM support**: KV cache management, Flash Attention in WGSL, quantized ops (int4/int8) +7. **Browser/JS runtime**: Emscripten build, JS harness, browser test page diff --git a/backends/webgpu/test/conftest.py b/backends/webgpu/test/conftest.py new file mode 100644 index 00000000000..b275b70d6f6 --- /dev/null +++ b/backends/webgpu/test/conftest.py @@ -0,0 +1,28 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Workaround for PyTorch 2.11 bug where LeafSpec dataclass fields +# (type, _context, _children) are not initialized by the C++ constructor, +# causing AttributeError in run_decompositions and copy.deepcopy. 
+import dataclasses + +from torch.utils._pytree import LeafSpec + + +def _leafspec_getattr(self, name): # type: ignore[no-untyped-def] + for f in dataclasses.fields(type(self)): + if f.name == name: + if f.default is not dataclasses.MISSING: + return f.default + elif f.default_factory is not dataclasses.MISSING: + val = f.default_factory() + object.__setattr__(self, name, val) + return val + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") + + +if not hasattr(LeafSpec(), "type"): + LeafSpec.__getattr__ = _leafspec_getattr diff --git a/backends/webgpu/test/ops/__init__.py b/backends/webgpu/test/ops/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/backends/webgpu/test/ops/add/test_add.py b/backends/webgpu/test/ops/add/test_add.py new file mode 100644 index 00000000000..5a91ac05755 --- /dev/null +++ b/backends/webgpu/test/ops/add/test_add.py @@ -0,0 +1,99 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
import unittest

import torch
from executorch.backends.vulkan import VulkanPartitioner
from executorch.exir import to_edge_transform_and_lower


class AddModule(torch.nn.Module):
    """out = a + b (element-wise, with broadcasting)."""

    def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
        return a + b


class AddSelfModule(torch.nn.Module):
    """out = x + x (both operands alias the same input)."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + x


class AddScalarModule(torch.nn.Module):
    """out = x + 3.0 (tensor + Python scalar)."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + 3.0


class AddChainedModule(torch.nn.Module):
    """Three dependent adds, exercising multi-dispatch graphs."""

    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        z = x + y
        z = z + x
        z = z + y
        return z


class TestAdd(unittest.TestCase):
    """fp32 torch.add export tests — uses VulkanPartitioner since the WebGPU
    runtime directly consumes the Vulkan delegate (VK00 FlatBuffer)."""

    def _export_and_check(self, model, example_inputs) -> None:
        """Export `model`, lower via VulkanPartitioner, and assert the
        resulting program carries a VulkanBackend delegate blob."""
        ep = torch.export.export(model, example_inputs)
        et_program = to_edge_transform_and_lower(
            ep, partitioner=[VulkanPartitioner()]
        ).to_executorch()

        # Fix: the original used a nested loop whose `break` only exited
        # the inner loop; a flat any() expresses the intent directly.
        found_vulkan = any(
            delegate.id == "VulkanBackend"
            for plan in et_program.executorch_program.execution_plan
            for delegate in plan.delegates
        )
        self.assertTrue(found_vulkan, "Expected VulkanBackend delegate in .pte")
        self.assertGreater(len(et_program.buffer), 100)

    def test_add_2d(self) -> None:
        self._export_and_check(AddModule(), (torch.randn(4, 4), torch.randn(4, 4)))

    def test_add_3d(self) -> None:
        self._export_and_check(
            AddModule(), (torch.randn(2, 3, 4), torch.randn(2, 3, 4))
        )

    def test_add_4d(self) -> None:
        self._export_and_check(
            AddModule(), (torch.randn(1, 2, 3, 4), torch.randn(1, 2, 3, 4))
        )

    def test_add_broadcast_last_dim(self) -> None:
        self._export_and_check(AddModule(), (torch.randn(4, 4), torch.randn(4, 1)))

    def test_add_broadcast_first_dim(self) -> None:
        self._export_and_check(AddModule(), (torch.randn(4, 4), torch.randn(1, 4)))

    def test_add_self(self) -> None:
        self._export_and_check(AddSelfModule(), (torch.randn(4, 4),))

    def test_add_scalar(self) -> None:
        self._export_and_check(AddScalarModule(), (torch.randn(4, 4),))

    def test_add_chained(self) -> None:
        self._export_and_check(
            AddChainedModule(), (torch.randn(4, 4), torch.randn(4, 4))
        )


def export_add_model(output_path: str) -> None:
    """Export a simple add model to .pte for native runtime testing.

    The 1024x1024 shape matches the size checked by test_webgpu_native.cpp.
    """
    model = AddModule()
    example_inputs = (torch.randn(1024, 1024), torch.randn(1024, 1024))
    ep = torch.export.export(model, example_inputs)
    et_program = to_edge_transform_and_lower(
        ep, partitioner=[VulkanPartitioner()]
    ).to_executorch()
    with open(output_path, "wb") as f:
        f.write(et_program.buffer)
    print(f"Exported {output_path}")


if __name__ == "__main__":
    unittest.main()
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

// WebGPUDevice.cpp — synchronous wrapper around wgpu-native's asynchronous
// instance/adapter/device acquisition, used by WebGPUGraph::build() when no
// device has been injected explicitly.

// NOTE(review): the original include targets were stripped in transit;
// reconstructed from the symbols used below (fprintf, std::runtime_error).
#include <executorch/backends/webgpu/runtime/WebGPUDevice.h>

#include <cstdint>
#include <cstdio>
#include <stdexcept>

namespace executorch {
namespace backends {
namespace webgpu {

namespace {

// Result slot filled by the async adapter-request callback.
struct AdapterResult {
  WGPUAdapter adapter = nullptr;
  bool done = false; // set even on failure so the caller's poll loop exits
};

// Result slot filled by the async device-request callback.
struct DeviceResult {
  WGPUDevice device = nullptr;
  bool done = false;
};

// Callback for wgpuInstanceRequestAdapter; `userdata1` is an AdapterResult*.
// Logs on failure, and always marks `done` so the poll loop terminates.
void on_adapter_request(
    WGPURequestAdapterStatus status,
    WGPUAdapter adapter,
    WGPUStringView message,
    void* userdata1,
    void* /*userdata2*/) {
  auto* result = static_cast<AdapterResult*>(userdata1);
  if (status == WGPURequestAdapterStatus_Success) {
    result->adapter = adapter;
  } else {
    // WGPUStringView is not NUL-terminated: print with an explicit length.
    fprintf(
        stderr,
        "WebGPU adapter request failed (status %d): %.*s\n",
        static_cast<int>(status),
        static_cast<int>(message.length),
        message.data);
  }
  result->done = true;
}

// Callback for wgpuAdapterRequestDevice; mirrors on_adapter_request.
void on_device_request(
    WGPURequestDeviceStatus status,
    WGPUDevice device,
    WGPUStringView message,
    void* userdata1,
    void* /*userdata2*/) {
  auto* result = static_cast<DeviceResult*>(userdata1);
  if (status == WGPURequestDeviceStatus_Success) {
    result->device = device;
  } else {
    fprintf(
        stderr,
        "WebGPU device request failed (status %d): %.*s\n",
        static_cast<int>(status),
        static_cast<int>(message.length),
        message.data);
  }
  result->done = true;
}

// Uncaptured-error callback installed on the device: logs and continues.
void on_device_error(
    WGPUDevice const* /*device*/,
    WGPUErrorType type,
    WGPUStringView message,
    void* /*userdata1*/,
    void* /*userdata2*/) {
  fprintf(
      stderr,
      "WebGPU device error (type %d): %.*s\n",
      static_cast<int>(type),
      static_cast<int>(message.length),
      message.data);
}

} // namespace

// Acquires instance → adapter → device → queue, blocking on each async step
// by spinning on wgpuInstanceProcessEvents. Throws std::runtime_error on
// failure, releasing any partially acquired handles first.
WebGPUContext create_webgpu_context() {
  WebGPUContext ctx;

  ctx.instance = wgpuCreateInstance(nullptr);
  if (!ctx.instance) {
    throw std::runtime_error("Failed to create WebGPU instance");
  }

  // Request adapter using AllowSpontaneous mode (fires during
  // wgpuInstanceProcessEvents or any other API call).
  AdapterResult adapter_result;
  WGPURequestAdapterCallbackInfo adapter_cb = {};
  adapter_cb.mode = WGPUCallbackMode_AllowSpontaneous;
  adapter_cb.callback = on_adapter_request;
  adapter_cb.userdata1 = &adapter_result;

  wgpuInstanceRequestAdapter(ctx.instance, nullptr, adapter_cb);
  while (!adapter_result.done) {
    wgpuInstanceProcessEvents(ctx.instance);
  }

  if (!adapter_result.adapter) {
    wgpuInstanceRelease(ctx.instance);
    ctx.instance = nullptr;
    throw std::runtime_error(
        "Failed to get WebGPU adapter. "
        "Ensure a GPU with Vulkan (Linux) or Metal (macOS) is available.");
  }
  ctx.adapter = adapter_result.adapter;

  // Request device; install the uncaptured-error callback up front so
  // later validation errors are reported rather than silently dropped.
  DeviceResult device_result;
  WGPURequestDeviceCallbackInfo device_cb = {};
  device_cb.mode = WGPUCallbackMode_AllowSpontaneous;
  device_cb.callback = on_device_request;
  device_cb.userdata1 = &device_result;

  WGPUDeviceDescriptor device_desc = {};
  device_desc.uncapturedErrorCallbackInfo.callback = on_device_error;

  wgpuAdapterRequestDevice(ctx.adapter, &device_desc, device_cb);
  while (!device_result.done) {
    wgpuInstanceProcessEvents(ctx.instance);
  }

  if (!device_result.device) {
    // Unwind in reverse acquisition order before throwing.
    wgpuAdapterRelease(ctx.adapter);
    wgpuInstanceRelease(ctx.instance);
    ctx.adapter = nullptr;
    ctx.instance = nullptr;
    throw std::runtime_error("Failed to get WebGPU device");
  }
  ctx.device = device_result.device;
  ctx.queue = wgpuDeviceGetQueue(ctx.device);

  return ctx;
}

namespace {
// Process-wide context consumed by WebGPUGraph::build() when the graph has
// no explicitly injected device. NOTE(review): no synchronization — assumes
// it is set once at startup before any graph is built; confirm with callers.
WebGPUContext* g_default_context = nullptr;
} // namespace

void set_default_webgpu_context(WebGPUContext* ctx) {
  g_default_context = ctx;
}

WebGPUContext* get_default_webgpu_context() {
  return g_default_context;
}

// Releases all handles in reverse acquisition order (queue, device,
// adapter, instance), nulling each so a second call is a no-op.
void destroy_webgpu_context(WebGPUContext& ctx) {
  if (ctx.queue) {
    wgpuQueueRelease(ctx.queue);
    ctx.queue = nullptr;
  }
  if (ctx.device) {
    wgpuDeviceRelease(ctx.device);
    ctx.device = nullptr;
  }
  if (ctx.adapter) {
    wgpuAdapterRelease(ctx.adapter);
    ctx.adapter = nullptr;
  }
  if (ctx.instance) {
    wgpuInstanceRelease(ctx.instance);
    ctx.instance = nullptr;
  }
}

} // namespace webgpu
} // namespace backends
} // namespace executorch

// ───────────────────────────────────────────────────────────────────────────
// backends/webgpu/runtime/WebGPUDevice.h
// ───────────────────────────────────────────────────────────────────────────

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

// NOTE(review): include target stripped in transit; reconstructed — the
// wgpu-native C header providing WGPUInstance/WGPUAdapter/etc.
#include <webgpu.h>

namespace executorch {
namespace backends {
namespace webgpu {

// Bundle of the four wgpu-native handles needed to run compute work.
// All members are owning; release via destroy_webgpu_context().
struct WebGPUContext {
  WGPUInstance instance = nullptr;
  WGPUAdapter adapter = nullptr;
  WGPUDevice device = nullptr;
  WGPUQueue queue = nullptr;
};

// Blocking acquisition of instance/adapter/device/queue; throws on failure.
WebGPUContext create_webgpu_context();
// Releases all handles in `ctx` (idempotent).
void destroy_webgpu_context(WebGPUContext& ctx);

// Global context used by WebGPUGraph::build() when no device is pre-set.
void set_default_webgpu_context(WebGPUContext* ctx);
WebGPUContext* get_default_webgpu_context();

} // namespace webgpu
} // namespace backends
} // namespace executorch
# Usage: bash backends/webgpu/scripts/setup-wgpu-native.sh

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
WGPU_DIR="${SCRIPT_DIR}/../third-party/wgpu-native"

WGPU_VERSION="v27.0.4.0"
WGPU_BASE_URL="https://github.com/gfx-rs/wgpu-native/releases/download/${WGPU_VERSION}"

# Idempotent: skip the download when the static library is already present.
if [[ -f "${WGPU_DIR}/lib/libwgpu_native.a" ]]; then
  echo "wgpu-native already installed at ${WGPU_DIR}"
  exit 0
fi

OS="$(uname -s)"
ARCH="$(uname -m)"

case "${OS}" in
  Darwin) PLATFORM="macos" ;;
  Linux) PLATFORM="linux" ;;
  *)
    echo "Unsupported OS: ${OS}"
    exit 1
    ;;
esac

case "${ARCH}" in
  x86_64) WGPU_ARCH="x86_64" ;;
  aarch64|arm64) WGPU_ARCH="aarch64" ;;
  *)
    echo "Unsupported architecture: ${ARCH}"
    exit 1
    ;;
esac

ZIP_NAME="wgpu-${PLATFORM}-${WGPU_ARCH}-release.zip"
URL="${WGPU_BASE_URL}/${ZIP_NAME}"

echo "Downloading wgpu-native ${WGPU_VERSION} for ${PLATFORM}-${WGPU_ARCH}..."
TMPDIR_DL="$(mktemp -d)"
# Single quotes defer expansion to trap execution time; inner quotes protect
# unusual temp-dir paths (the original expanded unquoted at definition time).
trap 'rm -rf "${TMPDIR_DL}"' EXIT

# Fix: -f makes curl fail on HTTP errors (e.g. 404 for an unknown release)
# instead of silently saving the error page and letting unzip fail with a
# confusing message later.
curl -fsSL "${URL}" -o "${TMPDIR_DL}/${ZIP_NAME}"

mkdir -p "${WGPU_DIR}"
unzip -qo "${TMPDIR_DL}/${ZIP_NAME}" -d "${WGPU_DIR}"

echo "Installed wgpu-native to ${WGPU_DIR}"
ls -la "${WGPU_DIR}/lib/"
--- backends/webgpu/CMakeLists.txt | 68 ++++++++++++++++++++ backends/webgpu/runtime/WebGPUGraph.cpp | 29 +++++++++ backends/webgpu/runtime/WebGPUGraph.h | 28 ++++++++ backends/webgpu/runtime/ops/add/BinaryOp.cpp | 2 + 4 files changed, 127 insertions(+) diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt index 28499e46e26..88b220bf922 100644 --- a/backends/webgpu/CMakeLists.txt +++ b/backends/webgpu/CMakeLists.txt @@ -29,6 +29,7 @@ set(WEBGPU_SRCS runtime/WebGPUBackend.cpp runtime/WebGPUGraph.cpp runtime/WebGPUDelegateHeader.cpp + runtime/WebGPUDevice.cpp runtime/ops/OperatorRegistry.cpp runtime/ops/add/BinaryOp.cpp ) @@ -42,6 +43,37 @@ target_include_directories( target_link_libraries(webgpu_backend PRIVATE vulkan_schema executorch_core) +# Native build: link against wgpu-native +set(WGPU_NATIVE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/third-party/wgpu-native" + CACHE PATH "Path to wgpu-native installation") + +if(NOT EXISTS "${WGPU_NATIVE_DIR}/lib/libwgpu_native.a") + message(FATAL_ERROR + "wgpu-native not found at ${WGPU_NATIVE_DIR}. 
" + "Run: bash backends/webgpu/scripts/setup-wgpu-native.sh") +endif() + +add_library(wgpu_native STATIC IMPORTED) +set_target_properties(wgpu_native PROPERTIES + IMPORTED_LOCATION "${WGPU_NATIVE_DIR}/lib/libwgpu_native.a" +) + +target_include_directories(webgpu_backend + PUBLIC $ +) +target_link_libraries(webgpu_backend PRIVATE wgpu_native) + +if(APPLE) + target_link_libraries(webgpu_backend PRIVATE + "-framework Metal" + "-framework QuartzCore" + "-framework CoreGraphics" + "-framework Foundation" + ) +else() + target_link_libraries(webgpu_backend PRIVATE dl m pthread) +endif() + target_compile_options(webgpu_backend PRIVATE -fexceptions) # Link with --whole-archive for static registration of backend + ops @@ -54,3 +86,39 @@ install( EXPORT ExecuTorchTargets DESTINATION ${CMAKE_INSTALL_LIBDIR} ) + +# Native test target +if(EXECUTORCH_BUILD_WEBGPU_TEST) + add_executable(webgpu_native_test test/test_webgpu_native.cpp) + + target_include_directories(webgpu_native_test + PRIVATE + $ + "${WGPU_NATIVE_DIR}/include" + ) + + target_link_libraries(webgpu_native_test + PRIVATE + webgpu_backend + wgpu_native + executorch_core + extension_module_static + extension_data_loader + extension_tensor + portable_kernels + portable_ops_lib + ) + + if(APPLE) + target_link_libraries(webgpu_native_test PRIVATE + "-framework Metal" + "-framework QuartzCore" + "-framework CoreGraphics" + ) + else() + target_link_libraries(webgpu_native_test PRIVATE dl m pthread) + endif() + + target_compile_options(webgpu_native_test PRIVATE -fexceptions) + set_property(TARGET webgpu_native_test PROPERTY CXX_STANDARD 17) +endif() diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp index 99b570ad7e5..df2bfb1e688 100644 --- a/backends/webgpu/runtime/WebGPUGraph.cpp +++ b/backends/webgpu/runtime/WebGPUGraph.cpp @@ -11,6 +11,9 @@ #include +#include +#include + #include #include @@ -69,6 +72,13 @@ WebGPUGraph::~WebGPUGraph() { void WebGPUGraph::build( const void* 
flatbuffer_data, const uint8_t* constant_data) { + if (!device_) { + auto* ctx = get_default_webgpu_context(); + if (ctx) { + device_ = ctx->device; + instance_ = ctx->instance; + } + } if (!device_) { throw std::runtime_error( "WebGPU device not available. " @@ -289,6 +299,9 @@ void WebGPUGraph::copy_outputs( outputs[i].second, cb_info); + // Poll until the map callback fires. + wgpuDevicePoll(device_, true, nullptr); + if (cb_data.status == WGPUMapAsyncStatus_Success) { const void* mapped = wgpuBufferGetConstMappedRange(output_staging_buffers_[i], 0, outputs[i].second); @@ -300,6 +313,22 @@ void WebGPUGraph::copy_outputs( } } +WebGPUMemoryStats WebGPUGraph::memory_stats() const { + WebGPUMemoryStats stats; + for (size_t i = 0; i < value_types_.size(); i++) { + if (value_types_[i] == ValueType::Tensor && tensors_[i].nbytes > 0) { + stats.tensor_buffer_bytes += tensors_[i].nbytes; + stats.num_tensors++; + } + } + for (size_t i = 0; i < output_ids_.size(); i++) { + stats.staging_buffer_bytes += tensors_[output_ids_[i]].nbytes; + } + stats.uniform_buffer_bytes = uniform_buffer_bytes_; + stats.num_dispatches = static_cast(dispatches_.size()); + return stats; +} + } // namespace webgpu } // namespace backends } // namespace executorch diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h index ce9c2b3a84b..2d6996e9219 100644 --- a/backends/webgpu/runtime/WebGPUGraph.h +++ b/backends/webgpu/runtime/WebGPUGraph.h @@ -30,6 +30,18 @@ struct WebGPUDispatch { uint32_t workgroup_count_x = 1; }; +struct WebGPUMemoryStats { + size_t tensor_buffer_bytes = 0; + size_t staging_buffer_bytes = 0; + size_t uniform_buffer_bytes = 0; + int num_tensors = 0; + int num_dispatches = 0; + + size_t total_bytes() const { + return tensor_buffer_bytes + staging_buffer_bytes + uniform_buffer_bytes; + } +}; + class WebGPUGraph { public: WebGPUGraph(); @@ -83,6 +95,19 @@ class WebGPUGraph { dispatches_.push_back(dispatch); } + void 
add_uniform_buffer_bytes(size_t bytes) { + uniform_buffer_bytes_ += bytes; + } + + void set_instance(WGPUInstance instance) { + instance_ = instance; + } + void set_device(WGPUDevice device) { + device_ = device; + } + + WebGPUMemoryStats memory_stats() const; + int num_values() const { return static_cast(value_types_.size()); } @@ -94,6 +119,7 @@ class WebGPUGraph { } private: + WGPUInstance instance_ = nullptr; WGPUDevice device_ = nullptr; WGPUQueue queue_ = nullptr; @@ -112,6 +138,8 @@ class WebGPUGraph { std::vector output_staging_buffers_; std::vector dispatches_; + + size_t uniform_buffer_bytes_ = 0; }; } // namespace webgpu diff --git a/backends/webgpu/runtime/ops/add/BinaryOp.cpp b/backends/webgpu/runtime/ops/add/BinaryOp.cpp index 0fbde4c6997..5a6a8e07a51 100644 --- a/backends/webgpu/runtime/ops/add/BinaryOp.cpp +++ b/backends/webgpu/runtime/ops/add/BinaryOp.cpp @@ -64,6 +64,8 @@ void add_impl(WebGPUGraph& graph, const std::vector& args) { std::memcpy(mapped, ¶ms, sizeof(AddParams)); wgpuBufferUnmap(uniform_buffer); + graph.add_uniform_buffer_bytes(sizeof(AddParams)); + // Create shader module from built-in WGSL source WGPUShaderSourceWGSL wgsl_desc = {}; wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL; From f5b4f3f24704fc5efd40a22f0be82a101a6e8b9b Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Tue, 7 Apr 2026 10:28:04 -0500 Subject: [PATCH 9/9] [WebGPU] Add native test runner and build script C++ test runner that loads a .pte and runs inference via wgpu-native. End-to-end build script that exports a model, builds the native runtime, and validates output. 
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# End-to-end build and test script for the WebGPU backend (native via wgpu-native).
# Usage: bash backends/webgpu/test/test_build_webgpu.sh

# Consistency fix: the sibling setup-wgpu-native.sh uses `set -euo pipefail`;
# adopt the same strict mode here (fail on unset vars and pipeline failures,
# not just plain command failures).
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
EXECUTORCH_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)"
PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE:-python3}"
# nproc is Linux; fall back to sysctl on macOS.
NPROC=$(nproc 2>/dev/null || sysctl -n hw.ncpu)

# ── Step 1: Python export tests ──────────────────────────────────────────────

echo "=== Step 1: Run Python export test ==="
$PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/add/test_add.py" -v

# ── Step 2: Export .pte model ────────────────────────────────────────────────

echo "=== Step 2: Export test model ==="
PTE_MODEL="/tmp/webgpu_add_test.pte"
cd "${EXECUTORCH_ROOT}"
$PYTHON_EXECUTABLE -c "
from executorch.backends.webgpu.test.ops.add.test_add import export_add_model
export_add_model('${PTE_MODEL}')
"

# ── Step 3: Native build + test (wgpu-native) ────────────────────────────────

WGPU_DIR="${EXECUTORCH_ROOT}/backends/webgpu/third-party/wgpu-native"

# Auto-download wgpu-native if not present.
if [[ ! -d "${WGPU_DIR}/lib" ]]; then
  echo "=== Installing wgpu-native ==="
  bash "${EXECUTORCH_ROOT}/backends/webgpu/scripts/setup-wgpu-native.sh"
fi

echo "=== Step 3: Native build with wgpu-native ==="
NATIVE_BUILD_DIR="${EXECUTORCH_ROOT}/cmake-out-webgpu-native"
rm -rf "${NATIVE_BUILD_DIR}"

cmake \
  -DEXECUTORCH_BUILD_WEBGPU=ON \
  -DEXECUTORCH_BUILD_WEBGPU_TEST=ON \
  -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
  -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
  -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
  -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
  -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
  -DCMAKE_BUILD_TYPE=Release \
  -B "${NATIVE_BUILD_DIR}" \
  "${EXECUTORCH_ROOT}"

cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_native_test -j"${NPROC}"

echo "=== Step 4: Run native test ==="
WEBGPU_TEST_MODEL="${PTE_MODEL}" \
  "${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_native_test"

echo "=== Done ==="
+ */ + +#include +#include +#include + +#include +#include +#include +#include + +using namespace executorch::backends::webgpu; +using namespace executorch::extension; +using namespace executorch::runtime; + +static bool test_single_add(const std::string& model_path) { + printf("\n--- Test: single add (1024x1024) ---\n"); + + Module module(model_path); + auto err = module.load_forward(); + if (err != Error::Ok) { + printf("FAIL: could not load forward method (error %d)\n", (int)err); + return false; + } + printf("Model loaded: %s\n", model_path.c_str()); + + constexpr int dim = 1024; + constexpr int size = dim * dim; + + std::vector a_data(size); + std::vector b_data(size); + for (int i = 0; i < size; i++) { + a_data[i] = static_cast(i) * 1.0f; + b_data[i] = static_cast(i) * 2.0f; + } + + auto a = make_tensor_ptr({dim, dim}, std::vector(a_data)); + auto b = make_tensor_ptr({dim, dim}, std::vector(b_data)); + + auto result = module.forward({EValue(a), EValue(b)}); + if (!result.ok()) { + printf("FAIL: forward failed (error %d)\n", (int)result.error()); + return false; + } + + const auto& outputs = result.get(); + if (outputs.empty() || !outputs[0].isTensor()) { + printf("FAIL: no tensor output\n"); + return false; + } + + const auto& out_tensor = outputs[0].toTensor(); + const float* out_data = out_tensor.const_data_ptr(); + + float max_error = 0.0f; + int check_count = std::min(size, 1024); + for (int i = 0; i < check_count; i++) { + float expected = a_data[i] + b_data[i]; + float error = std::abs(out_data[i] - expected); + max_error = std::max(max_error, error); + } + + printf("Max error: %e (checked %d elements)\n", max_error, check_count); + if (max_error > 1e-3f) { + printf("FAIL: max error exceeds tolerance 1e-3\n"); + return false; + } + printf("PASS: single add test\n"); + return true; +} + +int main(int argc, char** argv) { + std::string model_path = "webgpu_add_test.pte"; + if (argc > 1) { + model_path = argv[1]; + } + if (const char* env = 
std::getenv("WEBGPU_TEST_MODEL")) { + model_path = env; + } + + WebGPUContext ctx; + try { + ctx = create_webgpu_context(); + } catch (const std::exception& e) { + printf("SKIP: %s\n", e.what()); + return 0; + } + + set_default_webgpu_context(&ctx); + printf("WebGPU device acquired (native)\n"); + + bool ok = test_single_add(model_path); + + set_default_webgpu_context(nullptr); + destroy_webgpu_context(ctx); + + if (!ok) { + return 1; + } + printf("\nAll tests passed\n"); + return 0; +}