From ae9482df056036247411db88e80eac9cd50b775e Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Tue, 7 Apr 2026 10:28:02 -0500 Subject: [PATCH 1/9] [WebGPU] Add delegate header parser Parses the VH00/VK00 FlatBuffer envelope from the Vulkan partitioner to extract the serialized graph payload. --- .../webgpu/runtime/WebGPUDelegateHeader.cpp | 99 +++++++++++++++++++ .../webgpu/runtime/WebGPUDelegateHeader.h | 32 ++++++ 2 files changed, 131 insertions(+) create mode 100644 backends/webgpu/runtime/WebGPUDelegateHeader.cpp create mode 100644 backends/webgpu/runtime/WebGPUDelegateHeader.h diff --git a/backends/webgpu/runtime/WebGPUDelegateHeader.cpp b/backends/webgpu/runtime/WebGPUDelegateHeader.cpp new file mode 100644 index 00000000000..d1e8b2110a7 --- /dev/null +++ b/backends/webgpu/runtime/WebGPUDelegateHeader.cpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +#include +#include + +namespace executorch { +namespace backends { +namespace webgpu { + +using executorch::runtime::Error; +using executorch::runtime::Result; + +namespace { + +struct ByteSlice { + size_t offset; + size_t size; +}; + +constexpr size_t kExpectedSize = 30; +constexpr char kExpectedMagic[4] = {'V', 'H', '0', '0'}; + +constexpr ByteSlice kMagic = {4, 4}; +constexpr ByteSlice kHeaderSize = {8, 2}; +constexpr ByteSlice kFlatbufferOffset = {10, 4}; +constexpr ByteSlice kFlatbufferSize = {14, 4}; +constexpr ByteSlice kBytesOffset = {18, 4}; +constexpr ByteSlice kBytesSize = {22, 8}; + +uint64_t getUInt64LE(const uint8_t* data) { + return (uint64_t)data[0] | ((uint64_t)data[1] << 8) | + ((uint64_t)data[2] << 16) | ((uint64_t)data[3] << 24) | + ((uint64_t)data[4] << 32) | ((uint64_t)data[5] << 40) | + ((uint64_t)data[6] << 48) | ((uint64_t)data[7] << 56); +} + +uint32_t getUInt32LE(const uint8_t* data) { + return (uint32_t)data[0] | ((uint32_t)data[1] << 8) | + ((uint32_t)data[2] << 16) | ((uint32_t)data[3] << 24); +} + +uint32_t getUInt16LE(const uint8_t* data) { + return (uint32_t)data[0] | ((uint32_t)data[1] << 8); +} + +} // namespace + +bool WebGPUDelegateHeader::is_valid() const { + if (header_size < kExpectedSize) { + return false; + } + if (flatbuffer_offset < header_size) { + return false; + } + if (flatbuffer_size == 0) { + return false; + } + if (bytes_offset < flatbuffer_offset + flatbuffer_size) { + return false; + } + return true; +} + +Result WebGPUDelegateHeader::parse(const void* data) { + const uint8_t* header_data = (const uint8_t*)data; + + const uint8_t* magic_start = header_data + kMagic.offset; + if (std::memcmp(magic_start, kExpectedMagic, kMagic.size) != 0) { + return Error::NotFound; + } + + WebGPUDelegateHeader header = WebGPUDelegateHeader{ + getUInt16LE(header_data + kHeaderSize.offset), + getUInt32LE(header_data + kFlatbufferOffset.offset), + getUInt32LE(header_data + kFlatbufferSize.offset), + 
getUInt32LE(header_data + kBytesOffset.offset), + getUInt64LE(header_data + kBytesSize.offset), + }; + + if (!header.is_valid()) { + return Error::InvalidArgument; + } + + return header; +} + +} // namespace webgpu +} // namespace backends +} // namespace executorch diff --git a/backends/webgpu/runtime/WebGPUDelegateHeader.h b/backends/webgpu/runtime/WebGPUDelegateHeader.h new file mode 100644 index 00000000000..6f2f65130c7 --- /dev/null +++ b/backends/webgpu/runtime/WebGPUDelegateHeader.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace executorch { +namespace backends { +namespace webgpu { + +struct WebGPUDelegateHeader { + bool is_valid() const; + + static executorch::runtime::Result parse( + const void* data); + + uint32_t header_size; + uint32_t flatbuffer_offset; + uint32_t flatbuffer_size; + uint32_t bytes_offset; + uint64_t bytes_size; +}; + +} // namespace webgpu +} // namespace backends +} // namespace executorch From 576afdce78d85898edafa59bf2d59c6cb3cdae11 Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Tue, 7 Apr 2026 10:28:02 -0500 Subject: [PATCH 2/9] [WebGPU] Add operator registry and aten.add shader Operator registry with registration macros, WGSL binary-add shader (plus inline C++ header), and the aten.add.Tensor implementation that creates a compute pipeline and records dispatch. 
--- .../webgpu/runtime/ops/OperatorRegistry.cpp | 43 +++++ .../webgpu/runtime/ops/OperatorRegistry.h | 62 +++++++ backends/webgpu/runtime/ops/add/BinaryOp.cpp | 168 ++++++++++++++++++ .../webgpu/runtime/ops/add/binary_add.wgsl | 18 ++ .../webgpu/runtime/ops/add/binary_add_wgsl.h | 41 +++++ 5 files changed, 332 insertions(+) create mode 100644 backends/webgpu/runtime/ops/OperatorRegistry.cpp create mode 100644 backends/webgpu/runtime/ops/OperatorRegistry.h create mode 100644 backends/webgpu/runtime/ops/add/BinaryOp.cpp create mode 100644 backends/webgpu/runtime/ops/add/binary_add.wgsl create mode 100644 backends/webgpu/runtime/ops/add/binary_add_wgsl.h diff --git a/backends/webgpu/runtime/ops/OperatorRegistry.cpp b/backends/webgpu/runtime/ops/OperatorRegistry.cpp new file mode 100644 index 00000000000..ddb16a45b97 --- /dev/null +++ b/backends/webgpu/runtime/ops/OperatorRegistry.cpp @@ -0,0 +1,43 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +namespace executorch { +namespace backends { +namespace webgpu { + +bool OperatorRegistry::has_op(const std::string& name) { + return table_.count(name) > 0; +} + +OpFunction& OperatorRegistry::get_op_fn(const std::string& name) { + const auto it = table_.find(name); + if (it == table_.end()) { + throw std::runtime_error( + "WebGPU OperatorRegistry: could not find operator: " + name); + } + return it->second; +} + +void OperatorRegistry::register_op( + const std::string& name, + const OpFunction& fn) { + table_.insert(std::make_pair(name, fn)); +} + +OperatorRegistry& webgpu_operator_registry() { + static OperatorRegistry registry; + return registry; +} + +} // namespace webgpu +} // namespace backends +} // namespace executorch diff --git a/backends/webgpu/runtime/ops/OperatorRegistry.h b/backends/webgpu/runtime/ops/OperatorRegistry.h new file mode 100644 index 00000000000..2e09b6f8140 --- /dev/null +++ b/backends/webgpu/runtime/ops/OperatorRegistry.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace executorch { +namespace backends { +namespace webgpu { + +class WebGPUGraph; + +using OpFunction = + std::function&)>; + +class OperatorRegistry final { + using OpTable = std::unordered_map; + OpTable table_; + + public: + bool has_op(const std::string& name); + OpFunction& get_op_fn(const std::string& name); + void register_op(const std::string& name, const OpFunction& fn); +}; + +class OperatorRegisterInit final { + using InitFn = void(); + + public: + explicit OperatorRegisterInit(InitFn* init_fn) { + init_fn(); + } +}; + +OperatorRegistry& webgpu_operator_registry(); + +#define WEBGPU_REGISTER_OP(name, function) \ + ::executorch::backends::webgpu::webgpu_operator_registry() \ + .register_op( \ + #name, \ + std::bind( \ + &function, std::placeholders::_1, \ + std::placeholders::_2)) + +#define WEBGPU_REGISTER_OPERATORS \ + static void register_webgpu_ops(); \ + static const ::executorch::backends::webgpu::OperatorRegisterInit \ + webgpu_reg(®ister_webgpu_ops); \ + static void register_webgpu_ops() + +} // namespace webgpu +} // namespace backends +} // namespace executorch diff --git a/backends/webgpu/runtime/ops/add/BinaryOp.cpp b/backends/webgpu/runtime/ops/add/BinaryOp.cpp new file mode 100644 index 00000000000..0fbde4c6997 --- /dev/null +++ b/backends/webgpu/runtime/ops/add/BinaryOp.cpp @@ -0,0 +1,168 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#include + +#include +#include + +namespace executorch { +namespace backends { +namespace webgpu { + +namespace { + +// Uniform buffer layout matching the WGSL Params struct. +// Must be 16-byte aligned for WebGPU uniform buffer requirements. 
+struct AddParams { + uint32_t num_elements; + float alpha; + uint32_t _pad[2]; // pad to 16 bytes +}; + +void add_impl(WebGPUGraph& graph, const std::vector& args) { + // aten.add.Tensor args: [in1, in2, alpha, out] + const int in1_id = args.at(0); + const int in2_id = args.at(1); + const int alpha_id = args.at(2); + const int out_id = args.at(3); + + WGPUDevice device = graph.device(); + + // Get alpha value (defaults to 1.0 if not a scalar) + float alpha = 1.0f; + if (graph.get_value_type(alpha_id) == WebGPUGraph::ValueType::Int) { + alpha = static_cast(graph.get_int(alpha_id)); + } else if (graph.get_value_type(alpha_id) == WebGPUGraph::ValueType::Double) { + alpha = static_cast(graph.get_double(alpha_id)); + } + + const auto& out_tensor = graph.get_tensor(out_id); + uint32_t num_elements = + static_cast(out_tensor.nbytes / sizeof(float)); + + // Create uniform buffer for params + AddParams params = {}; + params.num_elements = num_elements; + params.alpha = alpha; + + WGPUBufferDescriptor uniform_desc = {}; + uniform_desc.size = sizeof(AddParams); + uniform_desc.usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst; + uniform_desc.mappedAtCreation = true; + WGPUBuffer uniform_buffer = wgpuDeviceCreateBuffer(device, &uniform_desc); + void* mapped = wgpuBufferGetMappedRange(uniform_buffer, 0, sizeof(AddParams)); + std::memcpy(mapped, ¶ms, sizeof(AddParams)); + wgpuBufferUnmap(uniform_buffer); + + // Create shader module from built-in WGSL source + WGPUShaderSourceWGSL wgsl_desc = {}; + wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL; + wgsl_desc.code = {kBinaryAddWGSL, WGPU_STRLEN}; + + WGPUShaderModuleDescriptor shader_desc = {}; + shader_desc.nextInChain = &wgsl_desc.chain; + WGPUShaderModule shader = wgpuDeviceCreateShaderModule(device, &shader_desc); + + // Create bind group layout: 3 storage buffers + 1 uniform + WGPUBindGroupLayoutEntry entries[4] = {}; + + // input1 - storage buffer, read-only + entries[0].binding = 0; + entries[0].visibility = 
WGPUShaderStage_Compute; + entries[0].buffer.type = WGPUBufferBindingType_ReadOnlyStorage; + + // input2 - storage buffer, read-only + entries[1].binding = 1; + entries[1].visibility = WGPUShaderStage_Compute; + entries[1].buffer.type = WGPUBufferBindingType_ReadOnlyStorage; + + // output - storage buffer, read-write + entries[2].binding = 2; + entries[2].visibility = WGPUShaderStage_Compute; + entries[2].buffer.type = WGPUBufferBindingType_Storage; + + // params - uniform buffer + entries[3].binding = 3; + entries[3].visibility = WGPUShaderStage_Compute; + entries[3].buffer.type = WGPUBufferBindingType_Uniform; + + WGPUBindGroupLayoutDescriptor bgl_desc = {}; + bgl_desc.entryCount = 4; + bgl_desc.entries = entries; + WGPUBindGroupLayout bgl = + wgpuDeviceCreateBindGroupLayout(device, &bgl_desc); + + // Create pipeline layout + WGPUPipelineLayoutDescriptor pl_desc = {}; + pl_desc.bindGroupLayoutCount = 1; + pl_desc.bindGroupLayouts = &bgl; + WGPUPipelineLayout pipeline_layout = + wgpuDeviceCreatePipelineLayout(device, &pl_desc); + + // Create compute pipeline + WGPUComputePipelineDescriptor pipeline_desc = {}; + pipeline_desc.layout = pipeline_layout; + pipeline_desc.compute.module = shader; + pipeline_desc.compute.entryPoint = {"main", WGPU_STRLEN}; + WGPUComputePipeline pipeline = + wgpuDeviceCreateComputePipeline(device, &pipeline_desc); + + // Create bind group with actual buffers + const auto& in1_tensor = graph.get_tensor(in1_id); + const auto& in2_tensor = graph.get_tensor(in2_id); + + WGPUBindGroupEntry bg_entries[4] = {}; + + bg_entries[0].binding = 0; + bg_entries[0].buffer = in1_tensor.buffer; + bg_entries[0].size = in1_tensor.nbytes; + + bg_entries[1].binding = 1; + bg_entries[1].buffer = in2_tensor.buffer; + bg_entries[1].size = in2_tensor.nbytes; + + bg_entries[2].binding = 2; + bg_entries[2].buffer = out_tensor.buffer; + bg_entries[2].size = out_tensor.nbytes; + + bg_entries[3].binding = 3; + bg_entries[3].buffer = uniform_buffer; + 
bg_entries[3].size = sizeof(AddParams); + + WGPUBindGroupDescriptor bg_desc = {}; + bg_desc.layout = bgl; + bg_desc.entryCount = 4; + bg_desc.entries = bg_entries; + WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device, &bg_desc); + + uint32_t workgroup_count = + (num_elements + kBinaryAddWorkgroupSize - 1) / kBinaryAddWorkgroupSize; + + graph.add_dispatch({pipeline, bind_group, workgroup_count}); + + // Release intermediate objects (pipeline and bind_group are kept by dispatch) + wgpuShaderModuleRelease(shader); + wgpuBindGroupLayoutRelease(bgl); + wgpuPipelineLayoutRelease(pipeline_layout); + // uniform_buffer is kept alive by the bind group +} + +} // namespace + +WEBGPU_REGISTER_OPERATORS { + WEBGPU_REGISTER_OP(aten.add.Tensor, add_impl); +} + +} // namespace webgpu +} // namespace backends +} // namespace executorch diff --git a/backends/webgpu/runtime/ops/add/binary_add.wgsl b/backends/webgpu/runtime/ops/add/binary_add.wgsl new file mode 100644 index 00000000000..4d5ec97e6d3 --- /dev/null +++ b/backends/webgpu/runtime/ops/add/binary_add.wgsl @@ -0,0 +1,18 @@ +@group(0) @binding(0) var input1: array; +@group(0) @binding(1) var input2: array; +@group(0) @binding(2) var output: array; + +struct Params { + num_elements: u32, + alpha: f32, +} +@group(0) @binding(3) var params: Params; + +@compute @workgroup_size(256) +fn main(@builtin(global_invocation_id) gid: vec3) { + let idx = gid.x; + if (idx >= params.num_elements) { + return; + } + output[idx] = input1[idx] + params.alpha * input2[idx]; +} diff --git a/backends/webgpu/runtime/ops/add/binary_add_wgsl.h b/backends/webgpu/runtime/ops/add/binary_add_wgsl.h new file mode 100644 index 00000000000..cd94625dbdf --- /dev/null +++ b/backends/webgpu/runtime/ops/add/binary_add_wgsl.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +namespace executorch { +namespace backends { +namespace webgpu { + +// WGSL shader source for element-wise add: output = input1 + alpha * input2 +inline constexpr const char* kBinaryAddWGSL = R"( +@group(0) @binding(0) var input1: array; +@group(0) @binding(1) var input2: array; +@group(0) @binding(2) var output: array; + +struct Params { + num_elements: u32, + alpha: f32, +} +@group(0) @binding(3) var params: Params; + +@compute @workgroup_size(256) +fn main(@builtin(global_invocation_id) gid: vec3) { + let idx = gid.x; + if (idx >= params.num_elements) { + return; + } + output[idx] = input1[idx] + params.alpha * input2[idx]; +} +)"; + +inline constexpr uint32_t kBinaryAddWorkgroupSize = 256; + +} // namespace webgpu +} // namespace backends +} // namespace executorch From 9548f74939e25833daa682198773b340fb0c682e Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Tue, 7 Apr 2026 10:28:02 -0500 Subject: [PATCH 3/9] [WebGPU] Add compute graph Buffer management, pipeline creation, and compute dispatch. Parses the Vulkan FlatBuffer delegate blob and builds a runnable graph of compute passes. --- backends/webgpu/runtime/WebGPUGraph.cpp | 305 ++++++++++++++++++++++++ backends/webgpu/runtime/WebGPUGraph.h | 119 +++++++++ 2 files changed, 424 insertions(+) create mode 100644 backends/webgpu/runtime/WebGPUGraph.cpp create mode 100644 backends/webgpu/runtime/WebGPUGraph.h diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp new file mode 100644 index 00000000000..99b570ad7e5 --- /dev/null +++ b/backends/webgpu/runtime/WebGPUGraph.cpp @@ -0,0 +1,305 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +#include + +#include +#include + +namespace executorch { +namespace backends { +namespace webgpu { + +// vkgraph namespace is declared at global scope in the generated FlatBuffer header + +namespace { + +size_t vk_datatype_size(vkgraph::VkDataType dtype) { + switch (dtype) { + case vkgraph::VkDataType::BOOL: + case vkgraph::VkDataType::UINT8: + case vkgraph::VkDataType::INT8: + return 1; + case vkgraph::VkDataType::FLOAT16: + return 2; + case vkgraph::VkDataType::INT32: + case vkgraph::VkDataType::FLOAT32: + return 4; + case vkgraph::VkDataType::INT64: + case vkgraph::VkDataType::FLOAT64: + return 8; + default: + return 0; + } +} + +} // namespace + +WebGPUGraph::WebGPUGraph() = default; + +WebGPUGraph::~WebGPUGraph() { + for (auto& t : tensors_) { + if (t.buffer) { + wgpuBufferRelease(t.buffer); + } + } + for (auto& buf : output_staging_buffers_) { + if (buf) { + wgpuBufferRelease(buf); + } + } + for (auto& d : dispatches_) { + if (d.pipeline) { + wgpuComputePipelineRelease(d.pipeline); + } + if (d.bind_group) { + wgpuBindGroupRelease(d.bind_group); + } + } +} + +void WebGPUGraph::build( + const void* flatbuffer_data, + const uint8_t* constant_data) { + if (!device_) { + throw std::runtime_error( + "WebGPU device not available. " + "Call set_default_webgpu_context() before loading."); + } + queue_ = wgpuDeviceGetQueue(device_); + + const auto* graph = vkgraph::GetVkGraph(flatbuffer_data); + + // Phase 1: Create all values + const auto* values = graph->values(); + const int num_vals = values ? 
values->size() : 0; + value_types_.resize(num_vals, ValueType::Null); + tensors_.resize(num_vals); + ints_.resize(num_vals, 0); + doubles_.resize(num_vals, 0.0); + bools_.resize(num_vals, false); + + for (int i = 0; i < num_vals; i++) { + const auto* val = values->Get(i); + if (!val || val->value_type() == vkgraph::GraphTypes::NONE) { + value_types_[i] = ValueType::Null; + continue; + } + + switch (val->value_type()) { + case vkgraph::GraphTypes::VkTensor: { + value_types_[i] = ValueType::Tensor; + const auto* vk_tensor = val->value_as_VkTensor(); + auto& tensor = tensors_[i]; + + const auto* dims = vk_tensor->dims(); + size_t numel = 1; + if (dims) { + for (unsigned j = 0; j < dims->size(); j++) { + tensor.dims.push_back(static_cast(dims->Get(j))); + numel *= dims->Get(j); + } + } + tensor.nbytes = numel * vk_datatype_size(vk_tensor->datatype()); + + // Create GPU buffer + WGPUBufferDescriptor buf_desc = {}; + buf_desc.size = tensor.nbytes > 0 ? tensor.nbytes : 4; + buf_desc.usage = + WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst | + WGPUBufferUsage_CopySrc; + buf_desc.mappedAtCreation = false; + tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc); + + // Upload constant data if this tensor has a constant_id + int constant_id = vk_tensor->constant_id(); + if (constant_id >= 0 && constant_data) { + const auto* constants = graph->constants(); + if (constants && + constant_id < static_cast(constants->size())) { + const auto* vk_bytes = constants->Get(constant_id); + // Only upload from embedded bytes (not named data map) + if (vk_bytes->offset() != UINT64_MAX) { + const uint8_t* src = constant_data + vk_bytes->offset(); + wgpuQueueWriteBuffer( + queue_, tensor.buffer, 0, src, tensor.nbytes); + } + } + } + break; + } + case vkgraph::GraphTypes::Int: { + value_types_[i] = ValueType::Int; + ints_[i] = val->value_as_Int()->int_val(); + break; + } + case vkgraph::GraphTypes::Double: { + value_types_[i] = ValueType::Double; + doubles_[i] = 
val->value_as_Double()->double_val(); + break; + } + case vkgraph::GraphTypes::Bool: { + value_types_[i] = ValueType::Bool; + bools_[i] = val->value_as_Bool()->bool_val(); + break; + } + default: + value_types_[i] = ValueType::Null; + break; + } + } + + // Phase 2: Record input and output IDs + const auto* fb_input_ids = graph->input_ids(); + if (fb_input_ids) { + for (unsigned i = 0; i < fb_input_ids->size(); i++) { + input_ids_.push_back(static_cast(fb_input_ids->Get(i))); + } + } + const auto* fb_output_ids = graph->output_ids(); + if (fb_output_ids) { + for (unsigned i = 0; i < fb_output_ids->size(); i++) { + int oid = static_cast(fb_output_ids->Get(i)); + output_ids_.push_back(oid); + + // Create staging buffer for output readback + WGPUBufferDescriptor staging_desc = {}; + staging_desc.size = tensors_[oid].nbytes > 0 ? tensors_[oid].nbytes : 4; + staging_desc.usage = WGPUBufferUsage_MapRead | WGPUBufferUsage_CopyDst; + staging_desc.mappedAtCreation = false; + output_staging_buffers_.push_back( + wgpuDeviceCreateBuffer(device_, &staging_desc)); + } + } + + // Phase 3: Build operator dispatch chain + const auto* chain = graph->chain(); + if (chain) { + for (unsigned i = 0; i < chain->size(); i++) { + const auto* op_call = chain->Get(i); + std::string op_name = op_call->name()->str(); + + if (!webgpu_operator_registry().has_op(op_name)) { + throw std::runtime_error( + "WebGPU backend: unsupported op: " + op_name); + } + + const auto* fb_args = op_call->args(); + std::vector args; + if (fb_args) { + for (unsigned j = 0; j < fb_args->size(); j++) { + args.push_back(static_cast(fb_args->Get(j))); + } + } + + webgpu_operator_registry().get_op_fn(op_name)(*this, args); + } + } +} + +void WebGPUGraph::copy_inputs( + const std::vector>& inputs) { + for (size_t i = 0; i < inputs.size() && i < input_ids_.size(); i++) { + int tid = input_ids_[i]; + const auto& tensor = tensors_[tid]; + wgpuQueueWriteBuffer( + queue_, tensor.buffer, 0, inputs[i].first, inputs[i].second); + 
} +} + +void WebGPUGraph::execute() { + WGPUCommandEncoderDescriptor enc_desc = {}; + WGPUCommandEncoder encoder = + wgpuDeviceCreateCommandEncoder(device_, &enc_desc); + + WGPUComputePassDescriptor pass_desc = {}; + WGPUComputePassEncoder pass = + wgpuCommandEncoderBeginComputePass(encoder, &pass_desc); + + for (const auto& dispatch : dispatches_) { + wgpuComputePassEncoderSetPipeline(pass, dispatch.pipeline); + wgpuComputePassEncoderSetBindGroup(pass, 0, dispatch.bind_group, 0, nullptr); + wgpuComputePassEncoderDispatchWorkgroups( + pass, dispatch.workgroup_count_x, 1, 1); + } + + wgpuComputePassEncoderEnd(pass); + wgpuComputePassEncoderRelease(pass); + + // Copy outputs to staging buffers + for (size_t i = 0; i < output_ids_.size(); i++) { + int oid = output_ids_[i]; + wgpuCommandEncoderCopyBufferToBuffer( + encoder, + tensors_[oid].buffer, + 0, + output_staging_buffers_[i], + 0, + tensors_[oid].nbytes); + } + + WGPUCommandBufferDescriptor cmd_desc = {}; + WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, &cmd_desc); + wgpuQueueSubmit(queue_, 1, &cmd); + + wgpuCommandBufferRelease(cmd); + wgpuCommandEncoderRelease(encoder); +} + +namespace { + +struct MapCallbackData { + bool done = false; + WGPUMapAsyncStatus status = WGPUMapAsyncStatus_Error; +}; + +void buffer_map_callback( + WGPUMapAsyncStatus status, + WGPUStringView /*message*/, + void* userdata1, + void* /*userdata2*/) { + auto* data = static_cast(userdata1); + data->status = status; + data->done = true; +} + +} // namespace + +void WebGPUGraph::copy_outputs( + std::vector>& outputs) { + for (size_t i = 0; i < outputs.size() && i < output_staging_buffers_.size(); + i++) { + MapCallbackData cb_data; + WGPUBufferMapCallbackInfo cb_info = {}; + cb_info.mode = WGPUCallbackMode_AllowSpontaneous; + cb_info.callback = buffer_map_callback; + cb_info.userdata1 = &cb_data; + wgpuBufferMapAsync( + output_staging_buffers_[i], + WGPUMapMode_Read, + 0, + outputs[i].second, + cb_info); + + if (cb_data.status == 
WGPUMapAsyncStatus_Success) { + const void* mapped = + wgpuBufferGetConstMappedRange(output_staging_buffers_[i], 0, outputs[i].second); + std::memcpy(outputs[i].first, mapped, outputs[i].second); + wgpuBufferUnmap(output_staging_buffers_[i]); + } else { + throw std::runtime_error("WebGPU buffer map failed for output"); + } + } +} + +} // namespace webgpu +} // namespace backends +} // namespace executorch diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h new file mode 100644 index 00000000000..ce9c2b3a84b --- /dev/null +++ b/backends/webgpu/runtime/WebGPUGraph.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include +#include +#include + +namespace executorch { +namespace backends { +namespace webgpu { + +struct WebGPUTensor { + WGPUBuffer buffer = nullptr; + std::vector dims; + size_t nbytes = 0; +}; + +struct WebGPUDispatch { + WGPUComputePipeline pipeline = nullptr; + WGPUBindGroup bind_group = nullptr; + uint32_t workgroup_count_x = 1; +}; + +class WebGPUGraph { + public: + WebGPUGraph(); + ~WebGPUGraph(); + + // Build the graph from a deserialized VkGraph flatbuffer and constant data. + // The flatbuffer_data pointer must remain valid during build(). + void build(const void* flatbuffer_data, const uint8_t* constant_data); + + // Copy input tensor data from host pointers into GPU buffers. + void copy_inputs(const std::vector>& inputs); + + // Execute all recorded dispatches. + void execute(); + + // Copy output tensor data from GPU buffers back to host pointers. + // Uses mapAsync + ASYNCIFY in Wasm. 
+ void copy_outputs(std::vector>& outputs); + + const std::vector& input_ids() const { + return input_ids_; + } + const std::vector& output_ids() const { + return output_ids_; + } + + // Access tensors by value ID (used by op implementations). + WebGPUTensor& get_tensor(int id) { + return tensors_[id]; + } + const WebGPUTensor& get_tensor(int id) const { + return tensors_[id]; + } + + // Access scalar values stored during graph build. + double get_double(int id) const { + return doubles_[id]; + } + int64_t get_int(int id) const { + return ints_[id]; + } + + WGPUDevice device() const { + return device_; + } + WGPUQueue queue() const { + return queue_; + } + + void add_dispatch(WebGPUDispatch dispatch) { + dispatches_.push_back(dispatch); + } + + int num_values() const { + return static_cast(value_types_.size()); + } + + enum class ValueType { Tensor, Int, Double, Bool, Null, String }; + + ValueType get_value_type(int id) const { + return value_types_[id]; + } + + private: + WGPUDevice device_ = nullptr; + WGPUQueue queue_ = nullptr; + + // Flat arrays indexed by value ID. Only the relevant one is populated + // per ID based on value_types_. + std::vector value_types_; + std::vector tensors_; + std::vector ints_; + std::vector doubles_; + std::vector bools_; + + std::vector input_ids_; + std::vector output_ids_; + + // Staging buffers for reading back outputs (MapRead | CopyDst). + std::vector output_staging_buffers_; + + std::vector dispatches_; +}; + +} // namespace webgpu +} // namespace backends +} // namespace executorch From cdb38bf73df5e7db1bd739ca141630e1c66e1e34 Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Tue, 7 Apr 2026 10:28:02 -0500 Subject: [PATCH 4/9] [WebGPU] Add backend interface BackendInterface implementation that wires init/execute into ExecuTorch. Registers as "VulkanBackend" to consume .pte files from the Vulkan partitioner directly. 
--- backends/webgpu/runtime/WebGPUBackend.cpp | 139 ++++++++++++++++++++++ backends/webgpu/runtime/WebGPUBackend.h | 40 +++++++ 2 files changed, 179 insertions(+) create mode 100644 backends/webgpu/runtime/WebGPUBackend.cpp create mode 100644 backends/webgpu/runtime/WebGPUBackend.h diff --git a/backends/webgpu/runtime/WebGPUBackend.cpp b/backends/webgpu/runtime/WebGPUBackend.cpp new file mode 100644 index 00000000000..ec349d05bd2 --- /dev/null +++ b/backends/webgpu/runtime/WebGPUBackend.cpp @@ -0,0 +1,139 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#include + +#include +#include +#include + +#include + +namespace executorch { +namespace backends { +namespace webgpu { + +// vkgraph namespace is declared at global scope in the generated FlatBuffer header + +using executorch::runtime::ArrayRef; +using executorch::runtime::Backend; +using executorch::runtime::BackendExecutionContext; +using executorch::runtime::BackendInitContext; +using executorch::runtime::CompileSpec; +using executorch::runtime::DelegateHandle; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::register_backend; +using executorch::runtime::Result; +using executorch::runtime::Span; + +bool WebGPUBackend::is_available() const { + return true; +} + +Result WebGPUBackend::init( + BackendInitContext& context, + FreeableBuffer* processed, + ArrayRef compile_specs) const { + // Allocate graph on the runtime allocator + WebGPUGraph* graph = + context.get_runtime_allocator()->allocateInstance(); + if (graph == nullptr) { + return Error::MemoryAllocationFailed; + } + new (graph) WebGPUGraph(); + + // Parse header to locate flatbuffer and constant data + Result header = + 
WebGPUDelegateHeader::parse(processed->data()); + if (!header.ok()) { + ET_LOG(Error, "WebGPUDelegateHeader may be corrupt"); + return header.error(); + } + + const uint8_t* buffer_start = + reinterpret_cast(processed->data()); + const uint8_t* flatbuffer_data = buffer_start + header->flatbuffer_offset; + const uint8_t* constant_data = buffer_start + header->bytes_offset; + + // Verify FlatBuffer identifier + if (!vkgraph::VkGraphBufferHasIdentifier(flatbuffer_data)) { + ET_LOG( + Error, + "WebGPU delegate FlatBuffer identifier mismatch (expected VK00)"); + return Error::DelegateInvalidCompatibility; + } + + try { + graph->build(flatbuffer_data, constant_data); + } catch (const std::exception& e) { + ET_LOG(Error, "WebGPU graph build failed: %s", e.what()); + graph->~WebGPUGraph(); + return Error::DelegateInvalidCompatibility; + } + + processed->Free(); + + return graph; +} + +Error WebGPUBackend::execute( + BackendExecutionContext& context, + DelegateHandle* handle, + Span args) const { + WebGPUGraph* graph = static_cast(handle); + + const size_t num_inputs = graph->input_ids().size(); + const size_t num_outputs = graph->output_ids().size(); + + // Copy inputs from EValue tensors to GPU buffers + std::vector> inputs; + inputs.reserve(num_inputs); + for (size_t i = 0; i < num_inputs; i++) { + const auto& tensor = args[i]->toTensor(); + inputs.emplace_back(tensor.const_data_ptr(), tensor.nbytes()); + } + graph->copy_inputs(inputs); + + // Execute the compute graph + graph->execute(); + + // Copy outputs from GPU staging buffers to EValue tensor data pointers + std::vector> outputs; + outputs.reserve(num_outputs); + for (size_t i = 0; i < num_outputs; i++) { + const size_t arg_idx = num_inputs + i; + auto& tensor = args[arg_idx]->toTensor(); + outputs.emplace_back(tensor.mutable_data_ptr(), tensor.nbytes()); + } + graph->copy_outputs(outputs); + + return Error::Ok; +} + +void WebGPUBackend::destroy(DelegateHandle* handle) const { + if (handle != nullptr) { + 
WebGPUGraph* graph = static_cast(handle); + graph->~WebGPUGraph(); + } +} + +namespace { +auto cls = WebGPUBackend(); +Backend backend{"VulkanBackend", &cls}; +static auto success_with_compiler = register_backend(backend); +} // namespace + +} // namespace webgpu +} // namespace backends +} // namespace executorch diff --git a/backends/webgpu/runtime/WebGPUBackend.h b/backends/webgpu/runtime/WebGPUBackend.h new file mode 100644 index 00000000000..9c20a3d53be --- /dev/null +++ b/backends/webgpu/runtime/WebGPUBackend.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace executorch { +namespace backends { +namespace webgpu { + +class WebGPUBackend final : public ::executorch::runtime::BackendInterface { + public: + ~WebGPUBackend() override = default; + + bool is_available() const override; + + executorch::runtime::Result init( + executorch::runtime::BackendInitContext& context, + executorch::runtime::FreeableBuffer* processed, + executorch::runtime::ArrayRef + compile_specs) const override; + + executorch::runtime::Error execute( + executorch::runtime::BackendExecutionContext& context, + executorch::runtime::DelegateHandle* handle, + executorch::runtime::Span args) + const override; + + void destroy(executorch::runtime::DelegateHandle* handle) const override; +}; + +} // namespace webgpu +} // namespace backends +} // namespace executorch From 2a8a3cdec8d2007d4c22c3e2155e1b8b868cf5ea Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Thu, 9 Apr 2026 23:33:43 -0500 Subject: [PATCH 5/9] [WebGPU] Add backend build system CMake integration: backend library target, Vulkan FlatBuffer schema dependency, root build flags, and glslc guard fix. 
--- .gitignore | 1 + CMakeLists.txt | 5 + backends/vulkan/cmake/ShaderLibrary.cmake | 2 +- backends/webgpu/CMakeLists.txt | 56 +++++++++++ backends/webgpu/README.md | 113 ++++++++++++++++++++++ tools/cmake/preset/default.cmake | 3 + 6 files changed, 179 insertions(+), 1 deletion(-) create mode 100644 backends/webgpu/CMakeLists.txt create mode 100644 backends/webgpu/README.md diff --git a/.gitignore b/.gitignore index aeb4aa14e93..852cb1e8445 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ cmake-android-out/ cmake-ios-out/ cmake-out* cmake-out-android/ +backends/webgpu/third-party/ build-android/ build-x86/ build-hexagon/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 1f44b650aa1..171184b2e7e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1154,6 +1154,11 @@ if(EXECUTORCH_BUILD_VULKAN) list(APPEND _executorch_backends vulkan_backend vulkan_schema) endif() +if(EXECUTORCH_BUILD_WEBGPU) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/webgpu) + list(APPEND _executorch_backends webgpu_backend) +endif() + if(EXECUTORCH_BUILD_VGF) list(APPEND _executorch_backends vgf_backend) endif() diff --git a/backends/vulkan/cmake/ShaderLibrary.cmake b/backends/vulkan/cmake/ShaderLibrary.cmake index 19fbae4b2ab..a026660dd41 100644 --- a/backends/vulkan/cmake/ShaderLibrary.cmake +++ b/backends/vulkan/cmake/ShaderLibrary.cmake @@ -26,7 +26,7 @@ endif() find_program(GLSLC_PATH glslc PATHS $ENV{PATH}) -if(NOT GLSLC_PATH) +if(NOT GLSLC_PATH AND EXECUTORCH_BUILD_VULKAN) message( FATAL_ERROR "glslc from the Vulkan SDK must be installed to build the Vulkan backend. " diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt new file mode 100644 index 00000000000..28499e46e26 --- /dev/null +++ b/backends/webgpu/CMakeLists.txt @@ -0,0 +1,56 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +cmake_minimum_required(VERSION 3.19) + +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) +endif() + +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) + +# Ensure vulkan_schema is available even when EXECUTORCH_BUILD_VULKAN is OFF. +# The WebGPU backend reuses the Vulkan FlatBuffer serialization format. +if(NOT TARGET vulkan_schema) + # We need the schema generation from the Vulkan backend. Build only the + # schema target by including the Vulkan CMakeLists.txt. The full Vulkan + # backend will only build if EXECUTORCH_BUILD_VULKAN is ON (which gates the + # vulkan_backend target), but vulkan_schema is unconditionally defined. + add_subdirectory( + ${CMAKE_CURRENT_SOURCE_DIR}/../vulkan + ${CMAKE_CURRENT_BINARY_DIR}/_vulkan_schema + ) +endif() + +set(WEBGPU_SRCS + runtime/WebGPUBackend.cpp + runtime/WebGPUGraph.cpp + runtime/WebGPUDelegateHeader.cpp + runtime/ops/OperatorRegistry.cpp + runtime/ops/add/BinaryOp.cpp +) + +add_library(webgpu_backend ${WEBGPU_SRCS}) + +target_include_directories( + webgpu_backend + PRIVATE $ +) + +target_link_libraries(webgpu_backend PRIVATE vulkan_schema executorch_core) + +target_compile_options(webgpu_backend PRIVATE -fexceptions) + +# Link with --whole-archive for static registration of backend + ops +executorch_target_link_options_shared_lib(webgpu_backend) + +set_property(TARGET webgpu_backend PROPERTY CXX_STANDARD 17) + +install( + TARGETS webgpu_backend + EXPORT ExecuTorchTargets + DESTINATION ${CMAKE_INSTALL_LIBDIR} +) diff --git a/backends/webgpu/README.md b/backends/webgpu/README.md new file mode 100644 index 00000000000..c4886bbc64c --- /dev/null +++ b/backends/webgpu/README.md @@ -0,0 +1,113 @@ +# WebGPU Backend + +Run ExecuTorch models on the GPU via [WebGPU](https://www.w3.org/TR/webgpu/). 
The backend compiles delegated subgraphs into WGSL compute shaders executed natively through [wgpu-native](https://github.com/gfx-rs/wgpu-native) (Metal on macOS, Vulkan on Linux/Windows). + +> **Status: Prototype.** The backend supports a single operator today and is under active development. See [TODO.md](TODO.md) for the roadmap. + +## Architecture + +``` +PyTorch model + │ torch.export + ▼ +Exported Program + │ VulkanPartitioner (tags supported fp32 ops) + ▼ +Edge Dialect IR + │ VulkanBackend.preprocess (builds Vulkan FlatBuffer, buffer-only storage) + ▼ +.pte file (with VH00/VK00 delegate blob) + │ + ▼ +Native runtime (wgpu-native → Metal / Vulkan) + │ WebGPUGraph::build → creates GPU buffers, pipelines, bind groups + │ WebGPUGraph::execute → encodes + submits compute passes + ▼ +GPU output (mapped back to CPU via wgpuDevicePoll) +``` + +Key design choices: +- **Reuses Vulkan serialization** — the delegate blob is a Vulkan FlatBuffer (`VK00`) with a `VH00` header. All tensor storage is forced to `BUFFER` (WebGPU has no 3D storage textures). +- **Built-in WGSL shaders** — shader source is compiled as C++ string constants. Future work will embed fused shaders in the FlatBuffer for compile-time mega-kernel fusion. +- **No Python AOT code** — directly consumes .pte files exported via `VulkanPartitioner`. + +## Operator Support + +| Operator | WGSL Shader | Notes | +|---|---|---| +| `aten.add.Tensor` | `binary_add.wgsl` | Element-wise with alpha: `out = in1 + alpha * in2` | + +**Planned:** `sub`, `mul`, `relu`, `linear` (matmul), `softmax`, `layer_norm` + +## Quick Start + +### 1. Setup + +```bash +bash backends/webgpu/scripts/setup-wgpu-native.sh +``` + +This downloads prebuilt wgpu-native binaries for your platform. + +### 2. 
Export a model + +```python +import torch +from executorch.backends.vulkan import VulkanPartitioner +from executorch.exir import to_edge_transform_and_lower + +class AddModule(torch.nn.Module): + def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: + return a + b + +ep = torch.export.export(AddModule(), (torch.randn(4, 4), torch.randn(4, 4))) +et_program = to_edge_transform_and_lower( + ep, partitioner=[VulkanPartitioner()] +).to_executorch() + +with open("add.pte", "wb") as f: + f.write(et_program.buffer) +``` + +### 3. Build and run + +```bash +bash backends/webgpu/test/test_build_webgpu.sh +``` + +This runs Python export tests, exports a .pte, builds the native runtime, and validates GPU output. + +## Directory Structure + +``` +backends/webgpu/ +├── CMakeLists.txt +├── README.md +├── TODO.md +├── runtime/ +│ ├── WebGPUBackend.h/cpp # BackendInterface (init/execute) +│ ├── WebGPUGraph.h/cpp # GPU graph: buffers, pipelines, dispatch +│ ├── WebGPUDelegateHeader.h/cpp # VH00 header parser +│ ├── WebGPUDevice.h/cpp # wgpu-native device abstraction +│ └── ops/ +│ ├── OperatorRegistry.h/cpp # Op dispatch table +│ └── add/ +│ ├── BinaryOp.cpp # aten.add.Tensor implementation +│ ├── binary_add.wgsl # WGSL shader source +│ └── binary_add_wgsl.h # Shader as C++ string constant +├── scripts/ +│ └── setup-wgpu-native.sh # Download wgpu-native binaries +└── test/ + ├── conftest.py + ├── test_build_webgpu.sh # End-to-end build + test + ├── test_webgpu_native.cpp # C++ native test runner + └── ops/ + └── add/ + └── test_add.py # Python export tests +``` + +## Requirements + +- **macOS**: Metal-capable GPU +- **Linux**: Vulkan-capable GPU + drivers +- **Build**: CMake 3.19+, conda environment with ExecuTorch installed diff --git a/tools/cmake/preset/default.cmake b/tools/cmake/preset/default.cmake index 423194776bc..2c1be2dc9da 100644 --- a/tools/cmake/preset/default.cmake +++ b/tools/cmake/preset/default.cmake @@ -168,6 +168,9 @@ define_overridable_option( 
define_overridable_option( EXECUTORCH_BUILD_VULKAN "Build the Vulkan backend" BOOL OFF ) +define_overridable_option( + EXECUTORCH_BUILD_WEBGPU "Build the WebGPU backend" BOOL OFF +) define_overridable_option( EXECUTORCH_BUILD_PORTABLE_OPS "Build portable_ops library" BOOL ON ) From 0c155b01f50cb6464d240c2fe6e0f18e2928d1ea Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Tue, 7 Apr 2026 10:28:03 -0500 Subject: [PATCH 6/9] [WebGPU] Add export test using Vulkan partitioner Export tests verify fp32 torch.add models produce a .pte with VulkanBackend delegate: 2D/3D/4D shapes, broadcasting, self-add, scalar add, and chained adds. Includes TODO with architecture notes and next steps. --- backends/webgpu/TODO.md | 39 ++++++++++ backends/webgpu/test/conftest.py | 28 +++++++ backends/webgpu/test/ops/__init__.py | 0 backends/webgpu/test/ops/add/test_add.py | 99 ++++++++++++++++++++++++ 4 files changed, 166 insertions(+) create mode 100644 backends/webgpu/TODO.md create mode 100644 backends/webgpu/test/conftest.py create mode 100644 backends/webgpu/test/ops/__init__.py create mode 100644 backends/webgpu/test/ops/add/test_add.py diff --git a/backends/webgpu/TODO.md b/backends/webgpu/TODO.md new file mode 100644 index 00000000000..02259d5c3eb --- /dev/null +++ b/backends/webgpu/TODO.md @@ -0,0 +1,39 @@ +# WebGPU Backend — TODO + +## Current State (Prototype) +- Single op: `aten.add.Tensor` (fp32, buffer storage) +- No Python AOT code — directly consumes Vulkan delegate (.pte exported via VulkanPartitioner) +- Reuses Vulkan FlatBuffer format (VH00 header + VK00 payload) +- Registers as `"VulkanBackend"` at runtime — mutually exclusive with Vulkan backend at link time +- Built-in WGSL shaders (not embedded in .pte) + +## Architecture +``` +VulkanPartitioner (Python) → VkGraphBuilder → VK00 FlatBuffer → .pte + → WebGPU Runtime: registers as "VulkanBackend", parses VH00/VK00 + → WebGPUGraph::build → GPU buffers/pipelines/bind groups + → WebGPUGraph::execute → encode + submit 
compute passes +``` + +Adding a new op requires only C++ runtime work: +1. WGSL shader + header +2. C++ op implementation (read args from VkGraph, create pipeline, record dispatch) +3. Register in CMakeLists.txt +4. Test with VulkanPartitioner export + +## Performance: Command Encoding Overhead +WebGPU `GPUCommandBuffer` is single-use (no equivalent to Vulkan's cached command lists). +Per-dispatch API call cost adds up for large graphs. + +**Primary mitigation: mega-kernel fusion.** Generate fused WGSL shaders for chains of +element-wise ops (add→relu→mul→clamp) at compile time. Embed via the existing +`shaders: [VkBytes]` field in schema.fbs. + +## Next Steps +1. **More ops**: sub, mul, relu, linear (matmul), softmax, layer_norm +2. **fp16 support**: Feature-detect `shader-f16`, fallback to fp32 +3. **Buffer pooling**: Reuse GPU buffers to avoid OOM at scale +4. **Pipeline caching**: Cache compiled pipelines across runs +5. **Profiling**: Wire WebGPU timestamp queries into ETDump/EventTracer +6. **LLM support**: KV cache management, Flash Attention in WGSL, quantized ops (int4/int8) +7. **Browser/JS runtime**: Emscripten build, JS harness, browser test page diff --git a/backends/webgpu/test/conftest.py b/backends/webgpu/test/conftest.py new file mode 100644 index 00000000000..b275b70d6f6 --- /dev/null +++ b/backends/webgpu/test/conftest.py @@ -0,0 +1,28 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Workaround for PyTorch 2.11 bug where LeafSpec dataclass fields +# (type, _context, _children) are not initialized by the C++ constructor, +# causing AttributeError in run_decompositions and copy.deepcopy. 
+import dataclasses + +from torch.utils._pytree import LeafSpec + + +def _leafspec_getattr(self, name): # type: ignore[no-untyped-def] + for f in dataclasses.fields(type(self)): + if f.name == name: + if f.default is not dataclasses.MISSING: + return f.default + elif f.default_factory is not dataclasses.MISSING: + val = f.default_factory() + object.__setattr__(self, name, val) + return val + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") + + +if not hasattr(LeafSpec(), "type"): + LeafSpec.__getattr__ = _leafspec_getattr diff --git a/backends/webgpu/test/ops/__init__.py b/backends/webgpu/test/ops/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/backends/webgpu/test/ops/add/test_add.py b/backends/webgpu/test/ops/add/test_add.py new file mode 100644 index 00000000000..5a91ac05755 --- /dev/null +++ b/backends/webgpu/test/ops/add/test_add.py @@ -0,0 +1,99 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
import unittest

import torch
from executorch.backends.vulkan import VulkanPartitioner
from executorch.exir import to_edge_transform_and_lower


class AddModule(torch.nn.Module):
    """out = a + b (element-wise, with broadcasting)."""

    def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
        return a + b


class AddSelfModule(torch.nn.Module):
    """out = x + x (both operands alias the same input)."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + x


class AddScalarModule(torch.nn.Module):
    """out = x + 3.0 (tensor + Python scalar)."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + 3.0


class AddChainedModule(torch.nn.Module):
    """Three dependent adds, exercising multi-dispatch graphs."""

    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        z = x + y
        z = z + x
        z = z + y
        return z


class TestAdd(unittest.TestCase):
    """fp32 torch.add export tests — uses VulkanPartitioner since the WebGPU
    runtime directly consumes the Vulkan delegate (VK00 FlatBuffer)."""

    def _export_and_check(self, model, example_inputs) -> None:
        """Export `model`, lower via VulkanPartitioner, and assert the
        resulting program carries a VulkanBackend delegate blob."""
        ep = torch.export.export(model, example_inputs)
        et_program = to_edge_transform_and_lower(
            ep, partitioner=[VulkanPartitioner()]
        ).to_executorch()

        # Fix: the original used a nested loop whose `break` only exited
        # the inner loop; a flat any() expresses the intent directly.
        found_vulkan = any(
            delegate.id == "VulkanBackend"
            for plan in et_program.executorch_program.execution_plan
            for delegate in plan.delegates
        )
        self.assertTrue(found_vulkan, "Expected VulkanBackend delegate in .pte")
        self.assertGreater(len(et_program.buffer), 100)

    def test_add_2d(self) -> None:
        self._export_and_check(AddModule(), (torch.randn(4, 4), torch.randn(4, 4)))

    def test_add_3d(self) -> None:
        self._export_and_check(
            AddModule(), (torch.randn(2, 3, 4), torch.randn(2, 3, 4))
        )

    def test_add_4d(self) -> None:
        self._export_and_check(
            AddModule(), (torch.randn(1, 2, 3, 4), torch.randn(1, 2, 3, 4))
        )

    def test_add_broadcast_last_dim(self) -> None:
        self._export_and_check(AddModule(), (torch.randn(4, 4), torch.randn(4, 1)))

    def test_add_broadcast_first_dim(self) -> None:
        self._export_and_check(AddModule(), (torch.randn(4, 4), torch.randn(1, 4)))

    def test_add_self(self) -> None:
        self._export_and_check(AddSelfModule(), (torch.randn(4, 4),))

    def test_add_scalar(self) -> None:
        self._export_and_check(AddScalarModule(), (torch.randn(4, 4),))

    def test_add_chained(self) -> None:
        self._export_and_check(
            AddChainedModule(), (torch.randn(4, 4), torch.randn(4, 4))
        )


def export_add_model(output_path: str) -> None:
    """Export a simple add model to .pte for native runtime testing.

    The 1024x1024 shape matches the size checked by test_webgpu_native.cpp.
    """
    model = AddModule()
    example_inputs = (torch.randn(1024, 1024), torch.randn(1024, 1024))
    ep = torch.export.export(model, example_inputs)
    et_program = to_edge_transform_and_lower(
        ep, partitioner=[VulkanPartitioner()]
    ).to_executorch()
    with open(output_path, "wb") as f:
        f.write(et_program.buffer)
    print(f"Exported {output_path}")


if __name__ == "__main__":
    unittest.main()
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

// WebGPUDevice.cpp — synchronous wrapper around wgpu-native's asynchronous
// instance/adapter/device acquisition, used by WebGPUGraph::build() when no
// device has been injected explicitly.

// NOTE(review): the original include targets were stripped in transit;
// reconstructed from the symbols used below (fprintf, std::runtime_error).
#include <executorch/backends/webgpu/runtime/WebGPUDevice.h>

#include <cstdint>
#include <cstdio>
#include <stdexcept>

namespace executorch {
namespace backends {
namespace webgpu {

namespace {

// Result slot filled by the async adapter-request callback.
struct AdapterResult {
  WGPUAdapter adapter = nullptr;
  bool done = false; // set even on failure so the caller's poll loop exits
};

// Result slot filled by the async device-request callback.
struct DeviceResult {
  WGPUDevice device = nullptr;
  bool done = false;
};

// Callback for wgpuInstanceRequestAdapter; `userdata1` is an AdapterResult*.
// Logs on failure, and always marks `done` so the poll loop terminates.
void on_adapter_request(
    WGPURequestAdapterStatus status,
    WGPUAdapter adapter,
    WGPUStringView message,
    void* userdata1,
    void* /*userdata2*/) {
  auto* result = static_cast<AdapterResult*>(userdata1);
  if (status == WGPURequestAdapterStatus_Success) {
    result->adapter = adapter;
  } else {
    // WGPUStringView is not NUL-terminated: print with an explicit length.
    fprintf(
        stderr,
        "WebGPU adapter request failed (status %d): %.*s\n",
        static_cast<int>(status),
        static_cast<int>(message.length),
        message.data);
  }
  result->done = true;
}

// Callback for wgpuAdapterRequestDevice; mirrors on_adapter_request.
void on_device_request(
    WGPURequestDeviceStatus status,
    WGPUDevice device,
    WGPUStringView message,
    void* userdata1,
    void* /*userdata2*/) {
  auto* result = static_cast<DeviceResult*>(userdata1);
  if (status == WGPURequestDeviceStatus_Success) {
    result->device = device;
  } else {
    fprintf(
        stderr,
        "WebGPU device request failed (status %d): %.*s\n",
        static_cast<int>(status),
        static_cast<int>(message.length),
        message.data);
  }
  result->done = true;
}

// Uncaptured-error callback installed on the device: logs and continues.
void on_device_error(
    WGPUDevice const* /*device*/,
    WGPUErrorType type,
    WGPUStringView message,
    void* /*userdata1*/,
    void* /*userdata2*/) {
  fprintf(
      stderr,
      "WebGPU device error (type %d): %.*s\n",
      static_cast<int>(type),
      static_cast<int>(message.length),
      message.data);
}

} // namespace

// Acquires instance → adapter → device → queue, blocking on each async step
// by spinning on wgpuInstanceProcessEvents. Throws std::runtime_error on
// failure, releasing any partially acquired handles first.
WebGPUContext create_webgpu_context() {
  WebGPUContext ctx;

  ctx.instance = wgpuCreateInstance(nullptr);
  if (!ctx.instance) {
    throw std::runtime_error("Failed to create WebGPU instance");
  }

  // Request adapter using AllowSpontaneous mode (fires during
  // wgpuInstanceProcessEvents or any other API call).
  AdapterResult adapter_result;
  WGPURequestAdapterCallbackInfo adapter_cb = {};
  adapter_cb.mode = WGPUCallbackMode_AllowSpontaneous;
  adapter_cb.callback = on_adapter_request;
  adapter_cb.userdata1 = &adapter_result;

  wgpuInstanceRequestAdapter(ctx.instance, nullptr, adapter_cb);
  while (!adapter_result.done) {
    wgpuInstanceProcessEvents(ctx.instance);
  }

  if (!adapter_result.adapter) {
    wgpuInstanceRelease(ctx.instance);
    ctx.instance = nullptr;
    throw std::runtime_error(
        "Failed to get WebGPU adapter. "
        "Ensure a GPU with Vulkan (Linux) or Metal (macOS) is available.");
  }
  ctx.adapter = adapter_result.adapter;

  // Request device; install the uncaptured-error callback up front so
  // later validation errors are reported rather than silently dropped.
  DeviceResult device_result;
  WGPURequestDeviceCallbackInfo device_cb = {};
  device_cb.mode = WGPUCallbackMode_AllowSpontaneous;
  device_cb.callback = on_device_request;
  device_cb.userdata1 = &device_result;

  WGPUDeviceDescriptor device_desc = {};
  device_desc.uncapturedErrorCallbackInfo.callback = on_device_error;

  wgpuAdapterRequestDevice(ctx.adapter, &device_desc, device_cb);
  while (!device_result.done) {
    wgpuInstanceProcessEvents(ctx.instance);
  }

  if (!device_result.device) {
    // Unwind in reverse acquisition order before throwing.
    wgpuAdapterRelease(ctx.adapter);
    wgpuInstanceRelease(ctx.instance);
    ctx.adapter = nullptr;
    ctx.instance = nullptr;
    throw std::runtime_error("Failed to get WebGPU device");
  }
  ctx.device = device_result.device;
  ctx.queue = wgpuDeviceGetQueue(ctx.device);

  return ctx;
}

namespace {
// Process-wide context consumed by WebGPUGraph::build() when the graph has
// no explicitly injected device. NOTE(review): no synchronization — assumes
// it is set once at startup before any graph is built; confirm with callers.
WebGPUContext* g_default_context = nullptr;
} // namespace

void set_default_webgpu_context(WebGPUContext* ctx) {
  g_default_context = ctx;
}

WebGPUContext* get_default_webgpu_context() {
  return g_default_context;
}

// Releases all handles in reverse acquisition order (queue, device,
// adapter, instance), nulling each so a second call is a no-op.
void destroy_webgpu_context(WebGPUContext& ctx) {
  if (ctx.queue) {
    wgpuQueueRelease(ctx.queue);
    ctx.queue = nullptr;
  }
  if (ctx.device) {
    wgpuDeviceRelease(ctx.device);
    ctx.device = nullptr;
  }
  if (ctx.adapter) {
    wgpuAdapterRelease(ctx.adapter);
    ctx.adapter = nullptr;
  }
  if (ctx.instance) {
    wgpuInstanceRelease(ctx.instance);
    ctx.instance = nullptr;
  }
}

} // namespace webgpu
} // namespace backends
} // namespace executorch

// ───────────────────────────────────────────────────────────────────────────
// backends/webgpu/runtime/WebGPUDevice.h
// ───────────────────────────────────────────────────────────────────────────

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

// NOTE(review): include target stripped in transit; reconstructed — the
// wgpu-native C header providing WGPUInstance/WGPUAdapter/etc.
#include <webgpu.h>

namespace executorch {
namespace backends {
namespace webgpu {

// Bundle of the four wgpu-native handles needed to run compute work.
// All members are owning; release via destroy_webgpu_context().
struct WebGPUContext {
  WGPUInstance instance = nullptr;
  WGPUAdapter adapter = nullptr;
  WGPUDevice device = nullptr;
  WGPUQueue queue = nullptr;
};

// Blocking acquisition of instance/adapter/device/queue; throws on failure.
WebGPUContext create_webgpu_context();
// Releases all handles in `ctx` (idempotent).
void destroy_webgpu_context(WebGPUContext& ctx);

// Global context used by WebGPUGraph::build() when no device is pre-set.
void set_default_webgpu_context(WebGPUContext* ctx);
WebGPUContext* get_default_webgpu_context();

} // namespace webgpu
} // namespace backends
} // namespace executorch
# Usage: bash backends/webgpu/scripts/setup-wgpu-native.sh

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
WGPU_DIR="${SCRIPT_DIR}/../third-party/wgpu-native"

WGPU_VERSION="v27.0.4.0"
WGPU_BASE_URL="https://github.com/gfx-rs/wgpu-native/releases/download/${WGPU_VERSION}"

# Idempotent: skip the download when the static library is already present.
if [[ -f "${WGPU_DIR}/lib/libwgpu_native.a" ]]; then
  echo "wgpu-native already installed at ${WGPU_DIR}"
  exit 0
fi

OS="$(uname -s)"
ARCH="$(uname -m)"

case "${OS}" in
  Darwin) PLATFORM="macos" ;;
  Linux) PLATFORM="linux" ;;
  *)
    echo "Unsupported OS: ${OS}"
    exit 1
    ;;
esac

case "${ARCH}" in
  x86_64) WGPU_ARCH="x86_64" ;;
  aarch64|arm64) WGPU_ARCH="aarch64" ;;
  *)
    echo "Unsupported architecture: ${ARCH}"
    exit 1
    ;;
esac

ZIP_NAME="wgpu-${PLATFORM}-${WGPU_ARCH}-release.zip"
URL="${WGPU_BASE_URL}/${ZIP_NAME}"

echo "Downloading wgpu-native ${WGPU_VERSION} for ${PLATFORM}-${WGPU_ARCH}..."
TMPDIR_DL="$(mktemp -d)"
# Single quotes defer expansion to trap execution time; inner quotes protect
# unusual temp-dir paths (the original expanded unquoted at definition time).
trap 'rm -rf "${TMPDIR_DL}"' EXIT

# Fix: -f makes curl fail on HTTP errors (e.g. 404 for an unknown release)
# instead of silently saving the error page and letting unzip fail with a
# confusing message later.
curl -fsSL "${URL}" -o "${TMPDIR_DL}/${ZIP_NAME}"

mkdir -p "${WGPU_DIR}"
unzip -qo "${TMPDIR_DL}/${ZIP_NAME}" -d "${WGPU_DIR}"

echo "Installed wgpu-native to ${WGPU_DIR}"
ls -la "${WGPU_DIR}/lib/"
--- backends/webgpu/CMakeLists.txt | 68 ++++++++++++++++++++ backends/webgpu/runtime/WebGPUGraph.cpp | 29 +++++++++ backends/webgpu/runtime/WebGPUGraph.h | 28 ++++++++ backends/webgpu/runtime/ops/add/BinaryOp.cpp | 2 + 4 files changed, 127 insertions(+) diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt index 28499e46e26..88b220bf922 100644 --- a/backends/webgpu/CMakeLists.txt +++ b/backends/webgpu/CMakeLists.txt @@ -29,6 +29,7 @@ set(WEBGPU_SRCS runtime/WebGPUBackend.cpp runtime/WebGPUGraph.cpp runtime/WebGPUDelegateHeader.cpp + runtime/WebGPUDevice.cpp runtime/ops/OperatorRegistry.cpp runtime/ops/add/BinaryOp.cpp ) @@ -42,6 +43,37 @@ target_include_directories( target_link_libraries(webgpu_backend PRIVATE vulkan_schema executorch_core) +# Native build: link against wgpu-native +set(WGPU_NATIVE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/third-party/wgpu-native" + CACHE PATH "Path to wgpu-native installation") + +if(NOT EXISTS "${WGPU_NATIVE_DIR}/lib/libwgpu_native.a") + message(FATAL_ERROR + "wgpu-native not found at ${WGPU_NATIVE_DIR}. 
" + "Run: bash backends/webgpu/scripts/setup-wgpu-native.sh") +endif() + +add_library(wgpu_native STATIC IMPORTED) +set_target_properties(wgpu_native PROPERTIES + IMPORTED_LOCATION "${WGPU_NATIVE_DIR}/lib/libwgpu_native.a" +) + +target_include_directories(webgpu_backend + PUBLIC $ +) +target_link_libraries(webgpu_backend PRIVATE wgpu_native) + +if(APPLE) + target_link_libraries(webgpu_backend PRIVATE + "-framework Metal" + "-framework QuartzCore" + "-framework CoreGraphics" + "-framework Foundation" + ) +else() + target_link_libraries(webgpu_backend PRIVATE dl m pthread) +endif() + target_compile_options(webgpu_backend PRIVATE -fexceptions) # Link with --whole-archive for static registration of backend + ops @@ -54,3 +86,39 @@ install( EXPORT ExecuTorchTargets DESTINATION ${CMAKE_INSTALL_LIBDIR} ) + +# Native test target +if(EXECUTORCH_BUILD_WEBGPU_TEST) + add_executable(webgpu_native_test test/test_webgpu_native.cpp) + + target_include_directories(webgpu_native_test + PRIVATE + $ + "${WGPU_NATIVE_DIR}/include" + ) + + target_link_libraries(webgpu_native_test + PRIVATE + webgpu_backend + wgpu_native + executorch_core + extension_module_static + extension_data_loader + extension_tensor + portable_kernels + portable_ops_lib + ) + + if(APPLE) + target_link_libraries(webgpu_native_test PRIVATE + "-framework Metal" + "-framework QuartzCore" + "-framework CoreGraphics" + ) + else() + target_link_libraries(webgpu_native_test PRIVATE dl m pthread) + endif() + + target_compile_options(webgpu_native_test PRIVATE -fexceptions) + set_property(TARGET webgpu_native_test PROPERTY CXX_STANDARD 17) +endif() diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp index 99b570ad7e5..df2bfb1e688 100644 --- a/backends/webgpu/runtime/WebGPUGraph.cpp +++ b/backends/webgpu/runtime/WebGPUGraph.cpp @@ -11,6 +11,9 @@ #include +#include +#include + #include #include @@ -69,6 +72,13 @@ WebGPUGraph::~WebGPUGraph() { void WebGPUGraph::build( const void* 
flatbuffer_data, const uint8_t* constant_data) { + if (!device_) { + auto* ctx = get_default_webgpu_context(); + if (ctx) { + device_ = ctx->device; + instance_ = ctx->instance; + } + } if (!device_) { throw std::runtime_error( "WebGPU device not available. " @@ -289,6 +299,9 @@ void WebGPUGraph::copy_outputs( outputs[i].second, cb_info); + // Poll until the map callback fires. + wgpuDevicePoll(device_, true, nullptr); + if (cb_data.status == WGPUMapAsyncStatus_Success) { const void* mapped = wgpuBufferGetConstMappedRange(output_staging_buffers_[i], 0, outputs[i].second); @@ -300,6 +313,22 @@ void WebGPUGraph::copy_outputs( } } +WebGPUMemoryStats WebGPUGraph::memory_stats() const { + WebGPUMemoryStats stats; + for (size_t i = 0; i < value_types_.size(); i++) { + if (value_types_[i] == ValueType::Tensor && tensors_[i].nbytes > 0) { + stats.tensor_buffer_bytes += tensors_[i].nbytes; + stats.num_tensors++; + } + } + for (size_t i = 0; i < output_ids_.size(); i++) { + stats.staging_buffer_bytes += tensors_[output_ids_[i]].nbytes; + } + stats.uniform_buffer_bytes = uniform_buffer_bytes_; + stats.num_dispatches = static_cast(dispatches_.size()); + return stats; +} + } // namespace webgpu } // namespace backends } // namespace executorch diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h index ce9c2b3a84b..2d6996e9219 100644 --- a/backends/webgpu/runtime/WebGPUGraph.h +++ b/backends/webgpu/runtime/WebGPUGraph.h @@ -30,6 +30,18 @@ struct WebGPUDispatch { uint32_t workgroup_count_x = 1; }; +struct WebGPUMemoryStats { + size_t tensor_buffer_bytes = 0; + size_t staging_buffer_bytes = 0; + size_t uniform_buffer_bytes = 0; + int num_tensors = 0; + int num_dispatches = 0; + + size_t total_bytes() const { + return tensor_buffer_bytes + staging_buffer_bytes + uniform_buffer_bytes; + } +}; + class WebGPUGraph { public: WebGPUGraph(); @@ -83,6 +95,19 @@ class WebGPUGraph { dispatches_.push_back(dispatch); } + void 
add_uniform_buffer_bytes(size_t bytes) { + uniform_buffer_bytes_ += bytes; + } + + void set_instance(WGPUInstance instance) { + instance_ = instance; + } + void set_device(WGPUDevice device) { + device_ = device; + } + + WebGPUMemoryStats memory_stats() const; + int num_values() const { return static_cast(value_types_.size()); } @@ -94,6 +119,7 @@ class WebGPUGraph { } private: + WGPUInstance instance_ = nullptr; WGPUDevice device_ = nullptr; WGPUQueue queue_ = nullptr; @@ -112,6 +138,8 @@ class WebGPUGraph { std::vector output_staging_buffers_; std::vector dispatches_; + + size_t uniform_buffer_bytes_ = 0; }; } // namespace webgpu diff --git a/backends/webgpu/runtime/ops/add/BinaryOp.cpp b/backends/webgpu/runtime/ops/add/BinaryOp.cpp index 0fbde4c6997..5a6a8e07a51 100644 --- a/backends/webgpu/runtime/ops/add/BinaryOp.cpp +++ b/backends/webgpu/runtime/ops/add/BinaryOp.cpp @@ -64,6 +64,8 @@ void add_impl(WebGPUGraph& graph, const std::vector& args) { std::memcpy(mapped, ¶ms, sizeof(AddParams)); wgpuBufferUnmap(uniform_buffer); + graph.add_uniform_buffer_bytes(sizeof(AddParams)); + // Create shader module from built-in WGSL source WGPUShaderSourceWGSL wgsl_desc = {}; wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL; From f5b4f3f24704fc5efd40a22f0be82a101a6e8b9b Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Tue, 7 Apr 2026 10:28:04 -0500 Subject: [PATCH 9/9] [WebGPU] Add native test runner and build script C++ test runner that loads a .pte and runs inference via wgpu-native. End-to-end build script that exports a model, builds the native runtime, and validates output. 
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# End-to-end build and test script for the WebGPU backend (native via wgpu-native).
# Usage: bash backends/webgpu/test/test_build_webgpu.sh

# Consistency fix: the sibling setup-wgpu-native.sh uses `set -euo pipefail`;
# adopt the same strict mode here (fail on unset vars and pipeline failures,
# not just plain command failures).
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
EXECUTORCH_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)"
PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE:-python3}"
# nproc is Linux; fall back to sysctl on macOS.
NPROC=$(nproc 2>/dev/null || sysctl -n hw.ncpu)

# ── Step 1: Python export tests ──────────────────────────────────────────────

echo "=== Step 1: Run Python export test ==="
$PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/add/test_add.py" -v

# ── Step 2: Export .pte model ────────────────────────────────────────────────

echo "=== Step 2: Export test model ==="
PTE_MODEL="/tmp/webgpu_add_test.pte"
cd "${EXECUTORCH_ROOT}"
$PYTHON_EXECUTABLE -c "
from executorch.backends.webgpu.test.ops.add.test_add import export_add_model
export_add_model('${PTE_MODEL}')
"

# ── Step 3: Native build + test (wgpu-native) ────────────────────────────────

WGPU_DIR="${EXECUTORCH_ROOT}/backends/webgpu/third-party/wgpu-native"

# Auto-download wgpu-native if not present.
if [[ ! -d "${WGPU_DIR}/lib" ]]; then
  echo "=== Installing wgpu-native ==="
  bash "${EXECUTORCH_ROOT}/backends/webgpu/scripts/setup-wgpu-native.sh"
fi

echo "=== Step 3: Native build with wgpu-native ==="
NATIVE_BUILD_DIR="${EXECUTORCH_ROOT}/cmake-out-webgpu-native"
rm -rf "${NATIVE_BUILD_DIR}"

cmake \
  -DEXECUTORCH_BUILD_WEBGPU=ON \
  -DEXECUTORCH_BUILD_WEBGPU_TEST=ON \
  -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
  -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
  -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
  -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
  -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
  -DCMAKE_BUILD_TYPE=Release \
  -B "${NATIVE_BUILD_DIR}" \
  "${EXECUTORCH_ROOT}"

cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_native_test -j"${NPROC}"

echo "=== Step 4: Run native test ==="
WEBGPU_TEST_MODEL="${PTE_MODEL}" \
  "${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_native_test"

echo "=== Done ==="
+ */ + +#include +#include +#include + +#include +#include +#include +#include + +using namespace executorch::backends::webgpu; +using namespace executorch::extension; +using namespace executorch::runtime; + +static bool test_single_add(const std::string& model_path) { + printf("\n--- Test: single add (1024x1024) ---\n"); + + Module module(model_path); + auto err = module.load_forward(); + if (err != Error::Ok) { + printf("FAIL: could not load forward method (error %d)\n", (int)err); + return false; + } + printf("Model loaded: %s\n", model_path.c_str()); + + constexpr int dim = 1024; + constexpr int size = dim * dim; + + std::vector a_data(size); + std::vector b_data(size); + for (int i = 0; i < size; i++) { + a_data[i] = static_cast(i) * 1.0f; + b_data[i] = static_cast(i) * 2.0f; + } + + auto a = make_tensor_ptr({dim, dim}, std::vector(a_data)); + auto b = make_tensor_ptr({dim, dim}, std::vector(b_data)); + + auto result = module.forward({EValue(a), EValue(b)}); + if (!result.ok()) { + printf("FAIL: forward failed (error %d)\n", (int)result.error()); + return false; + } + + const auto& outputs = result.get(); + if (outputs.empty() || !outputs[0].isTensor()) { + printf("FAIL: no tensor output\n"); + return false; + } + + const auto& out_tensor = outputs[0].toTensor(); + const float* out_data = out_tensor.const_data_ptr(); + + float max_error = 0.0f; + int check_count = std::min(size, 1024); + for (int i = 0; i < check_count; i++) { + float expected = a_data[i] + b_data[i]; + float error = std::abs(out_data[i] - expected); + max_error = std::max(max_error, error); + } + + printf("Max error: %e (checked %d elements)\n", max_error, check_count); + if (max_error > 1e-3f) { + printf("FAIL: max error exceeds tolerance 1e-3\n"); + return false; + } + printf("PASS: single add test\n"); + return true; +} + +int main(int argc, char** argv) { + std::string model_path = "webgpu_add_test.pte"; + if (argc > 1) { + model_path = argv[1]; + } + if (const char* env = 
std::getenv("WEBGPU_TEST_MODEL")) { + model_path = env; + } + + WebGPUContext ctx; + try { + ctx = create_webgpu_context(); + } catch (const std::exception& e) { + printf("SKIP: %s\n", e.what()); + return 0; + } + + set_default_webgpu_context(&ctx); + printf("WebGPU device acquired (native)\n"); + + bool ok = test_single_add(model_path); + + set_default_webgpu_context(nullptr); + destroy_webgpu_context(ctx); + + if (!ok) { + return 1; + } + printf("\nAll tests passed\n"); + return 0; +}