diff --git a/include/infinicore/ops.hpp b/include/infinicore/ops.hpp
index 747735bcd..061bfbfd1 100644
--- a/include/infinicore/ops.hpp
+++ b/include/infinicore/ops.hpp
@@ -22,6 +22,7 @@
 #include "ops/cdist.hpp"
 #include "ops/conv2d.hpp"
 #include "ops/cross_entropy.hpp"
+#include "ops/deepseek_moe.hpp"
 #include "ops/embedding.hpp"
 #include "ops/flash_attention.hpp"
 #include "ops/fmin.hpp"
diff --git a/include/infinicore/ops/deepseek_moe.hpp b/include/infinicore/ops/deepseek_moe.hpp
new file mode 100644
index 000000000..e8e7ffef9
--- /dev/null
+++ b/include/infinicore/ops/deepseek_moe.hpp
@@ -0,0 +1,41 @@
+#pragma once
+
+#include "../device.hpp"
+#include "../graph/graph.hpp"
+#include "common/op.hpp"
+#include <vector>
+
+namespace infinicore::op {
+
+INFINICORE_GRAPH_OP_CLASS(
+    DeepseekMoe,
+    Tensor,
+    const Tensor &,
+    const Tensor &,
+    const Tensor &,
+    const std::vector<Tensor> &,
+    const std::vector<Tensor> &,
+    const std::vector<Tensor> &,
+    size_t,
+    size_t);
+
+Tensor deepseek_moe(const Tensor &hidden,
+                    const Tensor &topk_indices,
+                    const Tensor &topk_weights,
+                    const std::vector<Tensor> &gate_weights,
+                    const std::vector<Tensor> &up_weights,
+                    const std::vector<Tensor> &down_weights,
+                    size_t intermediate_size,
+                    size_t num_experts);
+
+void deepseek_moe_(Tensor out,
+                   const Tensor &hidden,
+                   const Tensor &topk_indices,
+                   const Tensor &topk_weights,
+                   const std::vector<Tensor> &gate_weights,
+                   const std::vector<Tensor> &up_weights,
+                   const std::vector<Tensor> &down_weights,
+                   size_t intermediate_size,
+                   size_t num_experts);
+
+} // namespace infinicore::op
diff --git a/include/infiniop.h b/include/infiniop.h
index dbd00d037..5f58805b9 100644
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -34,6 +34,7 @@
 #include "infiniop/ops/clip.h"
 #include "infiniop/ops/conv.h"
 #include "infiniop/ops/cross_entropy.h"
+#include "infiniop/ops/deepseek_moe.h"
 #include "infiniop/ops/dequant/per_tensor_dequant_int8.h"
 #include "infiniop/ops/dequantize_awq.h"
 #include "infiniop/ops/dequantize_gptq.h"
diff --git a/include/infiniop/ops/deepseek_moe.h b/include/infiniop/ops/deepseek_moe.h
new file mode 100644
index 000000000..ae55ae977
--- /dev/null
+++ b/include/infiniop/ops/deepseek_moe.h
@@ -0,0 +1,57 @@
+#ifndef __INFINIOP_DEEPSEEK_MOE_API_H__
+#define __INFINIOP_DEEPSEEK_MOE_API_H__
+
+#include "../operator_descriptor.h"
+
+#ifdef __cplusplus
+#include <cstddef>
+#else
+#include <stddef.h>
+#endif
+
+typedef struct InfiniopDescriptor *infiniopDeepseekMoeDescriptor_t;
+
+__INFINI_C __export infiniStatus_t infiniopCreateDeepseekMoeDescriptor(
+    infiniopHandle_t handle,
+    infiniopDeepseekMoeDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t hidden_desc,
+    infiniopTensorDescriptor_t topk_indices_desc,
+    infiniopTensorDescriptor_t topk_weights_desc,
+    size_t intermediate_size,
+    size_t num_experts);
+
+__INFINI_C __export infiniStatus_t infiniopGetDeepseekMoeWorkspaceSize(
+    infiniopDeepseekMoeDescriptor_t desc,
+    size_t *size);
+
+__INFINI_C __export infiniStatus_t infiniopDeepseekMoe(
+    infiniopDeepseekMoeDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *out,
+    const void *hidden,
+    const void *topk_indices,
+    const void *topk_weights,
+    const void *const *gate_weights,
+    const void *const *up_weights,
+    const void *const *down_weights,
+    void *stream);
+
+__INFINI_C __export infiniStatus_t infiniopDeepseekMoeWithDevicePtrs(
+    infiniopDeepseekMoeDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *out,
+    const void *hidden,
+    const void *topk_indices,
+    const void *topk_weights,
+    const void *gate_weight_ptrs,
+    const void *up_weight_ptrs,
+    const void *down_weight_ptrs,
+    void *stream);
+
+__INFINI_C __export infiniStatus_t infiniopDestroyDeepseekMoeDescriptor(
+    infiniopDeepseekMoeDescriptor_t desc);
+
+#endif
diff --git a/src/infinicore/ops/deepseek_moe/deepseek_moe.cc b/src/infinicore/ops/deepseek_moe/deepseek_moe.cc
new file mode 100644
index 000000000..82e1c56dd
--- /dev/null
+++ b/src/infinicore/ops/deepseek_moe/deepseek_moe.cc
@@ -0,0 +1,83 @@
+#include "infinicore/ops/deepseek_moe.hpp"
+#include "../../utils.hpp"
+#include <stdexcept>
+
+namespace infinicore::op {
+
+INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(DeepseekMoe);
+
+namespace {
+
+void check_weights(const std::vector<Tensor> &gate_weights,
+                   const std::vector<Tensor> &up_weights,
+                   const std::vector<Tensor> &down_weights,
+                   size_t num_experts) {
+    if (gate_weights.size() != num_experts || up_weights.size() != num_experts || down_weights.size() != num_experts) {
+        throw std::runtime_error("DeepseekMoe: expert weight vector size mismatch");
+    }
+}
+
+} // namespace
+
+DeepseekMoe::DeepseekMoe(Tensor out,
+                         const Tensor &hidden,
+                         const Tensor &topk_indices,
+                         const Tensor &topk_weights,
+                         const std::vector<Tensor> &gate_weights,
+                         const std::vector<Tensor> &up_weights,
+                         const std::vector<Tensor> &down_weights,
+                         size_t intermediate_size,
+                         size_t num_experts) {
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, hidden, topk_indices, topk_weights);
+    check_weights(gate_weights, up_weights, down_weights, num_experts);
+    for (size_t i = 0; i < num_experts; ++i) {
+        INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, gate_weights[i], up_weights[i], down_weights[i]);
+    }
+    INFINICORE_GRAPH_OP_DISPATCH(out->device().getType(),
+                                 out, hidden, topk_indices, topk_weights,
+                                 gate_weights, up_weights, down_weights,
+                                 intermediate_size, num_experts);
+}
+
+void DeepseekMoe::execute(Tensor out,
+                          const Tensor &hidden,
+                          const Tensor &topk_indices,
+                          const Tensor &topk_weights,
+                          const std::vector<Tensor> &gate_weights,
+                          const std::vector<Tensor> &up_weights,
+                          const std::vector<Tensor> &down_weights,
+                          size_t intermediate_size,
+                          size_t num_experts) {
+    INFINICORE_GRAPH_OP_RECORD_OR_RUN(
+        DeepseekMoe,
+        out, hidden, topk_indices, topk_weights,
+        gate_weights, up_weights, down_weights,
+        intermediate_size, num_experts);
+}
+
+void deepseek_moe_(Tensor out,
+                   const Tensor &hidden,
+                   const Tensor &topk_indices,
+                   const Tensor &topk_weights,
+                   const std::vector<Tensor> &gate_weights,
+                   const std::vector<Tensor> &up_weights,
+                   const std::vector<Tensor> &down_weights,
+                   size_t intermediate_size,
+                   size_t num_experts) {
+    DeepseekMoe::execute(out, hidden, topk_indices, topk_weights, gate_weights, up_weights, down_weights, intermediate_size, num_experts);
+}
+
+Tensor deepseek_moe(const Tensor &hidden,
+                    const Tensor &topk_indices,
+                    const Tensor &topk_weights,
+                    const std::vector<Tensor> &gate_weights,
+                    const std::vector<Tensor> &up_weights,
+                    const std::vector<Tensor> &down_weights,
+                    size_t intermediate_size,
+                    size_t num_experts) {
+    auto out = Tensor::empty(hidden->shape(), hidden->dtype(), hidden->device());
+    deepseek_moe_(out, hidden, topk_indices, topk_weights, gate_weights, up_weights, down_weights, intermediate_size, num_experts);
+    return out;
+}
+
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/deepseek_moe/deepseek_moe_infiniop.cc b/src/infinicore/ops/deepseek_moe/deepseek_moe_infiniop.cc
new file mode 100644
index 000000000..13ced0468
--- /dev/null
+++ b/src/infinicore/ops/deepseek_moe/deepseek_moe_infiniop.cc
@@ -0,0 +1,118 @@
+#include "infinicore/ops/deepseek_moe.hpp"
+
+#include "../infiniop_impl.hpp"
+
+namespace infinicore::op::deepseek_moe_impl::infiniop {
+
+INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, DeepseekMoe, 100);
+
+struct PlannedMeta {
+    std::shared_ptr<Descriptor> descriptor;
+    Tensor workspace_owner, out_owner, hidden_owner, topk_indices_owner, topk_weights_owner;
+    graph::GraphTensor workspace, out, hidden, topk_indices, topk_weights;
+    std::vector<graph::GraphTensor> gate_weights, up_weights, down_weights;
+    std::vector<const void *> gate_ptrs, up_ptrs, down_ptrs;
+    std::shared_ptr<Memory> gate_ptrs_device, up_ptrs_device, down_ptrs_device;
+};
+
+static std::vector<graph::GraphTensor> to_graph_tensors(const std::vector<Tensor> &tensors) {
+    std::vector<graph::GraphTensor> result;
+    result.reserve(tensors.size());
+    for (const auto &tensor : tensors) {
+        result.emplace_back(tensor);
+    }
+    return result;
+}
+
+static std::vector<const void *> data_ptrs(const std::vector<graph::GraphTensor> &tensors) {
+    std::vector<const void *> result;
+    result.reserve(tensors.size());
+    for (const auto &tensor : tensors) {
+        result.push_back(tensor->data());
+    }
+    return result;
+}
+
+void *plan(Tensor out,
+           const Tensor &hidden,
+           const Tensor &topk_indices,
+           const Tensor &topk_weights,
+           const std::vector<Tensor> &gate_weights,
+           const std::vector<Tensor> &up_weights,
+           const std::vector<Tensor> &down_weights,
+           size_t intermediate_size,
+           size_t num_experts) {
+    size_t seed = hash_combine(out, hidden, topk_indices, topk_weights);
+    hash_combine(seed, intermediate_size, num_experts);
+
+    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
+        Descriptor, descriptor, DeepseekMoe, seed,
+        out->desc(), hidden->desc(), topk_indices->desc(), topk_weights->desc(),
+        intermediate_size, num_experts);
+
+    INFINIOP_WORKSPACE_TENSOR(workspace, DeepseekMoe, descriptor);
+
+    auto planned = new PlannedMeta{
+        descriptor,
+        workspace,
+        out,
+        hidden,
+        topk_indices,
+        topk_weights,
+        graph::GraphTensor(workspace),
+        graph::GraphTensor(out),
+        graph::GraphTensor(hidden),
+        graph::GraphTensor(topk_indices),
+        graph::GraphTensor(topk_weights),
+        to_graph_tensors(gate_weights),
+        to_graph_tensors(up_weights),
+        to_graph_tensors(down_weights),
+        {},
+        {},
+        {},
+        nullptr,
+        nullptr,
+        nullptr};
+    planned->gate_ptrs = data_ptrs(planned->gate_weights);
+    planned->up_ptrs = data_ptrs(planned->up_weights);
+    planned->down_ptrs = data_ptrs(planned->down_weights);
+    const size_t ptr_bytes = num_experts * sizeof(void *);
+    planned->gate_ptrs_device = context::allocateMemory(ptr_bytes);
+    planned->up_ptrs_device = context::allocateMemory(ptr_bytes);
+    planned->down_ptrs_device = context::allocateMemory(ptr_bytes);
+    context::memcpyH2D(planned->gate_ptrs_device->data(), planned->gate_ptrs.data(), ptr_bytes, false);
+    context::memcpyH2D(planned->up_ptrs_device->data(), planned->up_ptrs.data(), ptr_bytes, false);
+    context::memcpyH2D(planned->down_ptrs_device->data(), planned->down_ptrs.data(), ptr_bytes, false);
+    return planned;
+}
+
+void run(void *planned_meta) {
+    auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
+
+    INFINICORE_CHECK_ERROR(infiniopDeepseekMoeWithDevicePtrs(
+        planned->descriptor->desc,
+        planned->workspace->data(),
+        planned->workspace->numel(),
+        planned->out->data(),
+        planned->hidden->data(),
+        planned->topk_indices->data(),
+        planned->topk_weights->data(),
+        planned->gate_ptrs_device->data(),
+        planned->up_ptrs_device->data(),
+        planned->down_ptrs_device->data(),
+        context::getStream()));
+}
+
+void cleanup(void **planned_meta_ptr) {
+    delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
+    *planned_meta_ptr = nullptr;
+}
+
+static bool registered = []() {
+    DeepseekMoe::plan_dispatcher().registerDevice(Device::Type::NVIDIA, &plan);
+    DeepseekMoe::run_dispatcher().registerDevice(Device::Type::NVIDIA, &run);
+    DeepseekMoe::cleanup_dispatcher().registerDevice(Device::Type::NVIDIA, &cleanup);
+    return true;
+}();
+
+} // namespace infinicore::op::deepseek_moe_impl::infiniop
diff --git a/src/infiniop/ops/deepseek_moe/deepseek_moe.h b/src/infiniop/ops/deepseek_moe/deepseek_moe.h
new file mode 100644
index 000000000..f3f246886
--- /dev/null
+++ b/src/infiniop/ops/deepseek_moe/deepseek_moe.h
@@ -0,0 +1,42 @@
+#ifndef DEEPSEEK_MOE_H
+#define DEEPSEEK_MOE_H
+
+#include "../../operator.h"
+#include "info.h"
+
+#define DESCRIPTOR(NAMESPACE)                                                                            \
+    namespace op::deepseek_moe::NAMESPACE {                                                              \
+    class Descriptor final : public InfiniopDescriptor {                                                 \
+        struct Opaque;                                                                                   \
+        Opaque *_opaque;                                                                                 \
+        DeepseekMoeInfo _info;                                                                           \
+        size_t _workspace_size;                                                                          \
+                                                                                                         \
+        Descriptor(Opaque *opaque, DeepseekMoeInfo info, size_t workspace_size,                          \
+                   infiniDevice_t device_type, int device_id)                                            \
+            : InfiniopDescriptor{device_type, device_id}, _opaque(opaque),                               \
+              _info(info), _workspace_size(workspace_size) {}                                            \
+                                                                                                         \
+    public:                                                                                              \
+        ~Descriptor();                                                                                   \
+        size_t workspaceSize() const { return _workspace_size; }                                         \
+        static infiniStatus_t create(infiniopHandle_t handle, Descriptor **desc_ptr,                     \
+                                     infiniopTensorDescriptor_t out_desc,                                \
+                                     infiniopTensorDescriptor_t hidden_desc,                             \
+                                     infiniopTensorDescriptor_t topk_indices_desc,                       \
+                                     infiniopTensorDescriptor_t topk_weights_desc,                       \
+                                     size_t intermediate_size, size_t num_experts);                      \
+        infiniStatus_t calculate(void *workspace, size_t workspace_size, void *out,                      \
+                                 const void *hidden, const void *topk_indices,                           \
+                                 const void *topk_weights, const void *const *gate_weights,              \
+                                 const void *const *up_weights, const void *const *down_weights,         \
+                                 void *stream) const;                                                    \
+        infiniStatus_t calculateWithDevicePtrs(void *workspace, size_t workspace_size,                   \
+                                               void *out, const void *hidden, const void *topk_indices,  \
+                                               const void *topk_weights, const void *gate_weight_ptrs,   \
+                                               const void *up_weight_ptrs, const void *down_weight_ptrs, \
+                                               void *stream) const;                                      \
+    };                                                                                                   \
+    }
+
+#endif
diff --git a/src/infiniop/ops/deepseek_moe/info.h b/src/infiniop/ops/deepseek_moe/info.h
new file mode 100644
index 000000000..be115d3ab
--- /dev/null
+++ b/src/infiniop/ops/deepseek_moe/info.h
@@ -0,0 +1,65 @@
+#ifndef __DEEPSEEK_MOE_INFO_H__
+#define __DEEPSEEK_MOE_INFO_H__
+
+#include "../../../utils.h"
+#include "../../tensor.h"
+#include <vector>
+
+namespace op::deepseek_moe {
+
+class DeepseekMoeInfo {
+    DeepseekMoeInfo() = default;
+
+public:
+    infiniDtype_t dtype;
+    size_t ntokens;
+    size_t hidden_size;
+    size_t topk;
+    size_t intermediate_size;
+    size_t num_experts;
+
+    static utils::Result<DeepseekMoeInfo> create(
+        infiniopTensorDescriptor_t out_desc,
+        infiniopTensorDescriptor_t hidden_desc,
+        infiniopTensorDescriptor_t topk_indices_desc,
+        infiniopTensorDescriptor_t topk_weights_desc,
+        size_t intermediate_size,
+        size_t num_experts) {
+
+        auto dtype = hidden_desc->dtype();
+        if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_BF16) {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+        if (out_desc->dtype() != dtype || topk_indices_desc->dtype() != INFINI_DTYPE_I32 || topk_weights_desc->dtype() != INFINI_DTYPE_F32) {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+        if (out_desc->ndim() != 2 || hidden_desc->ndim() != 2 || topk_indices_desc->ndim() != 2 || topk_weights_desc->ndim() != 2) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+        auto hidden_shape = hidden_desc->shape();
+        auto out_shape = out_desc->shape();
+        auto indices_shape = topk_indices_desc->shape();
+        auto weights_shape = topk_weights_desc->shape();
+        if (out_shape != hidden_shape || indices_shape != weights_shape || indices_shape[0] != hidden_shape[0]) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+        if (intermediate_size == 0 || num_experts == 0 || indices_shape[1] == 0 || indices_shape[1] > num_experts) {
+            return INFINI_STATUS_BAD_PARAM;
+        }
+        if (hidden_desc->strides()[1] != 1 || out_desc->strides()[1] != 1 || topk_indices_desc->strides()[1] != 1 || topk_weights_desc->strides()[1] != 1) {
+            return INFINI_STATUS_BAD_TENSOR_STRIDES;
+        }
+
+        return utils::Result<DeepseekMoeInfo>(DeepseekMoeInfo{
+            dtype,
+            hidden_shape[0],
+            hidden_shape[1],
+            indices_shape[1],
+            intermediate_size,
+            num_experts});
+    }
+};
+
+} // namespace op::deepseek_moe
+
+#endif
diff --git a/src/infiniop/ops/deepseek_moe/nvidia/deepseek_moe_nvidia.cu b/src/infiniop/ops/deepseek_moe/nvidia/deepseek_moe_nvidia.cu
new file mode 100644
index 000000000..9c63f97d2
--- /dev/null
+++ b/src/infiniop/ops/deepseek_moe/nvidia/deepseek_moe_nvidia.cu
@@ -0,0 +1,312 @@
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+#include "../../../devices/nvidia/nvidia_common.cuh"
+#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
+#include "deepseek_moe_nvidia.cuh"
+
+namespace op::deepseek_moe::nvidia {
+
+struct Descriptor::Opaque {
+    std::shared_ptr<device::nvidia::Handle::Internal> internal;
+};
+
+Descriptor::~Descriptor() {
+    delete _opaque;
+}
+
+namespace {
+
+constexpr size_t align_up(size_t value, size_t alignment) {
+    return (value + alignment - 1) / alignment * alignment;
+}
+
+template <typename T>
+__device__ float to_float(T value) {
+    return static_cast<float>(value);
+}
+
+template <>
+__device__ float to_float<half>(half value) {
+    return __half2float(value);
+}
+
+template <>
+__device__ float to_float<__nv_bfloat16>(__nv_bfloat16 value) {
+    return __bfloat162float(value);
+}
+
+template <typename T>
+__device__ T from_float(float value) {
+    return static_cast<T>(value);
+}
+
+template <>
+__device__ half from_float<half>(float value) {
+    return __float2half_rn(value);
+}
+
+template <>
+__device__ __nv_bfloat16 from_float<__nv_bfloat16>(float value) {
+    return __float2bfloat16_rn(value);
+}
+
+template <typename T>
+__global__ void gate_up_kernel(
+    T *intermediate,
+    const T *hidden,
+    const int *topk_indices,
+    const float *topk_weights,
+    const void *const *gate_weights,
+    const void *const *up_weights,
+    size_t ntokens,
+    size_t hidden_size,
+    size_t topk,
+    size_t intermediate_size,
+    size_t num_experts) {
+
+    const size_t route = blockIdx.x / intermediate_size;
+    const size_t j = blockIdx.x - route * intermediate_size;
+    if (route >= ntokens * topk) {
+        return;
+    }
+    const int expert = topk_indices[route];
+    if (expert < 0 || static_cast<size_t>(expert) >= num_experts) {
+        return;
+    }
+    const size_t token = route / topk;
+    const T *x = hidden + token * hidden_size;
+    const T *gate = reinterpret_cast<const T *>(gate_weights[expert]) + j * hidden_size;
+    const T *up = reinterpret_cast<const T *>(up_weights[expert]) + j * hidden_size;
+
+    float gate_sum = 0.0f;
+    float up_sum = 0.0f;
+    for (size_t h = threadIdx.x; h < hidden_size; h += blockDim.x) {
+        const float xv = to_float<T>(x[h]);
+        gate_sum += xv * to_float<T>(gate[h]);
+        up_sum += xv * to_float<T>(up[h]);
+    }
+
+    __shared__ float gate_shared[256];
+    __shared__ float up_shared[256];
+    gate_shared[threadIdx.x] = gate_sum;
+    up_shared[threadIdx.x] = up_sum;
+    __syncthreads();
+
+    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
+        if (threadIdx.x < stride) {
+            gate_shared[threadIdx.x] += gate_shared[threadIdx.x + stride];
+            up_shared[threadIdx.x] += up_shared[threadIdx.x + stride];
+        }
+        __syncthreads();
+    }
+
+    if (threadIdx.x == 0) {
+        const float g = gate_shared[0];
+        const float silu = g / (1.0f + __expf(-g));
+        intermediate[route * intermediate_size + j] = from_float<T>(silu * up_shared[0] * topk_weights[route]);
+    }
+}
+
+template <typename T>
+__global__ void down_kernel(
+    T *out,
+    const T *intermediate,
+    const int *topk_indices,
+    const void *const *down_weights,
+    size_t ntokens,
+    size_t hidden_size,
+    size_t topk,
+    size_t intermediate_size,
+    size_t num_experts) {
+
+    const size_t linear = blockIdx.x;
+    const size_t token = linear / hidden_size;
+    const size_t h = linear - token * hidden_size;
+    if (token >= ntokens) {
+        return;
+    }
+
+    float acc = 0.0f;
+    const size_t route_base = token * topk;
+    const size_t count = topk * intermediate_size;
+    for (size_t idx = threadIdx.x; idx < count; idx += blockDim.x) {
+        const size_t k = idx / intermediate_size;
+        const size_t j = idx - k * intermediate_size;
+        const size_t route = route_base + k;
+        const int expert = topk_indices[route];
+        if (expert >= 0 && static_cast<size_t>(expert) < num_experts) {
+            const T *down = reinterpret_cast<const T *>(down_weights[expert]) + h * intermediate_size;
+            acc += to_float<T>(intermediate[route * intermediate_size + j]) * to_float<T>(down[j]);
+        }
+    }
+
+    __shared__ float shared[256];
+    shared[threadIdx.x] = acc;
+    __syncthreads();
+    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
+        if (threadIdx.x < stride) {
+            shared[threadIdx.x] += shared[threadIdx.x + stride];
+        }
+        __syncthreads();
+    }
+    if (threadIdx.x == 0) {
+        out[token * hidden_size + h] = from_float<T>(shared[0]);
+    }
+}
+
+template <typename T>
+infiniStatus_t launch_typed(
+    void *workspace,
+    size_t workspace_size,
+    const DeepseekMoeInfo &info,
+    void *out,
+    const void *hidden,
+    const void *topk_indices,
+    const void *topk_weights,
+    const void *const *gate_weights,
+    const void *const *up_weights,
+    const void *const *down_weights,
+    cudaStream_t stream,
+    bool weight_ptrs_on_device) {
+
+    const size_t ptr_bytes = align_up(info.num_experts * sizeof(void *), 256);
+    const size_t ptr_workspace = ptr_bytes * 3;
+    const size_t intermediate_offset = align_up(ptr_workspace, 256);
+    const size_t intermediate_bytes = info.ntokens * info.topk * info.intermediate_size * sizeof(T);
+    if (workspace_size < intermediate_offset + intermediate_bytes) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    auto *base = reinterpret_cast<char *>(workspace);
+    const void *const *gate_ptrs = reinterpret_cast<const void *const *>(base);
+    const void *const *up_ptrs = reinterpret_cast<const void *const *>(base + ptr_bytes);
+    const void *const *down_ptrs = reinterpret_cast<const void *const *>(base + ptr_bytes * 2);
+    auto *intermediate = reinterpret_cast<T *>(base + intermediate_offset);
+
+    if (weight_ptrs_on_device) {
+        gate_ptrs = gate_weights;
+        up_ptrs = up_weights;
+        down_ptrs = down_weights;
+    } else {
+        auto **gate_workspace = reinterpret_cast<const void **>(base);
+        auto **up_workspace = reinterpret_cast<const void **>(base + ptr_bytes);
+        auto **down_workspace = reinterpret_cast<const void **>(base + ptr_bytes * 2);
+        CHECK_CUDA(cudaMemcpyAsync(gate_workspace, gate_weights, info.num_experts * sizeof(void *), cudaMemcpyHostToDevice, stream));
+        CHECK_CUDA(cudaMemcpyAsync(up_workspace, up_weights, info.num_experts * sizeof(void *), cudaMemcpyHostToDevice, stream));
+        CHECK_CUDA(cudaMemcpyAsync(down_workspace, down_weights, info.num_experts * sizeof(void *), cudaMemcpyHostToDevice, stream));
+        gate_ptrs = gate_workspace;
+        up_ptrs = up_workspace;
+        down_ptrs = down_workspace;
+    }
+
+    constexpr int threads = 256;
+    const dim3 gate_blocks(static_cast<unsigned int>(info.ntokens * info.topk * info.intermediate_size));
+    gate_up_kernel<T><<<gate_blocks, threads, 0, stream>>>(
+        intermediate,
+        reinterpret_cast<const T *>(hidden),
+        reinterpret_cast<const int *>(topk_indices),
+        reinterpret_cast<const float *>(topk_weights),
+        gate_ptrs,
+        up_ptrs,
+        info.ntokens,
+        info.hidden_size,
+        info.topk,
+        info.intermediate_size,
+        info.num_experts);
+
+    const dim3 down_blocks(static_cast<unsigned int>(info.ntokens * info.hidden_size));
+    down_kernel<T><<<down_blocks, threads, 0, stream>>>(
+        reinterpret_cast<T *>(out),
+        intermediate,
+        reinterpret_cast<const int *>(topk_indices),
+        down_ptrs,
+        info.ntokens,
+        info.hidden_size,
+        info.topk,
+        info.intermediate_size,
+        info.num_experts);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t hidden_desc,
+    infiniopTensorDescriptor_t topk_indices_desc,
+    infiniopTensorDescriptor_t topk_weights_desc,
+    size_t intermediate_size,
+    size_t num_experts) {
+
+    auto result = DeepseekMoeInfo::create(out_desc, hidden_desc, topk_indices_desc, topk_weights_desc, intermediate_size, num_experts);
+    CHECK_RESULT(result);
+    auto info = result.take();
+
+    const size_t dtype_size = info.dtype == INFINI_DTYPE_F16 ? sizeof(half) : sizeof(__nv_bfloat16);
+    const size_t ptr_bytes = align_up(info.num_experts * sizeof(void *), 256);
+    const size_t intermediate_offset = align_up(ptr_bytes * 3, 256);
+    const size_t intermediate_bytes = info.ntokens * info.topk * info.intermediate_size * dtype_size;
+    const size_t workspace_size = intermediate_offset + intermediate_bytes;
+
+    *desc_ptr = new Descriptor(
+        new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
+        info,
+        workspace_size,
+        handle->device,
+        handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *out,
+    const void *hidden,
+    const void *topk_indices,
+    const void *topk_weights,
+    const void *const *gate_weights,
+    const void *const *up_weights,
+    const void *const *down_weights,
+    void *stream_) const {
+
+    auto stream = reinterpret_cast<cudaStream_t>(stream_);
+    if (_info.dtype == INFINI_DTYPE_F16) {
+        return launch_typed<half>(workspace, workspace_size, _info, out, hidden, topk_indices, topk_weights, gate_weights, up_weights, down_weights, stream, false);
+    }
+    if (_info.dtype == INFINI_DTYPE_BF16) {
+        return launch_typed<__nv_bfloat16>(workspace, workspace_size, _info, out, hidden, topk_indices, topk_weights, gate_weights, up_weights, down_weights, stream, false);
+    }
+    return INFINI_STATUS_BAD_TENSOR_DTYPE;
+}
+
+infiniStatus_t Descriptor::calculateWithDevicePtrs(
+    void *workspace,
+    size_t workspace_size,
+    void *out,
+    const void *hidden,
+    const void *topk_indices,
+    const void *topk_weights,
+    const void *gate_weight_ptrs,
+    const void *up_weight_ptrs,
+    const void *down_weight_ptrs,
+    void *stream_) const {
+
+    auto stream = reinterpret_cast<cudaStream_t>(stream_);
+    auto gate_weights = reinterpret_cast<const void *const *>(gate_weight_ptrs);
+    auto up_weights = reinterpret_cast<const void *const *>(up_weight_ptrs);
+    auto down_weights = reinterpret_cast<const void *const *>(down_weight_ptrs);
+    if (_info.dtype == INFINI_DTYPE_F16) {
+        return launch_typed<half>(workspace, workspace_size, _info, out, hidden, topk_indices, topk_weights, gate_weights, up_weights, down_weights, stream, true);
+    }
+    if (_info.dtype == INFINI_DTYPE_BF16) {
+        return launch_typed<__nv_bfloat16>(workspace, workspace_size, _info, out, hidden, topk_indices, topk_weights, gate_weights, up_weights, down_weights, stream, true);
+    }
+    return INFINI_STATUS_BAD_TENSOR_DTYPE;
+}
+
+} // namespace op::deepseek_moe::nvidia
diff --git a/src/infiniop/ops/deepseek_moe/nvidia/deepseek_moe_nvidia.cuh b/src/infiniop/ops/deepseek_moe/nvidia/deepseek_moe_nvidia.cuh
new file mode 100644
index 000000000..f8c8c4de7
--- /dev/null
+++ b/src/infiniop/ops/deepseek_moe/nvidia/deepseek_moe_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef DEEPSEEK_MOE_NVIDIA_CUH
+#define DEEPSEEK_MOE_NVIDIA_CUH
+
+#include "../deepseek_moe.h"
+
+DESCRIPTOR(nvidia)
+
+#endif
diff --git a/src/infiniop/ops/deepseek_moe/operator.cc b/src/infiniop/ops/deepseek_moe/operator.cc
new file mode 100644
index 000000000..759896859
--- /dev/null
+++ b/src/infiniop/ops/deepseek_moe/operator.cc
@@ -0,0 +1,129 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/deepseek_moe.h"
+
+#ifdef ENABLE_NVIDIA_API
+#include "nvidia/deepseek_moe_nvidia.cuh"
+#endif
+
+__INFINI_C infiniStatus_t infiniopCreateDeepseekMoeDescriptor(
+    infiniopHandle_t handle,
+    infiniopDeepseekMoeDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t hidden_desc,
+    infiniopTensorDescriptor_t topk_indices_desc,
+    infiniopTensorDescriptor_t topk_weights_desc,
+    size_t intermediate_size,
+    size_t num_experts) {
+
+#define CREATE(CASE, NAMESPACE)                                                             \
+    case CASE:                                                                              \
+        return op::deepseek_moe::NAMESPACE::Descriptor::create(                             \
+            handle, reinterpret_cast<op::deepseek_moe::NAMESPACE::Descriptor **>(desc_ptr), \
+            out_desc, hidden_desc, topk_indices_desc, topk_weights_desc,                    \
+            intermediate_size, num_experts)
+
+    switch (handle->device) {
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef CREATE
+}
+
+__INFINI_C infiniStatus_t infiniopGetDeepseekMoeWorkspaceSize(
+    infiniopDeepseekMoeDescriptor_t desc,
+    size_t *size) {
+
+#define GET(CASE, NAMESPACE)                                                                        \
+    case CASE:                                                                                      \
+        *size = reinterpret_cast<op::deepseek_moe::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef GET
+}
+
+__INFINI_C infiniStatus_t infiniopDeepseekMoe(
+    infiniopDeepseekMoeDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *out,
+    const void *hidden,
+    const void *topk_indices,
+    const void *topk_weights,
+    const void *const *gate_weights,
+    const void *const *up_weights,
+    const void *const *down_weights,
+    void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE)                                                           \
+    case CASE:                                                                               \
+        return reinterpret_cast<op::deepseek_moe::NAMESPACE::Descriptor *>(desc)->calculate( \
+            workspace, workspace_size, out, hidden, topk_indices, topk_weights,              \
+            gate_weights, up_weights, down_weights, stream)
+
+    switch (desc->device_type) {
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef CALCULATE
+}
+
+__INFINI_C infiniStatus_t infiniopDeepseekMoeWithDevicePtrs(
+    infiniopDeepseekMoeDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *out,
+    const void *hidden,
+    const void *topk_indices,
+    const void *topk_weights,
+    const void *gate_weight_ptrs,
+    const void *up_weight_ptrs,
+    const void *down_weight_ptrs,
+    void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE)                                                                         \
+    case CASE:                                                                                             \
+        return reinterpret_cast<op::deepseek_moe::NAMESPACE::Descriptor *>(desc)->calculateWithDevicePtrs( \
+            workspace, workspace_size, out, hidden, topk_indices, topk_weights,                            \
+            gate_weight_ptrs, up_weight_ptrs, down_weight_ptrs, stream)
+
+    switch (desc->device_type) {
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef CALCULATE
+}
+
+__INFINI_C infiniStatus_t infiniopDestroyDeepseekMoeDescriptor(
+    infiniopDeepseekMoeDescriptor_t desc) {
+
+#define DESTROY(CASE, NAMESPACE)                                                  \
+    case CASE:                                                                    \
+        delete reinterpret_cast<op::deepseek_moe::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+#ifdef ENABLE_NVIDIA_API
+        DESTROY(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef DESTROY
+}
diff --git a/test/infiniop/deepseek_moe.py b/test/infiniop/deepseek_moe.py
new file mode 100644
index 000000000..f8830e878
--- /dev/null
+++ b/test/infiniop/deepseek_moe.py
@@ -0,0 +1,267 @@
+import ctypes
+from ctypes import c_uint64, c_void_p
+
+import torch
+import torch.nn.functional as F
+from libinfiniop import (
+    LIBINFINIOP,
+    InfiniDeviceNames,
+    InfiniDtype,
+    InfiniDtypeNames,
+    TestTensor,
+    TestWorkspace,
+    check_error,
+    debug,
+    get_args,
+    get_test_devices,
+    get_tolerance,
+    infiniopOperatorDescriptor_t,
+    profile_operation,
+    test_operator,
+    torch_device_map,
+)
+
+# ==============================================================================
+#  Configuration (Internal Use Only)
+# ==============================================================================
+
+# ntokens, hidden_size, topk, intermediate_size, num_experts, use_device_ptrs
+_TEST_CASES_ = [
+    (1, 8, 1, 4, 2, False),
+    (2, 16, 2, 8, 4, False),
+    (3, 32, 2, 16, 4, True),
+]
+
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16]
+
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-2, "rtol": 1e-2},
+    InfiniDtype.BF16: {"atol": 5e-2, "rtol": 5e-2},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 100
+
+
+def _make_tensor(shape, dtype, device, start, scale):
+    values = torch.arange(
+        start, start + torch.tensor(shape).prod().item(), dtype=torch.float32
+    )
+    values = ((values % 17) - 8) * scale
+    return TestTensor.from_torch(values.reshape(shape), dtype, device)
+
+
+def _reference_deepseek_moe(
+    hidden, topk_indices, topk_weights, gate_weights, up_weights, down_weights
+):
+    ntokens, hidden_size = hidden.shape
+    topk = topk_indices.shape[1]
+    out = torch.empty_like(hidden)
+
+    for token in range(ntokens):
+        token_out = torch.zeros(hidden_size, dtype=torch.float32, device=hidden.device)
+        x = hidden[token].float()
+        for k in range(topk):
+            expert = int(topk_indices[token, k].item())
+            gate = F.linear(x, gate_weights[expert].float())
+            up = F.linear(x, up_weights[expert].float())
+            intermediate = (F.silu(gate) * up * topk_weights[token, k]).to(hidden.dtype)
+            token_out += F.linear(intermediate.float(), down_weights[expert].float())
+        out[token] = token_out.to(hidden.dtype)
+
+    return out
+
+
+def _ptr_array(tensors):
+    return (c_void_p * len(tensors))(*[tensor.data() for tensor in tensors])
+
+
+def _device_ptr_tensor(tensors, device):
+    return torch.tensor(
+        [tensor.data() for tensor in tensors],
+        dtype=torch.uint64,
+        device=torch_device_map[device],
+    )
+
+
+def test(
+    handle,
+    device,
+    ntokens,
+    hidden_size,
+    topk,
+    intermediate_size,
+    num_experts,
+    use_device_ptrs,
+    dtype=InfiniDtype.F16,
+    sync=None,
+):
+    print(
+        f"Testing DeepseekMoe on {InfiniDeviceNames[device]} with "
+        f"ntokens:{ntokens} hidden_size:{hidden_size} topk:{topk} "
+        f"intermediate_size:{intermediate_size} num_experts:{num_experts} "
+        f"use_device_ptrs:{use_device_ptrs} dtype:{InfiniDtypeNames[dtype]}"
+    )
+
+    hidden = _make_tensor((ntokens, hidden_size), dtype, device, 0, 0.02)
+    topk_indices_data = (
+        torch.arange(ntokens * topk, dtype=torch.int32).reshape(ntokens, topk)
+        % num_experts
+    )
+    topk_indices = TestTensor.from_torch(topk_indices_data, InfiniDtype.I32, device)
+
+    topk_weights_data = torch.arange(
+        1, ntokens * topk + 1, dtype=torch.float32
+    ).reshape(ntokens, topk)
+    topk_weights_data = topk_weights_data / topk_weights_data.sum(dim=-1, keepdim=True)
+    topk_weights = TestTensor.from_torch(topk_weights_data, InfiniDtype.F32, device)
+
+    gate_weights = [
+        _make_tensor(
+            (intermediate_size, hidden_size), dtype, device, 100 + i * 97, 0.01
+        )
+        for i in range(num_experts)
+    ]
+    up_weights = [
+        _make_tensor(
+            (intermediate_size, hidden_size), dtype, device, 200 + i * 97, 0.01
+        )
+        for i in range(num_experts)
+    ]
+    down_weights = [
+        _make_tensor(
+            (hidden_size, intermediate_size), dtype, device, 300 + i * 97, 0.01
+        )
+        for i in range(num_experts)
+    ]
+    out = TestTensor((ntokens, hidden_size), None, dtype, device, mode="zeros")
+
+    ans = _reference_deepseek_moe(
+        hidden.torch_tensor(),
+        topk_indices.torch_tensor(),
+        topk_weights.torch_tensor(),
+        [weight.torch_tensor() for weight in gate_weights],
+        [weight.torch_tensor() for weight in up_weights],
+        [weight.torch_tensor() for weight in down_weights],
+    )
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateDeepseekMoeDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            out.descriptor,
+            hidden.descriptor,
+            topk_indices.descriptor,
+            topk_weights.descriptor,
+            intermediate_size,
+            num_experts,
+        )
+    )
+
+    for tensor in [
+        out,
+        hidden,
+        topk_indices,
+        topk_weights,
+        *gate_weights,
+        *up_weights,
+        *down_weights,
+    ]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetDeepseekMoeWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, out.device)
+
+    gate_ptrs = _ptr_array(gate_weights)
+    up_ptrs = _ptr_array(up_weights)
+    down_ptrs = _ptr_array(down_weights)
+    gate_device_ptrs = _device_ptr_tensor(gate_weights, device)
+    up_device_ptrs = _device_ptr_tensor(up_weights, device)
+    down_device_ptrs = _device_ptr_tensor(down_weights, device)
+
+    def lib_deepseek_moe():
+        if use_device_ptrs:
+            check_error(
+                LIBINFINIOP.infiniopDeepseekMoeWithDevicePtrs(
+                    descriptor,
+                    workspace.data(),
+                    workspace_size.value,
+                    out.data(),
+                    hidden.data(),
+                    topk_indices.data(),
+                    topk_weights.data(),
+                    gate_device_ptrs.data_ptr(),
+                    up_device_ptrs.data_ptr(),
+                    down_device_ptrs.data_ptr(),
+                    None,
+                )
+            )
+        else:
+            check_error(
+                LIBINFINIOP.infiniopDeepseekMoe(
+                    descriptor,
+                    workspace.data(),
+                    workspace_size.value,
+                    out.data(),
+                    hidden.data(),
+                    topk_indices.data(),
+                    topk_weights.data(),
+                    gate_ptrs,
+                    up_ptrs,
+                    down_ptrs,
+                    None,
+                )
+            )
+
+    lib_deepseek_moe()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(out.actual_tensor(), ans, atol=atol, rtol=rtol)
+    assert torch.allclose(out.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    if PROFILE:
+        profile_operation(
+            "PyTorch",
+            lambda: _reference_deepseek_moe(
+                hidden.torch_tensor(),
+                topk_indices.torch_tensor(),
+                topk_weights.torch_tensor(),
+                [weight.torch_tensor() for weight in gate_weights],
+                [weight.torch_tensor() for weight in up_weights],
+                [weight.torch_tensor() for weight in down_weights],
+            ),
+            device,
+            NUM_PRERUN,
+            NUM_ITERATIONS,
+        )
+        profile_operation(
+            "    lib", lambda: lib_deepseek_moe(), device, NUM_PRERUN, NUM_ITERATIONS
+        )
+
+    check_error(LIBINFINIOP.infiniopDestroyDeepseekMoeDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES_, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py
index 3ec20a72e..a59bbdf3f 100644
--- a/test/infiniop/libinfiniop/op_register.py
+++ b/test/infiniop/libinfiniop/op_register.py
@@ -2292,6 +2292,62 @@ def fused_ffn_(lib):
     ]
 
 
+@OpRegister.operator
+def deepseek_moe_(lib):
+    lib.infiniopCreateDeepseekMoeDescriptor.restype = c_int32
+    lib.infiniopCreateDeepseekMoeDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        c_size_t,
+        c_size_t,
+    ]
+
+    lib.infiniopGetDeepseekMoeWorkspaceSize.restype = c_int32
+    lib.infiniopGetDeepseekMoeWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+
+    lib.infiniopDeepseekMoe.restype = c_int32
+    lib.infiniopDeepseekMoe.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        POINTER(c_void_p),
+        POINTER(c_void_p),
+        POINTER(c_void_p),
+        c_void_p,
+    ]
+
+    lib.infiniopDeepseekMoeWithDevicePtrs.restype = c_int32
+    lib.infiniopDeepseekMoeWithDevicePtrs.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+
+    lib.infiniopDestroyDeepseekMoeDescriptor.restype = c_int32
+    lib.infiniopDestroyDeepseekMoeDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
+
+
 @OpRegister.operator
 def blas_amax_(lib):
     lib.infiniopCreateBlasAmaxDescriptor.restype = c_int32