diff --git a/include/infinicore/ops.hpp b/include/infinicore/ops.hpp index 747735bcd..061bfbfd1 100644 --- a/include/infinicore/ops.hpp +++ b/include/infinicore/ops.hpp @@ -22,6 +22,7 @@ #include "ops/cdist.hpp" #include "ops/conv2d.hpp" #include "ops/cross_entropy.hpp" +#include "ops/deepseek_moe.hpp" #include "ops/embedding.hpp" #include "ops/flash_attention.hpp" #include "ops/fmin.hpp" diff --git a/include/infinicore/ops/deepseek_moe.hpp b/include/infinicore/ops/deepseek_moe.hpp new file mode 100644 index 000000000..e8e7ffef9 --- /dev/null +++ b/include/infinicore/ops/deepseek_moe.hpp @@ -0,0 +1,41 @@ +#pragma once + +#include "../device.hpp" +#include "../graph/graph.hpp" +#include "common/op.hpp" +#include + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_CLASS( + DeepseekMoe, + Tensor, + const Tensor &, + const Tensor &, + const Tensor &, + const std::vector &, + const std::vector &, + const std::vector &, + size_t, + size_t); + +Tensor deepseek_moe(const Tensor &hidden, + const Tensor &topk_indices, + const Tensor &topk_weights, + const std::vector &gate_weights, + const std::vector &up_weights, + const std::vector &down_weights, + size_t intermediate_size, + size_t num_experts); + +void deepseek_moe_(Tensor out, + const Tensor &hidden, + const Tensor &topk_indices, + const Tensor &topk_weights, + const std::vector &gate_weights, + const std::vector &up_weights, + const std::vector &down_weights, + size_t intermediate_size, + size_t num_experts); + +} // namespace infinicore::op diff --git a/include/infiniop.h b/include/infiniop.h index dbd00d037..5f58805b9 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -34,6 +34,7 @@ #include "infiniop/ops/clip.h" #include "infiniop/ops/conv.h" #include "infiniop/ops/cross_entropy.h" +#include "infiniop/ops/deepseek_moe.h" #include "infiniop/ops/dequant/per_tensor_dequant_int8.h" #include "infiniop/ops/dequantize_awq.h" #include "infiniop/ops/dequantize_gptq.h" diff --git a/include/infiniop/ops/deepseek_moe.h b/include/infiniop/ops/deepseek_moe.h new file mode 100644 index 000000000..ae55ae977 --- /dev/null +++ b/include/infiniop/ops/deepseek_moe.h @@ -0,0 +1,57 @@ +#ifndef __INFINIOP_DEEPSEEK_MOE_API_H__ +#define __INFINIOP_DEEPSEEK_MOE_API_H__ + +#include "../operator_descriptor.h" + +#ifdef __cplusplus +#include +#else +#include +#endif + +typedef struct InfiniopDescriptor *infiniopDeepseekMoeDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateDeepseekMoeDescriptor( + infiniopHandle_t handle, + infiniopDeepseekMoeDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t hidden_desc, + infiniopTensorDescriptor_t topk_indices_desc, + infiniopTensorDescriptor_t topk_weights_desc, + size_t intermediate_size, + size_t num_experts); + +__INFINI_C __export infiniStatus_t infiniopGetDeepseekMoeWorkspaceSize( + infiniopDeepseekMoeDescriptor_t desc, + size_t *size); + +__INFINI_C __export infiniStatus_t infiniopDeepseekMoe( + infiniopDeepseekMoeDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *out, + const void *hidden, + const void *topk_indices, + const void *topk_weights, + const void *const *gate_weights, + const void *const *up_weights, + const void *const *down_weights, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDeepseekMoeWithDevicePtrs( + infiniopDeepseekMoeDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *out, + const void *hidden, + const void *topk_indices, + const void *topk_weights, + const void *gate_weight_ptrs, + const void *up_weight_ptrs, + const void *down_weight_ptrs, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyDeepseekMoeDescriptor( + infiniopDeepseekMoeDescriptor_t desc); + +#endif diff --git a/src/infinicore/ops/deepseek_moe/deepseek_moe.cc b/src/infinicore/ops/deepseek_moe/deepseek_moe.cc new file mode 100644 index 000000000..82e1c56dd --- /dev/null +++ b/src/infinicore/ops/deepseek_moe/deepseek_moe.cc @@ -0,0 +1,83 @@ +#include "infinicore/ops/deepseek_moe.hpp" +#include "../../utils.hpp" +#include + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(DeepseekMoe); + +namespace { + +void check_weights(const std::vector &gate_weights, + const std::vector &up_weights, + const std::vector &down_weights, + size_t num_experts) { + if (gate_weights.size() != num_experts || up_weights.size() != num_experts || down_weights.size() != num_experts) { + throw std::runtime_error("DeepseekMoe: expert weight vector size mismatch"); + } +} + +} // namespace + +DeepseekMoe::DeepseekMoe(Tensor out, + const Tensor &hidden, + const Tensor &topk_indices, + const Tensor &topk_weights, + const std::vector &gate_weights, + const std::vector &up_weights, + const std::vector &down_weights, + size_t intermediate_size, + size_t num_experts) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, hidden, topk_indices, topk_weights); + check_weights(gate_weights, up_weights, down_weights, num_experts); + for (size_t i = 0; i < num_experts; ++i) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, gate_weights[i], up_weights[i], down_weights[i]); + } + INFINICORE_GRAPH_OP_DISPATCH(out->device().getType(), + out, hidden, topk_indices, topk_weights, + gate_weights, up_weights, down_weights, + intermediate_size, num_experts); +} + +void DeepseekMoe::execute(Tensor out, + const Tensor &hidden, + const Tensor &topk_indices, + const Tensor &topk_weights, + const std::vector &gate_weights, + const std::vector &up_weights, + const std::vector &down_weights, + size_t intermediate_size, + size_t num_experts) { + INFINICORE_GRAPH_OP_RECORD_OR_RUN( + DeepseekMoe, + out, hidden, topk_indices, topk_weights, + gate_weights, up_weights, down_weights, + intermediate_size, num_experts); +} + +void deepseek_moe_(Tensor out, + const Tensor &hidden, + const Tensor &topk_indices, + const Tensor &topk_weights, + const std::vector &gate_weights, + const std::vector &up_weights, + const std::vector &down_weights, + size_t intermediate_size, + size_t num_experts) { + DeepseekMoe::execute(out, hidden, topk_indices, topk_weights, gate_weights, up_weights, down_weights, intermediate_size, num_experts); +} + +Tensor deepseek_moe(const Tensor &hidden, + const Tensor &topk_indices, + const Tensor &topk_weights, + const std::vector &gate_weights, + const std::vector &up_weights, + const std::vector &down_weights, + size_t intermediate_size, + size_t num_experts) { + auto out = Tensor::empty(hidden->shape(), hidden->dtype(), hidden->device()); + deepseek_moe_(out, hidden, topk_indices, topk_weights, gate_weights, up_weights, down_weights, intermediate_size, num_experts); + return out; +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/deepseek_moe/deepseek_moe_infiniop.cc b/src/infinicore/ops/deepseek_moe/deepseek_moe_infiniop.cc new file mode 100644 index 000000000..13ced0468 --- /dev/null +++ b/src/infinicore/ops/deepseek_moe/deepseek_moe_infiniop.cc @@ -0,0 +1,118 @@ +#include "infinicore/ops/deepseek_moe.hpp" + +#include "../infiniop_impl.hpp" + +namespace infinicore::op::deepseek_moe_impl::infiniop { + +INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, DeepseekMoe, 100); + +struct PlannedMeta { + std::shared_ptr descriptor; + Tensor workspace_owner, out_owner, hidden_owner, topk_indices_owner, topk_weights_owner; + graph::GraphTensor workspace, out, hidden, topk_indices, topk_weights; + std::vector gate_weights, up_weights, down_weights; + std::vector gate_ptrs, up_ptrs, down_ptrs; + std::shared_ptr gate_ptrs_device, up_ptrs_device, down_ptrs_device; +}; + +static std::vector to_graph_tensors(const std::vector &tensors) { + std::vector result; + result.reserve(tensors.size()); + for (const auto &tensor : tensors) { + result.emplace_back(tensor); + } + return result; +} + +static std::vector data_ptrs(const std::vector &tensors) { + std::vector result; + result.reserve(tensors.size()); + for (const auto &tensor : tensors) { + result.push_back(tensor->data()); + } + return result; +} + +void *plan(Tensor out, + const Tensor &hidden, + const Tensor &topk_indices, + const Tensor &topk_weights, + const std::vector &gate_weights, + const std::vector &up_weights, + const std::vector &down_weights, + size_t intermediate_size, + size_t num_experts) { + size_t seed = hash_combine(out, hidden, topk_indices, topk_weights); + hash_combine(seed, intermediate_size, num_experts); + + INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE( + Descriptor, descriptor, DeepseekMoe, seed, + out->desc(), hidden->desc(), topk_indices->desc(), topk_weights->desc(), + intermediate_size, num_experts); + + INFINIOP_WORKSPACE_TENSOR(workspace, DeepseekMoe, descriptor); + + auto planned = new PlannedMeta{ + descriptor, + workspace, + out, + hidden, + topk_indices, + topk_weights, + graph::GraphTensor(workspace), + graph::GraphTensor(out), + graph::GraphTensor(hidden), + graph::GraphTensor(topk_indices), + graph::GraphTensor(topk_weights), + to_graph_tensors(gate_weights), + to_graph_tensors(up_weights), + to_graph_tensors(down_weights), + {}, + {}, + {}, + nullptr, + nullptr, + nullptr}; + planned->gate_ptrs = data_ptrs(planned->gate_weights); + planned->up_ptrs = data_ptrs(planned->up_weights); + planned->down_ptrs = data_ptrs(planned->down_weights); + const size_t ptr_bytes = num_experts * sizeof(void *); + planned->gate_ptrs_device = context::allocateMemory(ptr_bytes); + planned->up_ptrs_device = context::allocateMemory(ptr_bytes); + planned->down_ptrs_device = context::allocateMemory(ptr_bytes); + context::memcpyH2D(planned->gate_ptrs_device->data(), planned->gate_ptrs.data(), ptr_bytes, false); + context::memcpyH2D(planned->up_ptrs_device->data(), planned->up_ptrs.data(), ptr_bytes, false); + context::memcpyH2D(planned->down_ptrs_device->data(), planned->down_ptrs.data(), ptr_bytes, false); + return planned; +} + +void run(void *planned_meta) { + auto planned = reinterpret_cast(planned_meta); + + INFINICORE_CHECK_ERROR(infiniopDeepseekMoeWithDevicePtrs( + planned->descriptor->desc, + planned->workspace->data(), + planned->workspace->numel(), + planned->out->data(), + planned->hidden->data(), + planned->topk_indices->data(), + planned->topk_weights->data(), + planned->gate_ptrs_device->data(), + planned->up_ptrs_device->data(), + planned->down_ptrs_device->data(), + context::getStream())); +} + +void cleanup(void **planned_meta_ptr) { + delete *reinterpret_cast(planned_meta_ptr); + *planned_meta_ptr = nullptr; +} + +static bool registered = []() { + DeepseekMoe::plan_dispatcher().registerDevice(Device::Type::NVIDIA, &plan); + DeepseekMoe::run_dispatcher().registerDevice(Device::Type::NVIDIA, &run); + DeepseekMoe::cleanup_dispatcher().registerDevice(Device::Type::NVIDIA, &cleanup); + return true; +}(); + +} // namespace infinicore::op::deepseek_moe_impl::infiniop diff --git a/src/infiniop/ops/deepseek_moe/deepseek_moe.h b/src/infiniop/ops/deepseek_moe/deepseek_moe.h new file mode 100644 index 000000000..f3f246886 --- /dev/null +++ b/src/infiniop/ops/deepseek_moe/deepseek_moe.h @@ -0,0 +1,42 @@ +#ifndef DEEPSEEK_MOE_H +#define DEEPSEEK_MOE_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::deepseek_moe::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + DeepseekMoeInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor(Opaque *opaque, DeepseekMoeInfo info, size_t workspace_size, \ + infiniDevice_t device_type, int device_id) \ + : InfiniopDescriptor{device_type, device_id}, _opaque(opaque), \ + _info(info), _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create(infiniopHandle_t handle, Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + infiniopTensorDescriptor_t hidden_desc, \ + infiniopTensorDescriptor_t topk_indices_desc, \ + infiniopTensorDescriptor_t topk_weights_desc, \ + size_t intermediate_size, size_t num_experts); \ + infiniStatus_t calculate(void *workspace, size_t workspace_size, void *out, \ + const void *hidden, const void *topk_indices, \ + const void *topk_weights, const void *const *gate_weights, \ + const void *const *up_weights, const void *const *down_weights, \ + void *stream) const; \ + infiniStatus_t calculateWithDevicePtrs(void *workspace, size_t workspace_size, \ + void *out, const void *hidden, const void *topk_indices, \ + const void *topk_weights, const void *gate_weight_ptrs, \ + const void *up_weight_ptrs, const void *down_weight_ptrs, \ + void *stream) const; \ + }; \ + } + +#endif diff --git a/src/infiniop/ops/deepseek_moe/info.h b/src/infiniop/ops/deepseek_moe/info.h new file mode 100644 index 000000000..be115d3ab --- /dev/null +++ b/src/infiniop/ops/deepseek_moe/info.h @@ -0,0 +1,65 @@ +#ifndef __DEEPSEEK_MOE_INFO_H__ +#define __DEEPSEEK_MOE_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::deepseek_moe { + +class DeepseekMoeInfo { + DeepseekMoeInfo() = default; + +public: + infiniDtype_t dtype; + size_t ntokens; + size_t hidden_size; + size_t topk; + size_t intermediate_size; + size_t num_experts; + + static utils::Result create( + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t hidden_desc, + infiniopTensorDescriptor_t topk_indices_desc, + infiniopTensorDescriptor_t topk_weights_desc, + size_t intermediate_size, + size_t num_experts) { + + auto dtype = hidden_desc->dtype(); + if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_BF16) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + if (out_desc->dtype() != dtype || topk_indices_desc->dtype() != INFINI_DTYPE_I32 || topk_weights_desc->dtype() != INFINI_DTYPE_F32) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + if (out_desc->ndim() != 2 || hidden_desc->ndim() != 2 || topk_indices_desc->ndim() != 2 || topk_weights_desc->ndim() != 2) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + auto hidden_shape = hidden_desc->shape(); + auto out_shape = out_desc->shape(); + auto indices_shape = topk_indices_desc->shape(); + auto weights_shape = topk_weights_desc->shape(); + if (out_shape != hidden_shape || indices_shape != weights_shape || indices_shape[0] != hidden_shape[0]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + if (intermediate_size == 0 || num_experts == 0 || indices_shape[1] == 0 || indices_shape[1] > num_experts) { + return INFINI_STATUS_BAD_PARAM; + } + if (hidden_desc->strides()[1] != 1 || out_desc->strides()[1] != 1 || topk_indices_desc->strides()[1] != 1 || topk_weights_desc->strides()[1] != 1) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + + return utils::Result(DeepseekMoeInfo{ + dtype, + hidden_shape[0], + hidden_shape[1], + indices_shape[1], + intermediate_size, + num_experts}); + } +}; + +} // namespace op::deepseek_moe + +#endif diff --git a/src/infiniop/ops/deepseek_moe/nvidia/deepseek_moe_nvidia.cu b/src/infiniop/ops/deepseek_moe/nvidia/deepseek_moe_nvidia.cu new file mode 100644 index 000000000..9c63f97d2 --- /dev/null +++ b/src/infiniop/ops/deepseek_moe/nvidia/deepseek_moe_nvidia.cu @@ -0,0 +1,312 @@ +#include +#include +#include + +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "deepseek_moe_nvidia.cuh" + +namespace op::deepseek_moe::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +namespace { + +constexpr size_t align_up(size_t value, size_t alignment) { + return (value + alignment - 1) / alignment * alignment; +} + +template +__device__ float to_float(T value) { + return static_cast(value); +} + +template <> +__device__ float to_float(half value) { + return __half2float(value); +} + +template <> +__device__ float to_float<__nv_bfloat16>(__nv_bfloat16 value) { + return __bfloat162float(value); +} + +template +__device__ T from_float(float value) { + return static_cast(value); +} + +template <> +__device__ half from_float(float value) { + return __float2half_rn(value); +} + +template <> +__device__ __nv_bfloat16 from_float<__nv_bfloat16>(float value) { + return __float2bfloat16_rn(value); +} + +template +__global__ void gate_up_kernel( + T *intermediate, + const T *hidden, + const int *topk_indices, + const float *topk_weights, + const void *const *gate_weights, + const void *const *up_weights, + size_t ntokens, + size_t hidden_size, + size_t topk, + size_t intermediate_size, + size_t num_experts) { + + const size_t route = blockIdx.x / intermediate_size; + const size_t j = blockIdx.x - route * intermediate_size; + if (route >= ntokens * topk) { + return; + } + const int expert = topk_indices[route]; + if (expert < 0 || static_cast(expert) >= num_experts) { + return; + } + const size_t token = route / topk; + const T *x = hidden + token * hidden_size; + const T *gate = reinterpret_cast(gate_weights[expert]) + j * hidden_size; + const T *up = reinterpret_cast(up_weights[expert]) + j * hidden_size; + + float gate_sum = 0.0f; + float up_sum = 0.0f; + for (size_t h = threadIdx.x; h < hidden_size; h += blockDim.x) { + const float xv = to_float(x[h]); + gate_sum += xv * to_float(gate[h]); + up_sum += xv * to_float(up[h]); + } + + __shared__ float gate_shared[256]; + __shared__ float up_shared[256]; + gate_shared[threadIdx.x] = gate_sum; + up_shared[threadIdx.x] = up_sum; + __syncthreads(); + + for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) { + if (threadIdx.x < stride) { + gate_shared[threadIdx.x] += gate_shared[threadIdx.x + stride]; + up_shared[threadIdx.x] += up_shared[threadIdx.x + stride]; + } + __syncthreads(); + } + + if (threadIdx.x == 0) { + const float g = gate_shared[0]; + const float silu = g / (1.0f + __expf(-g)); + intermediate[route * intermediate_size + j] = from_float(silu * up_shared[0] * topk_weights[route]); + } +} + +template +__global__ void down_kernel( + T *out, + const T *intermediate, + const int *topk_indices, + const void *const *down_weights, + size_t ntokens, + size_t hidden_size, + size_t topk, + size_t intermediate_size, + size_t num_experts) { + + const size_t linear = blockIdx.x; + const size_t token = linear / hidden_size; + const size_t h = linear - token * hidden_size; + if (token >= ntokens) { + return; + } + + float acc = 0.0f; + const size_t route_base = token * topk; + const size_t count = topk * intermediate_size; + for (size_t idx = threadIdx.x; idx < count; idx += blockDim.x) { + const size_t k = idx / intermediate_size; + const size_t j = idx - k * intermediate_size; + const size_t route = route_base + k; + const int expert = topk_indices[route]; + if (expert >= 0 && static_cast(expert) < num_experts) { + const T *down = reinterpret_cast(down_weights[expert]) + h * intermediate_size; + acc += to_float(intermediate[route * intermediate_size + j]) * to_float(down[j]); + } + } + + __shared__ float shared[256]; + shared[threadIdx.x] = acc; + __syncthreads(); + for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) { + if (threadIdx.x < stride) { + shared[threadIdx.x] += shared[threadIdx.x + stride]; + } + __syncthreads(); + } + if (threadIdx.x == 0) { + out[token * hidden_size + h] = from_float(shared[0]); + } +} + +template +infiniStatus_t launch_typed( + void *workspace, + size_t workspace_size, + const DeepseekMoeInfo &info, + void *out, + const void *hidden, + const void *topk_indices, + const void *topk_weights, + const void *const *gate_weights, + const void *const *up_weights, + const void *const *down_weights, + cudaStream_t stream, + bool weight_ptrs_on_device) { + + const size_t ptr_bytes = align_up(info.num_experts * sizeof(void *), 256); + const size_t ptr_workspace = ptr_bytes * 3; + const size_t intermediate_offset = align_up(ptr_workspace, 256); + const size_t intermediate_bytes = info.ntokens * info.topk * info.intermediate_size * sizeof(T); + if (workspace_size < intermediate_offset + intermediate_bytes) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + auto *base = reinterpret_cast(workspace); + const void *const *gate_ptrs = reinterpret_cast(base); + const void *const *up_ptrs = reinterpret_cast(base + ptr_bytes); + const void *const *down_ptrs = reinterpret_cast(base + ptr_bytes * 2); + auto *intermediate = reinterpret_cast(base + intermediate_offset); + + if (weight_ptrs_on_device) { + gate_ptrs = gate_weights; + up_ptrs = up_weights; + down_ptrs = down_weights; + } else { + auto **gate_workspace = reinterpret_cast(base); + auto **up_workspace = reinterpret_cast(base + ptr_bytes); + auto **down_workspace = reinterpret_cast(base + ptr_bytes * 2); + CHECK_CUDA(cudaMemcpyAsync(gate_workspace, gate_weights, info.num_experts * sizeof(void *), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(up_workspace, up_weights, info.num_experts * sizeof(void *), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(down_workspace, down_weights, info.num_experts * sizeof(void *), cudaMemcpyHostToDevice, stream)); + gate_ptrs = gate_workspace; + up_ptrs = up_workspace; + down_ptrs = down_workspace; + } + + constexpr int threads = 256; + const dim3 gate_blocks(static_cast(info.ntokens * info.topk * info.intermediate_size)); + gate_up_kernel<<>>( + intermediate, + reinterpret_cast(hidden), + reinterpret_cast(topk_indices), + reinterpret_cast(topk_weights), + gate_ptrs, + up_ptrs, + info.ntokens, + info.hidden_size, + info.topk, + info.intermediate_size, + info.num_experts); + + const dim3 down_blocks(static_cast(info.ntokens * info.hidden_size)); + down_kernel<<>>( + reinterpret_cast(out), + intermediate, + reinterpret_cast(topk_indices), + down_ptrs, + info.ntokens, + info.hidden_size, + info.topk, + info.intermediate_size, + info.num_experts); + + return INFINI_STATUS_SUCCESS; +} + +} // namespace + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t hidden_desc, + infiniopTensorDescriptor_t topk_indices_desc, + infiniopTensorDescriptor_t topk_weights_desc, + size_t intermediate_size, + size_t num_experts) { + + auto result = DeepseekMoeInfo::create(out_desc, hidden_desc, topk_indices_desc, topk_weights_desc, intermediate_size, num_experts); + CHECK_RESULT(result); + auto info = result.take(); + + const size_t dtype_size = info.dtype == INFINI_DTYPE_F16 ? sizeof(half) : sizeof(__nv_bfloat16); + const size_t ptr_bytes = align_up(info.num_experts * sizeof(void *), 256); + const size_t intermediate_offset = align_up(ptr_bytes * 3, 256); + const size_t intermediate_bytes = info.ntokens * info.topk * info.intermediate_size * dtype_size; + const size_t workspace_size = intermediate_offset + intermediate_bytes; + + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, + workspace_size, + handle->device, + handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *out, + const void *hidden, + const void *topk_indices, + const void *topk_weights, + const void *const *gate_weights, + const void *const *up_weights, + const void *const *down_weights, + void *stream_) const { + + auto stream = reinterpret_cast(stream_); + if (_info.dtype == INFINI_DTYPE_F16) { + return launch_typed(workspace, workspace_size, _info, out, hidden, topk_indices, topk_weights, gate_weights, up_weights, down_weights, stream, false); + } + if (_info.dtype == INFINI_DTYPE_BF16) { + return launch_typed<__nv_bfloat16>(workspace, workspace_size, _info, out, hidden, topk_indices, topk_weights, gate_weights, up_weights, down_weights, stream, false); + } + return INFINI_STATUS_BAD_TENSOR_DTYPE; +} + +infiniStatus_t Descriptor::calculateWithDevicePtrs( + void *workspace, + size_t workspace_size, + void *out, + const void *hidden, + const void *topk_indices, + const void *topk_weights, + const void *gate_weight_ptrs, + const void *up_weight_ptrs, + const void *down_weight_ptrs, + void *stream_) const { + + auto stream = reinterpret_cast(stream_); + auto gate_weights = reinterpret_cast(gate_weight_ptrs); + auto up_weights = reinterpret_cast(up_weight_ptrs); + auto down_weights = reinterpret_cast(down_weight_ptrs); + if (_info.dtype == INFINI_DTYPE_F16) { + return launch_typed(workspace, workspace_size, _info, out, hidden, topk_indices, topk_weights, gate_weights, up_weights, down_weights, stream, true); + } + if (_info.dtype == INFINI_DTYPE_BF16) { + return launch_typed<__nv_bfloat16>(workspace, workspace_size, _info, out, hidden, topk_indices, topk_weights, gate_weights, up_weights, down_weights, stream, true); + } + return INFINI_STATUS_BAD_TENSOR_DTYPE; +} + +} // namespace op::deepseek_moe::nvidia diff --git a/src/infiniop/ops/deepseek_moe/nvidia/deepseek_moe_nvidia.cuh b/src/infiniop/ops/deepseek_moe/nvidia/deepseek_moe_nvidia.cuh new file mode 100644 index 000000000..f8c8c4de7 --- /dev/null +++ b/src/infiniop/ops/deepseek_moe/nvidia/deepseek_moe_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef DEEPSEEK_MOE_NVIDIA_CUH +#define DEEPSEEK_MOE_NVIDIA_CUH + +#include "../deepseek_moe.h" + +DESCRIPTOR(nvidia) + +#endif diff --git a/src/infiniop/ops/deepseek_moe/operator.cc b/src/infiniop/ops/deepseek_moe/operator.cc new file mode 100644 index 000000000..759896859 --- /dev/null +++ b/src/infiniop/ops/deepseek_moe/operator.cc @@ -0,0 +1,129 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/deepseek_moe.h" + +#ifdef ENABLE_NVIDIA_API +#include "nvidia/deepseek_moe_nvidia.cuh" +#endif + +__INFINI_C infiniStatus_t infiniopCreateDeepseekMoeDescriptor( + infiniopHandle_t handle, + infiniopDeepseekMoeDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t hidden_desc, + infiniopTensorDescriptor_t topk_indices_desc, + infiniopTensorDescriptor_t topk_weights_desc, + size_t intermediate_size, + size_t num_experts) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::deepseek_moe::NAMESPACE::Descriptor::create( \ + handle, reinterpret_cast(desc_ptr), \ + out_desc, hidden_desc, topk_indices_desc, topk_weights_desc, \ + intermediate_size, num_experts) + + switch (handle->device) { +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetDeepseekMoeWorkspaceSize( + infiniopDeepseekMoeDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__INFINI_C infiniStatus_t infiniopDeepseekMoe( + infiniopDeepseekMoeDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *out, + const void *hidden, + const void *topk_indices, + const void *topk_weights, + const void *const *gate_weights, + const void *const *up_weights, + const void *const *down_weights, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, out, hidden, topk_indices, topk_weights, \ + gate_weights, up_weights, down_weights, stream) + + switch (desc->device_type) { +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CALCULATE +} + +__INFINI_C infiniStatus_t infiniopDeepseekMoeWithDevicePtrs( + infiniopDeepseekMoeDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *out, + const void *hidden, + const void *topk_indices, + const void *topk_weights, + const void *gate_weight_ptrs, + const void *up_weight_ptrs, + const void *down_weight_ptrs, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculateWithDevicePtrs( \ + workspace, workspace_size, out, hidden, topk_indices, topk_weights, \ + gate_weight_ptrs, up_weight_ptrs, down_weight_ptrs, stream) + + switch (desc->device_type) { +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CALCULATE +} + +__INFINI_C infiniStatus_t infiniopDestroyDeepseekMoeDescriptor( + infiniopDeepseekMoeDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DESTROY +} diff --git a/test/infiniop/deepseek_moe.py b/test/infiniop/deepseek_moe.py new file mode 100644 index 000000000..f8830e878 --- /dev/null +++ b/test/infiniop/deepseek_moe.py @@ -0,0 +1,267 @@ +import ctypes +from ctypes import c_uint64, c_void_p + +import torch +import torch.nn.functional as F +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, + torch_device_map, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== + +# ntokens, hidden_size, topk, intermediate_size, num_experts, use_device_ptrs +_TEST_CASES_ = [ + (1, 8, 1, 4, 2, False), + (2, 16, 2, 8, 4, False), + (3, 32, 2, 16, 4, True), +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.BF16: {"atol": 5e-2, "rtol": 5e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 100 + + +def _make_tensor(shape, dtype, device, start, scale): + values = torch.arange( + start, start + torch.tensor(shape).prod().item(), dtype=torch.float32 + ) + values = ((values % 17) - 8) * scale + return TestTensor.from_torch(values.reshape(shape), dtype, device) + + +def _reference_deepseek_moe( + hidden, topk_indices, topk_weights, gate_weights, up_weights, down_weights +): + ntokens, hidden_size = hidden.shape + topk = topk_indices.shape[1] + out = torch.empty_like(hidden) + + for token in range(ntokens): + token_out = torch.zeros(hidden_size, dtype=torch.float32, device=hidden.device) + x = hidden[token].float() + for k in range(topk): + expert = int(topk_indices[token, k].item()) + gate = F.linear(x, gate_weights[expert].float()) + up = F.linear(x, up_weights[expert].float()) + intermediate = (F.silu(gate) * up * topk_weights[token, k]).to(hidden.dtype) + token_out += F.linear(intermediate.float(), down_weights[expert].float()) + out[token] = token_out.to(hidden.dtype) + + return out + + +def _ptr_array(tensors): + return (c_void_p * len(tensors))(*[tensor.data() for tensor in tensors]) + + +def _device_ptr_tensor(tensors, device): + return torch.tensor( + [tensor.data() for tensor in tensors], + dtype=torch.uint64, + device=torch_device_map[device], + ) + + +def test( + handle, + device, + ntokens, + hidden_size, + topk, + intermediate_size, + num_experts, + use_device_ptrs, + dtype=InfiniDtype.F16, + sync=None, +): + print( + f"Testing DeepseekMoe on {InfiniDeviceNames[device]} with " + f"ntokens:{ntokens} hidden_size:{hidden_size} topk:{topk} " + f"intermediate_size:{intermediate_size} num_experts:{num_experts} " + f"use_device_ptrs:{use_device_ptrs} dtype:{InfiniDtypeNames[dtype]}" + ) + + hidden = _make_tensor((ntokens, hidden_size), dtype, device, 0, 0.02) + topk_indices_data = ( + torch.arange(ntokens * topk, dtype=torch.int32).reshape(ntokens, topk) + % num_experts + ) + topk_indices = TestTensor.from_torch(topk_indices_data, InfiniDtype.I32, device) + + topk_weights_data = torch.arange( + 1, ntokens * topk + 1, dtype=torch.float32 + ).reshape(ntokens, topk) + topk_weights_data = topk_weights_data / topk_weights_data.sum(dim=-1, keepdim=True) + topk_weights = TestTensor.from_torch(topk_weights_data, InfiniDtype.F32, device) + + gate_weights = [ + _make_tensor( + (intermediate_size, hidden_size), dtype, device, 100 + i * 97, 0.01 + ) + for i in range(num_experts) + ] + up_weights = [ + _make_tensor( + (intermediate_size, hidden_size), dtype, device, 200 + i * 97, 0.01 + ) + for i in range(num_experts) + ] + down_weights = [ + _make_tensor( + (hidden_size, intermediate_size), dtype, device, 300 + i * 97, 0.01 + ) + for i in range(num_experts) + ] + out = TestTensor((ntokens, hidden_size), None, dtype, device, mode="zeros") + + ans = _reference_deepseek_moe( + hidden.torch_tensor(), + topk_indices.torch_tensor(), + topk_weights.torch_tensor(), + [weight.torch_tensor() for weight in gate_weights], + [weight.torch_tensor() for weight in up_weights], + [weight.torch_tensor() for weight in down_weights], + ) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateDeepseekMoeDescriptor( + handle, + ctypes.byref(descriptor), + out.descriptor, + hidden.descriptor, + topk_indices.descriptor, + topk_weights.descriptor, + intermediate_size, + num_experts, + ) + ) + + for tensor in [ + out, + hidden, + topk_indices, + topk_weights, + *gate_weights, + *up_weights, + *down_weights, + ]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetDeepseekMoeWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, out.device) + + gate_ptrs = _ptr_array(gate_weights) + up_ptrs = _ptr_array(up_weights) + down_ptrs = _ptr_array(down_weights) + gate_device_ptrs = _device_ptr_tensor(gate_weights, device) + up_device_ptrs = _device_ptr_tensor(up_weights, device) + down_device_ptrs = _device_ptr_tensor(down_weights, device) + + def lib_deepseek_moe(): + if use_device_ptrs: + check_error( + LIBINFINIOP.infiniopDeepseekMoeWithDevicePtrs( + descriptor, + workspace.data(), + workspace_size.value, + out.data(), + hidden.data(), + topk_indices.data(), + topk_weights.data(), + gate_device_ptrs.data_ptr(), + up_device_ptrs.data_ptr(), + down_device_ptrs.data_ptr(), + None, + ) + ) + else: + check_error( + LIBINFINIOP.infiniopDeepseekMoe( + descriptor, + workspace.data(), + workspace_size.value, + out.data(), + hidden.data(), + topk_indices.data(), + topk_weights.data(), + gate_ptrs, + up_ptrs, + down_ptrs, + None, + ) + ) + + lib_deepseek_moe() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(out.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(out.actual_tensor(), ans, atol=atol, rtol=rtol) + + if PROFILE: + profile_operation( + "PyTorch", + lambda: _reference_deepseek_moe( + hidden.torch_tensor(), + topk_indices.torch_tensor(), + topk_weights.torch_tensor(), + [weight.torch_tensor() for weight in gate_weights], + [weight.torch_tensor() for weight in up_weights], + [weight.torch_tensor() for weight in down_weights], + ), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", lambda: lib_deepseek_moe(), device, NUM_PRERUN, NUM_ITERATIONS + ) + + check_error(LIBINFINIOP.infiniopDestroyDeepseekMoeDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES_, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index 3ec20a72e..a59bbdf3f 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -2292,6 +2292,62 @@ def fused_ffn_(lib): ] +@OpRegister.operator +def deepseek_moe_(lib): + lib.infiniopCreateDeepseekMoeDescriptor.restype = c_int32 + lib.infiniopCreateDeepseekMoeDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_size_t, + c_size_t, + ] + + lib.infiniopGetDeepseekMoeWorkspaceSize.restype = c_int32 + lib.infiniopGetDeepseekMoeWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopDeepseekMoe.restype = c_int32 + lib.infiniopDeepseekMoe.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + POINTER(c_void_p), + POINTER(c_void_p), + POINTER(c_void_p), + c_void_p, + ] + + lib.infiniopDeepseekMoeWithDevicePtrs.restype = c_int32 + lib.infiniopDeepseekMoeWithDevicePtrs.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyDeepseekMoeDescriptor.restype = c_int32 + lib.infiniopDestroyDeepseekMoeDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + @OpRegister.operator def blas_amax_(lib): lib.infiniopCreateBlasAmaxDescriptor.restype = c_int32