Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/infinicore/ops.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "ops/cdist.hpp"
#include "ops/conv2d.hpp"
#include "ops/cross_entropy.hpp"
#include "ops/deepseek_moe.hpp"
#include "ops/embedding.hpp"
#include "ops/flash_attention.hpp"
#include "ops/fmin.hpp"
Expand Down
41 changes: 41 additions & 0 deletions include/infinicore/ops/deepseek_moe.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#pragma once

#include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp"
#include <vector>

namespace infinicore::op {

// Declares the DeepseekMoe graph operator class through INFINICORE_GRAPH_OP_CLASS.
// The types listed after the class name match, in order, the parameters of
// deepseek_moe_(): out, hidden, topk_indices, topk_weights, the gate/up/down
// expert weight vectors, intermediate_size and num_experts.
INFINICORE_GRAPH_OP_CLASS(
DeepseekMoe,
Tensor,
const Tensor &,
const Tensor &,
const Tensor &,
const std::vector<Tensor> &,
const std::vector<Tensor> &,
const std::vector<Tensor> &,
size_t,
size_t);

/// Out-of-place DeepSeek MoE operator.
///
/// Allocates a result tensor with the same shape, dtype and device as
/// `hidden`, runs deepseek_moe_() into it and returns it.
///
/// @param hidden            Input activations.
/// @param topk_indices      Selected expert indices (layout not visible here — TODO confirm).
/// @param topk_weights      Routing weights matching topk_indices (TODO confirm layout).
/// @param gate_weights      One gate weight tensor per expert.
/// @param up_weights        One up weight tensor per expert.
/// @param down_weights      One down weight tensor per expert.
/// @param intermediate_size Forwarded unchanged to the backend descriptor.
/// @param num_experts       Expected length of each of the three weight vectors.
/// @return The newly allocated output tensor.
Tensor deepseek_moe(const Tensor &hidden,
const Tensor &topk_indices,
const Tensor &topk_weights,
const std::vector<Tensor> &gate_weights,
const std::vector<Tensor> &up_weights,
const std::vector<Tensor> &down_weights,
size_t intermediate_size,
size_t num_experts);

/// DeepSeek MoE operator writing into a caller-provided tensor.
///
/// Forwards every argument to DeepseekMoe::execute.
///
/// @param out               Destination tensor; the DeepseekMoe constructor requires
///                          it to be on the same device as all other tensors.
/// @param hidden            Input activations.
/// @param topk_indices      Selected expert indices (layout not visible here — TODO confirm).
/// @param topk_weights      Routing weights matching topk_indices (TODO confirm layout).
/// @param gate_weights      One gate weight tensor per expert.
/// @param up_weights        One up weight tensor per expert.
/// @param down_weights      One down weight tensor per expert.
/// @param intermediate_size Forwarded unchanged to the backend descriptor.
/// @param num_experts       Expected length of each of the three weight vectors.
void deepseek_moe_(Tensor out,
const Tensor &hidden,
const Tensor &topk_indices,
const Tensor &topk_weights,
const std::vector<Tensor> &gate_weights,
const std::vector<Tensor> &up_weights,
const std::vector<Tensor> &down_weights,
size_t intermediate_size,
size_t num_experts);

} // namespace infinicore::op
1 change: 1 addition & 0 deletions include/infiniop.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include "infiniop/ops/clip.h"
#include "infiniop/ops/conv.h"
#include "infiniop/ops/cross_entropy.h"
#include "infiniop/ops/deepseek_moe.h"
#include "infiniop/ops/dequant/per_tensor_dequant_int8.h"
#include "infiniop/ops/dequantize_awq.h"
#include "infiniop/ops/dequantize_gptq.h"
Expand Down
57 changes: 57 additions & 0 deletions include/infiniop/ops/deepseek_moe.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#ifndef __INFINIOP_DEEPSEEK_MOE_API_H__
#define __INFINIOP_DEEPSEEK_MOE_API_H__

#include "../operator_descriptor.h"

#ifdef __cplusplus
#include <cstddef>
#else
#include <stddef.h>
#endif

/* Opaque handle to a DeepseekMoe operator descriptor (a pointer to the
 * generic InfiniopDescriptor). */
typedef struct InfiniopDescriptor *infiniopDeepseekMoeDescriptor_t;

/* Creates a DeepseekMoe descriptor from the tensor descriptors of the output,
 * the hidden input and the top-k indices/weights, plus intermediate_size and
 * num_experts. The new descriptor is returned through desc_ptr.
 * NOTE(review): no descriptors are passed for the expert weight tensors, so
 * their shapes are presumably derived from the hidden descriptor and
 * intermediate_size — confirm against the backend implementation. */
__INFINI_C __export infiniStatus_t infiniopCreateDeepseekMoeDescriptor(
infiniopHandle_t handle,
infiniopDeepseekMoeDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t out_desc,
infiniopTensorDescriptor_t hidden_desc,
infiniopTensorDescriptor_t topk_indices_desc,
infiniopTensorDescriptor_t topk_weights_desc,
size_t intermediate_size,
size_t num_experts);

/* Writes the workspace size required by `desc` into *size.
 * NOTE(review): the unit is presumably bytes — confirm. */
__INFINI_C __export infiniStatus_t infiniopGetDeepseekMoeWorkspaceSize(
infiniopDeepseekMoeDescriptor_t desc,
size_t *size);

/* Runs the DeepseekMoe operator described by `desc`.
 * gate_weights, up_weights and down_weights are each an array of per-expert
 * weight pointers.
 * NOTE(review): the arrays themselves are presumably host-resident (in contrast
 * with infiniopDeepseekMoeWithDevicePtrs) and hold num_experts entries — confirm.
 * `stream` is the backend stream handle the work is issued on. */
__INFINI_C __export infiniStatus_t infiniopDeepseekMoe(
infiniopDeepseekMoeDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *out,
const void *hidden,
const void *topk_indices,
const void *topk_weights,
const void *const *gate_weights,
const void *const *up_weights,
const void *const *down_weights,
void *stream);

/* Variant of infiniopDeepseekMoe that takes each per-expert pointer table as a
 * single opaque pointer. The infinicore backend in this change passes buffers
 * obtained from context::allocateMemory that were filled via memcpyH2D with
 * num_experts weight pointers each.
 * NOTE(review): callers are presumably required to keep those tables alive
 * until the work issued on `stream` has finished — confirm. */
__INFINI_C __export infiniStatus_t infiniopDeepseekMoeWithDevicePtrs(
infiniopDeepseekMoeDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *out,
const void *hidden,
const void *topk_indices,
const void *topk_weights,
const void *gate_weight_ptrs,
const void *up_weight_ptrs,
const void *down_weight_ptrs,
void *stream);

/* Destroys a descriptor created by infiniopCreateDeepseekMoeDescriptor. */
__INFINI_C __export infiniStatus_t infiniopDestroyDeepseekMoeDescriptor(
infiniopDeepseekMoeDescriptor_t desc);

#endif
83 changes: 83 additions & 0 deletions src/infinicore/ops/deepseek_moe/deepseek_moe.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#include "infinicore/ops/deepseek_moe.hpp"
#include "../../utils.hpp"
#include <stdexcept>

namespace infinicore::op {

// Defines the dispatcher accessors of DeepseekMoe; backends register their
// plan/run/cleanup functions through plan_dispatcher(), run_dispatcher() and
// cleanup_dispatcher().
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(DeepseekMoe);

namespace {

/// Ensures that each of the three per-expert weight vectors contains exactly
/// `num_experts` tensors.
///
/// @throws std::runtime_error if any of the vectors has a different length.
void check_weights(const std::vector<Tensor> &gate_weights,
                   const std::vector<Tensor> &up_weights,
                   const std::vector<Tensor> &down_weights,
                   size_t num_experts) {
    const bool all_sizes_match = gate_weights.size() == num_experts
                              && up_weights.size() == num_experts
                              && down_weights.size() == num_experts;
    if (all_sizes_match) {
        return;
    }
    throw std::runtime_error("DeepseekMoe: expert weight vector size mismatch");
}

} // namespace

/// Validates the inputs and dispatches planning for the device type of `out`.
///
/// Checks, in order: that out/hidden/topk_indices/topk_weights share a device,
/// that every weight vector holds exactly `num_experts` tensors (throws
/// std::runtime_error otherwise), and that each expert's three weight tensors
/// are on the same device as `out`.
DeepseekMoe::DeepseekMoe(Tensor out,
const Tensor &hidden,
const Tensor &topk_indices,
const Tensor &topk_weights,
const std::vector<Tensor> &gate_weights,
const std::vector<Tensor> &up_weights,
const std::vector<Tensor> &down_weights,
size_t intermediate_size,
size_t num_experts) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, hidden, topk_indices, topk_weights);
// Validate the vector lengths first so that indexing up to num_experts below is in bounds.
check_weights(gate_weights, up_weights, down_weights, num_experts);
for (size_t i = 0; i < num_experts; ++i) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, gate_weights[i], up_weights[i], down_weights[i]);
}
// The backend is selected by the device type of the output tensor.
INFINICORE_GRAPH_OP_DISPATCH(out->device().getType(),
out, hidden, topk_indices, topk_weights,
gate_weights, up_weights, down_weights,
intermediate_size, num_experts);
}

/// Entry point used by deepseek_moe_(): hands all arguments to
/// INFINICORE_GRAPH_OP_RECORD_OR_RUN for the DeepseekMoe op.
/// NOTE(review): per the macro name this presumably records the op while a
/// graph is being captured and runs it immediately otherwise — confirm.
void DeepseekMoe::execute(Tensor out,
const Tensor &hidden,
const Tensor &topk_indices,
const Tensor &topk_weights,
const std::vector<Tensor> &gate_weights,
const std::vector<Tensor> &up_weights,
const std::vector<Tensor> &down_weights,
size_t intermediate_size,
size_t num_experts) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(
DeepseekMoe,
out, hidden, topk_indices, topk_weights,
gate_weights, up_weights, down_weights,
intermediate_size, num_experts);
}

/// DeepSeek MoE writing into the caller-provided `out` tensor.
/// Thin forwarder: passes every argument unchanged to DeepseekMoe::execute.
void deepseek_moe_(Tensor out,
const Tensor &hidden,
const Tensor &topk_indices,
const Tensor &topk_weights,
const std::vector<Tensor> &gate_weights,
const std::vector<Tensor> &up_weights,
const std::vector<Tensor> &down_weights,
size_t intermediate_size,
size_t num_experts) {
DeepseekMoe::execute(out, hidden, topk_indices, topk_weights, gate_weights, up_weights, down_weights, intermediate_size, num_experts);
}

/// Out-of-place DeepSeek MoE.
///
/// Creates a fresh tensor matching `hidden` in shape, dtype and device, fills
/// it through deepseek_moe_() and returns it.
Tensor deepseek_moe(const Tensor &hidden,
                    const Tensor &topk_indices,
                    const Tensor &topk_weights,
                    const std::vector<Tensor> &gate_weights,
                    const std::vector<Tensor> &up_weights,
                    const std::vector<Tensor> &down_weights,
                    size_t intermediate_size,
                    size_t num_experts) {
    // The result mirrors the input activations' metadata.
    auto result = Tensor::empty(hidden->shape(), hidden->dtype(), hidden->device());

    deepseek_moe_(result,
                  hidden,
                  topk_indices,
                  topk_weights,
                  gate_weights,
                  up_weights,
                  down_weights,
                  intermediate_size,
                  num_experts);

    return result;
}

} // namespace infinicore::op
118 changes: 118 additions & 0 deletions src/infinicore/ops/deepseek_moe/deepseek_moe_infiniop.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
#include "infinicore/ops/deepseek_moe.hpp"

#include "../infiniop_impl.hpp"

#include <memory>

namespace infinicore::op::deepseek_moe_impl::infiniop {

// Declares the cacheable `Descriptor` wrapper type for the DeepseekMoe infiniop descriptor.
// NOTE(review): 100 is presumably the descriptor cache capacity — confirm against the macro definition.
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, DeepseekMoe, 100);

// Per-plan state: built by plan(), read by run(), destroyed by cleanup().
// plan() fills this struct with a positional aggregate initialiser, so the
// member order must stay in sync with that initialiser.
struct PlannedMeta {
// Cached infiniop descriptor used for the launch.
std::shared_ptr<Descriptor> descriptor;
// Tensor handles stored next to the graph-tensor views below.
// NOTE(review): presumably kept to hold the underlying buffers alive for the plan's lifetime — confirm.
Tensor workspace_owner, out_owner, hidden_owner, topk_indices_owner, topk_weights_owner;
// Graph-tensor views whose data() pointers are passed to infiniop in run().
graph::GraphTensor workspace, out, hidden, topk_indices, topk_weights;
// Per-expert weight tensors converted to graph tensors.
std::vector<graph::GraphTensor> gate_weights, up_weights, down_weights;
// Host-side tables of the per-expert data() pointers, captured in plan().
std::vector<const void *> gate_ptrs, up_ptrs, down_ptrs;
// Buffers from context::allocateMemory that receive copies of the tables
// above via memcpyH2D; run() passes them to infiniopDeepseekMoeWithDevicePtrs.
std::shared_ptr<Memory> gate_ptrs_device, up_ptrs_device, down_ptrs_device;
};

static std::vector<graph::GraphTensor> to_graph_tensors(const std::vector<Tensor> &tensors) {
std::vector<graph::GraphTensor> result;
result.reserve(tensors.size());
for (const auto &tensor : tensors) {
result.emplace_back(tensor);
}
return result;
}

/// Collects the data() pointer of every graph tensor, in order.
static std::vector<const void *> data_ptrs(const std::vector<graph::GraphTensor> &tensors) {
    std::vector<const void *> ptrs(tensors.size(), nullptr);
    for (size_t idx = 0; idx < tensors.size(); ++idx) {
        ptrs[idx] = tensors[idx]->data();
    }
    return ptrs;
}

/// Plans one DeepseekMoe launch for the infiniop backend.
///
/// Fetches (or creates) the cached infiniop descriptor keyed by a seed built
/// from the four activation tensors plus `intermediate_size` and
/// `num_experts`, obtains a workspace tensor, collects the data() pointer of
/// every expert weight and copies the three pointer tables into buffers
/// obtained from context::allocateMemory.
///
/// @return An owning PlannedMeta* (as void*); it must be released with cleanup().
/// @note Exactly `num_experts` pointers are copied out of each table, so every
///       weight vector must hold `num_experts` tensors. The DeepseekMoe
///       constructor verifies this before dispatching here.
void *plan(Tensor out,
           const Tensor &hidden,
           const Tensor &topk_indices,
           const Tensor &topk_weights,
           const std::vector<Tensor> &gate_weights,
           const std::vector<Tensor> &up_weights,
           const std::vector<Tensor> &down_weights,
           size_t intermediate_size,
           size_t num_experts) {
    size_t seed = hash_combine(out, hidden, topk_indices, topk_weights);
    hash_combine(seed, intermediate_size, num_experts);

    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
        Descriptor, descriptor, DeepseekMoe, seed,
        out->desc(), hidden->desc(), topk_indices->desc(), topk_weights->desc(),
        intermediate_size, num_experts);

    INFINIOP_WORKSPACE_TENSOR(workspace, DeepseekMoe, descriptor);

    // Keep the plan in a unique_ptr while the remaining steps can still throw
    // (allocation, host-to-device copies); otherwise a failure below would
    // leak the PlannedMeta. Ownership is handed to the caller only at return.
    // Aggregate initialisation: the order follows PlannedMeta's member order.
    std::unique_ptr<PlannedMeta> planned(new PlannedMeta{
        descriptor,
        workspace,
        out,
        hidden,
        topk_indices,
        topk_weights,
        graph::GraphTensor(workspace),
        graph::GraphTensor(out),
        graph::GraphTensor(hidden),
        graph::GraphTensor(topk_indices),
        graph::GraphTensor(topk_weights),
        to_graph_tensors(gate_weights),
        to_graph_tensors(up_weights),
        to_graph_tensors(down_weights),
        {},
        {},
        {},
        nullptr,
        nullptr,
        nullptr});

    // Host-side pointer tables, one entry per expert.
    planned->gate_ptrs = data_ptrs(planned->gate_weights);
    planned->up_ptrs = data_ptrs(planned->up_weights);
    planned->down_ptrs = data_ptrs(planned->down_weights);

    // Mirror each table into its own buffer for the WithDevicePtrs entry point.
    const size_t ptr_bytes = num_experts * sizeof(void *);
    planned->gate_ptrs_device = context::allocateMemory(ptr_bytes);
    planned->up_ptrs_device = context::allocateMemory(ptr_bytes);
    planned->down_ptrs_device = context::allocateMemory(ptr_bytes);
    // NOTE(review): the trailing `false` presumably requests a synchronous copy — confirm.
    context::memcpyH2D(planned->gate_ptrs_device->data(), planned->gate_ptrs.data(), ptr_bytes, false);
    context::memcpyH2D(planned->up_ptrs_device->data(), planned->up_ptrs.data(), ptr_bytes, false);
    context::memcpyH2D(planned->down_ptrs_device->data(), planned->down_ptrs.data(), ptr_bytes, false);

    return planned.release();
}

/// Launches the planned DeepseekMoe op on the current context stream.
///
/// @param planned_meta The PlannedMeta* returned by plan(), passed as void*.
///
/// The workspace tensor's numel() is passed as the workspace size, and the
/// pointer-table buffers prepared in plan() are passed to the WithDevicePtrs
/// entry point. Any non-success status is handled by INFINICORE_CHECK_ERROR.
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);

INFINICORE_CHECK_ERROR(infiniopDeepseekMoeWithDevicePtrs(
planned->descriptor->desc,
planned->workspace->data(),
planned->workspace->numel(),
planned->out->data(),
planned->hidden->data(),
planned->topk_indices->data(),
planned->topk_weights->data(),
planned->gate_ptrs_device->data(),
planned->up_ptrs_device->data(),
planned->down_ptrs_device->data(),
context::getStream()));
}

/// Destroys the PlannedMeta created by plan() and nulls the caller's handle.
///
/// @param planned_meta_ptr Address of the void* handle returned by plan().
void cleanup(void **planned_meta_ptr) {
    // Read the stored void* and convert the value. Reinterpreting the void**
    // itself as PlannedMeta** and dereferencing it would access a void* object
    // through an lvalue of an unrelated pointer type (type-aliasing violation).
    delete static_cast<PlannedMeta *>(*planned_meta_ptr);
    *planned_meta_ptr = nullptr;
}

// Static-initialisation hook: registers this backend's plan/run/cleanup
// functions with the DeepseekMoe dispatchers for NVIDIA devices.
static bool registered = [] {
    const auto device_type = Device::Type::NVIDIA;
    DeepseekMoe::plan_dispatcher().registerDevice(device_type, &plan);
    DeepseekMoe::run_dispatcher().registerDevice(device_type, &run);
    DeepseekMoe::cleanup_dispatcher().registerDevice(device_type, &cleanup);
    return true;
}();

} // namespace infinicore::op::deepseek_moe_impl::infiniop
42 changes: 42 additions & 0 deletions src/infiniop/ops/deepseek_moe/deepseek_moe.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#ifndef DEEPSEEK_MOE_H
#define DEEPSEEK_MOE_H

#include "../../operator.h"
#include "info.h"

// Declares op::deepseek_moe::<NAMESPACE>::Descriptor, the backend-specific
// DeepseekMoe descriptor class. Each instance stores a pointer to a
// backend-defined Opaque struct, a DeepseekMoeInfo and the required workspace
// size. The constructor is private; instances are obtained through the static
// create(). calculate() and calculateWithDevicePtrs() take the same arguments
// as the C entry points infiniopDeepseekMoe and
// infiniopDeepseekMoeWithDevicePtrs. The destructor is declared here and
// defined by each backend.
#define DESCRIPTOR(NAMESPACE) \
namespace op::deepseek_moe::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
DeepseekMoeInfo _info; \
size_t _workspace_size; \
\
Descriptor(Opaque *opaque, DeepseekMoeInfo info, size_t workspace_size, \
infiniDevice_t device_type, int device_id) \
: InfiniopDescriptor{device_type, device_id}, _opaque(opaque), \
_info(info), _workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
size_t workspaceSize() const { return _workspace_size; } \
static infiniStatus_t create(infiniopHandle_t handle, Descriptor **desc_ptr, \
infiniopTensorDescriptor_t out_desc, \
infiniopTensorDescriptor_t hidden_desc, \
infiniopTensorDescriptor_t topk_indices_desc, \
infiniopTensorDescriptor_t topk_weights_desc, \
size_t intermediate_size, size_t num_experts); \
infiniStatus_t calculate(void *workspace, size_t workspace_size, void *out, \
const void *hidden, const void *topk_indices, \
const void *topk_weights, const void *const *gate_weights, \
const void *const *up_weights, const void *const *down_weights, \
void *stream) const; \
infiniStatus_t calculateWithDevicePtrs(void *workspace, size_t workspace_size, \
void *out, const void *hidden, const void *topk_indices, \
const void *topk_weights, const void *gate_weight_ptrs, \
const void *up_weight_ptrs, const void *down_weight_ptrs, \
void *stream) const; \
}; \
}

#endif
Loading
Loading