From c5580c6a317f9c24407250c0449ead555218edb7 Mon Sep 17 00:00:00 2001 From: zhushuang Date: Tue, 16 Jun 2026 09:27:25 +0000 Subject: [PATCH 01/12] feat: refactor InfiniOps runtime apis through InfiniRT --- CMakeLists.txt | 10 ++ examples/runtime_api.h | 2 +- scripts/generate_wrappers.py | 88 ++++++++++++- src/CMakeLists.txt | 16 ++- src/data_type.h | 181 ++------------------------ src/device.h | 109 +--------------- src/native/cpu/caster_.h | 2 +- src/native/cpu/data_type_.h | 21 --- src/native/cpu/device_.h | 13 -- src/native/cpu/runtime_.h | 34 ----- src/native/cuda/iluvatar/data_type_.h | 23 +--- src/native/cuda/iluvatar/device_.h | 9 +- src/native/cuda/iluvatar/runtime_.h | 38 +----- src/native/cuda/metax/data_type_.h | 22 +--- src/native/cuda/metax/device_.h | 9 +- src/native/cuda/metax/runtime_.h | 32 +---- src/native/cuda/moore/data_type_.h | 21 +-- src/native/cuda/moore/device_.h | 9 +- src/native/cuda/moore/runtime_.h | 40 +----- src/native/cuda/nvidia/data_type_.h | 23 +--- src/native/cuda/nvidia/device_.h | 9 +- src/native/cuda/nvidia/runtime_.h | 38 +----- src/runtime.h | 45 +------ src/tensor.h | 149 +-------------------- 24 files changed, 152 insertions(+), 791 deletions(-) delete mode 100644 src/native/cpu/data_type_.h delete mode 100644 src/native/cpu/device_.h delete mode 100644 src/native/cpu/runtime_.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 2d10efdbe..cab00b619 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -502,6 +502,16 @@ if(WITH_HYGON AND NOT EXISTS "${DTK_ROOT}/llvm/lib/LLVMgold.so") set(PYBIND11_ENABLE_EXTRAS OFF) endif() +set(INFINIRT_SOURCE_DIR "${PROJECT_SOURCE_DIR}/../InfiniRT" CACHE PATH "InfiniRT source directory") +if(NOT EXISTS "${INFINIRT_SOURCE_DIR}/CMakeLists.txt") + message(FATAL_ERROR + "InfiniRT not found at `${INFINIRT_SOURCE_DIR}`. " + "Set `INFINIRT_SOURCE_DIR` to the InfiniRT source directory.") +endif() +if(NOT TARGET infinirt) + add_subdirectory("${INFINIRT_SOURCE_DIR}" "${CMAKE_BINARY_DIR}/InfiniRT") +endif() + add_subdirectory(src) if(NOT GENERATE_PYTHON_BINDINGS) diff --git a/examples/runtime_api.h b/examples/runtime_api.h index 101fcad59..7df6a6eb3 100644 --- a/examples/runtime_api.h +++ b/examples/runtime_api.h @@ -23,8 +23,8 @@ #include "native/ascend/ops/gemm/kernel.h" #include "native/ascend/runtime_.h" #elif WITH_CPU +#include "infini_rt/cpu/runtime_.h" #include "native/cpu/ops/gemm/gemm.h" -#include "native/cpu/runtime_.h" #else #error "One `WITH_*` backend must be enabled for the examples." #endif diff --git a/scripts/generate_wrappers.py b/scripts/generate_wrappers.py index 31e1ab0aa..4cffa6b08 100644 --- a/scripts/generate_wrappers.py +++ b/scripts/generate_wrappers.py @@ -192,7 +192,7 @@ def __call__(self, op_name): "src", "-I", str(_GENERATION_DIR), - ) + _get_system_include_flags() + ) + _get_infinirt_include_flags() + _get_system_include_flags() translation_unit = index.parse(str(_find_base_header(op_name)), args=args) nodes = tuple(type(self)._find(translation_unit.cursor, op_name)) @@ -407,6 +407,19 @@ def _find_vector_int64_params(op_name): return set(re.findall(r"std::vector\s+(\w+)", source)) +def _find_tensor_params(op_name): + source = _find_base_header(op_name).read_text() + + params = set() + params.update( + re.findall(r"(?:^|[,(]\s*)(?:const\s+)?Tensor\s+(\w+)", source) + ) + params.update(_find_optional_tensor_params(op_name)) + params.update(_find_vector_tensor_params(op_name)) + + return params + + def _generate_pybind11(operator): optional_tensor_params = _find_optional_tensor_params(operator.name) optional_non_tensor_params = _find_optional_non_tensor_params(operator.name) @@ -840,6 +853,7 @@ def _generate_tensor_caster(name, is_data=False): def _generate_generated_dispatch_entries(operator): optional_tensor_params = _find_optional_tensor_params(operator.name) optional_non_tensor_params = _find_optional_non_tensor_params(operator.name) + tensor_params = _find_tensor_params(operator.name) vector_tensor_params = _find_vector_tensor_params(operator.name) vector_int64_params = _find_vector_int64_params(operator.name) @@ -863,6 +877,15 @@ def _is_vector_tensor(arg): def _is_vector_int64(arg): return arg.spelling in vector_int64_params + def _is_tensor(arg): + if arg.spelling in optional_non_tensor_params: + return False + + if arg.spelling in tensor_params: + return True + + return "Tensor" in arg.type.spelling or "TensorView" in arg.type.spelling + def _generate_params(node): parts = [] @@ -876,6 +899,8 @@ def _generate_params(node): parts.append(f"std::vector {arg.spelling}") elif _is_vector_int64(arg): parts.append(f"std::vector {arg.spelling}") + elif _is_tensor(arg): + parts.append(f"Tensor {arg.spelling}") else: parts.append(f"{arg.type.spelling} {arg.spelling}") @@ -1032,16 +1057,71 @@ def _strip_top_level_const(type_spelling): def _generate_operator_call_instantiation_entries(operator): + optional_tensor_params = _find_optional_tensor_params(operator.name) + optional_non_tensor_params = _find_optional_non_tensor_params(operator.name) + tensor_params = _find_tensor_params(operator.name) + vector_tensor_params = _find_vector_tensor_params(operator.name) + vector_int64_params = _find_vector_int64_params(operator.name) + + def _is_optional_tensor(arg): + spelling = arg.type.spelling + + if "std::optional" in spelling: + return "Tensor" in spelling or "TensorView" in spelling + + if arg.spelling in optional_non_tensor_params: + return False + + if arg.spelling in optional_tensor_params: + return True + + return False + + def _is_vector_tensor(arg): + if arg.spelling in vector_tensor_params: + return True + + return "std::vector" in arg.type.spelling and ( + "Tensor" in arg.type.spelling or "TensorView" in arg.type.spelling + ) + + def _is_vector_int64(arg): + return arg.spelling in vector_int64_params + + def _is_tensor(arg): + if arg.spelling in optional_non_tensor_params: + return False + + if arg.spelling in tensor_params: + return True + + return "Tensor" in arg.type.spelling or "TensorView" in arg.type.spelling + + def _normalized_type(arg): + if _is_optional_tensor(arg): + return "std::optional" + + if _is_vector_tensor(arg): + return "std::vector" + + if _is_vector_int64(arg): + return "std::vector" + + if _is_tensor(arg): + return "Tensor" + + return _strip_top_level_const(arg.type.spelling) + def _generate_template_arguments(node): return ", ".join( - _strip_top_level_const(arg.type.spelling) + _normalized_type(arg) for arg in node.get_arguments() if arg.spelling != "stream" ) def _generate_parameters(node): return ", ".join( - f"const {_strip_top_level_const(arg.type.spelling)}& {arg.spelling}" + f"const {_normalized_type(arg)}& {arg.spelling}" for arg in node.get_arguments() if arg.spelling != "stream" ) @@ -1153,7 +1233,7 @@ def _generate_operator_call_instantiation_source(devices, impl_paths, definition def _device_marker_headers(devices): paths = { - "cpu": "native/cpu/device_.h", + "cpu": "infini_rt/cpu/device_.h", "nvidia": "native/cuda/nvidia/device_.h", "cambricon": "native/cambricon/device_.h", "ascend": "native/ascend/device_.h", diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 61d3103c0..eab0d4bdd 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,4 +1,5 @@ -add_library(infiniops SHARED) +add_library(infiniops SHARED operator.h) +set_target_properties(infiniops PROPERTIES LINKER_LANGUAGE CXX) function(_infini_ops_write_if_different path content) set(_should_write TRUE) @@ -35,8 +36,11 @@ endfunction() include(GNUInstallDirs) file(GLOB BASE_SRCS CONFIGURE_DEPENDS "*.cc") +list(FILTER BASE_SRCS EXCLUDE REGEX ".*tensor\\.cc$") target_sources(infiniops PRIVATE ${BASE_SRCS}) +target_link_libraries(infiniops PUBLIC infinirt) + set(DEVICE_LIST "") if(WITH_CPU) @@ -46,7 +50,7 @@ if(WITH_CPU) ) file(GLOB_RECURSE CPU_SOURCES CONFIGURE_DEPENDS ${CPU_PATTERNS}) - list(APPEND CORE_SOURCES ${CPU_SOURCES}) + target_sources(infiniops PRIVATE ${CPU_SOURCES}) target_compile_definitions(infiniops PUBLIC WITH_CPU=1) @@ -597,7 +601,11 @@ if(GENERATE_OPERATOR_CALL_INSTANTIATIONS OR GENERATE_PYTHON_BINDINGS) endif() execute_process( - COMMAND ${Python_EXECUTABLE} ${PROJECT_SOURCE_DIR}/scripts/generate_wrappers.py ${GENERATOR_ARGS} + COMMAND ${CMAKE_COMMAND} -E env + INFINIRT_SOURCE_DIR=${INFINIRT_SOURCE_DIR} + ${Python_EXECUTABLE} + ${PROJECT_SOURCE_DIR}/scripts/generate_wrappers.py + ${GENERATOR_ARGS} WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} RESULT_VARIABLE script_result ) @@ -924,7 +932,7 @@ if(GENERATE_PYTHON_BINDINGS) set_target_properties(infiniops PROPERTIES INSTALL_RPATH "${_INFINI_OPS_INSTALL_RPATH}") set_target_properties(ops PROPERTIES INSTALL_RPATH "${_INFINI_OPS_INSTALL_RPATH}") - install(TARGETS infiniops ops DESTINATION .) + install(TARGETS infinirt infiniops ops DESTINATION .) file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/__init__.py" "") install(FILES "${CMAKE_CURRENT_BINARY_DIR}/__init__.py" DESTINATION .) diff --git a/src/data_type.h b/src/data_type.h index 75483d2b8..af991bb96 100644 --- a/src/data_type.h +++ b/src/data_type.h @@ -1,192 +1,33 @@ #ifndef INFINI_OPS_DATA_TYPE_H_ #define INFINI_OPS_DATA_TYPE_H_ -#include -#include -#include - -#include "common/constexpr_map.h" #include "common/traits.h" #include "device.h" +#include "infini_rt/data_type.h" namespace infini::ops { -enum class DataType : std::int8_t { - kInt8, - kInt16, - kInt32, - kInt64, - kUInt8, - kUInt16, - kUInt32, - kUInt64, - kFloat16, - kBFloat16, - kFloat32, - kFloat64 -}; - -constexpr ConstexprMap kDataTypeToSize{{{ - {DataType::kInt8, 1}, - {DataType::kInt16, 2}, - {DataType::kInt32, 4}, - {DataType::kInt64, 8}, - {DataType::kUInt8, 1}, - {DataType::kUInt16, 2}, - {DataType::kUInt32, 4}, - {DataType::kUInt64, 8}, - {DataType::kFloat16, 2}, - {DataType::kBFloat16, 2}, - {DataType::kFloat32, 4}, - {DataType::kFloat64, 8}, -}}}; - -constexpr ConstexprMap kDataTypeToDesc{{{ - {DataType::kInt8, "int8"}, - {DataType::kInt16, "int16"}, - {DataType::kInt32, "int32"}, - {DataType::kInt64, "int64"}, - {DataType::kUInt8, "uint8"}, - {DataType::kUInt16, "uint16"}, - {DataType::kUInt32, "uint32"}, - {DataType::kUInt64, "uint64"}, - {DataType::kFloat16, "float16"}, - {DataType::kBFloat16, "bfloat16"}, - {DataType::kFloat32, "float32"}, - {DataType::kFloat64, "float64"}, -}}}; - -constexpr ConstexprMap kStringToDataType{{{ - {"int8", DataType::kInt8}, - {"int16", DataType::kInt16}, - {"int32", DataType::kInt32}, - {"int64", DataType::kInt64}, - {"uint8", DataType::kUInt8}, - {"uint16", DataType::kUInt16}, - {"uint32", DataType::kUInt32}, - {"uint64", DataType::kUInt64}, - {"float16", DataType::kFloat16}, - {"bfloat16", DataType::kBFloat16}, - {"float32", DataType::kFloat32}, - {"float64", DataType::kFloat64}, -}}}; - -struct Float16 { - std::uint16_t bits; - - static inline Float16 FromFloat(float val) { - std::uint32_t f32; - std::memcpy(&f32, &val, sizeof(f32)); - std::uint16_t sign = (f32 >> 16) & 0x8000; - std::int32_t exponent = ((f32 >> 23) & 0xFF) - 127; - std::uint32_t mantissa = f32 & 0x7FFFFF; - - if (exponent >= 16) { - // NaN - if (exponent == 128 && mantissa != 0) { - return {static_cast(sign | 0x7E00)}; - } - // Inf - return {static_cast(sign | 0x7C00)}; - } else if (exponent >= -14) { - return {static_cast(sign | ((exponent + 15) << 10) | - (mantissa >> 13))}; - } else if (exponent >= -24) { - mantissa |= 0x800000; - mantissa >>= (-14 - exponent); - return {static_cast(sign | (mantissa >> 13))}; - } - // Too small for subnormal: return signed zero. - return {sign}; - } +using infini::rt::DataType; - inline float ToFloat() const { - std::uint32_t sign = (bits & 0x8000) << 16; - std::int32_t exponent = (bits >> 10) & 0x1F; - std::uint32_t mantissa = bits & 0x3FF; - std::uint32_t f32_bits; +using infini::rt::Float16; +using infini::rt::BFloat16; - if (exponent == 31) { - f32_bits = sign | 0x7F800000 | (mantissa << 13); - } else if (exponent == 0) { - if (mantissa == 0) { - f32_bits = sign; - } else { - exponent = -14; - while ((mantissa & 0x400) == 0) { - mantissa <<= 1; - exponent--; - } - mantissa &= 0x3FF; - f32_bits = sign | ((exponent + 127) << 23) | (mantissa << 13); - } - } else { - f32_bits = sign | ((exponent + 127 - 15) << 23) | (mantissa << 13); - } - - float result; - std::memcpy(&result, &f32_bits, sizeof(result)); - return result; - } -}; - -struct BFloat16 { - std::uint16_t bits; - - static inline BFloat16 FromFloat(float val) { - std::uint32_t bits32; - std::memcpy(&bits32, &val, sizeof(bits32)); - - const std::uint32_t rounding_bias = 0x00007FFF + ((bits32 >> 16) & 1); - std::uint16_t bf16_bits = - static_cast((bits32 + rounding_bias) >> 16); - return {bf16_bits}; - } - - inline float ToFloat() const { - std::uint32_t bits32 = static_cast(bits) << 16; - float result; - std::memcpy(&result, &bits32, sizeof(result)); - return result; - } -}; +using infini::rt::kDataTypeToSize; +using infini::rt::kDataTypeToDesc; +using infini::rt::kStringToDataType; template -struct TypeMap; +using TypeMap = infini::rt::TypeMap; template -using TypeMapType = typename TypeMap::type; - -#define DEFINE_DATA_TYPE_MAPPING(ENUM_VALUE, CPP_TYPE) \ - template \ - struct TypeMap { \ - using type = CPP_TYPE; \ - }; - -DEFINE_DATA_TYPE_MAPPING(kUInt8, std::uint8_t) -DEFINE_DATA_TYPE_MAPPING(kInt8, std::int8_t) -DEFINE_DATA_TYPE_MAPPING(kUInt16, std::uint16_t) -DEFINE_DATA_TYPE_MAPPING(kInt16, std::int16_t) -DEFINE_DATA_TYPE_MAPPING(kUInt32, std::uint32_t) -DEFINE_DATA_TYPE_MAPPING(kInt32, std::int32_t) -DEFINE_DATA_TYPE_MAPPING(kUInt64, std::uint64_t) -DEFINE_DATA_TYPE_MAPPING(kInt64, std::int64_t) -DEFINE_DATA_TYPE_MAPPING(kFloat32, float) -DEFINE_DATA_TYPE_MAPPING(kFloat64, double) -#undef DEFINE_DATA_TYPE_MAPPING +using TypeMapType = infini::rt::TypeMapType; -// Checks whether a C++ type is the bfloat16 or float16 type for the given -// device. Full specializations for each device's float16/bfloat16 types are -// provided in the corresponding platform-specific device type headers. template -inline constexpr bool IsBFloat16 = - std::is_same_v>; +inline constexpr bool IsBFloat16 = infini::rt::IsBFloat16; template -inline constexpr bool IsFP16 = - std::is_same_v>; +inline constexpr bool IsFP16 = infini::rt::IsFP16; -// Defines the common categories of data types using List. using FloatTypes = List; using ReducedFloatTypes = List; using IntTypes = diff --git a/src/device.h b/src/device.h index 688cd0dc2..8c18efc5c 100644 --- a/src/device.h +++ b/src/device.h @@ -1,110 +1,22 @@ #ifndef INFINI_OPS_DEVICE_H_ #define INFINI_OPS_DEVICE_H_ -#include -#include - -#include "common/constexpr_map.h" #include "common/traits.h" -#include "hash.h" +#include "infini_rt/device.h" namespace infini::ops { -class Device { - public: - enum class Type { - kCpu = 0, - kNvidia = 1, - kCambricon = 2, - kAscend = 3, - kMetax = 4, - kMoore = 5, - kIluvatar = 6, - kKunlun = 7, - kHygon = 8, - kQy = 9, - kCount - }; - - Device() = default; - - Device(const Type& type, const int& index = 0) : type_{type}, index_{index} {} - - static const Type TypeFromString(const std::string& name) { - return kDescToDevice.at(name); - } - - static const std::string_view StringFromType(const Type& type) { - return kDeviceToDesc.at(type); - } - - const Type& type() const { return type_; } - - const int& index() const { return index_; } - - std::string ToString() const { - return std::string{StringFromType(type_)} + ":" + std::to_string(index_); - } - - bool operator==(const Device& other) const { - return type_ == other.type_ && index_ == other.index_; - } - - bool operator!=(const Device& other) const { return !(*this == other); } +using Device = infini::rt::Device; - private: - Type type_{Type::kCpu}; +template +using DeviceEnabled = infini::rt::DeviceEnabled; - static constexpr ConstexprMap(Device::Type::kCount)> - kDeviceToDesc{{{ - {Type::kCpu, "cpu"}, - {Type::kNvidia, "nvidia"}, - {Type::kCambricon, "cambricon"}, - {Type::kAscend, "ascend"}, - {Type::kMetax, "metax"}, - {Type::kMoore, "moore"}, - {Type::kIluvatar, "iluvatar"}, - {Type::kKunlun, "kunlun"}, - {Type::kHygon, "hygon"}, - {Type::kQy, "qy"}, - }}}; - - static constexpr ConstexprMap(Device::Type::kCount)> - kDescToDevice{{{ - {"cpu", Type::kCpu}, - {"nvidia", Type::kNvidia}, - {"cambricon", Type::kCambricon}, - {"ascend", Type::kAscend}, - {"metax", Type::kMetax}, - {"moore", Type::kMoore}, - {"iluvatar", Type::kIluvatar}, - {"kunlun", Type::kKunlun}, - {"hygon", Type::kHygon}, - {"qy", Type::kQy}, - }}}; - - int index_{0}; -}; - -// Primary template: Devices are disabled by default. Platform-specific -// headers (e.g. `cpu/device_.h`) specialize this to `std::true_type`. -template -struct DeviceEnabled : std::false_type {}; - -// Defines the common categories of devices using List. using AllDeviceTypes = List; -// Deferred computation of active devices. The `Filter` and `FilterList` -// evaluation are nested inside a class template so that `DeviceEnabled` -// specializations from platform `device_.h` headers are visible at -// instantiation time. Use with a dependent type parameter -// (e.g. `ActiveDevices`) to ensure deferred instantiation. template struct ActiveDevicesImpl { struct Filter { @@ -121,17 +33,4 @@ using ActiveDevices = typename ActiveDevicesImpl::type; } // namespace infini::ops -template <> -struct std::hash { - std::size_t operator()(const infini::ops::Device& device) const { - std::size_t seed{0}; - - HashCombine(seed, device.type()); - - HashCombine(seed, device.index()); - - return seed; - } -}; - #endif diff --git a/src/native/cpu/caster_.h b/src/native/cpu/caster_.h index 7da1bf365..43f23e3b3 100644 --- a/src/native/cpu/caster_.h +++ b/src/native/cpu/caster_.h @@ -4,7 +4,7 @@ #include #include "caster.h" -#include "native/cpu/data_type_.h" +#include "infini_rt/cpu/data_type_.h" namespace infini::ops { diff --git a/src/native/cpu/data_type_.h b/src/native/cpu/data_type_.h deleted file mode 100644 index 36231db51..000000000 --- a/src/native/cpu/data_type_.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef INFINI_OPS_CPU_DATA_TYPE__H_ -#define INFINI_OPS_CPU_DATA_TYPE__H_ - -#include "data_type.h" -#include "native/cpu/device_.h" - -namespace infini::ops { - -template <> -struct TypeMap { - using type = Float16; -}; - -template <> -struct TypeMap { - using type = BFloat16; -}; - -} // namespace infini::ops - -#endif diff --git a/src/native/cpu/device_.h b/src/native/cpu/device_.h deleted file mode 100644 index e5e7d85a3..000000000 --- a/src/native/cpu/device_.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef INFINI_OPS_CPU_DEVICE__H_ -#define INFINI_OPS_CPU_DEVICE__H_ - -#include "device.h" - -namespace infini::ops { - -template <> -struct DeviceEnabled : std::true_type {}; - -} // namespace infini::ops - -#endif diff --git a/src/native/cpu/runtime_.h b/src/native/cpu/runtime_.h deleted file mode 100644 index cb6176ba1..000000000 --- a/src/native/cpu/runtime_.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef INFINI_OPS_CPU_RUNTIME_H_ -#define INFINI_OPS_CPU_RUNTIME_H_ - -#include -#include - -#include "runtime.h" - -namespace infini::ops { - -template <> -struct Runtime : RuntimeBase> { - static constexpr Device::Type kDeviceType = Device::Type::kCpu; - - static void Malloc(void** ptr, std::size_t size) { *ptr = std::malloc(size); } - - static void Free(void* ptr) { std::free(ptr); } - - static void Memcpy(void* dst, const void* src, std::size_t size, int) { - std::memcpy(dst, src, size); - } - - static constexpr auto Memset = std::memset; - - static constexpr int MemcpyHostToDevice = 0; - - static constexpr int MemcpyDeviceToHost = 1; -}; - -static_assert(Runtime::Validate()); - -} // namespace infini::ops - -#endif diff --git a/src/native/cuda/iluvatar/data_type_.h b/src/native/cuda/iluvatar/data_type_.h index ac997f992..7a641f39b 100644 --- a/src/native/cuda/iluvatar/data_type_.h +++ b/src/native/cuda/iluvatar/data_type_.h @@ -1,29 +1,12 @@ #ifndef INFINI_OPS_ILUVATAR_DATA_TYPE__H_ #define INFINI_OPS_ILUVATAR_DATA_TYPE__H_ -// clang-format off -#include -#include -// clang-format on - -#include "data_type.h" -#include "native/cuda/iluvatar/device_.h" +#include "infini_rt/iluvatar/data_type_.h" namespace infini::ops { -using cuda_bfloat16 = nv_bfloat16; - -using cuda_bfloat162 = nv_bfloat162; - -template <> -struct TypeMap { - using type = half; -}; - -template <> -struct TypeMap { - using type = __nv_bfloat16; -}; +using infini::rt::cuda_bfloat16; +using infini::rt::cuda_bfloat162; } // namespace infini::ops diff --git a/src/native/cuda/iluvatar/device_.h b/src/native/cuda/iluvatar/device_.h index 4773201da..f4076ecf1 100644 --- a/src/native/cuda/iluvatar/device_.h +++ b/src/native/cuda/iluvatar/device_.h @@ -1,13 +1,6 @@ #ifndef INFINI_OPS_ILUVATAR_DEVICE__H_ #define INFINI_OPS_ILUVATAR_DEVICE__H_ -#include "device.h" - -namespace infini::ops { - -template <> -struct DeviceEnabled : std::true_type {}; - -} // namespace infini::ops +#include "infini_rt/iluvatar/device_.h" #endif diff --git a/src/native/cuda/iluvatar/runtime_.h b/src/native/cuda/iluvatar/runtime_.h index 1e41b68e9..c85fb3d90 100644 --- a/src/native/cuda/iluvatar/runtime_.h +++ b/src/native/cuda/iluvatar/runtime_.h @@ -1,42 +1,8 @@ #ifndef INFINI_OPS_ILUVATAR_RUNTIME_H_ #define INFINI_OPS_ILUVATAR_RUNTIME_H_ -#include - -// clang-format off -#include -// clang-format on - -#include "native/cuda/iluvatar/device_.h" +#include "infini_rt/iluvatar/runtime_.h" #include "native/cuda/iluvatar/runtime_utils.h" -#include "native/cuda/runtime_.h" - -namespace infini::ops { - -template <> -struct Runtime - : CudaRuntime> { - using Stream = cudaStream_t; - - static constexpr Device::Type kDeviceType = Device::Type::kIluvatar; - - static constexpr auto Malloc = [](auto&&... args) { - return cudaMalloc(std::forward(args)...); - }; - - static constexpr auto Memcpy = cudaMemcpy; - - static constexpr auto Free = cudaFree; - - static constexpr auto MemcpyHostToDevice = cudaMemcpyHostToDevice; - - static constexpr auto MemcpyDeviceToHost = cudaMemcpyDeviceToHost; - - static constexpr auto Memset = cudaMemset; -}; - -static_assert(Runtime::Validate()); - -} // namespace infini::ops +#include "runtime.h" #endif diff --git a/src/native/cuda/metax/data_type_.h b/src/native/cuda/metax/data_type_.h index 73e498774..ac0214cc2 100644 --- a/src/native/cuda/metax/data_type_.h +++ b/src/native/cuda/metax/data_type_.h @@ -1,28 +1,12 @@ #ifndef INFINI_OPS_METAX_DATA_TYPE__H_ #define INFINI_OPS_METAX_DATA_TYPE__H_ -#include -#include -#include - -#include "data_type.h" -#include "native/cuda/metax/device_.h" +#include "infini_rt/metax/data_type_.h" namespace infini::ops { -using cuda_bfloat16 = maca_bfloat16; - -using cuda_bfloat162 = maca_bfloat162; - -template <> -struct TypeMap { - using type = __half; -}; - -template <> -struct TypeMap { - using type = __maca_bfloat16; -}; +using infini::rt::cuda_bfloat16; +using infini::rt::cuda_bfloat162; } // namespace infini::ops diff --git a/src/native/cuda/metax/device_.h b/src/native/cuda/metax/device_.h index 6e7c677d7..230d2ee84 100644 --- a/src/native/cuda/metax/device_.h +++ b/src/native/cuda/metax/device_.h @@ -1,13 +1,6 @@ #ifndef INFINI_OPS_METAX_DEVICE__H_ #define INFINI_OPS_METAX_DEVICE__H_ -#include "device.h" - -namespace infini::ops { - -template <> -struct DeviceEnabled : std::true_type {}; - -} // namespace infini::ops +#include "infini_rt/metax/device_.h" #endif diff --git a/src/native/cuda/metax/runtime_.h b/src/native/cuda/metax/runtime_.h index 6563b6b41..a23524039 100644 --- a/src/native/cuda/metax/runtime_.h +++ b/src/native/cuda/metax/runtime_.h @@ -1,36 +1,8 @@ #ifndef INFINI_OPS_METAX_RUNTIME_H_ #define INFINI_OPS_METAX_RUNTIME_H_ -#include - -#include "native/cuda/metax/device_.h" +#include "infini_rt/metax/runtime_.h" #include "native/cuda/metax/runtime_utils.h" -#include "native/cuda/runtime_.h" - -namespace infini::ops { - -template <> -struct Runtime - : CudaRuntime> { - using Stream = mcStream_t; - - static constexpr Device::Type kDeviceType = Device::Type::kMetax; - - static constexpr auto Malloc = mcMalloc; - - static constexpr auto Memcpy = mcMemcpy; - - static constexpr auto Free = mcFree; - - static constexpr auto MemcpyHostToDevice = mcMemcpyHostToDevice; - - static constexpr auto MemcpyDeviceToHost = mcMemcpyDeviceToHost; - - static constexpr auto Memset = mcMemset; -}; - -static_assert(Runtime::Validate()); - -} // namespace infini::ops +#include "runtime.h" #endif diff --git a/src/native/cuda/moore/data_type_.h b/src/native/cuda/moore/data_type_.h index c97cb33dd..d708be7c2 100644 --- a/src/native/cuda/moore/data_type_.h +++ b/src/native/cuda/moore/data_type_.h @@ -1,27 +1,12 @@ #ifndef INFINI_OPS_MOORE_DATA_TYPE__H_ #define INFINI_OPS_MOORE_DATA_TYPE__H_ -#include -#include - -#include "data_type.h" -#include "native/cuda/moore/device_.h" +#include "infini_rt/moore/data_type_.h" namespace infini::ops { -using cuda_bfloat16 = __mt_bfloat16; - -using cuda_bfloat162 = __mt_bfloat162; - -template <> -struct TypeMap { - using type = half; -}; - -template <> -struct TypeMap { - using type = __mt_bfloat16; -}; +using infini::rt::cuda_bfloat16; +using infini::rt::cuda_bfloat162; } // namespace infini::ops diff --git a/src/native/cuda/moore/device_.h b/src/native/cuda/moore/device_.h index 472b9acf0..e7ea0b11b 100644 --- a/src/native/cuda/moore/device_.h +++ b/src/native/cuda/moore/device_.h @@ -1,13 +1,6 @@ #ifndef INFINI_OPS_MOORE_DEVICE__H_ #define INFINI_OPS_MOORE_DEVICE__H_ -#include "device.h" - -namespace infini::ops { - -template <> -struct DeviceEnabled : std::true_type {}; - -} // namespace infini::ops +#include "infini_rt/moore/device_.h" #endif diff --git a/src/native/cuda/moore/runtime_.h b/src/native/cuda/moore/runtime_.h index bc519b41e..6d8dda735 100644 --- a/src/native/cuda/moore/runtime_.h +++ b/src/native/cuda/moore/runtime_.h @@ -1,44 +1,8 @@ #ifndef INFINI_OPS_MOORE_RUNTIME_H_ #define INFINI_OPS_MOORE_RUNTIME_H_ -#include - -#include - -#include "native/cuda/moore/device_.h" +#include "infini_rt/moore/runtime_.h" #include "native/cuda/moore/runtime_utils.h" -#include "native/cuda/runtime_.h" - -namespace infini::ops { - -template <> -struct Runtime - : CudaRuntime> { - using Stream = musaStream_t; - - static constexpr Device::Type kDeviceType = Device::Type::kMoore; - - static constexpr auto Malloc = [](auto&&... args) { - return musaMalloc(std::forward(args)...); - }; - - static constexpr auto Memcpy = [](auto&&... args) { - return musaMemcpy(std::forward(args)...); - }; - - static constexpr auto Free = [](auto&&... args) { - return musaFree(std::forward(args)...); - }; - - static constexpr auto MemcpyHostToDevice = musaMemcpyHostToDevice; - - static constexpr auto MemcpyDeviceToHost = musaMemcpyDeviceToHost; - - static constexpr auto Memset = musaMemset; -}; - -static_assert(Runtime::Validate()); - -} // namespace infini::ops +#include "runtime.h" #endif diff --git a/src/native/cuda/nvidia/data_type_.h b/src/native/cuda/nvidia/data_type_.h index 1266c0c8a..eee5eeb12 100644 --- a/src/native/cuda/nvidia/data_type_.h +++ b/src/native/cuda/nvidia/data_type_.h @@ -1,29 +1,12 @@ #ifndef INFINI_OPS_NVIDIA_DATA_TYPE__H_ #define INFINI_OPS_NVIDIA_DATA_TYPE__H_ -// clang-format off -#include -#include -// clang-format on - -#include "data_type.h" -#include "native/cuda/nvidia/device_.h" +#include "infini_rt/nvidia/data_type_.h" namespace infini::ops { -using cuda_bfloat16 = nv_bfloat16; - -using cuda_bfloat162 = nv_bfloat162; - -template <> -struct TypeMap { - using type = half; -}; - -template <> -struct TypeMap { - using type = __nv_bfloat16; -}; +using infini::rt::cuda_bfloat16; +using infini::rt::cuda_bfloat162; } // namespace infini::ops diff --git a/src/native/cuda/nvidia/device_.h b/src/native/cuda/nvidia/device_.h index 52cbb2b84..63524ec85 100644 --- a/src/native/cuda/nvidia/device_.h +++ b/src/native/cuda/nvidia/device_.h @@ -1,13 +1,6 @@ #ifndef INFINI_OPS_NVIDIA_DEVICE__H_ #define INFINI_OPS_NVIDIA_DEVICE__H_ -#include "device.h" - -namespace infini::ops { - -template <> -struct DeviceEnabled : std::true_type {}; - -} // namespace infini::ops +#include "infini_rt/nvidia/device_.h" #endif diff --git a/src/native/cuda/nvidia/runtime_.h b/src/native/cuda/nvidia/runtime_.h index 326ecdb24..9d8c018cb 100644 --- a/src/native/cuda/nvidia/runtime_.h +++ b/src/native/cuda/nvidia/runtime_.h @@ -1,42 +1,8 @@ #ifndef INFINI_OPS_NVIDIA_RUNTIME_H_ #define INFINI_OPS_NVIDIA_RUNTIME_H_ -#include - -// clang-format off -#include -// clang-format on - -#include "native/cuda/nvidia/device_.h" +#include "infini_rt/nvidia/runtime_.h" #include "native/cuda/nvidia/runtime_utils.h" -#include "native/cuda/runtime_.h" - -namespace infini::ops { - -template <> -struct Runtime - : CudaRuntime> { - using Stream = cudaStream_t; - - static constexpr Device::Type kDeviceType = Device::Type::kNvidia; - - static constexpr auto Malloc = [](auto&&... args) { - return cudaMalloc(std::forward(args)...); - }; - - static constexpr auto Memcpy = cudaMemcpy; - - static constexpr auto Free = cudaFree; - - static constexpr auto MemcpyHostToDevice = cudaMemcpyHostToDevice; - - static constexpr auto MemcpyDeviceToHost = cudaMemcpyDeviceToHost; - - static constexpr auto Memset = cudaMemset; -}; - -static_assert(Runtime::Validate()); - -} // namespace infini::ops +#include "runtime.h" #endif diff --git a/src/runtime.h b/src/runtime.h index 38257893c..a3e9624b6 100644 --- a/src/runtime.h +++ b/src/runtime.h @@ -1,54 +1,15 @@ #ifndef INFINI_OPS_RUNTIME_H_ #define INFINI_OPS_RUNTIME_H_ -#include - -#include "device.h" +#include "infini_rt/runtime.h" namespace infini::ops { template -struct Runtime; - -/// ## Interface enforcement via CRTP. -/// -/// Inherit from the appropriate base to declare which interface level a -/// `Runtime` specialization implements. After the struct is fully defined, call -/// `static_assert(Runtime<...>::Validate())`. The chained `Validate()` checks -/// every required member's existence and signature at compile time, analogous -/// to how `override` catches signature mismatches for virtual functions. -/// -/// - `RuntimeBase`: `kDeviceType` only (e.g. CPU). -/// - `DeviceRuntime`: adds `Stream`, `Malloc`, and `Free` (e.g. Cambricon). - -/// Every Runtime must provide `static constexpr Device::Type kDeviceType`. -template -struct RuntimeBase { - static constexpr bool Validate() { - static_assert( - std::is_same_v, - Device::Type>, - "`Runtime` must define `static constexpr Device::Type kDeviceType`."); - return true; - } -}; +using Runtime = infini::rt::Runtime; -/// Runtimes with device memory must additionally provide `Stream`, `Malloc`, -/// and `Free`. template -struct DeviceRuntime : RuntimeBase { - static constexpr bool Validate() { - RuntimeBase::Validate(); - static_assert(sizeof(typename Derived::Stream) > 0, - "`Runtime` must define a `Stream` type alias."); - static_assert( - std::is_invocable_v, - "`Runtime::Malloc` must be callable with `(void**, size_t)`."); - static_assert(std::is_invocable_v, - "`Runtime::Free` must be callable with `(void*)`."); - return true; - } -}; +using DeviceRuntime = infini::rt::DeviceRuntime; } // namespace infini::ops diff --git a/src/tensor.h b/src/tensor.h index 290e3cf96..576e0ba2d 100644 --- a/src/tensor.h +++ b/src/tensor.h @@ -1,157 +1,12 @@ #ifndef INFINI_OPS_TENSOR_H_ #define INFINI_OPS_TENSOR_H_ -#include -#include -#include - -#include "data_type.h" -#include "device.h" -#include "hash.h" +#include "infini_rt/tensor_view.h" namespace infini::ops { -class Tensor { - public: - using Size = std::size_t; - - using Stride = std::ptrdiff_t; - - using Index = Stride; - - using Shape = std::vector; - - using Strides = std::vector; - - template - Tensor(void* data, const Shape& shape) - : data_{data}, - shape_{shape}, - dtype_{DefaultDataType()}, - device_{DefaultDevice()}, - strides_{DefaultStrides(shape)} {} - - template - Tensor(void* data, const Shape& shape, const DataType& dtype) - : data_{data}, - shape_{shape}, - dtype_{dtype}, - device_{DefaultDevice()}, - strides_{DefaultStrides(shape)} {} - - template - Tensor(void* data, const Shape& shape, const Device& device) - : data_{data}, - shape_{shape}, - dtype_{DefaultDataType()}, - device_{device}, - strides_{DefaultStrides(shape)} {} - - template - Tensor(void* data, const Shape& shape, const DataType& dtype, - const Device& device) - : data_{data}, - shape_{shape}, - dtype_{dtype}, - device_{device}, - strides_{DefaultStrides(shape)} {} - - template - Tensor(void* data, const Shape& shape, const DataType& dtype, - const Device& device, const Strides& strides) - : data_{data}, - shape_{shape}, - dtype_{dtype}, - device_{device}, - strides_{strides} {} - - Tensor(void* data, std::initializer_list shape, const DataType& dtype, - const Device& device, std::initializer_list strides); - - Tensor operator[](const Index& index) const; - - void*& data(); - - const void* data() const; - - const DataType& dtype() const; - - const Device& device() const; - - const Shape& shape() const; - - const Strides& strides() const; - - Size size(const Index& index) const; - - Stride stride(const Index& index) const; - - Size ndim() const; - - Size element_size() const; - - Size numel() const; - - Tensor T() const; - - std::string ToString() const; - - bool HasBroadcastDim() const; - - bool IsContiguous() const; - - private: - static const DataType DefaultDataType(); - - static Device DefaultDevice(); - - static Strides DefaultStrides(const Shape& shape); - - std::string ToStringHelper() const; - - bool IsMergeable(Size dim_start, Size dim_end) const; - - void* data_{nullptr}; - - Shape shape_; - - const DataType dtype_; - - Device device_; - - Strides strides_; -}; +using Tensor = infini::rt::TensorView; } // namespace infini::ops -template <> -struct std::hash { - std::size_t operator()(const infini::ops::Tensor& tensor) const { - std::size_t seed{0}; - - for (const auto& size : tensor.shape()) { - HashCombine(seed, size); - } - - HashCombine(seed, tensor.dtype()); - - HashCombine(seed, tensor.device()); - - for (const auto& stride : tensor.strides()) { - HashCombine(seed, stride); - } - - return seed; - } -}; - -template <> -struct std::equal_to { - bool operator()(const infini::ops::Tensor& a, - const infini::ops::Tensor& b) const { - return a.dtype() == b.dtype() && a.device() == b.device() && - a.shape() == b.shape() && a.strides() == b.strides(); - } -}; - #endif From 3ca204438f92174d52b213a9c10a6707d5110ce4 Mon Sep 17 00:00:00 2001 From: Jiacheng Huang Date: Wed, 17 Jun 2026 17:25:09 +0800 Subject: [PATCH 02/12] fix: consume InfiniRT public headers --- CMakeLists.txt | 12 ++++----- examples/runtime_api.h | 3 ++- scripts/generate_wrappers.py | 37 ++++++++++++++------------- src/CMakeLists.txt | 2 +- src/data_type.h | 2 +- src/device.h | 2 +- src/native/cpu/caster_.h | 2 +- src/native/cuda/iluvatar/data_type_.h | 2 +- src/native/cuda/iluvatar/device_.h | 2 +- src/native/cuda/iluvatar/runtime_.h | 2 +- src/native/cuda/metax/data_type_.h | 2 +- src/native/cuda/metax/device_.h | 2 +- src/native/cuda/metax/runtime_.h | 2 +- src/native/cuda/moore/data_type_.h | 2 +- src/native/cuda/moore/device_.h | 2 +- src/native/cuda/moore/runtime_.h | 2 +- src/native/cuda/nvidia/data_type_.h | 2 +- src/native/cuda/nvidia/device_.h | 2 +- src/native/cuda/nvidia/runtime_.h | 2 +- src/runtime.h | 2 +- src/tensor.h | 2 +- 21 files changed, 44 insertions(+), 44 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cab00b619..ab8d8bb5d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -294,8 +294,6 @@ if(WITH_TORCH) message(STATUS "Found PyTorch: ${TORCH_INCLUDE_DIRS}") endif() -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) - # Only one CUDA-like GPU backend can be enabled at a time. set(_gpu_backend_count 0) foreach(_gpu_backend WITH_NVIDIA WITH_ILUVATAR WITH_HYGON WITH_METAX WITH_MOORE WITH_ASCEND) @@ -502,14 +500,14 @@ if(WITH_HYGON AND NOT EXISTS "${DTK_ROOT}/llvm/lib/LLVMgold.so") set(PYBIND11_ENABLE_EXTRAS OFF) endif() -set(INFINIRT_SOURCE_DIR "${PROJECT_SOURCE_DIR}/../InfiniRT" CACHE PATH "InfiniRT source directory") -if(NOT EXISTS "${INFINIRT_SOURCE_DIR}/CMakeLists.txt") +set(INFINI_RT_SOURCE_DIR "${PROJECT_SOURCE_DIR}/../InfiniRT" CACHE PATH "InfiniRT source directory") +if(NOT EXISTS "${INFINI_RT_SOURCE_DIR}/CMakeLists.txt") message(FATAL_ERROR - "InfiniRT not found at `${INFINIRT_SOURCE_DIR}`. " - "Set `INFINIRT_SOURCE_DIR` to the InfiniRT source directory.") + "InfiniRT not found at `${INFINI_RT_SOURCE_DIR}`. " + "Set `INFINI_RT_SOURCE_DIR` to the InfiniRT source directory.") endif() if(NOT TARGET infinirt) - add_subdirectory("${INFINIRT_SOURCE_DIR}" "${CMAKE_BINARY_DIR}/InfiniRT") + add_subdirectory("${INFINI_RT_SOURCE_DIR}" "${CMAKE_BINARY_DIR}/InfiniRT") endif() add_subdirectory(src) diff --git a/examples/runtime_api.h b/examples/runtime_api.h index 7df6a6eb3..292b40408 100644 --- a/examples/runtime_api.h +++ b/examples/runtime_api.h @@ -2,6 +2,7 @@ #define INFINI_OPS_EXAMPLES_RUNTIME_API_H_ #include "device.h" +#include "runtime.h" #ifdef WITH_NVIDIA #include "native/cuda/nvidia/ops/gemm/cublas.h" @@ -23,7 +24,7 @@ #include "native/ascend/ops/gemm/kernel.h" #include "native/ascend/runtime_.h" #elif WITH_CPU -#include "infini_rt/cpu/runtime_.h" +#include #include "native/cpu/ops/gemm/gemm.h" #else #error "One `WITH_*` backend must be enabled for the examples." diff --git a/scripts/generate_wrappers.py b/scripts/generate_wrappers.py index 4cffa6b08..84075bd49 100644 --- a/scripts/generate_wrappers.py +++ b/scripts/generate_wrappers.py @@ -185,14 +185,17 @@ def __call__(self, op_name): index = clang.cindex.Index.create() args = ( - "-std=c++17", - "-x", - "c++", - "-I", - "src", - "-I", - str(_GENERATION_DIR), - ) + _get_infinirt_include_flags() + _get_system_include_flags() + ( + "-std=c++17", + "-x", + "c++", + "-I", + "src", + "-I", + str(_GENERATION_DIR), + ) + + _get_system_include_flags() + ) translation_unit = index.parse(str(_find_base_header(op_name)), args=args) nodes = tuple(type(self)._find(translation_unit.cursor, op_name)) @@ -411,9 +414,7 @@ def _find_tensor_params(op_name): source = _find_base_header(op_name).read_text() params = set() - params.update( - re.findall(r"(?:^|[,(]\s*)(?:const\s+)?Tensor\s+(\w+)", source) - ) + params.update(re.findall(r"(?:^|[,(]\s*)(?:const\s+)?Tensor\s+(\w+)", source)) params.update(_find_optional_tensor_params(op_name)) params.update(_find_vector_tensor_params(op_name)) @@ -1233,13 +1234,13 @@ def _generate_operator_call_instantiation_source(devices, impl_paths, definition def _device_marker_headers(devices): paths = { - "cpu": "infini_rt/cpu/device_.h", - "nvidia": "native/cuda/nvidia/device_.h", - "cambricon": "native/cambricon/device_.h", - "ascend": "native/ascend/device_.h", - "metax": "native/cuda/metax/device_.h", - "moore": "native/cuda/moore/device_.h", - "iluvatar": "native/cuda/iluvatar/device_.h", + "cpu": "infini/rt/cpu/device_.h", + "nvidia": "infini/rt/nvidia/device_.h", + "cambricon": "infini/rt/cambricon/device_.h", + "ascend": "infini/rt/ascend/device_.h", + "metax": "infini/rt/metax/device_.h", + "moore": "infini/rt/moore/device_.h", + "iluvatar": "infini/rt/iluvatar/device_.h", } return [paths[device] for device in devices if device in paths] diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index eab0d4bdd..b43c40612 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -602,7 +602,7 @@ if(GENERATE_OPERATOR_CALL_INSTANTIATIONS OR GENERATE_PYTHON_BINDINGS) execute_process( COMMAND ${CMAKE_COMMAND} -E env - INFINIRT_SOURCE_DIR=${INFINIRT_SOURCE_DIR} + INFINI_RT_SOURCE_DIR=${INFINI_RT_SOURCE_DIR} ${Python_EXECUTABLE} ${PROJECT_SOURCE_DIR}/scripts/generate_wrappers.py ${GENERATOR_ARGS} diff --git a/src/data_type.h b/src/data_type.h index af991bb96..f5d53b2e1 100644 --- a/src/data_type.h +++ b/src/data_type.h @@ -3,7 +3,7 @@ #include "common/traits.h" #include "device.h" -#include "infini_rt/data_type.h" +#include namespace infini::ops { diff --git a/src/device.h b/src/device.h index 8c18efc5c..0f41f647c 100644 --- a/src/device.h +++ b/src/device.h @@ -2,7 +2,7 @@ #define INFINI_OPS_DEVICE_H_ #include "common/traits.h" -#include "infini_rt/device.h" +#include namespace infini::ops { diff --git a/src/native/cpu/caster_.h b/src/native/cpu/caster_.h index 43f23e3b3..10348c0e0 100644 --- a/src/native/cpu/caster_.h +++ b/src/native/cpu/caster_.h @@ -4,7 +4,7 @@ #include #include "caster.h" -#include "infini_rt/cpu/data_type_.h" +#include namespace infini::ops { diff --git a/src/native/cuda/iluvatar/data_type_.h b/src/native/cuda/iluvatar/data_type_.h index 7a641f39b..b9e3ed8a3 100644 --- a/src/native/cuda/iluvatar/data_type_.h +++ b/src/native/cuda/iluvatar/data_type_.h @@ -1,7 +1,7 @@ #ifndef INFINI_OPS_ILUVATAR_DATA_TYPE__H_ #define INFINI_OPS_ILUVATAR_DATA_TYPE__H_ -#include "infini_rt/iluvatar/data_type_.h" +#include namespace infini::ops { diff --git a/src/native/cuda/iluvatar/device_.h b/src/native/cuda/iluvatar/device_.h index f4076ecf1..96743518f 100644 --- a/src/native/cuda/iluvatar/device_.h +++ b/src/native/cuda/iluvatar/device_.h @@ -1,6 +1,6 @@ #ifndef INFINI_OPS_ILUVATAR_DEVICE__H_ #define INFINI_OPS_ILUVATAR_DEVICE__H_ -#include "infini_rt/iluvatar/device_.h" +#include #endif diff --git a/src/native/cuda/iluvatar/runtime_.h b/src/native/cuda/iluvatar/runtime_.h index c85fb3d90..28f2c67bc 100644 --- a/src/native/cuda/iluvatar/runtime_.h +++ b/src/native/cuda/iluvatar/runtime_.h @@ -1,7 +1,7 @@ #ifndef INFINI_OPS_ILUVATAR_RUNTIME_H_ #define INFINI_OPS_ILUVATAR_RUNTIME_H_ -#include "infini_rt/iluvatar/runtime_.h" +#include #include "native/cuda/iluvatar/runtime_utils.h" #include "runtime.h" diff --git a/src/native/cuda/metax/data_type_.h b/src/native/cuda/metax/data_type_.h index ac0214cc2..003dd48c0 100644 --- a/src/native/cuda/metax/data_type_.h +++ b/src/native/cuda/metax/data_type_.h @@ -1,7 +1,7 @@ #ifndef INFINI_OPS_METAX_DATA_TYPE__H_ #define INFINI_OPS_METAX_DATA_TYPE__H_ -#include "infini_rt/metax/data_type_.h" +#include namespace infini::ops { diff --git a/src/native/cuda/metax/device_.h b/src/native/cuda/metax/device_.h index 230d2ee84..7aa554ea2 100644 --- a/src/native/cuda/metax/device_.h +++ b/src/native/cuda/metax/device_.h @@ -1,6 +1,6 @@ #ifndef INFINI_OPS_METAX_DEVICE__H_ #define INFINI_OPS_METAX_DEVICE__H_ -#include "infini_rt/metax/device_.h" +#include #endif diff --git a/src/native/cuda/metax/runtime_.h b/src/native/cuda/metax/runtime_.h index a23524039..f952fb8b2 100644 --- a/src/native/cuda/metax/runtime_.h +++ b/src/native/cuda/metax/runtime_.h @@ -1,7 +1,7 @@ #ifndef INFINI_OPS_METAX_RUNTIME_H_ #define INFINI_OPS_METAX_RUNTIME_H_ -#include "infini_rt/metax/runtime_.h" +#include #include "native/cuda/metax/runtime_utils.h" #include "runtime.h" diff --git a/src/native/cuda/moore/data_type_.h b/src/native/cuda/moore/data_type_.h index d708be7c2..7bc17d047 100644 --- a/src/native/cuda/moore/data_type_.h +++ b/src/native/cuda/moore/data_type_.h @@ -1,7 +1,7 @@ #ifndef INFINI_OPS_MOORE_DATA_TYPE__H_ #define INFINI_OPS_MOORE_DATA_TYPE__H_ -#include "infini_rt/moore/data_type_.h" +#include namespace infini::ops { diff --git a/src/native/cuda/moore/device_.h b/src/native/cuda/moore/device_.h index e7ea0b11b..2b23db2dd 100644 --- a/src/native/cuda/moore/device_.h +++ b/src/native/cuda/moore/device_.h @@ -1,6 +1,6 @@ #ifndef INFINI_OPS_MOORE_DEVICE__H_ #define INFINI_OPS_MOORE_DEVICE__H_ -#include "infini_rt/moore/device_.h" +#include #endif diff --git a/src/native/cuda/moore/runtime_.h b/src/native/cuda/moore/runtime_.h index 6d8dda735..bc960d949 100644 --- a/src/native/cuda/moore/runtime_.h +++ b/src/native/cuda/moore/runtime_.h @@ -1,7 +1,7 @@ #ifndef INFINI_OPS_MOORE_RUNTIME_H_ #define INFINI_OPS_MOORE_RUNTIME_H_ -#include "infini_rt/moore/runtime_.h" +#include #include "native/cuda/moore/runtime_utils.h" #include "runtime.h" diff --git a/src/native/cuda/nvidia/data_type_.h b/src/native/cuda/nvidia/data_type_.h index eee5eeb12..7f1fdc9e8 100644 --- a/src/native/cuda/nvidia/data_type_.h +++ b/src/native/cuda/nvidia/data_type_.h @@ -1,7 +1,7 @@ #ifndef INFINI_OPS_NVIDIA_DATA_TYPE__H_ #define INFINI_OPS_NVIDIA_DATA_TYPE__H_ -#include "infini_rt/nvidia/data_type_.h" +#include namespace infini::ops { diff --git a/src/native/cuda/nvidia/device_.h b/src/native/cuda/nvidia/device_.h index 63524ec85..b46caf035 100644 --- a/src/native/cuda/nvidia/device_.h +++ b/src/native/cuda/nvidia/device_.h @@ -1,6 +1,6 @@ #ifndef INFINI_OPS_NVIDIA_DEVICE__H_ #define INFINI_OPS_NVIDIA_DEVICE__H_ -#include "infini_rt/nvidia/device_.h" +#include #endif diff --git a/src/native/cuda/nvidia/runtime_.h b/src/native/cuda/nvidia/runtime_.h index 9d8c018cb..5745b3ffd 100644 --- a/src/native/cuda/nvidia/runtime_.h +++ b/src/native/cuda/nvidia/runtime_.h @@ -1,7 +1,7 @@ #ifndef INFINI_OPS_NVIDIA_RUNTIME_H_ #define INFINI_OPS_NVIDIA_RUNTIME_H_ -#include "infini_rt/nvidia/runtime_.h" +#include #include "native/cuda/nvidia/runtime_utils.h" #include "runtime.h" diff --git a/src/runtime.h b/src/runtime.h index a3e9624b6..251be195d 100644 --- a/src/runtime.h +++ b/src/runtime.h @@ -1,7 +1,7 @@ #ifndef INFINI_OPS_RUNTIME_H_ #define INFINI_OPS_RUNTIME_H_ -#include "infini_rt/runtime.h" +#include namespace infini::ops { diff --git a/src/tensor.h b/src/tensor.h index 576e0ba2d..48dc21d54 100644 --- a/src/tensor.h +++ b/src/tensor.h @@ -1,7 +1,7 @@ #ifndef INFINI_OPS_TENSOR_H_ #define INFINI_OPS_TENSOR_H_ -#include "infini_rt/tensor_view.h" +#include namespace infini::ops { From 0f3c5c25b2f2d1c7f29ffc808aa2cd27570a2948 Mon Sep 17 00:00:00 2001 From: Jiacheng Huang Date: Wed, 17 Jun 2026 21:40:47 +0800 Subject: [PATCH 03/12] refactor: remove InfiniRT runtime shims --- examples/runtime_api.h | 16 ++++--- src/CMakeLists.txt | 10 ++++- src/native/ascend/data_type_.h | 2 +- src/native/ascend/device_.h | 13 ------ src/native/ascend/runtime_.h | 44 ------------------- src/native/cambricon/data_type_.h | 23 ---------- src/native/cambricon/device_.h | 13 ------ src/native/cambricon/ops/rms_norm/rms_norm.h | 2 +- src/native/cambricon/runtime_.h | 35 --------------- src/native/cuda/iluvatar/blas.h | 3 +- src/native/cuda/iluvatar/caster.cuh | 2 +- src/native/cuda/iluvatar/data_type_.h | 13 ------ src/native/cuda/iluvatar/device_.h | 6 --- src/native/cuda/iluvatar/ops/add/kernel.h | 3 +- .../cuda/iluvatar/ops/add_rms_norm/kernel.h | 3 +- .../cuda/iluvatar/ops/causal_softmax/kernel.h | 3 +- .../cuda/iluvatar/ops/conv_infinilm/kernel.h | 3 +- .../cuda/iluvatar/ops/embedding/kernel.h | 3 +- .../cuda/iluvatar/ops/gelu_infinilm/kernel.h | 3 +- .../iluvatar/ops/gelutanh_infinilm/kernel.h | 3 +- .../iluvatar/ops/kv_caching_infinilm/kernel.h | 3 +- .../ops/paged_attention_infinilm/kernel.h | 3 +- .../paged_attention_prefill_infinilm/kernel.h | 3 +- .../ops/paged_caching_infinilm/kernel.h | 3 +- .../ops/random_sample_infinilm/kernel.h | 3 +- .../iluvatar/ops/rearrange_infinilm/kernel.h | 3 +- .../cuda/iluvatar/ops/relu_infinilm/kernel.h | 3 +- .../cuda/iluvatar/ops/rms_norm/kernel.h | 3 +- .../ops/rotary_embedding_infinilm/kernel.h | 3 +- .../iluvatar/ops/sigmoid_infinilm/kernel.h | 3 +- src/native/cuda/iluvatar/ops/silu/kernel.h | 3 +- .../ops/silu_and_mul_infinilm/kernel.h | 3 +- .../iluvatar/ops/softmax_infinilm/kernel.h | 3 +- src/native/cuda/iluvatar/ops/swiglu/kernel.h | 3 +- .../ops/topksoftmax_infinilm/kernel.h | 3 +- .../cuda/iluvatar/ops/zeros_infinilm/kernel.h | 3 +- src/native/cuda/iluvatar/runtime_.h | 8 ---- src/native/cuda/metax/blas.h | 3 +- src/native/cuda/metax/caster.cuh | 2 +- src/native/cuda/metax/data_type_.h | 13 ------ src/native/cuda/metax/device_.h | 6 --- src/native/cuda/metax/ops/add/kernel.h | 3 +- .../cuda/metax/ops/add_rms_norm/kernel.h | 3 +- .../cuda/metax/ops/causal_softmax/kernel.h | 3 +- .../cuda/metax/ops/conv_infinilm/kernel.h | 3 +- src/native/cuda/metax/ops/embedding/kernel.h | 3 +- .../cuda/metax/ops/gelu_infinilm/kernel.h | 3 +- .../cuda/metax/ops/gelutanh_infinilm/kernel.h | 3 +- .../metax/ops/kv_caching_infinilm/kernel.h | 3 +- .../ops/paged_attention_infinilm/kernel.h | 3 +- .../paged_attention_prefill_infinilm/kernel.h | 3 +- .../metax/ops/paged_caching_infinilm/kernel.h | 3 +- .../metax/ops/random_sample_infinilm/kernel.h | 3 +- .../metax/ops/rearrange_infinilm/kernel.h | 3 +- .../cuda/metax/ops/relu_infinilm/kernel.h | 3 +- src/native/cuda/metax/ops/rms_norm/kernel.h | 3 +- .../ops/rotary_embedding_infinilm/kernel.h | 3 +- .../cuda/metax/ops/sigmoid_infinilm/kernel.h | 3 +- src/native/cuda/metax/ops/silu/kernel.h | 3 +- .../metax/ops/silu_and_mul_infinilm/kernel.h | 3 +- .../cuda/metax/ops/softmax_infinilm/kernel.h | 3 +- src/native/cuda/metax/ops/swiglu/kernel.h | 3 +- .../metax/ops/topksoftmax_infinilm/kernel.h | 3 +- .../cuda/metax/ops/zeros_infinilm/kernel.h | 3 +- src/native/cuda/metax/runtime_.h | 8 ---- src/native/cuda/moore/blas.h | 3 +- src/native/cuda/moore/caster.cuh | 2 +- src/native/cuda/moore/data_type_.h | 13 ------ src/native/cuda/moore/device_.h | 6 --- src/native/cuda/moore/ops/add/kernel.h | 3 +- .../cuda/moore/ops/add_rms_norm/kernel.h | 3 +- .../cuda/moore/ops/causal_softmax/kernel.h | 5 ++- .../cuda/moore/ops/conv_infinilm/kernel.h | 3 +- src/native/cuda/moore/ops/embedding/kernel.h | 3 +- .../cuda/moore/ops/gelu_infinilm/kernel.h | 3 +- .../cuda/moore/ops/gelutanh_infinilm/kernel.h | 3 +- .../moore/ops/kv_caching_infinilm/kernel.h | 3 +- .../ops/paged_attention_infinilm/kernel.h | 3 +- .../paged_attention_prefill_infinilm/kernel.h | 3 +- .../moore/ops/paged_caching_infinilm/kernel.h | 3 +- .../moore/ops/random_sample_infinilm/kernel.h | 3 +- .../moore/ops/rearrange_infinilm/kernel.h | 3 +- .../cuda/moore/ops/relu_infinilm/kernel.h | 3 +- src/native/cuda/moore/ops/rms_norm/kernel.h | 3 +- .../ops/rotary_embedding_infinilm/kernel.h | 3 +- .../cuda/moore/ops/sigmoid_infinilm/kernel.h | 3 +- src/native/cuda/moore/ops/silu/kernel.h | 3 +- .../moore/ops/silu_and_mul_infinilm/kernel.h | 3 +- .../cuda/moore/ops/softmax_infinilm/kernel.h | 3 +- src/native/cuda/moore/ops/swiglu/kernel.h | 3 +- .../moore/ops/topksoftmax_infinilm/kernel.h | 3 +- .../cuda/moore/ops/zeros_infinilm/kernel.h | 3 +- src/native/cuda/moore/runtime_.h | 8 ---- src/native/cuda/nvidia/blas.h | 3 +- src/native/cuda/nvidia/caster.cuh | 2 +- src/native/cuda/nvidia/data_type_.h | 13 ------ src/native/cuda/nvidia/device_.h | 6 --- src/native/cuda/nvidia/ops/add/kernel.h | 3 +- .../cuda/nvidia/ops/add_rms_norm/kernel.h | 3 +- .../cuda/nvidia/ops/causal_softmax/kernel.h | 3 +- .../cuda/nvidia/ops/conv_infinilm/kernel.h | 3 +- src/native/cuda/nvidia/ops/embedding/kernel.h | 3 +- .../cuda/nvidia/ops/gelu_infinilm/kernel.h | 3 +- .../nvidia/ops/gelutanh_infinilm/kernel.h | 3 +- src/native/cuda/nvidia/ops/gemm/cublaslt.h | 3 +- .../nvidia/ops/kv_caching_infinilm/kernel.h | 3 +- .../ops/paged_attention_infinilm/kernel.h | 3 +- .../paged_attention_prefill_infinilm/kernel.h | 3 +- .../ops/paged_caching_infinilm/kernel.h | 3 +- .../ops/random_sample_infinilm/kernel.h | 3 +- .../nvidia/ops/rearrange_infinilm/kernel.h | 3 +- .../cuda/nvidia/ops/relu_infinilm/kernel.h | 3 +- src/native/cuda/nvidia/ops/rms_norm/kernel.h | 3 +- .../ops/rotary_embedding_infinilm/kernel.h | 3 +- .../cuda/nvidia/ops/sigmoid_infinilm/kernel.h | 3 +- src/native/cuda/nvidia/ops/silu/kernel.h | 3 +- .../nvidia/ops/silu_and_mul_infinilm/kernel.h | 3 +- .../cuda/nvidia/ops/softmax_infinilm/kernel.h | 3 +- src/native/cuda/nvidia/ops/swiglu/kernel.h | 3 +- .../nvidia/ops/topksoftmax_infinilm/kernel.h | 3 +- .../cuda/nvidia/ops/zeros_infinilm/kernel.h | 3 +- src/native/cuda/nvidia/runtime_.h | 8 ---- .../ops/rotary_embedding_infinilm/kernel.cuh | 3 ++ src/native/cuda/runtime_utils.h | 2 +- 124 files changed, 223 insertions(+), 349 deletions(-) delete mode 100644 src/native/ascend/device_.h delete mode 100644 src/native/ascend/runtime_.h delete mode 100644 src/native/cambricon/data_type_.h delete mode 100644 src/native/cambricon/device_.h delete mode 100644 src/native/cambricon/runtime_.h delete mode 100644 src/native/cuda/iluvatar/data_type_.h delete mode 100644 src/native/cuda/iluvatar/device_.h delete mode 100644 src/native/cuda/iluvatar/runtime_.h delete mode 100644 src/native/cuda/metax/data_type_.h delete mode 100644 src/native/cuda/metax/device_.h delete mode 100644 src/native/cuda/metax/runtime_.h delete mode 100644 src/native/cuda/moore/data_type_.h delete mode 100644 src/native/cuda/moore/device_.h delete mode 100644 src/native/cuda/moore/runtime_.h delete mode 100644 src/native/cuda/nvidia/data_type_.h delete mode 100644 src/native/cuda/nvidia/device_.h delete mode 100644 src/native/cuda/nvidia/runtime_.h diff --git a/examples/runtime_api.h b/examples/runtime_api.h index 292b40408..93681c8f8 100644 --- a/examples/runtime_api.h +++ b/examples/runtime_api.h @@ -7,22 +7,26 @@ #ifdef WITH_NVIDIA #include "native/cuda/nvidia/ops/gemm/cublas.h" #include "native/cuda/nvidia/ops/gemm/cublaslt.h" -#include "native/cuda/nvidia/runtime_.h" +#include +#include "native/cuda/nvidia/runtime_utils.h" #elif WITH_ILUVATAR #include "native/cuda/iluvatar/ops/gemm/cublas.h" -#include "native/cuda/iluvatar/runtime_.h" +#include +#include "native/cuda/iluvatar/runtime_utils.h" #elif WITH_METAX #include "native/cuda/metax/ops/gemm/mcblas.h" -#include "native/cuda/metax/runtime_.h" +#include +#include "native/cuda/metax/runtime_utils.h" #elif WITH_CAMBRICON #include "native/cambricon/ops/gemm/cnblas.h" -#include "native/cambricon/runtime_.h" +#include #elif WITH_MOORE #include "native/cuda/moore/ops/gemm/mublas.h" -#include "native/cuda/moore/runtime_.h" +#include +#include "native/cuda/moore/runtime_utils.h" #elif WITH_ASCEND #include "native/ascend/ops/gemm/kernel.h" -#include "native/ascend/runtime_.h" +#include #elif WITH_CPU #include #include "native/cpu/ops/gemm/gemm.h" diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b43c40612..e46f516bf 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -226,6 +226,8 @@ if(WITH_CAMBRICON) set(MLU_COMPILE_OPTS -c --bang-mlu-arch=mtp_592 -O3 -fPIC -Wall -Werror -std=c++17 -pthread -I${CMAKE_CURRENT_SOURCE_DIR} -I${NEUWARE_HOME}/include + -I${INFINI_RT_SOURCE_DIR}/include + -I${INFINI_RT_SOURCE_DIR}/generated/include -idirafter /usr/local/neuware/lib/clang/11.1.0/include ) function(compile_mlu_file src_file) @@ -630,7 +632,9 @@ if(GENERATE_OPERATOR_CALL_INSTANTIATIONS) "-I${CMAKE_CURRENT_SOURCE_DIR}" "-I${PROJECT_SOURCE_DIR}" "-I${PROJECT_SOURCE_DIR}/generated" - "-I${PROJECT_SOURCE_DIR}/generated/include") + "-I${PROJECT_SOURCE_DIR}/generated/include" + "-I${INFINI_RT_SOURCE_DIR}/include" + "-I${INFINI_RT_SOURCE_DIR}/generated/include") foreach(_dir IN LISTS TORCH_INCLUDE_DIRS CUDAToolkit_INCLUDE_DIRS) list(APPEND _iluvatar_call_instantiation_include_flags "-I${_dir}") endforeach() @@ -804,7 +808,9 @@ if(GENERATE_PYTHON_BINDINGS) set(_iluvatar_dispatch_include_flags "-I${CMAKE_CURRENT_SOURCE_DIR}" "-I${PROJECT_SOURCE_DIR}" - "-I${PROJECT_SOURCE_DIR}/generated") + "-I${PROJECT_SOURCE_DIR}/generated" + "-I${INFINI_RT_SOURCE_DIR}/include" + "-I${INFINI_RT_SOURCE_DIR}/generated/include") foreach(_dir IN LISTS TORCH_INCLUDE_DIRS CUDAToolkit_INCLUDE_DIRS) list(APPEND _iluvatar_dispatch_include_flags "-I${_dir}") endforeach() diff --git a/src/native/ascend/data_type_.h b/src/native/ascend/data_type_.h index 81253a9b3..095ad99bf 100644 --- a/src/native/ascend/data_type_.h +++ b/src/native/ascend/data_type_.h @@ -5,7 +5,7 @@ #include "acl/acl.h" #include "data_type.h" -#include "native/ascend/device_.h" +#include namespace infini::ops::ascend { diff --git a/src/native/ascend/device_.h b/src/native/ascend/device_.h deleted file mode 100644 index 1b246ad38..000000000 --- a/src/native/ascend/device_.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef INFINI_OPS_ASCEND_DEVICE__H_ -#define INFINI_OPS_ASCEND_DEVICE__H_ - -#include "device.h" - -namespace infini::ops { - -template <> -struct DeviceEnabled : std::true_type {}; - -} // namespace infini::ops - -#endif diff --git a/src/native/ascend/runtime_.h b/src/native/ascend/runtime_.h deleted file mode 100644 index 2b9e14136..000000000 --- a/src/native/ascend/runtime_.h +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef INFINI_OPS_ASCEND_RUNTIME__H_ -#define INFINI_OPS_ASCEND_RUNTIME__H_ - -// clang-format off -#include "acl/acl.h" -// clang-format on - -#include "native/ascend/device_.h" -#include "runtime.h" - -namespace infini::ops { - -template <> -struct Runtime - : DeviceRuntime> { - using Stream = aclrtStream; - - static constexpr Device::Type kDeviceType = Device::Type::kAscend; - - static constexpr auto Malloc = [](void** ptr, size_t size) { - return aclrtMalloc(ptr, size, ACL_MEM_MALLOC_HUGE_FIRST); - }; - - static constexpr auto Free = aclrtFree; - - static constexpr auto Memcpy = [](void* dst, const void* src, size_t count, - aclrtMemcpyKind kind) { - return aclrtMemcpy(dst, count, src, count, kind); - }; - - static constexpr auto MemcpyHostToDevice = ACL_MEMCPY_HOST_TO_DEVICE; - - static constexpr auto MemcpyDeviceToHost = ACL_MEMCPY_DEVICE_TO_HOST; - - static constexpr auto Memset = [](void* ptr, int value, size_t count) { - return aclrtMemset(ptr, count, value, count); - }; -}; - -static_assert(Runtime::Validate()); - -} // namespace infini::ops - -#endif diff --git a/src/native/cambricon/data_type_.h b/src/native/cambricon/data_type_.h deleted file mode 100644 index f4ca82da8..000000000 --- a/src/native/cambricon/data_type_.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef INFINI_OPS_CAMBRICON_DATA_TYPE__H_ -#define INFINI_OPS_CAMBRICON_DATA_TYPE__H_ - -#include "bang_bf16.h" -#include "bang_fp16.h" -#include "data_type.h" -#include "native/cambricon/device_.h" - -namespace infini::ops { - -template <> -struct TypeMap { - using type = __half; -}; - -template <> -struct TypeMap { - using type = __bang_bfloat16; -}; - -} // namespace infini::ops - -#endif diff --git a/src/native/cambricon/device_.h b/src/native/cambricon/device_.h deleted file mode 100644 index f168e4f25..000000000 --- a/src/native/cambricon/device_.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef INFINI_OPS_CAMBRICON_DEVICE__H_ -#define INFINI_OPS_CAMBRICON_DEVICE__H_ - -#include "device.h" - -namespace infini::ops { - -template <> -struct DeviceEnabled : std::true_type {}; - -} // namespace infini::ops - -#endif diff --git a/src/native/cambricon/ops/rms_norm/rms_norm.h b/src/native/cambricon/ops/rms_norm/rms_norm.h index 6a9aed098..cb91e61e9 100644 --- a/src/native/cambricon/ops/rms_norm/rms_norm.h +++ b/src/native/cambricon/ops/rms_norm/rms_norm.h @@ -7,7 +7,7 @@ #include "base/rms_norm.h" #include "native/cambricon/common.h" -#include "native/cambricon/data_type_.h" +#include namespace infini::ops { diff --git a/src/native/cambricon/runtime_.h b/src/native/cambricon/runtime_.h deleted file mode 100644 index 7ff30fe9c..000000000 --- a/src/native/cambricon/runtime_.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef INFINI_OPS_CAMBRICON_RUNTIME_H_ -#define INFINI_OPS_CAMBRICON_RUNTIME_H_ - -#include - -#include "native/cambricon/device_.h" -#include "runtime.h" - -namespace infini::ops { - -template <> -struct Runtime - : DeviceRuntime> { - using Stream = cnrtQueue_t; - - static constexpr Device::Type kDeviceType = Device::Type::kCambricon; - - static constexpr auto Malloc = cnrtMalloc; - - static constexpr auto Free = cnrtFree; - - static constexpr auto Memcpy = cnrtMemcpy; - - static constexpr auto MemcpyHostToDevice = cnrtMemcpyHostToDev; - - static constexpr auto MemcpyDeviceToHost = cnrtMemcpyDevToHost; - - static constexpr auto Memset = cnrtMemset; -}; - -static_assert(Runtime::Validate()); - -} // namespace infini::ops - -#endif diff --git a/src/native/cuda/iluvatar/blas.h b/src/native/cuda/iluvatar/blas.h index 7e7545a40..9115d8ca9 100644 --- a/src/native/cuda/iluvatar/blas.h +++ b/src/native/cuda/iluvatar/blas.h @@ -10,7 +10,8 @@ #include "data_type.h" #include "native/cuda/blas.h" #include "native/cuda/iluvatar/blas_utils.h" -#include "native/cuda/iluvatar/runtime_.h" +#include +#include "native/cuda/iluvatar/runtime_utils.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/caster.cuh b/src/native/cuda/iluvatar/caster.cuh index a75369570..2a8a39c40 100644 --- a/src/native/cuda/iluvatar/caster.cuh +++ b/src/native/cuda/iluvatar/caster.cuh @@ -2,7 +2,7 @@ #define INFINI_OPS_ILUVATAR_CASTER__H_ #include "native/cuda/caster.cuh" -#include "native/cuda/iluvatar/data_type_.h" +#include namespace infini::ops { diff --git a/src/native/cuda/iluvatar/data_type_.h b/src/native/cuda/iluvatar/data_type_.h deleted file mode 100644 index b9e3ed8a3..000000000 --- a/src/native/cuda/iluvatar/data_type_.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef INFINI_OPS_ILUVATAR_DATA_TYPE__H_ -#define INFINI_OPS_ILUVATAR_DATA_TYPE__H_ - -#include - -namespace infini::ops { - -using infini::rt::cuda_bfloat16; -using infini::rt::cuda_bfloat162; - -} // namespace infini::ops - -#endif diff --git a/src/native/cuda/iluvatar/device_.h b/src/native/cuda/iluvatar/device_.h deleted file mode 100644 index 96743518f..000000000 --- a/src/native/cuda/iluvatar/device_.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef INFINI_OPS_ILUVATAR_DEVICE__H_ -#define INFINI_OPS_ILUVATAR_DEVICE__H_ - -#include - -#endif diff --git a/src/native/cuda/iluvatar/ops/add/kernel.h b/src/native/cuda/iluvatar/ops/add/kernel.h index f41aa6be4..a83062fae 100644 --- a/src/native/cuda/iluvatar/ops/add/kernel.h +++ b/src/native/cuda/iluvatar/ops/add/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include "native/cuda/iluvatar/runtime_.h" +#include +#include "native/cuda/iluvatar/runtime_utils.h" #include "native/cuda/ops/add/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/add_rms_norm/kernel.h b/src/native/cuda/iluvatar/ops/add_rms_norm/kernel.h index 828b93bbc..f95ca7317 100644 --- a/src/native/cuda/iluvatar/ops/add_rms_norm/kernel.h +++ b/src/native/cuda/iluvatar/ops/add_rms_norm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include "native/cuda/iluvatar/runtime_.h" +#include +#include "native/cuda/iluvatar/runtime_utils.h" #include "native/cuda/ops/add_rms_norm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/causal_softmax/kernel.h b/src/native/cuda/iluvatar/ops/causal_softmax/kernel.h index 8c35c87ce..733a59272 100644 --- a/src/native/cuda/iluvatar/ops/causal_softmax/kernel.h +++ b/src/native/cuda/iluvatar/ops/causal_softmax/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include "native/cuda/iluvatar/runtime_.h" +#include +#include "native/cuda/iluvatar/runtime_utils.h" #include "native/cuda/ops/causal_softmax/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/conv_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/conv_infinilm/kernel.h index 7eed0d8ab..9439d2214 100644 --- a/src/native/cuda/iluvatar/ops/conv_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/conv_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include "native/cuda/iluvatar/runtime_.h" +#include +#include "native/cuda/iluvatar/runtime_utils.h" #include "native/cuda/ops/conv_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/embedding/kernel.h b/src/native/cuda/iluvatar/ops/embedding/kernel.h index db77867f6..ac8f4cced 100644 --- a/src/native/cuda/iluvatar/ops/embedding/kernel.h +++ b/src/native/cuda/iluvatar/ops/embedding/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include "native/cuda/iluvatar/runtime_.h" +#include +#include "native/cuda/iluvatar/runtime_utils.h" #include "native/cuda/ops/embedding/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/gelu_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/gelu_infinilm/kernel.h index edbb424cc..34c25b31f 100644 --- a/src/native/cuda/iluvatar/ops/gelu_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/gelu_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include "native/cuda/iluvatar/runtime_.h" +#include +#include "native/cuda/iluvatar/runtime_utils.h" #include "native/cuda/ops/gelu_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/gelutanh_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/gelutanh_infinilm/kernel.h index 1ef9f1345..bd99df025 100644 --- a/src/native/cuda/iluvatar/ops/gelutanh_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/gelutanh_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include "native/cuda/iluvatar/runtime_.h" +#include +#include "native/cuda/iluvatar/runtime_utils.h" #include "native/cuda/ops/gelutanh_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/kv_caching_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/kv_caching_infinilm/kernel.h index 3f3018864..0f7cbfff1 100644 --- a/src/native/cuda/iluvatar/ops/kv_caching_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/kv_caching_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include "native/cuda/iluvatar/runtime_.h" +#include +#include "native/cuda/iluvatar/runtime_utils.h" #include "native/cuda/ops/kv_caching_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/paged_attention_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/paged_attention_infinilm/kernel.h index 3284d3256..d7757426e 100644 --- a/src/native/cuda/iluvatar/ops/paged_attention_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/paged_attention_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include "native/cuda/iluvatar/runtime_.h" +#include +#include "native/cuda/iluvatar/runtime_utils.h" #include "native/cuda/ops/paged_attention_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/paged_attention_prefill_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/paged_attention_prefill_infinilm/kernel.h index d2ad9806e..78f34c99c 100644 --- a/src/native/cuda/iluvatar/ops/paged_attention_prefill_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/paged_attention_prefill_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include "native/cuda/iluvatar/runtime_.h" +#include +#include "native/cuda/iluvatar/runtime_utils.h" #include "native/cuda/ops/paged_attention_prefill_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/paged_caching_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/paged_caching_infinilm/kernel.h index 350e43c4e..c280477dc 100644 --- a/src/native/cuda/iluvatar/ops/paged_caching_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/paged_caching_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include "native/cuda/iluvatar/runtime_.h" +#include +#include "native/cuda/iluvatar/runtime_utils.h" #include "native/cuda/ops/paged_caching_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/random_sample_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/random_sample_infinilm/kernel.h index 7a1dcc5de..0c453d5af 100644 --- a/src/native/cuda/iluvatar/ops/random_sample_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/random_sample_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include "native/cuda/iluvatar/runtime_.h" +#include +#include "native/cuda/iluvatar/runtime_utils.h" #include "native/cuda/ops/random_sample_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/rearrange_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/rearrange_infinilm/kernel.h index 5ba9a9198..5ec8ff4b2 100644 --- a/src/native/cuda/iluvatar/ops/rearrange_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/rearrange_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include "native/cuda/iluvatar/runtime_.h" +#include +#include "native/cuda/iluvatar/runtime_utils.h" #include "native/cuda/ops/rearrange_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/relu_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/relu_infinilm/kernel.h index faf1c0790..bd83616ee 100644 --- a/src/native/cuda/iluvatar/ops/relu_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/relu_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include "native/cuda/iluvatar/runtime_.h" +#include +#include "native/cuda/iluvatar/runtime_utils.h" #include "native/cuda/ops/relu_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/rms_norm/kernel.h b/src/native/cuda/iluvatar/ops/rms_norm/kernel.h index 3230fe8b0..9e0e438fd 100644 --- a/src/native/cuda/iluvatar/ops/rms_norm/kernel.h +++ b/src/native/cuda/iluvatar/ops/rms_norm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include "native/cuda/iluvatar/runtime_.h" +#include +#include "native/cuda/iluvatar/runtime_utils.h" #include "native/cuda/ops/rms_norm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/rotary_embedding_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/rotary_embedding_infinilm/kernel.h index 41d7a4259..c6b812042 100644 --- a/src/native/cuda/iluvatar/ops/rotary_embedding_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/rotary_embedding_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include "native/cuda/iluvatar/runtime_.h" +#include +#include "native/cuda/iluvatar/runtime_utils.h" #include "native/cuda/ops/rotary_embedding_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/sigmoid_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/sigmoid_infinilm/kernel.h index 2d5c487ea..2752a99b6 100644 --- a/src/native/cuda/iluvatar/ops/sigmoid_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/sigmoid_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include "native/cuda/iluvatar/runtime_.h" +#include +#include "native/cuda/iluvatar/runtime_utils.h" #include "native/cuda/ops/sigmoid_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/silu/kernel.h b/src/native/cuda/iluvatar/ops/silu/kernel.h index dfa250544..32e25fa5c 100644 --- a/src/native/cuda/iluvatar/ops/silu/kernel.h +++ b/src/native/cuda/iluvatar/ops/silu/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include "native/cuda/iluvatar/runtime_.h" +#include +#include "native/cuda/iluvatar/runtime_utils.h" #include "native/cuda/ops/silu/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/silu_and_mul_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/silu_and_mul_infinilm/kernel.h index b29469062..b12a6e49d 100644 --- a/src/native/cuda/iluvatar/ops/silu_and_mul_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/silu_and_mul_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include "native/cuda/iluvatar/runtime_.h" +#include +#include "native/cuda/iluvatar/runtime_utils.h" #include "native/cuda/ops/silu_and_mul_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/softmax_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/softmax_infinilm/kernel.h index b36ff0638..b28f40b6c 100644 --- a/src/native/cuda/iluvatar/ops/softmax_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/softmax_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include "native/cuda/iluvatar/runtime_.h" +#include +#include "native/cuda/iluvatar/runtime_utils.h" #include "native/cuda/ops/softmax_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/swiglu/kernel.h b/src/native/cuda/iluvatar/ops/swiglu/kernel.h index b03ef19c9..2e71c4ca6 100644 --- a/src/native/cuda/iluvatar/ops/swiglu/kernel.h +++ b/src/native/cuda/iluvatar/ops/swiglu/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include "native/cuda/iluvatar/runtime_.h" +#include +#include "native/cuda/iluvatar/runtime_utils.h" #include "native/cuda/ops/swiglu/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/topksoftmax_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/topksoftmax_infinilm/kernel.h index 2efe8807b..06c116f54 100644 --- a/src/native/cuda/iluvatar/ops/topksoftmax_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/topksoftmax_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include "native/cuda/iluvatar/runtime_.h" +#include +#include "native/cuda/iluvatar/runtime_utils.h" #include "native/cuda/ops/topksoftmax_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/zeros_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/zeros_infinilm/kernel.h index 0360063ea..af0ede96d 100644 --- a/src/native/cuda/iluvatar/ops/zeros_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/zeros_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include "native/cuda/iluvatar/runtime_.h" +#include +#include "native/cuda/iluvatar/runtime_utils.h" #include "native/cuda/ops/zeros_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/runtime_.h b/src/native/cuda/iluvatar/runtime_.h deleted file mode 100644 index 28f2c67bc..000000000 --- a/src/native/cuda/iluvatar/runtime_.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef INFINI_OPS_ILUVATAR_RUNTIME_H_ -#define INFINI_OPS_ILUVATAR_RUNTIME_H_ - -#include -#include "native/cuda/iluvatar/runtime_utils.h" -#include "runtime.h" - -#endif diff --git a/src/native/cuda/metax/blas.h b/src/native/cuda/metax/blas.h index 68e5183e1..c24da8774 100644 --- a/src/native/cuda/metax/blas.h +++ b/src/native/cuda/metax/blas.h @@ -10,7 +10,8 @@ #include "data_type.h" #include "native/cuda/blas.h" #include "native/cuda/metax/blas_utils.h" -#include "native/cuda/metax/runtime_.h" +#include +#include "native/cuda/metax/runtime_utils.h" namespace infini::ops { diff --git a/src/native/cuda/metax/caster.cuh b/src/native/cuda/metax/caster.cuh index 6ad565870..a39fb796a 100644 --- a/src/native/cuda/metax/caster.cuh +++ b/src/native/cuda/metax/caster.cuh @@ -2,7 +2,7 @@ #define INFINI_OPS_METAX_CASTER__H_ #include "native/cuda/caster.cuh" -#include "native/cuda/metax/data_type_.h" +#include namespace infini::ops { diff --git a/src/native/cuda/metax/data_type_.h b/src/native/cuda/metax/data_type_.h deleted file mode 100644 index 003dd48c0..000000000 --- a/src/native/cuda/metax/data_type_.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef INFINI_OPS_METAX_DATA_TYPE__H_ -#define INFINI_OPS_METAX_DATA_TYPE__H_ - -#include - -namespace infini::ops { - -using infini::rt::cuda_bfloat16; -using infini::rt::cuda_bfloat162; - -} // namespace infini::ops - -#endif diff --git a/src/native/cuda/metax/device_.h b/src/native/cuda/metax/device_.h deleted file mode 100644 index 7aa554ea2..000000000 --- a/src/native/cuda/metax/device_.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef INFINI_OPS_METAX_DEVICE__H_ -#define INFINI_OPS_METAX_DEVICE__H_ - -#include - -#endif diff --git a/src/native/cuda/metax/ops/add/kernel.h b/src/native/cuda/metax/ops/add/kernel.h index 7059bb7c1..2189c8578 100644 --- a/src/native/cuda/metax/ops/add/kernel.h +++ b/src/native/cuda/metax/ops/add/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/metax/caster.cuh" -#include "native/cuda/metax/runtime_.h" +#include +#include "native/cuda/metax/runtime_utils.h" #include "native/cuda/ops/add/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/add_rms_norm/kernel.h b/src/native/cuda/metax/ops/add_rms_norm/kernel.h index 564ceba61..f77705f07 100644 --- a/src/native/cuda/metax/ops/add_rms_norm/kernel.h +++ b/src/native/cuda/metax/ops/add_rms_norm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/metax/caster.cuh" -#include "native/cuda/metax/runtime_.h" +#include +#include "native/cuda/metax/runtime_utils.h" #include "native/cuda/ops/add_rms_norm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/causal_softmax/kernel.h b/src/native/cuda/metax/ops/causal_softmax/kernel.h index 426465984..f5a1b084e 100644 --- a/src/native/cuda/metax/ops/causal_softmax/kernel.h +++ b/src/native/cuda/metax/ops/causal_softmax/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/metax/caster.cuh" -#include "native/cuda/metax/runtime_.h" +#include +#include "native/cuda/metax/runtime_utils.h" #include "native/cuda/ops/causal_softmax/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/conv_infinilm/kernel.h b/src/native/cuda/metax/ops/conv_infinilm/kernel.h index 288f666c4..8c36eb6c4 100644 --- a/src/native/cuda/metax/ops/conv_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/conv_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/metax/caster.cuh" -#include "native/cuda/metax/runtime_.h" +#include +#include "native/cuda/metax/runtime_utils.h" #include "native/cuda/ops/conv_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/embedding/kernel.h b/src/native/cuda/metax/ops/embedding/kernel.h index 279d9e33f..8a0bc8893 100644 --- a/src/native/cuda/metax/ops/embedding/kernel.h +++ b/src/native/cuda/metax/ops/embedding/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/metax/caster.cuh" -#include "native/cuda/metax/runtime_.h" +#include +#include "native/cuda/metax/runtime_utils.h" #include "native/cuda/ops/embedding/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/gelu_infinilm/kernel.h b/src/native/cuda/metax/ops/gelu_infinilm/kernel.h index 124f1dd6c..6f3df83f1 100644 --- a/src/native/cuda/metax/ops/gelu_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/gelu_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/metax/caster.cuh" -#include "native/cuda/metax/runtime_.h" +#include +#include "native/cuda/metax/runtime_utils.h" #include "native/cuda/ops/gelu_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/gelutanh_infinilm/kernel.h b/src/native/cuda/metax/ops/gelutanh_infinilm/kernel.h index ae3dae8bc..0b871847f 100644 --- a/src/native/cuda/metax/ops/gelutanh_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/gelutanh_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/metax/caster.cuh" -#include "native/cuda/metax/runtime_.h" +#include +#include "native/cuda/metax/runtime_utils.h" #include "native/cuda/ops/gelutanh_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/kv_caching_infinilm/kernel.h b/src/native/cuda/metax/ops/kv_caching_infinilm/kernel.h index 92a24226f..4b30a44e4 100644 --- a/src/native/cuda/metax/ops/kv_caching_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/kv_caching_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/metax/caster.cuh" -#include "native/cuda/metax/runtime_.h" +#include +#include "native/cuda/metax/runtime_utils.h" #include "native/cuda/ops/kv_caching_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/paged_attention_infinilm/kernel.h b/src/native/cuda/metax/ops/paged_attention_infinilm/kernel.h index 59287b74e..9ee369357 100644 --- a/src/native/cuda/metax/ops/paged_attention_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/paged_attention_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/metax/caster.cuh" -#include "native/cuda/metax/runtime_.h" +#include +#include "native/cuda/metax/runtime_utils.h" #include "native/cuda/ops/paged_attention_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/paged_attention_prefill_infinilm/kernel.h b/src/native/cuda/metax/ops/paged_attention_prefill_infinilm/kernel.h index dedd3c053..a5d2d39ce 100644 --- a/src/native/cuda/metax/ops/paged_attention_prefill_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/paged_attention_prefill_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/metax/caster.cuh" -#include "native/cuda/metax/runtime_.h" +#include +#include "native/cuda/metax/runtime_utils.h" #include "native/cuda/ops/paged_attention_prefill_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/paged_caching_infinilm/kernel.h b/src/native/cuda/metax/ops/paged_caching_infinilm/kernel.h index e1ac0aad8..de8f91b45 100644 --- a/src/native/cuda/metax/ops/paged_caching_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/paged_caching_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/metax/caster.cuh" -#include "native/cuda/metax/runtime_.h" +#include +#include "native/cuda/metax/runtime_utils.h" #include "native/cuda/ops/paged_caching_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/random_sample_infinilm/kernel.h b/src/native/cuda/metax/ops/random_sample_infinilm/kernel.h index da18e8a4d..1c7c48671 100644 --- a/src/native/cuda/metax/ops/random_sample_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/random_sample_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/metax/caster.cuh" -#include "native/cuda/metax/runtime_.h" +#include +#include "native/cuda/metax/runtime_utils.h" #include "native/cuda/ops/random_sample_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/rearrange_infinilm/kernel.h b/src/native/cuda/metax/ops/rearrange_infinilm/kernel.h index 4a92bf678..f62fa0bf2 100644 --- a/src/native/cuda/metax/ops/rearrange_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/rearrange_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/metax/caster.cuh" -#include "native/cuda/metax/runtime_.h" +#include +#include "native/cuda/metax/runtime_utils.h" #include "native/cuda/ops/rearrange_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/relu_infinilm/kernel.h b/src/native/cuda/metax/ops/relu_infinilm/kernel.h index 7e0aa3af5..bd327efdf 100644 --- a/src/native/cuda/metax/ops/relu_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/relu_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/metax/caster.cuh" -#include "native/cuda/metax/runtime_.h" +#include +#include "native/cuda/metax/runtime_utils.h" #include "native/cuda/ops/relu_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/rms_norm/kernel.h b/src/native/cuda/metax/ops/rms_norm/kernel.h index 38d84c830..34c039b64 100644 --- a/src/native/cuda/metax/ops/rms_norm/kernel.h +++ b/src/native/cuda/metax/ops/rms_norm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/metax/caster.cuh" -#include "native/cuda/metax/runtime_.h" +#include +#include "native/cuda/metax/runtime_utils.h" #include "native/cuda/ops/rms_norm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/rotary_embedding_infinilm/kernel.h b/src/native/cuda/metax/ops/rotary_embedding_infinilm/kernel.h index 456e6fc47..dc8a61a9f 100644 --- a/src/native/cuda/metax/ops/rotary_embedding_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/rotary_embedding_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/metax/caster.cuh" -#include "native/cuda/metax/runtime_.h" +#include +#include "native/cuda/metax/runtime_utils.h" #include "native/cuda/ops/rotary_embedding_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/sigmoid_infinilm/kernel.h b/src/native/cuda/metax/ops/sigmoid_infinilm/kernel.h index 3fca745b1..31f3425e5 100644 --- a/src/native/cuda/metax/ops/sigmoid_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/sigmoid_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/metax/caster.cuh" -#include "native/cuda/metax/runtime_.h" +#include +#include "native/cuda/metax/runtime_utils.h" #include "native/cuda/ops/sigmoid_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/silu/kernel.h b/src/native/cuda/metax/ops/silu/kernel.h index 1ad57e9e3..374fe7aa8 100644 --- a/src/native/cuda/metax/ops/silu/kernel.h +++ b/src/native/cuda/metax/ops/silu/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/metax/caster.cuh" -#include "native/cuda/metax/runtime_.h" +#include +#include "native/cuda/metax/runtime_utils.h" #include "native/cuda/ops/silu/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/silu_and_mul_infinilm/kernel.h b/src/native/cuda/metax/ops/silu_and_mul_infinilm/kernel.h index beaafe030..40427c7b4 100644 --- a/src/native/cuda/metax/ops/silu_and_mul_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/silu_and_mul_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/metax/caster.cuh" -#include "native/cuda/metax/runtime_.h" +#include +#include "native/cuda/metax/runtime_utils.h" #include "native/cuda/ops/silu_and_mul_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/softmax_infinilm/kernel.h b/src/native/cuda/metax/ops/softmax_infinilm/kernel.h index fe0a9da51..339826c80 100644 --- a/src/native/cuda/metax/ops/softmax_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/softmax_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/metax/caster.cuh" -#include "native/cuda/metax/runtime_.h" +#include +#include "native/cuda/metax/runtime_utils.h" #include "native/cuda/ops/softmax_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/swiglu/kernel.h b/src/native/cuda/metax/ops/swiglu/kernel.h index 0bcfd3198..60d4ca004 100644 --- a/src/native/cuda/metax/ops/swiglu/kernel.h +++ b/src/native/cuda/metax/ops/swiglu/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/metax/caster.cuh" -#include "native/cuda/metax/runtime_.h" +#include +#include "native/cuda/metax/runtime_utils.h" #include "native/cuda/ops/swiglu/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/topksoftmax_infinilm/kernel.h b/src/native/cuda/metax/ops/topksoftmax_infinilm/kernel.h index 4d1e8e026..72545e62f 100644 --- a/src/native/cuda/metax/ops/topksoftmax_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/topksoftmax_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/metax/caster.cuh" -#include "native/cuda/metax/runtime_.h" +#include +#include "native/cuda/metax/runtime_utils.h" #include "native/cuda/ops/topksoftmax_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/zeros_infinilm/kernel.h b/src/native/cuda/metax/ops/zeros_infinilm/kernel.h index a6ff877e7..32582b9c8 100644 --- a/src/native/cuda/metax/ops/zeros_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/zeros_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/metax/caster.cuh" -#include "native/cuda/metax/runtime_.h" +#include +#include "native/cuda/metax/runtime_utils.h" #include "native/cuda/ops/zeros_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/runtime_.h b/src/native/cuda/metax/runtime_.h deleted file mode 100644 index f952fb8b2..000000000 --- a/src/native/cuda/metax/runtime_.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef INFINI_OPS_METAX_RUNTIME_H_ -#define INFINI_OPS_METAX_RUNTIME_H_ - -#include -#include "native/cuda/metax/runtime_utils.h" -#include "runtime.h" - -#endif diff --git a/src/native/cuda/moore/blas.h b/src/native/cuda/moore/blas.h index b2531a1ca..d08f1680c 100644 --- a/src/native/cuda/moore/blas.h +++ b/src/native/cuda/moore/blas.h @@ -8,7 +8,8 @@ #include "data_type.h" #include "native/cuda/blas.h" #include "native/cuda/moore/blas_utils.h" -#include "native/cuda/moore/runtime_.h" +#include +#include "native/cuda/moore/runtime_utils.h" namespace infini::ops { diff --git a/src/native/cuda/moore/caster.cuh b/src/native/cuda/moore/caster.cuh index 48154980d..f552c53e8 100644 --- a/src/native/cuda/moore/caster.cuh +++ b/src/native/cuda/moore/caster.cuh @@ -2,7 +2,7 @@ #define INFINI_OPS_MOORE_CASTER__H_ #include "native/cuda/caster.cuh" -#include "native/cuda/moore/data_type_.h" +#include namespace infini::ops { diff --git a/src/native/cuda/moore/data_type_.h b/src/native/cuda/moore/data_type_.h deleted file mode 100644 index 7bc17d047..000000000 --- a/src/native/cuda/moore/data_type_.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef INFINI_OPS_MOORE_DATA_TYPE__H_ -#define INFINI_OPS_MOORE_DATA_TYPE__H_ - -#include - -namespace infini::ops { - -using infini::rt::cuda_bfloat16; -using infini::rt::cuda_bfloat162; - -} // namespace infini::ops - -#endif diff --git a/src/native/cuda/moore/device_.h b/src/native/cuda/moore/device_.h deleted file mode 100644 index 2b23db2dd..000000000 --- a/src/native/cuda/moore/device_.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef INFINI_OPS_MOORE_DEVICE__H_ -#define INFINI_OPS_MOORE_DEVICE__H_ - -#include - -#endif diff --git a/src/native/cuda/moore/ops/add/kernel.h b/src/native/cuda/moore/ops/add/kernel.h index abcea37c5..f163fd403 100644 --- a/src/native/cuda/moore/ops/add/kernel.h +++ b/src/native/cuda/moore/ops/add/kernel.h @@ -9,7 +9,8 @@ #include "native/cuda/moore/caster.cuh" #include "native/cuda/moore/polyfills.cuh" -#include "native/cuda/moore/runtime_.h" +#include +#include "native/cuda/moore/runtime_utils.h" #include "native/cuda/ops/add/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/add_rms_norm/kernel.h b/src/native/cuda/moore/ops/add_rms_norm/kernel.h index 8d3f5cc79..f04c9c07f 100644 --- a/src/native/cuda/moore/ops/add_rms_norm/kernel.h +++ b/src/native/cuda/moore/ops/add_rms_norm/kernel.h @@ -8,7 +8,8 @@ // clang-format on #include "native/cuda/moore/caster.cuh" -#include "native/cuda/moore/runtime_.h" +#include +#include "native/cuda/moore/runtime_utils.h" #include "native/cuda/ops/add_rms_norm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/causal_softmax/kernel.h b/src/native/cuda/moore/ops/causal_softmax/kernel.h index e13118763..6b4ff93bb 100644 --- a/src/native/cuda/moore/ops/causal_softmax/kernel.h +++ b/src/native/cuda/moore/ops/causal_softmax/kernel.h @@ -6,11 +6,12 @@ // clang-format on // clang-format off -#include "native/cuda/moore/device_.h" +#include // clang-format on #include "native/cuda/moore/caster.cuh" -#include "native/cuda/moore/runtime_.h" +#include +#include "native/cuda/moore/runtime_utils.h" #include "native/cuda/ops/causal_softmax/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/conv_infinilm/kernel.h b/src/native/cuda/moore/ops/conv_infinilm/kernel.h index 1d00481d6..c3811b61d 100644 --- a/src/native/cuda/moore/ops/conv_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/conv_infinilm/kernel.h @@ -5,7 +5,8 @@ #include "native/cuda/moore/caster.cuh" #include "native/cuda/moore/polyfills.cuh" -#include "native/cuda/moore/runtime_.h" +#include +#include "native/cuda/moore/runtime_utils.h" #include "native/cuda/ops/conv_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/embedding/kernel.h b/src/native/cuda/moore/ops/embedding/kernel.h index 03b8cf35a..eae4b2f53 100644 --- a/src/native/cuda/moore/ops/embedding/kernel.h +++ b/src/native/cuda/moore/ops/embedding/kernel.h @@ -5,7 +5,8 @@ #include "native/cuda/moore/caster.cuh" #include "native/cuda/moore/polyfills.cuh" -#include "native/cuda/moore/runtime_.h" +#include +#include "native/cuda/moore/runtime_utils.h" #include "native/cuda/ops/embedding/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/gelu_infinilm/kernel.h b/src/native/cuda/moore/ops/gelu_infinilm/kernel.h index 3895a1b91..687a0a6e2 100644 --- a/src/native/cuda/moore/ops/gelu_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/gelu_infinilm/kernel.h @@ -5,7 +5,8 @@ #include "native/cuda/moore/caster.cuh" #include "native/cuda/moore/polyfills.cuh" -#include "native/cuda/moore/runtime_.h" +#include +#include "native/cuda/moore/runtime_utils.h" #include "native/cuda/ops/gelu_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/gelutanh_infinilm/kernel.h b/src/native/cuda/moore/ops/gelutanh_infinilm/kernel.h index 3c502ff0d..27041d2f9 100644 --- a/src/native/cuda/moore/ops/gelutanh_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/gelutanh_infinilm/kernel.h @@ -5,7 +5,8 @@ #include "native/cuda/moore/caster.cuh" #include "native/cuda/moore/polyfills.cuh" -#include "native/cuda/moore/runtime_.h" +#include +#include "native/cuda/moore/runtime_utils.h" #include "native/cuda/ops/gelutanh_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/kv_caching_infinilm/kernel.h b/src/native/cuda/moore/ops/kv_caching_infinilm/kernel.h index 223cd8049..314901c16 100644 --- a/src/native/cuda/moore/ops/kv_caching_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/kv_caching_infinilm/kernel.h @@ -5,7 +5,8 @@ #include "native/cuda/moore/caster.cuh" #include "native/cuda/moore/polyfills.cuh" -#include "native/cuda/moore/runtime_.h" +#include +#include "native/cuda/moore/runtime_utils.h" #include "native/cuda/ops/kv_caching_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/paged_attention_infinilm/kernel.h b/src/native/cuda/moore/ops/paged_attention_infinilm/kernel.h index 7ee03230e..ad0fd4ac8 100644 --- a/src/native/cuda/moore/ops/paged_attention_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/paged_attention_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/moore/caster.cuh" -#include "native/cuda/moore/runtime_.h" +#include +#include "native/cuda/moore/runtime_utils.h" #include "native/cuda/ops/paged_attention_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/paged_attention_prefill_infinilm/kernel.h b/src/native/cuda/moore/ops/paged_attention_prefill_infinilm/kernel.h index 71c06b1f2..6a27957d4 100644 --- a/src/native/cuda/moore/ops/paged_attention_prefill_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/paged_attention_prefill_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/moore/caster.cuh" -#include "native/cuda/moore/runtime_.h" +#include +#include "native/cuda/moore/runtime_utils.h" #include "native/cuda/ops/paged_attention_prefill_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/paged_caching_infinilm/kernel.h b/src/native/cuda/moore/ops/paged_caching_infinilm/kernel.h index 0c6226687..eb90900cc 100644 --- a/src/native/cuda/moore/ops/paged_caching_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/paged_caching_infinilm/kernel.h @@ -8,7 +8,8 @@ // clang-format on #include "native/cuda/moore/caster.cuh" -#include "native/cuda/moore/runtime_.h" +#include +#include "native/cuda/moore/runtime_utils.h" #include "native/cuda/ops/paged_caching_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/random_sample_infinilm/kernel.h b/src/native/cuda/moore/ops/random_sample_infinilm/kernel.h index dc418de6e..283b3806f 100644 --- a/src/native/cuda/moore/ops/random_sample_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/random_sample_infinilm/kernel.h @@ -8,7 +8,8 @@ // clang-format on #include "native/cuda/moore/caster.cuh" -#include "native/cuda/moore/runtime_.h" +#include +#include "native/cuda/moore/runtime_utils.h" #include "native/cuda/ops/random_sample_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/rearrange_infinilm/kernel.h b/src/native/cuda/moore/ops/rearrange_infinilm/kernel.h index bd8e69896..4d5e9606b 100644 --- a/src/native/cuda/moore/ops/rearrange_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/rearrange_infinilm/kernel.h @@ -5,7 +5,8 @@ #include "native/cuda/moore/caster.cuh" #include "native/cuda/moore/polyfills.cuh" -#include "native/cuda/moore/runtime_.h" +#include +#include "native/cuda/moore/runtime_utils.h" #include "native/cuda/ops/rearrange_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/relu_infinilm/kernel.h b/src/native/cuda/moore/ops/relu_infinilm/kernel.h index 4de90b8fa..a01a3821a 100644 --- a/src/native/cuda/moore/ops/relu_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/relu_infinilm/kernel.h @@ -5,7 +5,8 @@ #include "native/cuda/moore/caster.cuh" #include "native/cuda/moore/polyfills.cuh" -#include "native/cuda/moore/runtime_.h" +#include +#include "native/cuda/moore/runtime_utils.h" #include "native/cuda/ops/relu_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/rms_norm/kernel.h b/src/native/cuda/moore/ops/rms_norm/kernel.h index 0f160017d..998a98cdf 100644 --- a/src/native/cuda/moore/ops/rms_norm/kernel.h +++ b/src/native/cuda/moore/ops/rms_norm/kernel.h @@ -8,7 +8,8 @@ // clang-format on #include "native/cuda/moore/caster.cuh" -#include "native/cuda/moore/runtime_.h" +#include +#include "native/cuda/moore/runtime_utils.h" #include "native/cuda/ops/rms_norm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/rotary_embedding_infinilm/kernel.h b/src/native/cuda/moore/ops/rotary_embedding_infinilm/kernel.h index 7278c9465..71b03cf78 100644 --- a/src/native/cuda/moore/ops/rotary_embedding_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/rotary_embedding_infinilm/kernel.h @@ -8,7 +8,8 @@ // clang-format on #include "native/cuda/moore/caster.cuh" -#include "native/cuda/moore/runtime_.h" +#include +#include "native/cuda/moore/runtime_utils.h" #include "native/cuda/ops/rotary_embedding_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/sigmoid_infinilm/kernel.h b/src/native/cuda/moore/ops/sigmoid_infinilm/kernel.h index 62a403937..bfba5aa9d 100644 --- a/src/native/cuda/moore/ops/sigmoid_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/sigmoid_infinilm/kernel.h @@ -5,7 +5,8 @@ #include "native/cuda/moore/caster.cuh" #include "native/cuda/moore/polyfills.cuh" -#include "native/cuda/moore/runtime_.h" +#include +#include "native/cuda/moore/runtime_utils.h" #include "native/cuda/ops/sigmoid_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/silu/kernel.h b/src/native/cuda/moore/ops/silu/kernel.h index b7603a9fc..110d145a9 100644 --- a/src/native/cuda/moore/ops/silu/kernel.h +++ b/src/native/cuda/moore/ops/silu/kernel.h @@ -8,7 +8,8 @@ // clang-format on #include "native/cuda/moore/caster.cuh" -#include "native/cuda/moore/runtime_.h" +#include +#include "native/cuda/moore/runtime_utils.h" #include "native/cuda/ops/silu/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/silu_and_mul_infinilm/kernel.h b/src/native/cuda/moore/ops/silu_and_mul_infinilm/kernel.h index fd0f36209..97d394010 100644 --- a/src/native/cuda/moore/ops/silu_and_mul_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/silu_and_mul_infinilm/kernel.h @@ -5,7 +5,8 @@ #include "native/cuda/moore/caster.cuh" #include "native/cuda/moore/polyfills.cuh" -#include "native/cuda/moore/runtime_.h" +#include +#include "native/cuda/moore/runtime_utils.h" #include "native/cuda/ops/silu_and_mul_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/softmax_infinilm/kernel.h b/src/native/cuda/moore/ops/softmax_infinilm/kernel.h index a1d91f9b8..1f1318bc7 100644 --- a/src/native/cuda/moore/ops/softmax_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/softmax_infinilm/kernel.h @@ -5,7 +5,8 @@ #include "native/cuda/moore/caster.cuh" #include "native/cuda/moore/polyfills.cuh" -#include "native/cuda/moore/runtime_.h" +#include +#include "native/cuda/moore/runtime_utils.h" #include "native/cuda/ops/softmax_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/swiglu/kernel.h b/src/native/cuda/moore/ops/swiglu/kernel.h index ac54bff50..9ae685e9d 100644 --- a/src/native/cuda/moore/ops/swiglu/kernel.h +++ b/src/native/cuda/moore/ops/swiglu/kernel.h @@ -9,7 +9,8 @@ #include "native/cuda/moore/caster.cuh" #include "native/cuda/moore/polyfills.cuh" -#include "native/cuda/moore/runtime_.h" +#include +#include "native/cuda/moore/runtime_utils.h" #include "native/cuda/ops/swiglu/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/topksoftmax_infinilm/kernel.h b/src/native/cuda/moore/ops/topksoftmax_infinilm/kernel.h index 5c1461b50..d202e0090 100644 --- a/src/native/cuda/moore/ops/topksoftmax_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/topksoftmax_infinilm/kernel.h @@ -5,7 +5,8 @@ #include "native/cuda/moore/caster.cuh" #include "native/cuda/moore/polyfills.cuh" -#include "native/cuda/moore/runtime_.h" +#include +#include "native/cuda/moore/runtime_utils.h" #include "native/cuda/ops/topksoftmax_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/zeros_infinilm/kernel.h b/src/native/cuda/moore/ops/zeros_infinilm/kernel.h index 1e929a78e..8e79d44f8 100644 --- a/src/native/cuda/moore/ops/zeros_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/zeros_infinilm/kernel.h @@ -5,7 +5,8 @@ #include "native/cuda/moore/caster.cuh" #include "native/cuda/moore/polyfills.cuh" -#include "native/cuda/moore/runtime_.h" +#include +#include "native/cuda/moore/runtime_utils.h" #include "native/cuda/ops/zeros_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/runtime_.h b/src/native/cuda/moore/runtime_.h deleted file mode 100644 index bc960d949..000000000 --- a/src/native/cuda/moore/runtime_.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef INFINI_OPS_MOORE_RUNTIME_H_ -#define INFINI_OPS_MOORE_RUNTIME_H_ - -#include -#include "native/cuda/moore/runtime_utils.h" -#include "runtime.h" - -#endif diff --git a/src/native/cuda/nvidia/blas.h b/src/native/cuda/nvidia/blas.h index 7cdfd2c62..3d76a2f38 100644 --- a/src/native/cuda/nvidia/blas.h +++ b/src/native/cuda/nvidia/blas.h @@ -10,7 +10,8 @@ #include "data_type.h" #include "native/cuda/blas.h" #include "native/cuda/nvidia/blas_utils.h" -#include "native/cuda/nvidia/runtime_.h" +#include +#include "native/cuda/nvidia/runtime_utils.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/caster.cuh b/src/native/cuda/nvidia/caster.cuh index b8a366e26..9aeff21f7 100644 --- a/src/native/cuda/nvidia/caster.cuh +++ b/src/native/cuda/nvidia/caster.cuh @@ -2,7 +2,7 @@ #define INFINI_OPS_NVIDIA_CASTER__H_ #include "native/cuda/caster.cuh" -#include "native/cuda/nvidia/data_type_.h" +#include namespace infini::ops { diff --git a/src/native/cuda/nvidia/data_type_.h b/src/native/cuda/nvidia/data_type_.h deleted file mode 100644 index 7f1fdc9e8..000000000 --- a/src/native/cuda/nvidia/data_type_.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef INFINI_OPS_NVIDIA_DATA_TYPE__H_ -#define INFINI_OPS_NVIDIA_DATA_TYPE__H_ - -#include - -namespace infini::ops { - -using infini::rt::cuda_bfloat16; -using infini::rt::cuda_bfloat162; - -} // namespace infini::ops - -#endif diff --git a/src/native/cuda/nvidia/device_.h b/src/native/cuda/nvidia/device_.h deleted file mode 100644 index b46caf035..000000000 --- a/src/native/cuda/nvidia/device_.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef INFINI_OPS_NVIDIA_DEVICE__H_ -#define INFINI_OPS_NVIDIA_DEVICE__H_ - -#include - -#endif diff --git a/src/native/cuda/nvidia/ops/add/kernel.h b/src/native/cuda/nvidia/ops/add/kernel.h index 373f5a775..0be5b5c4c 100644 --- a/src/native/cuda/nvidia/ops/add/kernel.h +++ b/src/native/cuda/nvidia/ops/add/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include "native/cuda/nvidia/runtime_.h" +#include +#include "native/cuda/nvidia/runtime_utils.h" #include "native/cuda/ops/add/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/add_rms_norm/kernel.h b/src/native/cuda/nvidia/ops/add_rms_norm/kernel.h index 2bb6f6051..bec736bc9 100644 --- a/src/native/cuda/nvidia/ops/add_rms_norm/kernel.h +++ b/src/native/cuda/nvidia/ops/add_rms_norm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include "native/cuda/nvidia/runtime_.h" +#include +#include "native/cuda/nvidia/runtime_utils.h" #include "native/cuda/ops/add_rms_norm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/causal_softmax/kernel.h b/src/native/cuda/nvidia/ops/causal_softmax/kernel.h index 5a7e18a94..19438b9f3 100644 --- a/src/native/cuda/nvidia/ops/causal_softmax/kernel.h +++ b/src/native/cuda/nvidia/ops/causal_softmax/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include "native/cuda/nvidia/runtime_.h" +#include +#include "native/cuda/nvidia/runtime_utils.h" #include "native/cuda/ops/causal_softmax/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/conv_infinilm/kernel.h b/src/native/cuda/nvidia/ops/conv_infinilm/kernel.h index 3cd648895..6c83855db 100644 --- a/src/native/cuda/nvidia/ops/conv_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/conv_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include "native/cuda/nvidia/runtime_.h" +#include +#include "native/cuda/nvidia/runtime_utils.h" #include "native/cuda/ops/conv_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/embedding/kernel.h b/src/native/cuda/nvidia/ops/embedding/kernel.h index f05f155ae..c97d161cc 100644 --- a/src/native/cuda/nvidia/ops/embedding/kernel.h +++ b/src/native/cuda/nvidia/ops/embedding/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include "native/cuda/nvidia/runtime_.h" +#include +#include "native/cuda/nvidia/runtime_utils.h" #include "native/cuda/ops/embedding/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/gelu_infinilm/kernel.h b/src/native/cuda/nvidia/ops/gelu_infinilm/kernel.h index e1825b449..99f709a85 100644 --- a/src/native/cuda/nvidia/ops/gelu_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/gelu_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include "native/cuda/nvidia/runtime_.h" +#include +#include "native/cuda/nvidia/runtime_utils.h" #include "native/cuda/ops/gelu_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/gelutanh_infinilm/kernel.h b/src/native/cuda/nvidia/ops/gelutanh_infinilm/kernel.h index 15fff6a06..ecb60de1b 100644 --- a/src/native/cuda/nvidia/ops/gelutanh_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/gelutanh_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include "native/cuda/nvidia/runtime_.h" +#include +#include "native/cuda/nvidia/runtime_utils.h" #include "native/cuda/ops/gelutanh_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/gemm/cublaslt.h b/src/native/cuda/nvidia/ops/gemm/cublaslt.h index dee5d9a2e..28f831991 100644 --- a/src/native/cuda/nvidia/ops/gemm/cublaslt.h +++ b/src/native/cuda/nvidia/ops/gemm/cublaslt.h @@ -10,7 +10,8 @@ #include "base/gemm.h" #include "native/cuda/nvidia/blas_utils.h" -#include "native/cuda/nvidia/runtime_.h" +#include +#include "native/cuda/nvidia/runtime_utils.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/kv_caching_infinilm/kernel.h b/src/native/cuda/nvidia/ops/kv_caching_infinilm/kernel.h index b4e7405bf..8c52ab5f1 100644 --- a/src/native/cuda/nvidia/ops/kv_caching_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/kv_caching_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include "native/cuda/nvidia/runtime_.h" +#include +#include "native/cuda/nvidia/runtime_utils.h" #include "native/cuda/ops/kv_caching_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/paged_attention_infinilm/kernel.h b/src/native/cuda/nvidia/ops/paged_attention_infinilm/kernel.h index 08286ffdf..9c329a2bf 100644 --- a/src/native/cuda/nvidia/ops/paged_attention_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/paged_attention_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include "native/cuda/nvidia/runtime_.h" +#include +#include "native/cuda/nvidia/runtime_utils.h" #include "native/cuda/ops/paged_attention_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/paged_attention_prefill_infinilm/kernel.h b/src/native/cuda/nvidia/ops/paged_attention_prefill_infinilm/kernel.h index 809590e6b..e1e7d7f39 100644 --- a/src/native/cuda/nvidia/ops/paged_attention_prefill_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/paged_attention_prefill_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include "native/cuda/nvidia/runtime_.h" +#include +#include "native/cuda/nvidia/runtime_utils.h" #include "native/cuda/ops/paged_attention_prefill_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/paged_caching_infinilm/kernel.h b/src/native/cuda/nvidia/ops/paged_caching_infinilm/kernel.h index 524eb0c71..be581b128 100644 --- a/src/native/cuda/nvidia/ops/paged_caching_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/paged_caching_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include "native/cuda/nvidia/runtime_.h" +#include +#include "native/cuda/nvidia/runtime_utils.h" #include "native/cuda/ops/paged_caching_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/random_sample_infinilm/kernel.h b/src/native/cuda/nvidia/ops/random_sample_infinilm/kernel.h index f2608bf64..0e9764f1c 100644 --- a/src/native/cuda/nvidia/ops/random_sample_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/random_sample_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include "native/cuda/nvidia/runtime_.h" +#include +#include "native/cuda/nvidia/runtime_utils.h" #include "native/cuda/ops/random_sample_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/rearrange_infinilm/kernel.h b/src/native/cuda/nvidia/ops/rearrange_infinilm/kernel.h index 8ab25c69a..9015e9ec1 100644 --- a/src/native/cuda/nvidia/ops/rearrange_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/rearrange_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include "native/cuda/nvidia/runtime_.h" +#include +#include "native/cuda/nvidia/runtime_utils.h" #include "native/cuda/ops/rearrange_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/relu_infinilm/kernel.h b/src/native/cuda/nvidia/ops/relu_infinilm/kernel.h index 37f140caa..0c1b9e04f 100644 --- a/src/native/cuda/nvidia/ops/relu_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/relu_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include "native/cuda/nvidia/runtime_.h" +#include +#include "native/cuda/nvidia/runtime_utils.h" #include "native/cuda/ops/relu_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/rms_norm/kernel.h b/src/native/cuda/nvidia/ops/rms_norm/kernel.h index 74cfab17a..09ad3332b 100644 --- a/src/native/cuda/nvidia/ops/rms_norm/kernel.h +++ b/src/native/cuda/nvidia/ops/rms_norm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include "native/cuda/nvidia/runtime_.h" +#include +#include "native/cuda/nvidia/runtime_utils.h" #include "native/cuda/ops/rms_norm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/rotary_embedding_infinilm/kernel.h b/src/native/cuda/nvidia/ops/rotary_embedding_infinilm/kernel.h index 93014ad0d..f80dfdbf0 100644 --- a/src/native/cuda/nvidia/ops/rotary_embedding_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/rotary_embedding_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include "native/cuda/nvidia/runtime_.h" +#include +#include "native/cuda/nvidia/runtime_utils.h" #include "native/cuda/ops/rotary_embedding_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/sigmoid_infinilm/kernel.h b/src/native/cuda/nvidia/ops/sigmoid_infinilm/kernel.h index 44c504ee3..eeaff3f26 100644 --- a/src/native/cuda/nvidia/ops/sigmoid_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/sigmoid_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include "native/cuda/nvidia/runtime_.h" +#include +#include "native/cuda/nvidia/runtime_utils.h" #include "native/cuda/ops/sigmoid_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/silu/kernel.h b/src/native/cuda/nvidia/ops/silu/kernel.h index 7f5001f71..f57c689c4 100644 --- a/src/native/cuda/nvidia/ops/silu/kernel.h +++ b/src/native/cuda/nvidia/ops/silu/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include "native/cuda/nvidia/runtime_.h" +#include +#include "native/cuda/nvidia/runtime_utils.h" #include "native/cuda/ops/silu/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/silu_and_mul_infinilm/kernel.h b/src/native/cuda/nvidia/ops/silu_and_mul_infinilm/kernel.h index 9f908b1fd..4e1958c2c 100644 --- a/src/native/cuda/nvidia/ops/silu_and_mul_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/silu_and_mul_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include "native/cuda/nvidia/runtime_.h" +#include +#include "native/cuda/nvidia/runtime_utils.h" #include "native/cuda/ops/silu_and_mul_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/softmax_infinilm/kernel.h b/src/native/cuda/nvidia/ops/softmax_infinilm/kernel.h index d00d143c4..03e422ca0 100644 --- a/src/native/cuda/nvidia/ops/softmax_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/softmax_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include "native/cuda/nvidia/runtime_.h" +#include +#include "native/cuda/nvidia/runtime_utils.h" #include "native/cuda/ops/softmax_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/swiglu/kernel.h b/src/native/cuda/nvidia/ops/swiglu/kernel.h index fd58f29ef..a1e366676 100644 --- a/src/native/cuda/nvidia/ops/swiglu/kernel.h +++ b/src/native/cuda/nvidia/ops/swiglu/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include "native/cuda/nvidia/runtime_.h" +#include +#include "native/cuda/nvidia/runtime_utils.h" #include "native/cuda/ops/swiglu/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/topksoftmax_infinilm/kernel.h b/src/native/cuda/nvidia/ops/topksoftmax_infinilm/kernel.h index fd8897dd6..9e4fc15c3 100644 --- a/src/native/cuda/nvidia/ops/topksoftmax_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/topksoftmax_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include "native/cuda/nvidia/runtime_.h" +#include +#include "native/cuda/nvidia/runtime_utils.h" #include "native/cuda/ops/topksoftmax_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/zeros_infinilm/kernel.h b/src/native/cuda/nvidia/ops/zeros_infinilm/kernel.h index a6092c5dc..5b24b6d93 100644 --- a/src/native/cuda/nvidia/ops/zeros_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/zeros_infinilm/kernel.h @@ -4,7 +4,8 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include "native/cuda/nvidia/runtime_.h" +#include +#include "native/cuda/nvidia/runtime_utils.h" #include "native/cuda/ops/zeros_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/runtime_.h b/src/native/cuda/nvidia/runtime_.h deleted file mode 100644 index 5745b3ffd..000000000 --- a/src/native/cuda/nvidia/runtime_.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef INFINI_OPS_NVIDIA_RUNTIME_H_ -#define INFINI_OPS_NVIDIA_RUNTIME_H_ - -#include -#include "native/cuda/nvidia/runtime_utils.h" -#include "runtime.h" - -#endif diff --git a/src/native/cuda/ops/rotary_embedding_infinilm/kernel.cuh b/src/native/cuda/ops/rotary_embedding_infinilm/kernel.cuh index 22c904aa6..ff2725401 100644 --- a/src/native/cuda/ops/rotary_embedding_infinilm/kernel.cuh +++ b/src/native/cuda/ops/rotary_embedding_infinilm/kernel.cuh @@ -9,6 +9,9 @@ namespace infini::ops { +using infini::rt::cuda_bfloat16; +using infini::rt::cuda_bfloat162; + template struct VecTypeHelper {}; diff --git a/src/native/cuda/runtime_utils.h b/src/native/cuda/runtime_utils.h index c5fb79b0c..e28d8c554 100644 --- a/src/native/cuda/runtime_utils.h +++ b/src/native/cuda/runtime_utils.h @@ -1,7 +1,7 @@ #ifndef INFINI_OPS_CUDA_RUNTIME_UTILS_H_ #define INFINI_OPS_CUDA_RUNTIME_UTILS_H_ -#include "device.h" +#include "runtime.h" namespace infini::ops { From fe9856ae094c07123da58ada1c50f15a6c07b4e6 Mon Sep 17 00:00:00 2001 From: Jiacheng Huang Date: Thu, 18 Jun 2026 09:35:26 +0800 Subject: [PATCH 04/12] refactor: keep runtime shim diff small --- examples/runtime_api.h | 16 ++++++---------- src/native/ascend/data_type_.h | 2 +- src/native/ascend/device_.h | 6 ++++++ src/native/ascend/runtime_.h | 7 +++++++ src/native/cambricon/data_type_.h | 6 ++++++ src/native/cambricon/device_.h | 6 ++++++ src/native/cambricon/ops/rms_norm/rms_norm.h | 2 +- src/native/cambricon/runtime_.h | 7 +++++++ src/native/cuda/iluvatar/blas.h | 3 +-- src/native/cuda/iluvatar/caster.cuh | 2 +- src/native/cuda/iluvatar/data_type_.h | 13 +++++++++++++ src/native/cuda/iluvatar/device_.h | 6 ++++++ src/native/cuda/iluvatar/ops/add/kernel.h | 3 +-- .../cuda/iluvatar/ops/add_rms_norm/kernel.h | 3 +-- .../cuda/iluvatar/ops/causal_softmax/kernel.h | 3 +-- .../cuda/iluvatar/ops/conv_infinilm/kernel.h | 3 +-- src/native/cuda/iluvatar/ops/embedding/kernel.h | 3 +-- .../cuda/iluvatar/ops/gelu_infinilm/kernel.h | 3 +-- .../cuda/iluvatar/ops/gelutanh_infinilm/kernel.h | 3 +-- .../iluvatar/ops/kv_caching_infinilm/kernel.h | 3 +-- .../ops/paged_attention_infinilm/kernel.h | 3 +-- .../paged_attention_prefill_infinilm/kernel.h | 3 +-- .../iluvatar/ops/paged_caching_infinilm/kernel.h | 3 +-- .../iluvatar/ops/random_sample_infinilm/kernel.h | 3 +-- .../iluvatar/ops/rearrange_infinilm/kernel.h | 3 +-- .../cuda/iluvatar/ops/relu_infinilm/kernel.h | 3 +-- src/native/cuda/iluvatar/ops/rms_norm/kernel.h | 3 +-- .../ops/rotary_embedding_infinilm/kernel.h | 3 +-- .../cuda/iluvatar/ops/sigmoid_infinilm/kernel.h | 3 +-- src/native/cuda/iluvatar/ops/silu/kernel.h | 3 +-- .../iluvatar/ops/silu_and_mul_infinilm/kernel.h | 3 +-- .../cuda/iluvatar/ops/softmax_infinilm/kernel.h | 3 +-- src/native/cuda/iluvatar/ops/swiglu/kernel.h | 3 +-- .../iluvatar/ops/topksoftmax_infinilm/kernel.h | 3 +-- .../cuda/iluvatar/ops/zeros_infinilm/kernel.h | 3 +-- src/native/cuda/iluvatar/runtime_.h | 8 ++++++++ src/native/cuda/metax/blas.h | 3 +-- src/native/cuda/metax/caster.cuh | 2 +- src/native/cuda/metax/data_type_.h | 13 +++++++++++++ src/native/cuda/metax/device_.h | 6 ++++++ src/native/cuda/metax/ops/add/kernel.h | 3 +-- src/native/cuda/metax/ops/add_rms_norm/kernel.h | 3 +-- .../cuda/metax/ops/causal_softmax/kernel.h | 3 +-- src/native/cuda/metax/ops/conv_infinilm/kernel.h | 3 +-- src/native/cuda/metax/ops/embedding/kernel.h | 3 +-- src/native/cuda/metax/ops/gelu_infinilm/kernel.h | 3 +-- .../cuda/metax/ops/gelutanh_infinilm/kernel.h | 3 +-- .../cuda/metax/ops/kv_caching_infinilm/kernel.h | 3 +-- .../metax/ops/paged_attention_infinilm/kernel.h | 3 +-- .../paged_attention_prefill_infinilm/kernel.h | 3 +-- .../metax/ops/paged_caching_infinilm/kernel.h | 3 +-- .../metax/ops/random_sample_infinilm/kernel.h | 3 +-- .../cuda/metax/ops/rearrange_infinilm/kernel.h | 3 +-- src/native/cuda/metax/ops/relu_infinilm/kernel.h | 3 +-- src/native/cuda/metax/ops/rms_norm/kernel.h | 3 +-- .../metax/ops/rotary_embedding_infinilm/kernel.h | 3 +-- .../cuda/metax/ops/sigmoid_infinilm/kernel.h | 3 +-- src/native/cuda/metax/ops/silu/kernel.h | 3 +-- .../metax/ops/silu_and_mul_infinilm/kernel.h | 3 +-- .../cuda/metax/ops/softmax_infinilm/kernel.h | 3 +-- src/native/cuda/metax/ops/swiglu/kernel.h | 3 +-- .../cuda/metax/ops/topksoftmax_infinilm/kernel.h | 3 +-- .../cuda/metax/ops/zeros_infinilm/kernel.h | 3 +-- src/native/cuda/metax/runtime_.h | 8 ++++++++ src/native/cuda/moore/blas.h | 3 +-- src/native/cuda/moore/caster.cuh | 2 +- src/native/cuda/moore/data_type_.h | 13 +++++++++++++ src/native/cuda/moore/device_.h | 6 ++++++ src/native/cuda/moore/ops/add/kernel.h | 3 +-- src/native/cuda/moore/ops/add_rms_norm/kernel.h | 3 +-- .../cuda/moore/ops/causal_softmax/kernel.h | 5 ++--- src/native/cuda/moore/ops/conv_infinilm/kernel.h | 3 +-- src/native/cuda/moore/ops/embedding/kernel.h | 3 +-- src/native/cuda/moore/ops/gelu_infinilm/kernel.h | 3 +-- .../cuda/moore/ops/gelutanh_infinilm/kernel.h | 3 +-- .../cuda/moore/ops/kv_caching_infinilm/kernel.h | 3 +-- .../moore/ops/paged_attention_infinilm/kernel.h | 3 +-- .../paged_attention_prefill_infinilm/kernel.h | 3 +-- .../moore/ops/paged_caching_infinilm/kernel.h | 3 +-- .../moore/ops/random_sample_infinilm/kernel.h | 3 +-- .../cuda/moore/ops/rearrange_infinilm/kernel.h | 3 +-- src/native/cuda/moore/ops/relu_infinilm/kernel.h | 3 +-- src/native/cuda/moore/ops/rms_norm/kernel.h | 3 +-- .../moore/ops/rotary_embedding_infinilm/kernel.h | 3 +-- .../cuda/moore/ops/sigmoid_infinilm/kernel.h | 3 +-- src/native/cuda/moore/ops/silu/kernel.h | 3 +-- .../moore/ops/silu_and_mul_infinilm/kernel.h | 3 +-- .../cuda/moore/ops/softmax_infinilm/kernel.h | 3 +-- src/native/cuda/moore/ops/swiglu/kernel.h | 3 +-- .../cuda/moore/ops/topksoftmax_infinilm/kernel.h | 3 +-- .../cuda/moore/ops/zeros_infinilm/kernel.h | 3 +-- src/native/cuda/moore/runtime_.h | 8 ++++++++ src/native/cuda/nvidia/blas.h | 3 +-- src/native/cuda/nvidia/caster.cuh | 2 +- src/native/cuda/nvidia/data_type_.h | 13 +++++++++++++ src/native/cuda/nvidia/device_.h | 6 ++++++ src/native/cuda/nvidia/ops/add/kernel.h | 3 +-- src/native/cuda/nvidia/ops/add_rms_norm/kernel.h | 3 +-- .../cuda/nvidia/ops/causal_softmax/kernel.h | 3 +-- .../cuda/nvidia/ops/conv_infinilm/kernel.h | 3 +-- src/native/cuda/nvidia/ops/embedding/kernel.h | 3 +-- .../cuda/nvidia/ops/gelu_infinilm/kernel.h | 3 +-- .../cuda/nvidia/ops/gelutanh_infinilm/kernel.h | 3 +-- src/native/cuda/nvidia/ops/gemm/cublaslt.h | 3 +-- .../cuda/nvidia/ops/kv_caching_infinilm/kernel.h | 3 +-- .../nvidia/ops/paged_attention_infinilm/kernel.h | 3 +-- .../paged_attention_prefill_infinilm/kernel.h | 3 +-- .../nvidia/ops/paged_caching_infinilm/kernel.h | 3 +-- .../nvidia/ops/random_sample_infinilm/kernel.h | 3 +-- .../cuda/nvidia/ops/rearrange_infinilm/kernel.h | 3 +-- .../cuda/nvidia/ops/relu_infinilm/kernel.h | 3 +-- src/native/cuda/nvidia/ops/rms_norm/kernel.h | 3 +-- .../ops/rotary_embedding_infinilm/kernel.h | 3 +-- .../cuda/nvidia/ops/sigmoid_infinilm/kernel.h | 3 +-- src/native/cuda/nvidia/ops/silu/kernel.h | 3 +-- .../nvidia/ops/silu_and_mul_infinilm/kernel.h | 3 +-- .../cuda/nvidia/ops/softmax_infinilm/kernel.h | 3 +-- src/native/cuda/nvidia/ops/swiglu/kernel.h | 3 +-- .../nvidia/ops/topksoftmax_infinilm/kernel.h | 3 +-- .../cuda/nvidia/ops/zeros_infinilm/kernel.h | 3 +-- src/native/cuda/nvidia/runtime_.h | 8 ++++++++ .../ops/rotary_embedding_infinilm/kernel.cuh | 3 --- src/native/cuda/runtime_utils.h | 2 +- 123 files changed, 251 insertions(+), 215 deletions(-) create mode 100644 src/native/ascend/device_.h create mode 100644 src/native/ascend/runtime_.h create mode 100644 src/native/cambricon/data_type_.h create mode 100644 src/native/cambricon/device_.h create mode 100644 src/native/cambricon/runtime_.h create mode 100644 src/native/cuda/iluvatar/data_type_.h create mode 100644 src/native/cuda/iluvatar/device_.h create mode 100644 src/native/cuda/iluvatar/runtime_.h create mode 100644 src/native/cuda/metax/data_type_.h create mode 100644 src/native/cuda/metax/device_.h create mode 100644 src/native/cuda/metax/runtime_.h create mode 100644 src/native/cuda/moore/data_type_.h create mode 100644 src/native/cuda/moore/device_.h create mode 100644 src/native/cuda/moore/runtime_.h create mode 100644 src/native/cuda/nvidia/data_type_.h create mode 100644 src/native/cuda/nvidia/device_.h create mode 100644 src/native/cuda/nvidia/runtime_.h diff --git a/examples/runtime_api.h b/examples/runtime_api.h index 93681c8f8..292b40408 100644 --- a/examples/runtime_api.h +++ b/examples/runtime_api.h @@ -7,26 +7,22 @@ #ifdef WITH_NVIDIA #include "native/cuda/nvidia/ops/gemm/cublas.h" #include "native/cuda/nvidia/ops/gemm/cublaslt.h" -#include -#include "native/cuda/nvidia/runtime_utils.h" +#include "native/cuda/nvidia/runtime_.h" #elif WITH_ILUVATAR #include "native/cuda/iluvatar/ops/gemm/cublas.h" -#include -#include "native/cuda/iluvatar/runtime_utils.h" +#include "native/cuda/iluvatar/runtime_.h" #elif WITH_METAX #include "native/cuda/metax/ops/gemm/mcblas.h" -#include -#include "native/cuda/metax/runtime_utils.h" +#include "native/cuda/metax/runtime_.h" #elif WITH_CAMBRICON #include "native/cambricon/ops/gemm/cnblas.h" -#include +#include "native/cambricon/runtime_.h" #elif WITH_MOORE #include "native/cuda/moore/ops/gemm/mublas.h" -#include -#include "native/cuda/moore/runtime_utils.h" +#include "native/cuda/moore/runtime_.h" #elif WITH_ASCEND #include "native/ascend/ops/gemm/kernel.h" -#include +#include "native/ascend/runtime_.h" #elif WITH_CPU #include #include "native/cpu/ops/gemm/gemm.h" diff --git a/src/native/ascend/data_type_.h b/src/native/ascend/data_type_.h index 095ad99bf..81253a9b3 100644 --- a/src/native/ascend/data_type_.h +++ b/src/native/ascend/data_type_.h @@ -5,7 +5,7 @@ #include "acl/acl.h" #include "data_type.h" -#include +#include "native/ascend/device_.h" namespace infini::ops::ascend { diff --git a/src/native/ascend/device_.h b/src/native/ascend/device_.h new file mode 100644 index 000000000..8d69ab95e --- /dev/null +++ b/src/native/ascend/device_.h @@ -0,0 +1,6 @@ +#ifndef INFINI_OPS_ASCEND_DEVICE__H_ +#define INFINI_OPS_ASCEND_DEVICE__H_ + +#include + +#endif diff --git a/src/native/ascend/runtime_.h b/src/native/ascend/runtime_.h new file mode 100644 index 000000000..19e374ae2 --- /dev/null +++ b/src/native/ascend/runtime_.h @@ -0,0 +1,7 @@ +#ifndef INFINI_OPS_ASCEND_RUNTIME__H_ +#define INFINI_OPS_ASCEND_RUNTIME__H_ + +#include +#include "runtime.h" + +#endif diff --git a/src/native/cambricon/data_type_.h b/src/native/cambricon/data_type_.h new file mode 100644 index 000000000..805b311ed --- /dev/null +++ b/src/native/cambricon/data_type_.h @@ -0,0 +1,6 @@ +#ifndef INFINI_OPS_CAMBRICON_DATA_TYPE__H_ +#define INFINI_OPS_CAMBRICON_DATA_TYPE__H_ + +#include + +#endif diff --git a/src/native/cambricon/device_.h b/src/native/cambricon/device_.h new file mode 100644 index 000000000..d59054af8 --- /dev/null +++ b/src/native/cambricon/device_.h @@ -0,0 +1,6 @@ +#ifndef INFINI_OPS_CAMBRICON_DEVICE__H_ +#define INFINI_OPS_CAMBRICON_DEVICE__H_ + +#include + +#endif diff --git a/src/native/cambricon/ops/rms_norm/rms_norm.h b/src/native/cambricon/ops/rms_norm/rms_norm.h index cb91e61e9..6a9aed098 100644 --- a/src/native/cambricon/ops/rms_norm/rms_norm.h +++ b/src/native/cambricon/ops/rms_norm/rms_norm.h @@ -7,7 +7,7 @@ #include "base/rms_norm.h" #include "native/cambricon/common.h" -#include +#include "native/cambricon/data_type_.h" namespace infini::ops { diff --git a/src/native/cambricon/runtime_.h b/src/native/cambricon/runtime_.h new file mode 100644 index 000000000..a1ade0d80 --- /dev/null +++ b/src/native/cambricon/runtime_.h @@ -0,0 +1,7 @@ +#ifndef INFINI_OPS_CAMBRICON_RUNTIME_H_ +#define INFINI_OPS_CAMBRICON_RUNTIME_H_ + +#include +#include "runtime.h" + +#endif diff --git a/src/native/cuda/iluvatar/blas.h b/src/native/cuda/iluvatar/blas.h index 9115d8ca9..7e7545a40 100644 --- a/src/native/cuda/iluvatar/blas.h +++ b/src/native/cuda/iluvatar/blas.h @@ -10,8 +10,7 @@ #include "data_type.h" #include "native/cuda/blas.h" #include "native/cuda/iluvatar/blas_utils.h" -#include -#include "native/cuda/iluvatar/runtime_utils.h" +#include "native/cuda/iluvatar/runtime_.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/caster.cuh b/src/native/cuda/iluvatar/caster.cuh index 2a8a39c40..a75369570 100644 --- a/src/native/cuda/iluvatar/caster.cuh +++ b/src/native/cuda/iluvatar/caster.cuh @@ -2,7 +2,7 @@ #define INFINI_OPS_ILUVATAR_CASTER__H_ #include "native/cuda/caster.cuh" -#include +#include "native/cuda/iluvatar/data_type_.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/data_type_.h b/src/native/cuda/iluvatar/data_type_.h new file mode 100644 index 000000000..b9e3ed8a3 --- /dev/null +++ b/src/native/cuda/iluvatar/data_type_.h @@ -0,0 +1,13 @@ +#ifndef INFINI_OPS_ILUVATAR_DATA_TYPE__H_ +#define INFINI_OPS_ILUVATAR_DATA_TYPE__H_ + +#include + +namespace infini::ops { + +using infini::rt::cuda_bfloat16; +using infini::rt::cuda_bfloat162; + +} // namespace infini::ops + +#endif diff --git a/src/native/cuda/iluvatar/device_.h b/src/native/cuda/iluvatar/device_.h new file mode 100644 index 000000000..96743518f --- /dev/null +++ b/src/native/cuda/iluvatar/device_.h @@ -0,0 +1,6 @@ +#ifndef INFINI_OPS_ILUVATAR_DEVICE__H_ +#define INFINI_OPS_ILUVATAR_DEVICE__H_ + +#include + +#endif diff --git a/src/native/cuda/iluvatar/ops/add/kernel.h b/src/native/cuda/iluvatar/ops/add/kernel.h index a83062fae..f41aa6be4 100644 --- a/src/native/cuda/iluvatar/ops/add/kernel.h +++ b/src/native/cuda/iluvatar/ops/add/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include -#include "native/cuda/iluvatar/runtime_utils.h" +#include "native/cuda/iluvatar/runtime_.h" #include "native/cuda/ops/add/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/add_rms_norm/kernel.h b/src/native/cuda/iluvatar/ops/add_rms_norm/kernel.h index f95ca7317..828b93bbc 100644 --- a/src/native/cuda/iluvatar/ops/add_rms_norm/kernel.h +++ b/src/native/cuda/iluvatar/ops/add_rms_norm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include -#include "native/cuda/iluvatar/runtime_utils.h" +#include "native/cuda/iluvatar/runtime_.h" #include "native/cuda/ops/add_rms_norm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/causal_softmax/kernel.h b/src/native/cuda/iluvatar/ops/causal_softmax/kernel.h index 733a59272..8c35c87ce 100644 --- a/src/native/cuda/iluvatar/ops/causal_softmax/kernel.h +++ b/src/native/cuda/iluvatar/ops/causal_softmax/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include -#include "native/cuda/iluvatar/runtime_utils.h" +#include "native/cuda/iluvatar/runtime_.h" #include "native/cuda/ops/causal_softmax/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/conv_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/conv_infinilm/kernel.h index 9439d2214..7eed0d8ab 100644 --- a/src/native/cuda/iluvatar/ops/conv_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/conv_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include -#include "native/cuda/iluvatar/runtime_utils.h" +#include "native/cuda/iluvatar/runtime_.h" #include "native/cuda/ops/conv_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/embedding/kernel.h b/src/native/cuda/iluvatar/ops/embedding/kernel.h index ac8f4cced..db77867f6 100644 --- a/src/native/cuda/iluvatar/ops/embedding/kernel.h +++ b/src/native/cuda/iluvatar/ops/embedding/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include -#include "native/cuda/iluvatar/runtime_utils.h" +#include "native/cuda/iluvatar/runtime_.h" #include "native/cuda/ops/embedding/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/gelu_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/gelu_infinilm/kernel.h index 34c25b31f..edbb424cc 100644 --- a/src/native/cuda/iluvatar/ops/gelu_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/gelu_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include -#include "native/cuda/iluvatar/runtime_utils.h" +#include "native/cuda/iluvatar/runtime_.h" #include "native/cuda/ops/gelu_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/gelutanh_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/gelutanh_infinilm/kernel.h index bd99df025..1ef9f1345 100644 --- a/src/native/cuda/iluvatar/ops/gelutanh_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/gelutanh_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include -#include "native/cuda/iluvatar/runtime_utils.h" +#include "native/cuda/iluvatar/runtime_.h" #include "native/cuda/ops/gelutanh_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/kv_caching_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/kv_caching_infinilm/kernel.h index 0f7cbfff1..3f3018864 100644 --- a/src/native/cuda/iluvatar/ops/kv_caching_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/kv_caching_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include -#include "native/cuda/iluvatar/runtime_utils.h" +#include "native/cuda/iluvatar/runtime_.h" #include "native/cuda/ops/kv_caching_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/paged_attention_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/paged_attention_infinilm/kernel.h index d7757426e..3284d3256 100644 --- a/src/native/cuda/iluvatar/ops/paged_attention_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/paged_attention_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include -#include "native/cuda/iluvatar/runtime_utils.h" +#include "native/cuda/iluvatar/runtime_.h" #include "native/cuda/ops/paged_attention_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/paged_attention_prefill_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/paged_attention_prefill_infinilm/kernel.h index 78f34c99c..d2ad9806e 100644 --- a/src/native/cuda/iluvatar/ops/paged_attention_prefill_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/paged_attention_prefill_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include -#include "native/cuda/iluvatar/runtime_utils.h" +#include "native/cuda/iluvatar/runtime_.h" #include "native/cuda/ops/paged_attention_prefill_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/paged_caching_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/paged_caching_infinilm/kernel.h index c280477dc..350e43c4e 100644 --- a/src/native/cuda/iluvatar/ops/paged_caching_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/paged_caching_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include -#include "native/cuda/iluvatar/runtime_utils.h" +#include "native/cuda/iluvatar/runtime_.h" #include "native/cuda/ops/paged_caching_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/random_sample_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/random_sample_infinilm/kernel.h index 0c453d5af..7a1dcc5de 100644 --- a/src/native/cuda/iluvatar/ops/random_sample_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/random_sample_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include -#include "native/cuda/iluvatar/runtime_utils.h" +#include "native/cuda/iluvatar/runtime_.h" #include "native/cuda/ops/random_sample_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/rearrange_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/rearrange_infinilm/kernel.h index 5ec8ff4b2..5ba9a9198 100644 --- a/src/native/cuda/iluvatar/ops/rearrange_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/rearrange_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include -#include "native/cuda/iluvatar/runtime_utils.h" +#include "native/cuda/iluvatar/runtime_.h" #include "native/cuda/ops/rearrange_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/relu_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/relu_infinilm/kernel.h index bd83616ee..faf1c0790 100644 --- a/src/native/cuda/iluvatar/ops/relu_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/relu_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include -#include "native/cuda/iluvatar/runtime_utils.h" +#include "native/cuda/iluvatar/runtime_.h" #include "native/cuda/ops/relu_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/rms_norm/kernel.h b/src/native/cuda/iluvatar/ops/rms_norm/kernel.h index 9e0e438fd..3230fe8b0 100644 --- a/src/native/cuda/iluvatar/ops/rms_norm/kernel.h +++ b/src/native/cuda/iluvatar/ops/rms_norm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include -#include "native/cuda/iluvatar/runtime_utils.h" +#include "native/cuda/iluvatar/runtime_.h" #include "native/cuda/ops/rms_norm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/rotary_embedding_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/rotary_embedding_infinilm/kernel.h index c6b812042..41d7a4259 100644 --- a/src/native/cuda/iluvatar/ops/rotary_embedding_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/rotary_embedding_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include -#include "native/cuda/iluvatar/runtime_utils.h" +#include "native/cuda/iluvatar/runtime_.h" #include "native/cuda/ops/rotary_embedding_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/sigmoid_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/sigmoid_infinilm/kernel.h index 2752a99b6..2d5c487ea 100644 --- a/src/native/cuda/iluvatar/ops/sigmoid_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/sigmoid_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include -#include "native/cuda/iluvatar/runtime_utils.h" +#include "native/cuda/iluvatar/runtime_.h" #include "native/cuda/ops/sigmoid_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/silu/kernel.h b/src/native/cuda/iluvatar/ops/silu/kernel.h index 32e25fa5c..dfa250544 100644 --- a/src/native/cuda/iluvatar/ops/silu/kernel.h +++ b/src/native/cuda/iluvatar/ops/silu/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include -#include "native/cuda/iluvatar/runtime_utils.h" +#include "native/cuda/iluvatar/runtime_.h" #include "native/cuda/ops/silu/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/silu_and_mul_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/silu_and_mul_infinilm/kernel.h index b12a6e49d..b29469062 100644 --- a/src/native/cuda/iluvatar/ops/silu_and_mul_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/silu_and_mul_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include -#include "native/cuda/iluvatar/runtime_utils.h" +#include "native/cuda/iluvatar/runtime_.h" #include "native/cuda/ops/silu_and_mul_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/softmax_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/softmax_infinilm/kernel.h index b28f40b6c..b36ff0638 100644 --- a/src/native/cuda/iluvatar/ops/softmax_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/softmax_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include -#include "native/cuda/iluvatar/runtime_utils.h" +#include "native/cuda/iluvatar/runtime_.h" #include "native/cuda/ops/softmax_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/swiglu/kernel.h b/src/native/cuda/iluvatar/ops/swiglu/kernel.h index 2e71c4ca6..b03ef19c9 100644 --- a/src/native/cuda/iluvatar/ops/swiglu/kernel.h +++ b/src/native/cuda/iluvatar/ops/swiglu/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include -#include "native/cuda/iluvatar/runtime_utils.h" +#include "native/cuda/iluvatar/runtime_.h" #include "native/cuda/ops/swiglu/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/topksoftmax_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/topksoftmax_infinilm/kernel.h index 06c116f54..2efe8807b 100644 --- a/src/native/cuda/iluvatar/ops/topksoftmax_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/topksoftmax_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include -#include "native/cuda/iluvatar/runtime_utils.h" +#include "native/cuda/iluvatar/runtime_.h" #include "native/cuda/ops/topksoftmax_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/ops/zeros_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/zeros_infinilm/kernel.h index af0ede96d..0360063ea 100644 --- a/src/native/cuda/iluvatar/ops/zeros_infinilm/kernel.h +++ b/src/native/cuda/iluvatar/ops/zeros_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/iluvatar/caster.cuh" -#include -#include "native/cuda/iluvatar/runtime_utils.h" +#include "native/cuda/iluvatar/runtime_.h" #include "native/cuda/ops/zeros_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/iluvatar/runtime_.h b/src/native/cuda/iluvatar/runtime_.h new file mode 100644 index 000000000..28f2c67bc --- /dev/null +++ b/src/native/cuda/iluvatar/runtime_.h @@ -0,0 +1,8 @@ +#ifndef INFINI_OPS_ILUVATAR_RUNTIME_H_ +#define INFINI_OPS_ILUVATAR_RUNTIME_H_ + +#include +#include "native/cuda/iluvatar/runtime_utils.h" +#include "runtime.h" + +#endif diff --git a/src/native/cuda/metax/blas.h b/src/native/cuda/metax/blas.h index c24da8774..68e5183e1 100644 --- a/src/native/cuda/metax/blas.h +++ b/src/native/cuda/metax/blas.h @@ -10,8 +10,7 @@ #include "data_type.h" #include "native/cuda/blas.h" #include "native/cuda/metax/blas_utils.h" -#include -#include "native/cuda/metax/runtime_utils.h" +#include "native/cuda/metax/runtime_.h" namespace infini::ops { diff --git a/src/native/cuda/metax/caster.cuh b/src/native/cuda/metax/caster.cuh index a39fb796a..6ad565870 100644 --- a/src/native/cuda/metax/caster.cuh +++ b/src/native/cuda/metax/caster.cuh @@ -2,7 +2,7 @@ #define INFINI_OPS_METAX_CASTER__H_ #include "native/cuda/caster.cuh" -#include +#include "native/cuda/metax/data_type_.h" namespace infini::ops { diff --git a/src/native/cuda/metax/data_type_.h b/src/native/cuda/metax/data_type_.h new file mode 100644 index 000000000..003dd48c0 --- /dev/null +++ b/src/native/cuda/metax/data_type_.h @@ -0,0 +1,13 @@ +#ifndef INFINI_OPS_METAX_DATA_TYPE__H_ +#define INFINI_OPS_METAX_DATA_TYPE__H_ + +#include + +namespace infini::ops { + +using infini::rt::cuda_bfloat16; +using infini::rt::cuda_bfloat162; + +} // namespace infini::ops + +#endif diff --git a/src/native/cuda/metax/device_.h b/src/native/cuda/metax/device_.h new file mode 100644 index 000000000..7aa554ea2 --- /dev/null +++ b/src/native/cuda/metax/device_.h @@ -0,0 +1,6 @@ +#ifndef INFINI_OPS_METAX_DEVICE__H_ +#define INFINI_OPS_METAX_DEVICE__H_ + +#include + +#endif diff --git a/src/native/cuda/metax/ops/add/kernel.h b/src/native/cuda/metax/ops/add/kernel.h index 2189c8578..7059bb7c1 100644 --- a/src/native/cuda/metax/ops/add/kernel.h +++ b/src/native/cuda/metax/ops/add/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/metax/caster.cuh" -#include -#include "native/cuda/metax/runtime_utils.h" +#include "native/cuda/metax/runtime_.h" #include "native/cuda/ops/add/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/add_rms_norm/kernel.h b/src/native/cuda/metax/ops/add_rms_norm/kernel.h index f77705f07..564ceba61 100644 --- a/src/native/cuda/metax/ops/add_rms_norm/kernel.h +++ b/src/native/cuda/metax/ops/add_rms_norm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/metax/caster.cuh" -#include -#include "native/cuda/metax/runtime_utils.h" +#include "native/cuda/metax/runtime_.h" #include "native/cuda/ops/add_rms_norm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/causal_softmax/kernel.h b/src/native/cuda/metax/ops/causal_softmax/kernel.h index f5a1b084e..426465984 100644 --- a/src/native/cuda/metax/ops/causal_softmax/kernel.h +++ b/src/native/cuda/metax/ops/causal_softmax/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/metax/caster.cuh" -#include -#include "native/cuda/metax/runtime_utils.h" +#include "native/cuda/metax/runtime_.h" #include "native/cuda/ops/causal_softmax/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/conv_infinilm/kernel.h b/src/native/cuda/metax/ops/conv_infinilm/kernel.h index 8c36eb6c4..288f666c4 100644 --- a/src/native/cuda/metax/ops/conv_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/conv_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/metax/caster.cuh" -#include -#include "native/cuda/metax/runtime_utils.h" +#include "native/cuda/metax/runtime_.h" #include "native/cuda/ops/conv_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/embedding/kernel.h b/src/native/cuda/metax/ops/embedding/kernel.h index 8a0bc8893..279d9e33f 100644 --- a/src/native/cuda/metax/ops/embedding/kernel.h +++ b/src/native/cuda/metax/ops/embedding/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/metax/caster.cuh" -#include -#include "native/cuda/metax/runtime_utils.h" +#include "native/cuda/metax/runtime_.h" #include "native/cuda/ops/embedding/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/gelu_infinilm/kernel.h b/src/native/cuda/metax/ops/gelu_infinilm/kernel.h index 6f3df83f1..124f1dd6c 100644 --- a/src/native/cuda/metax/ops/gelu_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/gelu_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/metax/caster.cuh" -#include -#include "native/cuda/metax/runtime_utils.h" +#include "native/cuda/metax/runtime_.h" #include "native/cuda/ops/gelu_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/gelutanh_infinilm/kernel.h b/src/native/cuda/metax/ops/gelutanh_infinilm/kernel.h index 0b871847f..ae3dae8bc 100644 --- a/src/native/cuda/metax/ops/gelutanh_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/gelutanh_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/metax/caster.cuh" -#include -#include "native/cuda/metax/runtime_utils.h" +#include "native/cuda/metax/runtime_.h" #include "native/cuda/ops/gelutanh_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/kv_caching_infinilm/kernel.h b/src/native/cuda/metax/ops/kv_caching_infinilm/kernel.h index 4b30a44e4..92a24226f 100644 --- a/src/native/cuda/metax/ops/kv_caching_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/kv_caching_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/metax/caster.cuh" -#include -#include "native/cuda/metax/runtime_utils.h" +#include "native/cuda/metax/runtime_.h" #include "native/cuda/ops/kv_caching_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/paged_attention_infinilm/kernel.h b/src/native/cuda/metax/ops/paged_attention_infinilm/kernel.h index 9ee369357..59287b74e 100644 --- a/src/native/cuda/metax/ops/paged_attention_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/paged_attention_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/metax/caster.cuh" -#include -#include "native/cuda/metax/runtime_utils.h" +#include "native/cuda/metax/runtime_.h" #include "native/cuda/ops/paged_attention_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/paged_attention_prefill_infinilm/kernel.h b/src/native/cuda/metax/ops/paged_attention_prefill_infinilm/kernel.h index a5d2d39ce..dedd3c053 100644 --- a/src/native/cuda/metax/ops/paged_attention_prefill_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/paged_attention_prefill_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/metax/caster.cuh" -#include -#include "native/cuda/metax/runtime_utils.h" +#include "native/cuda/metax/runtime_.h" #include "native/cuda/ops/paged_attention_prefill_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/paged_caching_infinilm/kernel.h b/src/native/cuda/metax/ops/paged_caching_infinilm/kernel.h index de8f91b45..e1ac0aad8 100644 --- a/src/native/cuda/metax/ops/paged_caching_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/paged_caching_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/metax/caster.cuh" -#include -#include "native/cuda/metax/runtime_utils.h" +#include "native/cuda/metax/runtime_.h" #include "native/cuda/ops/paged_caching_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/random_sample_infinilm/kernel.h b/src/native/cuda/metax/ops/random_sample_infinilm/kernel.h index 1c7c48671..da18e8a4d 100644 --- a/src/native/cuda/metax/ops/random_sample_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/random_sample_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/metax/caster.cuh" -#include -#include "native/cuda/metax/runtime_utils.h" +#include "native/cuda/metax/runtime_.h" #include "native/cuda/ops/random_sample_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/rearrange_infinilm/kernel.h b/src/native/cuda/metax/ops/rearrange_infinilm/kernel.h index f62fa0bf2..4a92bf678 100644 --- a/src/native/cuda/metax/ops/rearrange_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/rearrange_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/metax/caster.cuh" -#include -#include "native/cuda/metax/runtime_utils.h" +#include "native/cuda/metax/runtime_.h" #include "native/cuda/ops/rearrange_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/relu_infinilm/kernel.h b/src/native/cuda/metax/ops/relu_infinilm/kernel.h index bd327efdf..7e0aa3af5 100644 --- a/src/native/cuda/metax/ops/relu_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/relu_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/metax/caster.cuh" -#include -#include "native/cuda/metax/runtime_utils.h" +#include "native/cuda/metax/runtime_.h" #include "native/cuda/ops/relu_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/rms_norm/kernel.h b/src/native/cuda/metax/ops/rms_norm/kernel.h index 34c039b64..38d84c830 100644 --- a/src/native/cuda/metax/ops/rms_norm/kernel.h +++ b/src/native/cuda/metax/ops/rms_norm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/metax/caster.cuh" -#include -#include "native/cuda/metax/runtime_utils.h" +#include "native/cuda/metax/runtime_.h" #include "native/cuda/ops/rms_norm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/rotary_embedding_infinilm/kernel.h b/src/native/cuda/metax/ops/rotary_embedding_infinilm/kernel.h index dc8a61a9f..456e6fc47 100644 --- a/src/native/cuda/metax/ops/rotary_embedding_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/rotary_embedding_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/metax/caster.cuh" -#include -#include "native/cuda/metax/runtime_utils.h" +#include "native/cuda/metax/runtime_.h" #include "native/cuda/ops/rotary_embedding_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/sigmoid_infinilm/kernel.h b/src/native/cuda/metax/ops/sigmoid_infinilm/kernel.h index 31f3425e5..3fca745b1 100644 --- a/src/native/cuda/metax/ops/sigmoid_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/sigmoid_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/metax/caster.cuh" -#include -#include "native/cuda/metax/runtime_utils.h" +#include "native/cuda/metax/runtime_.h" #include "native/cuda/ops/sigmoid_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/silu/kernel.h b/src/native/cuda/metax/ops/silu/kernel.h index 374fe7aa8..1ad57e9e3 100644 --- a/src/native/cuda/metax/ops/silu/kernel.h +++ b/src/native/cuda/metax/ops/silu/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/metax/caster.cuh" -#include -#include "native/cuda/metax/runtime_utils.h" +#include "native/cuda/metax/runtime_.h" #include "native/cuda/ops/silu/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/silu_and_mul_infinilm/kernel.h b/src/native/cuda/metax/ops/silu_and_mul_infinilm/kernel.h index 40427c7b4..beaafe030 100644 --- a/src/native/cuda/metax/ops/silu_and_mul_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/silu_and_mul_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/metax/caster.cuh" -#include -#include "native/cuda/metax/runtime_utils.h" +#include "native/cuda/metax/runtime_.h" #include "native/cuda/ops/silu_and_mul_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/softmax_infinilm/kernel.h b/src/native/cuda/metax/ops/softmax_infinilm/kernel.h index 339826c80..fe0a9da51 100644 --- a/src/native/cuda/metax/ops/softmax_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/softmax_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/metax/caster.cuh" -#include -#include "native/cuda/metax/runtime_utils.h" +#include "native/cuda/metax/runtime_.h" #include "native/cuda/ops/softmax_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/swiglu/kernel.h b/src/native/cuda/metax/ops/swiglu/kernel.h index 60d4ca004..0bcfd3198 100644 --- a/src/native/cuda/metax/ops/swiglu/kernel.h +++ b/src/native/cuda/metax/ops/swiglu/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/metax/caster.cuh" -#include -#include "native/cuda/metax/runtime_utils.h" +#include "native/cuda/metax/runtime_.h" #include "native/cuda/ops/swiglu/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/topksoftmax_infinilm/kernel.h b/src/native/cuda/metax/ops/topksoftmax_infinilm/kernel.h index 72545e62f..4d1e8e026 100644 --- a/src/native/cuda/metax/ops/topksoftmax_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/topksoftmax_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/metax/caster.cuh" -#include -#include "native/cuda/metax/runtime_utils.h" +#include "native/cuda/metax/runtime_.h" #include "native/cuda/ops/topksoftmax_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/ops/zeros_infinilm/kernel.h b/src/native/cuda/metax/ops/zeros_infinilm/kernel.h index 32582b9c8..a6ff877e7 100644 --- a/src/native/cuda/metax/ops/zeros_infinilm/kernel.h +++ b/src/native/cuda/metax/ops/zeros_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/metax/caster.cuh" -#include -#include "native/cuda/metax/runtime_utils.h" +#include "native/cuda/metax/runtime_.h" #include "native/cuda/ops/zeros_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/metax/runtime_.h b/src/native/cuda/metax/runtime_.h new file mode 100644 index 000000000..f952fb8b2 --- /dev/null +++ b/src/native/cuda/metax/runtime_.h @@ -0,0 +1,8 @@ +#ifndef INFINI_OPS_METAX_RUNTIME_H_ +#define INFINI_OPS_METAX_RUNTIME_H_ + +#include +#include "native/cuda/metax/runtime_utils.h" +#include "runtime.h" + +#endif diff --git a/src/native/cuda/moore/blas.h b/src/native/cuda/moore/blas.h index d08f1680c..b2531a1ca 100644 --- a/src/native/cuda/moore/blas.h +++ b/src/native/cuda/moore/blas.h @@ -8,8 +8,7 @@ #include "data_type.h" #include "native/cuda/blas.h" #include "native/cuda/moore/blas_utils.h" -#include -#include "native/cuda/moore/runtime_utils.h" +#include "native/cuda/moore/runtime_.h" namespace infini::ops { diff --git a/src/native/cuda/moore/caster.cuh b/src/native/cuda/moore/caster.cuh index f552c53e8..48154980d 100644 --- a/src/native/cuda/moore/caster.cuh +++ b/src/native/cuda/moore/caster.cuh @@ -2,7 +2,7 @@ #define INFINI_OPS_MOORE_CASTER__H_ #include "native/cuda/caster.cuh" -#include +#include "native/cuda/moore/data_type_.h" namespace infini::ops { diff --git a/src/native/cuda/moore/data_type_.h b/src/native/cuda/moore/data_type_.h new file mode 100644 index 000000000..7bc17d047 --- /dev/null +++ b/src/native/cuda/moore/data_type_.h @@ -0,0 +1,13 @@ +#ifndef INFINI_OPS_MOORE_DATA_TYPE__H_ +#define INFINI_OPS_MOORE_DATA_TYPE__H_ + +#include + +namespace infini::ops { + +using infini::rt::cuda_bfloat16; +using infini::rt::cuda_bfloat162; + +} // namespace infini::ops + +#endif diff --git a/src/native/cuda/moore/device_.h b/src/native/cuda/moore/device_.h new file mode 100644 index 000000000..2b23db2dd --- /dev/null +++ b/src/native/cuda/moore/device_.h @@ -0,0 +1,6 @@ +#ifndef INFINI_OPS_MOORE_DEVICE__H_ +#define INFINI_OPS_MOORE_DEVICE__H_ + +#include + +#endif diff --git a/src/native/cuda/moore/ops/add/kernel.h b/src/native/cuda/moore/ops/add/kernel.h index f163fd403..abcea37c5 100644 --- a/src/native/cuda/moore/ops/add/kernel.h +++ b/src/native/cuda/moore/ops/add/kernel.h @@ -9,8 +9,7 @@ #include "native/cuda/moore/caster.cuh" #include "native/cuda/moore/polyfills.cuh" -#include -#include "native/cuda/moore/runtime_utils.h" +#include "native/cuda/moore/runtime_.h" #include "native/cuda/ops/add/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/add_rms_norm/kernel.h b/src/native/cuda/moore/ops/add_rms_norm/kernel.h index f04c9c07f..8d3f5cc79 100644 --- a/src/native/cuda/moore/ops/add_rms_norm/kernel.h +++ b/src/native/cuda/moore/ops/add_rms_norm/kernel.h @@ -8,8 +8,7 @@ // clang-format on #include "native/cuda/moore/caster.cuh" -#include -#include "native/cuda/moore/runtime_utils.h" +#include "native/cuda/moore/runtime_.h" #include "native/cuda/ops/add_rms_norm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/causal_softmax/kernel.h b/src/native/cuda/moore/ops/causal_softmax/kernel.h index 6b4ff93bb..e13118763 100644 --- a/src/native/cuda/moore/ops/causal_softmax/kernel.h +++ b/src/native/cuda/moore/ops/causal_softmax/kernel.h @@ -6,12 +6,11 @@ // clang-format on // clang-format off -#include +#include "native/cuda/moore/device_.h" // clang-format on #include "native/cuda/moore/caster.cuh" -#include -#include "native/cuda/moore/runtime_utils.h" +#include "native/cuda/moore/runtime_.h" #include "native/cuda/ops/causal_softmax/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/conv_infinilm/kernel.h b/src/native/cuda/moore/ops/conv_infinilm/kernel.h index c3811b61d..1d00481d6 100644 --- a/src/native/cuda/moore/ops/conv_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/conv_infinilm/kernel.h @@ -5,8 +5,7 @@ #include "native/cuda/moore/caster.cuh" #include "native/cuda/moore/polyfills.cuh" -#include -#include "native/cuda/moore/runtime_utils.h" +#include "native/cuda/moore/runtime_.h" #include "native/cuda/ops/conv_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/embedding/kernel.h b/src/native/cuda/moore/ops/embedding/kernel.h index eae4b2f53..03b8cf35a 100644 --- a/src/native/cuda/moore/ops/embedding/kernel.h +++ b/src/native/cuda/moore/ops/embedding/kernel.h @@ -5,8 +5,7 @@ #include "native/cuda/moore/caster.cuh" #include "native/cuda/moore/polyfills.cuh" -#include -#include "native/cuda/moore/runtime_utils.h" +#include "native/cuda/moore/runtime_.h" #include "native/cuda/ops/embedding/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/gelu_infinilm/kernel.h b/src/native/cuda/moore/ops/gelu_infinilm/kernel.h index 687a0a6e2..3895a1b91 100644 --- a/src/native/cuda/moore/ops/gelu_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/gelu_infinilm/kernel.h @@ -5,8 +5,7 @@ #include "native/cuda/moore/caster.cuh" #include "native/cuda/moore/polyfills.cuh" -#include -#include "native/cuda/moore/runtime_utils.h" +#include "native/cuda/moore/runtime_.h" #include "native/cuda/ops/gelu_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/gelutanh_infinilm/kernel.h b/src/native/cuda/moore/ops/gelutanh_infinilm/kernel.h index 27041d2f9..3c502ff0d 100644 --- a/src/native/cuda/moore/ops/gelutanh_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/gelutanh_infinilm/kernel.h @@ -5,8 +5,7 @@ #include "native/cuda/moore/caster.cuh" #include "native/cuda/moore/polyfills.cuh" -#include -#include "native/cuda/moore/runtime_utils.h" +#include "native/cuda/moore/runtime_.h" #include "native/cuda/ops/gelutanh_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/kv_caching_infinilm/kernel.h b/src/native/cuda/moore/ops/kv_caching_infinilm/kernel.h index 314901c16..223cd8049 100644 --- a/src/native/cuda/moore/ops/kv_caching_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/kv_caching_infinilm/kernel.h @@ -5,8 +5,7 @@ #include "native/cuda/moore/caster.cuh" #include "native/cuda/moore/polyfills.cuh" -#include -#include "native/cuda/moore/runtime_utils.h" +#include "native/cuda/moore/runtime_.h" #include "native/cuda/ops/kv_caching_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/paged_attention_infinilm/kernel.h b/src/native/cuda/moore/ops/paged_attention_infinilm/kernel.h index ad0fd4ac8..7ee03230e 100644 --- a/src/native/cuda/moore/ops/paged_attention_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/paged_attention_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/moore/caster.cuh" -#include -#include "native/cuda/moore/runtime_utils.h" +#include "native/cuda/moore/runtime_.h" #include "native/cuda/ops/paged_attention_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/paged_attention_prefill_infinilm/kernel.h b/src/native/cuda/moore/ops/paged_attention_prefill_infinilm/kernel.h index 6a27957d4..71c06b1f2 100644 --- a/src/native/cuda/moore/ops/paged_attention_prefill_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/paged_attention_prefill_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/moore/caster.cuh" -#include -#include "native/cuda/moore/runtime_utils.h" +#include "native/cuda/moore/runtime_.h" #include "native/cuda/ops/paged_attention_prefill_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/paged_caching_infinilm/kernel.h b/src/native/cuda/moore/ops/paged_caching_infinilm/kernel.h index eb90900cc..0c6226687 100644 --- a/src/native/cuda/moore/ops/paged_caching_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/paged_caching_infinilm/kernel.h @@ -8,8 +8,7 @@ // clang-format on #include "native/cuda/moore/caster.cuh" -#include -#include "native/cuda/moore/runtime_utils.h" +#include "native/cuda/moore/runtime_.h" #include "native/cuda/ops/paged_caching_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/random_sample_infinilm/kernel.h b/src/native/cuda/moore/ops/random_sample_infinilm/kernel.h index 283b3806f..dc418de6e 100644 --- a/src/native/cuda/moore/ops/random_sample_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/random_sample_infinilm/kernel.h @@ -8,8 +8,7 @@ // clang-format on #include "native/cuda/moore/caster.cuh" -#include -#include "native/cuda/moore/runtime_utils.h" +#include "native/cuda/moore/runtime_.h" #include "native/cuda/ops/random_sample_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/rearrange_infinilm/kernel.h b/src/native/cuda/moore/ops/rearrange_infinilm/kernel.h index 4d5e9606b..bd8e69896 100644 --- a/src/native/cuda/moore/ops/rearrange_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/rearrange_infinilm/kernel.h @@ -5,8 +5,7 @@ #include "native/cuda/moore/caster.cuh" #include "native/cuda/moore/polyfills.cuh" -#include -#include "native/cuda/moore/runtime_utils.h" +#include "native/cuda/moore/runtime_.h" #include "native/cuda/ops/rearrange_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/relu_infinilm/kernel.h b/src/native/cuda/moore/ops/relu_infinilm/kernel.h index a01a3821a..4de90b8fa 100644 --- a/src/native/cuda/moore/ops/relu_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/relu_infinilm/kernel.h @@ -5,8 +5,7 @@ #include "native/cuda/moore/caster.cuh" #include "native/cuda/moore/polyfills.cuh" -#include -#include "native/cuda/moore/runtime_utils.h" +#include "native/cuda/moore/runtime_.h" #include "native/cuda/ops/relu_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/rms_norm/kernel.h b/src/native/cuda/moore/ops/rms_norm/kernel.h index 998a98cdf..0f160017d 100644 --- a/src/native/cuda/moore/ops/rms_norm/kernel.h +++ b/src/native/cuda/moore/ops/rms_norm/kernel.h @@ -8,8 +8,7 @@ // clang-format on #include "native/cuda/moore/caster.cuh" -#include -#include "native/cuda/moore/runtime_utils.h" +#include "native/cuda/moore/runtime_.h" #include "native/cuda/ops/rms_norm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/rotary_embedding_infinilm/kernel.h b/src/native/cuda/moore/ops/rotary_embedding_infinilm/kernel.h index 71b03cf78..7278c9465 100644 --- a/src/native/cuda/moore/ops/rotary_embedding_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/rotary_embedding_infinilm/kernel.h @@ -8,8 +8,7 @@ // clang-format on #include "native/cuda/moore/caster.cuh" -#include -#include "native/cuda/moore/runtime_utils.h" +#include "native/cuda/moore/runtime_.h" #include "native/cuda/ops/rotary_embedding_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/sigmoid_infinilm/kernel.h b/src/native/cuda/moore/ops/sigmoid_infinilm/kernel.h index bfba5aa9d..62a403937 100644 --- a/src/native/cuda/moore/ops/sigmoid_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/sigmoid_infinilm/kernel.h @@ -5,8 +5,7 @@ #include "native/cuda/moore/caster.cuh" #include "native/cuda/moore/polyfills.cuh" -#include -#include "native/cuda/moore/runtime_utils.h" +#include "native/cuda/moore/runtime_.h" #include "native/cuda/ops/sigmoid_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/silu/kernel.h b/src/native/cuda/moore/ops/silu/kernel.h index 110d145a9..b7603a9fc 100644 --- a/src/native/cuda/moore/ops/silu/kernel.h +++ b/src/native/cuda/moore/ops/silu/kernel.h @@ -8,8 +8,7 @@ // clang-format on #include "native/cuda/moore/caster.cuh" -#include -#include "native/cuda/moore/runtime_utils.h" +#include "native/cuda/moore/runtime_.h" #include "native/cuda/ops/silu/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/silu_and_mul_infinilm/kernel.h b/src/native/cuda/moore/ops/silu_and_mul_infinilm/kernel.h index 97d394010..fd0f36209 100644 --- a/src/native/cuda/moore/ops/silu_and_mul_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/silu_and_mul_infinilm/kernel.h @@ -5,8 +5,7 @@ #include "native/cuda/moore/caster.cuh" #include "native/cuda/moore/polyfills.cuh" -#include -#include "native/cuda/moore/runtime_utils.h" +#include "native/cuda/moore/runtime_.h" #include "native/cuda/ops/silu_and_mul_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/softmax_infinilm/kernel.h b/src/native/cuda/moore/ops/softmax_infinilm/kernel.h index 1f1318bc7..a1d91f9b8 100644 --- a/src/native/cuda/moore/ops/softmax_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/softmax_infinilm/kernel.h @@ -5,8 +5,7 @@ #include "native/cuda/moore/caster.cuh" #include "native/cuda/moore/polyfills.cuh" -#include -#include "native/cuda/moore/runtime_utils.h" +#include "native/cuda/moore/runtime_.h" #include "native/cuda/ops/softmax_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/swiglu/kernel.h b/src/native/cuda/moore/ops/swiglu/kernel.h index 9ae685e9d..ac54bff50 100644 --- a/src/native/cuda/moore/ops/swiglu/kernel.h +++ b/src/native/cuda/moore/ops/swiglu/kernel.h @@ -9,8 +9,7 @@ #include "native/cuda/moore/caster.cuh" #include "native/cuda/moore/polyfills.cuh" -#include -#include "native/cuda/moore/runtime_utils.h" +#include "native/cuda/moore/runtime_.h" #include "native/cuda/ops/swiglu/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/topksoftmax_infinilm/kernel.h b/src/native/cuda/moore/ops/topksoftmax_infinilm/kernel.h index d202e0090..5c1461b50 100644 --- a/src/native/cuda/moore/ops/topksoftmax_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/topksoftmax_infinilm/kernel.h @@ -5,8 +5,7 @@ #include "native/cuda/moore/caster.cuh" #include "native/cuda/moore/polyfills.cuh" -#include -#include "native/cuda/moore/runtime_utils.h" +#include "native/cuda/moore/runtime_.h" #include "native/cuda/ops/topksoftmax_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/ops/zeros_infinilm/kernel.h b/src/native/cuda/moore/ops/zeros_infinilm/kernel.h index 8e79d44f8..1e929a78e 100644 --- a/src/native/cuda/moore/ops/zeros_infinilm/kernel.h +++ b/src/native/cuda/moore/ops/zeros_infinilm/kernel.h @@ -5,8 +5,7 @@ #include "native/cuda/moore/caster.cuh" #include "native/cuda/moore/polyfills.cuh" -#include -#include "native/cuda/moore/runtime_utils.h" +#include "native/cuda/moore/runtime_.h" #include "native/cuda/ops/zeros_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/moore/runtime_.h b/src/native/cuda/moore/runtime_.h new file mode 100644 index 000000000..bc960d949 --- /dev/null +++ b/src/native/cuda/moore/runtime_.h @@ -0,0 +1,8 @@ +#ifndef INFINI_OPS_MOORE_RUNTIME_H_ +#define INFINI_OPS_MOORE_RUNTIME_H_ + +#include +#include "native/cuda/moore/runtime_utils.h" +#include "runtime.h" + +#endif diff --git a/src/native/cuda/nvidia/blas.h b/src/native/cuda/nvidia/blas.h index 3d76a2f38..7cdfd2c62 100644 --- a/src/native/cuda/nvidia/blas.h +++ b/src/native/cuda/nvidia/blas.h @@ -10,8 +10,7 @@ #include "data_type.h" #include "native/cuda/blas.h" #include "native/cuda/nvidia/blas_utils.h" -#include -#include "native/cuda/nvidia/runtime_utils.h" +#include "native/cuda/nvidia/runtime_.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/caster.cuh b/src/native/cuda/nvidia/caster.cuh index 9aeff21f7..b8a366e26 100644 --- a/src/native/cuda/nvidia/caster.cuh +++ b/src/native/cuda/nvidia/caster.cuh @@ -2,7 +2,7 @@ #define INFINI_OPS_NVIDIA_CASTER__H_ #include "native/cuda/caster.cuh" -#include +#include "native/cuda/nvidia/data_type_.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/data_type_.h b/src/native/cuda/nvidia/data_type_.h new file mode 100644 index 000000000..7f1fdc9e8 --- /dev/null +++ b/src/native/cuda/nvidia/data_type_.h @@ -0,0 +1,13 @@ +#ifndef INFINI_OPS_NVIDIA_DATA_TYPE__H_ +#define INFINI_OPS_NVIDIA_DATA_TYPE__H_ + +#include + +namespace infini::ops { + +using infini::rt::cuda_bfloat16; +using infini::rt::cuda_bfloat162; + +} // namespace infini::ops + +#endif diff --git a/src/native/cuda/nvidia/device_.h b/src/native/cuda/nvidia/device_.h new file mode 100644 index 000000000..b46caf035 --- /dev/null +++ b/src/native/cuda/nvidia/device_.h @@ -0,0 +1,6 @@ +#ifndef INFINI_OPS_NVIDIA_DEVICE__H_ +#define INFINI_OPS_NVIDIA_DEVICE__H_ + +#include + +#endif diff --git a/src/native/cuda/nvidia/ops/add/kernel.h b/src/native/cuda/nvidia/ops/add/kernel.h index 0be5b5c4c..373f5a775 100644 --- a/src/native/cuda/nvidia/ops/add/kernel.h +++ b/src/native/cuda/nvidia/ops/add/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include -#include "native/cuda/nvidia/runtime_utils.h" +#include "native/cuda/nvidia/runtime_.h" #include "native/cuda/ops/add/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/add_rms_norm/kernel.h b/src/native/cuda/nvidia/ops/add_rms_norm/kernel.h index bec736bc9..2bb6f6051 100644 --- a/src/native/cuda/nvidia/ops/add_rms_norm/kernel.h +++ b/src/native/cuda/nvidia/ops/add_rms_norm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include -#include "native/cuda/nvidia/runtime_utils.h" +#include "native/cuda/nvidia/runtime_.h" #include "native/cuda/ops/add_rms_norm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/causal_softmax/kernel.h b/src/native/cuda/nvidia/ops/causal_softmax/kernel.h index 19438b9f3..5a7e18a94 100644 --- a/src/native/cuda/nvidia/ops/causal_softmax/kernel.h +++ b/src/native/cuda/nvidia/ops/causal_softmax/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include -#include "native/cuda/nvidia/runtime_utils.h" +#include "native/cuda/nvidia/runtime_.h" #include "native/cuda/ops/causal_softmax/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/conv_infinilm/kernel.h b/src/native/cuda/nvidia/ops/conv_infinilm/kernel.h index 6c83855db..3cd648895 100644 --- a/src/native/cuda/nvidia/ops/conv_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/conv_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include -#include "native/cuda/nvidia/runtime_utils.h" +#include "native/cuda/nvidia/runtime_.h" #include "native/cuda/ops/conv_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/embedding/kernel.h b/src/native/cuda/nvidia/ops/embedding/kernel.h index c97d161cc..f05f155ae 100644 --- a/src/native/cuda/nvidia/ops/embedding/kernel.h +++ b/src/native/cuda/nvidia/ops/embedding/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include -#include "native/cuda/nvidia/runtime_utils.h" +#include "native/cuda/nvidia/runtime_.h" #include "native/cuda/ops/embedding/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/gelu_infinilm/kernel.h b/src/native/cuda/nvidia/ops/gelu_infinilm/kernel.h index 99f709a85..e1825b449 100644 --- a/src/native/cuda/nvidia/ops/gelu_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/gelu_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include -#include "native/cuda/nvidia/runtime_utils.h" +#include "native/cuda/nvidia/runtime_.h" #include "native/cuda/ops/gelu_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/gelutanh_infinilm/kernel.h b/src/native/cuda/nvidia/ops/gelutanh_infinilm/kernel.h index ecb60de1b..15fff6a06 100644 --- a/src/native/cuda/nvidia/ops/gelutanh_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/gelutanh_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include -#include "native/cuda/nvidia/runtime_utils.h" +#include "native/cuda/nvidia/runtime_.h" #include "native/cuda/ops/gelutanh_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/gemm/cublaslt.h b/src/native/cuda/nvidia/ops/gemm/cublaslt.h index 28f831991..dee5d9a2e 100644 --- a/src/native/cuda/nvidia/ops/gemm/cublaslt.h +++ b/src/native/cuda/nvidia/ops/gemm/cublaslt.h @@ -10,8 +10,7 @@ #include "base/gemm.h" #include "native/cuda/nvidia/blas_utils.h" -#include -#include "native/cuda/nvidia/runtime_utils.h" +#include "native/cuda/nvidia/runtime_.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/kv_caching_infinilm/kernel.h b/src/native/cuda/nvidia/ops/kv_caching_infinilm/kernel.h index 8c52ab5f1..b4e7405bf 100644 --- a/src/native/cuda/nvidia/ops/kv_caching_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/kv_caching_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include -#include "native/cuda/nvidia/runtime_utils.h" +#include "native/cuda/nvidia/runtime_.h" #include "native/cuda/ops/kv_caching_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/paged_attention_infinilm/kernel.h b/src/native/cuda/nvidia/ops/paged_attention_infinilm/kernel.h index 9c329a2bf..08286ffdf 100644 --- a/src/native/cuda/nvidia/ops/paged_attention_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/paged_attention_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include -#include "native/cuda/nvidia/runtime_utils.h" +#include "native/cuda/nvidia/runtime_.h" #include "native/cuda/ops/paged_attention_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/paged_attention_prefill_infinilm/kernel.h b/src/native/cuda/nvidia/ops/paged_attention_prefill_infinilm/kernel.h index e1e7d7f39..809590e6b 100644 --- a/src/native/cuda/nvidia/ops/paged_attention_prefill_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/paged_attention_prefill_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include -#include "native/cuda/nvidia/runtime_utils.h" +#include "native/cuda/nvidia/runtime_.h" #include "native/cuda/ops/paged_attention_prefill_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/paged_caching_infinilm/kernel.h b/src/native/cuda/nvidia/ops/paged_caching_infinilm/kernel.h index be581b128..524eb0c71 100644 --- a/src/native/cuda/nvidia/ops/paged_caching_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/paged_caching_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include -#include "native/cuda/nvidia/runtime_utils.h" +#include "native/cuda/nvidia/runtime_.h" #include "native/cuda/ops/paged_caching_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/random_sample_infinilm/kernel.h b/src/native/cuda/nvidia/ops/random_sample_infinilm/kernel.h index 0e9764f1c..f2608bf64 100644 --- a/src/native/cuda/nvidia/ops/random_sample_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/random_sample_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include -#include "native/cuda/nvidia/runtime_utils.h" +#include "native/cuda/nvidia/runtime_.h" #include "native/cuda/ops/random_sample_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/rearrange_infinilm/kernel.h b/src/native/cuda/nvidia/ops/rearrange_infinilm/kernel.h index 9015e9ec1..8ab25c69a 100644 --- a/src/native/cuda/nvidia/ops/rearrange_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/rearrange_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include -#include "native/cuda/nvidia/runtime_utils.h" +#include "native/cuda/nvidia/runtime_.h" #include "native/cuda/ops/rearrange_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/relu_infinilm/kernel.h b/src/native/cuda/nvidia/ops/relu_infinilm/kernel.h index 0c1b9e04f..37f140caa 100644 --- a/src/native/cuda/nvidia/ops/relu_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/relu_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include -#include "native/cuda/nvidia/runtime_utils.h" +#include "native/cuda/nvidia/runtime_.h" #include "native/cuda/ops/relu_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/rms_norm/kernel.h b/src/native/cuda/nvidia/ops/rms_norm/kernel.h index 09ad3332b..74cfab17a 100644 --- a/src/native/cuda/nvidia/ops/rms_norm/kernel.h +++ b/src/native/cuda/nvidia/ops/rms_norm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include -#include "native/cuda/nvidia/runtime_utils.h" +#include "native/cuda/nvidia/runtime_.h" #include "native/cuda/ops/rms_norm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/rotary_embedding_infinilm/kernel.h b/src/native/cuda/nvidia/ops/rotary_embedding_infinilm/kernel.h index f80dfdbf0..93014ad0d 100644 --- a/src/native/cuda/nvidia/ops/rotary_embedding_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/rotary_embedding_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include -#include "native/cuda/nvidia/runtime_utils.h" +#include "native/cuda/nvidia/runtime_.h" #include "native/cuda/ops/rotary_embedding_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/sigmoid_infinilm/kernel.h b/src/native/cuda/nvidia/ops/sigmoid_infinilm/kernel.h index eeaff3f26..44c504ee3 100644 --- a/src/native/cuda/nvidia/ops/sigmoid_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/sigmoid_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include -#include "native/cuda/nvidia/runtime_utils.h" +#include "native/cuda/nvidia/runtime_.h" #include "native/cuda/ops/sigmoid_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/silu/kernel.h b/src/native/cuda/nvidia/ops/silu/kernel.h index f57c689c4..7f5001f71 100644 --- a/src/native/cuda/nvidia/ops/silu/kernel.h +++ b/src/native/cuda/nvidia/ops/silu/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include -#include "native/cuda/nvidia/runtime_utils.h" +#include "native/cuda/nvidia/runtime_.h" #include "native/cuda/ops/silu/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/silu_and_mul_infinilm/kernel.h b/src/native/cuda/nvidia/ops/silu_and_mul_infinilm/kernel.h index 4e1958c2c..9f908b1fd 100644 --- a/src/native/cuda/nvidia/ops/silu_and_mul_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/silu_and_mul_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include -#include "native/cuda/nvidia/runtime_utils.h" +#include "native/cuda/nvidia/runtime_.h" #include "native/cuda/ops/silu_and_mul_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/softmax_infinilm/kernel.h b/src/native/cuda/nvidia/ops/softmax_infinilm/kernel.h index 03e422ca0..d00d143c4 100644 --- a/src/native/cuda/nvidia/ops/softmax_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/softmax_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include -#include "native/cuda/nvidia/runtime_utils.h" +#include "native/cuda/nvidia/runtime_.h" #include "native/cuda/ops/softmax_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/swiglu/kernel.h b/src/native/cuda/nvidia/ops/swiglu/kernel.h index a1e366676..fd58f29ef 100644 --- a/src/native/cuda/nvidia/ops/swiglu/kernel.h +++ b/src/native/cuda/nvidia/ops/swiglu/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include -#include "native/cuda/nvidia/runtime_utils.h" +#include "native/cuda/nvidia/runtime_.h" #include "native/cuda/ops/swiglu/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/topksoftmax_infinilm/kernel.h b/src/native/cuda/nvidia/ops/topksoftmax_infinilm/kernel.h index 9e4fc15c3..fd8897dd6 100644 --- a/src/native/cuda/nvidia/ops/topksoftmax_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/topksoftmax_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include -#include "native/cuda/nvidia/runtime_utils.h" +#include "native/cuda/nvidia/runtime_.h" #include "native/cuda/ops/topksoftmax_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/ops/zeros_infinilm/kernel.h b/src/native/cuda/nvidia/ops/zeros_infinilm/kernel.h index 5b24b6d93..a6092c5dc 100644 --- a/src/native/cuda/nvidia/ops/zeros_infinilm/kernel.h +++ b/src/native/cuda/nvidia/ops/zeros_infinilm/kernel.h @@ -4,8 +4,7 @@ #include #include "native/cuda/nvidia/caster.cuh" -#include -#include "native/cuda/nvidia/runtime_utils.h" +#include "native/cuda/nvidia/runtime_.h" #include "native/cuda/ops/zeros_infinilm/kernel.h" namespace infini::ops { diff --git a/src/native/cuda/nvidia/runtime_.h b/src/native/cuda/nvidia/runtime_.h new file mode 100644 index 000000000..5745b3ffd --- /dev/null +++ b/src/native/cuda/nvidia/runtime_.h @@ -0,0 +1,8 @@ +#ifndef INFINI_OPS_NVIDIA_RUNTIME_H_ +#define INFINI_OPS_NVIDIA_RUNTIME_H_ + +#include +#include "native/cuda/nvidia/runtime_utils.h" +#include "runtime.h" + +#endif diff --git a/src/native/cuda/ops/rotary_embedding_infinilm/kernel.cuh b/src/native/cuda/ops/rotary_embedding_infinilm/kernel.cuh index ff2725401..22c904aa6 100644 --- a/src/native/cuda/ops/rotary_embedding_infinilm/kernel.cuh +++ b/src/native/cuda/ops/rotary_embedding_infinilm/kernel.cuh @@ -9,9 +9,6 @@ namespace infini::ops { -using infini::rt::cuda_bfloat16; -using infini::rt::cuda_bfloat162; - template struct VecTypeHelper {}; diff --git a/src/native/cuda/runtime_utils.h b/src/native/cuda/runtime_utils.h index e28d8c554..c5fb79b0c 100644 --- a/src/native/cuda/runtime_utils.h +++ b/src/native/cuda/runtime_utils.h @@ -1,7 +1,7 @@ #ifndef INFINI_OPS_CUDA_RUNTIME_UTILS_H_ #define INFINI_OPS_CUDA_RUNTIME_UTILS_H_ -#include "runtime.h" +#include "device.h" namespace infini::ops { From a825ef6a3764dcf5f3dfa40c594e32aa316ea5c6 Mon Sep 17 00:00:00 2001 From: Jiacheng Huang Date: Thu, 18 Jun 2026 10:05:01 +0800 Subject: [PATCH 05/12] fix: consume installed InfiniRT dependency --- CMakeLists.txt | 92 ++++++++++++++++++++++++++++++++---- scripts/generate_wrappers.py | 33 +++++++++++++ src/CMakeLists.txt | 24 ++++++---- src/native/cpu/data_type_.h | 6 +++ src/native/cpu/device_.h | 6 +++ src/native/cpu/runtime_.h | 7 +++ 6 files changed, 151 insertions(+), 17 deletions(-) create mode 100644 src/native/cpu/data_type_.h create mode 100644 src/native/cpu/device_.h create mode 100644 src/native/cpu/runtime_.h diff --git a/CMakeLists.txt b/CMakeLists.txt index ab8d8bb5d..39c2d25e9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -500,15 +500,89 @@ if(WITH_HYGON AND NOT EXISTS "${DTK_ROOT}/llvm/lib/LLVMgold.so") set(PYBIND11_ENABLE_EXTRAS OFF) endif() -set(INFINI_RT_SOURCE_DIR "${PROJECT_SOURCE_DIR}/../InfiniRT" CACHE PATH "InfiniRT source directory") -if(NOT EXISTS "${INFINI_RT_SOURCE_DIR}/CMakeLists.txt") - message(FATAL_ERROR - "InfiniRT not found at `${INFINI_RT_SOURCE_DIR}`. " - "Set `INFINI_RT_SOURCE_DIR` to the InfiniRT source directory.") -endif() -if(NOT TARGET infinirt) - add_subdirectory("${INFINI_RT_SOURCE_DIR}" "${CMAKE_BINARY_DIR}/InfiniRT") -endif() +set(INFINI_RT_ROOT "$ENV{INFINI_RT_ROOT}" CACHE PATH "Installed InfiniRT prefix") +set(INFINI_RT_SOURCE_DIR "" CACHE PATH "InfiniRT source directory for development builds") +set(INFINI_RT_INCLUDE_DIRS "" CACHE STRING "InfiniRT include directories") +set(INFINI_RT_LIBRARY "" CACHE FILEPATH "InfiniRT library") + +function(_infiniops_configure_infinirt) + if(TARGET infinirt) + get_target_property(_infinirt_include_dirs infinirt INTERFACE_INCLUDE_DIRECTORIES) + if(_infinirt_include_dirs) + set(INFINI_RT_INCLUDE_DIRS "${_infinirt_include_dirs}" CACHE STRING "InfiniRT include directories" FORCE) + endif() + return() + endif() + + if(INFINI_RT_SOURCE_DIR) + if(NOT EXISTS "${INFINI_RT_SOURCE_DIR}/CMakeLists.txt") + message(FATAL_ERROR + "InfiniRT source tree not found at `${INFINI_RT_SOURCE_DIR}`. " + "Unset `INFINI_RT_SOURCE_DIR` to use an installed InfiniRT package.") + endif() + + add_subdirectory("${INFINI_RT_SOURCE_DIR}" "${CMAKE_BINARY_DIR}/InfiniRT") + set(INFINI_RT_INCLUDE_DIRS + "${INFINI_RT_SOURCE_DIR}/include;${INFINI_RT_SOURCE_DIR}/generated/include" + CACHE STRING "InfiniRT include directories" FORCE) + return() + endif() + + set(_infinirt_include_hints ${INFINI_RT_INCLUDE_DIRS}) + if(INFINI_RT_ROOT) + list(APPEND _infinirt_include_hints "${INFINI_RT_ROOT}/include") + endif() + + find_path(_INFINI_RT_INCLUDE_DIR + NAMES infini/rt.h + HINTS ${_infinirt_include_hints} + PATH_SUFFIXES include + ) + + if(INFINI_RT_LIBRARY) + set(_INFINI_RT_LIBRARY "${INFINI_RT_LIBRARY}") + else() + set(_infinirt_library_hints "") + if(INFINI_RT_ROOT) + list(APPEND _infinirt_library_hints + "${INFINI_RT_ROOT}/lib" + "${INFINI_RT_ROOT}/lib64") + endif() + find_library(_INFINI_RT_LIBRARY + NAMES infinirt + HINTS ${_infinirt_library_hints} + ) + endif() + + if(NOT _INFINI_RT_INCLUDE_DIR OR NOT _INFINI_RT_LIBRARY) + message(FATAL_ERROR + "InfiniRT installed headers/library were not found. " + "Set `INFINI_RT_ROOT` to an installed InfiniRT prefix, or set " + "`INFINI_RT_INCLUDE_DIRS` and `INFINI_RT_LIBRARY` explicitly. " + "For development builds from source, set `INFINI_RT_SOURCE_DIR`.") + endif() + + set(_infinirt_resolved_include_dirs "") + foreach(_infinirt_include_dir IN LISTS _infinirt_include_hints) + if(_infinirt_include_dir AND EXISTS "${_infinirt_include_dir}") + list(APPEND _infinirt_resolved_include_dirs "${_infinirt_include_dir}") + endif() + endforeach() + list(APPEND _infinirt_resolved_include_dirs "${_INFINI_RT_INCLUDE_DIR}") + list(REMOVE_DUPLICATES _infinirt_resolved_include_dirs) + + add_library(infinirt SHARED IMPORTED GLOBAL) + set_target_properties(infinirt PROPERTIES + IMPORTED_LOCATION "${_INFINI_RT_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${_infinirt_resolved_include_dirs}" + ) + + set(INFINI_RT_INCLUDE_DIRS "${_infinirt_resolved_include_dirs}" CACHE STRING "InfiniRT include directories" FORCE) + set(INFINI_RT_LIBRARY "${_INFINI_RT_LIBRARY}" CACHE FILEPATH "InfiniRT library" FORCE) + message(STATUS "Using installed InfiniRT: ${_INFINI_RT_LIBRARY}") +endfunction() + +_infiniops_configure_infinirt() add_subdirectory(src) diff --git a/scripts/generate_wrappers.py b/scripts/generate_wrappers.py index 84075bd49..0384e0423 100644 --- a/scripts/generate_wrappers.py +++ b/scripts/generate_wrappers.py @@ -83,6 +83,38 @@ def _op_relative_type(op_name): return "::".join(parts) +def _get_infini_rt_include_flags(): + include_dirs = [] + + for include_dir in os.environ.get("INFINI_RT_INCLUDE_DIRS", "").split(os.pathsep): + if include_dir: + include_dirs.append(pathlib.Path(include_dir)) + + infini_rt_root = os.environ.get("INFINI_RT_ROOT") + if infini_rt_root: + include_dirs.append(pathlib.Path(infini_rt_root) / "include") + + infini_rt_source_dir = os.environ.get("INFINI_RT_SOURCE_DIR") + if infini_rt_source_dir: + infini_rt_source_path = pathlib.Path(infini_rt_source_dir) + include_dirs.extend( + ( + infini_rt_source_path / "include", + infini_rt_source_path / "generated" / "include", + ) + ) + + flags = [] + seen = set() + for include_dir in include_dirs: + include_dir = include_dir.resolve() + if include_dir.exists() and include_dir not in seen: + flags.extend(("-I", str(include_dir))) + seen.add(include_dir) + + return tuple(flags) + + def _write_text_if_changed(path: pathlib.Path, content: str) -> bool: """Write `content` only when the file's bytes would change.""" path.parent.mkdir(parents=True, exist_ok=True) @@ -194,6 +226,7 @@ def __call__(self, op_name): "-I", str(_GENERATION_DIR), ) + + _get_infini_rt_include_flags() + _get_system_include_flags() ) translation_unit = index.parse(str(_find_base_header(op_name)), args=args) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e46f516bf..9a0769494 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -41,6 +41,12 @@ target_sources(infiniops PRIVATE ${BASE_SRCS}) target_link_libraries(infiniops PUBLIC infinirt) +set(INFINI_RT_INCLUDE_FLAGS "") +foreach(_include_dir IN LISTS INFINI_RT_INCLUDE_DIRS) + list(APPEND INFINI_RT_INCLUDE_FLAGS "-I${_include_dir}") +endforeach() +list(JOIN INFINI_RT_INCLUDE_DIRS ":" INFINI_RT_INCLUDE_DIRS_ENV) + set(DEVICE_LIST "") if(WITH_CPU) @@ -226,8 +232,7 @@ if(WITH_CAMBRICON) set(MLU_COMPILE_OPTS -c --bang-mlu-arch=mtp_592 -O3 -fPIC -Wall -Werror -std=c++17 -pthread -I${CMAKE_CURRENT_SOURCE_DIR} -I${NEUWARE_HOME}/include - -I${INFINI_RT_SOURCE_DIR}/include - -I${INFINI_RT_SOURCE_DIR}/generated/include + ${INFINI_RT_INCLUDE_FLAGS} -idirafter /usr/local/neuware/lib/clang/11.1.0/include ) function(compile_mlu_file src_file) @@ -604,7 +609,7 @@ if(GENERATE_OPERATOR_CALL_INSTANTIATIONS OR GENERATE_PYTHON_BINDINGS) execute_process( COMMAND ${CMAKE_COMMAND} -E env - INFINI_RT_SOURCE_DIR=${INFINI_RT_SOURCE_DIR} + INFINI_RT_INCLUDE_DIRS=${INFINI_RT_INCLUDE_DIRS_ENV} ${Python_EXECUTABLE} ${PROJECT_SOURCE_DIR}/scripts/generate_wrappers.py ${GENERATOR_ARGS} @@ -633,8 +638,7 @@ if(GENERATE_OPERATOR_CALL_INSTANTIATIONS) "-I${PROJECT_SOURCE_DIR}" "-I${PROJECT_SOURCE_DIR}/generated" "-I${PROJECT_SOURCE_DIR}/generated/include" - "-I${INFINI_RT_SOURCE_DIR}/include" - "-I${INFINI_RT_SOURCE_DIR}/generated/include") + ${INFINI_RT_INCLUDE_FLAGS}) foreach(_dir IN LISTS TORCH_INCLUDE_DIRS CUDAToolkit_INCLUDE_DIRS) list(APPEND _iluvatar_call_instantiation_include_flags "-I${_dir}") endforeach() @@ -809,8 +813,7 @@ if(GENERATE_PYTHON_BINDINGS) "-I${CMAKE_CURRENT_SOURCE_DIR}" "-I${PROJECT_SOURCE_DIR}" "-I${PROJECT_SOURCE_DIR}/generated" - "-I${INFINI_RT_SOURCE_DIR}/include" - "-I${INFINI_RT_SOURCE_DIR}/generated/include") + ${INFINI_RT_INCLUDE_FLAGS}) foreach(_dir IN LISTS TORCH_INCLUDE_DIRS CUDAToolkit_INCLUDE_DIRS) list(APPEND _iluvatar_dispatch_include_flags "-I${_dir}") endforeach() @@ -938,7 +941,12 @@ if(GENERATE_PYTHON_BINDINGS) set_target_properties(infiniops PROPERTIES INSTALL_RPATH "${_INFINI_OPS_INSTALL_RPATH}") set_target_properties(ops PROPERTIES INSTALL_RPATH "${_INFINI_OPS_INSTALL_RPATH}") - install(TARGETS infinirt infiniops ops DESTINATION .) + get_target_property(_infinirt_imported infinirt IMPORTED) + if(_infinirt_imported) + install(TARGETS infiniops ops DESTINATION .) + else() + install(TARGETS infinirt infiniops ops DESTINATION .) + endif() file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/__init__.py" "") install(FILES "${CMAKE_CURRENT_BINARY_DIR}/__init__.py" DESTINATION .) diff --git a/src/native/cpu/data_type_.h b/src/native/cpu/data_type_.h new file mode 100644 index 000000000..815de89a4 --- /dev/null +++ b/src/native/cpu/data_type_.h @@ -0,0 +1,6 @@ +#ifndef INFINI_OPS_CPU_DATA_TYPE__H_ +#define INFINI_OPS_CPU_DATA_TYPE__H_ + +#include + +#endif diff --git a/src/native/cpu/device_.h b/src/native/cpu/device_.h new file mode 100644 index 000000000..534e4dcd2 --- /dev/null +++ b/src/native/cpu/device_.h @@ -0,0 +1,6 @@ +#ifndef INFINI_OPS_CPU_DEVICE__H_ +#define INFINI_OPS_CPU_DEVICE__H_ + +#include + +#endif diff --git a/src/native/cpu/runtime_.h b/src/native/cpu/runtime_.h new file mode 100644 index 000000000..e62eff67e --- /dev/null +++ b/src/native/cpu/runtime_.h @@ -0,0 +1,7 @@ +#ifndef INFINI_OPS_CPU_RUNTIME_H_ +#define INFINI_OPS_CPU_RUNTIME_H_ + +#include +#include "runtime.h" + +#endif From 507b5c5674caf07d66b3a70e36e30050b13a9dcd Mon Sep 17 00:00:00 2001 From: Jiacheng Huang Date: Thu, 18 Jun 2026 10:08:38 +0800 Subject: [PATCH 06/12] style: format InfiniRT include shims --- examples/runtime_api.h | 1 + src/data_type.h | 7 ++++--- src/device.h | 3 ++- src/native/ascend/runtime_.h | 1 + src/native/cambricon/runtime_.h | 1 + src/native/cpu/caster_.h | 3 ++- src/native/cpu/runtime_.h | 1 + src/native/cuda/iluvatar/runtime_.h | 1 + src/native/cuda/metax/runtime_.h | 1 + src/native/cuda/moore/runtime_.h | 1 + src/native/cuda/nvidia/runtime_.h | 1 + 11 files changed, 16 insertions(+), 5 deletions(-) diff --git a/examples/runtime_api.h b/examples/runtime_api.h index 292b40408..1bbfdc1f3 100644 --- a/examples/runtime_api.h +++ b/examples/runtime_api.h @@ -25,6 +25,7 @@ #include "native/ascend/runtime_.h" #elif WITH_CPU #include + #include "native/cpu/ops/gemm/gemm.h" #else #error "One `WITH_*` backend must be enabled for the examples." diff --git a/src/data_type.h b/src/data_type.h index f5d53b2e1..4c7329cdc 100644 --- a/src/data_type.h +++ b/src/data_type.h @@ -1,19 +1,20 @@ #ifndef INFINI_OPS_DATA_TYPE_H_ #define INFINI_OPS_DATA_TYPE_H_ +#include + #include "common/traits.h" #include "device.h" -#include namespace infini::ops { using infini::rt::DataType; -using infini::rt::Float16; using infini::rt::BFloat16; +using infini::rt::Float16; -using infini::rt::kDataTypeToSize; using infini::rt::kDataTypeToDesc; +using infini::rt::kDataTypeToSize; using infini::rt::kStringToDataType; template diff --git a/src/device.h b/src/device.h index 0f41f647c..41a3af3b1 100644 --- a/src/device.h +++ b/src/device.h @@ -1,9 +1,10 @@ #ifndef INFINI_OPS_DEVICE_H_ #define INFINI_OPS_DEVICE_H_ -#include "common/traits.h" #include +#include "common/traits.h" + namespace infini::ops { using Device = infini::rt::Device; diff --git a/src/native/ascend/runtime_.h b/src/native/ascend/runtime_.h index 19e374ae2..ed21ad687 100644 --- a/src/native/ascend/runtime_.h +++ b/src/native/ascend/runtime_.h @@ -2,6 +2,7 @@ #define INFINI_OPS_ASCEND_RUNTIME__H_ #include + #include "runtime.h" #endif diff --git a/src/native/cambricon/runtime_.h b/src/native/cambricon/runtime_.h index a1ade0d80..1ca395d09 100644 --- a/src/native/cambricon/runtime_.h +++ b/src/native/cambricon/runtime_.h @@ -2,6 +2,7 @@ #define INFINI_OPS_CAMBRICON_RUNTIME_H_ #include + #include "runtime.h" #endif diff --git a/src/native/cpu/caster_.h b/src/native/cpu/caster_.h index 10348c0e0..3766191ed 100644 --- a/src/native/cpu/caster_.h +++ b/src/native/cpu/caster_.h @@ -1,10 +1,11 @@ #ifndef INFINI_OPS_COMMON_CPU_CASTER_H_ #define INFINI_OPS_COMMON_CPU_CASTER_H_ +#include + #include #include "caster.h" -#include namespace infini::ops { diff --git a/src/native/cpu/runtime_.h b/src/native/cpu/runtime_.h index e62eff67e..4cfb213ae 100644 --- a/src/native/cpu/runtime_.h +++ b/src/native/cpu/runtime_.h @@ -2,6 +2,7 @@ #define INFINI_OPS_CPU_RUNTIME_H_ #include + #include "runtime.h" #endif diff --git a/src/native/cuda/iluvatar/runtime_.h b/src/native/cuda/iluvatar/runtime_.h index 28f2c67bc..5983e5bbb 100644 --- a/src/native/cuda/iluvatar/runtime_.h +++ b/src/native/cuda/iluvatar/runtime_.h @@ -2,6 +2,7 @@ #define INFINI_OPS_ILUVATAR_RUNTIME_H_ #include + #include "native/cuda/iluvatar/runtime_utils.h" #include "runtime.h" diff --git a/src/native/cuda/metax/runtime_.h b/src/native/cuda/metax/runtime_.h index f952fb8b2..01e9ae74e 100644 --- a/src/native/cuda/metax/runtime_.h +++ b/src/native/cuda/metax/runtime_.h @@ -2,6 +2,7 @@ #define INFINI_OPS_METAX_RUNTIME_H_ #include + #include "native/cuda/metax/runtime_utils.h" #include "runtime.h" diff --git a/src/native/cuda/moore/runtime_.h b/src/native/cuda/moore/runtime_.h index bc960d949..14f3411b1 100644 --- a/src/native/cuda/moore/runtime_.h +++ b/src/native/cuda/moore/runtime_.h @@ -2,6 +2,7 @@ #define INFINI_OPS_MOORE_RUNTIME_H_ #include + #include "native/cuda/moore/runtime_utils.h" #include "runtime.h" diff --git a/src/native/cuda/nvidia/runtime_.h b/src/native/cuda/nvidia/runtime_.h index 5745b3ffd..d3474881d 100644 --- a/src/native/cuda/nvidia/runtime_.h +++ b/src/native/cuda/nvidia/runtime_.h @@ -2,6 +2,7 @@ #define INFINI_OPS_NVIDIA_RUNTIME_H_ #include + #include "native/cuda/nvidia/runtime_utils.h" #include "runtime.h" From b09f48c3180b55d2aa2bcb61a4101fff5281fb8c Mon Sep 17 00:00:00 2001 From: Jiacheng Huang Date: Thu, 18 Jun 2026 10:54:17 +0800 Subject: [PATCH 07/12] ci: install InfiniRT before InfiniOps tests --- .github/ci_config.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/ci_config.yml b/.github/ci_config.yml index b9979570a..3def765cc 100644 --- a/.github/ci_config.yml +++ b/.github/ci_config.yml @@ -15,7 +15,7 @@ platforms: BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 SKIP_APT: "1" PIP_INDEX_URL: https://pypi.tuna.tsinghua.edu.cn/simple - setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON + setup: pip install cmake scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_NVIDIA=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix jobs: gpu: type: unittest @@ -51,7 +51,7 @@ platforms: - /lib/firmware:/lib/firmware - /usr/src:/usr/src - /lib/modules:/lib/modules - setup: python -m pip install packaging exceptiongroup typing-extensions pygments pybind11 libclang && python -m pip install . --no-build-isolation --no-deps --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON + setup: python -m pip install cmake scikit-build-core packaging exceptiongroup typing-extensions pygments pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_ILUVATAR=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix -DCMAKE_CUDA_ARCHITECTURES=OFF -DCMAKE_CUDA_COMPILER=/usr/local/corex-4.3.0.20250624/bin/clang++ -DCUDAToolkit_ROOT=/usr/local/corex-4.3.0.20250624 && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && python -m pip install . --no-build-isolation --no-deps --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix jobs: gpu: type: unittest @@ -81,7 +81,7 @@ platforms: - "--privileged" - "--ulimit=memlock=-1" - "--ulimit=stack=67108864" - setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON + setup: pip install cmake scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_METAX=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix jobs: gpu: type: unittest @@ -109,7 +109,7 @@ platforms: PIP_INDEX_URL: https://pypi.org/simple docker_args: - "--privileged" - setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON + setup: pip install cmake scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_MOORE=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix jobs: gpu: type: unittest @@ -136,7 +136,7 @@ platforms: PIP_INDEX_URL: https://pypi.org/simple docker_args: - "--privileged" - setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON + setup: pip install cmake scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_CAMBRICON=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix jobs: gpu: type: unittest @@ -172,7 +172,7 @@ platforms: - "--group-add=video" volumes: - /opt/hyhal:/opt/hyhal:ro - setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON + setup: pip install cmake scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_HYGON=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix jobs: gpu: type: unittest @@ -209,7 +209,7 @@ platforms: - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi:ro env: ASCEND_HOME_PATH: /usr/local/Ascend/ascend-toolkit/latest - setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON + setup: pip install cmake scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_ASCEND=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix jobs: npu: type: unittest From 629e56b65fd188712e59ac9366fa67a83ab5ab1c Mon Sep 17 00:00:00 2001 From: Jiacheng Huang Date: Thu, 18 Jun 2026 11:25:59 +0800 Subject: [PATCH 08/12] ci: install InfiniRT without cmake wheel dependency --- .github/ci_config.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/ci_config.yml b/.github/ci_config.yml index 3def765cc..65b2b208f 100644 --- a/.github/ci_config.yml +++ b/.github/ci_config.yml @@ -15,7 +15,7 @@ platforms: BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 SKIP_APT: "1" PIP_INDEX_URL: https://pypi.tuna.tsinghua.edu.cn/simple - setup: pip install cmake scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_NVIDIA=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix + setup: pip install scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_NVIDIA=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix jobs: gpu: type: unittest @@ -51,7 +51,7 @@ platforms: - /lib/firmware:/lib/firmware - /usr/src:/usr/src - /lib/modules:/lib/modules - setup: python -m pip install cmake scikit-build-core packaging exceptiongroup typing-extensions pygments pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_ILUVATAR=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix -DCMAKE_CUDA_ARCHITECTURES=OFF -DCMAKE_CUDA_COMPILER=/usr/local/corex-4.3.0.20250624/bin/clang++ -DCUDAToolkit_ROOT=/usr/local/corex-4.3.0.20250624 && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && python -m pip install . --no-build-isolation --no-deps --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix + setup: python -m pip install scikit-build-core packaging exceptiongroup typing-extensions pygments pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_ILUVATAR=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix -DCMAKE_CUDA_ARCHITECTURES=OFF -DCMAKE_CUDA_COMPILER=/usr/local/corex-4.3.0.20250624/bin/clang++ -DCUDAToolkit_ROOT=/usr/local/corex-4.3.0.20250624 && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && python -m pip install . --no-build-isolation --no-deps --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix jobs: gpu: type: unittest @@ -81,7 +81,7 @@ platforms: - "--privileged" - "--ulimit=memlock=-1" - "--ulimit=stack=67108864" - setup: pip install cmake scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_METAX=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix + setup: pip install scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_METAX=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix jobs: gpu: type: unittest @@ -109,7 +109,7 @@ platforms: PIP_INDEX_URL: https://pypi.org/simple docker_args: - "--privileged" - setup: pip install cmake scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_MOORE=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix + setup: pip install scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_MOORE=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix jobs: gpu: type: unittest @@ -136,7 +136,7 @@ platforms: PIP_INDEX_URL: https://pypi.org/simple docker_args: - "--privileged" - setup: pip install cmake scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_CAMBRICON=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix + setup: pip install scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_CAMBRICON=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix jobs: gpu: type: unittest @@ -172,7 +172,7 @@ platforms: - "--group-add=video" volumes: - /opt/hyhal:/opt/hyhal:ro - setup: pip install cmake scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_HYGON=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix + setup: pip install scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_HYGON=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix jobs: gpu: type: unittest @@ -209,7 +209,7 @@ platforms: - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi:ro env: ASCEND_HOME_PATH: /usr/local/Ascend/ascend-toolkit/latest - setup: pip install cmake scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_ASCEND=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix + setup: pip install scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_ASCEND=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix jobs: npu: type: unittest From fe63ba7ae07415a9dc1a2f7b4e878b8a4e27f6e8 Mon Sep 17 00:00:00 2001 From: Jiacheng Huang Date: Thu, 18 Jun 2026 11:26:19 +0800 Subject: [PATCH 09/12] fix: keep optional wrapper overloads distinct --- tests/test_generate_wrappers.py | 57 +++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/tests/test_generate_wrappers.py b/tests/test_generate_wrappers.py index b4c97f868..931677553 100644 --- a/tests/test_generate_wrappers.py +++ b/tests/test_generate_wrappers.py @@ -62,16 +62,69 @@ class Clamp { text = "\n".join(declarations) assert ( - "MakeClamp(const Config& config, const Tensor input, " + "MakeClamp(const Config& config, Tensor input, " "const std::optional min, const std::optional max, " "Tensor out)" ) in text assert ( - "MakeClamp(const Config& config, const Tensor input, " + "MakeClamp(const Config& config, Tensor input, " "std::optional min, std::optional max, Tensor out)" ) in text +def test_operator_call_instantiations_keep_optional_scalar_and_tensor_overloads_distinct( + monkeypatch, tmp_path +): + module = _load_generator_module() + base_header = tmp_path / "clamp.h" + base_header.write_text( + """ +class Clamp { + public: + virtual void operator()(const Tensor input, const std::optional min, + const std::optional max, Tensor out) const = 0; + virtual void operator()(const Tensor input, const std::optional min, + const std::optional max, Tensor out) const = 0; +}; +""" + ) + monkeypatch.setattr(module, "_find_base_header", lambda op_name: base_header) + + operator = module._Operator( + "clamp", + constructors=[], + calls=[ + module._ParsedFunction( + [ + module._ParsedArgument("const Tensor", "input"), + module._ParsedArgument("const std::optional", "min"), + module._ParsedArgument("const std::optional", "max"), + module._ParsedArgument("Tensor", "out"), + ] + ), + module._ParsedFunction( + [ + module._ParsedArgument("const Tensor", "input"), + module._ParsedArgument("const std::optional", "min"), + module._ParsedArgument("const std::optional", "max"), + module._ParsedArgument("Tensor", "out"), + ] + ), + ], + ) + + declarations, _ = module._generate_operator_call_instantiation_entries(operator) + + text = "\n".join(declarations) + + assert ( + "Call, std::optional, Tensor>" + ) in text + assert ( + "Call, std::optional, Tensor>" + ) in text + + def test_pybind_default_implementation_uses_first_active_index(monkeypatch, tmp_path): module = _load_generator_module() base_header = tmp_path / "mul.h" From 8cee0a5dc879e55c4b6834f3f0ee7b6695124558 Mon Sep 17 00:00:00 2001 From: Jiacheng Huang Date: Thu, 18 Jun 2026 11:26:32 +0800 Subject: [PATCH 10/12] fix: pass InfiniRT includes to torch objects --- src/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9a0769494..07a22efaf 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -539,6 +539,7 @@ if(WITH_TORCH) -std=c++17 -fPIC -O0 "-I${CMAKE_CURRENT_SOURCE_DIR}" "-I${PROJECT_SOURCE_DIR}/generated" + ${INFINI_RT_INCLUDE_FLAGS} ${_torch_include_flags} ${_torch_extra_flags} -c "${_src}" -o "${_obj}" @@ -564,6 +565,7 @@ if(WITH_TORCH) PROPERTIES POSITION_INDEPENDENT_CODE ON) target_include_directories(infini_ops_torch_objs PRIVATE $ + ${INFINI_RT_INCLUDE_DIRS} ${TORCH_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}/generated) target_compile_definitions(infini_ops_torch_objs PRIVATE From 99140eb1fd5859c85e573c23156639c975b8bdec Mon Sep 17 00:00:00 2001 From: Jiacheng Huang Date: Thu, 18 Jun 2026 14:27:34 +0800 Subject: [PATCH 11/12] fix: ship InfiniRT library in Python wheel --- src/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 07a22efaf..0a1870906 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -946,6 +946,8 @@ if(GENERATE_PYTHON_BINDINGS) get_target_property(_infinirt_imported infinirt IMPORTED) if(_infinirt_imported) install(TARGETS infiniops ops DESTINATION .) + get_target_property(_infinirt_imported_location infinirt IMPORTED_LOCATION) + install(FILES "${_infinirt_imported_location}" DESTINATION .) else() install(TARGETS infinirt infiniops ops DESTINATION .) endif() From 90791a953a944c77240e4c151d921154a09c7950 Mon Sep 17 00:00:00 2001 From: Jiacheng Huang Date: Thu, 18 Jun 2026 15:14:53 +0800 Subject: [PATCH 12/12] docs: document InfiniRT dependency setup --- CONTRIBUTING.md | 21 ++++++++++++++++----- README.md | 24 ++++++++++++++++++++---- 2 files changed, 36 insertions(+), 9 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 53240b134..39b38497f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -69,13 +69,21 @@ Branch names use the format `/xxx-yyyy-zzzz`, where `` matches the P Using Nvidia as an example: ```bash -pip install .[dev] -C cmake.define.WITH_CPU=ON -C cmake.define.WITH_NVIDIA=ON +pip install .[dev] \ + -C cmake.define.INFINI_RT_ROOT=/path/to/infinirt-prefix \ + -C cmake.define.WITH_CPU=ON \ + -C cmake.define.WITH_NVIDIA=ON ``` -Auto-detection is supported for some platforms, so you can also simply run: +`/path/to/infinirt-prefix` is the InfiniRT install prefix, typically the value +used for InfiniRT's `CMAKE_INSTALL_PREFIX`. It should contain +`include/infini/rt.h` and `lib/libinfinirt.so`. + +Auto-detection is supported for some platforms, so you can also let InfiniOps +detect the device backends while still pointing it at the installed InfiniRT: ```bash -pip install .[dev] +pip install .[dev] -C cmake.define.INFINI_RT_ROOT=/path/to/infinirt-prefix ``` > `[dev]` installs optional development dependencies (e.g. `pytest`) that are not needed for production but required for development and testing. After the first install, subsequent installs only need `pip install .`. @@ -94,6 +102,7 @@ For routine development and pull requests, start with a smoke build plus the smo ```bash python -m pip install .[dev] --no-build-isolation --no-deps \ + --config-settings=cmake.define.INFINI_RT_ROOT=/path/to/infinirt-prefix \ --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON python -m pytest tests -m smoke -q ``` @@ -251,5 +260,7 @@ using T = TypeMapType(list_tag)>; 2. **Segmentation fault during tests**: Run `pytest -n 1`. 3. **`Unknown CMake command "pybind11_add_module"`**: Install pybind11 with `pip install pybind11[global]`. See the [pybind11 installation guide](https://pybind11.readthedocs.io/en/stable/installing.html). 4. **Auto-detection (`AUTO_DETECT_DEVICES`) fails**: Some machines may not expose devices in expected paths (e.g. `/dev/nvidia*`). Use explicit CMake defines instead (e.g. `-C cmake.define.WITH_NVIDIA=ON`). -5. **`bash: pytest: command not found`**: Use `python -m pytest`. -6. **`CUBLAS_STATUS_INVALID_VALUE` in `cublasSgemmStridedBatched`**: PyTorch version issue. Downgrade to `torch<=2.9.1`. +5. **InfiniRT headers or library are not found**: Set `INFINI_RT_ROOT` to the InfiniRT install prefix, or set `INFINI_RT_INCLUDE_DIRS` and `INFINI_RT_LIBRARY` explicitly. +6. **`ImportError: libinfinirt.so: cannot open shared object file`**: Reinstall InfiniOps from a build that was configured with the InfiniRT install prefix so the Python wheel can bundle the runtime library. +7. **`bash: pytest: command not found`**: Use `python -m pytest`. +8. **`CUBLAS_STATUS_INVALID_VALUE` in `cublasSgemmStridedBatched`**: PyTorch version issue. Downgrade to `torch<=2.9.1`. diff --git a/README.md b/README.md index 44a8c71f1..4a8c14232 100644 --- a/README.md +++ b/README.md @@ -8,21 +8,33 @@ InfiniOps is a high-performance, cross-platform operator library supporting mult - CMake 3.18+ - Python 3.10+ - Hardware-specific SDKs (e.g. CUDA Toolkit, MUSA Toolkit) +- Installed InfiniRT headers and library ## Installation -Install with pip (recommended): +Install InfiniRT first, then build InfiniOps with the InfiniRT install prefix: ```bash -pip install . +pip install . -C cmake.define.INFINI_RT_ROOT=/path/to/infinirt-prefix ``` -This auto-detects available platforms on supported backends. To specify platforms explicitly: +`/path/to/infinirt-prefix` is the directory passed to InfiniRT as +`CMAKE_INSTALL_PREFIX`; it should contain `include/infini/rt.h` and +`lib/libinfinirt.so`. + +InfiniOps auto-detects available platforms on supported backends. To specify +platforms explicitly: ```bash -pip install . -C cmake.define.WITH_CPU=ON -C cmake.define.WITH_NVIDIA=ON +pip install . \ + -C cmake.define.INFINI_RT_ROOT=/path/to/infinirt-prefix \ + -C cmake.define.WITH_CPU=ON \ + -C cmake.define.WITH_NVIDIA=ON ``` +The Python wheel installs the required InfiniRT shared library next to the +InfiniOps extension so `import infini.ops` can load its runtime dependency. + ### CMake Options | Option | Description | Default | @@ -36,6 +48,10 @@ pip install . -C cmake.define.WITH_CPU=ON -C cmake.define.WITH_NVIDIA=ON | `-DWITH_CAMBRICON=[ON\|OFF]` | Compile the Cambricon implementation | OFF | | `-DWITH_ASCEND=[ON\|OFF]` | Compile the Ascend implementation | OFF | | `-DAUTO_DETECT_DEVICES=[ON\|OFF]` | Auto-detect available platforms | ON | +| `-DINFINI_RT_ROOT=` | InfiniRT install prefix containing `include/` and `lib/` | `$INFINI_RT_ROOT` | +| `-DINFINI_RT_INCLUDE_DIRS=` | Explicit InfiniRT include directories | empty | +| `-DINFINI_RT_LIBRARY=` | Explicit path to `libinfinirt.so` | empty | +| `-DINFINI_RT_SOURCE_DIR=` | InfiniRT source tree for development builds | empty | If no accelerator options are provided and auto-detection finds nothing, `WITH_CPU` is enabled by default.