From 3fb8b6f7b5cdbff7982b30fda53b8bcb52472b21 Mon Sep 17 00:00:00 2001
From: gongchensu <zhuyue_134@qq.com>
Date: Fri, 24 Apr 2026 16:13:00 +0800
Subject: [PATCH 1/3] feat(hygon): add Hygon backend infrastructure

---
 .gitignore                |   2 +
 CMakeLists.txt            | 126 +++++++++++++++++++++++++++++++++++++-
 README.md                 |   4 ++
 examples/CMakeLists.txt   |   8 ++-
 src/CMakeLists.txt        |  51 ++++++++++++++-
 src/hygon/blas.h          |  40 ++++++++++++
 src/hygon/blas_utils.h    |  30 +++++++++
 src/hygon/device_.h       | 101 ++++++++++++++++++++++++++++++
 src/hygon/runtime_.h      |  44 +++++++++++++
 src/hygon/runtime_utils.h |  15 +++++
 src/pybind11_utils.h      |  35 ++++++++++-
 11 files changed, 448 insertions(+), 8 deletions(-)
 create mode 100644 src/hygon/blas.h
 create mode 100644 src/hygon/blas_utils.h
 create mode 100644 src/hygon/device_.h
 create mode 100644 src/hygon/runtime_.h
 create mode 100644 src/hygon/runtime_utils.h

diff --git a/.gitignore b/.gitignore
index 2effaff2..4a5d4b7a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,7 @@
 # Generated files
 build/
+build-*/
+cmake-build-*/
 generated/
 
 # Prerequisites
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 91c2b015..677511cd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,6 +11,7 @@ set(PYBIND11_ENABLE_EXTRAS ON)
 option(WITH_CPU "Enable CPU backend" OFF)
 option(WITH_NVIDIA "Enable CUDA backend" OFF)
 option(WITH_ILUVATAR "Enable Iluvatar GPU backend" OFF)
+option(WITH_HYGON "Enable Hygon GPU backend" OFF)
 option(WITH_METAX "Enable MetaX backend" OFF)
 option(WITH_CAMBRICON "Enable Cambricon backend" OFF)
 option(WITH_MOORE "Enable Moore backend" OFF)
@@ -29,6 +30,31 @@ option(AUTO_DETECT_DEVICES "Automatically detect available devices" OFF)
 option(AUTO_DETECT_BACKENDS "Automatically detect available backends" OFF)
 option(GENERATE_PYTHON_BINDINGS "Generate Python bindings" OFF)
 
+set(_DEFAULT_HYGON_DTK_ROOT "/opt/dtk")
+
+function(_infiniops_find_hygon_cuda_root out_var dtk_root)
+    set(_candidates
+        "${dtk_root}/cuda"
+        "${dtk_root}/cuda/cuda"
+    )
+
+    file(GLOB _versioned_cuda_dirs LIST_DIRECTORIES true "${dtk_root}/cuda/cuda-*")
+    if(_versioned_cuda_dirs)
+        list(SORT _versioned_cuda_dirs)
+        list(REVERSE _versioned_cuda_dirs)
+        list(APPEND _candidates ${_versioned_cuda_dirs})
+    endif()
+
+    foreach(_candidate IN LISTS _candidates)
+        if(EXISTS "${_candidate}/bin/nvcc")
+            set(${out_var} "${_candidate}" PARENT_SCOPE)
+            return()
+        endif()
+    endforeach()
+
+    set(${out_var} "" PARENT_SCOPE)
+endfunction()
+
 if(AUTO_DETECT_DEVICES)
     message(STATUS "Auto-detecting available devices...")
 
@@ -48,6 +74,21 @@ if(AUTO_DETECT_DEVICES)
         message(STATUS "Auto-detected Iluvatar environment.")
     endif()
 
+    set(_hygon_detected FALSE)
+    if(DEFINED ENV{DTK_ROOT} AND NOT "$ENV{DTK_ROOT}" STREQUAL "")
+        set(_hygon_detected TRUE)
+    else()
+        _infiniops_find_hygon_cuda_root(_HYGON_CUDA_DETECT_ROOT "${_DEFAULT_HYGON_DTK_ROOT}")
+        if(_HYGON_CUDA_DETECT_ROOT)
+            set(_hygon_detected TRUE)
+        endif()
+    endif()
+
+    if(_hygon_detected)
+        set(WITH_HYGON ON)
+        message(STATUS "Auto-detected Hygon environment.")
+    endif()
+
     if(DEFINED ENV{MACA_PATH})
         set(WITH_METAX ON)
         message(STATUS "Auto-detected MetaX environment from MACA_PATH")
@@ -132,6 +173,17 @@ if(WITH_TORCH)
         OUTPUT_STRIP_TRAILING_WHITESPACE
     )
 
+    execute_process(
+        COMMAND ${Python_EXECUTABLE} -c "import pathlib, torch; p = pathlib.Path(torch.__file__).resolve().parent.parent / 'torch.libs'; print(str(p) if p.exists() else '')"
+        OUTPUT_VARIABLE _torch_private_lib_dir
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+
+    set(TORCH_RUNTIME_DIRS ${_torch_lib_dirs})
+    if(_torch_private_lib_dir)
+        list(APPEND TORCH_RUNTIME_DIRS ${_torch_private_lib_dir})
+    endif()
+
     find_library(TORCH_LIB torch HINTS ${_torch_lib_dirs} REQUIRED)
     find_library(TORCH_CPU_LIB torch_cpu HINTS ${_torch_lib_dirs} REQUIRED)
     find_library(C10_LIB c10 HINTS ${_torch_lib_dirs} REQUIRED)
@@ -181,14 +233,14 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)
 
 # Only one CUDA-like GPU backend can be enabled at a time.
 set(_gpu_backend_count 0)
-foreach(_gpu_backend WITH_NVIDIA WITH_ILUVATAR WITH_METAX WITH_MOORE WITH_ASCEND)
+foreach(_gpu_backend WITH_NVIDIA WITH_ILUVATAR WITH_HYGON WITH_METAX WITH_MOORE WITH_ASCEND)
     if(${_gpu_backend})
         math(EXPR _gpu_backend_count "${_gpu_backend_count} + 1")
     endif()
 endforeach()
 
 if(_gpu_backend_count GREATER 1)
-    message(FATAL_ERROR "`WITH_NVIDIA`, `WITH_ILUVATAR`, `WITH_METAX`, `WITH_MOORE`, and `WITH_ASCEND` are mutually exclusive. Build one GPU backend at a time.")
+    message(FATAL_ERROR "`WITH_NVIDIA`, `WITH_ILUVATAR`, `WITH_HYGON`, `WITH_METAX`, `WITH_MOORE`, and `WITH_ASCEND` are mutually exclusive. Build one GPU backend at a time.")
 endif()
 
 if(WITH_NVIDIA)
@@ -227,6 +279,70 @@ if(WITH_ILUVATAR)
     add_compile_options($<$<COMPILE_LANGUAGE:CUDA>:-x$<SEMICOLON>ivcore>)
 endif()
 
+if(WITH_HYGON)
+    add_compile_definitions(WITH_HYGON=1)
+    set(DTK_ROOT $ENV{DTK_ROOT})
+    if(NOT DTK_ROOT)
+        set(DTK_ROOT "${_DEFAULT_HYGON_DTK_ROOT}")
+    endif()
+    if(NOT EXISTS "${DTK_ROOT}")
+        message(FATAL_ERROR "`WITH_HYGON` is `ON` but `DTK_ROOT` (`${DTK_ROOT}`) does not exist.")
+    endif()
+
+    set(_HYGON_ARCH_DEFAULT "gfx906")
+    if(DEFINED ENV{HYGON_ARCH} AND NOT "$ENV{HYGON_ARCH}" STREQUAL "")
+        set(_HYGON_ARCH_DEFAULT "$ENV{HYGON_ARCH}")
+    else()
+        find_program(HYGON_ROCMINFO_EXECUTABLE NAMES rocminfo HINTS "${DTK_ROOT}/bin")
+        if(HYGON_ROCMINFO_EXECUTABLE)
+            execute_process(
+                COMMAND ${HYGON_ROCMINFO_EXECUTABLE}
+                OUTPUT_VARIABLE _HYGON_ROCMINFO_OUTPUT
+                ERROR_QUIET
+                OUTPUT_STRIP_TRAILING_WHITESPACE
+            )
+            string(REGEX MATCH "gfx[0-9]+" _HYGON_ARCH_AUTO "${_HYGON_ROCMINFO_OUTPUT}")
+            if(_HYGON_ARCH_AUTO)
+                set(_HYGON_ARCH_DEFAULT "${_HYGON_ARCH_AUTO}")
+            endif()
+        endif()
+    endif()
+
+    set(HYGON_ARCH "${_HYGON_ARCH_DEFAULT}" CACHE STRING "Hygon GPU architecture")
+    _infiniops_find_hygon_cuda_root(HYGON_CUDA_ROOT "${DTK_ROOT}")
+
+    if(NOT HYGON_CUDA_ROOT)
+        message(FATAL_ERROR "`WITH_HYGON` is `ON` but no DTK `nvcc` was found under `${DTK_ROOT}`. Checked `${DTK_ROOT}/cuda/bin/nvcc`, `${DTK_ROOT}/cuda/cuda/bin/nvcc`, and `${DTK_ROOT}/cuda/cuda-*/bin/nvcc`.")
+    endif()
+
+    set(CMAKE_CUDA_COMPILER "${HYGON_CUDA_ROOT}/bin/nvcc" CACHE FILEPATH "Hygon CUDA compiler (DTK nvcc)")
+    set(CUDAToolkit_ROOT "${HYGON_CUDA_ROOT}" CACHE PATH "Hygon CUDA toolkit root")
+    set(CMAKE_CUDA_ARCHITECTURES OFF CACHE STRING "Disable default CUDA arch flags for Hygon" FORCE)
+    set(CMAKE_CUDA_FLAGS "-std=c++17 -fPIC -arch=${HYGON_ARCH} -Wno-return-type -Wno-error=unused-private-field" CACHE STRING "Hygon CUDA flags")
+    set(CMAKE_CUDA_SEPARABLE_COMPILATION OFF CACHE BOOL "Disable RDC for Hygon")
+
+    # DTK's nvcc wrapper derives its toolkit root from `CUDA_PATH`.
+    set(ENV{CUDA_PATH} "${HYGON_CUDA_ROOT}")
+    set(ENV{CUDA_HOME} "${HYGON_CUDA_ROOT}")
+
+    # DTK's nvcc wrapper may invoke `nvcc` by name during compiler checks.
+    set(ENV{PATH} "${HYGON_CUDA_ROOT}/bin:$ENV{PATH}")
+
+    # The actual Ninja build runs in fresh processes. Keep a launcher command
+    # for CUDA-backed Python bindings that need the DTK wrapper environment.
+    set(_HYGON_RULE_LAUNCH_ENV
+        "${CMAKE_COMMAND} -E env CUDA_PATH=${HYGON_CUDA_ROOT} CUDA_HOME=${HYGON_CUDA_ROOT} PATH=${HYGON_CUDA_ROOT}/bin:$ENV{PATH}")
+
+    include_directories("${DTK_ROOT}/include")
+    include_directories("${HYGON_CUDA_ROOT}/include")
+    link_directories("${DTK_ROOT}/lib")
+    link_directories("${HYGON_CUDA_ROOT}/lib64")
+
+    message(STATUS "Hygon: CUDA compiler ${CMAKE_CUDA_COMPILER}, arch ${HYGON_ARCH}, DTK root ${DTK_ROOT}")
+    enable_language(CUDA)
+    find_package(CUDAToolkit REQUIRED)
+endif()
+
 if(WITH_METAX)
     add_compile_definitions(WITH_METAX=1)
 
@@ -310,7 +426,7 @@ if(WITH_ASCEND)
 endif()
 
 # If all other platforms are not enabled, CPU is enabled by default.
-if(NOT WITH_NVIDIA AND NOT WITH_ILUVATAR AND NOT WITH_METAX AND NOT WITH_MOORE AND NOT WITH_CAMBRICON AND NOT WITH_ASCEND)
+if(NOT WITH_NVIDIA AND NOT WITH_ILUVATAR AND NOT WITH_HYGON AND NOT WITH_METAX AND NOT WITH_MOORE AND NOT WITH_CAMBRICON AND NOT WITH_ASCEND)
     add_compile_definitions(WITH_CPU=1)
 endif()
 
@@ -318,6 +434,10 @@ if(WITH_METAX OR WITH_MOORE)
     set(PYBIND11_ENABLE_EXTRAS OFF)
 endif()
 
+if(WITH_HYGON AND NOT EXISTS "${DTK_ROOT}/llvm/lib/LLVMgold.so")
+    set(PYBIND11_ENABLE_EXTRAS OFF)
+endif()
+
 add_subdirectory(src)
 
 add_subdirectory(examples)
diff --git a/README.md b/README.md
index 6b9fc6f6..01e7f00d 100644
--- a/README.md
+++ b/README.md
@@ -31,12 +31,16 @@ pip install . -C cmake.define.WITH_CPU=ON -C cmake.define.WITH_NVIDIA=ON
 | `-DWITH_NVIDIA=[ON\|OFF]` | Compile the Nvidia implementation | OFF |
 | `-DWITH_METAX=[ON\|OFF]` | Compile the MetaX implementation | OFF |
 | `-DWITH_ILUVATAR=[ON\|OFF]` | Compile the Iluvatar implementation | OFF |
+| `-DWITH_HYGON=[ON\|OFF]` | Compile the Hygon implementation | OFF |
 | `-DWITH_MOORE=[ON\|OFF]` | Compile the Moore implementation | OFF |
 | `-DWITH_CAMBRICON=[ON\|OFF]` | Compile the Cambricon implementation | OFF |
+| `-DWITH_ASCEND=[ON\|OFF]` | Compile the Ascend implementation | OFF |
 | `-DAUTO_DETECT_DEVICES=[ON\|OFF]` | Auto-detect available platforms | ON |
 
 If no accelerator options are provided and auto-detection finds nothing, `WITH_CPU` is enabled by default.
 
+For Hygon builds, set `DTK_ROOT` to the DTK installation root if it is not installed at `/opt/dtk`. You can override the default DCU arch with `-DHYGON_ARCH=<arch>` when configuring CMake.
+
 ## Contributing
 
 See [CONTRIBUTING.md](CONTRIBUTING.md) for code style, commit conventions, PR workflow, development guide, and troubleshooting.
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 68ebc1b5..18f70090 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -9,8 +9,14 @@ foreach(source_file ${EXAMPLE_SOURCES})
     target_link_libraries(${example_name} PRIVATE infiniops)
 
     target_include_directories(${example_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
-    
+
     get_filename_component(example_dir ${source_file} DIRECTORY)
 
     target_include_directories(${example_name} PRIVATE ${example_dir})
+
+    if(WITH_TORCH)
+        foreach(_torch_dir ${TORCH_RUNTIME_DIRS})
+            target_link_options(${example_name} PRIVATE "LINKER:-rpath-link,${_torch_dir}")
+        endforeach()
+    endif()
 endforeach()
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 32c92949..8b24420d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -77,6 +77,34 @@ if(WITH_ILUVATAR)
     list(APPEND DEVICE_LIST "iluvatar")
 endif()
 
+if(WITH_HYGON)
+    set(HYGON_PATTERNS
+        "cuda/*.cc"
+        "cuda/*.cpp"
+        "cuda/*.cu"
+        "hygon/*.cc"
+        "hygon/*.cpp"
+        "hygon/*.cu"
+    )
+
+    file(GLOB_RECURSE HYGON_SOURCES CONFIGURE_DEPENDS ${HYGON_PATTERNS})
+
+    enable_language(CUDA)
+
+    target_compile_definitions(infiniops PUBLIC WITH_HYGON=1)
+    target_sources(infiniops PRIVATE ${HYGON_SOURCES})
+
+    find_package(CUDAToolkit REQUIRED)
+    target_link_libraries(infiniops PUBLIC CUDA::cudart CUDA::cublas)
+
+    set_target_properties(infiniops PROPERTIES
+        CUDA_STANDARD 17
+        CUDA_STANDARD_REQUIRED ON
+    )
+
+    list(APPEND DEVICE_LIST "hygon")
+endif()
+
 if(WITH_METAX)
     set(METAX_PATTERNS
         "cuda/*.cc"
@@ -343,7 +371,7 @@ if(GENERATE_PYTHON_BINDINGS)
     set(PYBIND11_SOURCES "${PROJECT_SOURCE_DIR}/generated/bindings/ops.cc")
 
     # TODO: There might be a better solution.
-    if(WITH_NVIDIA OR WITH_ILUVATAR)
+    if(WITH_NVIDIA OR WITH_ILUVATAR OR WITH_HYGON)
         set_source_files_properties(${PYBIND11_SOURCES} PROPERTIES LANGUAGE CUDA)
     endif()
 
@@ -370,6 +398,13 @@ if(GENERATE_PYTHON_BINDINGS)
         pybind11_add_module(ops NO_EXTRAS ${PYBIND11_SOURCES})
     endif()
 
+    if(WITH_HYGON)
+        set_target_properties(ops PROPERTIES
+            RULE_LAUNCH_COMPILE "${_HYGON_RULE_LAUNCH_ENV}"
+            RULE_LAUNCH_LINK "${_HYGON_RULE_LAUNCH_ENV}"
+        )
+    endif()
+
     target_include_directories(ops PRIVATE ${PROJECT_SOURCE_DIR})
     target_link_libraries(ops PRIVATE infiniops)
 
@@ -383,9 +418,19 @@ if(GENERATE_PYTHON_BINDINGS)
         target_link_libraries(ops PRIVATE
             -Wl,--whole-archive no_workspace_kernel -Wl,--no-whole-archive)
     endif()
+    set(_INFINIOPS_INSTALL_RPATH "$ORIGIN")
+    if(WITH_TORCH)
+        list(APPEND _INFINIOPS_INSTALL_RPATH ${TORCH_RUNTIME_DIRS})
+    endif()
+
+    if(WITH_HYGON)
+        list(APPEND _INFINIOPS_INSTALL_RPATH
+            "${HYGON_CUDA_ROOT}/lib64"
+        )
+    endif()
 
-    set_target_properties(infiniops PROPERTIES INSTALL_RPATH "$ORIGIN")
-    set_target_properties(ops PROPERTIES INSTALL_RPATH "$ORIGIN")
+    set_target_properties(infiniops PROPERTIES INSTALL_RPATH "${_INFINIOPS_INSTALL_RPATH}")
+    set_target_properties(ops PROPERTIES INSTALL_RPATH "${_INFINIOPS_INSTALL_RPATH}")
 
     install(TARGETS infiniops ops DESTINATION .)
 
diff --git a/src/hygon/blas.h b/src/hygon/blas.h
new file mode 100644
index 00000000..1a2c3d07
--- /dev/null
+++ b/src/hygon/blas.h
@@ -0,0 +1,40 @@
+#ifndef INFINI_OPS_HYGON_BLAS_H_
+#define INFINI_OPS_HYGON_BLAS_H_
+
+#include <utility>
+
+// clang-format off
+#include "cublas_v2.h"
+// clang-format on
+
+#include "cuda/blas.h"
+#include "data_type.h"
+#include "hygon/blas_utils.h"
+#include "hygon/runtime_.h"
+
+namespace infini::ops {
+
+template <>
+struct Blas<Device::Type::kHygon> : public Runtime<Device::Type::kHygon> {
+  using BlasHandle = cublasHandle_t;
+
+  static constexpr auto BLAS_OP_N = CUBLAS_OP_N;
+
+  static constexpr auto BLAS_OP_T = CUBLAS_OP_T;
+
+  static constexpr auto BLAS_GEMM_DEFAULT = CUBLAS_GEMM_DEFAULT_TENSOR_OP;
+
+  static constexpr auto BlasCreate = cublasCreate;
+
+  static constexpr auto BlasSetStream = cublasSetStream;
+
+  static constexpr auto BlasDestroy = cublasDestroy;
+
+  static constexpr auto BlasGemmStridedBatchedEx = [](auto&&... args) {
+    return cublasGemmStridedBatchedEx(std::forward<decltype(args)>(args)...);
+  };
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/hygon/blas_utils.h b/src/hygon/blas_utils.h
new file mode 100644
index 00000000..4123372f
--- /dev/null
+++ b/src/hygon/blas_utils.h
@@ -0,0 +1,30 @@
+#ifndef INFINI_OPS_HYGON_BLAS_UTILS_H_
+#define INFINI_OPS_HYGON_BLAS_UTILS_H_
+
+// clang-format off
+#include "cublas_v2.h"
+// clang-format on
+
+#include "cuda/blas_utils.h"
+#include "data_type.h"
+
+namespace infini::ops {
+
+template <>
+struct BlasUtils<Device::Type::kHygon> {
+  static auto GetDataType(DataType dtype) {
+    if (dtype == DataType::kFloat16) return CUDA_R_16F;
+    if (dtype == DataType::kBFloat16) return CUDA_R_16BF;
+    return CUDA_R_32F;
+  }
+
+  static auto GetComputeType(DataType dtype) {
+    if (dtype == DataType::kFloat16 || dtype == DataType::kBFloat16)
+      return CUBLAS_COMPUTE_32F;
+    return CUBLAS_COMPUTE_32F;
+  }
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/hygon/device_.h b/src/hygon/device_.h
new file mode 100644
index 00000000..7df5dbd8
--- /dev/null
+++ b/src/hygon/device_.h
@@ -0,0 +1,101 @@
+#ifndef INFINI_OPS_HYGON_DEVICE__H_
+#define INFINI_OPS_HYGON_DEVICE__H_
+
+#include <cassert>
+#include <vector>
+
+// clang-format off
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+// clang-format on
+
+#include "cuda/caster.cuh"
+#include "data_type.h"
+#include "device.h"
+
+namespace infini::ops {
+
+template <>
+struct DeviceEnabled<Device::Type::kHygon> : std::true_type {};
+
+// Some DTK toolchains expose the underlying bf16 structs but gate the
+// nv_bfloat16 typedefs behind CUDA_NO_BFLOAT16.
+using cuda_bfloat16 = __nv_bfloat16;
+
+using cuda_bfloat162 = __nv_bfloat162;
+
+namespace detail {
+
+template <>
+struct ToFloat<Device::Type::kHygon, half> {
+  __host__ __device__ float operator()(half x) { return __half2float(x); }
+};
+
+template <>
+struct ToFloat<Device::Type::kHygon, __nv_bfloat16> {
+  __host__ __device__ float operator()(__nv_bfloat16 x) {
+    return __bfloat162float(x);
+  }
+};
+
+template <>
+struct FromFloat<Device::Type::kHygon, half> {
+  __host__ __device__ half operator()(float f) { return __float2half(f); }
+};
+
+template <>
+struct FromFloat<Device::Type::kHygon, __nv_bfloat16> {
+  __host__ __device__ __nv_bfloat16 operator()(float f) {
+    return __float2bfloat16(f);
+  }
+};
+
+}  // namespace detail
+
+template <>
+struct TypeMap<Device::Type::kHygon, DataType::kFloat16> {
+  using type = half;
+};
+
+template <>
+struct TypeMap<Device::Type::kHygon, DataType::kBFloat16> {
+  using type = __nv_bfloat16;
+};
+
+// Caches `cudaDeviceProp` per device, initialized once at first access.
+class DevicePropertyCache {
+ public:
+  static const cudaDeviceProp& GetCurrentDeviceProps() {
+    int device_id = 0;
+    cudaGetDevice(&device_id);
+    return GetDeviceProps(device_id);
+  }
+
+  static const cudaDeviceProp& GetDeviceProps(int device_id) {
+    static std::vector<cudaDeviceProp> cache = []() {
+      int count = 0;
+      cudaGetDeviceCount(&count);
+      if (count == 0) return std::vector<cudaDeviceProp>{};
+      std::vector<cudaDeviceProp> props(count);
+      for (int i = 0; i < count; ++i) {
+        cudaGetDeviceProperties(&props[i], i);
+      }
+      return props;
+    }();
+
+    assert(device_id >= 0 && device_id < static_cast<int>(cache.size()));
+    return cache[device_id];
+  }
+};
+
+inline int QueryMaxThreadsPerBlock() {
+  return DevicePropertyCache::GetCurrentDeviceProps().maxThreadsPerBlock;
+}
+
+template <>
+struct Caster<Device::Type::kHygon> : CudaCasterImpl<Device::Type::kHygon> {};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/hygon/runtime_.h b/src/hygon/runtime_.h
new file mode 100644
index 00000000..356e0424
--- /dev/null
+++ b/src/hygon/runtime_.h
@@ -0,0 +1,44 @@
+#ifndef INFINI_OPS_HYGON_RUNTIME_H_
+#define INFINI_OPS_HYGON_RUNTIME_H_
+
+#include <utility>
+
+// clang-format off
+#include <cuda_runtime.h>
+// clang-format on
+
+#include "cuda/runtime.h"
+#include "hygon/device_.h"
+#include "hygon/runtime_utils.h"
+
+namespace infini::ops {
+
+template <>
+struct Runtime<Device::Type::kHygon>
+    : CudaRuntime<Runtime<Device::Type::kHygon>> {
+  using Stream = cudaStream_t;
+
+  static constexpr Device::Type kDeviceType = Device::Type::kHygon;
+
+  static constexpr auto Malloc = [](auto&&... args) {
+    return cudaMalloc(std::forward<decltype(args)>(args)...);
+  };
+
+  static constexpr auto Memcpy = cudaMemcpy;
+
+  static constexpr auto Free = [](auto&&... args) {
+    return cudaFree(std::forward<decltype(args)>(args)...);
+  };
+
+  static constexpr auto MemcpyHostToDevice = cudaMemcpyHostToDevice;
+
+  static constexpr auto MemcpyDeviceToHost = cudaMemcpyDeviceToHost;
+
+  static constexpr auto Memset = cudaMemset;
+};
+
+static_assert(Runtime<Device::Type::kHygon>::Validate());
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/hygon/runtime_utils.h b/src/hygon/runtime_utils.h
new file mode 100644
index 00000000..afa84265
--- /dev/null
+++ b/src/hygon/runtime_utils.h
@@ -0,0 +1,15 @@
+#ifndef INFINI_OPS_HYGON_RUNTIME_UTILS_H_
+#define INFINI_OPS_HYGON_RUNTIME_UTILS_H_
+
+#include "cuda/runtime_utils.h"
+#include "hygon/device_.h"
+
+namespace infini::ops {
+
+template <>
+struct RuntimeUtils<Device::Type::kHygon>
+    : CudaRuntimeUtils<QueryMaxThreadsPerBlock> {};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/pybind11_utils.h b/src/pybind11_utils.h
index f13d3116..555645ef 100644
--- a/src/pybind11_utils.h
+++ b/src/pybind11_utils.h
@@ -1,8 +1,10 @@
 #ifndef INFINI_OPS_PYBIND11_UTILS_H_
 #define INFINI_OPS_PYBIND11_UTILS_H_
 
+#include <algorithm>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
+#include <stdexcept>
 
 #include "tensor.h"
 #include "torch/device_.h"
@@ -38,7 +40,38 @@ inline Device::Type DeviceTypeFromString(const std::string& name) {
     return it->second;
   }
 
-  return Device::TypeFromString(name);
+  std::vector<std::string> supported_names;
+
+  for (const auto& [torch_name, device_type] : kTorchNameToTypes) {
+    const auto internal_name = std::string{Device::StringFromType(device_type)};
+
+    if (name == internal_name) {
+      return device_type;
+    }
+
+    supported_names.push_back(torch_name);
+    supported_names.push_back(internal_name);
+  }
+
+  std::sort(supported_names.begin(), supported_names.end());
+  supported_names.erase(
+      std::unique(supported_names.begin(), supported_names.end()),
+      supported_names.end());
+
+  std::string message =
+      "Unsupported device type `" + name +
+      "` for this InfiniOps build. Supported device names: ";
+
+  for (std::size_t i = 0; i < supported_names.size(); ++i) {
+    if (i != 0) {
+      message += ", ";
+    }
+    message += supported_names[i];
+  }
+
+  message += ". Rebuild InfiniOps with the matching backend enabled.";
+
+  throw std::invalid_argument(message);
 }
 
 inline Tensor TensorFromPybind11Handle(py::handle obj) {

From a7a1b9a1d85357124d5eceae82c0494eb1db7e7d Mon Sep 17 00:00:00 2001
From: gongchensu <zhuyue_134@qq.com>
Date: Fri, 24 Apr 2026 16:17:43 +0800
Subject: [PATCH 2/3] feat(hygon-add): add Hygon backend support for Add

---
 src/hygon/add/kernel.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 src/hygon/add/kernel.h

diff --git a/src/hygon/add/kernel.h b/src/hygon/add/kernel.h
new file mode 100644
index 00000000..3c17b482
--- /dev/null
+++ b/src/hygon/add/kernel.h
@@ -0,0 +1,18 @@
+#ifndef INFINI_OPS_HYGON_ADD_KERNEL_H_
+#define INFINI_OPS_HYGON_ADD_KERNEL_H_
+
+#include "cuda/add/kernel.h"
+#include "hygon/runtime_.h"
+
+namespace infini::ops {
+
+template <>
+class Operator<Add, Device::Type::kHygon>
+    : public CudaAdd<Runtime<Device::Type::kHygon>> {
+ public:
+  using CudaAdd<Runtime<Device::Type::kHygon>>::CudaAdd;
+};
+
+}  // namespace infini::ops
+
+#endif

From c8d8b56ba64b00a03f2cd19812a55f09b10539e9 Mon Sep 17 00:00:00 2001
From: gongchensu <zhuyue_134@qq.com>
Date: Fri, 24 Apr 2026 16:18:12 +0800
Subject: [PATCH 3/3] feat(hygon-gemm): add Hygon backend support for Gemm

---
 examples/runtime_api.h  |  5 +++++
 src/hygon/gemm/cublas.h | 18 ++++++++++++++++++
 2 files changed, 23 insertions(+)
 create mode 100644 src/hygon/gemm/cublas.h

diff --git a/examples/runtime_api.h b/examples/runtime_api.h
index c1ef3161..accb2341 100644
--- a/examples/runtime_api.h
+++ b/examples/runtime_api.h
@@ -10,6 +10,9 @@
 #elif WITH_ILUVATAR
 #include "cuda/iluvatar/gemm/cublas.h"
 #include "cuda/iluvatar/runtime_.h"
+#elif WITH_HYGON
+#include "hygon/gemm/cublas.h"
+#include "hygon/runtime_.h"
 #elif WITH_METAX
 #include "cuda/metax/gemm/mcblas.h"
 #include "cuda/metax/runtime_.h"
@@ -35,6 +38,8 @@ namespace infini::ops {
 using DefaultRuntimeUtils = Runtime<Device::Type::kNvidia>;
 #elif WITH_ILUVATAR
 using DefaultRuntimeUtils = Runtime<Device::Type::kIluvatar>;
+#elif WITH_HYGON
+using DefaultRuntimeUtils = Runtime<Device::Type::kHygon>;
 #elif WITH_METAX
 using DefaultRuntimeUtils = Runtime<Device::Type::kMetax>;
 #elif WITH_CAMBRICON
diff --git a/src/hygon/gemm/cublas.h b/src/hygon/gemm/cublas.h
new file mode 100644
index 00000000..fefb2621
--- /dev/null
+++ b/src/hygon/gemm/cublas.h
@@ -0,0 +1,18 @@
+#ifndef INFINI_OPS_HYGON_GEMM_CUBLAS_H_
+#define INFINI_OPS_HYGON_GEMM_CUBLAS_H_
+
+#include "cuda/gemm/blas.h"
+#include "hygon/blas.h"
+
+namespace infini::ops {
+
+template <>
+class Operator<Gemm, Device::Type::kHygon>
+    : public BlasGemm<Blas<Device::Type::kHygon>> {
+ public:
+  using BlasGemm<Blas<Device::Type::kHygon>>::BlasGemm;
+};
+
+}  // namespace infini::ops
+
+#endif