From 421f6830eff3b85bdceabf9a8ea81a1c59e7b710 Mon Sep 17 00:00:00 2001
From: kilinchange <kilinchange@163.com>
Date: Fri, 10 Apr 2026 01:43:10 +0000
Subject: [PATCH 01/12] feat(maca): add MetaX MACA backend skeleton and minimal
 kernels
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce a MACA (MetaX 沐曦) backend plugged into the DeviceGuardImpl /
kernel dispatcher framework, targeting the minimal kernel set needed to
validate single-card fp32 training (e.g. mnist) end-to-end:

- Build system: USE_MACA / USE_MCCL options, mxcc toolchain override,
  mxomp linkage under USE_OMP, .maca kernel library with -x maca, and
  backend-exclusive SRC filtering so non-target backends are not pulled in.
- Device enum: add Device::DeviceType::kMACA (kCount bumped to 3),
  IsMACA(), and a three-way ToString() switch.
- common/maca: MACA_CHECK / MCBLAS_CHECK / MCCL_CHECK macros and the
  kernel_helper.cuh template library (Cast/Neg/Sin/Pow/Add/Sub/Mul/Div/
  Max/Min/Fma/fastAtomicAdd) plus a cub_compat.cuh shim pinning CubSumOp/
  CubMaxOp/CubMinOp to the pre-2.8 CUB API that MACA ships.
- core/runtime/maca: MacaStream / MacaEvent / MacaBlasHandle derived from
  core::Stream / Event / BlasHandle, and MacaGuardImpl mirroring
  CudaGuardImpl (mcInit(0) in ctor, call_once'd default stream/handle
  caches, full stream/event/sync/blas/memory surface). Mempool watermark
  hooks are stubs pending SDK verification.
- datatype.h / tensor.cc / nn/init.cc: add USE_MACA branches to map
  kBFLOAT16 / kFLOAT16 to __maca_bfloat16 / __half, specialize the
  is_floating_point_ext / is_arithmetic_ext / LargerType traits, route
  Fill casts through float under real device backends to dodge the
  ambiguous __half(int) constructor on MACA, and wire Arange for bf16/fp16.
- kernels/maca: mechanically port the minimal 5-kernel slice
  (elementwise, linear, fill, no_op, accumulate_grad) from their .cu
  counterparts, switching blas/stream acquisition to the new
  GetDeviceGuardImpl()->GetBlasHandle()/GetStream() idiom.

The MCCL collective backend and the remaining 15 kernels (which are
required for gpt2 / DDP) will land in a follow-up commit.
---
 CMakeLists.txt                                |  121 +-
 infini_train/include/autocast.h               |    3 +-
 .../include/common/maca/common_maca.h         |   43 +
 .../include/common/maca/cub_compat.cuh        |   14 +
 .../include/common/maca/kernel_helper.cuh     |  311 ++++
 infini_train/include/device.h                 |    4 +-
 .../src/core/runtime/maca/maca_guard_impl.cc  |  279 ++++
 .../src/core/runtime/maca/maca_guard_impl.h   |   83 ++
 .../core/runtime/maca/maca_runtime_common.cc  |   60 +
 .../core/runtime/maca/maca_runtime_common.h   |   59 +
 infini_train/src/device.cc                    |   18 +-
 .../src/kernels/maca/accumulate_grad.maca     |   94 ++
 .../src/kernels/maca/elementwise.maca         | 1269 +++++++++++++++++
 infini_train/src/kernels/maca/fill.maca       |   45 +
 infini_train/src/kernels/maca/linear.maca     |  508 +++++++
 infini_train/src/kernels/maca/no_op.maca      |   30 +
 16 files changed, 2936 insertions(+), 5 deletions(-)
 create mode 100644 infini_train/include/common/maca/common_maca.h
 create mode 100644 infini_train/include/common/maca/cub_compat.cuh
 create mode 100644 infini_train/include/common/maca/kernel_helper.cuh
 create mode 100644 infini_train/src/core/runtime/maca/maca_guard_impl.cc
 create mode 100644 infini_train/src/core/runtime/maca/maca_guard_impl.h
 create mode 100644 infini_train/src/core/runtime/maca/maca_runtime_common.cc
 create mode 100644 infini_train/src/core/runtime/maca/maca_runtime_common.h
 create mode 100644 infini_train/src/kernels/maca/accumulate_grad.maca
 create mode 100644 infini_train/src/kernels/maca/elementwise.maca
 create mode 100644 infini_train/src/kernels/maca/fill.maca
 create mode 100644 infini_train/src/kernels/maca/linear.maca
 create mode 100644 infini_train/src/kernels/maca/no_op.maca

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 57e97ddc..262b97f3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,9 +1,28 @@
 cmake_minimum_required(VERSION 3.28)
 
+# Platforms
 option(USE_CUDA "Support NVIDIA CUDA" OFF)
+option(USE_MACA "Support MetaX MACA" OFF)
+
 option(PROFILE_MODE "ENABLE PROFILE MODE" OFF)
 option(USE_OMP "Use OpenMP as backend for Eigen" ON)
-option(USE_NCCL "Build project for distributed running" ON)
+option(USE_NCCL "Build project for distributed running on CUDA using NCCL" ON)
+option(USE_MCCL "Build project for distributed running on MACA using MCCL" ON)
+
+# ------------------------------------------------------------------------------
+# MACA toolchain override (must happen before project())
+# ------------------------------------------------------------------------------
+# When targeting MetaX MACA, the C/C++ compiler must be mxcc so that .maca
+# sources and device code can be compiled by the MACA toolchain.
+if(USE_MACA)
+  set(MACA_PATH $ENV{MACA_PATH})
+  if(NOT MACA_PATH)
+    message(FATAL_ERROR "USE_MACA=ON but environment variable MACA_PATH is not set. "
+                        "Please export MACA_PATH (e.g. /opt/maca) before configuring.")
+  endif()
+  set(CMAKE_C_COMPILER   "${MACA_PATH}/mxgpu_llvm/bin/mxcc")
+  set(CMAKE_CXX_COMPILER "${MACA_PATH}/mxgpu_llvm/bin/mxcc")
+endif()
 
 project(infini_train VERSION 0.5.0 LANGUAGES CXX)
 
@@ -31,6 +50,22 @@ include_directories(${glog_SOURCE_DIR}/src)
 # eigen
 if(USE_OMP)
   find_package(OpenMP REQUIRED)
+
+  set(INFINI_OMP_LIBS OpenMP::OpenMP_CXX)
+
+  # Under MACA/mxcc, the host compiler is LLVM-based; link mxomp (iomp5) instead
+  # of libgomp to stay ABI-compatible with the MACA toolchain.
+  if(USE_MACA)
+    find_library(INFINI_MACA_OMP_LIB
+      NAMES omp iomp5
+      HINTS
+        "${MACA_PATH}/lib"
+        "${MACA_PATH}/mxgpu_llvm/lib"
+        "${MACA_PATH}/mxgpu_llvm/lib64"
+      REQUIRED
+    )
+    set(INFINI_OMP_LIBS OpenMP::OpenMP_CXX ${INFINI_MACA_OMP_LIB})
+  endif()
 endif()
 add_subdirectory(third_party/eigen)
 include_directories(${PROJECT_SOURCE_DIR}/third_party/eigen)
@@ -48,9 +83,25 @@ endif()
 # Framework core sources (*.cc), excluding cpu kernels (they are built separately)
 file(GLOB_RECURSE SRC ${PROJECT_SOURCE_DIR}/infini_train/src/*.cc)
 list(FILTER SRC EXCLUDE REGEX ".*kernels/cpu/.*")
+
+# Exclude backend-specific runtime/ccl translation units when the corresponding
+# backend is disabled. This keeps each build self-contained and avoids pulling
+# in headers (e.g. <cuda_runtime.h> / <mcr/mc_runtime.h>) that aren't on the
+# include path.
+if(NOT USE_CUDA)
+  list(FILTER SRC EXCLUDE REGEX ".*infini_train/src/core/runtime/cuda/.*")
+  list(FILTER SRC EXCLUDE REGEX ".*infini_train/src/core/ccl/cuda/.*")
+endif()
+if(NOT USE_MACA)
+  list(FILTER SRC EXCLUDE REGEX ".*infini_train/src/core/runtime/maca/.*")
+  list(FILTER SRC EXCLUDE REGEX ".*infini_train/src/core/ccl/maca/.*")
+endif()
 if(NOT USE_NCCL)
   list(FILTER SRC EXCLUDE REGEX ".*infini_train/src/core/ccl/cuda/.*")
 endif()
+if(NOT USE_MCCL)
+  list(FILTER SRC EXCLUDE REGEX ".*infini_train/src/core/ccl/maca/.*")
+endif()
 
 # CPU kernels (*.cc)
 file(GLOB_RECURSE CPU_KERNELS ${PROJECT_SOURCE_DIR}/infini_train/src/kernels/cpu/*.cc)
@@ -64,7 +115,7 @@ target_link_libraries(infini_train_cpu_kernels PUBLIC glog Eigen3::Eigen)
 
 if(USE_OMP)
   add_compile_definitions(USE_OMP=1)
-  target_link_libraries(infini_train_cpu_kernels PUBLIC OpenMP::OpenMP_CXX)
+  target_link_libraries(infini_train_cpu_kernels PUBLIC ${INFINI_OMP_LIBS})
 endif()
 
 # ------------------------------------------------------------------------------
@@ -103,6 +154,46 @@ if(USE_CUDA)
   endif()
 endif()
 
+# ------------------------------------------------------------------------------
+# MACA kernels library (optional, MetaX backend)
+# ------------------------------------------------------------------------------
+
+if(USE_MACA)
+  add_compile_definitions(USE_MACA=1)
+
+  # ---- MACA SDK include / link paths ----
+  include_directories("${MACA_PATH}/include")
+  link_directories("${MACA_PATH}/lib")
+
+  # ---- MACA runtime / blas / (optional) mccl libraries ----
+  find_library(MACA_RUNTIME_LIB NAMES mcruntime HINTS "${MACA_PATH}/lib" REQUIRED)
+  find_library(MACA_DNN_LIB     NAMES mcdnn     HINTS "${MACA_PATH}/lib" REQUIRED)
+  find_library(MACA_BLAS_LIB    NAMES mcblas    HINTS "${MACA_PATH}/lib" REQUIRED)
+
+  # ---- Collect .maca kernel sources and build as a CXX static lib with -x maca ----
+  file(GLOB_RECURSE MACA_KERNELS ${PROJECT_SOURCE_DIR}/infini_train/src/kernels/maca/*.maca)
+  set_source_files_properties(${MACA_KERNELS} PROPERTIES
+    LANGUAGE CXX
+    COMPILE_OPTIONS "-x;maca"
+  )
+
+  add_library(infini_train_maca_kernels STATIC ${MACA_KERNELS})
+  target_link_libraries(infini_train_maca_kernels
+    PUBLIC
+      glog
+      ${MACA_RUNTIME_LIB}
+      ${MACA_DNN_LIB}
+      ${MACA_BLAS_LIB}
+  )
+
+  if(USE_MCCL)
+    message(STATUS "Add USE_MCCL, use MCCL with MACA")
+    find_library(MACA_COMM_LIB NAMES mccl HINTS "${MACA_PATH}/lib" REQUIRED)
+    add_compile_definitions(USE_MCCL=1)
+    target_link_libraries(infini_train_maca_kernels PUBLIC ${MACA_COMM_LIB})
+  endif()
+endif()
+
 # ------------------------------------------------------------------------------
 # Main framework library
 # ------------------------------------------------------------------------------
@@ -133,6 +224,22 @@ if(USE_CUDA)
   endif()
 endif()
 
+if(USE_MACA)
+  # infini_train contains MACA runtime wrappers (maca_guard_impl.cc / maca_runtime_common.cc /
+  # mccl_impl.cc) which reference mcruntime / mcblas / mccl symbols directly at final link.
+  target_link_libraries(infini_train
+    PUBLIC
+      infini_train_maca_kernels
+      ${MACA_RUNTIME_LIB}
+      ${MACA_DNN_LIB}
+      ${MACA_BLAS_LIB}
+  )
+
+  if(USE_MCCL)
+    target_link_libraries(infini_train PUBLIC ${MACA_COMM_LIB})
+  endif()
+endif()
+
 # ------------------------------------------------------------------------------
 # Helper: link libraries in a group to fix static lib one-pass resolution
 # (THIS is what fixes "undefined reference" from cuda_kernels -> core symbols)
@@ -148,6 +255,16 @@ function(link_infini_train_exe target_name)
       "-Wl,--no-whole-archive"
       "-Wl,--end-group"
     )
+  elseif(USE_MACA)
+    target_link_libraries(${target_name} PRIVATE
+      "-Wl,--start-group"
+      "-Wl,--whole-archive"
+        infini_train
+        infini_train_cpu_kernels
+        infini_train_maca_kernels
+      "-Wl,--no-whole-archive"
+      "-Wl,--end-group"
+    )
   else()
     target_link_libraries(${target_name} PRIVATE
       "-Wl,--start-group"
diff --git a/infini_train/include/autocast.h b/infini_train/include/autocast.h
index 499c586f..4129ce87 100644
--- a/infini_train/include/autocast.h
+++ b/infini_train/include/autocast.h
@@ -88,7 +88,8 @@ inline const std::unordered_map<std::string_view, CastPolicy> kOpCastPolicyMap =
 // Default autocast data types for each device type
 inline constexpr std::array<DataType, static_cast<size_t>(Device::DeviceType::kCount)> kDeviceDefaultDtype = {
     DataType::kBFLOAT16, // CPU
-    DataType::kFLOAT16,  // CUDA.
+    DataType::kFLOAT16,  // CUDA
+    DataType::kFLOAT16,  // MACA
 };
 
 // Thread-local context to track autocast state
diff --git a/infini_train/include/common/maca/common_maca.h b/infini_train/include/common/maca/common_maca.h
new file mode 100644
index 00000000..d4a4fb39
--- /dev/null
+++ b/infini_train/include/common/maca/common_maca.h
@@ -0,0 +1,43 @@
+#pragma once
+
+#include <mcr/mc_runtime.h>
+#include <mcr/mc_runtime_api.h>
+#include <mcblas/mcblas.h>
+
+#ifdef USE_MCCL
+#include <mccl.h>
+#endif
+
+#include "glog/logging.h"
+
+namespace infini_train::common::maca {
+
+// Common MACA Macros
+#define MACA_CHECK(call)                                                                                               \
+    do {                                                                                                               \
+        mcError_t status = call;                                                                                       \
+        if (status != mcSuccess) {                                                                                     \
+            LOG(FATAL) << "MACA Error: " << mcGetErrorString(status) << " at " << __FILE__ << ":" << __LINE__;         \
+        }                                                                                                              \
+    } while (0)
+
+#define MCBLAS_CHECK(call)                                                                                             \
+    do {                                                                                                               \
+        mcblasStatus_t status = call;                                                                                  \
+        if (status != MCBLAS_STATUS_SUCCESS) {                                                                         \
+            LOG(FATAL) << "MCBLAS Error: " << mcblasGetStatusString(status) << " at " << __FILE__ << ":" << __LINE__;  \
+        }                                                                                                              \
+    } while (0)
+
+#ifdef USE_MCCL
+#define MCCL_CHECK(expr)                                                                                               \
+    do {                                                                                                               \
+        mcclResult_t _status = (expr);                                                                                 \
+        if (_status != mcclSuccess) {                                                                                  \
+            LOG(FATAL) << "MCCL error: " << mcclGetErrorString(_status) << " at " << __FILE__ << ":" << __LINE__       \
+                       << " (" << #expr << ")";                                                                        \
+        }                                                                                                              \
+    } while (0)
+#endif
+
+} // namespace infini_train::common::maca
diff --git a/infini_train/include/common/maca/cub_compat.cuh b/infini_train/include/common/maca/cub_compat.cuh
new file mode 100644
index 00000000..0848f789
--- /dev/null
+++ b/infini_train/include/common/maca/cub_compat.cuh
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <cub/cub.cuh>
+
+namespace infini_train::kernels::maca {
+
+// MACA ships a CUB compatible with the pre-2.8 API (cub::Sum/Max/Min).
+// Mirror the CUDA cub_compat.cuh aliases so that kernel code can refer to
+// CubSumOp / CubMaxOp / CubMinOp uniformly across backends.
+using CubSumOp = cub::Sum;
+using CubMaxOp = cub::Max;
+using CubMinOp = cub::Min;
+
+} // namespace infini_train::kernels::maca
diff --git a/infini_train/include/common/maca/kernel_helper.cuh b/infini_train/include/common/maca/kernel_helper.cuh
new file mode 100644
index 00000000..85a7cfeb
--- /dev/null
+++ b/infini_train/include/common/maca/kernel_helper.cuh
@@ -0,0 +1,311 @@
+#pragma once
+
+#include <common/maca_bfloat16.h>
+#include <common/maca_fp16.h>
+
+namespace infini_train::common::maca {
+/**
+ * Converts a value between arbitrary types with specialized handling for
+ * MACA floating-point precisions. For primitive types, this offers perfect
+ * forwarding which preserves value categories (lvalues/rvalues)
+ *
+ * @tparam DST Destination type (deduced)
+ * @tparam SRC Source type (deduced)
+ * @param x Input value (preserves const/volatile and value category)
+ * @return Value converted to DST type
+ *
+ * Example:
+ *   __half h = Cast<__half>(3.14f);   // float -> half (MACA intrinsic)
+ *   float f = Cast<float>(h);         // half -> float (MACA intrinsic)
+ *   int i = Cast<int>(2.718);         // double -> int (standard cast)
+ */
+// TODO(zbl): add support for half and __maca_bfloat16 conversions with integral types
+template <typename DST, typename SRC> __host__ __device__ DST Cast(SRC &&x) {
+    static_assert(!std::is_reference_v<DST>, "Cast cannot return reference types");
+
+    using SRC_base = std::remove_cv_t<std::remove_reference_t<SRC>>;
+    using DST_base = std::remove_cv_t<std::remove_reference_t<DST>>;
+
+    // __maca_bfloat16 conversions
+    if constexpr (std::is_same_v<SRC_base, __maca_bfloat16>) {
+        if constexpr (std::is_same_v<DST_base, float>) {
+            return __bfloat162float(x);
+        } else if constexpr (std::is_same_v<DST_base, double>) {
+            return static_cast<double>(__bfloat162float(x));
+        } else if constexpr (std::is_same_v<DST_base, __half>) {
+            return __half(__bfloat162float(x));
+        }
+    }
+    // half conversions
+    else if constexpr (std::is_same_v<SRC_base, __half>) {
+        if constexpr (std::is_same_v<DST_base, float>) {
+            return __half2float(x);
+        } else if constexpr (std::is_same_v<DST_base, double>) {
+            return static_cast<double>(__half2float(x));
+        } else if constexpr (std::is_same_v<DST_base, __maca_bfloat16>) {
+            return __maca_bfloat16(__half2float(x));
+        }
+    }
+    // float conversions to reduced precision
+    else if constexpr (std::is_same_v<SRC_base, float>) {
+        if constexpr (std::is_same_v<DST_base, __maca_bfloat16>) {
+            return __float2bfloat16(x);
+        } else if constexpr (std::is_same_v<DST_base, __half>) {
+            return __float2half(x);
+        }
+    }
+    // double conversions to reduced precision
+    else if constexpr (std::is_same_v<SRC_base, double>) {
+        if constexpr (std::is_same_v<DST_base, __maca_bfloat16>) {
+            return __double2bfloat16(x);
+        } else if constexpr (std::is_same_v<DST_base, __half>) {
+            return __double2half(x);
+        }
+    }
+    // Fallback for all other conversions
+    if constexpr (std::is_same_v<DST_base, __maca_bfloat16> || std::is_same_v<DST_base, __half>
+                  || std::is_same_v<SRC_base, __maca_bfloat16> || std::is_same_v<SRC_base, __half>) {
+        return (DST)(static_cast<float>(std::forward<SRC>(x)));;
+    } else {
+        return static_cast<DST>(std::forward<SRC>(x));;
+    }
+}
+
+template <typename T> __device__ __forceinline__ T Neg(const T &x) {
+    if constexpr (std::is_same_v<T, __maca_bfloat16> || std::is_same_v<T, __half>) {
+        return __hneg(x);
+    } else {
+        return -x;
+    }
+}
+
+template <typename T> __device__ __forceinline__ T Reciprocal(const T &x) {
+    if constexpr (std::is_same_v<T, __half>) {
+        return __hdiv(__float2half(1.0f), x);
+    } else if constexpr (std::is_same_v<T, __maca_bfloat16>) {
+        return __hdiv(__float2bfloat16(1.0f), x);
+    } else {
+        return T(1) / x;
+    }
+}
+
+template <typename T> __device__ __forceinline__ T Sin(const T &x) {
+    if constexpr (std::is_same_v<T, __half>) {
+        return __float2half(__sinf(__half2float(x)));
+    } else if constexpr (std::is_same_v<T, __maca_bfloat16>) {
+        return __float2bfloat16(__sinf(__bfloat162float(x)));
+    } else if constexpr (std::is_same_v<T, float>) {
+        return __sinf(x);
+    } else {
+        return std::sin(x);
+    }
+}
+
+template <typename T> __device__ __forceinline__ T Cos(const T &x) {
+    if constexpr (std::is_same_v<T, __half>) {
+        return __float2half(__cosf(__half2float(x)));
+    } else if constexpr (std::is_same_v<T, __maca_bfloat16>) {
+        return __float2bfloat16(__cosf(__bfloat162float(x)));
+    } else if constexpr (std::is_same_v<T, float>) {
+        return __cosf(x);
+    } else {
+        return std::cos(x);
+    }
+}
+
+template <typename T> __device__ __forceinline__ T Tanh(const T &x) {
+    if constexpr (std::is_same_v<T, __half>) {
+        return __float2half(tanhf(__half2float(x)));
+    } else if constexpr (std::is_same_v<T, __maca_bfloat16>) {
+        return __float2bfloat16(tanhf(__bfloat162float(x)));
+    } else if constexpr (std::is_same_v<T, float>) {
+        return tanhf(x);
+    } else {
+        return std::tanh(x);
+    }
+}
+
+template <typename T> __device__ __forceinline__ T Pow(const T &x, const T &exponent) {
+    if constexpr (std::is_same_v<T, __maca_bfloat16>) {
+        float x_ = __bfloat162float(x);
+        float exponent_ = __bfloat162float(exponent);
+        float ans_f = __powf(x_, exponent_);
+        return __float2bfloat16(__isnan(ans_f) ? std::pow(x_, exponent_) : ans_f);
+    } else if constexpr (std::is_same_v<T, __half>) {
+        float x_ = __half2float(x);
+        float exponent_ = __half2float(exponent);
+        float ans_f = __powf(x_, exponent_);
+        return __float2half(__isnan(ans_f) ? std::pow(x_, exponent_) : ans_f);
+    } else if constexpr (std::is_same_v<T, float>) {
+        return powf(x, exponent);
+    } else {
+        return std::pow(x, exponent);
+    }
+}
+
+template <typename T> __device__ __forceinline__ T Rsqrt(const T &x) {
+    if constexpr (std::is_same_v<T, __half>) {
+        return __float2half(rsqrtf(__half2float(x)));
+    } else if constexpr (std::is_same_v<T, __maca_bfloat16>) {
+        return __float2bfloat16(rsqrtf(__bfloat162float(x)));
+    } else if constexpr (std::is_same_v<T, float>) {
+        return rsqrtf(x);
+    } else {
+        return T(1) / std::sqrt(T(x));
+    }
+}
+
+template <typename T> __device__ __forceinline__ T Exp(const T &x) {
+    if constexpr (std::is_same_v<T, __maca_bfloat16> || std::is_same_v<T, __half>) {
+        return hexp(x);
+    } else if constexpr (std::is_same_v<T, float>) {
+        return __expf(x);
+    } else {
+        return std::exp(x);
+    }
+}
+
+template <typename T> __device__ __forceinline__ T Log(const T &x) {
+    if constexpr (std::is_same_v<T, __maca_bfloat16>) {
+        return __float2bfloat16(__logf(__bfloat162float(x)));
+    } else if constexpr (std::is_same_v<T, __half>) {
+        return __float2half(__logf(__half2float(x)));
+    } else if constexpr (std::is_same_v<T, float>) {
+        return __logf(x);
+    } else {
+        return std::log(x);
+    }
+}
+
+template <typename T> __device__ __forceinline__ T Add(const T &a, const T &b) {
+    if constexpr (std::is_same_v<T, __maca_bfloat16> || std::is_same_v<T, __half>) {
+        return __hadd(a, b);
+    } else {
+        return a + b;
+    }
+}
+
+template <typename T> __device__ __forceinline__ T Sub(const T &a, const T &b) {
+    if constexpr (std::is_same_v<T, __maca_bfloat16> || std::is_same_v<T, __half>) {
+        return __hsub(a, b);
+    } else {
+        return a - b;
+    }
+}
+
+template <typename T> __device__ __forceinline__ T Mul(const T &a, const T &b) {
+    if constexpr (std::is_same_v<T, __maca_bfloat16> || std::is_same_v<T, __half>) {
+        return __hmul(a, b);
+    } else {
+        return a * b;
+    }
+}
+
+template <typename T> __device__ __forceinline__ T Div(const T &a, const T &b) {
+    if constexpr (std::is_same_v<T, __maca_bfloat16> || std::is_same_v<T, __half>) {
+        return __hdiv(a, b);
+    } else {
+        return a / b;
+    }
+}
+
+template <typename T> __device__ __forceinline__ T Sigmoid(const T &x) {
+    if constexpr (std::is_same_v<T, float>) {
+        return 1.0f / (1.0f + expf(-x));
+    } else if constexpr (std::is_same_v<T, __maca_bfloat16> || std::is_same_v<T, __half>) {
+        return __hdiv(T(1), T(1) + hexp(-x));
+    } else {
+        return T(1) / (T(1) + std::exp(-x));
+    }
+}
+
+template <typename T> __device__ __forceinline__ T Max(const T &a, const T &b) {
+    if constexpr (std::is_same_v<T, __maca_bfloat16> || std::is_same_v<T, __half>) {
+        return __hle(a, b) ? b : a;
+    } else if constexpr (std::is_same_v<T, float>) {
+        return fmaxf(a, b);
+    } else {
+        return std::max(a, b);
+    }
+}
+
+template <typename T> __device__ __forceinline__ T Min(const T &a, const T &b) {
+    if constexpr (std::is_same_v<T, __maca_bfloat16> || std::is_same_v<T, __half>) {
+        return __hle(a, b) ? a : b;
+    } else if constexpr (std::is_same_v<T, float>) {
+        return fminf(a, b);
+    } else {
+        return std::min(a, b);
+    }
+}
+
+template <typename T> __device__ __forceinline__ T Fma(const T &x, const T &y, const T &z) {
+    if constexpr (std::is_same_v<T, __half>) {
+        return __hfma(x, y, z);
+    } else if constexpr (std::is_same_v<T, __maca_bfloat16>) {
+        return __float2bfloat16(__fmaf_rn(__bfloat162float(x), __bfloat162float(y), __bfloat162float(z)));
+    } else if constexpr (std::is_same_v<T, float>) {
+        return __fmaf_rn(x, y, z);
+    } else {
+        return std::fma(x, y, z);
+    }
+}
+
+template <typename scalar_t, typename index_t,
+          typename std::enable_if_t<std::is_same<scalar_t, __half>::value> * = nullptr>
+__device__ __forceinline__ void fastSpecializedAtomicAdd(scalar_t *tensor, index_t index, const index_t num_elements,
+                                                         scalar_t value) {
+    __half *target_addr = tensor + index;
+    bool low_byte = ((reinterpret_cast<std::uintptr_t>(target_addr) & (sizeof(__half2) - 1)) == 0);
+
+    if (low_byte && index < (num_elements - 1)) {
+        __half2 value2 = __halves2half2(value, __float2half(0.0f));
+        atomicAdd(reinterpret_cast<__half2 *>(target_addr), value2);
+
+    } else if (!low_byte && index > 0) {
+        __half2 value2 = __halves2half2(__float2half(0.0f), value);
+        atomicAdd(reinterpret_cast<__half2 *>(target_addr - 1), value2);
+
+    } else {
+        atomicAdd(target_addr, value);
+    }
+}
+
+template <typename scalar_t, typename index_t,
+          typename std::enable_if_t<std::is_same<scalar_t, __maca_bfloat16>::value> * = nullptr>
+__device__ __forceinline__ void fastSpecializedAtomicAdd(scalar_t *tensor, index_t index, const index_t num_elements,
+                                                         scalar_t value) {
+    __maca_bfloat16 *target_addr = tensor + index;
+    bool low_byte = ((reinterpret_cast<std::uintptr_t>(target_addr) & (sizeof(__maca_bfloat162) - 1)) == 0);
+
+    if (low_byte && index < (num_elements - 1)) {
+        __maca_bfloat162 value2 = __halves2bfloat162(value, __maca_bfloat16(0.0f));
+        atomicAdd(reinterpret_cast<__maca_bfloat162 *>(target_addr), value2);
+
+    } else if (!low_byte && index > 0) {
+        __maca_bfloat162 value2 = __halves2bfloat162(__maca_bfloat16(0.0f), value);
+        atomicAdd(reinterpret_cast<__maca_bfloat162 *>(target_addr - 1), value2);
+
+    } else {
+        atomicAdd(target_addr, value);
+    }
+}
+
+template <typename scalar_t, typename index_t,
+          typename std::enable_if_t<!std::is_same<scalar_t, __half>::value
+                                    && !std::is_same<scalar_t, __maca_bfloat16>::value> * = nullptr>
+__device__ __forceinline__ void fastSpecializedAtomicAdd(scalar_t *tensor, index_t index,
+                                                         const index_t /*num_elements*/, scalar_t value) {
+    atomicAdd(tensor + index, value);
+}
+
+template <class scalar_t, class index_t>
+__device__ __forceinline__ void fastAtomicAdd(scalar_t *tensor, index_t index, const index_t num_elements,
+                                              scalar_t value, bool fast_atomics) {
+    if (fast_atomics) {
+        fastSpecializedAtomicAdd(tensor, index, num_elements, value);
+    } else {
+        atomicAdd(tensor + index, value);
+    }
+}
+} // namespace infini_train::common::maca
diff --git a/infini_train/include/device.h b/infini_train/include/device.h
index 28db395f..1c87a291 100644
--- a/infini_train/include/device.h
+++ b/infini_train/include/device.h
@@ -13,7 +13,8 @@ class Device {
     enum class DeviceType : int8_t {
         kCPU = 0,
         kCUDA = 1,
-        kCount = 2,
+        kMACA = 2,
+        kCount = 3,
         kInvalid = -1,
     };
 
@@ -30,6 +31,7 @@ class Device {
 
     bool IsCPU() const;
     bool IsCUDA() const;
+    bool IsMACA() const;
 
     std::string ToString() const;
 
diff --git a/infini_train/src/core/runtime/maca/maca_guard_impl.cc b/infini_train/src/core/runtime/maca/maca_guard_impl.cc
new file mode 100644
index 00000000..441ac8f7
--- /dev/null
+++ b/infini_train/src/core/runtime/maca/maca_guard_impl.cc
@@ -0,0 +1,279 @@
+#include "infini_train/src/core/runtime/maca/maca_guard_impl.h"
+
+#include <array>
+#include <memory>
+#include <mutex>
+
+#include "infini_train/include/common/maca/common_maca.h"
+#include "infini_train/include/core/runtime/runtime_common.h"
+#include "infini_train/include/device.h"
+
+#include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
+
+namespace infini_train::core::maca {
+namespace {
+constexpr int kMaxGpus = 8;
+
+static std::array<std::unique_ptr<MacaStream>, kMaxGpus> maca_streams;
+static std::array<std::unique_ptr<MacaBlasHandle>, kMaxGpus> maca_blas_handles;
+
+static std::array<std::once_flag, kMaxGpus> device_stream_flags;
+static std::array<std::once_flag, kMaxGpus> device_handle_flags;
+
+inline void CheckMacaDevice(Device device) {
+    CHECK(device.type() == Device::DeviceType::kMACA) << std::format(
+        "MacaGuardImpl expects MACA device, but got type={} index={}", static_cast<int>(device.type()), device.index());
+    const int idx = device.index();
+    CHECK(idx >= 0 && idx < kMaxGpus) << std::format("MACA device index {} out of cache range [0, {}).", idx, kMaxGpus);
+}
+
+inline mcEvent_t GetMacaEvent(Event *event) {
+    auto *maca_event = dynamic_cast<MacaEvent *>(event);
+    CHECK_NOTNULL(maca_event);
+    return maca_event->maca_event();
+}
+
+inline mcStream_t GetMacaStream(Stream *stream) {
+    auto *maca_stream = dynamic_cast<MacaStream *>(stream);
+    CHECK_NOTNULL(maca_stream);
+    return maca_stream->maca_stream();
+}
+} // namespace
+
+void MacaGuardImpl::InitSingleStream(Device device) {
+    CheckMacaDevice(device);
+
+    int current_device = -1;
+    MACA_CHECK(mcGetDevice(&current_device));
+    MACA_CHECK(mcSetDevice(device.index()));
+
+    maca_streams[device.index()] = std::make_unique<MacaStream>();
+
+    MACA_CHECK(mcSetDevice(current_device));
+}
+
+void MacaGuardImpl::InitSingleHandle(Device device) {
+    CheckMacaDevice(device);
+
+    int current_device = -1;
+    MACA_CHECK(mcGetDevice(&current_device));
+    MACA_CHECK(mcSetDevice(device.index()));
+
+    std::call_once(device_stream_flags.at(device.index()), InitSingleStream, device);
+
+    maca_blas_handles[device.index()] = std::make_unique<MacaBlasHandle>(maca_streams[device.index()].get());
+
+    MACA_CHECK(mcSetDevice(current_device));
+}
+
+MacaGuardImpl::MacaGuardImpl() {
+    // The MACA runtime requires an explicit mcInit(0) before any other call.
+    // CUDA has no equivalent; mirroring the DeviceManager ctor from 87390cd.
+    MACA_CHECK(mcInit(0));
+}
+
+// device
+Device MacaGuardImpl::GetDevice() const {
+    int current_device = -1;
+    MACA_CHECK(mcGetDevice(&current_device));
+    return Device(Device::DeviceType::kMACA, current_device);
+}
+
+void MacaGuardImpl::SetDevice(Device device) const {
+    CheckMacaDevice(device);
+    MACA_CHECK(mcSetDevice(device.index()));
+}
+
+int MacaGuardImpl::DeviceCount() const {
+    int device_count = 0;
+    MACA_CHECK(mcGetDeviceCount(&device_count));
+    return device_count;
+}
+
+Device::DeviceType MacaGuardImpl::Type() const { return Device::DeviceType::kMACA; }
+
+// stream
+Stream *MacaGuardImpl::GetStream(Device device) const {
+    CheckMacaDevice(device);
+    // FIXME(dcj): call_once is process-scoped and assumes single initialization.
+    std::call_once(device_stream_flags.at(device.index()), InitSingleStream, device);
+    return maca_streams.at(device.index()).get();
+}
+
+Stream *MacaGuardImpl::CreateStream(Device device) const {
+    CheckMacaDevice(device);
+    int current_device = -1;
+    MACA_CHECK(mcGetDevice(&current_device));
+    MACA_CHECK(mcSetDevice(device.index()));
+
+    Stream *stream = new MacaStream();
+
+    MACA_CHECK(mcSetDevice(current_device));
+    return stream;
+}
+
+Stream *MacaGuardImpl::CreateStreamWithPriority(Device device, int priority) const {
+    CheckMacaDevice(device);
+    int current_device = -1;
+    MACA_CHECK(mcGetDevice(&current_device));
+    MACA_CHECK(mcSetDevice(device.index()));
+
+    Stream *stream = new MacaStream(priority);
+
+    MACA_CHECK(mcSetDevice(current_device));
+    return stream;
+}
+
+void MacaGuardImpl::DestroyStream(Stream *stream) const {
+    if (stream == nullptr) {
+        return;
+    }
+    auto *maca_stream = dynamic_cast<MacaStream *>(stream);
+    CHECK_NOTNULL(maca_stream);
+    delete maca_stream;
+}
+
+void MacaGuardImpl::GetStreamPriorityRange(int *low, int *high) const {
+    MACA_CHECK(mcDeviceGetStreamPriorityRange(low, high));
+}
+
+// event
+void MacaGuardImpl::EventCreate(Event **event) const { *event = new MacaEvent(); }
+
+void MacaGuardImpl::EventCreateWithFlags(Event **event, EventFlag flags) const { *event = new MacaEvent(flags); }
+
+void MacaGuardImpl::EventDestroy(Event *event) const {
+    if (event == nullptr) {
+        return;
+    }
+    delete event;
+}
+
+void MacaGuardImpl::EventRecord(Event *event, Stream *stream) const {
+    auto maca_event = GetMacaEvent(event);
+    auto maca_stream = GetMacaStream(stream);
+    MACA_CHECK(mcEventRecord(maca_event, maca_stream));
+}
+
+void MacaGuardImpl::StreamWaitEvent(Stream *stream, Event *event, uint32_t flags) const {
+    auto maca_event = GetMacaEvent(event);
+    auto maca_stream = GetMacaStream(stream);
+    MACA_CHECK(mcStreamWaitEvent(maca_stream, maca_event, flags));
+}
+
+RuntimeStatus MacaGuardImpl::EventSynchronize(Event *event) const {
+    auto maca_event = GetMacaEvent(event);
+    mcError_t status = mcEventSynchronize(maca_event);
+    if (status == mcSuccess) {
+        return RuntimeStatus::kSuccess;
+    }
+    if (status == mcErrorNotReady) {
+        return RuntimeStatus::kNotReady;
+    }
+    LOG(ERROR) << "MacaGuardImpl::EventSynchronize failed: " << mcGetErrorString(status);
+    return RuntimeStatus::kError;
+}
+
+RuntimeStatus MacaGuardImpl::EventQuery(Event *event) const {
+    auto maca_event = GetMacaEvent(event);
+    mcError_t status = mcEventQuery(maca_event);
+    if (status == mcSuccess) {
+        return RuntimeStatus::kSuccess;
+    }
+    if (status == mcErrorNotReady) {
+        return RuntimeStatus::kNotReady;
+    }
+    LOG(ERROR) << "MacaGuardImpl::EventQuery failed: " << mcGetErrorString(status);
+    return RuntimeStatus::kError;
+}
+
+float MacaGuardImpl::EventElapsedTime(Event *start_event, Event *stop_event) const {
+    auto start_maca_event = GetMacaEvent(start_event);
+    auto stop_maca_event = GetMacaEvent(stop_event);
+    float elapsed_ms = 0.0f;
+    MACA_CHECK(mcEventElapsedTime(&elapsed_ms, start_maca_event, stop_maca_event));
+    return elapsed_ms;
+}
+
+// sync
+void MacaGuardImpl::SynchronizeDevice(Device device) const {
+    auto original_device = GetDevice();
+    SetDevice(device);
+
+    MACA_CHECK(mcDeviceSynchronize());
+
+    SetDevice(original_device);
+}
+
+void MacaGuardImpl::SynchronizeStream(Stream *stream) const {
+    auto maca_stream = GetMacaStream(stream);
+    MACA_CHECK(mcStreamSynchronize(maca_stream));
+}
+
+// blas
+BlasHandle *MacaGuardImpl::GetBlasHandle(Device device) const {
+    CheckMacaDevice(device);
+    std::call_once(device_handle_flags.at(device.index()), InitSingleHandle, device);
+    return maca_blas_handles.at(device.index()).get();
+}
+
+// memory
+void MacaGuardImpl::Malloc(void **dev_ptr, size_t size) { MACA_CHECK(mcMalloc(dev_ptr, size)); }
+
+void MacaGuardImpl::MallocAsync(void **dev_ptr, size_t size, Stream *stream) {
+    auto maca_stream = GetMacaStream(stream);
+    MACA_CHECK(mcMallocAsync(dev_ptr, size, maca_stream));
+}
+
+void MacaGuardImpl::Free(void *dev_ptr) { MACA_CHECK(mcFree(dev_ptr)); }
+
+void MacaGuardImpl::FreeAsync(void *dev_ptr, Stream *stream) {
+    auto maca_stream = GetMacaStream(stream);
+    MACA_CHECK(mcFreeAsync(dev_ptr, maca_stream));
+}
+
+void MacaGuardImpl::Memcpy(void *dst, const void *src, size_t count, MemcpyKind kind) {
+    if (kind == MemcpyKind::kH2D) {
+        MACA_CHECK(mcMemcpy(dst, src, count, mcMemcpyHostToDevice));
+    } else if (kind == MemcpyKind::kD2H) {
+        MACA_CHECK(mcMemcpy(dst, src, count, mcMemcpyDeviceToHost));
+    } else if (kind == MemcpyKind::kD2D) {
+        MACA_CHECK(mcMemcpy(dst, src, count, mcMemcpyDeviceToDevice));
+    } else {
+        LOG(FATAL) << std::format("MacaGuardImpl::Memcpy got invalid MemcpyKind={}", MemcpyKindToString(kind));
+    }
+}
+
+void MacaGuardImpl::MemcpyAsync(void *dst, const void *src, size_t count, MemcpyKind kind, Stream *stream) {
+    auto maca_stream = GetMacaStream(stream);
+
+    switch (kind) {
+    case MemcpyKind::kH2D:
+        MACA_CHECK(mcMemcpyAsync(dst, src, count, mcMemcpyHostToDevice, maca_stream));
+        break;
+    case MemcpyKind::kD2H:
+        MACA_CHECK(mcMemcpyAsync(dst, src, count, mcMemcpyDeviceToHost, maca_stream));
+        break;
+    case MemcpyKind::kD2D:
+        MACA_CHECK(mcMemcpyAsync(dst, src, count, mcMemcpyDeviceToDevice, maca_stream));
+        break;
+    default:
+        LOG(FATAL) << std::format("MacaGuardImpl::MemcpyAsync got invalid MemcpyKind={}", MemcpyKindToString(kind));
+    }
+}
+
+void MacaGuardImpl::ResetMemPoolHighWatermarks(Device device) const {
+    // TODO(dcj): MetaX SDK support for mcMemPoolGetAttribute / mcMemPoolAttrUsedMemHigh
+    // is not confirmed. Keep this a no-op until verified against a working SDK.
+    (void)device;
+}
+
+std::pair<size_t, size_t> MacaGuardImpl::GetMemPoolPeakMB(Device device) const {
+    // TODO(dcj): see note in ResetMemPoolHighWatermarks.
+    (void)device;
+    return std::make_pair<size_t, size_t>(0, 0);
+}
+
+INFINI_TRAIN_REGISTER_DEVICE_GUARD_IMPL(Device::DeviceType::kMACA, MacaGuardImpl)
+
+} // namespace infini_train::core::maca
diff --git a/infini_train/src/core/runtime/maca/maca_guard_impl.h b/infini_train/src/core/runtime/maca/maca_guard_impl.h
new file mode 100644
index 00000000..cb1ae4af
--- /dev/null
+++ b/infini_train/src/core/runtime/maca/maca_guard_impl.h
@@ -0,0 +1,83 @@
+#pragma once
+
+#include "infini_train/include/core/runtime/device_guard.h"
+#include "infini_train/include/device.h"
+
+namespace infini_train::core {
+class Stream;
+class BlasHandle;
+} // namespace infini_train::core
+
+namespace infini_train::core::maca {
+
+class MacaGuardImpl final : public DeviceGuardImpl {
+public:
+    static void InitSingleStream(Device device);
+
+    static void InitSingleHandle(Device device);
+
+    MacaGuardImpl();
+
+    // device
+    Device GetDevice() const override;
+
+    void SetDevice(Device device) const override;
+
+    int DeviceCount() const override;
+
+    Device::DeviceType Type() const override;
+
+    // stream
+    Stream *GetStream(Device device) const override;
+
+    Stream *CreateStream(Device device) const override;
+
+    Stream *CreateStreamWithPriority(Device device, int priority) const override;
+
+    void DestroyStream(Stream *stream) const override;
+
+    void GetStreamPriorityRange(int *low, int *high) const override;
+
+    // event
+    void EventCreate(Event **event) const override;
+
+    void EventCreateWithFlags(Event **event, EventFlag flags) const override;
+
+    void EventDestroy(Event *event) const override;
+
+    void EventRecord(Event *event, Stream *stream) const override;
+
+    void StreamWaitEvent(Stream *stream, Event *event, uint32_t flags) const override;
+
+    RuntimeStatus EventSynchronize(Event *event) const override;
+
+    RuntimeStatus EventQuery(Event *event) const override;
+
+    float EventElapsedTime(Event *start_event, Event *stop_event) const override;
+
+    // sync
+    void SynchronizeDevice(Device device) const override;
+    void SynchronizeStream(Stream *stream) const override;
+
+    // blas
+    BlasHandle *GetBlasHandle(Device device) const override;
+
+    // memory
+    void Malloc(void **dev_ptr, size_t size) override;
+
+    void MallocAsync(void **dev_ptr, size_t size, Stream *stream) override;
+
+    void Free(void *dev_ptr) override;
+
+    void FreeAsync(void *dev_ptr, Stream *stream) override;
+
+    void Memcpy(void *dst, const void *src, size_t count, MemcpyKind kind) override;
+
+    void MemcpyAsync(void *dst, const void *src, size_t count, MemcpyKind kind, Stream *stream) override;
+
+    void ResetMemPoolHighWatermarks(Device device) const override;
+
+    std::pair<size_t, size_t> GetMemPoolPeakMB(Device device) const override;
+};
+
+} // namespace infini_train::core::maca
diff --git a/infini_train/src/core/runtime/maca/maca_runtime_common.cc b/infini_train/src/core/runtime/maca/maca_runtime_common.cc
new file mode 100644
index 00000000..02afb89f
--- /dev/null
+++ b/infini_train/src/core/runtime/maca/maca_runtime_common.cc
@@ -0,0 +1,60 @@
+#include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
+
+#include "infini_train/include/common/maca/common_maca.h"
+
+namespace infini_train::core::maca {
+namespace {
+uint32_t ToMacaEventFlags(EventFlag flags) {
+    switch (flags) {
+    case EventFlag::kDefault:
+        return mcEventDefault;
+    case EventFlag::kBlockingSync:
+        return mcEventBlockingSync;
+    case EventFlag::kDisableTiming:
+        return mcEventDisableTiming;
+    case EventFlag::kInterprocess:
+        // MACA (like CUDA) requires DisableTiming with Interprocess events.
+        // NOTE(dcj): if the MACA SDK in use does not expose mcEventInterprocess,
+        // this branch will need to be guarded and downgraded to a LOG(FATAL).
+        return mcEventInterprocess | mcEventDisableTiming;
+    default:
+        LOG(FATAL) << "Unsupported EventFlag value: " << static_cast<uint32_t>(flags);
+    }
+    return mcEventDefault;
+}
+} // namespace
+
+MacaEvent::MacaEvent(EventFlag flags) { MACA_CHECK(mcEventCreateWithFlags(&event_, ToMacaEventFlags(flags))); }
+
+MacaEvent::~MacaEvent() {
+    if (event_ != nullptr) {
+        MACA_CHECK(mcEventDestroy(event_));
+    }
+}
+
+mcEvent_t MacaEvent::maca_event() const { return event_; }
+
+MacaStream::MacaStream() { MACA_CHECK(mcStreamCreate(&stream_)); }
+
+MacaStream::MacaStream(int priority) {
+    MACA_CHECK(mcStreamCreateWithPriority(&stream_, mcStreamNonBlocking, priority));
+}
+
+MacaStream::~MacaStream() {
+    // Do nothing.
+}
+
+mcStream_t MacaStream::maca_stream() const { return stream_; }
+
+MacaBlasHandle::MacaBlasHandle(Stream *stream) {
+    MCBLAS_CHECK(mcblasCreate(&mcblas_handle_));
+    MCBLAS_CHECK(mcblasSetStream(mcblas_handle_, dynamic_cast<MacaStream *>(stream)->maca_stream()));
+}
+
+MacaBlasHandle::~MacaBlasHandle() {
+    // Do nothing.
+}
+
+mcblasHandle_t MacaBlasHandle::mcblas_handle() const { return mcblas_handle_; }
+
+} // namespace infini_train::core::maca
diff --git a/infini_train/src/core/runtime/maca/maca_runtime_common.h b/infini_train/src/core/runtime/maca/maca_runtime_common.h
new file mode 100644
index 00000000..88d8e0a9
--- /dev/null
+++ b/infini_train/src/core/runtime/maca/maca_runtime_common.h
@@ -0,0 +1,59 @@
+#pragma once
+
+#include <cstdint>
+
+#include <mcblas/mcblas.h>
+#include <mcr/mc_runtime.h>
+#include <mcr/mc_runtime_api.h>
+
+#include "infini_train/include/core/runtime/runtime_common.h"
+
+namespace infini_train::core {
+class Stream;
+}
+
+namespace infini_train::core::maca {
+
+class MacaEvent final : public Event {
+public:
+    explicit MacaEvent(EventFlag flags = EventFlag::kDefault);
+    ~MacaEvent() override;
+
+    mcEvent_t maca_event() const;
+
+private:
+    mcEvent_t event_ = nullptr;
+};
+
+class MacaStream : public Stream {
+public:
+    MacaStream();
+    explicit MacaStream(int priority);
+
+    // NOTE(dcj):
+    // Mirror CudaStream: destruction of global variables may outlive the MACA
+    // runtime, so we intentionally leak the underlying mcStream_t rather than
+    // risk calling mcStreamDestroy after runtime teardown.
+    ~MacaStream() override;
+
+    mcStream_t maca_stream() const;
+
+private:
+    mcStream_t stream_ = nullptr;
+};
+
+class MacaBlasHandle : public BlasHandle {
+public:
+    explicit MacaBlasHandle(Stream *stream);
+
+    // NOTE(dcj):
+    // Mirror CudaBlasHandle: leaked intentionally; see MacaStream note.
+    ~MacaBlasHandle() override;
+
+    mcblasHandle_t mcblas_handle() const;
+
+private:
+    mcblasHandle_t mcblas_handle_;
+};
+
+} // namespace infini_train::core::maca
diff --git a/infini_train/src/device.cc b/infini_train/src/device.cc
index 1bb3aaad..5f377b81 100644
--- a/infini_train/src/device.cc
+++ b/infini_train/src/device.cc
@@ -26,9 +26,25 @@ bool Device::IsCPU() const { return type_ == DeviceType::kCPU; }
 
 bool Device::IsCUDA() const { return type_ == DeviceType::kCUDA; }
 
+bool Device::IsMACA() const { return type_ == DeviceType::kMACA; }
+
 std::string Device::ToString() const {
+    const char *type_str = "Unknown";
+    switch (type_) {
+    case DeviceType::kCPU:
+        type_str = "CPU";
+        break;
+    case DeviceType::kCUDA:
+        type_str = "CUDA";
+        break;
+    case DeviceType::kMACA:
+        type_str = "MACA";
+        break;
+    default:
+        break;
+    }
     std::ostringstream oss;
-    oss << std::format("Device({}, {})", type_ == DeviceType::kCPU ? "CPU" : "CUDA", index_);
+    oss << std::format("Device({}, {})", type_str, index_);
     return oss.str();
 }
 
diff --git a/infini_train/src/kernels/maca/accumulate_grad.maca b/infini_train/src/kernels/maca/accumulate_grad.maca
new file mode 100644
index 00000000..1bda88db
--- /dev/null
+++ b/infini_train/src/kernels/maca/accumulate_grad.maca
@@ -0,0 +1,94 @@
+#include <cmath>
+#include <memory>
+
+#include "infini_train/include/common/maca/kernel_helper.cuh"
+#include "infini_train/include/core/runtime/device_guard.h"
+#include "infini_train/include/dispatcher.h"
+#include "infini_train/include/tensor.h"
+
+#include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
+
+namespace infini_train::kernels::maca {
+
+template <typename T>
+__global__ void AccumulateGradKernel(const T *grad_ptr, float rate, T *tensor_ptr, size_t num_elements) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < num_elements) {
+        tensor_ptr[idx] += common::maca::Mul(grad_ptr[idx], common::maca::Cast<T>(rate));
+    }
+}
+
+void AccumulateGrad(const std::shared_ptr<Tensor> &gradient, float rate, const std::shared_ptr<Tensor> &tensor) {
+    size_t num_elements = gradient->NumElements();
+
+    int threads_per_block = 256;
+    int num_blocks = (num_elements + threads_per_block - 1) / threads_per_block;
+
+    auto device = tensor->GetDevice();
+    const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                                  infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                  ->maca_stream();
+
+    DispatchFunc<INFINI_ALL_FLOATING_TYPES>(
+        gradient->Dtype(),
+        [=]<typename T>() {
+            AccumulateGradKernel<<<num_blocks, threads_per_block, 0, maca_stream>>>(
+                static_cast<const T *>(gradient->DataPtr()), rate, static_cast<T *>(tensor->DataPtr()), num_elements);
+        },
+        "MACA AccumulateGrad");
+}
+
+template <typename T>
+__global__ void AdamAccumulateGradKernel(const T *grad_data, T *param_data, size_t num_elements, T *m_data, T *v_data,
+                                         float learning_rate, float beta1, float beta2, float eps,
+                                         const float bias_correction_m, const float bias_correction_v) {
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < num_elements) {
+        m_data[idx] = common::maca::Fma(common::maca::Cast<T>(beta1), m_data[idx],
+                                        common::maca::Cast<T>(1 - beta1) * grad_data[idx]);
+        v_data[idx] = common::maca::Fma(common::maca::Cast<T>(beta2), v_data[idx],
+                                        common::maca::Cast<T>(1 - beta2) * grad_data[idx] * grad_data[idx]);
+
+        const float m_hat = common::maca::Cast<float>(m_data[idx]) / bias_correction_m;
+        const float v_hat = common::maca::Cast<float>(v_data[idx]) / bias_correction_v;
+
+        param_data[idx] = common::maca::Sub(
+            param_data[idx], common::maca::Cast<T>(learning_rate * m_hat * __frcp_rn(__fsqrt_rn(v_hat) + eps)));
+    }
+}
+
+void AdamAccumulateGrad(const std::shared_ptr<Tensor> &grad, const std::shared_ptr<Tensor> &param,
+                        const std::shared_ptr<Tensor> &m, const std::shared_ptr<Tensor> &v, float learning_rate,
+                        float beta1, float beta2, float eps, int64_t t) {
+    size_t num_elements = grad->NumElements();
+
+    const float bias_correction_m = 1.0f - std::pow(beta1, t);
+    const float bias_correction_v = 1.0f - std::pow(beta2, t);
+
+    int threads_per_block = 256;
+    int num_blocks = (num_elements + threads_per_block - 1) / threads_per_block;
+
+    auto device = grad->GetDevice();
+    const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                                  infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                  ->maca_stream();
+
+    DispatchFunc<INFINI_ALL_FLOATING_TYPES>(
+        grad->Dtype(),
+        [=]<typename T>() {
+            AdamAccumulateGradKernel<<<num_blocks, threads_per_block, 0, maca_stream>>>(
+                static_cast<const T *>(grad->DataPtr()), static_cast<T *>(param->DataPtr()), num_elements,
+                static_cast<T *>(m->DataPtr()), static_cast<T *>(v->DataPtr()), learning_rate, beta1, beta2, eps,
+                bias_correction_m, bias_correction_v);
+        },
+        "MACA AdamAccumulateGrad");
+}
+} // namespace infini_train::kernels::maca
+
+#define REGISTER_MACA_ACCUMULATE_GRAD_KERNEL(kernel_name)                                                              \
+    REGISTER_KERNEL(infini_train::Device::DeviceType::kMACA, kernel_name, infini_train::kernels::maca::kernel_name)
+
+REGISTER_MACA_ACCUMULATE_GRAD_KERNEL(AccumulateGrad)
+REGISTER_MACA_ACCUMULATE_GRAD_KERNEL(AdamAccumulateGrad)
+
+#undef REGISTER_MACA_ACCUMULATE_GRAD_KERNEL
diff --git a/infini_train/src/kernels/maca/elementwise.maca b/infini_train/src/kernels/maca/elementwise.maca
new file mode 100644
index 00000000..e90d79a0
--- /dev/null
+++ b/infini_train/src/kernels/maca/elementwise.maca
@@ -0,0 +1,1269 @@
+#include <cstddef>
+
+#include <cub/warp/warp_reduce.cuh>
+
+#include "infini_train/include/common/maca/common_maca.h"
+#include "infini_train/include/common/maca/kernel_helper.cuh"
+#include "infini_train/include/core/runtime/device_guard.h"
+#include "infini_train/include/dispatcher.h"
+#include "infini_train/include/tensor.h"
+
+#include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
+
+namespace infini_train::kernels::maca {
+namespace {
+using namespace infini_train::common::maca;
+constexpr int kWarpSize = 32;
+
+// Aligned vector type for vectorized loads/stores (128-bit).
+template <typename T, int N> struct __align__(sizeof(T) * N) aligned_vector { T val[N]; };
+
+// Elements per vectorized load/store: 128-bit / sizeof(T).
+// float → 4, bf16/__half → 8, double → 2.
+template <typename T> constexpr int kVecSize = 16 / sizeof(T);
+
+// Maximum number of dimensions supported by the broadcast metadata.
+// Real-world tensors in this codebase top out at 4-5 dims, so 8 leaves comfortable headroom
+// while keeping the struct under the 4 KB CUDA kernel parameter limit.
+constexpr int kMaxBroadcastDims = 8;
+
+// POD metadata for broadcast kernels. Passed by value into __global__ kernels so the data
+// lives in CUDA kernel parameter memory (constant cache) instead of being uploaded via a
+// per-call mcMallocAsync + mcMemcpyAsync into global memory.
+struct BroadcastMeta {
+    int ndim;
+    int64_t a_strides[kMaxBroadcastDims];
+    int64_t b_strides[kMaxBroadcastDims];
+    int64_t out_strides[kMaxBroadcastDims];
+    int64_t a_shape[kMaxBroadcastDims];
+    int64_t b_shape[kMaxBroadcastDims];
+};
+
+// Build a BroadcastMeta on the host from input/output dim vectors. Right-aligns a_dims/b_dims
+// to out_dims's rank (the broadcasting convention) and computes contiguous strides for each.
+inline BroadcastMeta MakeBroadcastMeta(const std::vector<int64_t> &a_dims, const std::vector<int64_t> &b_dims,
+                                       const std::vector<int64_t> &out_dims) {
+    BroadcastMeta m{};
+    const int ndim = static_cast<int>(out_dims.size());
+    CHECK_LE(ndim, kMaxBroadcastDims) << "Broadcast ndim exceeds kMaxBroadcastDims (" << kMaxBroadcastDims << ")";
+    m.ndim = ndim;
+
+    std::vector<int64_t> a_shape(ndim, 1), b_shape(ndim, 1);
+    std::copy_backward(a_dims.begin(), a_dims.end(), a_shape.end());
+    std::copy_backward(b_dims.begin(), b_dims.end(), b_shape.end());
+
+    auto a_str = ComputeStrides(a_shape);
+    auto b_str = ComputeStrides(b_shape);
+    auto out_str = ComputeStrides(out_dims);
+
+    for (int i = 0; i < ndim; ++i) {
+        m.a_strides[i] = a_str[i];
+        m.b_strides[i] = b_str[i];
+        m.out_strides[i] = out_str[i];
+        m.a_shape[i] = a_shape[i];
+        m.b_shape[i] = b_shape[i];
+    }
+    return m;
+}
+
+template <typename T, typename Func>
+__global__ void UnaryForwardKernel(T *output, Func fn, size_t num_elements, size_t offset, const T *input) {
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset;
+
+    if (idx < num_elements) {
+        output[idx] = fn(input[idx]);
+    }
+}
+
+// Helper for broadcast indexing
+__device__ inline int64_t CalcOffset(int64_t idx, int ndim, const int64_t *strides, const int64_t *shape,
+                                     const int64_t *out_strides) {
+    int64_t offset = 0;
+    for (int i = 0; i < ndim; ++i) {
+        int64_t out_index = (idx / out_strides[i]) % shape[i];
+        int64_t index = shape[i] == 1 ? 0 : out_index;
+        offset += index * strides[i];
+    }
+    return offset;
+}
+
+inline bool ShapesEqual(const std::vector<int64_t> &a, const std::vector<int64_t> &b) {
+    if (a.size() != b.size()) {
+        return false;
+    }
+    for (size_t i = 0; i < a.size(); ++i) {
+        if (a[i] != b[i]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+template <typename T, typename Func>
+__global__ void BinaryForwardKernel(T *output, Func fn, BroadcastMeta meta, const T *a, const T *b,
+                                    size_t num_elements) {
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= num_elements) {
+        return;
+    }
+
+    int64_t a_offset = CalcOffset(idx, meta.ndim, meta.a_strides, meta.a_shape, meta.out_strides);
+    int64_t b_offset = CalcOffset(idx, meta.ndim, meta.b_strides, meta.b_shape, meta.out_strides);
+
+    output[idx] = fn(a[a_offset], b[b_offset]);
+}
+
+// Fast path: no broadcast, contiguous tensors — skip CalcOffset entirely
+template <typename T, typename Func>
+__global__ void BinaryForwardKernelNoBroadcast(T *__restrict__ output, Func fn, const T *__restrict__ a,
+                                               const T *__restrict__ b, size_t num_elements) {
+    const size_t grid_stride = static_cast<size_t>(gridDim.x) * blockDim.x;
+    for (size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; idx < num_elements;
+         idx += grid_stride) {
+        output[idx] = fn(a[idx], b[idx]);
+    }
+}
+
+// Fast path backward: no broadcast, contiguous — skip CalcOffset entirely
+template <typename T, typename FuncA, typename FuncB>
+__global__ void BinaryBackwardKernelNoBroadcastFast(T *__restrict__ outA, T *__restrict__ outB, FuncA fn_a, FuncB fn_b,
+                                                    size_t numel, const T *__restrict__ grad_out,
+                                                    const T *__restrict__ inA, const T *__restrict__ inB) {
+    const size_t grid_stride = static_cast<size_t>(gridDim.x) * blockDim.x;
+    for (size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; idx < numel; idx += grid_stride) {
+        const T a = inA ? inA[idx] : T(0);
+        const T b = inB ? inB[idx] : T(0);
+        outA[idx] = Mul<T>(grad_out[idx], fn_a(a, b));
+        outB[idx] = Mul<T>(grad_out[idx], fn_b(a, b));
+    }
+}
+
+// Vectorized fast path backward: no broadcast, contiguous.
+// Each thread processes VecSize elements using 128-bit loads/stores.
+template <typename T, int VecSize, typename FuncA, typename FuncB>
+__global__ void BinaryBackwardKernelNoBroadcastVectorized(T *__restrict__ outA, T *__restrict__ outB, FuncA fn_a,
+                                                          FuncB fn_b, size_t numel, const T *__restrict__ grad_out,
+                                                          const T *__restrict__ inA, const T *__restrict__ inB) {
+    using VecT = aligned_vector<T, VecSize>;
+    const size_t num_vecs = numel / VecSize;
+    const size_t grid_stride = static_cast<size_t>(gridDim.x) * blockDim.x;
+
+    for (size_t vid = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; vid < num_vecs; vid += grid_stride) {
+        const size_t base = vid * VecSize;
+
+        // 128-bit vectorized loads
+        VecT g_vec = *reinterpret_cast<const VecT *>(&grad_out[base]);
+        VecT a_vec, b_vec;
+        if (inA) {
+            a_vec = *reinterpret_cast<const VecT *>(&inA[base]);
+        } else {
+#pragma unroll
+            for (int i = 0; i < VecSize; ++i) { a_vec.val[i] = T(0); }
+        }
+        if (inB) {
+            b_vec = *reinterpret_cast<const VecT *>(&inB[base]);
+        } else {
+#pragma unroll
+            for (int i = 0; i < VecSize; ++i) { b_vec.val[i] = T(0); }
+        }
+
+        // Element-wise computation
+        VecT outA_vec, outB_vec;
+#pragma unroll
+        for (int i = 0; i < VecSize; ++i) {
+            outA_vec.val[i] = Mul<T>(g_vec.val[i], fn_a(a_vec.val[i], b_vec.val[i]));
+            outB_vec.val[i] = Mul<T>(g_vec.val[i], fn_b(a_vec.val[i], b_vec.val[i]));
+        }
+
+        // 128-bit vectorized stores
+        *reinterpret_cast<VecT *>(&outA[base]) = outA_vec;
+        *reinterpret_cast<VecT *>(&outB[base]) = outB_vec;
+    }
+
+    // Handle tail elements (numel % VecSize != 0)
+    const size_t tail_start = num_vecs * VecSize;
+    for (size_t idx = tail_start + static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; idx < numel;
+         idx += grid_stride) {
+        const T a = inA ? inA[idx] : T(0);
+        const T b = inB ? inB[idx] : T(0);
+        outA[idx] = Mul<T>(grad_out[idx], fn_a(a, b));
+        outB[idx] = Mul<T>(grad_out[idx], fn_b(a, b));
+    }
+}
+
+// Helper to choose optimal block size based on tensor size
+inline size_t ChooseBlockSize(size_t num_elements) {
+    if (num_elements < 1024) {
+        return 64;
+    }
+    if (num_elements < 65536) {
+        return 128;
+    }
+    if (num_elements < 1048576) {
+        return 256;
+    }
+    return 512;
+}
+
+// launch the given kernel function with the given output and inputs
+template <size_t BLOCK_SIZE, typename T, typename Kernel, typename... Inputs>
+void LaunchKernel(Kernel &&kernel, const std::shared_ptr<Tensor> &output, const Inputs &...inputs) {
+    auto extract_ptrs
+        = [](const auto &...ts) { return std::make_tuple(static_cast<T *>(ts ? ts->DataPtr() : nullptr)...); };
+    auto input_ptrs = extract_ptrs(inputs...);
+
+    const size_t num_elements = output->NumElements();
+    // Use dynamic block size based on tensor size for better occupancy
+    size_t block_size = std::min(ChooseBlockSize(num_elements), static_cast<size_t>(1024));
+    dim3 block_dims(block_size);
+    dim3 grid_dims(CEIL_DIV(num_elements, block_dims.x));
+    const size_t step = grid_dims.x * block_dims.x;
+
+    for (size_t offset = 0; offset < num_elements; offset += step) {
+        std::apply([&](auto... ptrs) { kernel(grid_dims, block_dims, offset, ptrs...); }, input_ptrs);
+    }
+}
+
+// launch a forward elementwise operation given the calculation function, output, and the inputs
+// Note: currently only support unary and binary operations
+template <size_t BLOCK_SIZE, typename T, typename Func, typename... Inputs>
+void LaunchForward(Func func, const std::shared_ptr<Tensor> &output, const Inputs &...inputs) {
+    auto device = output->GetDevice();
+    const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                                  infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                  ->maca_stream();
+    T *output_ptr = static_cast<T *>(output->DataPtr());
+
+    if constexpr (sizeof...(inputs) == 1) {
+        // Unary case
+        LaunchKernel<BLOCK_SIZE, T>(
+            [&](dim3 grid, dim3 block, size_t offset, auto... ptrs) {
+                UnaryForwardKernel<<<grid, block, 0, maca_stream>>>(output_ptr, func, output->NumElements(), offset,
+                                                                    ptrs...);
+            },
+            output, inputs...);
+    } else if constexpr (sizeof...(inputs) == 2) {
+        // Binary case
+        auto input_tuple = std::make_tuple(inputs...);
+        const auto &input_a = std::get<0>(input_tuple);
+        const auto &input_b = std::get<1>(input_tuple);
+
+        const auto &a_dims = input_a->Dims();
+        const auto &b_dims = input_b->Dims();
+        const auto &out_dims = output->Dims();
+
+        // Fast path: no broadcast, contiguous — skip mcMalloc/Memcpy/CalcOffset.
+        // The IsContiguous() guards ensure non-contiguous tensors fall back to the broadcast
+        // path, keeping the fast path correct when non-contiguous support is added later.
+        if (ShapesEqual(a_dims, out_dims) && ShapesEqual(b_dims, out_dims) && input_a->IsContiguous()
+            && input_b->IsContiguous()) {
+            const size_t num_elements = output->NumElements();
+            const T *a_ptr = static_cast<const T *>(input_a->DataPtr());
+            const T *b_ptr = static_cast<const T *>(input_b->DataPtr());
+            dim3 block_dims(std::min(BLOCK_SIZE, static_cast<size_t>(1024)));
+            dim3 grid_dims(std::min(CEIL_DIV(num_elements, block_dims.x), static_cast<size_t>(65535)));
+            BinaryForwardKernelNoBroadcast<<<grid_dims, block_dims, 0, maca_stream>>>(output_ptr, func, a_ptr, b_ptr,
+                                                                                      num_elements);
+        } else {
+            // Broadcast path: pass strides/shapes by value via kernel parameter memory.
+            // This avoids the per-call mcMallocAsync/mcMemcpyAsync/mcFreeAsync that previously
+            // dominated the host-side jitter floor (especially under LoRA training).
+            BroadcastMeta meta = MakeBroadcastMeta(a_dims, b_dims, out_dims);
+
+            LaunchKernel<BLOCK_SIZE, T>(
+                [&](dim3 grid, dim3 block, size_t /*offset*/, const T *a_ptr, const T *b_ptr) {
+                    BinaryForwardKernel<<<grid, block, 0, maca_stream>>>(output_ptr, func, meta, a_ptr, b_ptr,
+                                                                         output->NumElements());
+                },
+                output, inputs...);
+        }
+    } else {
+        static_assert(sizeof...(inputs) == 1 || sizeof...(inputs) == 2,
+                      "LaunchForward currently only supports unary and binary operations.");
+    }
+}
+
+// Backward kernel for unary operators
+template <typename T, typename Func>
+__global__ void UnaryBackwardKernel(T *output, Func fn, size_t num_elements, size_t offset, const T *grad_output,
+                                    const T *input) {
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset;
+
+    if (idx < num_elements) {
+        output[idx] = Mul<T>(grad_output[idx], fn(input ? input[idx] : T(0)));
+    }
+}
+
+enum class BF16Path { NoBroadcast, TwoPassHist, BlockReduce };
+
+// Lightweight and stable selector for bf16/__half execution paths.
+inline BF16Path DecideBF16Path(const std::vector<int64_t> &b_shape, const std::vector<int64_t> &out_shape,
+                               size_t b_num_elements) {
+    if (ShapesEqual(b_shape, out_shape)) {
+        return BF16Path::NoBroadcast;
+    }
+    const bool varies_last = (b_shape.back() > 1);
+    if (varies_last) {
+        if (b_num_elements <= 4096) {
+            return BF16Path::TwoPassHist; // shared histogram two-pass path
+        }
+    }
+    return BF16Path::BlockReduce; // fallback to block reduction kernel otherwise
+}
+
+// Each B element is used exactly once, so gradients can be written directly without reduction.
+template <typename T, typename FuncA, typename FuncB>
+__global__ void BinaryBackwardKernelNoBroadcast(T *__restrict__ outA, T *__restrict__ outB, FuncA fn_a, FuncB fn_b,
+                                                BroadcastMeta meta, size_t numel, const T *__restrict__ grad_out,
+                                                const T *__restrict__ inA, const T *__restrict__ inB) {
+    const size_t grid_stride = static_cast<size_t>(gridDim.x) * blockDim.x;
+    for (size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; idx < numel; idx += grid_stride) {
+        const int64_t a_off = CalcOffset(idx, meta.ndim, meta.a_strides, meta.a_shape, meta.out_strides);
+        const int64_t b_off = CalcOffset(idx, meta.ndim, meta.b_strides, meta.b_shape, meta.out_strides);
+
+        const T a = inA ? inA[a_off] : T(0);
+        const T b = inB ? inB[b_off] : T(0);
+
+        // Gradient for A has a one-to-one mapping, so we write directly.
+        outA[a_off] = Mul<T>(grad_out[idx], fn_a(a, b));
+
+        // Gradient for B also maps one-to-one; no atomics or reductions are required.
+        outB[b_off] = common::maca::Cast<T>(Mul<T>(grad_out[idx], fn_b(a, b)));
+    }
+}
+
+// First pass of histogram two-pass strategy: per-block accumulation in shared memory.
+template <typename T, typename FuncA, typename FuncB>
+__global__ void BinaryBackwardBhistPass1Kernel(T *__restrict__ outA, float *__restrict__ work, FuncA fn_a, FuncB fn_b,
+                                               BroadcastMeta meta, size_t numel, int K, const T *__restrict__ grad_out,
+                                               const T *__restrict__ inA, const T *__restrict__ inB) {
+    extern __shared__ float s_hist[]; // dynamic shared memory: K bins plus padding for every 32 buckets
+    const int pad = K >> 5;           // insert one padding slot for every 32 buckets
+    const int hist_len = K + pad;
+
+    // Zero the shared histogram buffer.
+    for (int t = threadIdx.x; t < hist_len; t += blockDim.x) { s_hist[t] = 0.0f; }
+    __syncthreads();
+
+    const size_t total_threads = (size_t)gridDim.x * blockDim.x;
+    for (size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; idx < numel; idx += total_threads) {
+        // Linearized offset for B under general broadcasting.
+        const int64_t b_off = CalcOffset(idx, meta.ndim, meta.b_strides, meta.b_shape, meta.out_strides);
+        const int bin = static_cast<int>(b_off); // assume K fits in a 32-bit int
+        const int pbin = bin + (bin >> 5);       // apply padding mapping
+
+        // Compute the offset for A under broadcasting.
+        const int64_t a_off = CalcOffset(idx, meta.ndim, meta.a_strides, meta.a_shape, meta.out_strides);
+
+        const T a = inA ? inA[a_off] : T(0);
+        const T b = inB ? inB[bin] : T(0); // B is indexed via the flattened bin
+
+        // A is not broadcast, so gradients can be written directly.
+        outA[a_off] = Mul<T>(grad_out[idx], fn_a(a, b));
+
+        // Accumulate B's contribution into the shared histogram using float precision.
+        const float g = common::maca::Cast<float>(Mul<T>(grad_out[idx], fn_b(a, b)));
+        atomicAdd(&s_hist[pbin], g);
+    }
+    __syncthreads();
+
+    // Write this block's histogram back to the global workspace: work[block, :].
+    float *dst = work + static_cast<size_t>(blockIdx.x) * static_cast<size_t>(K);
+    for (int bin = threadIdx.x; bin < K; bin += blockDim.x) {
+        const int pbin = bin + (bin >> 5);
+        dst[bin] = s_hist[pbin];
+    }
+}
+
+// Second pass for histogram path: tile the workspace along CTA dimension and atomically add into float buffer.
+template <typename T>
+__global__ void BinaryBackwardBhistPass2Reduce2D(const float *__restrict__ work, float *__restrict__ outB_accum,
+                                                 size_t numBlocks, int K, int tile_height) {
+    const int k = blockIdx.x * blockDim.x + threadIdx.x;
+    if (k >= K) {
+        return;
+    }
+
+    const size_t begin_row = static_cast<size_t>(blockIdx.y) * static_cast<size_t>(tile_height);
+    const size_t end_row = min(begin_row + static_cast<size_t>(tile_height), numBlocks);
+
+    float acc = 0.0f;
+    for (size_t row = begin_row; row < end_row; ++row) { acc += work[row * static_cast<size_t>(K) + k]; }
+
+    atomicAdd(outB_accum + k, acc);
+}
+
+// Convert the accumulated float buffer back to the target type (bf16/__half/float).
+template <typename T> __global__ void CastFloatToTBhist(const float *__restrict__ src, T *__restrict__ dst, int K) {
+    const int k = blockIdx.x * blockDim.x + threadIdx.x;
+    if (k < K) {
+        dst[k] = common::maca::Cast<T>(src[k]);
+    }
+}
+
+// Legacy single-dimensional reduction fallback for small grids where atomic tiling is unnecessary.
+template <typename T>
+__global__ void BinaryBackwardBhistPass2Reduce1D(const float *__restrict__ work, T *__restrict__ outB, size_t numBlocks,
+                                                 int K) {
+    const size_t k = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+    if (k >= static_cast<size_t>(K)) {
+        return;
+    }
+
+    float acc = 0.0f;
+    for (size_t b = 0; b < numBlocks; ++b) { acc += work[b * static_cast<size_t>(K) + k]; }
+    outB[k] = common::maca::Cast<T>(acc);
+}
+
+// Helper that materializes the two-pass histogram path for bf16/__half B gradients.
+template <typename T, typename FuncA, typename FuncB>
+void BinaryBackwardBhistLaunch(FuncA fn_a, FuncB fn_b, T *outA, T *outB, const T *grad_out, const BroadcastMeta &meta,
+                               size_t numel, int K, const T *inA, const T *inB, mcStream_t stream) {
+    const int kBlockSize = 256;
+    int grid = static_cast<int>((numel + kBlockSize - 1) / kBlockSize);
+    if (grid < 1) {
+        grid = 1;
+    }
+
+    // Workspace layout: [grid, K] floats.
+    float *work = nullptr;
+    MACA_CHECK(mcMallocAsync(&work, static_cast<size_t>(grid) * static_cast<size_t>(K) * sizeof(float), stream));
+
+    // Pass 1: per-block histogram accumulation.
+    const size_t smem_bytes = static_cast<size_t>(K + (K >> 5)) * sizeof(float);
+    BinaryBackwardBhistPass1Kernel<T, FuncA, FuncB>
+        <<<grid, kBlockSize, smem_bytes, stream>>>(outA, work, fn_a, fn_b, meta, numel, K, grad_out, inA, inB);
+    MACA_CHECK(mcGetLastError());
+
+    // Pass 2: choose between 1D and 2D reductions depending on workload shape.
+    int dev = 0;
+    int sm_count = 0;
+    MACA_CHECK(mcGetDevice(&dev));
+    MACA_CHECK(mcDeviceGetAttribute(&sm_count, mcDevAttrMultiProcessorCount, dev));
+
+    const int RED_THREADS = 256;
+    const int oneD_blocks = (K + RED_THREADS - 1) / RED_THREADS;
+
+    // Use the 2D path when the 1D kernel underutilizes the SMs and there are many partial histograms to merge.
+    const bool use2D = (oneD_blocks < sm_count) && (grid > 4 * sm_count);
+
+    if (!use2D) {
+        // Fallback: reuse the legacy 1D kernel without atomics.
+        const dim3 rgrid(oneD_blocks);
+        const dim3 rblock(RED_THREADS);
+        BinaryBackwardBhistPass2Reduce1D<T><<<rgrid, rblock, 0, stream>>>(work, outB, static_cast<size_t>(grid), K);
+        MACA_CHECK(mcGetLastError());
+    } else {
+        // 2D tiling path: slice the workspace and accumulate using float atomics.
+        constexpr int kTileHeight = 128; // rows per CTA; tune between 128 and 256 if needed
+        float *outB_accum = nullptr;
+        MACA_CHECK(mcMallocAsync(&outB_accum, static_cast<size_t>(K) * sizeof(float), stream));
+        MACA_CHECK(mcMemsetAsync(outB_accum, 0, static_cast<size_t>(K) * sizeof(float), stream));
+
+        const dim3 rblock(RED_THREADS, 1, 1);
+        const dim3 rgrid2((K + RED_THREADS - 1) / RED_THREADS, (grid + kTileHeight - 1) / kTileHeight, 1);
+
+        BinaryBackwardBhistPass2Reduce2D<T>
+            <<<rgrid2, rblock, 0, stream>>>(work, outB_accum, static_cast<size_t>(grid), K, kTileHeight);
+        MACA_CHECK(mcGetLastError());
+
+        // Convert accumulated floats back to the target dtype.
+        const dim3 cgrid((K + RED_THREADS - 1) / RED_THREADS);
+        CastFloatToTBhist<T><<<cgrid, RED_THREADS, 0, stream>>>(outB_accum, outB, K);
+        MACA_CHECK(mcGetLastError());
+
+        MACA_CHECK(mcFreeAsync(outB_accum, stream));
+    }
+
+    MACA_CHECK(mcFreeAsync(work, stream));
+}
+
+// Backward kernel for binary operators
+// TODO(lzm): determining and passing b_is_broadcasted from the caller; optimize further
+template <typename T, typename FuncA, typename FuncB>
+__global__ void BinaryBackwardKernel(T *output_a, T *output_b, FuncA fn_a, FuncB fn_b, BroadcastMeta meta,
+                                     size_t num_elements, const T *grad_output, const T *input_a, const T *input_b) {
+    extern __shared__ char shared_memory[];
+    const int tid = threadIdx.x;
+    const int warp_id = tid / 32;
+    const int lane_id = tid % 32;
+
+    using WarpReduce = cub::WarpReduce<float>;
+    WarpReduce::TempStorage *temp_storage = reinterpret_cast<WarpReduce::TempStorage *>(shared_memory);
+
+    size_t idx = blockIdx.x * blockDim.x + tid;
+    bool in_bounds = (idx < num_elements);
+
+    int64_t a_offset = 0, b_offset = 0;
+    T a_val = T(0), b_val = T(0);
+    float grad_val = 0.0f;
+
+    if (in_bounds) {
+        a_offset = CalcOffset(idx, meta.ndim, meta.a_strides, meta.a_shape, meta.out_strides);
+        b_offset = CalcOffset(idx, meta.ndim, meta.b_strides, meta.b_shape, meta.out_strides);
+        a_val = input_a ? input_a[a_offset] : T(0);
+        b_val = input_b ? input_b[b_offset] : T(0);
+        output_a[a_offset] = Mul<T>(grad_output[idx], fn_a(a_val, b_val));
+        grad_val = common::maca::Cast<float>(Mul<T>(grad_output[idx], fn_b(a_val, b_val)));
+    }
+
+    unsigned active_mask = __ballot_sync(0xFFFFFFFF, in_bounds);
+    if (!active_mask) {
+        return;
+    }
+
+    int leader = __ffs(active_mask) - 1;
+    int64_t common_offset = __shfl_sync(active_mask, b_offset, leader);
+
+    // Check if all active threads share common b_offset
+    bool warp_uniform = true;
+    for (int i = 0; i < 32; ++i) {
+        if (!(active_mask & (1 << i))) {
+            continue;
+        }
+        int64_t offset_i = __shfl_sync(active_mask, b_offset, i);
+        if (offset_i != common_offset) {
+            warp_uniform = false;
+            break;
+        }
+    }
+
+    if (warp_uniform) {
+        float reduced = WarpReduce(temp_storage[warp_id]).Sum(grad_val);
+        if (lane_id == leader) {
+            // FIXME(lzm): atomicAdd is much slower for bf16 and __half compared to float, needs further optimization
+            atomicAdd(&output_b[common_offset], common::maca::Cast<T>(reduced));
+        }
+    } else if (in_bounds) {
+        // FIXME(lzm): atomicAdd is much slower for bf16 and __half compared to float, needs further optimization
+        atomicAdd(&output_b[b_offset], common::maca::Cast<T>(grad_val));
+    }
+}
+
+// NOTE(dcj): Specialized BinaryBackwardKernel for low-precision types (__half / bfloat16)
+template <typename T, typename FuncA, typename FuncB>
+__global__ void BinaryBackwardKernel(T *output_a, T *output_b, FuncA fn_a, FuncB fn_b, BroadcastMeta meta,
+                                     size_t num_elements, size_t b_num_elements, const T *grad_output, const T *input_a,
+                                     const T *input_b, bool fast_atomics) {
+
+    const int tid = threadIdx.x;
+    const int block_threads = blockDim.x;
+    const int global_idx = blockIdx.x * blockDim.x + tid;
+    bool in_bounds = (global_idx < num_elements);
+
+    // Dynamic shared memory layout: split offsets and gradients into parallel arrays.
+    extern __shared__ char shared_memory[];
+    int64_t *s_offset = reinterpret_cast<int64_t *>(shared_memory);
+    float *s_grad = reinterpret_cast<float *>(s_offset + block_threads + block_threads / kWarpSize);
+
+    // Padding: insert one slot per 32 threads to avoid bank conflicts.
+    const int padded_tid = tid + (tid >> 5);
+
+    // Each thread calculates its own a_offset and b_offset
+    int64_t a_offset = 0, b_offset = 0;
+    float grad_val = 0.0f;
+    T a_val = T(0), b_val = T(0);
+
+    if (in_bounds) {
+        a_offset = CalcOffset(global_idx, meta.ndim, meta.a_strides, meta.a_shape, meta.out_strides);
+        b_offset = CalcOffset(global_idx, meta.ndim, meta.b_strides, meta.b_shape, meta.out_strides);
+
+        a_val = input_a ? input_a[a_offset] : T(0);
+        b_val = input_b ? input_b[b_offset] : T(0);
+
+        // Compute gradient contribution for output_a
+        output_a[a_offset] = Mul<T>(grad_output[global_idx], fn_a(a_val, b_val));
+        // Store gradient contribution for output_b in float for accumulation
+        grad_val = common::maca::Cast<float>(Mul<T>(grad_output[global_idx], fn_b(a_val, b_val)));
+    }
+
+    // Store partial results in shared memory.
+    s_offset[padded_tid] = in_bounds ? b_offset : -1;
+    s_grad[padded_tid] = grad_val;
+
+    __syncthreads();
+
+    // Perform block-wide reduction with padded indices.
+    for (int stride = 1; stride < block_threads; stride *= 2) {
+        __syncthreads();
+        if ((tid % (2 * stride)) == 0 && (tid + stride) < block_threads) {
+            const int p1 = tid + (tid >> 5);
+            const int p2 = (tid + stride) + ((tid + stride) >> 5);
+
+            if (s_offset[p1] == s_offset[p2] && s_offset[p1] != -1) {
+                s_grad[p1] += s_grad[p2];
+                s_offset[p2] = -1;
+            }
+        }
+    }
+    __syncthreads();
+
+    // Write final result back to global memory
+    if (in_bounds) {
+        const int shared_idx = tid + (tid >> 5);
+        if (s_offset[shared_idx] != -1) {
+            fastAtomicAdd<T, size_t>(output_b, s_offset[shared_idx], b_num_elements,
+                                     common::maca::Cast<T>(s_grad[shared_idx]), fast_atomics);
+        }
+    }
+}
+
+// launch unary operator's backward kernel
+template <size_t BLOCK_SIZE, typename T, typename Func, typename... Inputs>
+void LaunchBackward(Func func, const std::shared_ptr<Tensor> &output, const std::shared_ptr<Tensor> &grad_output,
+                    const Inputs &...inputs) {
+    auto device = output->GetDevice();
+    const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                                  infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                  ->maca_stream();
+
+    T *output_ptr = static_cast<T *>(output->DataPtr());
+    const T *grad_ptr = static_cast<const T *>(grad_output->DataPtr());
+
+    LaunchKernel<BLOCK_SIZE, T>(
+        [=](dim3 grid, dim3 block, size_t offset, auto... ptrs) {
+            UnaryBackwardKernel<<<grid, block, 0, maca_stream>>>(output_ptr, func, output->NumElements(), offset,
+                                                                 grad_ptr, ptrs...);
+        },
+        output, inputs...);
+}
+
+// launch binary operator's backward kernel
+template <size_t BLOCK_SIZE, typename T, typename FuncA, typename FuncB, typename... Inputs>
+void LaunchBackward(FuncA fun_a, FuncB fun_b, const std::shared_ptr<Tensor> &output_a,
+                    const std::shared_ptr<Tensor> &output_b, const std::vector<int64_t> &a_dims,
+                    const std::vector<int64_t> &b_dims, const std::shared_ptr<Tensor> &grad_output,
+                    const Inputs &...inputs) {
+    auto device = output_a->GetDevice();
+    const auto &stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                             infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                             ->maca_stream();
+
+    T *output_a_ptr = static_cast<T *>(output_a->DataPtr());
+    T *output_b_ptr = static_cast<T *>(output_b->DataPtr());
+    const T *grad_output_ptr = static_cast<const T *>(grad_output->DataPtr());
+
+    const auto &out_dims = grad_output->Dims();
+    const size_t num_elements = grad_output->NumElements();
+
+    // Fast path: no broadcast, contiguous — skip mcMalloc/Memcpy/CalcOffset.
+    // The IsContiguous() guard ensures non-contiguous grad_output falls back to the broadcast
+    // path, keeping the fast path correct when non-contiguous support is added later.
+    if (ShapesEqual(a_dims, b_dims) && ShapesEqual(a_dims, out_dims) && grad_output->IsContiguous()) {
+        auto extract_ptrs = [](const auto &...ts) {
+            return std::make_tuple(static_cast<const T *>(ts ? ts->DataPtr() : nullptr)...);
+        };
+        auto [input_a_ptr, input_b_ptr] = extract_ptrs(inputs...);
+
+        constexpr int VecSize = kVecSize<T>;
+        // Use vectorized kernel if all pointers are 16-byte aligned and numel is large enough
+        const bool can_vectorize
+            = (num_elements >= static_cast<size_t>(VecSize))
+           && (reinterpret_cast<uintptr_t>(output_a_ptr) % (sizeof(T) * VecSize) == 0)
+           && (reinterpret_cast<uintptr_t>(output_b_ptr) % (sizeof(T) * VecSize) == 0)
+           && (reinterpret_cast<uintptr_t>(grad_output_ptr) % (sizeof(T) * VecSize) == 0)
+           && (!input_a_ptr || reinterpret_cast<uintptr_t>(input_a_ptr) % (sizeof(T) * VecSize) == 0)
+           && (!input_b_ptr || reinterpret_cast<uintptr_t>(input_b_ptr) % (sizeof(T) * VecSize) == 0);
+
+        if (can_vectorize) {
+            const size_t num_vecs = num_elements / VecSize;
+            dim3 block_dims(std::min(static_cast<size_t>(256), std::min(num_vecs, static_cast<size_t>(1024))));
+            dim3 grid_dims(std::min(CEIL_DIV(num_vecs, block_dims.x), static_cast<size_t>(65535)));
+            BinaryBackwardKernelNoBroadcastVectorized<T, VecSize><<<grid_dims, block_dims, 0, stream>>>(
+                output_a_ptr, output_b_ptr, fun_a, fun_b, num_elements, grad_output_ptr, input_a_ptr, input_b_ptr);
+        } else {
+            dim3 block_dims(std::min(BLOCK_SIZE, static_cast<size_t>(1024)));
+            dim3 grid_dims(std::min(CEIL_DIV(num_elements, block_dims.x), static_cast<size_t>(65535)));
+            BinaryBackwardKernelNoBroadcastFast<<<grid_dims, block_dims, 0, stream>>>(
+                output_a_ptr, output_b_ptr, fun_a, fun_b, num_elements, grad_output_ptr, input_a_ptr, input_b_ptr);
+        }
+        return;
+    }
+
+    // Broadcast path: pass strides/shapes by value via kernel parameter memory.
+    // This avoids the per-call mcMallocAsync/mcMemcpyAsync/mcFreeAsync that previously
+    // dominated the host-side jitter floor (especially under LoRA training).
+    BroadcastMeta meta = MakeBroadcastMeta(a_dims, b_dims, out_dims);
+
+    if constexpr (std::is_same_v<T, float>) {
+        LaunchKernel<BLOCK_SIZE, T>(
+            [=](dim3 grid, dim3 block, size_t /*offset*/, auto... ptrs) {
+                const int num_warps = BLOCK_SIZE / kWarpSize;
+                const size_t smem_size = num_warps * sizeof(cub::WarpReduce<float>::TempStorage);
+                BinaryBackwardKernel<<<grid, block, smem_size, stream>>>(output_a_ptr, output_b_ptr, fun_a, fun_b, meta,
+                                                                         num_elements, grad_output_ptr, ptrs...);
+            },
+            output_a, inputs...);
+    } else if constexpr (std::is_same_v<T, __half> || std::is_same_v<T, __maca_bfloat16>) {
+        // Dynamically choose the most efficient bf16/__half strategy based on broadcast pattern.
+        // Reconstruct right-aligned b_shape (stack-only, no device allocations) for
+        // DecideBF16Path which still operates on std::vector.
+        const int ndim = meta.ndim;
+        std::vector<int64_t> b_shape(meta.b_shape, meta.b_shape + ndim);
+        const std::vector<int64_t> &out_shape = out_dims;
+
+        size_t b_num_elements = 1;
+        for (auto v : b_shape) { b_num_elements *= static_cast<size_t>(v); }
+        const int K_linear = static_cast<int>(b_num_elements);
+
+        // Select the execution path.
+        const BF16Path path = DecideBF16Path(b_shape, out_shape, b_num_elements);
+
+        if (path == BF16Path::NoBroadcast) {
+            // No broadcast: write gradients directly without shared memory or atomics.
+            LaunchKernel<BLOCK_SIZE, T>(
+                [=](dim3 grid, dim3 block, size_t /*offset*/, auto... ptrs) {
+                    BinaryBackwardKernelNoBroadcast<T, FuncA, FuncB><<<grid, block, 0, stream>>>(
+                        output_a_ptr, output_b_ptr, fun_a, fun_b, meta, num_elements, grad_output_ptr, ptrs...);
+                },
+                output_a, inputs...);
+            return;
+        }
+
+        if (path == BF16Path::TwoPassHist) {
+            // Small K with variation in the innermost dimension: use two-pass histogram strategy.
+            LaunchKernel<BLOCK_SIZE, T>(
+                [=](dim3 /*grid*/, dim3 /*block*/, size_t /*offset*/, const T *input_a_ptr, const T *input_b_ptr) {
+                    BinaryBackwardBhistLaunch<T, FuncA, FuncB>(fun_a, fun_b, output_a_ptr, output_b_ptr,
+                                                               grad_output_ptr, meta, num_elements, K_linear,
+                                                               input_a_ptr, input_b_ptr, stream);
+                },
+                output_a, inputs...);
+
+            return;
+        }
+
+        // Otherwise fall back to the block-reduction kernel with SoA layout and fast atomics.
+        LaunchKernel<BLOCK_SIZE, T>(
+            [=](dim3 grid, dim3 block, size_t /*offset*/, auto... ptrs) {
+                const int padded_block = BLOCK_SIZE + BLOCK_SIZE / kWarpSize;
+                const size_t smem_size = static_cast<size_t>(padded_block) * (sizeof(int64_t) + sizeof(float));
+                BinaryBackwardKernel<<<grid, block, smem_size, stream>>>(
+                    output_a_ptr, output_b_ptr, fun_a, fun_b, meta, num_elements, output_b->NumElements(),
+                    grad_output_ptr, ptrs..., /*fast_atomics=*/true);
+            },
+            output_a, inputs...);
+    }
+}
+
+template <typename Func> std::shared_ptr<Tensor> UnaryForward(const std::shared_ptr<Tensor> &input, Func unary_fn) {
+    auto dtype = input->Dtype();
+    auto output = std::make_shared<Tensor>(input->Dims(), dtype, input->GetDevice());
+
+    switch (dtype) {
+        DISPATCH_CASE(WRAP(LaunchForward<256, float>(unary_fn, output, input);), DataType::kFLOAT32)
+        DISPATCH_CASE(WRAP(LaunchForward<256, __maca_bfloat16>(unary_fn, output, input);), DataType::kBFLOAT16)
+        DISPATCH_CASE(WRAP(LaunchForward<256, int64_t>(unary_fn, output, input);), DataType::kINT64)
+    default:
+        LOG_LOC(FATAL, "MACA unary forward: 'Unsupported data type'");
+    }
+
+    return output;
+}
+
+template <typename Func>
+std::shared_ptr<Tensor> UnaryBackward(const std::shared_ptr<Tensor> &grad_output, const std::shared_ptr<Tensor> &a,
+                                      Func unary_fn) {
+    auto dtype = grad_output->Dtype();
+    auto a_dtype = a ? a->Dtype() : dtype;
+    DataType promoted_type = DispatchFunc<DataTypeList<INFINI_ALL_TYPES>, DataTypeList<INFINI_ALL_TYPES>>(
+        {dtype, a_dtype}, [=]<typename Tgrad, typename Ta>() { return DataTypeMap_v<WidestType_t<Tgrad, Ta>>; },
+        "MACA UnaryBackward");
+
+    auto grad_output_promoted
+        = dtype == promoted_type ? grad_output : std::make_shared<Tensor>(grad_output->To(promoted_type));
+    auto a_promoted = a_dtype == promoted_type ? a : std::make_shared<Tensor>(a->To(promoted_type));
+    auto output = std::make_shared<Tensor>(grad_output->Dims(), promoted_type, grad_output->GetDevice());
+
+    switch (promoted_type) {
+        DISPATCH_CASE(WRAP({ LaunchBackward<256, float>(unary_fn, output, grad_output_promoted, a_promoted); }),
+                      DataType::kFLOAT32)
+        DISPATCH_CASE(WRAP({ LaunchBackward<256, __maca_bfloat16>(unary_fn, output, grad_output_promoted, a_promoted); }),
+                      DataType::kBFLOAT16)
+        DISPATCH_CASE(WRAP({ LaunchBackward<256, int64_t>(unary_fn, output, grad_output_promoted, a_promoted); }),
+                      DataType::kINT64)
+    default:
+        LOG_LOC(FATAL, "MACA unary backward: 'Unsupported data type'");
+    }
+
+    return output;
+}
+
+template <typename Func>
+std::shared_ptr<Tensor> BinaryForward(const std::shared_ptr<Tensor> &a, const std::shared_ptr<Tensor> &b,
+                                      Func binary_fn) {
+    auto a_dtype = a->Dtype();
+    auto b_dtype = b->Dtype();
+
+    DataType promoted_type = DispatchFunc<DataTypeList<INFINI_ALL_TYPES>, DataTypeList<INFINI_ALL_TYPES>>(
+        {a_dtype, b_dtype}, [=]<typename Ta, typename Tb>() { return DataTypeMap_v<WidestType_t<Ta, Tb>>; },
+        "MACA BinaryForward");
+
+    auto a_promoted = a_dtype == promoted_type ? a : std::make_shared<Tensor>(a->To(promoted_type));
+    auto b_promoted = b_dtype == promoted_type ? b : std::make_shared<Tensor>(b->To(promoted_type));
+    // Currently a and b should have the same data type and only one-way broadcasting from b to a is assumed by
+    // default
+    CHECK(a->NumElements() >= b->NumElements() && a->NumElements() % b->NumElements() == 0);
+
+    auto output = std::make_shared<Tensor>(a->Dims(), promoted_type, a->GetDevice());
+
+    switch (promoted_type) {
+        DISPATCH_CASE(WRAP(LaunchForward<256, float>(binary_fn, output, a_promoted, b_promoted);), DataType::kFLOAT32)
+        DISPATCH_CASE(WRAP(LaunchForward<256, __maca_bfloat16>(binary_fn, output, a_promoted, b_promoted);),
+                      DataType::kBFLOAT16)
+        DISPATCH_CASE(WRAP(LaunchForward<256, int64_t>(binary_fn, output, a_promoted, b_promoted);), DataType::kINT64)
+    default:
+        LOG_LOC(FATAL, "MACA binary forward: 'Unsupported data type'");
+    }
+
+    return output;
+}
+
+template <typename FuncA, typename FuncB>
+std::pair<std::shared_ptr<Tensor>, std::shared_ptr<Tensor>>
+BinaryBackward(const std::shared_ptr<Tensor> &grad_output, const std::shared_ptr<Tensor> &a,
+               const std::shared_ptr<Tensor> &b, const std::vector<int64_t> &a_dims, const std::vector<int64_t> &b_dims,
+               FuncA fn_a, FuncB fn_b) {
+    const auto a_num_elements = std::accumulate(a_dims.begin(), a_dims.end(), 1, std::multiplies<int64_t>());
+    const auto b_num_elements = std::accumulate(b_dims.begin(), b_dims.end(), 1, std::multiplies<int64_t>());
+
+    std::shared_ptr<Tensor> a_promoted = a;
+    std::shared_ptr<Tensor> b_promoted = b;
+    std::shared_ptr<Tensor> grad_output_promoted = grad_output;
+
+    auto dtype = grad_output_promoted->Dtype();
+    auto device = grad_output->GetDevice();
+
+    auto a_dtype = a_promoted ? a_promoted->Dtype() : dtype;
+    auto b_dtype = b_promoted ? b_promoted->Dtype() : dtype;
+    // Compute dtype determined by saved tensors (forward compute dtype), not grad_output
+    DataType promoted_type = DispatchFunc<DataTypeList<INFINI_ALL_TYPES>, DataTypeList<INFINI_ALL_TYPES>>(
+        {a_dtype, b_dtype}, [=]<typename Ta, typename Tb>() { return DataTypeMap_v<WidestType_t<Ta, Tb>>; },
+        "MACA BinaryBackward");
+
+    CHECK(a_num_elements >= b_num_elements && a_num_elements % b_num_elements == 0);
+
+    auto promote_if_needed = [&](std::shared_ptr<Tensor> &t, size_t expected_numel, DataType promoted_type) {
+        if (t) {
+            CHECK(expected_numel == t->NumElements());
+            if (t->Dtype() != promoted_type) {
+                t = std::make_shared<Tensor>(t->To(promoted_type));
+            }
+        }
+    };
+    promote_if_needed(a_promoted, a_num_elements, promoted_type);
+    promote_if_needed(b_promoted, b_num_elements, promoted_type);
+    if (dtype != promoted_type) {
+        grad_output_promoted = std::make_shared<Tensor>(grad_output_promoted->To(promoted_type));
+    }
+
+    auto grad_a = std::make_shared<Tensor>(a_dims, promoted_type, device);
+    auto grad_b = std::make_shared<Tensor>(b_dims, promoted_type, device);
+
+    // Only Fill(0) when broadcast is needed (atomicAdd requires zero-init).
+    // The no-broadcast fast path writes every element directly.
+    const bool needs_broadcast = !ShapesEqual(a_dims, b_dims) || !ShapesEqual(a_dims, grad_output->Dims());
+
+    switch (promoted_type) {
+        DISPATCH_CASE(WRAP({
+                          if (needs_broadcast) {
+                              grad_a->Fill<float>(0.0f);
+                              grad_b->Fill<float>(0.0f);
+                          }
+                          LaunchBackward<256, float>(fn_a, fn_b, grad_a, grad_b, a_dims, b_dims, grad_output_promoted,
+                                                     a_promoted, b_promoted);
+                      }),
+                      DataType::kFLOAT32)
+        DISPATCH_CASE(WRAP({
+                          if (needs_broadcast) {
+                              grad_a->Fill<__maca_bfloat16>(0);
+                              grad_b->Fill<__maca_bfloat16>(0);
+                          }
+                          LaunchBackward<256, __maca_bfloat16>(fn_a, fn_b, grad_a, grad_b, a_dims, b_dims,
+                                                           grad_output_promoted, a_promoted, b_promoted);
+                      }),
+                      DataType::kBFLOAT16)
+        // FIXME(zbl): AtomicAdd does not support int64_t
+        // DISPATCH_CASE(WRAP({
+        //                   grad_a->Fill<int64_t>(0);
+        //                   grad_b->Fill<int64_t>(0);
+        //                   LaunchBackward<256, int64_t>(fn_a, fn_b, grad_a, grad_b, a_dims, b_dims, grad_output, a,
+        //                   b);
+        //               }),
+        //               DataType::kINT64)
+    default:
+        LOG_LOC(FATAL, "MACA binary backward: 'Unsupported data type'");
+    }
+
+    return {grad_a, grad_b};
+}
+} // namespace
+
+std::shared_ptr<Tensor> NegForward(const std::shared_ptr<Tensor> &input) {
+    DISPATCH(input->Dtype(), return UnaryForward(input, [] __device__(auto x) { return Neg(x); });
+             , INFINI_ALL_FLOATING_TYPES)
+}
+
+std::shared_ptr<Tensor> NegBackward(const std::shared_ptr<Tensor> &grad_output) {
+    DISPATCH(grad_output->Dtype(),
+             return UnaryBackward(grad_output, nullptr, [] __device__(auto x) { return decltype(x){-1}; });
+             , INFINI_ALL_FLOATING_TYPES)
+}
+
+std::shared_ptr<Tensor> ReciprocalForward(const std::shared_ptr<Tensor> &input) {
+    DISPATCH(input->Dtype(), return UnaryForward(input, [] __device__(auto x) { return Reciprocal(x); });
+             , INFINI_ALL_FLOATING_TYPES)
+}
+
+std::shared_ptr<Tensor> ReciprocalBackward(const std::shared_ptr<Tensor> &grad_output,
+                                           const std::shared_ptr<Tensor> &input) {
+    DISPATCH(
+        grad_output->Dtype(),
+        return UnaryBackward(grad_output, input, [] __device__(auto x) { return Div(decltype(x){-1}, Mul(x, x)); });
+        , INFINI_ALL_FLOATING_TYPES)
+}
+
+std::shared_ptr<Tensor> SinForward(const std::shared_ptr<Tensor> &input) {
+    DISPATCH(input->Dtype(), return UnaryForward(input, [] __device__(auto x) { return Sin(x); });
+             , INFINI_ALL_FLOATING_TYPES)
+}
+
+std::shared_ptr<Tensor> SinBackward(const std::shared_ptr<Tensor> &grad_output, const std::shared_ptr<Tensor> &input) {
+    DISPATCH(grad_output->Dtype(), return UnaryBackward(grad_output, input, [] __device__(auto x) { return Cos(x); });
+             , INFINI_ALL_FLOATING_TYPES)
+}
+
+std::shared_ptr<Tensor> CosForward(const std::shared_ptr<Tensor> &input) {
+    DISPATCH(input->Dtype(), return UnaryForward(input, [] __device__(auto x) { return Cos(x); });
+             , INFINI_ALL_FLOATING_TYPES)
+}
+
+std::shared_ptr<Tensor> CosBackward(const std::shared_ptr<Tensor> &grad_output, const std::shared_ptr<Tensor> &input) {
+    DISPATCH(grad_output->Dtype(),
+             return UnaryBackward(grad_output, input, [] __device__(auto x) { return Neg(Sin(x)); });
+             , INFINI_ALL_FLOATING_TYPES)
+}
+
+std::shared_ptr<Tensor> TanhForward(const std::shared_ptr<Tensor> &input) {
+    DISPATCH(input->Dtype(), return UnaryForward(input, [] __device__(auto x) { return Tanh(x); });
+             , INFINI_ALL_FLOATING_TYPES)
+}
+
+std::shared_ptr<Tensor> TanhBackward(const std::shared_ptr<Tensor> &grad_output,
+                                     const std::shared_ptr<Tensor> &output) {
+    DISPATCH(grad_output->Dtype(),
+             return UnaryBackward(grad_output, output, [] __device__(auto x) { return decltype(x){1} - Mul(x, x); });
+             , INFINI_ALL_FLOATING_TYPES)
+}
+
+std::shared_ptr<Tensor> PowForward(const std::shared_ptr<Tensor> &input, float scalar, bool scalar_is_base) {
+    DISPATCH(input->Dtype(), WRAP({
+                 if (scalar_is_base) {
+                     return UnaryForward(
+                         input, [scalar] __device__(auto x) { return Pow(static_cast<decltype(x)>(scalar), x); });
+                 } else {
+                     return UnaryForward(
+                         input, [scalar] __device__(auto x) { return Pow(x, static_cast<decltype(x)>(scalar)); });
+                 }
+             }),
+             INFINI_ALL_FLOATING_TYPES);
+}
+
+std::shared_ptr<Tensor> PowBackward(const std::shared_ptr<Tensor> &grad_output, const std::shared_ptr<Tensor> &input,
+                                    float scalar, bool scalar_is_base) {
+    DISPATCH(grad_output->Dtype(),
+             return UnaryBackward(grad_output, input,
+                                  [scalar, scalar_is_base] __device__(auto x) {
+                                      auto casted_scalar = common::maca::Cast<decltype(x)>(scalar);
+                                      if (scalar_is_base) {
+                                          return Mul(Log(casted_scalar), Pow(casted_scalar, x));
+                                      } else {
+                                          return Mul(casted_scalar, Pow(x, casted_scalar - decltype(x){1}));
+                                      }
+                                  });
+             , INFINI_ALL_FLOATING_TYPES)
+}
+
+std::shared_ptr<Tensor> RsqrtForward(const std::shared_ptr<Tensor> &input) {
+    DISPATCH(input->Dtype(), return UnaryForward(input, [] __device__(auto x) { return Rsqrt(x); });
+             , INFINI_ALL_FLOATING_TYPES)
+}
+
+std::shared_ptr<Tensor> RsqrtBackward(const std::shared_ptr<Tensor> &grad_output,
+                                      const std::shared_ptr<Tensor> &input) {
+    DISPATCH(grad_output->Dtype(),
+             return UnaryBackward(
+                 grad_output, input,
+                 [] __device__(auto x) { return Mul(static_cast<decltype(x)>(-0.5), Mul(Reciprocal(x), Rsqrt(x))); });
+             , INFINI_ALL_FLOATING_TYPES)
+}
+
+std::shared_ptr<Tensor> ExpForward(const std::shared_ptr<Tensor> &input) {
+    DISPATCH(input->Dtype(), return UnaryForward(input, [] __device__(auto x) { return Exp(x); });
+             , INFINI_ALL_FLOATING_TYPES)
+}
+
+std::shared_ptr<Tensor> ExpBackward(const std::shared_ptr<Tensor> &grad_output, const std::shared_ptr<Tensor> &output) {
+    DISPATCH(grad_output->Dtype(), return UnaryBackward(grad_output, output, [] __device__(auto y) { return y; });
+             , INFINI_ALL_FLOATING_TYPES)
+}
+
+std::shared_ptr<Tensor> LogForward(const std::shared_ptr<Tensor> &input) {
+    DISPATCH(input->Dtype(), return UnaryForward(input, [] __device__(auto x) { return Log(x); });
+             , INFINI_ALL_FLOATING_TYPES)
+}
+
+std::shared_ptr<Tensor> LogBackward(const std::shared_ptr<Tensor> &grad_output, const std::shared_ptr<Tensor> &input) {
+    DISPATCH(grad_output->Dtype(),
+             return UnaryBackward(grad_output, input, [] __device__(auto x) { return Reciprocal(x); });
+             , INFINI_ALL_FLOATING_TYPES)
+}
+
+std::shared_ptr<Tensor> EqualsForward(const std::shared_ptr<Tensor> &a, const std::shared_ptr<Tensor> &b) {
+    DISPATCH(a->Dtype(),
+             return BinaryForward(a, b,
+                                  [] __device__(auto x, auto y) { return (x == y) ? decltype(x){1} : decltype(x){0}; });
+             , INFINI_ALL_TYPES)
+}
+
+std::shared_ptr<Tensor> EqualsScalarForward(const std::shared_ptr<Tensor> &a, float scalar) {
+    DISPATCH(a->Dtype(), return UnaryForward(a,
+                                             [scalar] __device__(auto x) {
+                                                 return x == static_cast<decltype(x)>(scalar) ? decltype(x){1}
+                                                                                              : decltype(x){0};
+                                             });
+             , INFINI_ALL_FLOATING_TYPES)
+}
+
+std::shared_ptr<Tensor> LtForward(const std::shared_ptr<Tensor> &a, const std::shared_ptr<Tensor> &b) {
+    DISPATCH(a->Dtype(), return BinaryForward(
+                             a, b, [] __device__(auto x, auto y) { return x < y ? decltype(x){1} : decltype(x){0}; });
+             , INFINI_ALL_TYPES)
+}
+
+std::shared_ptr<Tensor> LtScalarForward(const std::shared_ptr<Tensor> &a, float scalar) {
+    DISPATCH(a->Dtype(), return UnaryForward(a,
+                                             [scalar] __device__(auto x) {
+                                                 return (x < static_cast<decltype(x)>(scalar)) ? decltype(x){1}
+                                                                                               : decltype(x){0};
+                                             });
+             , INFINI_ALL_TYPES)
+}
+
+std::shared_ptr<Tensor> LeForward(const std::shared_ptr<Tensor> &a, const std::shared_ptr<Tensor> &b) {
+    DISPATCH(a->Dtype(),
+             return BinaryForward(a, b,
+                                  [] __device__(auto x, auto y) { return (x <= y) ? decltype(x){1} : decltype(x){0}; });
+             , INFINI_ALL_TYPES)
+}
+
+std::shared_ptr<Tensor> LeScalarForward(const std::shared_ptr<Tensor> &a, float scalar) {
+    DISPATCH(a->Dtype(), return UnaryForward(a,
+                                             [scalar] __device__(auto x) {
+                                                 return (x <= static_cast<decltype(x)>(scalar)) ? decltype(x){1}
+                                                                                                : decltype(x){0};
+                                             });
+             , INFINI_ALL_TYPES)
+}
+
+std::shared_ptr<Tensor> GtForward(const std::shared_ptr<Tensor> &a, const std::shared_ptr<Tensor> &b) {
+    DISPATCH(a->Dtype(), return BinaryForward(
+                             a, b, [] __device__(auto x, auto y) { return x > y ? decltype(x){1} : decltype(x){0}; });
+             , INFINI_ALL_TYPES)
+}
+
+std::shared_ptr<Tensor> GtScalarForward(const std::shared_ptr<Tensor> &a, float scalar) {
+    DISPATCH(a->Dtype(), return UnaryForward(a,
+                                             [scalar] __device__(auto x) {
+                                                 return (x > static_cast<decltype(x)>(scalar)) ? decltype(x){1}
+                                                                                               : decltype(x){0};
+                                             });
+             , INFINI_ALL_TYPES)
+}
+
+std::shared_ptr<Tensor> GeForward(const std::shared_ptr<Tensor> &a, const std::shared_ptr<Tensor> &b) {
+    DISPATCH(a->Dtype(),
+             return BinaryForward(a, b,
+                                  [] __device__(auto x, auto y) { return (x >= y) ? decltype(x){1} : decltype(x){0}; });
+             , INFINI_ALL_TYPES)
+}
+
+std::shared_ptr<Tensor> GeScalarForward(const std::shared_ptr<Tensor> &a, float scalar) {
+    DISPATCH(a->Dtype(), return UnaryForward(a,
+                                             [scalar] __device__(auto x) {
+                                                 return (x >= static_cast<decltype(x)>(scalar)) ? decltype(x){1}
+                                                                                                : decltype(x){0};
+                                             });
+             , INFINI_ALL_TYPES)
+}
+
+std::shared_ptr<Tensor> OrForward(const std::shared_ptr<Tensor> &a, const std::shared_ptr<Tensor> &b) {
+    DISPATCH(a->Dtype(), return BinaryForward(a, b,
+                                              [] __device__(auto x, auto y) {
+                                                  return (x != decltype(x){0} || y != decltype(y){0}) ? decltype(x){1}
+                                                                                                      : decltype(x){0};
+                                              });
+             , INFINI_ALL_TYPES)
+}
+
+std::shared_ptr<Tensor> AndForward(const std::shared_ptr<Tensor> &a, const std::shared_ptr<Tensor> &b) {
+    DISPATCH(a->Dtype(), return BinaryForward(a, b,
+                                              [] __device__(auto x, auto y) {
+                                                  return (x != decltype(x){0} && y != decltype(y){0}) ? decltype(x){1}
+                                                                                                      : decltype(x){0};
+                                              });
+             , INFINI_ALL_TYPES)
+}
+
+std::shared_ptr<Tensor> AddForward(const std::shared_ptr<Tensor> &a, const std::shared_ptr<Tensor> &b) {
+    DISPATCH(a->Dtype(), return BinaryForward(a, b, [] __device__(auto x, auto y) { return Add(x, y); });
+             , INFINI_ALL_FLOATING_TYPES)
+}
+
+std::pair<std::shared_ptr<Tensor>, std::shared_ptr<Tensor>> AddBackward(const std::shared_ptr<Tensor> &grad_output,
+                                                                        const std::vector<int64_t> &a_dims,
+                                                                        const std::vector<int64_t> &b_dims) {
+    auto fn = [] __device__(auto x, auto y) { return decltype(x){1}; };
+    return BinaryBackward(grad_output, nullptr, nullptr, a_dims, b_dims, fn, fn);
+}
+
+std::shared_ptr<Tensor> AddScalarForward(const std::shared_ptr<Tensor> &a, float scalar) {
+    DISPATCH(a->Dtype(),
+             return UnaryForward(a, [scalar] __device__(auto x) { return Add(x, static_cast<decltype(x)>(scalar)); });
+             , INFINI_ALL_TYPES)
+}
+
+std::shared_ptr<Tensor> AddScalarBackward(const std::shared_ptr<Tensor> &grad_output) {
+    DISPATCH(grad_output->Dtype(),
+             return UnaryBackward(grad_output, nullptr,
+                                  [] __device__(auto x) { return common::maca::Cast<decltype(x)>(1); });
+             , INFINI_ALL_TYPES)
+}
+
+std::shared_ptr<Tensor> SubForward(const std::shared_ptr<Tensor> &a, const std::shared_ptr<Tensor> &b) {
+    DISPATCH(a->Dtype(), return BinaryForward(a, b, [] __device__(auto x, auto y) { return Sub(x, y); });
+             , INFINI_ALL_TYPES)
+}
+
+std::pair<std::shared_ptr<Tensor>, std::shared_ptr<Tensor>> SubBackward(const std::shared_ptr<Tensor> &grad_output,
+                                                                        const std::vector<int64_t> &a_dims,
+                                                                        const std::vector<int64_t> &b_dims) {
+    auto fn_a = [] __device__(auto x, auto y) { return decltype(x){1}; };
+    auto fn_b = [] __device__(auto x, auto y) { return decltype(x){-1}; };
+    return BinaryBackward(grad_output, nullptr, nullptr, a_dims, b_dims, fn_a, fn_b);
+}
+
+std::shared_ptr<Tensor> MulForward(const std::shared_ptr<Tensor> &a, const std::shared_ptr<Tensor> &b) {
+    DISPATCH(a->Dtype(), return BinaryForward(a, b, [] __device__(auto x, auto y) { return Mul(x, y); });
+             , INFINI_ALL_FLOATING_TYPES)
+}
+
+std::pair<std::shared_ptr<Tensor>, std::shared_ptr<Tensor>> MulBackward(const std::shared_ptr<Tensor> &grad_output,
+                                                                        const std::shared_ptr<Tensor> &a,
+                                                                        const std::shared_ptr<Tensor> &b) {
+    DISPATCH_WITH_DEFAULT(grad_output->Dtype(),
+                          return BinaryBackward(
+                              grad_output, a, b, a->Dims(), b->Dims(), [] __device__(auto, auto y) { return y; },
+                              [] __device__(auto x, auto) { return x; });
+                          , WRAP({
+                              LOG_LOC(FATAL, "MACA MulBackward: 'Unsupported data type'");
+                              return {nullptr, nullptr};
+                          }),
+                          INFINI_ALL_FLOATING_TYPES)
+}
+
+std::shared_ptr<Tensor> MulScalarForward(const std::shared_ptr<Tensor> &a, float scalar) {
+    DISPATCH(a->Dtype(),
+             return UnaryForward(a, [scalar] __device__(auto x) { return Mul(x, static_cast<decltype(x)>(scalar)); });
+             , INFINI_ALL_FLOATING_TYPES)
+}
+
+std::shared_ptr<Tensor> MulScalarBackward(const std::shared_ptr<Tensor> &grad_output, float scalar) {
+    DISPATCH(grad_output->Dtype(),
+             return UnaryBackward(grad_output, nullptr,
+                                  [scalar] __device__(auto x) { return static_cast<decltype(x)>(scalar); });
+             , INFINI_ALL_FLOATING_TYPES)
+}
+
+std::shared_ptr<Tensor> DivForward(const std::shared_ptr<Tensor> &a, const std::shared_ptr<Tensor> &b) {
+    DISPATCH(a->Dtype(), return BinaryForward(a, b, [] __device__(auto x, auto y) { return Div(x, y); });
+             , INFINI_ALL_FLOATING_TYPES)
+}
+
+std::pair<std::shared_ptr<Tensor>, std::shared_ptr<Tensor>> DivBackward(const std::shared_ptr<Tensor> &grad_output,
+                                                                        const std::shared_ptr<Tensor> &a,
+                                                                        const std::shared_ptr<Tensor> &b) {
+    DISPATCH_WITH_DEFAULT(grad_output->Dtype(), return BinaryBackward(
+                                                    grad_output, a, b, a->Dims(), b->Dims(),
+                                                    [] __device__(auto, auto y) { return Reciprocal(y); },
+                                                    [] __device__(auto x, auto y) { return Div(Neg(x), Mul(y, y)); });
+                          , WRAP({
+                              LOG_LOC(FATAL, "MACA DivBackward: 'Unsupported data type'");
+                              return {nullptr, nullptr};
+                          }),
+                          INFINI_ALL_FLOATING_TYPES)
+}
+
+std::shared_ptr<Tensor> SigmoidForward(const std::shared_ptr<Tensor> &input) {
+    DISPATCH(input->Dtype(), return UnaryForward(input, [] __device__(auto x) { return Sigmoid(x); });
+             , INFINI_ALL_FLOATING_TYPES)
+}
+
+std::shared_ptr<Tensor> SigmoidBackward(const std::shared_ptr<Tensor> &output,
+                                        const std::shared_ptr<Tensor> &grad_output) {
+    DISPATCH(
+        grad_output->Dtype(),
+        return UnaryBackward(grad_output, output, [] __device__(auto x) { return Mul(x, Sub(decltype(x){1}, x)); });
+        , INFINI_ALL_FLOATING_TYPES)
+}
+} // namespace infini_train::kernels::maca
+
+#define REGISTER_MACA_ELEMENTWISE_KERNEL(kernel_name)                                                                  \
+    REGISTER_KERNEL(infini_train::Device::DeviceType::kMACA, kernel_name, infini_train::kernels::maca::kernel_name)
+
+REGISTER_MACA_ELEMENTWISE_KERNEL(NegForward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(NegBackward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(ReciprocalForward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(ReciprocalBackward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(SinForward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(SinBackward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(CosForward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(CosBackward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(TanhForward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(TanhBackward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(PowForward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(PowBackward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(RsqrtForward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(RsqrtBackward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(ExpForward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(ExpBackward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(LogForward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(LogBackward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(EqualsForward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(EqualsScalarForward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(LtForward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(LtScalarForward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(LeForward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(LeScalarForward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(GtForward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(GtScalarForward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(GeForward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(GeScalarForward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(OrForward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(AndForward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(AddForward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(AddBackward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(AddScalarForward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(AddScalarBackward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(SubForward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(SubBackward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(MulForward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(MulBackward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(MulScalarForward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(MulScalarBackward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(DivForward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(DivBackward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(SigmoidForward)
+REGISTER_MACA_ELEMENTWISE_KERNEL(SigmoidBackward)
+
+#undef REGISTER_MACA_ELEMENTWISE_KERNEL
diff --git a/infini_train/src/kernels/maca/fill.maca b/infini_train/src/kernels/maca/fill.maca
new file mode 100644
index 00000000..accdac0f
--- /dev/null
+++ b/infini_train/src/kernels/maca/fill.maca
@@ -0,0 +1,45 @@
+#include <cstddef>
+#include <memory>
+
+#include "infini_train/include/core/runtime/device_guard.h"
+#include "infini_train/include/device.h"
+#include "infini_train/include/dispatcher.h"
+#include "infini_train/include/tensor.h"
+
+#include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
+
+namespace infini_train::kernels::maca {
+
+template <typename T> __global__ void FillKernel(T *data, T value, size_t size) {
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < size) {
+        data[idx] = value;
+    }
+}
+
+// TODO(dcj): refactor Fill kernel with elementwise template
+void Fill(std::shared_ptr<Tensor> tensor, void *value_ptr) {
+    const int num_tokens = tensor->NumElements();
+    const int threads_per_block = 256;
+    const int num_blocks = (num_tokens + threads_per_block - 1) / threads_per_block;
+    auto device = tensor->GetDevice();
+    const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                                  infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                  ->maca_stream();
+
+    DispatchFunc<INFINI_ALL_TYPES>(
+        tensor->Dtype(),
+        [=]<typename T>() {
+            FillKernel<T><<<num_blocks, threads_per_block, 0, maca_stream>>>(
+                static_cast<T *>(tensor->DataPtr()), *(static_cast<T *>(value_ptr)), tensor->NumElements());
+        },
+        "MACA Fill");
+}
+} // namespace infini_train::kernels::maca
+
+#define REGISTER_MACA_FILL_KERNEL(kernel_name)                                                                         \
+    REGISTER_KERNEL(infini_train::Device::DeviceType::kMACA, kernel_name, infini_train::kernels::maca::kernel_name)
+
+REGISTER_MACA_FILL_KERNEL(Fill)
+
+#undef REGISTER_MACA_FILL_KERNEL
diff --git a/infini_train/src/kernels/maca/linear.maca b/infini_train/src/kernels/maca/linear.maca
new file mode 100644
index 00000000..accbec9f
--- /dev/null
+++ b/infini_train/src/kernels/maca/linear.maca
@@ -0,0 +1,508 @@
+#include <functional>
+#include <memory>
+#include <numeric>
+#include <vector>
+
+#include <cub/block/block_reduce.cuh>
+#include <mcblas/mcblas.h>
+
+#include "infini_train/include/autograd/linear.h"
+#include "infini_train/include/common/maca/common_maca.h"
+#include "infini_train/include/common/maca/kernel_helper.cuh"
+#include "infini_train/include/core/runtime/device_guard.h"
+#include "infini_train/include/dispatcher.h"
+#include "infini_train/include/tensor.h"
+
+#include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
+
+namespace infini_train::kernels::maca {
+
+std::shared_ptr<Tensor> MatmulForward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &other) {
+    /*
+     output[*, m, n] = input[*, m, k] * other[*, k, n]
+     */
+    const auto &input_dims = input->Dims();
+    const auto &other_dims = other->Dims();
+
+    CHECK_GE(input_dims.size(), 2);
+    CHECK_GE(other_dims.size(), 2);
+    CHECK_EQ(input_dims.size(), other_dims.size());
+
+    const int64_t m = input_dims[input_dims.size() - 2];
+    const int64_t k = input_dims[input_dims.size() - 1];
+    CHECK_EQ(k, other_dims[other_dims.size() - 2]);
+    const int64_t n = other_dims[other_dims.size() - 1];
+
+    const int64_t bs = std::accumulate(input_dims.rbegin() + 2, input_dims.rend(), 1, std::multiplies<int64_t>{});
+    for (int64_t i = 0; i < input_dims.size() - 2; ++i) {
+        CHECK_EQ(input_dims[i], other_dims[i]) << "Batch dims must match";
+    }
+
+    auto dtype = input->Dtype();
+    std::vector<int64_t> output_dims = input_dims;
+    output_dims[output_dims.size() - 1] = n;
+    auto output = std::make_shared<Tensor>(output_dims, dtype, input->GetDevice());
+
+    auto device = input->GetDevice();
+    const float alpha = 1.0f, beta = 0.0f;
+    mcblasHandle_t handle = dynamic_cast<infini_train::core::maca::MacaBlasHandle *>(
+                                infini_train::core::GetDeviceGuardImpl(device.type())->GetBlasHandle(device))
+                                ->mcblas_handle();
+
+    // cuBLAS is colmun-major
+    // output = input * other --> output.T = other.T * input.T
+    // C = A * B ==> output.T[*, n, m] = other.T[*, n, k] * input.T[*, k, m]
+    // C = output.T[*, n, m]
+    // A = other.T[*, n, k]
+    // B = input.T[*, k, m]
+    int lda = n;
+    int ldb = k;
+    int ldc = n;
+    int64_t stride_a = n * k;
+    int64_t stride_b = k * m;
+    int64_t stride_c = m * n;
+    // NOTE(zbl): the last mcblasGemmAlgo_t param has no effect on GPU arch >= sm_80(Ampere)
+
+    switch (dtype) {
+        DISPATCH_CASE(WRAP(MCBLAS_CHECK(mcblasGemmStridedBatchedEx(
+                          handle, MCBLAS_OP_N, MCBLAS_OP_N, n, m, k, &alpha, other->DataPtr(), MACA_R_32F, lda,
+                          stride_a, input->DataPtr(), MACA_R_32F, ldb, stride_b, &beta, output->DataPtr(), MACA_R_32F,
+                          ldc, stride_c, bs, MACA_R_32F, MCBLAS_GEMM_DEFAULT));),
+                      DataType::kFLOAT32)
+        DISPATCH_CASE(WRAP(MCBLAS_CHECK(mcblasGemmStridedBatchedEx(
+                          handle, MCBLAS_OP_N, MCBLAS_OP_N, n, m, k, &alpha, other->DataPtr(), MACA_R_16BF, lda,
+                          stride_a, input->DataPtr(), MACA_R_16BF, ldb, stride_b, &beta, output->DataPtr(), MACA_R_16BF,
+                          ldc, stride_c, bs, MACA_R_32F, MCBLAS_GEMM_DEFAULT));),
+                      DataType::kBFLOAT16)
+    default:
+        LOG_UNSUPPORTED_DTYPE(dtype, "MACA MatmulForward");
+    }
+
+    return output;
+}
+
+std::tuple<std::shared_ptr<Tensor>, std::shared_ptr<Tensor>>
+MatmulBackward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &other,
+               const std::shared_ptr<Tensor> &grad_output) {
+    /*
+       grad_input[*, m, k] = grad_output[*, m, n] * other[*, k, n]^T
+       grad_other[*, k, n] = input[*, m, k]^T * grad_output[*, m, n]
+    */
+
+    auto input_dtype = input->Dtype();
+    auto other_dtype = other->Dtype();
+    auto grad_output_dtype = grad_output->Dtype();
+    // Compute dtype determined by saved tensors (forward compute dtype), not grad_output
+    DataType compute_dtype = DispatchFunc<DataTypeList<INFINI_ALL_TYPES>, DataTypeList<INFINI_ALL_TYPES>>(
+        {input_dtype, other_dtype}, [=]<typename Tin, typename To>() { return DataTypeMap_v<WidestType_t<Tin, To>>; },
+        "MACA MatmulBackward");
+
+    auto input_promoted = input_dtype == compute_dtype ? input : std::make_shared<Tensor>(input->To(compute_dtype));
+    auto other_promoted = other_dtype == compute_dtype ? other : std::make_shared<Tensor>(other->To(compute_dtype));
+    auto grad_output_promoted
+        = grad_output_dtype == compute_dtype ? grad_output : std::make_shared<Tensor>(grad_output->To(compute_dtype));
+
+    const auto &input_dims = input->Dims();
+    const auto &other_dims = other->Dims();
+    const auto &grad_output_dims = grad_output->Dims();
+
+    CHECK_GE(input_dims.size(), 2);
+    CHECK_EQ(input_dims.size(), other_dims.size());
+    CHECK_EQ(input_dims.size(), grad_output_dims.size());
+
+    const int64_t m = input_dims[input_dims.size() - 2];
+    const int64_t k = input_dims[input_dims.size() - 1];
+    const int64_t n = other_dims[other_dims.size() - 1];
+    CHECK_EQ(k, other_dims[other_dims.size() - 2]);
+    CHECK_EQ(m, grad_output_dims[grad_output_dims.size() - 2]);
+    CHECK_EQ(n, grad_output_dims[grad_output_dims.size() - 1]);
+
+    const int64_t bs = std::accumulate(input_dims.rbegin() + 2, input_dims.rend(), 1, std::multiplies<int64_t>{});
+    for (int64_t i = 0; i < input_dims.size() - 2; ++i) {
+        CHECK_EQ(input_dims[i], other_dims[i]) << "Batch dims must match";
+        CHECK_EQ(input_dims[i], grad_output_dims[i]) << "Batch dims must match";
+    }
+
+    // For bf16 compute, output in fp32 to preserve accumulation precision (matches PyTorch behavior)
+    auto output_dtype = (compute_dtype == DataType::kBFLOAT16) ? DataType::kFLOAT32 : compute_dtype;
+    auto grad_input = std::make_shared<Tensor>(input_dims, output_dtype, grad_output->GetDevice());
+    auto grad_other = std::make_shared<Tensor>(other_dims, output_dtype, grad_output->GetDevice());
+
+    // No Fill(0) needed: cuBLAS beta=0.0f means C is fully overwritten, never read.
+
+    auto device = input_promoted->GetDevice();
+    const float alpha = 1.0f, beta = 0.0f;
+    mcblasHandle_t handle = dynamic_cast<infini_train::core::maca::MacaBlasHandle *>(
+                                infini_train::core::GetDeviceGuardImpl(device.type())->GetBlasHandle(device))
+                                ->mcblas_handle();
+
+    {
+        // cuBLAS is colmun-major
+        // grad_input = grad_output * other.T --> grad_input.T = other * grad_output.T
+        // C = A.T * B ==> grad_input.T[*, k, m] = other[*, k, n] * grad_output.T[*, n, m]
+        // C = grad_input.T[*, k, m]
+        // A = other.T[*, n, k]
+        // B = grad_output.T[*, n, m]
+        const int lda = n, ldb = n, ldc = k;
+        const int64_t stride_a = k * n;
+        const int64_t stride_b = n * m;
+        const int64_t stride_c = m * k;
+        switch (compute_dtype) {
+            DISPATCH_CASE(WRAP(MCBLAS_CHECK(mcblasGemmStridedBatchedEx(
+                              handle, MCBLAS_OP_T, MCBLAS_OP_N, k, m, n, &alpha, other_promoted->DataPtr(), MACA_R_32F,
+                              lda, stride_a, grad_output_promoted->DataPtr(), MACA_R_32F, ldb, stride_b, &beta,
+                              grad_input->DataPtr(), MACA_R_32F, ldc, stride_c, bs, MACA_R_32F, MCBLAS_GEMM_DEFAULT));),
+                          DataType::kFLOAT32)
+            DISPATCH_CASE(WRAP(MCBLAS_CHECK(mcblasGemmStridedBatchedEx(
+                              handle, MCBLAS_OP_T, MCBLAS_OP_N, k, m, n, &alpha, other_promoted->DataPtr(), MACA_R_16BF,
+                              lda, stride_a, grad_output_promoted->DataPtr(), MACA_R_16BF, ldb, stride_b, &beta,
+                              grad_input->DataPtr(), MACA_R_32F, ldc, stride_c, bs, MACA_R_32F, MCBLAS_GEMM_DEFAULT));),
+                          DataType::kBFLOAT16)
+        }
+    }
+
+    {
+        // cuBLAS is colmun-major
+        // grad_other = input.T * grad_output --> grad_other.T =  grad_output.T * input
+        // C = A * B.T ==> grad_other.T[*, n, k] = grad_output.T[*, n, m] * input[*, m, k]
+        // C = grad_other.T[*, n, k]
+        // A = grad_output.T[*, n, m]
+        // B = input.T[*, k, m]
+        const int lda = n, ldb = k, ldc = n;
+        const int64_t stride_a = n * m;
+        const int64_t stride_b = k * m;
+        const int64_t stride_c = n * k;
+        switch (compute_dtype) {
+            DISPATCH_CASE(WRAP(MCBLAS_CHECK(mcblasGemmStridedBatchedEx(
+                              handle, MCBLAS_OP_N, MCBLAS_OP_T, n, k, m, &alpha, grad_output_promoted->DataPtr(),
+                              MACA_R_32F, lda, stride_a, input_promoted->DataPtr(), MACA_R_32F, ldb, stride_b, &beta,
+                              grad_other->DataPtr(), MACA_R_32F, ldc, stride_c, bs, MACA_R_32F, MCBLAS_GEMM_DEFAULT));),
+                          DataType::kFLOAT32)
+            DISPATCH_CASE(WRAP(MCBLAS_CHECK(mcblasGemmStridedBatchedEx(
+                              handle, MCBLAS_OP_N, MCBLAS_OP_T, n, k, m, &alpha, grad_output_promoted->DataPtr(),
+                              MACA_R_16BF, lda, stride_a, input_promoted->DataPtr(), MACA_R_16BF, ldb, stride_b, &beta,
+                              grad_other->DataPtr(), MACA_R_32F, ldc, stride_c, bs, MACA_R_32F, MCBLAS_GEMM_DEFAULT));),
+                          DataType::kBFLOAT16)
+        }
+    }
+
+    return {grad_input, grad_other};
+}
+
+template <typename T> __global__ void BiasCopyKernel(T *output, const T *bias, int bs, int out_features) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= bs * out_features) {
+        return;
+    }
+    int j = idx % out_features;
+    output[idx] = bias[j];
+}
+
+std::shared_ptr<Tensor> LinearForward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &weight,
+                                      bool transpose, const std::shared_ptr<Tensor> &bias) {
+
+    /*
+        !transpose: output = input * weight + bias
+        output[*, out_features] = input[*, in_features] * weight[in_features, out_features] + bias[out_features]
+
+        transpose:  output = input * weight^T + bias
+        output[*, out_features] = input[*, in_features] * weight[out_features, in_features]^T + bias[out_features]
+    */
+
+    const auto &input_dims = input->Dims();
+    CHECK_GE(input_dims.size(), 2);
+    const int64_t bs = std::accumulate(input_dims.rbegin() + 1, input_dims.rend(), 1, std::multiplies<int64_t>{});
+    const int64_t in_features = *input_dims.rbegin();
+
+    const auto &weight_dims = weight->Dims();
+    CHECK_EQ(weight_dims.size(), 2);
+    CHECK_EQ(in_features, weight_dims[transpose ? 1 : 0]);
+
+    // As for cublas:
+    // C = alpha * op(B) * op(A) + beta * C
+    // Dimensions:
+    //   input:  (bs, in_features)
+    //   weight: (in_features, out_features) or (out_features, in_features) if transposed
+    //   output: (bs, out_features)
+    const int64_t out_features = weight_dims[transpose ? 0 : 1];
+
+    auto dtype = input->Dtype();
+    auto output_dims = input_dims;
+    *output_dims.rbegin() = out_features;
+    auto output = std::make_shared<Tensor>(output_dims, dtype, input->GetDevice());
+
+    auto device = input->GetDevice();
+    const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                                  infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                  ->maca_stream();
+
+    if (bias) {
+        CHECK_EQ(bias->Dims().size(), 1);
+        CHECK_EQ(bias->Dims()[0], out_features);
+        int threads_per_block = 256;
+        int num_blocks = (bs * out_features + threads_per_block - 1) / threads_per_block;
+
+        DispatchFunc<DataType::kFLOAT32, DataType::kBFLOAT16>(
+            dtype,
+            [=]<typename T>() {
+                BiasCopyKernel<<<num_blocks, threads_per_block, 0, maca_stream>>>(
+                    static_cast<T *>(output->DataPtr()), static_cast<const T *>(bias->DataPtr()), bs, out_features);
+            },
+            "MACA LinearForward");
+    } else {
+        DispatchFunc<DataType::kFLOAT32, DataType::kBFLOAT16>(
+            input->Dtype(), [=]<typename T>() { output->Fill<T>(0); }, "MACA LinearForward");
+    }
+
+    const float alpha = 1.0f;
+    const float beta = 1.0f;
+    auto trans_a = transpose ? MCBLAS_OP_T : MCBLAS_OP_N;
+    auto trans_b = MCBLAS_OP_N;
+    auto lda = transpose ? in_features : out_features;
+    mcblasHandle_t handle = dynamic_cast<infini_train::core::maca::MacaBlasHandle *>(
+                                infini_train::core::GetDeviceGuardImpl(device.type())->GetBlasHandle(device))
+                                ->mcblas_handle();
+
+    // TODO(zbl): use mcblasSgemv if possible for convenience and simplicity
+    //
+    // - if a is transposed:
+    // weight is [out_features, in_features] here
+    // output = input * weight.T --> output.T = weight * input.T
+    // C = output.T[out_features, bs]
+    // A = weight.T[in_features, out_features]
+    // B = input.T[in_features, bs]
+    //
+    // - if a is not transposed:
+    // output = input * weight --> output.T =  weight.T * input.T
+    // C = output.T[out_features, bs]
+    // A = weight.T[out_features, in_features]
+    // B = input.T[in_features, bs]
+    switch (input->Dtype()) {
+        DISPATCH_CASE(WRAP({
+                          MCBLAS_CHECK(mcblasSgemm(handle, trans_a, trans_b, out_features, bs, in_features, &alpha,
+                                                   static_cast<const float *>(weight->DataPtr()), lda,
+                                                   static_cast<const float *>(input->DataPtr()), in_features, &beta,
+                                                   static_cast<float *>(output->DataPtr()), out_features));
+                      }),
+                      DataType::kFLOAT32)
+        DISPATCH_CASE(WRAP({
+                          MCBLAS_CHECK(mcblasGemmEx(handle, trans_a, trans_b, out_features, bs, in_features, &alpha,
+                                                    weight->DataPtr(), MACA_R_16BF, lda, input->DataPtr(), MACA_R_16BF,
+                                                    in_features, &beta, output->DataPtr(), MACA_R_16BF, out_features,
+                                                    MACA_R_32F, MCBLAS_GEMM_DEFAULT));
+                      }),
+                      DataType::kBFLOAT16)
+    }
+
+    return output;
+}
+
+template <int BLOCK_SIZE, typename TIn, typename TOut>
+__global__ void ReduceColumnsKernel(const TIn *__restrict__ input, TOut *__restrict__ output, int num_rows,
+                                    int num_cols) {
+    using BlockReduce = cub::BlockReduce<float, BLOCK_SIZE>;
+    __shared__ typename BlockReduce::TempStorage temp_storage;
+
+    int row = blockIdx.x;
+    float sum = 0.0f;
+
+    for (int col = threadIdx.x; col < num_cols; col += blockDim.x) {
+        sum += common::maca::Cast<float>(input[row * num_cols + col]);
+    }
+
+    float reduced = BlockReduce(temp_storage).Sum(sum);
+
+    if (threadIdx.x == 0) {
+        output[row] = reduced;
+    }
+}
+
+std::tuple<std::shared_ptr<Tensor>, std::shared_ptr<Tensor>, std::shared_ptr<Tensor>>
+LinearBackward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &weight, bool transpose,
+               int64_t in_features, int64_t out_features, const std::vector<int64_t> &input_dims,
+               const std::shared_ptr<Tensor> &grad_output, bool bias,
+               infini_train::autograd::LinearGradFlags grad_flags) {
+    const auto compute_grad_input = grad_flags.input;
+    const auto compute_grad_weight = grad_flags.weight;
+    const auto compute_grad_bias = grad_flags.bias;
+
+    CHECK_GE(input_dims.size(), 2);
+    const int64_t bs = std::accumulate(input_dims.rbegin() + 1, input_dims.rend(), 1, std::multiplies<int64_t>{});
+
+    const std::vector<int64_t> weight_dims
+        = transpose ? std::vector<int64_t>{out_features, in_features} : std::vector<int64_t>{in_features, out_features};
+
+    auto dtype = grad_output->Dtype();
+
+    // For type promotion, use available tensors
+    DataType input_dtype = input ? input->Dtype() : (weight ? weight->Dtype() : dtype);
+    DataType weight_dtype = weight ? weight->Dtype() : (input ? input->Dtype() : dtype);
+    // Compute dtype determined by saved tensors (forward compute dtype), not grad_output
+    DataType compute_dtype = DispatchFunc<DataTypeList<INFINI_ALL_TYPES>, DataTypeList<INFINI_ALL_TYPES>>(
+        {input_dtype, weight_dtype}, [=]<typename Tin, typename Tw>() { return DataTypeMap_v<WidestType_t<Tin, Tw>>; },
+        "MACA LinearBackward");
+
+    auto grad_output_promoted
+        = dtype == compute_dtype ? grad_output : std::make_shared<Tensor>(grad_output->To(compute_dtype));
+
+    // For bf16 compute, accumulate in fp32 to preserve precision (matches PyTorch behavior).
+    auto output_dtype = (compute_dtype == DataType::kBFLOAT16) ? DataType::kFLOAT32 : compute_dtype;
+
+    // Allocate only needed gradient tensors (selective save: input/weight may be nullptr).
+    std::shared_ptr<Tensor> grad_input = nullptr;
+    std::shared_ptr<Tensor> grad_weight = nullptr;
+    std::shared_ptr<Tensor> grad_bias = nullptr;
+
+    if (compute_grad_input) {
+        grad_input = std::make_shared<Tensor>(input_dims, output_dtype, grad_output->GetDevice());
+    }
+    if (compute_grad_weight) {
+        grad_weight = std::make_shared<Tensor>(weight_dims, output_dtype, grad_output->GetDevice());
+    }
+    // No Fill(0) needed: cuBLAS beta=0.0f fully overwrites output, and ReduceColumnsKernel assigns directly.
+    if (compute_grad_bias && bias) {
+        grad_bias
+            = std::make_shared<Tensor>(std::vector<int64_t>{out_features}, output_dtype, grad_output->GetDevice());
+    }
+
+    auto device = grad_output->GetDevice();
+    const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                                  infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                  ->maca_stream();
+
+    float alpha = 1.0f;
+    float beta = 0.0f;
+
+    mcblasHandle_t handle = dynamic_cast<infini_train::core::maca::MacaBlasHandle *>(
+                                infini_train::core::GetDeviceGuardImpl(device.type())->GetBlasHandle(device))
+                                ->mcblas_handle();
+
+    switch (compute_dtype) {
+        // TODO(zbl): use mcblasSgemv if possible
+        DISPATCH_CASE(
+            WRAP({
+                if (compute_grad_input) {
+                    // - if transpose:
+                    // weight is [out_features, in_features] here
+                    // d_input = d_output * weight --> d_input.T = weight.T * d_output.T
+                    // C = d_input.T[in_features, bs]
+                    // A = weight.T[in_features, out_features]
+                    // B = d_output.T[out_features, bs]
+                    //
+                    // - if not transpose:
+                    // weight is [in_features, out_features] here
+                    // d_input = d_output * weight.T --> d_input.T = weight * d_output.T
+                    // C = d_input.T[in_features, bs]
+                    // A = weight.T[out_features, in_features]
+                    // B = d_output.T[out_features, bs]
+                    CHECK(weight != nullptr)
+                        << "compute_grad_input=true but weight is nullptr (selective save mismatch)";
+                    auto weight_promoted
+                        = weight_dtype == compute_dtype ? weight : std::make_shared<Tensor>(weight->To(compute_dtype));
+                    auto trans_a1 = transpose ? MCBLAS_OP_N : MCBLAS_OP_T;
+                    auto lda1 = transpose ? in_features : out_features;
+                    MCBLAS_CHECK(mcblasSgemm(handle, trans_a1, MCBLAS_OP_N, in_features, bs, out_features, &alpha,
+                                             static_cast<const float *>(weight_promoted->DataPtr()), lda1,
+                                             static_cast<const float *>(grad_output_promoted->DataPtr()), out_features,
+                                             &beta, static_cast<float *>(grad_input->DataPtr()), in_features));
+                }
+                if (compute_grad_weight) {
+                    // - if transpose:
+                    // d_weight = d_output.T * input --> d_weight.T = input.T * d_output
+                    // C = d_weight.T[in_features, out_features]
+                    // A = input.T[in_features, bs]
+                    // B = d_output.T[out_features, bs]
+                    //
+                    // - if not transpose:
+                    // d_weight = input.T * d_output --> d_weight.T = d_output.T * input
+                    // C = d_weight.T[out_features, in_features]
+                    // A = d_output.T[out_features, bs]
+                    // B = input.T[in_features, bs]
+                    CHECK(input != nullptr)
+                        << "compute_grad_weight=true but input is nullptr (selective save mismatch)";
+                    auto input_promoted
+                        = input_dtype == compute_dtype ? input : std::make_shared<Tensor>(input->To(compute_dtype));
+                    auto trans_a2 = MCBLAS_OP_N;
+                    auto trans_b2 = MCBLAS_OP_T;
+                    int m2 = transpose ? in_features : out_features;
+                    int n2 = transpose ? out_features : in_features;
+                    const void *a2 = transpose ? input_promoted->DataPtr() : grad_output_promoted->DataPtr();
+                    const void *b2 = transpose ? grad_output_promoted->DataPtr() : input_promoted->DataPtr();
+                    auto lda2 = transpose ? in_features : out_features;
+                    auto ldb2 = transpose ? out_features : in_features;
+                    auto ldc2 = transpose ? in_features : out_features;
+                    MCBLAS_CHECK(mcblasSgemm(handle, trans_a2, trans_b2, m2, n2, bs, &alpha,
+                                             static_cast<const float *>(a2), lda2, static_cast<const float *>(b2), ldb2,
+                                             &beta, static_cast<float *>(grad_weight->DataPtr()), ldc2));
+                }
+                // d_bias = \sum_i(i=0, bs-1) d_output[i]
+                // TODO(dcj): use thrust::fill or reduce kernel do this
+                if (compute_grad_bias && bias) {
+                    constexpr int BLOCK_SIZE = 256;
+                    int threads_per_block = BLOCK_SIZE;
+                    int num_blocks = out_features;
+                    ReduceColumnsKernel<BLOCK_SIZE><<<num_blocks, threads_per_block, 0, maca_stream>>>(
+                        static_cast<const float *>(grad_output_promoted->DataPtr()),
+                        static_cast<float *>(grad_bias->DataPtr()), out_features, bs);
+                }
+            }),
+            DataType::kFLOAT32)
+        DISPATCH_CASE(WRAP({
+                          if (compute_grad_input) {
+                              CHECK(weight != nullptr)
+                                  << "compute_grad_input=true but weight is nullptr (selective save mismatch)";
+                              auto weight_promoted = weight_dtype == compute_dtype
+                                                       ? weight
+                                                       : std::make_shared<Tensor>(weight->To(compute_dtype));
+                              auto trans_a1 = transpose ? MCBLAS_OP_N : MCBLAS_OP_T;
+                              auto lda1 = transpose ? in_features : out_features;
+                              MCBLAS_CHECK(mcblasGemmEx(handle, trans_a1, MCBLAS_OP_N, in_features, bs, out_features,
+                                                        &alpha, weight_promoted->DataPtr(), MACA_R_16BF, lda1,
+                                                        grad_output_promoted->DataPtr(), MACA_R_16BF, out_features,
+                                                        &beta, grad_input->DataPtr(), MACA_R_32F, in_features,
+                                                        MACA_R_32F, MCBLAS_GEMM_DEFAULT));
+                          }
+                          if (compute_grad_weight) {
+                              CHECK(input != nullptr)
+                                  << "compute_grad_weight=true but input is nullptr (selective save mismatch)";
+                              auto input_promoted = input_dtype == compute_dtype
+                                                      ? input
+                                                      : std::make_shared<Tensor>(input->To(compute_dtype));
+                              auto trans_a2 = MCBLAS_OP_N;
+                              auto trans_b2 = MCBLAS_OP_T;
+                              int m2 = transpose ? in_features : out_features;
+                              int n2 = transpose ? out_features : in_features;
+                              const void *a2 = transpose ? input_promoted->DataPtr() : grad_output_promoted->DataPtr();
+                              const void *b2 = transpose ? grad_output_promoted->DataPtr() : input_promoted->DataPtr();
+                              auto lda2 = transpose ? in_features : out_features;
+                              auto ldb2 = transpose ? out_features : in_features;
+                              auto ldc2 = transpose ? in_features : out_features;
+                              MCBLAS_CHECK(mcblasGemmEx(handle, trans_a2, trans_b2, m2, n2, bs, &alpha, a2, MACA_R_16BF,
+                                                        lda2, b2, MACA_R_16BF, ldb2, &beta, grad_weight->DataPtr(),
+                                                        MACA_R_32F, ldc2, MACA_R_32F, MCBLAS_GEMM_DEFAULT));
+                          }
+                          if (compute_grad_bias && bias) {
+                              constexpr int BLOCK_SIZE = 256;
+                              int threads_per_block = BLOCK_SIZE;
+                              int num_blocks = out_features;
+                              ReduceColumnsKernel<BLOCK_SIZE><<<num_blocks, threads_per_block, 0, maca_stream>>>(
+                                  static_cast<const __maca_bfloat16 *>(grad_output_promoted->DataPtr()),
+                                  static_cast<float *>(grad_bias->DataPtr()), out_features, bs);
+                          }
+                      }),
+                      DataType::kBFLOAT16)
+    }
+
+    return {grad_input, grad_weight, grad_bias};
+}
+} // namespace infini_train::kernels::maca
+
+#define REGISTER_MACA_LINEAR_KERNEL(kernel_name)                                                                       \
+    REGISTER_KERNEL(infini_train::Device::DeviceType::kMACA, kernel_name, infini_train::kernels::maca::kernel_name)
+
+REGISTER_MACA_LINEAR_KERNEL(MatmulForward)
+REGISTER_MACA_LINEAR_KERNEL(MatmulBackward)
+REGISTER_MACA_LINEAR_KERNEL(LinearForward)
+REGISTER_MACA_LINEAR_KERNEL(LinearBackward)
+
+#undef REGISTER_MACA_LINEAR_KERNEL
diff --git a/infini_train/src/kernels/maca/no_op.maca b/infini_train/src/kernels/maca/no_op.maca
new file mode 100644
index 00000000..d06f010d
--- /dev/null
+++ b/infini_train/src/kernels/maca/no_op.maca
@@ -0,0 +1,30 @@
+#include "glog/logging.h"
+
+#include "infini_train/include/dispatcher.h"
+#include "infini_train/include/tensor.h"
+
+namespace infini_train::kernels::maca {
+std::shared_ptr<Tensor> NoOpForward(const std::shared_ptr<Tensor> &input, const std::vector<int64_t> &dims) {
+    const int64_t num_elements = std::accumulate(dims.begin(), dims.end(), 1, std::multiplies<int64_t>());
+    CHECK_EQ(input->NumElements(), num_elements);
+
+    auto output = std::make_shared<Tensor>(*input, 0, dims);
+    return output;
+}
+
+std::shared_ptr<Tensor> NoOpBackward(const std::vector<int64_t> &dims, const std::shared_ptr<Tensor> &grad_output) {
+    auto num_elements = std::accumulate(dims.begin(), dims.end(), 1, std::multiplies<int64_t>());
+    CHECK_EQ(num_elements, grad_output->NumElements());
+
+    auto grad_input = std::make_shared<Tensor>(*grad_output, 0, dims);
+    return grad_input;
+}
+} // namespace infini_train::kernels::maca
+
+#define REGISTER_MACA_NO_OP_KERNEL(kernel_name)                                                                        \
+    REGISTER_KERNEL(infini_train::Device::DeviceType::kMACA, kernel_name, infini_train::kernels::maca::kernel_name)
+
+REGISTER_MACA_NO_OP_KERNEL(NoOpForward)
+REGISTER_MACA_NO_OP_KERNEL(NoOpBackward)
+
+#undef REGISTER_MACA_NO_OP_KERNEL

From 821e103de088e18c03579f9e1e69796f1407e489 Mon Sep 17 00:00:00 2001
From: kilinchange <kilinchange@163.com>
Date: Fri, 10 Apr 2026 01:57:16 +0000
Subject: [PATCH 02/12] feat(maca): add MCCL collective backend and remaining
 kernels

Complete the MACA backend by adding the MCCL-based collective
implementation and the rest of the kernel library, enabling multi-card
training (DDP) and larger models such as gpt2.

- core/ccl/maca: McclComm / McclUniqueId wrappers around mcclComm_t /
  mcclUniqueId, with Size/Data/Load tied to sizeof(mcclUniqueId) so that
  the existing backend-agnostic WriteUniqueIdFile / ReadUniqueIdFile
  unique-id exchange path works unchanged. McclImpl mirrors NcclImpl
  with kMcclDtypeMap / kMcclReduceOpMap and routes every collective
  through mcStream_t via dynamic_cast<MacaStream *>. Registered via
  INFINI_TRAIN_REGISTER_CCL_IMPL(kMACA, McclImpl), so ProcessGroup
  backed by Device::DeviceType::kMACA transparently picks up MCCL
  without any ProcessGroupMCCL subclass.
- kernels/maca: mechanically port the remaining 15 kernels (cast,
  comm, concat, cross_entropy, embedding, gather, layernorm, outer,
  reduction, slice, softmax, split, stack, transform,
  vocab_parallel_cross_entropy) from their .cu counterparts, including
  the cub_compat path for cross_entropy/softmax/reduction, mcblas
  GEMM / GemmEx calls in outer, and __maca_bfloat16 / __half typing
  throughout.
---
 .../include/common/maca/common_maca.h         |   2 +-
 .../include/common/maca/kernel_helper.cuh     |   6 +-
 infini_train/src/core/ccl/maca/mccl_common.cc |  35 ++
 infini_train/src/core/ccl/maca/mccl_common.h  |  37 ++
 infini_train/src/core/ccl/maca/mccl_impl.cc   | 160 +++++
 infini_train/src/core/ccl/maca/mccl_impl.h    |  51 ++
 infini_train/src/kernels/maca/cast.maca       |  56 ++
 infini_train/src/kernels/maca/comm.maca       |  81 +++
 infini_train/src/kernels/maca/concat.maca     | 247 ++++++++
 .../src/kernels/maca/cross_entropy.maca       | 225 +++++++
 infini_train/src/kernels/maca/embedding.maca  | 124 ++++
 infini_train/src/kernels/maca/gather.maca     | 232 +++++++
 infini_train/src/kernels/maca/layernorm.maca  | 207 ++++++
 infini_train/src/kernels/maca/outer.maca      | 168 +++++
 infini_train/src/kernels/maca/reduction.maca  | 243 +++++++
 infini_train/src/kernels/maca/slice.maca      | 210 +++++++
 infini_train/src/kernels/maca/softmax.maca    | 225 +++++++
 infini_train/src/kernels/maca/split.maca      | 181 ++++++
 infini_train/src/kernels/maca/stack.maca      | 160 +++++
 infini_train/src/kernels/maca/transform.maca  | 592 ++++++++++++++++++
 .../maca/vocab_parallel_cross_entropy.maca    | 125 ++++
 21 files changed, 3364 insertions(+), 3 deletions(-)
 create mode 100644 infini_train/src/core/ccl/maca/mccl_common.cc
 create mode 100644 infini_train/src/core/ccl/maca/mccl_common.h
 create mode 100644 infini_train/src/core/ccl/maca/mccl_impl.cc
 create mode 100644 infini_train/src/core/ccl/maca/mccl_impl.h
 create mode 100644 infini_train/src/kernels/maca/cast.maca
 create mode 100644 infini_train/src/kernels/maca/comm.maca
 create mode 100644 infini_train/src/kernels/maca/concat.maca
 create mode 100644 infini_train/src/kernels/maca/cross_entropy.maca
 create mode 100644 infini_train/src/kernels/maca/embedding.maca
 create mode 100644 infini_train/src/kernels/maca/gather.maca
 create mode 100644 infini_train/src/kernels/maca/layernorm.maca
 create mode 100644 infini_train/src/kernels/maca/outer.maca
 create mode 100644 infini_train/src/kernels/maca/reduction.maca
 create mode 100644 infini_train/src/kernels/maca/slice.maca
 create mode 100644 infini_train/src/kernels/maca/softmax.maca
 create mode 100644 infini_train/src/kernels/maca/split.maca
 create mode 100644 infini_train/src/kernels/maca/stack.maca
 create mode 100644 infini_train/src/kernels/maca/transform.maca
 create mode 100644 infini_train/src/kernels/maca/vocab_parallel_cross_entropy.maca

diff --git a/infini_train/include/common/maca/common_maca.h b/infini_train/include/common/maca/common_maca.h
index d4a4fb39..631a89f6 100644
--- a/infini_train/include/common/maca/common_maca.h
+++ b/infini_train/include/common/maca/common_maca.h
@@ -1,8 +1,8 @@
 #pragma once
 
+#include <mcblas/mcblas.h>
 #include <mcr/mc_runtime.h>
 #include <mcr/mc_runtime_api.h>
-#include <mcblas/mcblas.h>
 
 #ifdef USE_MCCL
 #include <mccl.h>
diff --git a/infini_train/include/common/maca/kernel_helper.cuh b/infini_train/include/common/maca/kernel_helper.cuh
index 85a7cfeb..9e7837a5 100644
--- a/infini_train/include/common/maca/kernel_helper.cuh
+++ b/infini_train/include/common/maca/kernel_helper.cuh
@@ -65,9 +65,11 @@ template <typename DST, typename SRC> __host__ __device__ DST Cast(SRC &&x) {
     // Fallback for all other conversions
     if constexpr (std::is_same_v<DST_base, __maca_bfloat16> || std::is_same_v<DST_base, __half>
                   || std::is_same_v<SRC_base, __maca_bfloat16> || std::is_same_v<SRC_base, __half>) {
-        return (DST)(static_cast<float>(std::forward<SRC>(x)));;
+        return (DST)(static_cast<float>(std::forward<SRC>(x)));
+        ;
     } else {
-        return static_cast<DST>(std::forward<SRC>(x));;
+        return static_cast<DST>(std::forward<SRC>(x));
+        ;
     }
 }
 
diff --git a/infini_train/src/core/ccl/maca/mccl_common.cc b/infini_train/src/core/ccl/maca/mccl_common.cc
new file mode 100644
index 00000000..36ab96fb
--- /dev/null
+++ b/infini_train/src/core/ccl/maca/mccl_common.cc
@@ -0,0 +1,35 @@
+#include "infini_train/src/core/ccl/maca/mccl_common.h"
+
+#include <cstring>
+
+#include "glog/logging.h"
+
+namespace infini_train::core {
+
+McclComm::McclComm() = default;
+
+McclComm::McclComm(mcclComm_t comm) : mccl_comm_(comm) {}
+
+mcclComm_t McclComm::mccl_comm() const { return mccl_comm_; }
+
+void McclComm::set_mccl_comm(mcclComm_t comm) { mccl_comm_ = comm; }
+
+McclUniqueId::McclUniqueId() = default;
+
+McclUniqueId::McclUniqueId(const mcclUniqueId &id) : id_(id) {}
+
+size_t McclUniqueId::Size() const { return sizeof(id_); }
+
+const void *McclUniqueId::Data() const { return &id_; }
+
+void McclUniqueId::Load(const void *src, size_t size) {
+    CHECK_NOTNULL(src);
+    CHECK_EQ(size, sizeof(id_));
+    std::memcpy(&id_, src, sizeof(id_));
+}
+
+mcclUniqueId *McclUniqueId::mccl_unique_id() { return &id_; }
+
+const mcclUniqueId *McclUniqueId::mccl_unique_id() const { return &id_; }
+
+} // namespace infini_train::core
diff --git a/infini_train/src/core/ccl/maca/mccl_common.h b/infini_train/src/core/ccl/maca/mccl_common.h
new file mode 100644
index 00000000..4af12130
--- /dev/null
+++ b/infini_train/src/core/ccl/maca/mccl_common.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include <mccl.h>
+
+#include "infini_train/include/core/ccl/ccl_common.h"
+
+namespace infini_train::core {
+
+class McclComm final : public CclComm {
+public:
+    McclComm();
+    explicit McclComm(mcclComm_t comm);
+
+    mcclComm_t mccl_comm() const;
+    void set_mccl_comm(mcclComm_t comm);
+
+private:
+    mcclComm_t mccl_comm_ = nullptr;
+};
+
+class McclUniqueId final : public CclUniqueId {
+public:
+    McclUniqueId();
+    explicit McclUniqueId(const mcclUniqueId &id);
+
+    size_t Size() const override;
+    const void *Data() const override;
+    void Load(const void *src, size_t size) override;
+
+    mcclUniqueId *mccl_unique_id();
+    const mcclUniqueId *mccl_unique_id() const;
+
+private:
+    mcclUniqueId id_;
+};
+
+} // namespace infini_train::core
diff --git a/infini_train/src/core/ccl/maca/mccl_impl.cc b/infini_train/src/core/ccl/maca/mccl_impl.cc
new file mode 100644
index 00000000..3ab66aee
--- /dev/null
+++ b/infini_train/src/core/ccl/maca/mccl_impl.cc
@@ -0,0 +1,160 @@
+#include "infini_train/src/core/ccl/maca/mccl_impl.h"
+
+#include <mccl.h>
+#include <vector>
+
+#include "glog/logging.h"
+
+#include "infini_train/include/common/maca/common_maca.h"
+#include "infini_train/include/core/runtime/runtime_common.h"
+#include "infini_train/include/device.h"
+
+#include "infini_train/src/core/ccl/maca/mccl_common.h"
+#include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
+
+namespace infini_train::core::maca {
+namespace {
+
+inline const std::unordered_map<DataType, mcclDataType_t> kMcclDtypeMap = {
+    {DataType::kUINT8, mcclUint8},       {DataType::kINT8, mcclInt8},     {DataType::kUINT32, mcclUint32},
+    {DataType::kINT32, mcclInt32},       {DataType::kUINT64, mcclUint64}, {DataType::kINT64, mcclInt64},
+    {DataType::kBFLOAT16, mcclBfloat16}, {DataType::kFLOAT16, mcclHalf},  {DataType::kFLOAT32, mcclFloat32},
+    {DataType::kFLOAT64, mcclFloat64},
+};
+
+inline const std::unordered_map<nn::parallel::function::ReduceOpType, mcclRedOp_t> kMcclReduceOpMap = {
+    {nn::parallel::function::ReduceOpType::kSum, mcclSum}, {nn::parallel::function::ReduceOpType::kProd, mcclProd},
+    {nn::parallel::function::ReduceOpType::kMin, mcclMin}, {nn::parallel::function::ReduceOpType::kMax, mcclMax},
+    {nn::parallel::function::ReduceOpType::kAvg, mcclAvg},
+};
+
+inline mcclComm_t GetMcclComm(const CclComm *comm) {
+    auto *mccl_comm = dynamic_cast<const McclComm *>(comm);
+    CHECK_NOTNULL(mccl_comm);
+    return mccl_comm->mccl_comm();
+}
+
+inline void SetMcclComm(CclComm *comm, mcclComm_t mccl_comm) {
+    auto *typed_comm = dynamic_cast<McclComm *>(comm);
+    CHECK_NOTNULL(typed_comm);
+    typed_comm->set_mccl_comm(mccl_comm);
+}
+
+inline const mcclUniqueId &GetMcclUniqueId(const CclUniqueId &unique_id) {
+    auto *mccl_unique_id = dynamic_cast<const McclUniqueId *>(&unique_id);
+    CHECK_NOTNULL(mccl_unique_id);
+    return *mccl_unique_id->mccl_unique_id();
+}
+
+inline mcStream_t GetMacaStream(Stream *stream) {
+    auto *maca_stream = dynamic_cast<MacaStream *>(stream);
+    CHECK_NOTNULL(maca_stream);
+    return maca_stream->maca_stream();
+}
+
+} // namespace
+
+Device::DeviceType McclImpl::Type() const { return Device::DeviceType::kMACA; }
+
+void McclImpl::GroupStart() const { MCCL_CHECK(mcclGroupStart()); }
+
+void McclImpl::GroupEnd() const { MCCL_CHECK(mcclGroupEnd()); }
+
+void McclImpl::GetAsyncError(const CclComm *comm, CclStatus *async_error) const {
+    mcclResult_t mccl_async_error = mcclSuccess;
+    MCCL_CHECK(mcclCommGetAsyncError(GetMcclComm(comm), &mccl_async_error));
+    if (async_error != nullptr) {
+        *async_error = (mccl_async_error == mcclSuccess) ? CclStatus::kSuccess : CclStatus::kError;
+    }
+}
+
+void McclImpl::GetUniqueId(CclUniqueId **unique_id) const {
+    CHECK_NOTNULL(unique_id);
+    if (*unique_id == nullptr) {
+        *unique_id = new McclUniqueId();
+    }
+    auto *mccl_unique_id = dynamic_cast<McclUniqueId *>(*unique_id);
+    CHECK_NOTNULL(mccl_unique_id);
+    MCCL_CHECK(mcclGetUniqueId(mccl_unique_id->mccl_unique_id()));
+}
+
+void McclImpl::CommInitAll(CclComm **comms, int ndev, const int *devlist) const {
+    CHECK_NOTNULL(comms);
+    CHECK_GT(ndev, 0);
+    CHECK_NOTNULL(devlist);
+
+    std::vector<mcclComm_t> mccl_comms(static_cast<size_t>(ndev), nullptr);
+    MCCL_CHECK(mcclCommInitAll(mccl_comms.data(), ndev, devlist));
+    for (int i = 0; i < ndev; ++i) {
+        if (comms[i] == nullptr) {
+            comms[i] = new McclComm();
+        }
+        SetMcclComm(comms[i], mccl_comms[static_cast<size_t>(i)]);
+    }
+}
+
+void McclImpl::CommInitRank(CclComm **comm, int nranks, const CclUniqueId &unique_id, int rank) const {
+    CHECK_NOTNULL(comm);
+    CHECK_GT(nranks, 0);
+
+    if (*comm == nullptr) {
+        *comm = new McclComm();
+    }
+
+    mcclComm_t mccl_comm = nullptr;
+    MCCL_CHECK(mcclCommInitRank(&mccl_comm, nranks, GetMcclUniqueId(unique_id), rank));
+    SetMcclComm(*comm, mccl_comm);
+}
+
+void McclImpl::CommDestroy(CclComm *comm) const {
+    if (comm == nullptr) {
+        return;
+    }
+    MCCL_CHECK(mcclCommDestroy(GetMcclComm(comm)));
+    SetMcclComm(comm, nullptr);
+}
+
+void McclImpl::AllReduce(const void *sendbuff, void *recvbuff, size_t count, DataType dtype,
+                         nn::parallel::function::ReduceOpType reduce_op, const CclComm *comm, Stream *stream) const {
+    MCCL_CHECK(mcclAllReduce(sendbuff, recvbuff, count, kMcclDtypeMap.at(dtype), kMcclReduceOpMap.at(reduce_op),
+                             GetMcclComm(comm), GetMacaStream(stream)));
+}
+
+void McclImpl::Broadcast(const void *sendbuff, void *recvbuff, size_t count, DataType dtype, int root,
+                         const CclComm *comm, Stream *stream) const {
+    MCCL_CHECK(mcclBroadcast(sendbuff, recvbuff, count, kMcclDtypeMap.at(dtype), root, GetMcclComm(comm),
+                             GetMacaStream(stream)));
+}
+
+void McclImpl::Reduce(const void *sendbuff, void *recvbuff, size_t count, DataType dtype,
+                      nn::parallel::function::ReduceOpType reduce_op, int root, const CclComm *comm,
+                      Stream *stream) const {
+    MCCL_CHECK(mcclReduce(sendbuff, recvbuff, count, kMcclDtypeMap.at(dtype), kMcclReduceOpMap.at(reduce_op), root,
+                          GetMcclComm(comm), GetMacaStream(stream)));
+}
+
+void McclImpl::AllGather(const void *sendbuff, void *recvbuff, size_t count, DataType dtype, const CclComm *comm,
+                         Stream *stream) const {
+    MCCL_CHECK(
+        mcclAllGather(sendbuff, recvbuff, count, kMcclDtypeMap.at(dtype), GetMcclComm(comm), GetMacaStream(stream)));
+}
+
+void McclImpl::ReduceScatter(const void *sendbuff, void *recvbuff, size_t recv_count, DataType dtype,
+                             nn::parallel::function::ReduceOpType reduce_op, const CclComm *comm,
+                             Stream *stream) const {
+    MCCL_CHECK(mcclReduceScatter(sendbuff, recvbuff, recv_count, kMcclDtypeMap.at(dtype),
+                                 kMcclReduceOpMap.at(reduce_op), GetMcclComm(comm), GetMacaStream(stream)));
+}
+
+void McclImpl::Send(const void *buff, size_t count, DataType dtype, int peer, const CclComm *comm,
+                    Stream *stream) const {
+    MCCL_CHECK(mcclSend(buff, count, kMcclDtypeMap.at(dtype), peer, GetMcclComm(comm), GetMacaStream(stream)));
+}
+
+void McclImpl::Recv(void *buff, size_t count, DataType dtype, int peer, const CclComm *comm, Stream *stream) const {
+    MCCL_CHECK(mcclRecv(buff, count, kMcclDtypeMap.at(dtype), peer, GetMcclComm(comm), GetMacaStream(stream)));
+}
+
+INFINI_TRAIN_REGISTER_CCL_IMPL(Device::DeviceType::kMACA, McclImpl)
+
+} // namespace infini_train::core::maca
diff --git a/infini_train/src/core/ccl/maca/mccl_impl.h b/infini_train/src/core/ccl/maca/mccl_impl.h
new file mode 100644
index 00000000..e5fa39e9
--- /dev/null
+++ b/infini_train/src/core/ccl/maca/mccl_impl.h
@@ -0,0 +1,51 @@
+#pragma once
+
+#include <string>
+#include <unordered_map>
+
+#include "infini_train/include/core/ccl/ccl.h"
+
+namespace infini_train::core::maca {
+
+class McclImpl final : public CclImpl {
+public:
+    Device::DeviceType Type() const override;
+
+    void GroupStart() const override;
+
+    void GroupEnd() const override;
+
+    void GetAsyncError(const CclComm *comm, CclStatus *async_error) const override;
+
+    void GetUniqueId(CclUniqueId **unique_id) const override;
+
+    void CommInitAll(CclComm **comms, int ndev, const int *devlist) const override;
+
+    void CommInitRank(CclComm **comm, int nranks, const CclUniqueId &unique_id, int rank) const override;
+
+    void CommDestroy(CclComm *comm) const override;
+
+    void AllReduce(const void *sendbuff, void *recvbuff, size_t count, DataType dtype,
+                   nn::parallel::function::ReduceOpType reduce_op, const CclComm *comm, Stream *stream) const override;
+
+    void Broadcast(const void *sendbuff, void *recvbuff, size_t count, DataType dtype, int root, const CclComm *comm,
+                   Stream *stream) const override;
+
+    void Reduce(const void *sendbuff, void *recvbuff, size_t count, DataType dtype,
+                nn::parallel::function::ReduceOpType reduce_op, int root, const CclComm *comm,
+                Stream *stream) const override;
+
+    void AllGather(const void *sendbuff, void *recvbuff, size_t count, DataType dtype, const CclComm *comm,
+                   Stream *stream) const override;
+
+    void ReduceScatter(const void *sendbuff, void *recvbuff, size_t recv_count, DataType dtype,
+                       nn::parallel::function::ReduceOpType reduce_op, const CclComm *comm,
+                       Stream *stream) const override;
+
+    void Send(const void *buff, size_t count, DataType dtype, int peer, const CclComm *comm,
+              Stream *stream) const override;
+
+    void Recv(void *buff, size_t count, DataType dtype, int peer, const CclComm *comm, Stream *stream) const override;
+};
+
+} // namespace infini_train::core::maca
diff --git a/infini_train/src/kernels/maca/cast.maca b/infini_train/src/kernels/maca/cast.maca
new file mode 100644
index 00000000..2c26a0d8
--- /dev/null
+++ b/infini_train/src/kernels/maca/cast.maca
@@ -0,0 +1,56 @@
+#include <memory>
+
+#include "infini_train/include/common/common.h"
+#include "infini_train/include/common/maca/kernel_helper.cuh"
+#include "infini_train/include/core/runtime/device_guard.h"
+#include "infini_train/include/datatype.h"
+#include "infini_train/include/device.h"
+#include "infini_train/include/dispatcher.h"
+#include "infini_train/include/tensor.h"
+
+#include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
+
+namespace infini_train::kernels::maca {
+
+template <typename Tdst, typename Tsrc>
+__global__ void CastKernel(Tdst *dst, const Tsrc *src, size_t num_elements, size_t offset) {
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset;
+
+    if (idx < num_elements) {
+        dst[idx] = common::maca::Cast<Tdst>(src[idx]);
+    }
+}
+
+std::shared_ptr<Tensor> Cast(std::shared_ptr<Tensor> input, DataType dtype) {
+    auto dst_tensor = std::make_shared<Tensor>(input->Dims(), dtype, input->GetDevice());
+    auto device = input->GetDevice();
+    const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                                  infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                  ->maca_stream();
+
+    const size_t num_elements = input->NumElements();
+    dim3 block_dims(256);
+    dim3 grid_dims(CEIL_DIV(num_elements, block_dims.x));
+    const size_t step = grid_dims.x * block_dims.x;
+
+    DispatchFunc<DataTypeList<INFINI_ALL_TYPES>, DataTypeList<INFINI_ALL_TYPES>>(
+        {dtype, input->Dtype()},
+        [=]<typename Tdst, typename Tsrc>() {
+            auto dst = static_cast<Tdst *>(dst_tensor->DataPtr());
+            auto src = static_cast<const Tsrc *>(input->DataPtr());
+            for (size_t offset = 0; offset < num_elements; offset += step) {
+                CastKernel<<<grid_dims, block_dims, 0, maca_stream>>>(dst, src, num_elements, offset);
+            }
+        },
+        "MACA Cast");
+
+    return {dst_tensor};
+}
+} // namespace infini_train::kernels::maca
+
+#define REGISTER_MACA_CAST_KERNEL(kernel_name)                                                                         \
+    REGISTER_KERNEL(infini_train::Device::DeviceType::kMACA, kernel_name, infini_train::kernels::maca::kernel_name)
+
+REGISTER_MACA_CAST_KERNEL(Cast)
+
+#undef REGISTER_MACA_CAST_KERNEL
diff --git a/infini_train/src/kernels/maca/comm.maca b/infini_train/src/kernels/maca/comm.maca
new file mode 100644
index 00000000..c7fdace9
--- /dev/null
+++ b/infini_train/src/kernels/maca/comm.maca
@@ -0,0 +1,81 @@
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "glog/logging.h"
+
+#include "infini_train/include/device.h"
+#include "infini_train/include/dispatcher.h"
+#include "infini_train/include/nn/functional.h"
+#include "infini_train/include/tensor.h"
+
+namespace infini_train::kernels::maca {
+
+std::vector<std::shared_ptr<Tensor>> Broadcast(const std::vector<std::shared_ptr<Tensor>> &input_tensors,
+                                               const std::vector<Device> &devices) {
+    std::vector<std::shared_ptr<Tensor>> outputs;
+    for (int i = 0; i < devices.size(); ++i) {
+        for (const auto &tensor : input_tensors) {
+            outputs.push_back(std::make_shared<Tensor>(tensor->To(devices[i])));
+        }
+    }
+    return outputs;
+}
+
+std::vector<std::shared_ptr<Tensor>> ReduceAddCoalesced(const std::vector<std::vector<std::shared_ptr<Tensor>>> &grads,
+                                                        Device destination) {
+    std::vector<std::shared_ptr<Tensor>> outputs;
+    auto kernel = Dispatcher::Instance().GetKernel({destination.type(), "AccumulateGrad"});
+    std::vector<std::vector<std::shared_ptr<Tensor>>> to_destination_grads;
+    for (int i = 0; i < grads[0].size(); ++i) {
+        outputs.emplace_back(std::make_shared<Tensor>(grads[0][i]->Dims(), grads[0][i]->Dtype(), destination));
+        outputs[i]->Fill<float>(0.0);
+    }
+    for (int i = 0; i < grads.size(); ++i) {
+        to_destination_grads.push_back(std::vector<std::shared_ptr<Tensor>>());
+        for (int j = 0; j < grads[i].size(); ++j) {
+            to_destination_grads[i].push_back(std::make_shared<Tensor>(grads[i][j]->To(destination)));
+        }
+    }
+    for (int i = 0; i < grads.size(); ++i) {
+        for (int j = 0; j < grads[i].size(); ++j) {
+            kernel.Call<void>(to_destination_grads[i][j], static_cast<float>(1.0), outputs[j]);
+        }
+    }
+    return outputs;
+}
+
+std::vector<std::shared_ptr<Tensor>> Scatter(const std::shared_ptr<Tensor> &tensor, std::vector<Device> devices,
+                                             int64_t dim) {
+    std::vector<std::shared_ptr<Tensor>> outputs;
+    // FIXME(dcj): do split without autograd
+    std::vector<std::shared_ptr<Tensor>> split_tensors = tensor->Split(tensor->Dims()[dim] / devices.size(), dim);
+    for (auto i = 0; i < devices.size(); ++i) {
+        outputs.push_back(std::make_shared<Tensor>(split_tensors[i]->To(devices[i])));
+    }
+    return outputs;
+}
+
+std::shared_ptr<Tensor> Gather(const std::vector<std::shared_ptr<Tensor>> &tensors, Device destination, int64_t dim) {
+    std::vector<std::shared_ptr<Tensor>> outputs;
+    for (const auto &tensor : tensors) { outputs.push_back(std::make_shared<Tensor>(tensor->To(destination))); }
+    auto kernel = Dispatcher::Instance().GetKernel({tensors[0]->GetDevice().type(), "StackForward"});
+    auto gathered_tensor = kernel.Call<std::shared_ptr<Tensor>>(outputs, dim);
+    auto old_dims = gathered_tensor->Dims();
+    std::vector<int64_t> new_dims{old_dims[0] * old_dims[1]};
+    for (int i = 2; i < old_dims.size(); ++i) { new_dims.push_back(old_dims[i]); }
+    auto view_kernel = Dispatcher::Instance().GetKernel({destination.type(), "NoOpForward"});
+    return view_kernel.Call<std::shared_ptr<Tensor>>(gathered_tensor, new_dims);
+}
+} // namespace infini_train::kernels::maca
+
+#define REGISTER_MACA_COMM_KERNEL(kernel_name)                                                                         \
+    REGISTER_KERNEL(infini_train::Device::DeviceType::kMACA, Comm##kernel_name,                                        \
+                    infini_train::kernels::maca::kernel_name)
+
+REGISTER_MACA_COMM_KERNEL(Broadcast)
+REGISTER_MACA_COMM_KERNEL(Scatter)
+REGISTER_MACA_COMM_KERNEL(Gather)
+REGISTER_MACA_COMM_KERNEL(ReduceAddCoalesced)
+
+#undef REGISTER_MACA_COMM_KERNEL
diff --git a/infini_train/src/kernels/maca/concat.maca b/infini_train/src/kernels/maca/concat.maca
new file mode 100644
index 00000000..42807308
--- /dev/null
+++ b/infini_train/src/kernels/maca/concat.maca
@@ -0,0 +1,247 @@
+#include <algorithm>
+#include <memory>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+#include "glog/logging.h"
+
+#include "infini_train/include/common/maca/common_maca.h"
+#include "infini_train/include/core/runtime/device_guard.h"
+#include "infini_train/include/dispatcher.h"
+#include "infini_train/include/tensor.h"
+
+#include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
+
+namespace infini_train::kernels::maca {
+__device__ __forceinline__ int64_t UpperBoundI64(const int64_t *offsets, int64_t n_plus_1, int64_t x) {
+    // Return the largest s so that offsets[s] <= x
+    // offsets[0] = 0, offsets is monotonically increasing
+    // len(offsets) = num_inputs + 1
+    int64_t l = 0, r = n_plus_1; // start search in [0, n+1)
+    while (l < r) {
+        int64_t m = l + ((r - l) >> 1);
+        if (offsets[m] <= x) {
+            l = m + 1;
+        } else {
+            r = m;
+        }
+    }
+    return l - 1;
+}
+
+template <typename T>
+__global__ void ConcatForwardKernel(const T **inputs, T *output, const int64_t *offsets, int64_t N, int64_t D,
+                                    int64_t num_inputs, int64_t K_total) {
+    int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t total = N * K_total * D;
+    if (idx >= total) {
+        return;
+    }
+
+    int64_t d = idx % D;
+    int64_t k = (idx / D) % K_total;
+    int64_t n = idx / (D * K_total);
+
+    // find the largest s so that offsets[s] <= k < offsets[s+1]
+    int64_t s = UpperBoundI64(offsets, num_inputs + 1, k);
+    int64_t k_local = k - offsets[s];
+    int64_t Ki = offsets[s + 1] - offsets[s];
+
+    const T *input = inputs[s];
+    output[idx] = input[n * (Ki * D) + k_local * D + d];
+}
+
+std::shared_ptr<Tensor> ConcatForward(const std::vector<std::shared_ptr<Tensor>> &inputs, int64_t dim) {
+    CHECK(!inputs.empty());
+
+    const auto &base_dims = inputs[0]->Dims();
+    auto dtype = inputs[0]->Dtype();
+    auto device = inputs[0]->GetDevice();
+
+    if (dim < 0) {
+        dim += static_cast<int64_t>(base_dims.size());
+    }
+    CHECK_GE(dim, 0);
+    CHECK_LT(dim, static_cast<int64_t>(base_dims.size()));
+
+    // Check shape requirements and save length along dim
+    std::vector<int64_t> Ks;
+    Ks.reserve(inputs.size());
+    for (const auto &t : inputs) {
+        CHECK(t->Dtype() == dtype);
+        CHECK_EQ(t->Dims().size(), base_dims.size());
+        for (size_t ax = 0; ax < base_dims.size(); ++ax) {
+            if (static_cast<int64_t>(ax) == dim) {
+                continue;
+            }
+            CHECK_EQ(t->Dims()[ax], base_dims[ax]) << "All non-concat dims must match";
+        }
+        Ks.push_back(t->Dims()[dim]);
+    }
+
+    std::vector<int64_t> out_dims = base_dims;
+    out_dims[dim] = std::accumulate(Ks.begin(), Ks.end(), int64_t{0});
+    auto output = std::make_shared<Tensor>(out_dims, dtype, device);
+
+    const int64_t N = std::accumulate(base_dims.begin(), base_dims.begin() + dim, 1LL, std::multiplies<int64_t>());
+    const int64_t D = std::accumulate(base_dims.begin() + dim + 1, base_dims.end(), 1LL, std::multiplies<int64_t>());
+    const int64_t num_inputs = static_cast<int64_t>(inputs.size());
+    const int64_t K_total = out_dims[dim];
+
+    // offsets records the sum of Ks
+    // offsets[i] = sum_{j < i} K_j
+    std::vector<int64_t> host_offsets(num_inputs + 1, 0);
+    for (int64_t i = 0; i < num_inputs; ++i) { host_offsets[i + 1] = host_offsets[i] + Ks[i]; }
+
+    const auto &stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                             infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                             ->maca_stream();
+
+    int64_t total = N * K_total * D;
+    int threads_per_block = 256;
+    int num_blocks = static_cast<int>((total + threads_per_block - 1) / threads_per_block);
+
+    DispatchFunc<INFINI_ALL_TYPES>(
+        dtype,
+        [=, &inputs, &host_offsets]<typename T>() {
+            std::vector<const T *> host_input_ptrs;
+            host_input_ptrs.reserve(inputs.size());
+            for (const auto &t : inputs) { host_input_ptrs.push_back(static_cast<const T *>(t->DataPtr())); }
+
+            const T **device_input_ptrs = nullptr;
+            int64_t *device_offsets = nullptr;
+
+            MACA_CHECK(mcMallocAsync(&device_input_ptrs, sizeof(T *) * num_inputs, stream));
+            MACA_CHECK(mcMemcpyAsync(device_input_ptrs, host_input_ptrs.data(), sizeof(T *) * num_inputs,
+                                       mcMemcpyHostToDevice, stream));
+
+            MACA_CHECK(mcMallocAsync(&device_offsets, sizeof(int64_t) * (num_inputs + 1), stream));
+            MACA_CHECK(mcMemcpyAsync(device_offsets, host_offsets.data(), sizeof(int64_t) * (num_inputs + 1),
+                                       mcMemcpyHostToDevice, stream));
+
+            ConcatForwardKernel<T><<<num_blocks, threads_per_block, 0, stream>>>(
+                device_input_ptrs, static_cast<T *>(output->DataPtr()), device_offsets, N, D, num_inputs, K_total);
+
+            MACA_CHECK(mcFreeAsync(device_input_ptrs, stream));
+            MACA_CHECK(mcFreeAsync(device_offsets, stream));
+        },
+        "MACA ConcatForward");
+
+    return output;
+}
+
+template <typename T>
+__global__ void ConcatBackwardKernel(const T *grad_output, T **grad_inputs, const int64_t *offsets, int64_t N,
+                                     int64_t D, int64_t num_inputs, int64_t K_total) {
+    int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t total = N * K_total * D;
+    if (idx >= total) {
+        return;
+    }
+
+    int64_t d = idx % D;
+    int64_t k = (idx / D) % K_total;
+    int64_t n = idx / (D * K_total);
+
+    int64_t s = UpperBoundI64(offsets, num_inputs + 1, k);
+    int64_t k_local = k - offsets[s];
+    int64_t Ki = offsets[s + 1] - offsets[s];
+
+    T *gi = grad_inputs[s];
+    gi[n * (Ki * D) + k_local * D + d] = grad_output[idx];
+}
+
+std::vector<std::shared_ptr<Tensor>> ConcatBackward(const std::shared_ptr<Tensor> &grad_output,
+                                                    const std::vector<std::vector<int64_t>> &input_dims_list,
+                                                    int64_t dim) {
+    CHECK(!input_dims_list.empty());
+
+    auto dtype = grad_output->Dtype();
+    const auto &output_dims = grad_output->Dims();
+    if (dim < 0) {
+        dim += static_cast<int64_t>(output_dims.size());
+    }
+    CHECK_GE(dim, 0);
+    CHECK_LT(dim, static_cast<int64_t>(output_dims.size()));
+
+    const auto &base_rank = input_dims_list[0].size();
+    std::vector<int64_t> Ks;
+    Ks.reserve(input_dims_list.size());
+    for (const auto &dvec : input_dims_list) {
+        CHECK_EQ(dvec.size(), base_rank);
+        for (size_t ax = 0; ax < dvec.size(); ++ax) {
+            if (static_cast<int64_t>(ax) == dim) {
+                continue;
+            }
+            CHECK_EQ(dvec[ax], input_dims_list[0][ax]);
+        }
+        Ks.push_back(dvec[dim]);
+    }
+
+    auto device = grad_output->GetDevice();
+
+    std::vector<std::shared_ptr<Tensor>> grads;
+    grads.reserve(input_dims_list.size());
+    for (const auto &dvec : input_dims_list) {
+        auto t = std::make_shared<Tensor>(dvec, dtype, device);
+        DispatchFunc<INFINI_ALL_TYPES>(
+            dtype, [=]<typename T>() { t->Fill<T>(0); }, "MACA ConcatBackward");
+        grads.push_back(t);
+    }
+
+    const int64_t N = std::accumulate(input_dims_list[0].begin(), input_dims_list[0].begin() + dim, 1LL,
+                                      std::multiplies<int64_t>());
+    const int64_t D = std::accumulate(input_dims_list[0].begin() + dim + 1, input_dims_list[0].end(), 1LL,
+                                      std::multiplies<int64_t>());
+    const int64_t num_inputs = static_cast<int64_t>(input_dims_list.size());
+    const int64_t K_total = std::accumulate(Ks.begin(), Ks.end(), int64_t{0});
+
+    std::vector<int64_t> host_offsets(num_inputs + 1, 0);
+    for (int64_t i = 0; i < num_inputs; ++i) { host_offsets[i + 1] = host_offsets[i] + Ks[i]; }
+
+    const auto &stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                             infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                             ->maca_stream();
+
+    int64_t total = N * K_total * D;
+    int threads_per_block = 256;
+    int num_blocks = static_cast<int>((total + threads_per_block - 1) / threads_per_block);
+
+    DispatchFunc<INFINI_ALL_TYPES>(
+        dtype,
+        [=, &grads, &host_offsets]<typename T>() {
+            std::vector<T *> host_ptrs;
+            host_ptrs.reserve(grads.size());
+            for (auto &t : grads) { host_ptrs.push_back(static_cast<T *>(t->DataPtr())); }
+
+            T **device_ptrs = nullptr;
+            int64_t *device_offsets = nullptr;
+
+            MACA_CHECK(mcMallocAsync(&device_ptrs, sizeof(T *) * num_inputs, stream));
+            MACA_CHECK(mcMemcpyAsync(device_ptrs, host_ptrs.data(), sizeof(T *) * num_inputs, mcMemcpyHostToDevice,
+                                       stream));
+
+            MACA_CHECK(mcMallocAsync(&device_offsets, sizeof(int64_t) * (num_inputs + 1), stream));
+            MACA_CHECK(mcMemcpyAsync(device_offsets, host_offsets.data(), sizeof(int64_t) * (num_inputs + 1),
+                                       mcMemcpyHostToDevice, stream));
+
+            ConcatBackwardKernel<T><<<num_blocks, threads_per_block, 0, stream>>>(
+                static_cast<const T *>(grad_output->DataPtr()), device_ptrs, device_offsets, N, D, num_inputs, K_total);
+
+            MACA_CHECK(mcFreeAsync(device_ptrs, stream));
+            MACA_CHECK(mcFreeAsync(device_offsets, stream));
+        },
+        "MACA ConcatBackward");
+
+    return grads;
+}
+} // namespace infini_train::kernels::maca
+
+#define REGISTER_MACA_CONCAT_KERNEL(kernel_name)                                                                       \
+    REGISTER_KERNEL(infini_train::Device::DeviceType::kMACA, kernel_name, infini_train::kernels::maca::kernel_name)
+
+REGISTER_MACA_CONCAT_KERNEL(ConcatForward)
+REGISTER_MACA_CONCAT_KERNEL(ConcatBackward)
+
+#undef REGISTER_MACA_CONCAT_KERNEL
diff --git a/infini_train/src/kernels/maca/cross_entropy.maca b/infini_train/src/kernels/maca/cross_entropy.maca
new file mode 100644
index 00000000..6e839ab3
--- /dev/null
+++ b/infini_train/src/kernels/maca/cross_entropy.maca
@@ -0,0 +1,225 @@
+#include <cmath>
+#include <limits>
+#include <numeric>
+
+#include <cub/block/block_reduce.cuh>
+#include <mcr/mc_runtime.h>
+
+#include "infini_train/include/common/maca/common_maca.h"
+#include "infini_train/include/common/maca/cub_compat.cuh"
+#include "infini_train/include/common/maca/kernel_helper.cuh"
+#include "infini_train/include/core/runtime/device_guard.h"
+#include "infini_train/include/dispatcher.h"
+#include "infini_train/include/tensor.h"
+
+#include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
+
+namespace infini_train::kernels::maca {
+namespace {
+constexpr float kNegativeInfinity = -std::numeric_limits<float>::infinity();
+}
+
+template <size_t BLOCK_SIZE, typename TargetType, typename InputType>
+__global__ void CrossEntropyForwardKernel(const InputType *__restrict__ input_ptr,
+                                          const TargetType *__restrict__ target_ptr, InputType *__restrict__ loss_ptr,
+                                          int bs, int num_classes) {
+    __shared__ struct {
+        float max_logit;
+        float sum_exp;
+        TargetType target_class;
+        typename cub::BlockReduce<float, BLOCK_SIZE>::TempStorage reduce;
+    } shared;
+
+    const int sample_idx = blockIdx.x;
+    if (sample_idx >= bs) {
+        return;
+    }
+
+    const int tid = threadIdx.x;
+    const size_t base = sample_idx * num_classes;
+
+    if (tid == 0) {
+        shared.target_class = target_ptr[sample_idx];
+    }
+    __syncthreads();
+
+    // calculate the max
+    float thread_max = kNegativeInfinity;
+    for (int i = tid; i < num_classes; i += BLOCK_SIZE) {
+        thread_max = fmaxf(thread_max, common::maca::Cast<float>(input_ptr[base + i]));
+    }
+    const float block_max = cub::BlockReduce<float, BLOCK_SIZE>(shared.reduce).Reduce(thread_max, CubMaxOp());
+    if (tid == 0) {
+        shared.max_logit = block_max;
+    }
+    __syncthreads();
+
+    // calculate the sum of exponents
+    float thread_sum = 0.0f;
+    for (int i = tid; i < num_classes; i += BLOCK_SIZE) {
+        thread_sum += expf(common::maca::Cast<float>(input_ptr[base + i]) - shared.max_logit);
+    }
+    const float block_sum = cub::BlockReduce<float, BLOCK_SIZE>(shared.reduce).Sum(thread_sum);
+    if (tid == 0) {
+        shared.sum_exp = block_sum;
+    }
+    __syncthreads();
+
+    // calculate the loss
+    if (tid == 0) {
+        const float target_val
+            = common::maca::Cast<float>(input_ptr[base + common::maca::Cast<size_t>(shared.target_class)])
+            - shared.max_logit;
+        loss_ptr[sample_idx] = logf(shared.sum_exp) - target_val;
+    }
+}
+
+std::shared_ptr<Tensor> CrossEntropyForward(const std::shared_ptr<Tensor> &input,
+                                            const std::shared_ptr<Tensor> &target) {
+    const auto &input_dims = input->Dims();
+    CHECK_GE(input_dims.size(), 2);
+    const int bs = std::accumulate(input_dims.rbegin() + 1, input_dims.rend(), 1, std::multiplies<int64_t>{});
+    const int num_classes = *input_dims.rbegin();
+
+    auto batched_output = std::make_shared<Tensor>(std::vector<int64_t>{bs}, input->Dtype(), input->GetDevice());
+
+    constexpr int threads_per_block = 256;
+    int num_blocks = bs;
+
+    auto device = target->GetDevice();
+    const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                                  infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                  ->maca_stream();
+
+    return DispatchFunc<DataTypeList<DataType::kUINT8, DataType::kINT64>, DataTypeList<INFINI_ALL_FLOATING_TYPES>>(
+        {target->Dtype(), input->Dtype()},
+        [=]<typename Ttarget, typename Tinput>() {
+            const Ttarget *target_ptr = static_cast<const Ttarget *>(target->DataPtr());
+            const Tinput *input_ptr = static_cast<const Tinput *>(input->DataPtr());
+            Tinput *batched_loss_ptr = static_cast<Tinput *>(batched_output->DataPtr());
+            // FIXME(dcj): do reduce on GPU
+            CrossEntropyForwardKernel<threads_per_block, Ttarget, Tinput>
+                <<<num_blocks, threads_per_block, 0, maca_stream>>>(input_ptr, target_ptr, batched_loss_ptr, bs,
+                                                                    num_classes);
+
+            auto loss_cpu = batched_output->To(Device());
+            auto loss = std::make_shared<Tensor>(std::vector<int64_t>{}, input->Dtype(), Device());
+            auto loss_cpu_typed_ptr = static_cast<const Tinput *>(loss_cpu.DataPtr());
+            static_cast<Tinput *>(loss->DataPtr())[0]
+                = std::accumulate(loss_cpu_typed_ptr, loss_cpu_typed_ptr + bs, 0.0f,
+                                  [](float acc, const Tinput &val) { return acc + common::maca::Cast<float>(val); })
+                / bs;
+
+            return std::make_shared<Tensor>(loss->To(input->GetDevice()));
+        },
+        "MACA CrossEntropyForward");
+}
+
+template <size_t BLOCK_SIZE, typename TargetType, typename InputType>
+__global__ void CrossEntropyBackwardKernel(const InputType *__restrict__ input_ptr,
+                                           InputType *__restrict__ input_grad_ptr,
+                                           const TargetType *__restrict__ target_ptr,
+                                           const InputType *__restrict__ output_grad_ptr, int bs, int num_classes) {
+    __shared__ struct {
+        float max_logit;
+        float sum_exp;
+        int target_class;
+        typename cub::BlockReduce<float, BLOCK_SIZE>::TempStorage reduce;
+    } shared;
+
+    const int tid = threadIdx.x;
+    const int idx = blockIdx.x;
+
+    if (idx >= bs) {
+        return;
+    }
+
+    const size_t idx_base = idx * num_classes;
+
+    if (tid == 0) {
+        shared.target_class = static_cast<int>(target_ptr[idx]);
+    }
+    __syncthreads();
+
+    // calculate the max
+    float thread_max = kNegativeInfinity;
+    for (int i = tid; i < num_classes; i += BLOCK_SIZE) {
+        thread_max = fmaxf(thread_max, common::maca::Cast<float>(input_ptr[idx_base + i]));
+    }
+    const float block_max = cub::BlockReduce<float, BLOCK_SIZE>(shared.reduce).Reduce(thread_max, CubMaxOp());
+    if (tid == 0) {
+        shared.max_logit = block_max;
+    }
+    __syncthreads();
+
+    // calculate the sum
+    float thread_sum = 0.0f;
+    for (int i = tid; i < num_classes; i += BLOCK_SIZE) {
+        thread_sum += expf(common::maca::Cast<float>(input_ptr[idx_base + i]) - shared.max_logit);
+    }
+
+    const float block_sum = cub::BlockReduce<float, BLOCK_SIZE>(shared.reduce).Sum(thread_sum);
+    if (tid == 0) {
+        shared.sum_exp = block_sum;
+    }
+    __syncthreads();
+
+    // calculate the gradient
+    const float inv_bs = 1.0f / bs;
+    const float scale = 1.0f / shared.sum_exp;
+    const int target = shared.target_class;
+
+    for (int i = tid; i < num_classes; i += BLOCK_SIZE) {
+        const int global_idx = idx_base + i;
+        const float exp_val = expf(common::maca::Cast<float>(input_ptr[global_idx]) - shared.max_logit);
+        input_grad_ptr[global_idx] = common::maca::Cast<InputType>((exp_val * scale - (i == target)) * inv_bs
+                                                                   * common::maca::Cast<float>(output_grad_ptr[0]));
+    }
+}
+
+std::shared_ptr<Tensor> CrossEntropyBackward(const std::shared_ptr<Tensor> &input,
+                                             const std::shared_ptr<Tensor> &target,
+                                             const std::shared_ptr<Tensor> &grad_output) {
+    const auto &input_dims = input->Dims();
+    CHECK_GE(input_dims.size(), 2);
+    const int bs = std::accumulate(input_dims.rbegin() + 1, input_dims.rend(), 1, std::multiplies<int64_t>{});
+    const int num_classes = *input_dims.rbegin();
+
+    auto input_casted = std::make_shared<Tensor>(input->To(grad_output->Dtype()));
+
+    CHECK_EQ(grad_output->Dims().size(), 0);
+    auto grad_input = std::make_shared<Tensor>(input_casted->Dims(), input_casted->Dtype(), grad_output->GetDevice());
+
+    constexpr int threads_per_block = 256;
+    int num_blocks = bs;
+
+    auto device = target->GetDevice();
+    const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                                  infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                  ->maca_stream();
+
+    DispatchFunc<DataTypeList<DataType::kUINT8, DataType::kINT64>, DataTypeList<INFINI_ALL_FLOATING_TYPES>>(
+        {target->Dtype(), input_casted->Dtype()},
+        [=]<typename Ttarget, typename Tinput>() {
+            grad_input->Fill<Tinput>(0);
+            const Tinput *output_grad_ptr = static_cast<const Tinput *>(grad_output->DataPtr());
+            const Ttarget *target_ptr = static_cast<const Ttarget *>(target->DataPtr());
+            const Tinput *input_ptr = static_cast<const Tinput *>(input_casted->DataPtr());
+            Tinput *input_grad_ptr = static_cast<Tinput *>(grad_input->DataPtr());
+            CrossEntropyBackwardKernel<threads_per_block, Ttarget, Tinput>
+                <<<num_blocks, threads_per_block, 0, maca_stream>>>(input_ptr, input_grad_ptr, target_ptr,
+                                                                    output_grad_ptr, bs, num_classes);
+        },
+        "MACA CrossEntropyBackward");
+
+    return {grad_input};
+}
+} // namespace infini_train::kernels::maca
+
+#define REGISTER_MACA_CROSS_ENTROPY_KERNEL(kernel_name)                                                                \
+    REGISTER_KERNEL(infini_train::Device::DeviceType::kMACA, kernel_name, infini_train::kernels::maca::kernel_name)
+
+REGISTER_MACA_CROSS_ENTROPY_KERNEL(CrossEntropyForward)
+REGISTER_MACA_CROSS_ENTROPY_KERNEL(CrossEntropyBackward)
+
+#undef REGISTER_MACA_CROSS_ENTROPY_KERNEL
diff --git a/infini_train/src/kernels/maca/embedding.maca b/infini_train/src/kernels/maca/embedding.maca
new file mode 100644
index 00000000..d0472b72
--- /dev/null
+++ b/infini_train/src/kernels/maca/embedding.maca
@@ -0,0 +1,124 @@
+#include <memory>
+
+#include "infini_train/include/common/maca/common_maca.h"
+#include "infini_train/include/core/runtime/device_guard.h"
+#include "infini_train/include/dispatcher.h"
+#include "infini_train/include/tensor.h"
+
+#include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
+
+namespace infini_train::kernels::maca {
+
+template <typename T>
+__global__ void EmbeddingForwardKernel(const int64_t *input, T *output, const T *weight, int batch_size, int max_seqlen,
+                                       int embed_dim, int vocab_size) {
+    int idx = (blockIdx.x * blockDim.x + threadIdx.x);
+    if (idx >= batch_size * max_seqlen * embed_dim) {
+        return;
+    }
+
+    int bt = idx / embed_dim;
+    int b = bt / max_seqlen;
+    int t = bt % max_seqlen;
+    int c = idx % embed_dim;
+
+    int ix = static_cast<int>(input[b * max_seqlen + t]);
+    if (ix < 0 || ix >= vocab_size) {
+        return;
+    }
+    output[b * max_seqlen * embed_dim + t * embed_dim + c] = weight[ix * embed_dim + c];
+}
+
+std::shared_ptr<Tensor> EmbeddingForward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &weight) {
+    CHECK(input->Dtype() == DataType::kINT64);
+    CHECK_EQ(weight->Dims().size(), 2);
+
+    auto device = input->GetDevice();
+    const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                                  infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                  ->maca_stream();
+
+    const int batch_size = input->Dims().size() == 2 ? input->Dims()[0] : 1;
+    const int max_seqlen = input->Dims().size() == 2 ? input->Dims()[1] : input->Dims()[0];
+    const int vocab_size = weight->Dims()[0];
+    const int embed_dim = weight->Dims()[1];
+    auto output_dims = input->Dims();
+    output_dims.push_back(embed_dim);
+
+    auto dtype = weight->Dtype();
+    auto output = std::make_shared<Tensor>(output_dims, dtype, input->GetDevice());
+    int threads_per_block = 256;
+    int num_blocks = (batch_size * max_seqlen * embed_dim + threads_per_block - 1) / threads_per_block;
+
+    DispatchFunc<INFINI_ALL_FLOATING_TYPES>(
+        dtype,
+        [=]<typename T>() {
+            EmbeddingForwardKernel<<<num_blocks, threads_per_block, 0, maca_stream>>>(
+                static_cast<const int64_t *>(input->DataPtr()), static_cast<T *>(output->DataPtr()),
+                static_cast<const T *>(weight->DataPtr()), batch_size, max_seqlen, embed_dim, vocab_size);
+        },
+        "MACA EmbeddingForward");
+
+    return output;
+}
+
+template <typename T>
+__global__ void EmbeddingBackwardKernel(const int64_t *input_ptr, const T *grad_output_ptr, T *grad_weight_ptr,
+                                        int num_tokens, int embedding_dim, int vocab_size) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= num_tokens) {
+        return;
+    }
+
+    int token_id = static_cast<int>(input_ptr[idx]);
+    if (token_id < 0 || token_id >= vocab_size) {
+        return;
+    }
+
+    for (int j = 0; j < embedding_dim; ++j) {
+        atomicAdd(&grad_weight_ptr[token_id * embedding_dim + j], grad_output_ptr[idx * embedding_dim + j]);
+    }
+}
+
+std::shared_ptr<Tensor> EmbeddingBackward(const std::shared_ptr<Tensor> &input, const std::vector<int64_t> &weight_dims,
+                                          const std::shared_ptr<Tensor> &grad_output) {
+    CHECK(input->Dtype() == DataType::kINT64);
+    CHECK_EQ(weight_dims.size(), 2);
+    auto device = input->GetDevice();
+    const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                                  infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                  ->maca_stream();
+
+    const int vocab_size = weight_dims[0];
+    const int embedding_dim = weight_dims[1];
+    CHECK_EQ(input->Dims().size() + 1, grad_output->Dims().size());
+    for (int idx = 0; idx < input->Dims().size(); ++idx) { CHECK_EQ(input->Dims()[idx], grad_output->Dims()[idx]); }
+    CHECK_EQ(*grad_output->Dims().rbegin(), embedding_dim);
+
+    auto dtype = grad_output->Dtype();
+    auto grad_weight = std::make_shared<Tensor>(weight_dims, dtype, grad_output->GetDevice());
+    const int num_tokens = input->NumElements();
+    const int threads_per_block = 256;
+    const int num_blocks = (num_tokens + threads_per_block - 1) / threads_per_block;
+
+    DispatchFunc<INFINI_ALL_FLOATING_TYPES>(
+        dtype,
+        [=]<typename T>() {
+            grad_weight->Fill<T>(0);
+            EmbeddingBackwardKernel<<<num_blocks, threads_per_block, 0, maca_stream>>>(
+                static_cast<const int64_t *>(input->DataPtr()), static_cast<const T *>(grad_output->DataPtr()),
+                static_cast<T *>(grad_weight->DataPtr()), num_tokens, embedding_dim, vocab_size);
+        },
+        "MACA EmbeddingBackward");
+
+    return grad_weight;
+}
+} // namespace infini_train::kernels::maca
+
+#define REGISTER_MACA_EMBEDDING_KERNEL(kernel_name)                                                                    \
+    REGISTER_KERNEL(infini_train::Device::DeviceType::kMACA, kernel_name, infini_train::kernels::maca::kernel_name)
+
+REGISTER_MACA_EMBEDDING_KERNEL(EmbeddingForward)
+REGISTER_MACA_EMBEDDING_KERNEL(EmbeddingBackward)
+
+#undef REGISTER_MACA_EMBEDDING_KERNEL
diff --git a/infini_train/src/kernels/maca/gather.maca b/infini_train/src/kernels/maca/gather.maca
new file mode 100644
index 00000000..a7a6d04b
--- /dev/null
+++ b/infini_train/src/kernels/maca/gather.maca
@@ -0,0 +1,232 @@
+#include "glog/logging.h"
+
+#include "infini_train/include/common/maca/common_maca.h"
+#include "infini_train/include/core/runtime/device_guard.h"
+#include "infini_train/include/dispatcher.h"
+#include "infini_train/include/tensor.h"
+
+#include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
+
+namespace infini_train::kernels::maca {
+// FIXME(zbl): This kernel aligns with torch.gather
+//             Currently named IndexGather to avoid conflict with communication operators
+//             Should be renamed to Gather later for interface consistency
+template <typename T>
+__global__ void IndexGatherForwardKernel(const T *__restrict__ input, const int64_t *__restrict__ norm_index,
+                                         T *__restrict__ output, const int64_t *__restrict__ out_dims,
+                                         const int64_t *__restrict__ in_strides,
+                                         const int64_t *__restrict__ out_strides, int num_dims, int gather_dim,
+                                         int64_t dim_size_gather, int64_t total_elements) {
+    int64_t out_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (out_idx >= total_elements) {
+        return;
+    }
+
+    // Normalize like PyTorch: allow negative, clamp to [0, dim_size_gather-1]
+    int64_t gather_j = norm_index[out_idx];
+    gather_j = (gather_j < 0) ? (gather_j + dim_size_gather) : gather_j;
+    if (gather_j < 0) {
+        gather_j = 0;
+    }
+    if (gather_j >= dim_size_gather) {
+        gather_j = dim_size_gather - 1;
+    }
+
+    int64_t in_linear = 0, tmp = out_idx;
+#pragma unroll
+    for (int d = 0; d < num_dims; ++d) {
+        int64_t coord = tmp / out_strides[d];
+        tmp -= coord * out_strides[d];
+        in_linear += ((d == gather_dim) ? gather_j : coord) * in_strides[d];
+    }
+    output[out_idx] = input[in_linear];
+}
+
+std::shared_ptr<Tensor> IndexGatherForward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &index,
+                                           int64_t dim) {
+    const auto &in_dims = input->Dims();
+    const auto &idx_dims = index->Dims();
+    CHECK_EQ(in_dims.size(), idx_dims.size());
+    CHECK(input->GetDevice().type() == index->GetDevice().type());
+    CHECK(input->GetDevice().index() == index->GetDevice().index());
+
+    const int64_t num_dims = in_dims.size();
+    if (dim < 0) {
+        dim += num_dims;
+    }
+    CHECK_GE(dim, 0);
+    CHECK_LT(dim, num_dims);
+
+    // NOTE(zbl): Assume index to be int64 Tensors
+    CHECK(index->Dtype() == DataType::kINT64);
+
+    for (int d = 0; d < num_dims; ++d) {
+        if (d == dim) {
+            continue;
+        }
+        // Align with PyTorch semantics: index.size(d) <= input.size(d) for d != dim
+        CHECK_LE(idx_dims[d], in_dims[d])
+            << "index.size(" << d << ") must be <= input.size(" << d << ") on non-gather dims";
+    }
+
+    const auto device = input->GetDevice();
+    const auto &stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                             infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                             ->maca_stream();
+
+    auto dtype = input->Dtype();
+    auto out = std::make_shared<Tensor>(idx_dims, dtype, device);
+
+    auto in_strides = ComputeStrides(in_dims);
+    auto out_strides = ComputeStrides(idx_dims);
+    const int64_t total_elements = index->NumElements();
+
+    const int64_t gather_dim_size = in_dims[dim];
+
+    int64_t *dev_buf = nullptr;
+    MACA_CHECK(mcMallocAsync(&dev_buf, (3 * num_dims) * sizeof(int64_t), stream));
+    int64_t *out_dims_dev = dev_buf + 0 * num_dims;
+    int64_t *in_strides_dev = dev_buf + 1 * num_dims;
+    int64_t *out_strides_dev = dev_buf + 2 * num_dims;
+
+    MACA_CHECK(
+        mcMemcpyAsync(out_dims_dev, idx_dims.data(), num_dims * sizeof(int64_t), mcMemcpyHostToDevice, stream));
+    MACA_CHECK(
+        mcMemcpyAsync(in_strides_dev, in_strides.data(), num_dims * sizeof(int64_t), mcMemcpyHostToDevice, stream));
+    MACA_CHECK(mcMemcpyAsync(out_strides_dev, out_strides.data(), num_dims * sizeof(int64_t), mcMemcpyHostToDevice,
+                               stream));
+
+    const int threads = 256;
+    const int blocks = (total_elements + threads - 1) / threads;
+
+    DispatchFunc<INFINI_ALL_FLOATING_TYPES>(
+        dtype,
+        [=]<typename T>() {
+            IndexGatherForwardKernel<T><<<blocks, threads, 0, stream>>>(
+                static_cast<const T *>(input->DataPtr()), static_cast<const int64_t *>(index->DataPtr()),
+                static_cast<T *>(out->DataPtr()), out_dims_dev, in_strides_dev, out_strides_dev, (int)num_dims,
+                (int)dim, gather_dim_size, total_elements);
+        },
+        "MACA IndexGatherForward");
+
+    MACA_CHECK(mcFreeAsync(dev_buf, stream));
+    return out;
+}
+
+template <typename T>
+__global__ void IndexGatherBackwardKernel(const T *__restrict__ grad_output, const int64_t *__restrict__ index,
+                                          T *__restrict__ grad_input, const int64_t *__restrict__ out_dims,
+                                          const int64_t *__restrict__ in_strides,
+                                          const int64_t *__restrict__ out_strides, int num_dims, int gather_dim,
+                                          int64_t dim_size_gather, int64_t total_elements) {
+    int64_t out_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (out_idx >= total_elements) {
+        return;
+    }
+
+    int64_t gather_j = index[out_idx];
+    gather_j = (gather_j < 0) ? (gather_j + dim_size_gather) : gather_j;
+    if (gather_j < 0) {
+        gather_j = 0;
+    }
+    if (gather_j >= dim_size_gather) {
+        gather_j = dim_size_gather - 1;
+    }
+
+    int64_t in_linear = 0;
+    int64_t tmp = out_idx;
+#pragma unroll
+    for (int d = 0; d < num_dims; ++d) {
+        int64_t coord = tmp / out_strides[d];
+        tmp -= coord * out_strides[d];
+        if (d == gather_dim) {
+            in_linear += gather_j * in_strides[d];
+        } else {
+            in_linear += coord * in_strides[d];
+        }
+    }
+    atomicAdd(&grad_input[in_linear], grad_output[out_idx]);
+}
+
+std::shared_ptr<Tensor> IndexGatherBackward(const std::shared_ptr<Tensor> &grad_output,
+                                            const std::shared_ptr<Tensor> &index, int64_t dim,
+                                            const std::vector<int64_t> &input_dims) {
+    const auto &in_dims = input_dims;
+    const auto &idx_dims = index->Dims();
+    CHECK_EQ(in_dims.size(), idx_dims.size());
+    const int64_t num_dims = in_dims.size();
+    if (dim < 0) {
+        dim += num_dims;
+    }
+    CHECK_GE(dim, 0);
+    CHECK_LT(dim, num_dims);
+
+    // NOTE(zbl): Assume index to be int64 Tensors
+    CHECK(index->Dtype() == DataType::kINT64);
+
+    for (int d = 0; d < num_dims; ++d) {
+        if (d == dim) {
+            continue;
+        }
+        CHECK_EQ(in_dims[d], idx_dims[d]);
+    }
+
+    auto dtype = grad_output->Dtype();
+    auto grad_input = std::make_shared<Tensor>(in_dims, dtype, grad_output->GetDevice());
+    DispatchFunc<INFINI_ALL_TYPES>(
+        dtype, [=]<typename T>() { grad_input->Fill<T>(0); }, "MACA IndexGatherBackwardZero");
+
+    auto in_strides = ComputeStrides(in_dims);
+    auto out_strides = ComputeStrides(idx_dims);
+    const int64_t total_elements
+        = std::accumulate(idx_dims.begin(), idx_dims.end(), (int64_t)1, std::multiplies<int64_t>{});
+    const int64_t gather_dim_size = in_dims[dim];
+
+    int64_t *dev_buf = nullptr;
+    const size_t n_out = idx_dims.size();
+    const size_t n_in_strides = in_dims.size();
+    const size_t n_out_strides = idx_dims.size();
+    const size_t total_i64 = n_out + n_in_strides + n_out_strides;
+
+    auto device = grad_output->GetDevice();
+    const auto &stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                             infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                             ->maca_stream();
+
+    MACA_CHECK(mcMallocAsync(&dev_buf, total_i64 * sizeof(int64_t), stream));
+    int64_t *out_dims_dev = dev_buf;
+    int64_t *in_strides_dev = out_dims_dev + n_out;
+    int64_t *out_strides_dev = in_strides_dev + n_in_strides;
+
+    MACA_CHECK(mcMemcpyAsync(out_dims_dev, idx_dims.data(), n_out * sizeof(int64_t), mcMemcpyHostToDevice, stream));
+    MACA_CHECK(mcMemcpyAsync(in_strides_dev, in_strides.data(), n_in_strides * sizeof(int64_t),
+                               mcMemcpyHostToDevice, stream));
+    MACA_CHECK(mcMemcpyAsync(out_strides_dev, out_strides.data(), n_out_strides * sizeof(int64_t),
+                               mcMemcpyHostToDevice, stream));
+
+    const int threads = 256;
+    const int blocks = (int)((total_elements + threads - 1) / threads);
+
+    DispatchFunc<INFINI_ALL_FLOATING_TYPES>(
+        dtype,
+        [=]<typename T>() {
+            IndexGatherBackwardKernel<T><<<blocks, threads, 0, stream>>>(
+                static_cast<const T *>(grad_output->DataPtr()), static_cast<const int64_t *>(index->DataPtr()),
+                static_cast<T *>(grad_input->DataPtr()), out_dims_dev, in_strides_dev, out_strides_dev, (int)num_dims,
+                (int)dim, gather_dim_size, total_elements);
+        },
+        "MACA IndexGatherBackward");
+
+    MACA_CHECK(mcFreeAsync(dev_buf, stream));
+    return grad_input;
+}
+
+} // namespace infini_train::kernels::maca
+
+#define REGISTER_MACA_GATHER_KERNEL(kernel_name)                                                                       \
+    REGISTER_KERNEL(infini_train::Device::DeviceType::kMACA, kernel_name, infini_train::kernels::maca::kernel_name)
+
+REGISTER_MACA_GATHER_KERNEL(IndexGatherForward)
+REGISTER_MACA_GATHER_KERNEL(IndexGatherBackward)
+
+#undef REGISTER_MACA_GATHER_KERNEL
diff --git a/infini_train/src/kernels/maca/layernorm.maca b/infini_train/src/kernels/maca/layernorm.maca
new file mode 100644
index 00000000..53b8f339
--- /dev/null
+++ b/infini_train/src/kernels/maca/layernorm.maca
@@ -0,0 +1,207 @@
+#include <cub/block/block_reduce.cuh>
+
+#include "infini_train/include/common/maca/common_maca.h"
+#include "infini_train/include/common/maca/kernel_helper.cuh"
+#include "infini_train/include/core/runtime/device_guard.h"
+#include "infini_train/include/device.h"
+#include "infini_train/include/dispatcher.h"
+#include "infini_train/include/tensor.h"
+
+#include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
+
+namespace infini_train::kernels::maca {
+
+template <int BLOCK_SIZE, typename T>
+__global__ void LayerNormForwardKernel(const T *input, const T *weight, const T *bias, float *mean_out, float *rstd_out,
+                                       T *output, float eps, int embed_dim) {
+    using BlockReduce = cub::BlockReduce<float, BLOCK_SIZE>;
+    __shared__ typename BlockReduce::TempStorage temp_storage_mean;
+    __shared__ typename BlockReduce::TempStorage temp_storage_rstd;
+    __shared__ float shared_mean;
+    __shared__ float shared_rstd;
+
+    const int token_idx = blockIdx.x;
+    const T *x = input + token_idx * embed_dim;
+    T *y = output + token_idx * embed_dim;
+
+    float sum = 0.0f;
+    float sqsum = 0.0f;
+
+    for (int i = threadIdx.x; i < embed_dim; i += BLOCK_SIZE) {
+        float val = common::maca::Cast<float>(x[i]);
+        sum += val;
+        sqsum += val * val;
+    }
+
+    float total_sum = BlockReduce(temp_storage_mean).Sum(sum);
+    float total_sqsum = BlockReduce(temp_storage_rstd).Sum(sqsum);
+
+    if (threadIdx.x == 0) {
+        float mean = total_sum / embed_dim;
+        float var = total_sqsum / embed_dim - mean * mean;
+        float rstd = rsqrtf(var + eps);
+        shared_mean = mean;
+        shared_rstd = rstd;
+        if (mean_out) {
+            mean_out[token_idx] = mean;
+        }
+        if (rstd_out) {
+            rstd_out[token_idx] = rstd;
+        }
+    }
+    __syncthreads();
+
+    for (int i = threadIdx.x; i < embed_dim; i += BLOCK_SIZE) {
+        float norm = (common::maca::Cast<float>(x[i]) - shared_mean) * shared_rstd;
+        y[i] = common::maca::Cast<T>(norm * common::maca::Cast<float>(weight[i]) + common::maca::Cast<float>(bias[i]));
+    }
+}
+
+std::tuple<std::shared_ptr<Tensor>, std::shared_ptr<Tensor>, std::shared_ptr<Tensor>>
+LayerNormForward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &weight,
+                 const std::shared_ptr<Tensor> &bias, const float eps) {
+    CHECK_EQ(input->Dims().size(), 3);
+    CHECK_LE(input->Dims()[2], weight->Dims()[0]);
+    CHECK_LE(input->Dims()[2], bias->Dims()[0]);
+
+    const int batch_size = input->Dims()[0];
+    const int max_seqlen = input->Dims()[1];
+    const int embed_dim = input->Dims()[2];
+
+    auto dtype = input->Dtype();
+
+    auto output = std::make_shared<Tensor>(input->Dims(), dtype, input->GetDevice());
+    auto mean = std::make_shared<Tensor>(std::vector<int64_t>{batch_size, max_seqlen}, DataType::kFLOAT32,
+                                         input->GetDevice());
+    auto rstd = std::make_shared<Tensor>(std::vector<int64_t>{batch_size, max_seqlen}, DataType::kFLOAT32,
+                                         input->GetDevice());
+
+    constexpr int BLOCK_SIZE = 256;
+    int threads_per_block = BLOCK_SIZE;
+    int num_blocks = batch_size * max_seqlen;
+
+    auto device = input->GetDevice();
+    const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                                  infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                  ->maca_stream();
+
+    DispatchFunc<INFINI_ALL_FLOATING_TYPES>(
+        dtype,
+        [=]<typename T>() {
+            mean->Fill<float>(0);
+            rstd->Fill<float>(0);
+            LayerNormForwardKernel<BLOCK_SIZE><<<num_blocks, threads_per_block, 0, maca_stream>>>(
+                static_cast<const T *>(input->DataPtr()), static_cast<const T *>(weight->DataPtr()),
+                static_cast<const T *>(bias->DataPtr()), static_cast<float *>(mean->DataPtr()),
+                static_cast<float *>(rstd->DataPtr()), static_cast<T *>(output->DataPtr()), eps, embed_dim);
+        },
+        "MACA LayerNormForward");
+
+    return {output, mean, rstd};
+}
+
+template <int BLOCK_SIZE, typename T>
+__global__ void LayerNormBackwardKernel(const T *__restrict__ input, const T *__restrict__ grad_output,
+                                        const float *__restrict__ mean, const float *__restrict__ rstd,
+                                        const T *__restrict__ weight, T *__restrict__ grad_input,
+                                        T *__restrict__ grad_weight, T *__restrict__ grad_bias, int embed_dim,
+                                        size_t weight_num_elements, size_t bias_num_elements) {
+    using BlockReduce = cub::BlockReduce<float, BLOCK_SIZE>;
+    __shared__ typename BlockReduce::TempStorage temp_storage_mean;
+    __shared__ typename BlockReduce::TempStorage temp_storage_norm;
+    __shared__ float shared_mean;
+    __shared__ float shared_norm;
+
+    int tid = threadIdx.x;
+    int token_idx = blockIdx.x;
+
+    const T *input_ptr = input + token_idx * embed_dim;
+    const T *grad_output_ptr = grad_output + token_idx * embed_dim;
+    T *grad_input_ptr = grad_input + token_idx * embed_dim;
+
+    float mean_val = mean[token_idx];
+    float rstd_val = rstd[token_idx];
+
+    float dnorm_mean = 0.f;
+    float dnorm_norm_mean = 0.f;
+
+    for (int i = tid; i < embed_dim; i += BLOCK_SIZE) {
+        float dnorm = common::maca::Cast<float>(common::maca::Mul(weight[i], grad_output_ptr[i]));
+        dnorm_mean += dnorm;
+        dnorm_norm_mean += dnorm * (common::maca::Cast<float>(input_ptr[i]) - mean_val);
+    }
+
+    dnorm_mean = BlockReduce(temp_storage_mean).Sum(dnorm_mean);
+    dnorm_norm_mean = BlockReduce(temp_storage_norm).Sum(dnorm_norm_mean);
+
+    if (tid == 0) {
+        float mean_d = dnorm_mean / embed_dim;
+        float norm_d = (dnorm_norm_mean / embed_dim) * rstd_val - mean_d * mean_val * rstd_val;
+        shared_mean = mean_d;
+        shared_norm = norm_d;
+    }
+    __syncthreads();
+
+    for (int i = tid; i < embed_dim; i += BLOCK_SIZE) {
+        float norm = (common::maca::Cast<float>(input_ptr[i]) - mean_val) * rstd_val;
+        float grad_output_val = common::maca::Cast<float>(grad_output_ptr[i]);
+
+        grad_input_ptr[i] = common::maca::Cast<T>(
+            (common::maca::Cast<float>(weight[i]) * grad_output_val - shared_mean - norm * shared_norm) * rstd_val);
+
+        common::maca::fastAtomicAdd<T, size_t>(grad_weight, i, weight_num_elements,
+                                               common::maca::Cast<T>(grad_output_val * norm), true);
+        common::maca::fastAtomicAdd<T, size_t>(grad_bias, i, bias_num_elements, grad_output_ptr[i], true);
+    }
+}
+
+std::tuple<std::shared_ptr<Tensor>, std::shared_ptr<Tensor>, std::shared_ptr<Tensor>>
+LayerNormBackward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &weight,
+                  const std::shared_ptr<Tensor> &bias, const std::shared_ptr<Tensor> &mean,
+                  const std::shared_ptr<Tensor> &rstd, const std::shared_ptr<Tensor> &grad_output) {
+    const int batch_size = input->Dims()[0];
+    const int max_seqlen = input->Dims()[1];
+    const int embed_dim = input->Dims()[2];
+
+    auto dtype = input->Dtype();
+    CHECK(dtype == weight->Dtype() && dtype == bias->Dtype() && dtype == grad_output->Dtype()
+          && mean->Dtype() == DataType::kFLOAT32 && rstd->Dtype() == DataType::kFLOAT32);
+
+    auto grad_input = std::make_shared<Tensor>(input->Dims(), dtype, grad_output->GetDevice());
+    auto grad_weight = std::make_shared<Tensor>(weight->Dims(), dtype, grad_output->GetDevice());
+    auto grad_bias = std::make_shared<Tensor>(bias->Dims(), dtype, grad_output->GetDevice());
+
+    constexpr int BLOCK_SIZE = 256;
+    int threads_per_block = BLOCK_SIZE;
+    int num_blocks = batch_size * max_seqlen;
+
+    auto device = input->GetDevice();
+    const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                                  infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                  ->maca_stream();
+    DispatchFunc<INFINI_ALL_FLOATING_TYPES>(
+        dtype,
+        [=]<typename T>() {
+            grad_input->Fill<T>(0);
+            grad_weight->Fill<T>(0);
+            grad_bias->Fill<T>(0);
+            LayerNormBackwardKernel<BLOCK_SIZE><<<num_blocks, threads_per_block, 0, maca_stream>>>(
+                static_cast<const T *>(input->DataPtr()), static_cast<const T *>(grad_output->DataPtr()),
+                static_cast<const float *>(mean->DataPtr()), static_cast<const float *>(rstd->DataPtr()),
+                static_cast<const T *>(weight->DataPtr()), static_cast<T *>(grad_input->DataPtr()),
+                static_cast<T *>(grad_weight->DataPtr()), static_cast<T *>(grad_bias->DataPtr()), embed_dim,
+                grad_weight->NumElements(), grad_bias->NumElements());
+        },
+        "MACA LayerNormBackward");
+
+    return {grad_input, grad_weight, grad_bias};
+}
+} // namespace infini_train::kernels::maca
+
+#define REGISTER_MACA_LAYERNORM_KERNEL(kernel_name)                                                                    \
+    REGISTER_KERNEL(infini_train::Device::DeviceType::kMACA, kernel_name, infini_train::kernels::maca::kernel_name)
+
+REGISTER_MACA_LAYERNORM_KERNEL(LayerNormForward)
+REGISTER_MACA_LAYERNORM_KERNEL(LayerNormBackward)
+
+#undef REGISTER_MACA_LAYERNORM_KERNEL
diff --git a/infini_train/src/kernels/maca/outer.maca b/infini_train/src/kernels/maca/outer.maca
new file mode 100644
index 00000000..14fe0388
--- /dev/null
+++ b/infini_train/src/kernels/maca/outer.maca
@@ -0,0 +1,168 @@
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include <mcblas/mcblas.h>
+
+#include "glog/logging.h"
+
+#include "infini_train/include/common/maca/common_maca.h"
+#include "infini_train/include/core/runtime/device_guard.h"
+#include "infini_train/include/dispatcher.h"
+#include "infini_train/include/tensor.h"
+
+#include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
+
+namespace infini_train::kernels::maca {
+std::shared_ptr<Tensor> OuterForward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &other) {
+    /*
+    Computes outer product: output[i, j] = input[i] * other[j]
+    Equivalent to: input: [M, 1], other: [1, N] → output: [M, N]
+    */
+
+    const auto &in_dims = input->Dims();
+    const auto &ot_dims = other->Dims();
+    // TODO(zbl): support batched outer?
+    CHECK_EQ(in_dims.size(), 1);
+    CHECK_EQ(ot_dims.size(), 1);
+
+    const int64_t M = in_dims[0];
+    const int64_t N = ot_dims[0];
+
+    auto output = std::make_shared<Tensor>(std::vector<int64_t>{M, N}, input->Dtype(), input->GetDevice());
+
+    auto device = input->GetDevice();
+    // reinterpret input: [M] as column vector [M, 1]
+    // reinterpret other: [N] as row vector [1, N]
+    // output[M, N] = input[M, 1] * other.T[1, N]
+    // output.T[N, M] = other[N, 1] * input.T[1, M]
+    float alpha = 1.0f;
+    float beta = 0.0f;
+    mcblasHandle_t handle = dynamic_cast<infini_train::core::maca::MacaBlasHandle *>(
+                                infini_train::core::GetDeviceGuardImpl(device.type())->GetBlasHandle(device))
+                                ->mcblas_handle();
+
+    switch (input->Dtype()) {
+        DISPATCH_CASE(WRAP({
+                          MCBLAS_CHECK(mcblasSgemm(handle, MCBLAS_OP_N, MCBLAS_OP_N, N, M, 1, &alpha,
+                                                   static_cast<const float *>(other->DataPtr()), N,
+                                                   static_cast<const float *>(input->DataPtr()), 1, &beta,
+                                                   static_cast<float *>(output->DataPtr()), N));
+                      }),
+                      DataType::kFLOAT32)
+        DISPATCH_CASE(WRAP({
+                          MCBLAS_CHECK(mcblasGemmEx(handle, MCBLAS_OP_N, MCBLAS_OP_N, N, M, 1, &alpha, other->DataPtr(),
+                                                    MACA_R_16BF, N, input->DataPtr(), MACA_R_16BF, 1, &beta,
+                                                    output->DataPtr(), MACA_R_16BF, N, MACA_R_32F,
+                                                    MCBLAS_GEMM_DEFAULT));
+                      }),
+                      DataType::kBFLOAT16)
+    }
+
+    return output;
+}
+
+std::tuple<std::shared_ptr<Tensor>, std::shared_ptr<Tensor>> OuterBackward(const std::shared_ptr<Tensor> &input,
+                                                                           const std::shared_ptr<Tensor> &other,
+                                                                           const std::shared_ptr<Tensor> &grad_output) {
+    /*
+    grad_input: [M] = grad_output: [M, N] × other: [N]
+    grad_other: [N] = grad_output.T: [N, M] × input: [M]
+    */
+    const int64_t M = input->Dims()[0];
+    const int64_t N = other->Dims()[0];
+    // TODO(zbl): support batched outer?
+    CHECK_EQ(grad_output->Dims().size(), 2);
+    CHECK_EQ(grad_output->Dims()[0], M);
+    CHECK_EQ(grad_output->Dims()[1], N);
+
+    auto input_dtype = input->Dtype();
+    auto other_dtype = other->Dtype();
+    auto grad_output_dtype = grad_output->Dtype();
+
+    // Compute dtype determined by saved tensors (forward compute dtype), not grad_output
+    DataType promoted_type = DispatchFunc<DataTypeList<INFINI_ALL_TYPES>, DataTypeList<INFINI_ALL_TYPES>>(
+        {input_dtype, other_dtype}, [=]<typename Tin, typename To>() { return DataTypeMap_v<WidestType_t<Tin, To>>; },
+        "MACA OuterBackward");
+
+    auto input_promoted = input_dtype == promoted_type ? input : std::make_shared<Tensor>(input->To(promoted_type));
+    auto other_promoted = other_dtype == promoted_type ? other : std::make_shared<Tensor>(other->To(promoted_type));
+    auto grad_output_promoted
+        = grad_output_dtype == promoted_type ? grad_output : std::make_shared<Tensor>(grad_output->To(promoted_type));
+
+    // For bf16 compute, output in fp32 to preserve accumulation precision (matches PyTorch behavior)
+    auto output_dtype = (promoted_type == DataType::kBFLOAT16) ? DataType::kFLOAT32 : promoted_type;
+    auto grad_input = std::make_shared<Tensor>(std::vector<int64_t>{M}, output_dtype, grad_output->GetDevice());
+    auto grad_other = std::make_shared<Tensor>(std::vector<int64_t>{N}, output_dtype, grad_output->GetDevice());
+
+    DispatchFunc<DataType::kFLOAT32, DataType::kBFLOAT16>(
+        promoted_type,
+        [=]<typename T>() {
+            grad_input->Fill<T>(0);
+            grad_other->Fill<T>(0);
+        },
+        "MACA OuterBackward");
+
+    auto device = input->GetDevice();
+    float alpha = 1.0f;
+    float beta = 0.0f;
+    mcblasHandle_t handle = dynamic_cast<infini_train::core::maca::MacaBlasHandle *>(
+                                infini_train::core::GetDeviceGuardImpl(device.type())->GetBlasHandle(device))
+                                ->mcblas_handle();
+
+    switch (promoted_type) {
+        DISPATCH_CASE(WRAP({
+                          // grad_input[M, 1] = grad_output[M, N] × other[N, 1]
+                          // y = grad_input[M]
+                          // A = grad_output.T[N, M]
+                          // x = other[N]
+                          MCBLAS_CHECK(mcblasSgemv(handle, MCBLAS_OP_T, N, M, &alpha,
+                                                   static_cast<const float *>(grad_output_promoted->DataPtr()), N,
+                                                   static_cast<const float *>(other_promoted->DataPtr()), 1, &beta,
+                                                   static_cast<float *>(grad_input->DataPtr()), 1));
+
+                          // grad_other[N, 1] = grad_output.T[N, M] × input[M, 1]
+                          // y = grad_other[N]
+                          // A = grad_output.T[N, M]
+                          // x = input[M]
+                          MCBLAS_CHECK(mcblasSgemv(handle, MCBLAS_OP_N, N, M, &alpha,
+                                                   static_cast<const float *>(grad_output_promoted->DataPtr()), N,
+                                                   static_cast<const float *>(input_promoted->DataPtr()), 1, &beta,
+                                                   static_cast<float *>(grad_other->DataPtr()), 1));
+                      }),
+                      DataType::kFLOAT32)
+        DISPATCH_CASE(
+            // cublas<t>gemv does not support bf16, use mcblasGemmEx to workaround
+            WRAP({
+                // grad_input[M, 1] = grad_output[M, N] × other[N, 1]
+                // grad_input.T[1, M] = other.T[1, N] × grad_output.T[N, M]
+                // C = grad_input.T[1, M]
+                // A = other.T[1, N]
+                // B = grad_output.T[N, M]
+                MCBLAS_CHECK(mcblasGemmEx(handle, MCBLAS_OP_N, MCBLAS_OP_N, 1, M, N, &alpha, other_promoted->DataPtr(),
+                                          MACA_R_16BF, 1, grad_output_promoted->DataPtr(), MACA_R_16BF, N, &beta,
+                                          grad_input->DataPtr(), MACA_R_32F, 1, MACA_R_32F, MCBLAS_GEMM_DEFAULT));
+                // grad_other[N, 1] = grad_output.T[N, M] × input[M, 1]
+                // grad_other.T[1, N] = input.T[1, M] × grad_output[M, N]
+                // C = grad_other.T[1, N]
+                // A = input.T[1, M]
+                // B = grad_output.T[N, M]
+                MCBLAS_CHECK(mcblasGemmEx(handle, MCBLAS_OP_N, MCBLAS_OP_T, 1, N, M, &alpha, input_promoted->DataPtr(),
+                                          MACA_R_16BF, 1, grad_output_promoted->DataPtr(), MACA_R_16BF, N, &beta,
+                                          grad_other->DataPtr(), MACA_R_32F, 1, MACA_R_32F, MCBLAS_GEMM_DEFAULT));
+            }),
+            DataType::kBFLOAT16)
+    }
+
+    return {grad_input, grad_other};
+}
+
+} // namespace infini_train::kernels::maca
+
+#define REGISTER_MACA_OUTER_KERNEL(kernel_name)                                                                        \
+    REGISTER_KERNEL(infini_train::Device::DeviceType::kMACA, kernel_name, infini_train::kernels::maca::kernel_name)
+
+REGISTER_MACA_OUTER_KERNEL(OuterForward)
+REGISTER_MACA_OUTER_KERNEL(OuterBackward)
+
+#undef REGISTER_MACA_OUTER_KERNEL
diff --git a/infini_train/src/kernels/maca/reduction.maca b/infini_train/src/kernels/maca/reduction.maca
new file mode 100644
index 00000000..ee453e61
--- /dev/null
+++ b/infini_train/src/kernels/maca/reduction.maca
@@ -0,0 +1,243 @@
+#include <cub/cub.cuh>
+
+#include "infini_train/include/common/maca/common_maca.h"
+#include "infini_train/include/common/maca/cub_compat.cuh"
+#include "infini_train/include/common/maca/kernel_helper.cuh"
+#include "infini_train/include/core/runtime/device_guard.h"
+#include "infini_train/include/dispatcher.h"
+#include "infini_train/include/tensor.h"
+
+#include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
+
+namespace infini_train::kernels::maca {
+namespace {
+__host__ __device__ constexpr float kInfinity = std::numeric_limits<float>::infinity();
+} // namespace
+
+namespace {
+// Reduction operators
+template <typename T, typename ReduceFunc> struct CubOp;
+
+template <typename T> struct CubOp<T, CubSumOp> {
+    __device__ static T Init() { return common::maca::Cast<T>(0); }
+    __device__ static T Reduce(T a, T b) { return common::maca::Add<T>(a, b); }
+    __device__ static CubSumOp Op() { return CubSumOp(); }
+};
+
+template <typename T> struct CubOp<T, CubMaxOp> {
+    __device__ static T Init() { return common::maca::Cast<T>(-kInfinity); }
+    __device__ static T Reduce(T a, T b) { return common::maca::Max<T>(a, b); }
+    __device__ static CubMaxOp Op() { return CubMaxOp(); }
+};
+
+template <typename T> struct CubOp<T, CubMinOp> {
+    __device__ static T Init() { return common::maca::Cast<T>(kInfinity); }
+    __device__ static T Reduce(T a, T b) { return common::maca::Min<T>(a, b); }
+    __device__ static CubMinOp Op() { return CubMinOp(); }
+};
+
+// Finalization strategies
+template <typename T> struct MeanFinalize {
+    __device__ __forceinline__ T operator()(T sum, int64_t count) const {
+        return common::maca::Div(sum, common::maca::Cast<T>(count));
+    }
+};
+
+template <typename T> struct IdentityFinalize {
+    __device__ __forceinline__ T operator()(T val, int64_t) const { return val; }
+};
+
+// Generic reduction kernel
+template <typename T, typename ReduceFunc, typename FinalizeOp, int BLOCK_SIZE>
+__global__ void GenericReduceKernel(const T *input, T *output, int64_t N, int64_t H, int64_t W,
+                                    FinalizeOp finalize_op) {
+    using BlockReduce = cub::BlockReduce<T, BLOCK_SIZE>;
+    __shared__ typename BlockReduce::TempStorage temp_storage;
+
+    int idx = blockIdx.x;
+    if (idx >= N * W) {
+        return;
+    }
+
+    int n = idx / W;
+    int w = idx % W;
+
+    T acc = CubOp<T, ReduceFunc>::Init();
+    for (int64_t h = threadIdx.x; h < H; h += blockDim.x) {
+        int input_idx = (n * H + h) * W + w;
+        acc = CubOp<T, ReduceFunc>::Reduce(acc, input[input_idx]);
+    }
+
+    T reduced = BlockReduce(temp_storage).Reduce(acc, CubOp<T, ReduceFunc>::Op());
+
+    if (threadIdx.x == 0) {
+        output[idx] = finalize_op(reduced, H);
+    }
+}
+
+// Unified backward kernel for Mean, Sum, Max, and Min
+template <typename T>
+__global__ void GenericReduceBackwardKernel(T *grad_input, const T *grad_output, const T *input, const T *reduced,
+                                            int64_t N, int64_t H, int64_t W, bool is_mean, bool is_masked) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= N * H * W) {
+        return;
+    }
+
+    int n = idx / (H * W);
+    int hw = idx % (H * W);
+    int w = hw % W;
+
+    int reduced_idx = n * W + w;
+
+    if (is_masked) {
+        T selected = reduced[reduced_idx];
+        T value = input[idx];
+        grad_input[idx] = (value == selected) ? grad_output[reduced_idx] : T(0);
+    } else {
+        grad_input[idx] = grad_output[reduced_idx];
+        if (is_mean) {
+            T H_casted;
+            // TODO(lzm): directly use Cast<T> when (__half and __maca_bfloat16) <-> (integral types) is supported
+            if constexpr (std::is_same_v<T, __half> || std::is_same_v<T, __maca_bfloat16>) {
+                H_casted = common::maca::Cast<T>(static_cast<float>(H));
+            } else {
+                H_casted = common::maca::Cast<T>(H);
+            }
+            grad_input[idx] /= H_casted;
+        }
+    }
+}
+} // namespace
+
+// Common forward implementation for reduce ops
+template <typename ReduceFunc, template <typename> class FinalizeOp>
+std::shared_ptr<Tensor> ReduceOpForward(const std::shared_ptr<Tensor> &input, const int64_t dim, const bool keep_dim) {
+    const auto &input_dims = input->Dims();
+    int64_t actual_dim = dim < 0 ? dim + input_dims.size() : dim;
+    CHECK_GE(actual_dim, 0);
+    CHECK_LT(actual_dim, input_dims.size());
+
+    std::vector<int64_t> output_dims = input_dims;
+    if (keep_dim) {
+        output_dims[actual_dim] = 1;
+    } else {
+        output_dims.erase(output_dims.begin() + actual_dim);
+    }
+
+    auto dtype = input->Dtype();
+    auto output = std::make_shared<Tensor>(output_dims, dtype, input->GetDevice());
+
+    int64_t N = std::accumulate(input_dims.begin(), input_dims.begin() + actual_dim, 1, std::multiplies<int64_t>());
+    int64_t H = input_dims[actual_dim];
+    int64_t W = std::accumulate(input_dims.begin() + actual_dim + 1, input_dims.end(), 1, std::multiplies<int64_t>());
+
+    constexpr int BLOCK_SIZE = 256;
+    int threads_per_block = BLOCK_SIZE;
+    int num_blocks = N * W;
+
+    auto device = input->GetDevice();
+    const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                                  infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                  ->maca_stream();
+
+    DispatchFunc<INFINI_ALL_FLOATING_TYPES>(
+        dtype,
+        [=]<typename T>() {
+            GenericReduceKernel<T, ReduceFunc, FinalizeOp<T>, BLOCK_SIZE>
+                <<<num_blocks, threads_per_block, 0, maca_stream>>>(static_cast<const T *>(input->DataPtr()),
+                                                                    static_cast<T *>(output->DataPtr()), N, H, W,
+                                                                    FinalizeOp<T>{});
+        },
+        "MACA ReductionForward");
+    return output;
+}
+
+// Common backward implementation for reduce ops
+std::shared_ptr<Tensor> ReduceOpBackward(const std::shared_ptr<Tensor> &grad_output,
+                                         const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &reduced,
+                                         const std::vector<int64_t> &input_dims, const int64_t dim, bool keep_dim,
+                                         bool is_mean, bool is_masked) {
+    int64_t actual_dim = dim < 0 ? dim + input_dims.size() : dim;
+    CHECK_GE(actual_dim, 0);
+    CHECK_LT(actual_dim, input_dims.size());
+
+    auto dtype = grad_output->Dtype();
+    auto grad_input = std::make_shared<Tensor>(input_dims, dtype, grad_output->GetDevice());
+
+    int64_t N = std::accumulate(input_dims.begin(), input_dims.begin() + actual_dim, 1, std::multiplies<int64_t>());
+    int64_t H = input_dims[actual_dim];
+    int64_t W = std::accumulate(input_dims.begin() + actual_dim + 1, input_dims.end(), 1, std::multiplies<int64_t>());
+
+    int threads_per_block = 256;
+    int num_blocks = (N * H * W + threads_per_block - 1) / threads_per_block;
+
+    auto device = grad_output->GetDevice();
+    const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                                  infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                  ->maca_stream();
+
+    DispatchFunc<INFINI_ALL_FLOATING_TYPES>(
+        dtype,
+        [=]<typename T>() {
+            grad_input->Fill<T>(0);
+            GenericReduceBackwardKernel<<<num_blocks, threads_per_block, 0, maca_stream>>>(
+                static_cast<T *>(grad_input->DataPtr()), static_cast<const T *>(grad_output->DataPtr()),
+                input ? static_cast<const T *>(input->DataPtr()) : nullptr,
+                reduced ? static_cast<const T *>(reduced->DataPtr()) : nullptr, N, H, W, is_mean, is_masked);
+        },
+        "MACA ReductionBackward");
+    return grad_input;
+}
+
+std::shared_ptr<Tensor> MeanForward(const std::shared_ptr<Tensor> &input, const int64_t dim, const bool keep_dim) {
+    return ReduceOpForward<CubSumOp, MeanFinalize>(input, dim, keep_dim);
+}
+
+std::shared_ptr<Tensor> SumForward(const std::shared_ptr<Tensor> &input, const int64_t dim, const bool keep_dim) {
+    return ReduceOpForward<CubSumOp, IdentityFinalize>(input, dim, keep_dim);
+}
+
+std::shared_ptr<Tensor> MaxForward(const std::shared_ptr<Tensor> &input, const int64_t dim, const bool keep_dim) {
+    return ReduceOpForward<CubMaxOp, IdentityFinalize>(input, dim, keep_dim);
+}
+
+std::shared_ptr<Tensor> MinForward(const std::shared_ptr<Tensor> &input, const int64_t dim, const bool keep_dim) {
+    return ReduceOpForward<CubMinOp, IdentityFinalize>(input, dim, keep_dim);
+}
+
+std::shared_ptr<Tensor> MeanBackward(const std::shared_ptr<Tensor> &grad_output, const std::vector<int64_t> &input_dims,
+                                     const int64_t dim, bool keep_dim) {
+    return ReduceOpBackward(grad_output, nullptr, nullptr, input_dims, dim, keep_dim, true, false);
+}
+
+std::shared_ptr<Tensor> SumBackward(const std::shared_ptr<Tensor> &grad_output, const std::vector<int64_t> &input_dims,
+                                    const int64_t dim, bool keep_dim) {
+    return ReduceOpBackward(grad_output, nullptr, nullptr, input_dims, dim, keep_dim, false, false);
+}
+
+std::shared_ptr<Tensor> MaxBackward(const std::shared_ptr<Tensor> &grad_output, const std::shared_ptr<Tensor> &input,
+                                    const std::shared_ptr<Tensor> &reduced, const int64_t dim, bool keep_dim) {
+    return ReduceOpBackward(grad_output, input, reduced, input->Dims(), dim, keep_dim, false, true);
+}
+
+std::shared_ptr<Tensor> MinBackward(const std::shared_ptr<Tensor> &grad_output, const std::shared_ptr<Tensor> &input,
+                                    const std::shared_ptr<Tensor> &reduced, const int64_t dim, bool keep_dim) {
+    return ReduceOpBackward(grad_output, input, reduced, input->Dims(), dim, keep_dim, false, true);
+}
+
+} // namespace infini_train::kernels::maca
+
+#define REGISTER_MACA_REDUCTION_KERNEL(kernel_name)                                                                    \
+    REGISTER_KERNEL(infini_train::Device::DeviceType::kMACA, kernel_name, infini_train::kernels::maca::kernel_name)
+
+REGISTER_MACA_REDUCTION_KERNEL(MeanForward)
+REGISTER_MACA_REDUCTION_KERNEL(SumForward)
+REGISTER_MACA_REDUCTION_KERNEL(MaxForward)
+REGISTER_MACA_REDUCTION_KERNEL(MinForward)
+REGISTER_MACA_REDUCTION_KERNEL(MeanBackward)
+REGISTER_MACA_REDUCTION_KERNEL(SumBackward)
+REGISTER_MACA_REDUCTION_KERNEL(MaxBackward)
+REGISTER_MACA_REDUCTION_KERNEL(MinBackward)
+
+#undef REGISTER_MACA_REDUCTION_KERNEL
diff --git a/infini_train/src/kernels/maca/slice.maca b/infini_train/src/kernels/maca/slice.maca
new file mode 100644
index 00000000..18b2a0c3
--- /dev/null
+++ b/infini_train/src/kernels/maca/slice.maca
@@ -0,0 +1,210 @@
+#include <cstdint>
+#include <memory>
+
+#include "glog/logging.h"
+
+#include "infini_train/include/common/maca/common_maca.h"
+#include "infini_train/include/core/runtime/device_guard.h"
+#include "infini_train/include/dispatcher.h"
+#include "infini_train/include/tensor.h"
+
+#include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
+
+namespace infini_train::kernels::maca {
+
+template <typename T>
+__global__ void SliceForwardKernel(const T *input, T *output, const int64_t *new_dims, const int64_t *starts,
+                                   const int64_t *steps, const int64_t *in_strides, const int64_t *out_strides,
+                                   int num_dims, int64_t total_elements) {
+    int64_t out_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (out_idx >= total_elements) {
+        return;
+    }
+
+    int64_t in_index = 0;
+    for (int i = 0; i < num_dims; ++i) {
+        int64_t idx = (out_idx / out_strides[i]) % new_dims[i];
+        in_index += (starts[i] + idx * steps[i]) * in_strides[i];
+    }
+
+    output[out_idx] = input[in_index];
+}
+
+std::shared_ptr<Tensor> SliceForward(const std::shared_ptr<Tensor> &input, const std::vector<int64_t> &starts,
+                                     const std::vector<int64_t> &ends, const std::vector<int64_t> &steps) {
+    CHECK_EQ(starts.size(), ends.size());
+    CHECK_EQ(starts.size(), steps.size());
+    auto &dims = input->Dims();
+    CHECK_EQ(starts.size(), dims.size());
+    const int64_t num_dims = dims.size();
+
+    std::vector<int64_t> new_dims;
+    for (int i = 0; i < starts.size(); ++i) {
+        CHECK_LE(starts[i], ends[i]);
+        CHECK_LE(0, steps[i]);
+        new_dims.push_back((ends[i] - starts[i] + steps[i] - 1) / steps[i]);
+    }
+
+    auto dtype = input->Dtype();
+    auto new_tensor = std::make_shared<Tensor>(new_dims, dtype, input->GetDevice());
+    // NOTE(zbl): must initialize with 0
+    DispatchFunc<INFINI_ALL_TYPES>(
+        dtype, [=]<typename T>() { new_tensor->Fill<T>(0); }, "MACA SliceForward");
+
+    std::vector<int64_t> src_strides(dims.size(), 0), dst_strides(new_dims.size(), 0);
+    int64_t stride = 1;
+    for (int i = dims.size() - 1; i >= 0; --i) {
+        src_strides[i] = stride;
+        stride *= dims[i];
+    }
+
+    stride = 1;
+    for (int i = new_dims.size() - 1; i >= 0; --i) {
+        dst_strides[i] = stride;
+        stride *= new_dims[i];
+    }
+
+    int64_t total_elements = stride;
+
+    int64_t *new_dims_dev, *starts_dev, *steps_dev, *input_strides_dev, *output_strides_dev;
+
+    auto device = input->GetDevice();
+    const auto &stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                             infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                             ->maca_stream();
+
+    mcMallocAsync(&new_dims_dev,
+                    (ends.size() + starts.size() + steps.size() + dims.size() + new_dims.size()) * sizeof(int64_t),
+                    stream);
+    starts_dev = new_dims_dev + ends.size();
+    steps_dev = starts_dev + starts.size();
+    input_strides_dev = steps_dev + steps.size();
+    output_strides_dev = input_strides_dev + dims.size();
+
+    mcMemcpyAsync(new_dims_dev, new_dims.data(), ends.size() * sizeof(int64_t), mcMemcpyHostToDevice, stream);
+    mcMemcpyAsync(starts_dev, starts.data(), starts.size() * sizeof(int64_t), mcMemcpyHostToDevice, stream);
+    mcMemcpyAsync(steps_dev, steps.data(), steps.size() * sizeof(int64_t), mcMemcpyHostToDevice, stream);
+    mcMemcpyAsync(input_strides_dev, src_strides.data(), dims.size() * sizeof(int64_t), mcMemcpyHostToDevice,
+                    stream);
+    mcMemcpyAsync(output_strides_dev, dst_strides.data(), new_dims.size() * sizeof(int64_t), mcMemcpyHostToDevice,
+                    stream);
+
+    int threads_per_block = 256;
+    int num_blocks = (total_elements + threads_per_block - 1) / threads_per_block;
+
+    DispatchFunc<INFINI_ALL_TYPES>(
+        dtype,
+        [=]<typename T>() {
+            SliceForwardKernel<<<num_blocks, threads_per_block, 0, stream>>>(
+                static_cast<const T *>(input->DataPtr()), static_cast<T *>(new_tensor->DataPtr()), new_dims_dev,
+                starts_dev, steps_dev, input_strides_dev, output_strides_dev, num_dims, total_elements);
+        },
+        "MACA SliceForward");
+
+    mcFreeAsync(new_dims_dev, stream);
+
+    return new_tensor;
+}
+
+template <typename T>
+__global__ void SliceBackwardKernel(const T *grad_output, T *grad_input, const int64_t *new_dims, const int64_t *starts,
+                                    const int64_t *steps, const int64_t *in_strides, const int64_t *out_strides,
+                                    int num_dims, int64_t total_elements) {
+    int64_t out_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (out_idx >= total_elements) {
+        return;
+    }
+
+    int64_t in_index = 0;
+    for (int i = 0; i < num_dims; ++i) {
+        int64_t idx = (out_idx / out_strides[i]) % new_dims[i];
+        in_index += (starts[i] + idx * steps[i]) * in_strides[i];
+    }
+    grad_input[in_index] = grad_output[out_idx];
+}
+
+std::shared_ptr<Tensor> SliceBackward(const std::shared_ptr<Tensor> &grad_output, const std::shared_ptr<Tensor> &input,
+                                      const std::vector<int64_t> &starts, const std::vector<int64_t> &ends,
+                                      const std::vector<int64_t> &steps) {
+    CHECK_EQ(starts.size(), ends.size());
+    CHECK_EQ(starts.size(), steps.size());
+    auto &dims = input->Dims();
+    CHECK_EQ(starts.size(), dims.size());
+    const int64_t num_dims = dims.size();
+
+    std::vector<int64_t> new_dims;
+    for (int i = 0; i < starts.size(); ++i) {
+        CHECK_LE(starts[i], ends[i]);
+        CHECK_LE(0, steps[i]);
+        new_dims.push_back((ends[i] - starts[i] + steps[i] - 1) / steps[i]);
+    }
+
+    auto grad_output_dtype = grad_output->Dtype();
+    auto grad_input = std::make_shared<Tensor>(input->Dims(), grad_output_dtype, grad_output->GetDevice());
+    DispatchFunc<INFINI_ALL_TYPES>(
+        grad_output_dtype, [=]<typename T>() { grad_input->Fill<T>(0); }, "MACA SliceBackward");
+
+    std::vector<int64_t> src_strides(dims.size());
+    int64_t stride = 1;
+    for (int i = src_strides.size() - 1; i >= 0; --i) {
+        src_strides[i] = stride;
+        stride *= dims[i];
+    }
+
+    std::vector<int64_t> dst_strides(new_dims.size());
+    stride = 1;
+    for (int i = dst_strides.size() - 1; i >= 0; --i) {
+        dst_strides[i] = stride;
+        stride *= new_dims[i];
+    }
+
+    int64_t total_elements = stride;
+
+    int dims_size = dims.size();
+    int64_t *new_dims_dev, *starts_dev, *steps_dev, *input_strides_dev, *output_strides_dev;
+
+    auto device = input->GetDevice();
+    const auto &stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                             infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                             ->maca_stream();
+    mcMallocAsync(&new_dims_dev,
+                    (ends.size() + starts.size() + steps.size() + dims.size() + new_dims.size()) * sizeof(int64_t),
+                    stream);
+    starts_dev = new_dims_dev + ends.size();
+    steps_dev = starts_dev + starts.size();
+    input_strides_dev = steps_dev + steps.size();
+    output_strides_dev = input_strides_dev + dims.size();
+
+    mcMemcpyAsync(new_dims_dev, new_dims.data(), ends.size() * sizeof(int64_t), mcMemcpyHostToDevice, stream);
+    mcMemcpyAsync(starts_dev, starts.data(), starts.size() * sizeof(int64_t), mcMemcpyHostToDevice, stream);
+    mcMemcpyAsync(steps_dev, steps.data(), steps.size() * sizeof(int64_t), mcMemcpyHostToDevice, stream);
+    mcMemcpyAsync(input_strides_dev, src_strides.data(), dims.size() * sizeof(int64_t), mcMemcpyHostToDevice,
+                    stream);
+    mcMemcpyAsync(output_strides_dev, dst_strides.data(), new_dims.size() * sizeof(int64_t), mcMemcpyHostToDevice,
+                    stream);
+
+    int threads_per_block = 256;
+    int num_blocks = (total_elements + threads_per_block - 1) / threads_per_block;
+
+    DispatchFunc<INFINI_ALL_TYPES>(
+        grad_output_dtype,
+        [=]<typename T>() {
+            SliceBackwardKernel<<<num_blocks, threads_per_block, 0, stream>>>(
+                static_cast<const T *>(grad_output->DataPtr()), static_cast<T *>(grad_input->DataPtr()), new_dims_dev,
+                starts_dev, steps_dev, input_strides_dev, output_strides_dev, num_dims, total_elements);
+        },
+        "MACA SliceBackward");
+
+    mcFreeAsync(new_dims_dev, stream);
+
+    return grad_input;
+}
+} // namespace infini_train::kernels::maca
+
+#define REGISTER_MACA_SLICE_KERNEL(kernel_name)                                                                        \
+    REGISTER_KERNEL(infini_train::Device::DeviceType::kMACA, kernel_name, infini_train::kernels::maca::kernel_name)
+
+REGISTER_MACA_SLICE_KERNEL(SliceForward)
+REGISTER_MACA_SLICE_KERNEL(SliceBackward)
+
+#undef REGISTER_MACA_SLICE_KERNEL
diff --git a/infini_train/src/kernels/maca/softmax.maca b/infini_train/src/kernels/maca/softmax.maca
new file mode 100644
index 00000000..28bce0ed
--- /dev/null
+++ b/infini_train/src/kernels/maca/softmax.maca
@@ -0,0 +1,225 @@
+#include <cmath>
+#include <cstddef>
+
+#include <cub/block/block_reduce.cuh>
+
+#include "glog/logging.h"
+
+#include "infini_train/include/common/maca/common_maca.h"
+#include "infini_train/include/common/maca/cub_compat.cuh"
+#include "infini_train/include/common/maca/kernel_helper.cuh"
+#include "infini_train/include/core/runtime/device_guard.h"
+#include "infini_train/include/dispatcher.h"
+#include "infini_train/include/tensor.h"
+
+#include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
+
+namespace infini_train::kernels::maca {
+template <size_t BLOCK_SIZE, typename T>
+__global__ void SoftmaxForwardKernel(T *output, const T *input, int64_t outer_size, int64_t axis_size,
+                                     int64_t inner_size) {
+    using BlockReduce = cub::BlockReduce<float, BLOCK_SIZE>;
+
+    __shared__ typename BlockReduce::TempStorage temp_storage_max;
+    __shared__ typename BlockReduce::TempStorage temp_storage_sum;
+    __shared__ float row_max;
+    __shared__ float row_sum;
+
+    const int64_t group = blockIdx.x;     // row of the grid
+    const int64_t inner_idx = blockIdx.y; // column of the grid
+    const int tid = threadIdx.x;
+
+    // calculate the maximum for each group
+    float thread_max = -INFINITY;
+    for (int64_t axis = tid; axis < axis_size; axis += BLOCK_SIZE) {
+        int64_t idx = (group * axis_size + axis) * inner_size + inner_idx;
+        thread_max = max(thread_max, common::maca::Cast<float>(input[idx]));
+    }
+    float block_max = BlockReduce(temp_storage_max).Reduce(thread_max, CubMaxOp());
+
+    if (tid == 0) {
+        row_max = block_max;
+    }
+    __syncthreads();
+
+    // calculate the sum of exponents
+    float thread_sum = 0;
+    for (int64_t axis = tid; axis < axis_size; axis += BLOCK_SIZE) {
+        int64_t idx = (group * axis_size + axis) * inner_size + inner_idx;
+        float exp_val = exp(common::maca::Cast<float>(input[idx]) - row_max);
+        output[idx] = common::maca::Cast<T>(exp_val);
+        thread_sum += exp_val;
+    }
+    float block_sum = BlockReduce(temp_storage_sum).Sum(thread_sum);
+
+    if (tid == 0) {
+        row_sum = block_sum;
+    }
+    __syncthreads();
+
+    // normalize
+    for (int64_t axis = tid; axis < axis_size; axis += BLOCK_SIZE) {
+        int64_t idx = (group * axis_size + axis) * inner_size + inner_idx;
+        output[idx] = common::maca::Cast<T>(common::maca::Cast<float>(output[idx]) / row_sum);
+    }
+}
+
+template <size_t BLOCK_SIZE, typename T>
+void LaunchForward(const std::shared_ptr<Tensor> &output, const std::shared_ptr<Tensor> &input, int64_t dim) {
+    const auto &input_dims = input->Dims();
+    int64_t outer_size = 1;
+    int64_t axis_size = input_dims[dim];
+    int64_t inner_size = 1;
+
+    for (int i = 0; i < dim; ++i) { outer_size *= input_dims[i]; };
+    for (int i = dim + 1; i < input_dims.size(); ++i) { inner_size *= input_dims[i]; };
+    if (axis_size == 0) {
+        LOG_LOC(INFO, "MACA softmax forward: 'input_dims[dim] == 0'");
+        return;
+    }
+    if (outer_size == 0) {
+        return;
+    }
+
+    T *output_ptr = static_cast<T *>(output->DataPtr());
+    const T *input_ptr = static_cast<const T *>(input->DataPtr());
+
+    if (BLOCK_SIZE > 1024) {
+        LOG_LOC(FATAL, "MACA softmax forward: 'BLOCK_SIZE used is larger than the max number of thread per block'");
+    }
+    dim3 block_dims(BLOCK_SIZE);
+    dim3 grid_dims(outer_size, inner_size);
+
+    auto device = output->GetDevice();
+    const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                                  infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                  ->maca_stream();
+    SoftmaxForwardKernel<BLOCK_SIZE, T>
+        <<<grid_dims, block_dims, 0, maca_stream>>>(output_ptr, input_ptr, outer_size, axis_size, inner_size);
+}
+
+std::shared_ptr<Tensor> SoftmaxForward(const std::shared_ptr<Tensor> &input, int64_t dim) {
+    auto dtype = input->Dtype();
+    const auto &input_dims = input->Dims();
+    dim = dim < 0 ? dim + input_dims.size() : dim;
+    CHECK(dim >= 0 && dim < input_dims.size());
+    auto output = std::make_shared<Tensor>(input_dims, dtype, input->GetDevice());
+
+    switch (dtype) {
+        DISPATCH_CASE(WRAP(LaunchForward<256, float>(output, input, dim);), DataType::kFLOAT32)
+        DISPATCH_CASE(WRAP(LaunchForward<256, __maca_bfloat16>(output, input, dim);), DataType::kBFLOAT16)
+    default:
+        LOG_LOC(FATAL, "MACA softmax forward: 'Unsupported data type'");
+    }
+    return output;
+}
+
+template <size_t BLOCK_SIZE, typename T>
+__global__ void SoftmaxBackwardKernel(T *grad_input, const T *grad_output, const T *output, int64_t outer_size,
+                                      int64_t axis_size, int64_t inner_size) {
+    using BlockReduce = cub::BlockReduce<float, BLOCK_SIZE>;
+
+    __shared__ typename BlockReduce::TempStorage temp_storage_sum;
+    __shared__ float row_sum;
+
+    const int64_t group = blockIdx.x;
+    const int64_t inner_idx = blockIdx.y;
+    const int tid = threadIdx.x;
+
+    // calculate the sum of the dot product of gradients
+    float thread_sum = 0;
+    for (int64_t axis = tid; axis < axis_size; axis += BLOCK_SIZE) {
+        const int64_t idx = (group * axis_size + axis) * inner_size + inner_idx;
+        thread_sum += common::maca::Cast<float>(grad_output[idx] * output[idx]);
+    }
+    float block_sum = BlockReduce(temp_storage_sum).Sum(thread_sum);
+
+    if (tid == 0) {
+        row_sum = block_sum;
+    }
+    __syncthreads();
+
+    // update the input gradient
+    for (int64_t axis = tid; axis < axis_size; axis += BLOCK_SIZE) {
+        const int64_t idx = (group * axis_size + axis) * inner_size + inner_idx;
+        grad_input[idx] = output[idx] * (grad_output[idx] - common::maca::Cast<T>(row_sum));
+    }
+}
+
+template <size_t BLOCK_SIZE, typename T>
+void LaunchBackward(const std::shared_ptr<Tensor> &grad_input, const std::shared_ptr<Tensor> &grad_output,
+                    const std::shared_ptr<Tensor> &output, int64_t dim) {
+    const auto &output_dims = output->Dims();
+    int64_t outer_size = 1;
+    int64_t axis_size = output_dims[dim];
+    int64_t inner_size = 1;
+
+    for (int i = 0; i < dim; ++i) { outer_size *= output_dims[i]; };
+    for (int i = dim + 1; i < output_dims.size(); ++i) { inner_size *= output_dims[i]; };
+    if (axis_size == 0) {
+        LOG_LOC(INFO, "MACA softmax backward: 'output_dims[dim] == 0'");
+        return;
+    }
+    if (outer_size == 0) {
+        return;
+    }
+
+    T *grad_input_ptr = static_cast<T *>(grad_input->DataPtr());
+    const T *grad_output_ptr = static_cast<const T *>(grad_output->DataPtr());
+    const T *output_ptr = static_cast<const T *>(output->DataPtr());
+
+    if (BLOCK_SIZE > 1024) {
+        LOG_LOC(FATAL, "MACA softmax backward: 'BLOCK_SIZE used is larger than the max number of thread per block'");
+    }
+    dim3 block(BLOCK_SIZE);
+    dim3 grid(outer_size, inner_size);
+
+    auto device = output->GetDevice();
+    const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                                  infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                  ->maca_stream();
+    SoftmaxBackwardKernel<BLOCK_SIZE, T><<<grid, block, 0, maca_stream>>>(grad_input_ptr, grad_output_ptr, output_ptr,
+                                                                          outer_size, axis_size, inner_size);
+}
+
+std::shared_ptr<Tensor> SoftmaxBackward(const std::shared_ptr<Tensor> &grad_output,
+                                        const std::shared_ptr<Tensor> &output, int64_t dim) {
+    auto grad_output_dtype = grad_output->Dtype();
+    auto output_dtype = output->Dtype();
+    DataType promoted_type = DispatchFunc<DataTypeList<INFINI_ALL_TYPES>, DataTypeList<INFINI_ALL_TYPES>>(
+        {grad_output_dtype, output_dtype},
+        [=]<typename Tgrad, typename Tout>() { return DataTypeMap_v<WidestType_t<Tgrad, Tout>>; },
+        "MACA SoftmaxBackward");
+
+    auto grad_output_promoted
+        = grad_output_dtype == promoted_type ? grad_output : std::make_shared<Tensor>(grad_output->To(promoted_type));
+    auto output_promoted = output_dtype == promoted_type ? output : std::make_shared<Tensor>(output->To(promoted_type));
+
+    const auto &output_dims = output->Dims();
+    dim = dim < 0 ? dim + output->Dims().size() : dim;
+    CHECK(dim >= 0 && dim < output->Dims().size());
+
+    auto grad_input = std::make_shared<Tensor>(output_dims, promoted_type, output->GetDevice());
+    DispatchFunc<INFINI_ALL_TYPES>(
+        promoted_type, [=]<typename T>() { grad_input->Fill<T>(0); }, "MACA SoftmaxBackward");
+
+    switch (promoted_type) {
+        DISPATCH_CASE(WRAP(LaunchBackward<256, float>(grad_input, grad_output_promoted, output_promoted, dim);),
+                      DataType::kFLOAT32)
+        DISPATCH_CASE(WRAP(LaunchBackward<256, __maca_bfloat16>(grad_input, grad_output_promoted, output_promoted, dim);),
+                      DataType::kBFLOAT16)
+    default:
+        LOG_LOC(FATAL, "MACA softmax backward: 'Unsupported data type'");
+    }
+
+    return grad_input;
+}
+} // namespace infini_train::kernels::maca
+
+#define REGISTER_MACA_SOFTMAX_KERNEL(kernel_name)                                                                      \
+    REGISTER_KERNEL(infini_train::Device::DeviceType::kMACA, kernel_name, infini_train::kernels::maca::kernel_name)
+
+REGISTER_MACA_SOFTMAX_KERNEL(SoftmaxForward)
+REGISTER_MACA_SOFTMAX_KERNEL(SoftmaxBackward)
+
+#undef REGISTER_MACA_SOFTMAX_KERNEL
diff --git a/infini_train/src/kernels/maca/split.maca b/infini_train/src/kernels/maca/split.maca
new file mode 100644
index 00000000..3480a19d
--- /dev/null
+++ b/infini_train/src/kernels/maca/split.maca
@@ -0,0 +1,181 @@
+#include <algorithm>
+#include <numeric>
+#include <vector>
+
+#include "infini_train/include/common/maca/common_maca.h"
+#include "infini_train/include/core/runtime/device_guard.h"
+#include "infini_train/include/dispatcher.h"
+#include "infini_train/include/tensor.h"
+
+#include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
+
+namespace infini_train::kernels::maca {
+template <typename T>
+__global__ void SplitForwardKernel(const T *input, T *output, int64_t N, int64_t H_in, int64_t H_out, int64_t W,
+                                   int64_t start_idx) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int total = N * H_out * W;
+
+    if (idx < total) {
+        int w = idx % W;
+        int h = (idx / W) % H_out;
+        int n = idx / (H_out * W);
+
+        int input_h = h + start_idx;
+        int input_idx = n * H_in * W + input_h * W + w;
+        int output_idx = n * H_out * W + h * W + w;
+
+        output[output_idx] = input[input_idx];
+    }
+}
+
+std::vector<std::shared_ptr<Tensor>> SplitForward(const std::shared_ptr<Tensor> &input, int64_t split_size, int dim) {
+    CHECK_GT(split_size, 0);
+    CHECK_GE(dim, 0) << "Currently we do not support negative dimension";
+    const auto &input_dims = input->Dims();
+    CHECK_LT(dim, input_dims.size());
+
+    std::vector<std::shared_ptr<Tensor>> outputs;
+    auto dtype = input->Dtype();
+
+    const int64_t N = std::accumulate(input_dims.begin(), input_dims.begin() + dim, 1, std::multiplies<int64_t>());
+    const int64_t W = std::accumulate(input_dims.begin() + dim + 1, input_dims.end(), 1, std::multiplies<int64_t>());
+    const int64_t H_in = input_dims[dim];
+
+    for (int64_t start = 0; start < H_in; start += split_size) {
+        auto output_dims = input_dims;
+        const int64_t H_out = std::min(split_size, H_in - start);
+        output_dims[dim] = H_out;
+
+        auto output = std::make_shared<Tensor>(output_dims, dtype, input->GetDevice());
+
+        int64_t total = N * H_out * W;
+        int threads_per_block = 256;
+        int num_blocks = (total + threads_per_block - 1) / threads_per_block;
+
+        auto device = input->GetDevice();
+        const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                                      infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                      ->maca_stream();
+
+        DispatchFunc<INFINI_ALL_TYPES>(
+            dtype,
+            [=]<typename T>() {
+                SplitForwardKernel<<<num_blocks, threads_per_block, 0, maca_stream>>>(
+                    static_cast<const T *>(input->DataPtr()), static_cast<T *>(output->DataPtr()), N, H_in, H_out, W,
+                    start);
+            },
+            "MACA SplitForward");
+
+        outputs.push_back(std::move(output));
+    }
+
+    return outputs;
+}
+
+template <typename T>
+__global__ void SplitBackwardKernel(const T *const *grad_outputs, T *grad_input, int64_t N, int64_t H_in, int64_t W,
+                                    int64_t split_size, int64_t num_splits, const int64_t *H_outs) {
+    int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t total = N * H_in * W;
+    if (idx >= total) {
+        return;
+    }
+
+    int64_t w = idx % W;
+    int64_t h = (idx / W) % H_in;
+    int64_t n = idx / (H_in * W);
+
+    int64_t split_idx = h / split_size;
+    if (split_idx >= num_splits) {
+        return;
+    }
+
+    int64_t H_out = H_outs[split_idx];
+    int64_t local_h = h - split_idx * split_size;
+
+    if (local_h >= H_out) {
+        return;
+    }
+
+    const T *grad_output = grad_outputs[split_idx];
+    T value = grad_output[(n * H_out + local_h) * W + w];
+    grad_input[(n * H_in + h) * W + w] = value;
+}
+
+template <typename T>
+std::shared_ptr<Tensor> LaunchSplitBackward(const std::vector<int64_t> &input_dims, int64_t split_size, int dim,
+                                            const std::vector<std::shared_ptr<Tensor>> &grad_outputs) {
+    CHECK_GT(split_size, 0);
+    CHECK_GE(dim, 0) << "Currently we do not support negative dimension";
+    CHECK_LT(dim, input_dims.size());
+
+    const auto &grad = grad_outputs[0];
+    auto dtype = grad->Dtype();
+    auto grad_input = std::make_shared<Tensor>(input_dims, dtype, grad->GetDevice());
+    grad_input->Fill<T>(0);
+
+    int64_t N = std::accumulate(input_dims.begin(), input_dims.begin() + dim, 1, std::multiplies<int64_t>());
+    int64_t W = std::accumulate(input_dims.begin() + dim + 1, input_dims.end(), 1, std::multiplies<int64_t>());
+    int64_t H_in = input_dims[dim];
+    int64_t num_splits = grad_outputs.size();
+
+    auto device = grad->GetDevice();
+    const auto &stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                             infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                             ->maca_stream();
+    // init the array of grad_output ptrs
+    std::vector<const T *> host_grad_output_ptrs;
+    for (const auto &grad_output : grad_outputs) {
+        host_grad_output_ptrs.push_back(static_cast<const T *>(grad_output->DataPtr()));
+    }
+
+    void *device_ptr;
+    const T **device_grad_output_ptrs;
+    int64_t *device_H_outs;
+    mcMallocAsync(&device_ptr, (sizeof(T *) + sizeof(int64_t)) * num_splits, stream);
+    device_grad_output_ptrs = (const T **)(device_ptr);
+    device_H_outs = reinterpret_cast<int64_t *>(device_grad_output_ptrs + num_splits);
+
+    mcMemcpyAsync(device_grad_output_ptrs, host_grad_output_ptrs.data(), sizeof(T *) * num_splits,
+                    mcMemcpyHostToDevice, stream);
+
+    // init H_out for each split
+    std::vector<int64_t> H_outs(num_splits);
+    for (int i = 0; i < num_splits; ++i) { H_outs[i] = std::min(split_size, H_in - i * split_size); }
+
+    mcMemcpyAsync(device_H_outs, H_outs.data(), sizeof(int64_t) * num_splits, mcMemcpyHostToDevice, stream);
+
+    int64_t total_elements = N * H_in * W;
+    int threads_per_block = 256;
+    int num_blocks = (total_elements + threads_per_block - 1) / threads_per_block;
+
+    SplitBackwardKernel<<<num_blocks, threads_per_block, 0, stream>>>(device_grad_output_ptrs,
+                                                                      static_cast<T *>(grad_input->DataPtr()), N, H_in,
+                                                                      W, split_size, num_splits, device_H_outs);
+
+    mcFreeAsync(device_ptr, stream);
+
+    return grad_input;
+}
+
+std::shared_ptr<Tensor> SplitBackward(const std::vector<int64_t> &input_dims, int64_t split_size, int dim,
+                                      const std::vector<std::shared_ptr<Tensor>> &grad_outputs) {
+    CHECK_GT(split_size, 0);
+    CHECK_GE(dim, 0) << "Currently we do not support negative dimension";
+    CHECK_LT(dim, input_dims.size());
+
+    return DispatchFunc<INFINI_ALL_TYPES>(
+        grad_outputs[0]->Dtype(),
+        [=]<typename T>() { return LaunchSplitBackward<T>(input_dims, split_size, dim, grad_outputs); },
+        "MACA SplitBackward");
+}
+} // namespace infini_train::kernels::maca
+
+#define REGISTER_MACA_SPLIT_KERNEL(kernel_name)                                                                        \
+    REGISTER_KERNEL(infini_train::Device::DeviceType::kMACA, kernel_name, infini_train::kernels::maca::kernel_name)
+
+REGISTER_MACA_SPLIT_KERNEL(SplitForward)
+REGISTER_MACA_SPLIT_KERNEL(SplitBackward)
+
+#undef REGISTER_MACA_SPLIT_KERNEL
diff --git a/infini_train/src/kernels/maca/stack.maca b/infini_train/src/kernels/maca/stack.maca
new file mode 100644
index 00000000..8c34f5f1
--- /dev/null
+++ b/infini_train/src/kernels/maca/stack.maca
@@ -0,0 +1,160 @@
+#include <algorithm>
+#include <memory>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+#include "glog/logging.h"
+
+#include "infini_train/include/common/maca/common_maca.h"
+#include "infini_train/include/core/runtime/device_guard.h"
+#include "infini_train/include/dispatcher.h"
+#include "infini_train/include/tensor.h"
+
+#include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
+
+namespace infini_train::kernels::maca {
+template <typename T>
+__global__ void StackForwardKernel(const T **inputs, T *output, int64_t N, int64_t D, int64_t num_inputs) {
+    int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t total = N * num_inputs * D;
+
+    if (idx >= total) {
+        return;
+    }
+
+    int64_t d = idx % D;
+    int64_t s = (idx / D) % num_inputs;
+    int64_t n = idx / (D * num_inputs);
+
+    const T *input = inputs[s];
+    output[idx] = input[n * D + d];
+}
+
+std::shared_ptr<Tensor> StackForward(const std::vector<std::shared_ptr<Tensor>> &inputs, int64_t dim) {
+    CHECK(!inputs.empty());
+
+    const auto &base_dims = inputs[0]->Dims();
+    auto dtype = inputs[0]->Dtype();
+    if (dim < 0) {
+        dim += base_dims.size() + 1;
+    }
+    CHECK_GE(dim, 0);
+    CHECK_LE(dim, base_dims.size());
+    for (const auto &input : inputs) { CHECK(input->Dims() == base_dims); }
+
+    std::vector<int64_t> out_dims = base_dims;
+    out_dims.insert(out_dims.begin() + dim, inputs.size());
+    auto output = std::make_shared<Tensor>(out_dims, dtype, inputs[0]->GetDevice());
+
+    const int64_t N = std::accumulate(base_dims.begin(), base_dims.begin() + dim, 1, std::multiplies<int64_t>());
+    const int64_t D = std::accumulate(base_dims.begin() + dim, base_dims.end(), 1, std::multiplies<int64_t>());
+    const int64_t num_inputs = inputs.size();
+
+    auto device = output->GetDevice();
+    const auto &stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                             infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                             ->maca_stream();
+
+    int64_t total = N * num_inputs * D;
+    int threads_per_block = 256;
+    int num_blocks = (total + threads_per_block - 1) / threads_per_block;
+
+    DispatchFunc<INFINI_ALL_TYPES>(
+        dtype,
+        [=]<typename T>() {
+            std::vector<const T *> host_input_ptrs;
+            for (const auto &t : inputs) { host_input_ptrs.push_back(static_cast<const T *>(t->DataPtr())); }
+
+            const T **device_input_ptrs;
+            mcMallocAsync(&device_input_ptrs, sizeof(T *) * num_inputs, stream);
+            mcMemcpyAsync(device_input_ptrs, host_input_ptrs.data(), sizeof(T *) * num_inputs, mcMemcpyHostToDevice,
+                            stream);
+
+            StackForwardKernel<<<num_blocks, threads_per_block, 0, stream>>>(
+                device_input_ptrs, static_cast<T *>(output->DataPtr()), N, D, num_inputs);
+
+            mcFreeAsync(device_input_ptrs, stream);
+        },
+        "MACA StackForward");
+
+    return output;
+}
+
+template <typename T>
+__global__ void StackBackwardKernel(const T *grad_output, T **grad_inputs, int64_t N, int64_t D, int64_t num_inputs) {
+    int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t total = N * num_inputs * D;
+
+    if (idx >= total) {
+        return;
+    }
+
+    int64_t d = idx % D;
+    int64_t s = (idx / D) % num_inputs;
+    int64_t n = idx / (D * num_inputs);
+
+    if (s < num_inputs) {
+        grad_inputs[s][n * D + d] = grad_output[idx];
+    }
+}
+
+std::vector<std::shared_ptr<Tensor>> StackBackward(const std::vector<int64_t> &input_dims, int64_t dim,
+                                                   const std::shared_ptr<Tensor> &grad_output) {
+    if (dim < 0) {
+        dim += input_dims.size() + 1;
+    }
+    const int64_t num_inputs = grad_output->Dims()[dim];
+    std::vector<int64_t> base_dims = grad_output->Dims();
+    base_dims.erase(base_dims.begin() + dim);
+
+    auto dtype = grad_output->Dtype();
+    std::vector<std::shared_ptr<Tensor>> grads;
+    for (int i = 0; i < num_inputs; ++i) {
+        auto t = std::make_shared<Tensor>(base_dims, dtype, grad_output->GetDevice());
+        DispatchFunc<INFINI_ALL_TYPES>(
+            dtype, [=]<typename T>() { t->Fill<T>(0); }, "MACA StackBackward");
+        grads.push_back(t);
+    }
+
+    int64_t N = std::accumulate(input_dims.begin(), input_dims.begin() + dim, 1, std::multiplies<int64_t>());
+    int64_t D = std::accumulate(input_dims.begin() + dim, input_dims.end(), 1, std::multiplies<int64_t>());
+
+    auto device = grad_output->GetDevice();
+    const auto &stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                             infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                             ->maca_stream();
+
+    int64_t total = N * num_inputs * D;
+    int threads_per_block = 256;
+    int num_blocks = (total + threads_per_block - 1) / threads_per_block;
+
+    DispatchFunc<INFINI_ALL_TYPES>(
+        dtype,
+        [=]<typename T>() {
+            std::vector<T *> host_ptrs;
+            for (auto &t : grads) { host_ptrs.push_back(static_cast<T *>(t->DataPtr())); }
+
+            T **device_ptrs;
+            mcMallocAsync(&device_ptrs, sizeof(T *) * num_inputs, stream);
+            mcMemcpyAsync(device_ptrs, host_ptrs.data(), sizeof(T *) * num_inputs, mcMemcpyHostToDevice, stream);
+
+            StackBackwardKernel<<<num_blocks, threads_per_block, 0, stream>>>(
+                static_cast<const T *>(grad_output->DataPtr()), device_ptrs, N, D, num_inputs);
+
+            mcFreeAsync(device_ptrs, stream);
+        },
+        "MACA StackBackward");
+
+    return grads;
+}
+
+} // namespace infini_train::kernels::maca
+
+#define REGISTER_MACA_STACK_KERNEL(kernel_name)                                                                        \
+    REGISTER_KERNEL(infini_train::Device::DeviceType::kMACA, kernel_name, infini_train::kernels::maca::kernel_name)
+
+REGISTER_MACA_STACK_KERNEL(StackForward)
+REGISTER_MACA_STACK_KERNEL(StackBackward)
+
+#undef REGISTER_MACA_STACK_KERNEL
diff --git a/infini_train/src/kernels/maca/transform.maca b/infini_train/src/kernels/maca/transform.maca
new file mode 100644
index 00000000..78628015
--- /dev/null
+++ b/infini_train/src/kernels/maca/transform.maca
@@ -0,0 +1,592 @@
+#include <algorithm>
+#include <functional>
+#include <memory>
+#include <numeric>
+#include <vector>
+
+#include "infini_train/include/common/maca/common_maca.h"
+#include "infini_train/include/common/maca/kernel_helper.cuh"
+#include "infini_train/include/core/runtime/device_guard.h"
+#include "infini_train/include/dispatcher.h"
+#include "infini_train/include/tensor.h"
+
+#include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
+
+namespace infini_train::kernels::maca {
+
+template <typename T>
+__global__ void TrilForwardKernel(const T *input, T *output, int rows, int cols, int64_t diagonal) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= rows * cols) {
+        return;
+    }
+
+    int row = idx / cols;
+    int col = idx % cols;
+
+    if (row - col + diagonal >= 0) {
+        output[idx] = input[idx];
+    } else {
+        output[idx] = T(0);
+    }
+}
+
+std::shared_ptr<Tensor> TrilForward(const std::shared_ptr<Tensor> &input, int64_t diagonal) {
+    CHECK_EQ(input->Dims().size(), 2);
+    int64_t rows = input->Dims()[0];
+    int64_t cols = input->Dims()[1];
+
+    auto output = std::make_shared<Tensor>(input->Dims(), input->Dtype(), input->GetDevice());
+
+    int threads_per_block = 256;
+    int num_blocks = (rows * cols + threads_per_block - 1) / threads_per_block;
+
+    auto device = input->GetDevice();
+    const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                                  infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                  ->maca_stream();
+
+    DispatchFunc<INFINI_ALL_TYPES>(
+        input->Dtype(),
+        [=]<typename T>() {
+            TrilForwardKernel<<<num_blocks, threads_per_block, 0, maca_stream>>>(
+                static_cast<T *>(input->DataPtr()), static_cast<T *>(output->DataPtr()), rows, cols, diagonal);
+        },
+        "MACA TrilForward");
+
+    return output;
+}
+
+template <typename T>
+__global__ void TrilBackwardKernel(const T *grad_output, T *grad_input, int rows, int cols, int64_t diagonal) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= rows * cols) {
+        return;
+    }
+
+    int row = idx / cols;
+    int col = idx % cols;
+
+    if (row - col + diagonal >= 0) {
+        grad_input[idx] = grad_output[idx];
+    } else {
+        grad_input[idx] = T(0);
+    }
+}
+
+std::shared_ptr<Tensor> TrilBackward(const std::shared_ptr<Tensor> &grad_output, int64_t diagonal) {
+    int rows = grad_output->Dims()[0];
+    int cols = grad_output->Dims()[1];
+
+    auto dtype = grad_output->Dtype();
+    auto grad_input = std::make_shared<Tensor>(grad_output->Dims(), dtype, grad_output->GetDevice());
+
+    int threads_per_block = 256;
+    int num_blocks = (rows * cols + threads_per_block - 1) / threads_per_block;
+
+    auto device = grad_output->GetDevice();
+    const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                                  infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                  ->maca_stream();
+
+    DispatchFunc<INFINI_ALL_TYPES>(
+        dtype,
+        [=]<typename T>() {
+            grad_input->Fill<T>(0);
+            TrilBackwardKernel<<<num_blocks, threads_per_block, 0, maca_stream>>>(
+                static_cast<const T *>(grad_output->DataPtr()), static_cast<T *>(grad_input->DataPtr()), rows, cols,
+                diagonal);
+        },
+        "MACA TrilBackward");
+
+    return grad_input;
+}
+
+template <typename T>
+__global__ void TriuForwardKernel(const T *input, T *output, int rows, int cols, int64_t diagonal) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= rows * cols) {
+        return;
+    }
+
+    int row = idx / cols;
+    int col = idx % cols;
+
+    if (row - col + diagonal <= 0) {
+        output[idx] = input[idx];
+    } else {
+        output[idx] = T(0);
+    }
+}
+
+std::shared_ptr<Tensor> TriuForward(const std::shared_ptr<Tensor> &input, int64_t diagonal) {
+    CHECK_EQ(input->Dims().size(), 2);
+    int64_t rows = input->Dims()[0];
+    int64_t cols = input->Dims()[1];
+
+    auto output = std::make_shared<Tensor>(input->Dims(), input->Dtype(), input->GetDevice());
+
+    int threads_per_block = 256;
+    int num_blocks = (rows * cols + threads_per_block - 1) / threads_per_block;
+
+    auto device = input->GetDevice();
+    const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                                  infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                  ->maca_stream();
+
+    DispatchFunc<INFINI_ALL_TYPES>(
+        input->Dtype(),
+        [=]<typename T>() {
+            TriuForwardKernel<<<num_blocks, threads_per_block, 0, maca_stream>>>(
+                static_cast<const T *>(input->DataPtr()), static_cast<T *>(output->DataPtr()), rows, cols, diagonal);
+        },
+        "MACA TriuForward");
+
+    return output;
+}
+
+template <typename T>
+__global__ void TriuBackwardKernel(const T *grad_output, T *grad_input, int rows, int cols, int64_t diagonal) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= rows * cols) {
+        return;
+    }
+
+    int row = idx / cols;
+    int col = idx % cols;
+
+    if (row - col + diagonal <= 0) {
+        grad_input[idx] = grad_output[idx];
+    } else {
+        grad_input[idx] = T(0);
+    }
+}
+
+std::shared_ptr<Tensor> TriuBackward(const std::shared_ptr<Tensor> &grad_output, int64_t diagonal) {
+    int rows = grad_output->Dims()[0];
+    int cols = grad_output->Dims()[1];
+
+    auto dtype = grad_output->Dtype();
+    auto grad_input = std::make_shared<Tensor>(grad_output->Dims(), dtype, grad_output->GetDevice());
+
+    int threads_per_block = 256;
+    int num_blocks = (rows * cols + threads_per_block - 1) / threads_per_block;
+    auto device = grad_output->GetDevice();
+    const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                                  infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                  ->maca_stream();
+
+    DispatchFunc<INFINI_ALL_TYPES>(
+        dtype,
+        [=]<typename T>() {
+            grad_input->Fill<T>(0);
+            TriuBackwardKernel<<<num_blocks, threads_per_block, 0, maca_stream>>>(
+                static_cast<const T *>(grad_output->DataPtr()), static_cast<T *>(grad_input->DataPtr()), rows, cols,
+                diagonal);
+        },
+        "MACA TriuBackward");
+
+    return grad_input;
+}
+
+template <typename T>
+__global__ void TransposeForwardKernel(const T *input, T *output, const int64_t *in_dims, const int64_t *in_strides,
+                                       const int64_t *out_strides, int64_t ndim, int64_t dim0, int64_t dim1,
+                                       int64_t num_elements) {
+    int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= num_elements) {
+        return;
+    }
+
+    int64_t remaining = idx;
+    // TODO(zbl): assume ndim <= 8 here
+    int64_t coords[8];
+
+    // 1. decode coord from output index
+    for (int i = 0; i < ndim; ++i) {
+        coords[i] = remaining / out_strides[i];
+        remaining %= out_strides[i];
+    }
+
+    // 2. swap the coordinates
+    int64_t tmp = coords[dim0];
+    coords[dim0] = coords[dim1];
+    coords[dim1] = tmp;
+
+    // 3. compute input flat index
+    int64_t in_flat_idx = 0;
+    for (int i = 0; i < ndim; ++i) { in_flat_idx += coords[i] * in_strides[i]; }
+
+    output[idx] = input[in_flat_idx];
+}
+
+std::shared_ptr<Tensor> TransposeForward(const std::shared_ptr<Tensor> &input, int64_t dim0, int64_t dim1) {
+    // TODO(zbl): assume ndim <= 8 here
+    CHECK_LE(input->Dims().size(), 8);
+    dim0 = dim0 < 0 ? dim0 + input->Dims().size() : dim0;
+    dim1 = dim1 < 0 ? dim1 + input->Dims().size() : dim1;
+    CHECK(dim0 >= 0 && dim0 < input->Dims().size() && dim1 >= 0 && dim1 < input->Dims().size());
+
+    auto in_dims = input->Dims();
+    std::vector<int64_t> out_dims = in_dims;
+    std::swap(out_dims[dim0], out_dims[dim1]);
+
+    auto dtype = input->Dtype();
+    auto output = std::make_shared<Tensor>(out_dims, dtype, input->GetDevice());
+    int64_t ndim = in_dims.size();
+    int64_t num_elements = output->NumElements();
+
+    // compute strides of in_dims and out_dims
+    std::vector<int64_t> in_strides(ndim, 1);
+    std::vector<int64_t> out_strides(ndim, 1);
+    for (int i = ndim - 2; i >= 0; --i) {
+        in_strides[i] = in_strides[i + 1] * in_dims[i + 1];
+        out_strides[i] = out_strides[i + 1] * out_dims[i + 1];
+    }
+
+    auto device = input->GetDevice();
+    const auto &stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                             infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                             ->maca_stream();
+
+    // Allocate device memory for dims and strides
+    // TODO(zbl): avoid using mcMalloc?
+    int64_t *device_buffer;
+    mcMallocAsync(&device_buffer, 3 * ndim * sizeof(int64_t), stream);
+
+    int64_t *in_dims_dev = device_buffer;
+    int64_t *in_strides_dev = device_buffer + ndim;
+    int64_t *out_strides_dev = device_buffer + 2 * ndim;
+
+    std::vector<int64_t> host_buffer;
+    host_buffer.insert(host_buffer.end(), in_dims.begin(), in_dims.end());
+    host_buffer.insert(host_buffer.end(), in_strides.begin(), in_strides.end());
+    host_buffer.insert(host_buffer.end(), out_strides.begin(), out_strides.end());
+
+    mcMemcpyAsync(device_buffer, host_buffer.data(), 3 * ndim * sizeof(int64_t), mcMemcpyHostToDevice, stream);
+
+    int threads_per_block = 256;
+    int num_blocks = (num_elements + threads_per_block - 1) / threads_per_block;
+
+    DispatchFunc<INFINI_ALL_TYPES>(
+        dtype,
+        [=]<typename T>() {
+            output->Fill<T>(0);
+            TransposeForwardKernel<<<num_blocks, threads_per_block, 0, stream>>>(
+                static_cast<const T *>(input->DataPtr()), static_cast<T *>(output->DataPtr()), in_dims_dev,
+                in_strides_dev, out_strides_dev, ndim, dim0, dim1, num_elements);
+        },
+        "MACA TransposeForward");
+
+    mcFreeAsync(device_buffer, stream);
+
+    return output;
+}
+
+std::shared_ptr<Tensor> TransposeBackward(const std::shared_ptr<Tensor> &grad_output, int64_t dim0, int64_t dim1) {
+    return TransposeForward(grad_output, dim1, dim0);
+}
+
+namespace {
+enum class MaskMode { kLead, kTail };
+
+static bool IsLeadMaskShape(const std::vector<int64_t> &in, const std::vector<int64_t> &mk) {
+    if (mk.empty() || in.empty()) {
+        return false;
+    }
+    if (mk.size() > in.size()) {
+        return false;
+    }
+    for (size_t d = 0; d < mk.size(); ++d) {
+        if (!(mk[d] == in[d] || mk[d] == 1)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+static bool IsTailMaskShape(const std::vector<int64_t> &in, const std::vector<int64_t> &mk) {
+    if (mk.size() > in.size()) {
+        return false;
+    }
+    size_t k = mk.size();
+    for (size_t i = 0; i < k; ++i) {
+        int64_t in_dim = in[in.size() - k + i];
+        int64_t mk_dim = mk[i];
+        if (!(mk_dim == in_dim || mk_dim == 1)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+static MaskMode DecideMaskMode(const std::vector<int64_t> &in, const std::vector<int64_t> &mk) {
+    bool lead = IsLeadMaskShape(in, mk);
+    bool tail = IsTailMaskShape(in, mk);
+    CHECK(lead || tail) << "Mask must align/broadcast to either leading or trailing axes.";
+    // By default mask along tailing dims
+    return tail ? MaskMode::kTail : MaskMode::kLead;
+}
+} // namespace
+
+template <typename T>
+__global__ void MaskForwardKernel(const T *input, const T *mask, T *output, T value, int batch_size, int mask_size) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < batch_size * mask_size) {
+        output[i] = (mask[i % mask_size] == T(1)) ? value : input[i];
+    }
+}
+
+template <typename T>
+__global__ void MaskLeadsForwardKernel(const T *input, const T *mask, T *output, T value, int rows, int inner) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < rows * inner) {
+        output[i] = (mask[i / inner] == T(1)) ? value : input[i];
+    }
+}
+
+std::shared_ptr<Tensor> MaskForward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &mask,
+                                    float value) {
+    auto input_shape = input->Dims();
+    auto mask_shape = mask->Dims();
+    auto dtype = input->Dtype();
+    auto mask_casted = mask->Dtype() == dtype ? mask : std::make_shared<Tensor>(mask->To(dtype));
+    // TODO(zbl): support bool mask
+    CHECK_EQ(static_cast<int>(dtype), static_cast<int>(mask_casted->Dtype()))
+        << "For now, input/mask dtypes must match.";
+
+    MaskMode mode = DecideMaskMode(input_shape, mask_shape);
+
+    auto output = std::make_shared<Tensor>(input_shape, dtype, input->GetDevice());
+    auto device = output->GetDevice();
+    const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                                  infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                  ->maca_stream();
+
+    int threads_per_block = 256;
+
+    if (mode == MaskMode::kLead) {
+        int64_t rows = mask->NumElements();
+        int64_t inner = input->NumElements() / rows;
+        int num_blocks = static_cast<int>((input->NumElements() + threads_per_block - 1) / threads_per_block);
+
+        DispatchFunc<INFINI_ALL_TYPES>(
+            dtype,
+            [=]<typename T>() {
+                MaskLeadsForwardKernel<T><<<num_blocks, threads_per_block, 0, maca_stream>>>(
+                    static_cast<const T *>(input->DataPtr()), static_cast<const T *>(mask->DataPtr()),
+                    static_cast<T *>(output->DataPtr()), common::maca::Cast<T>(value), rows, inner);
+            },
+            "MACA MaskForward(rows)");
+    } else { // kTail
+        int64_t mask_size = mask->NumElements();
+        int64_t batch_size = input->NumElements() / mask_size;
+        int num_blocks = static_cast<int>((input->NumElements() + threads_per_block - 1) / threads_per_block);
+
+        DispatchFunc<INFINI_ALL_TYPES>(
+            dtype,
+            [=]<typename T>() {
+                MaskForwardKernel<T><<<num_blocks, threads_per_block, 0, maca_stream>>>(
+                    static_cast<const T *>(input->DataPtr()), static_cast<const T *>(mask_casted->DataPtr()),
+                    static_cast<T *>(output->DataPtr()), common::maca::Cast<T>(value), static_cast<int>(batch_size),
+                    static_cast<int>(mask_size));
+            },
+            "MACA MaskForward(tail)");
+    }
+
+    return output;
+}
+
+template <typename T>
+__global__ void MaskBackwardKernel(const T *grad_output, const T *mask, T *grad_input, int batch_size, int mask_size) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < batch_size * mask_size) {
+        grad_input[i] = (mask[i % mask_size] == T(1)) ? T(0) : grad_output[i];
+    }
+}
+
+template <typename T>
+__global__ void MaskLeadsBackwardKernel(const T *grad_output, const T *mask, T *grad_input, int rows, int inner) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < rows * inner) {
+        grad_input[i] = (mask[i / inner] == T(1)) ? T(0) : grad_output[i];
+    }
+}
+
+std::shared_ptr<Tensor> MaskBackward(const std::shared_ptr<Tensor> &grad_output, const std::shared_ptr<Tensor> &mask) {
+    auto output_shape = grad_output->Dims();
+    auto mask_shape = mask->Dims();
+    auto dtype = grad_output->Dtype();
+    auto mask_casted = std::make_shared<Tensor>(mask->To(dtype));
+
+    MaskMode mode = DecideMaskMode(output_shape, mask_shape);
+
+    auto grad_input = std::make_shared<Tensor>(output_shape, dtype, grad_output->GetDevice());
+    auto device = grad_output->GetDevice();
+    const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                                  infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                  ->maca_stream();
+
+    int threads_per_block = 256;
+
+    if (mode == MaskMode::kLead) {
+        int64_t rows = mask->NumElements();
+        int64_t inner = grad_output->NumElements() / rows;
+        int num_blocks = static_cast<int>((grad_output->NumElements() + threads_per_block - 1) / threads_per_block);
+
+        DispatchFunc<INFINI_ALL_TYPES>(
+            dtype,
+            [=]<typename T>() {
+                grad_input->Fill<T>(0);
+                MaskLeadsBackwardKernel<T><<<num_blocks, threads_per_block, 0, maca_stream>>>(
+                    static_cast<const T *>(grad_output->DataPtr()), static_cast<const T *>(mask_casted->DataPtr()),
+                    static_cast<T *>(grad_input->DataPtr()), rows, inner);
+            },
+            "MACA MaskBackward(rows)");
+    } else { // kTail
+        int64_t mask_size = mask->NumElements();
+        int64_t batch_size = grad_output->NumElements() / mask_size;
+        int num_blocks = static_cast<int>((grad_output->NumElements() + threads_per_block - 1) / threads_per_block);
+
+        DispatchFunc<INFINI_ALL_TYPES>(
+            dtype,
+            [=]<typename T>() {
+                grad_input->Fill<T>(0);
+                MaskBackwardKernel<T><<<num_blocks, threads_per_block, 0, maca_stream>>>(
+                    static_cast<const T *>(grad_output->DataPtr()), static_cast<const T *>(mask_casted->DataPtr()),
+                    static_cast<T *>(grad_input->DataPtr()), static_cast<int>(batch_size), static_cast<int>(mask_size));
+            },
+            "MACA MaskBackward(tail)");
+    }
+
+    return grad_input;
+}
+
+template <typename T>
+__global__ void RepeatInterleaveForwardKernel(const T *input, T *output, int64_t outer, int64_t dim_size, int64_t inner,
+                                              int64_t repeat) {
+    int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t total = outer * dim_size * repeat * inner;
+    if (idx >= total) {
+        return;
+    }
+
+    int64_t i = idx / inner;
+    int64_t j = idx % inner;
+
+    int64_t o = i / (dim_size * repeat);
+    int64_t di = (i / repeat) % dim_size;
+
+    output[idx] = input[(o * dim_size + di) * inner + j];
+}
+
+std::shared_ptr<Tensor> RepeatInterleaveForward(const std::shared_ptr<Tensor> &input, int64_t repeat, int64_t dim) {
+    CHECK_GT(repeat, 0);
+    CHECK_GE(dim, 0);
+    CHECK_LT(dim, input->Dims().size());
+
+    const auto &input_dims = input->Dims();
+    const int64_t outer = std::accumulate(input_dims.begin(), input_dims.begin() + dim, 1, std::multiplies<int64_t>());
+    const int64_t inner
+        = std::accumulate(input_dims.begin() + dim + 1, input_dims.end(), 1, std::multiplies<int64_t>());
+    const int64_t dim_size = input_dims[dim];
+
+    std::vector<int64_t> output_dims = input_dims;
+    output_dims[dim] = dim_size * repeat;
+    auto output = std::make_shared<Tensor>(output_dims, input->Dtype(), input->GetDevice());
+
+    int64_t total_elements = outer * dim_size * repeat * inner;
+    int threads_per_block = 256;
+    int num_blocks = (total_elements + threads_per_block - 1) / threads_per_block;
+    auto device = input->GetDevice();
+    const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                                  infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                  ->maca_stream();
+
+    DispatchFunc<INFINI_ALL_TYPES>(
+        input->Dtype(),
+        [=]<typename T>() {
+            RepeatInterleaveForwardKernel<<<num_blocks, threads_per_block, 0, maca_stream>>>(
+                static_cast<const T *>(input->DataPtr()), static_cast<T *>(output->DataPtr()), outer, dim_size, inner,
+                repeat);
+        },
+        "MACA RepeatInterleaveForward");
+
+    return output;
+}
+
+template <typename T>
+__global__ void RepeatInterleaveBackwardKernel(const T *grad_output, T *grad_input, int64_t outer, int64_t dim_size,
+                                               int64_t inner, int64_t repeat) {
+    int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t total = outer * dim_size * inner;
+    if (idx >= total) {
+        return;
+    }
+
+    int64_t i = idx / inner;
+    int64_t j = idx % inner;
+
+    int64_t o = i / dim_size;
+    int64_t di = i % dim_size;
+
+    T sum = T(0);
+    for (int64_t r = 0; r < repeat; ++r) {
+        int64_t out_idx = ((o * dim_size * repeat + di * repeat + r) * inner) + j;
+        sum += grad_output[out_idx];
+    }
+    grad_input[idx] = sum;
+}
+
+std::shared_ptr<Tensor> RepeatInterleaveBackward(const std::shared_ptr<Tensor> &grad_output,
+                                                 const std::vector<int64_t> &input_dims, int64_t dim) {
+    CHECK_GE(dim, 0);
+    CHECK_LT(dim, input_dims.size());
+
+    const int64_t outer = std::accumulate(input_dims.begin(), input_dims.begin() + dim, 1, std::multiplies<int64_t>());
+    const int64_t inner
+        = std::accumulate(input_dims.begin() + dim + 1, input_dims.end(), 1, std::multiplies<int64_t>());
+    const int64_t dim_size = input_dims[dim];
+
+    int64_t repeat = grad_output->Dims()[dim] / dim_size;
+    CHECK_EQ(grad_output->Dims()[dim], dim_size * repeat);
+
+    auto grad_input = std::make_shared<Tensor>(input_dims, grad_output->Dtype(), grad_output->GetDevice());
+
+    int64_t total_elements = outer * dim_size * inner;
+    int threads_per_block = 256;
+    int num_blocks = (total_elements + threads_per_block - 1) / threads_per_block;
+    auto device = grad_output->GetDevice();
+    const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                                  infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                  ->maca_stream();
+
+    DispatchFunc<INFINI_ALL_TYPES>(
+        grad_output->Dtype(),
+        [=]<typename T>() {
+            grad_input->Fill<T>(0);
+            RepeatInterleaveBackwardKernel<<<num_blocks, threads_per_block, 0, maca_stream>>>(
+                static_cast<const T *>(grad_output->DataPtr()), static_cast<T *>(grad_input->DataPtr()), outer,
+                dim_size, inner, repeat);
+        },
+        "MACA RepeatInterleaveBackward");
+
+    return grad_input;
+}
+} // namespace infini_train::kernels::maca
+
+#define REGISTER_MACA_TRANSFORM_KERNEL(kernel_name)                                                                    \
+    REGISTER_KERNEL(infini_train::Device::DeviceType::kMACA, kernel_name, infini_train::kernels::maca::kernel_name)
+
+REGISTER_MACA_TRANSFORM_KERNEL(TrilForward)
+REGISTER_MACA_TRANSFORM_KERNEL(TrilBackward)
+REGISTER_MACA_TRANSFORM_KERNEL(TriuForward)
+REGISTER_MACA_TRANSFORM_KERNEL(TriuBackward)
+REGISTER_MACA_TRANSFORM_KERNEL(TransposeForward)
+REGISTER_MACA_TRANSFORM_KERNEL(TransposeBackward)
+REGISTER_MACA_TRANSFORM_KERNEL(MaskForward)
+REGISTER_MACA_TRANSFORM_KERNEL(MaskBackward)
+REGISTER_MACA_TRANSFORM_KERNEL(RepeatInterleaveForward)
+REGISTER_MACA_TRANSFORM_KERNEL(RepeatInterleaveBackward)
+
+#undef REGISTER_MACA_TRANSFORM_KERNEL
diff --git a/infini_train/src/kernels/maca/vocab_parallel_cross_entropy.maca b/infini_train/src/kernels/maca/vocab_parallel_cross_entropy.maca
new file mode 100644
index 00000000..d79780b7
--- /dev/null
+++ b/infini_train/src/kernels/maca/vocab_parallel_cross_entropy.maca
@@ -0,0 +1,125 @@
+#include <cmath>
+
+#include <cub/block/block_reduce.cuh>
+
+#include "infini_train/include/common/maca/common_maca.h"
+#include "infini_train/include/common/maca/kernel_helper.cuh"
+#include "infini_train/include/core/runtime/device_guard.h"
+#include "infini_train/include/dispatcher.h"
+#include "infini_train/include/tensor.h"
+
+#include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
+
+namespace infini_train::kernels::maca {
+
+template <size_t BLOCK_SIZE, typename Tinput, typename Tmask, typename Tindex>
+__global__ void
+VocabParallelCrossEntropyBackwardKernel(const Tinput *__restrict__ softmax_local,   // [rows, V_local]
+                                        Tinput *__restrict__ grad_input,            // [rows, V_local]
+                                        const Tindex *__restrict__ masked_target,   // [rows]
+                                        const Tmask *__restrict__ target_mask_row,  // [rows]，0/1
+                                        const Tmask *__restrict__ valid_mask_local, // [rows, V_local] or [1, V_local]
+                                        const Tinput *__restrict__ dloss_buf,       // [1] or [rows]
+                                        int rows, int V_local,
+                                        int dloss_is_scalar,             // 1=scalaer，0=by row
+                                        float one_minus_label_smoothing, // 1 - label_smoothing
+                                        float smoothing_term             // label_smoothing / vocab_size_original
+) {
+    const int r = blockIdx.x;
+    if (r >= rows) {
+        return;
+    }
+
+    const float dm = common::maca::Cast<float>(dloss_is_scalar ? dloss_buf[0] : dloss_buf[r]);
+    const float vm_row = 1.0f - common::maca::Cast<float>(target_mask_row[r]);
+    const float row_scale = dm * one_minus_label_smoothing * vm_row;
+    const Tindex t = masked_target[r];
+
+    for (int j = threadIdx.x; j < V_local; j += BLOCK_SIZE) {
+        const int idx = r * V_local + j;
+
+        const float s = common::maca::Cast<float>(softmax_local[idx]);
+        const float vm = common::maca::Cast<float>(valid_mask_local[j]);
+
+        float grad = dm * s;
+
+        if (static_cast<int64_t>(t) >= 0 && j == static_cast<int>(t)) {
+            grad -= row_scale;
+        }
+
+        grad -= dm * smoothing_term * vm;
+        grad *= vm;
+
+        grad_input[idx] = common::maca::Cast<Tinput>(grad);
+    }
+}
+
+std::shared_ptr<Tensor>
+VocabParallelCrossEntropyBackward(const std::shared_ptr<Tensor> &grad_output,      // [rows]
+                                  const std::shared_ptr<Tensor> &softmax_local,    // [rows, V_local]
+                                  const std::shared_ptr<Tensor> &target_mask,      // [rows]
+                                  const std::shared_ptr<Tensor> &masked_target,    // [rows]，int64
+                                  const std::shared_ptr<Tensor> &valid_mask_local, // [1, V_local]
+                                  const int64_t vocab_size_local, const int64_t vocab_size_original,
+                                  float label_smoothing) {
+
+    const int64_t rows = softmax_local->NumElements() / vocab_size_local;
+    CHECK_EQ(masked_target->NumElements(), rows);
+    CHECK_EQ(target_mask->NumElements(), rows);
+    CHECK_EQ(valid_mask_local->NumElements(), vocab_size_local);
+
+    int dloss_is_scalar = 0;
+    if (grad_output->Dims().size() == 0) {
+        dloss_is_scalar = 1;
+    } else {
+        CHECK(grad_output->NumElements() == rows || grad_output->NumElements() == 1)
+            << "grad_output must be scalar or length rows";
+        dloss_is_scalar = (grad_output->NumElements() == 1);
+    }
+
+    auto device = grad_output->GetDevice();
+    const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
+                                  infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
+                                  ->maca_stream();
+
+    // logits should be [rows, V_local]
+    auto grad_input = std::make_shared<Tensor>(softmax_local->Dims(), softmax_local->Dtype(), device);
+
+    const float one_minus_label_smoothing = 1.0f - label_smoothing;
+    const float smoothing_term = (label_smoothing > 0.f && vocab_size_original > 0)
+                                   ? (label_smoothing / static_cast<float>(vocab_size_original))
+                                   : 0.0f;
+
+    constexpr int threads_per_block = 256;
+    const int num_blocks = static_cast<int>(rows);
+
+    DispatchFunc<DataTypeList<DataType::kUINT8, DataType::kINT64>, DataTypeList<INFINI_ALL_FLOATING_TYPES>>(
+        {masked_target->Dtype(), softmax_local->Dtype()},
+        [=]<typename Tindex, typename Tinput>() {
+            using Tmask = Tinput;
+
+            const Tinput *softmax_ptr = static_cast<const Tinput *>(softmax_local->DataPtr());
+            const Tmask *tmask_ptr = static_cast<const Tmask *>(target_mask->DataPtr());
+            const Tmask *vml_ptr = static_cast<const Tmask *>(valid_mask_local->DataPtr());
+            const Tindex *mtarget_ptr = static_cast<const Tindex *>(masked_target->DataPtr());
+            const Tinput *grad_output_ptr = static_cast<const Tinput *>(grad_output->DataPtr());
+            Tinput *grad_input_ptr = static_cast<Tinput *>(grad_input->DataPtr());
+
+            VocabParallelCrossEntropyBackwardKernel<threads_per_block, Tinput, Tmask, Tindex>
+                <<<num_blocks, threads_per_block, 0, maca_stream>>>(softmax_ptr, grad_input_ptr, mtarget_ptr, tmask_ptr,
+                                                                    vml_ptr, grad_output_ptr, static_cast<int>(rows),
+                                                                    static_cast<int>(vocab_size_local), dloss_is_scalar,
+                                                                    one_minus_label_smoothing, smoothing_term);
+        },
+        "MACA VocabParallelCrossEntropyBackward");
+
+    return grad_input;
+}
+} // namespace infini_train::kernels::maca
+
+#define REGISTER_MACA_VOCAB_PARALLEL_CROSS_ENTROPY_KERNEL(kernel_name)                                                 \
+    REGISTER_KERNEL(infini_train::Device::DeviceType::kMACA, kernel_name, infini_train::kernels::maca::kernel_name)
+
+REGISTER_MACA_VOCAB_PARALLEL_CROSS_ENTROPY_KERNEL(VocabParallelCrossEntropyBackward)
+
+#undef REGISTER_MACA_CROSS_ENTROPY_KERNEL

From 3db42b3687453e7dde1ab96eaf1c14bcfc3cc1b4 Mon Sep 17 00:00:00 2001
From: kilinchange <kilinchange@163.com>
Date: Tue, 14 Apr 2026 04:21:04 +0000
Subject: [PATCH 03/12] fix: resolve MACA build and runtime issues to enable
 GPT-2 training
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CMakeLists.txt:
- Pre-set HAVE_MODE_T/HAVE_SSIZE_T and their sentinel variables
  (HAVE_HAVE_MODE_T/HAVE_HAVE_SSIZE_T) before add_subdirectory(glog),
  since mxcc cmake feature-detection probes cannot find standard POSIX
  headers; without the sentinels check_type_size re-runs and overwrites
  the pre-set values, causing glog to emit conflicting fallback typedefs
- Add BUILD_TESTING=OFF to skip glog unit tests (-fPIE unsupported by mxcc)
- Add BUILD_SHARED_LIBS=OFF to build glog as a static library; mxcc
  defaults to hidden symbol visibility, making libglog.so export nothing

datatype.h:
- Add is_bfloat16<T> and is_fp16<T> type traits with USE_CUDA/USE_MACA
  specializations, needed by common_cpu.h Cast and init.cc ARANGE_CASE

common/cpu/common_cpu.h:
- Route fp16/bf16 destinations through float in Cast<T>(), avoiding
  ambiguous integer→__half/__maca_bfloat16 conversion on MACA

kernels/maca/{stack,concat,slice,transform,elementwise,split,gather}.maca:
- Add reinterpret_cast<void **> to all mcMallocAsync(&ptr, ...) calls;
  MACA's mcMallocAsync requires void** but typed pointers were passed
- Fix mcDevAttrMultiProcessorCount → mcDeviceAttributeMultiProcessorCount
  in elementwise.maca (correct MACA enum name)

optimizer.cc:
- Change Fill<T>(0) → Fill<T>(0.f) for Adam m/v initialization;
  __half(0) is ambiguous on MACA (only float/double ctors available)

nn/init.cc:
- Replace std::iota + static_cast<TYPE>(start) in ARANGE_CASE with an
  explicit loop via static_cast<float> to avoid ambiguous integer→fp16/
  bf16 conversion for kBFLOAT16/kFLOAT16 cases

example/gpt2/main.cc:
- Add kDeviceMACA constant, update --device validator to accept "maca",
  and add Device::DeviceType::kMACA branch in device selection
---
 CMakeLists.txt                                | 40 +++++++++++++++++++
 example/gpt2/main.cc                          | 14 +++++--
 infini_train/src/kernels/maca/concat.maca     |  8 ++--
 .../src/kernels/maca/elementwise.maca         |  6 +--
 infini_train/src/kernels/maca/gather.maca     |  4 +-
 infini_train/src/kernels/maca/slice.maca      |  4 +-
 infini_train/src/kernels/maca/split.maca      |  2 +-
 infini_train/src/kernels/maca/stack.maca      |  4 +-
 infini_train/src/kernels/maca/transform.maca  |  2 +-
 9 files changed, 66 insertions(+), 18 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 262b97f3..2de651e8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,6 +44,46 @@ include_directories(${gflags_SOURCE_DIR}/include)
 # glog
 set(WITH_GFLAGS OFF CACHE BOOL "Disable glog finding system gflags" FORCE)
 set(WITH_GTEST OFF CACHE BOOL "Disable glog finding system gtest" FORCE)
+set(BUILD_TESTING OFF CACHE BOOL "Disable glog unit tests" FORCE)
+# Build glog as a static lib so its symbols are always visible at link time.
+# Under mxcc the default symbol visibility is hidden, which causes the shared
+# libglog.so to export no symbols and produces "undefined reference" errors.
+set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build glog as static library" FORCE)
+
+# Under MACA/mxcc, cmake's feature-detection test compilations do not find
+# standard POSIX system headers (mxcc has a non-standard sysroot probe path).
+# Pre-set glog's HAVE_* cache variables so that glog skips its fallback type /
+# symbol definitions, which would otherwise conflict with the real system
+# headers during the actual build.
+if(USE_MACA)
+  set(HAVE_SYS_TYPES_H    1 CACHE INTERNAL "")
+  set(HAVE_UNISTD_H        1 CACHE INTERNAL "")
+  set(HAVE_DLFCN_H         1 CACHE INTERNAL "")
+  set(HAVE_GLOB_H          1 CACHE INTERNAL "")
+  set(HAVE_PWD_H           1 CACHE INTERNAL "")
+  set(HAVE_SYS_TIME_H      1 CACHE INTERNAL "")
+  set(HAVE_SYS_UTSNAME_H   1 CACHE INTERNAL "")
+  set(HAVE_SYS_WAIT_H      1 CACHE INTERNAL "")
+  set(HAVE_SYS_SYSCALL_H   1 CACHE INTERNAL "")
+  set(HAVE_SYSLOG_H        1 CACHE INTERNAL "")
+  set(HAVE_UCONTEXT_H      1 CACHE INTERNAL "")
+  # check_type_size() uses two internal variables: the size value and a sentinel
+  # "HAVE_HAVE_<VAR>" that marks the check as done. Pre-setting only the value
+  # is insufficient — the sentinel must also be set so the check skips entirely.
+  set(HAVE_MODE_T          4    CACHE INTERNAL "")  # 4 bytes on Linux
+  set(HAVE_HAVE_MODE_T     TRUE CACHE INTERNAL "")
+  set(HAVE_SSIZE_T         8    CACHE INTERNAL "")  # 8 bytes on 64-bit Linux
+  set(HAVE_HAVE_SSIZE_T    TRUE CACHE INTERNAL "")
+  set(HAVE_PREAD           1 CACHE INTERNAL "")
+  set(HAVE_PWRITE          1 CACHE INTERNAL "")
+  set(HAVE_POSIX_FADVISE   1 CACHE INTERNAL "")
+  set(HAVE_SIGACTION       1 CACHE INTERNAL "")
+  set(HAVE_SIGALTSTACK     1 CACHE INTERNAL "")
+  set(HAVE_FCNTL           1 CACHE INTERNAL "")
+  set(HAVE_DLADDR          1 CACHE INTERNAL "")
+  set(HAVE___CXA_DEMANGLE  1 CACHE INTERNAL "")
+endif()
+
 add_subdirectory(third_party/glog)
 include_directories(${glog_SOURCE_DIR}/src)
 
diff --git a/example/gpt2/main.cc b/example/gpt2/main.cc
index f69736f5..5b70a57b 100644
--- a/example/gpt2/main.cc
+++ b/example/gpt2/main.cc
@@ -98,6 +98,7 @@ const std::unordered_set<std::string> kSupportedModels
     = {"gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl", "d12", "d24", "d36", "d48"};
 constexpr char kDeviceCPU[] = "cpu";
 constexpr char kDeviceCUDA[] = "cuda";
+constexpr char kDeviceMACA[] = "maca";
 constexpr char kDtypeFP32[] = "float32";
 constexpr char kDtypeBF16[] = "bfloat16";
 
@@ -112,8 +113,9 @@ const std::unordered_map<std::string, nn::TransformerConfig> kModelToConfigs = {
 } // namespace
 
 DEFINE_validator(model, [](const char *, const std::string &value) { return kSupportedModels.contains(value); });
-DEFINE_validator(device,
-                 [](const char *, const std::string &value) { return value == kDeviceCPU || value == kDeviceCUDA; });
+DEFINE_validator(device, [](const char *, const std::string &value) {
+    return value == kDeviceCPU || value == kDeviceCUDA || value == kDeviceMACA;
+});
 
 void Train(const nn::parallel::Rank &rank) {
     using namespace nn::parallel;
@@ -169,7 +171,13 @@ void Train(const nn::parallel::Rank &rank) {
             nn::parallel::pp_rank = pp_rank;
         }
     } else {
-        device = FLAGS_device == kDeviceCPU ? Device() : Device(Device::DeviceType::kCUDA, 0);
+        if (FLAGS_device == kDeviceCPU) {
+            device = Device();
+        } else if (FLAGS_device == kDeviceMACA) {
+            device = Device(Device::DeviceType::kMACA, 0);
+        } else {
+            device = Device(Device::DeviceType::kCUDA, 0);
+        }
     }
 
     // calculate gradient accumulation from the desired total batch size and the current run configuration
diff --git a/infini_train/src/kernels/maca/concat.maca b/infini_train/src/kernels/maca/concat.maca
index 42807308..baa82346 100644
--- a/infini_train/src/kernels/maca/concat.maca
+++ b/infini_train/src/kernels/maca/concat.maca
@@ -112,11 +112,11 @@ std::shared_ptr<Tensor> ConcatForward(const std::vector<std::shared_ptr<Tensor>>
             const T **device_input_ptrs = nullptr;
             int64_t *device_offsets = nullptr;
 
-            MACA_CHECK(mcMallocAsync(&device_input_ptrs, sizeof(T *) * num_inputs, stream));
+            MACA_CHECK(mcMallocAsync(reinterpret_cast<void **>(&device_input_ptrs), sizeof(T *) * num_inputs, stream));
             MACA_CHECK(mcMemcpyAsync(device_input_ptrs, host_input_ptrs.data(), sizeof(T *) * num_inputs,
                                        mcMemcpyHostToDevice, stream));
 
-            MACA_CHECK(mcMallocAsync(&device_offsets, sizeof(int64_t) * (num_inputs + 1), stream));
+            MACA_CHECK(mcMallocAsync(reinterpret_cast<void **>(&device_offsets), sizeof(int64_t) * (num_inputs + 1), stream));
             MACA_CHECK(mcMemcpyAsync(device_offsets, host_offsets.data(), sizeof(int64_t) * (num_inputs + 1),
                                        mcMemcpyHostToDevice, stream));
 
@@ -218,11 +218,11 @@ std::vector<std::shared_ptr<Tensor>> ConcatBackward(const std::shared_ptr<Tensor
             T **device_ptrs = nullptr;
             int64_t *device_offsets = nullptr;
 
-            MACA_CHECK(mcMallocAsync(&device_ptrs, sizeof(T *) * num_inputs, stream));
+            MACA_CHECK(mcMallocAsync(reinterpret_cast<void **>(&device_ptrs), sizeof(T *) * num_inputs, stream));
             MACA_CHECK(mcMemcpyAsync(device_ptrs, host_ptrs.data(), sizeof(T *) * num_inputs, mcMemcpyHostToDevice,
                                        stream));
 
-            MACA_CHECK(mcMallocAsync(&device_offsets, sizeof(int64_t) * (num_inputs + 1), stream));
+            MACA_CHECK(mcMallocAsync(reinterpret_cast<void **>(&device_offsets), sizeof(int64_t) * (num_inputs + 1), stream));
             MACA_CHECK(mcMemcpyAsync(device_offsets, host_offsets.data(), sizeof(int64_t) * (num_inputs + 1),
                                        mcMemcpyHostToDevice, stream));
 
diff --git a/infini_train/src/kernels/maca/elementwise.maca b/infini_train/src/kernels/maca/elementwise.maca
index e90d79a0..c760af74 100644
--- a/infini_train/src/kernels/maca/elementwise.maca
+++ b/infini_train/src/kernels/maca/elementwise.maca
@@ -427,7 +427,7 @@ void BinaryBackwardBhistLaunch(FuncA fn_a, FuncB fn_b, T *outA, T *outB, const T
 
     // Workspace layout: [grid, K] floats.
     float *work = nullptr;
-    MACA_CHECK(mcMallocAsync(&work, static_cast<size_t>(grid) * static_cast<size_t>(K) * sizeof(float), stream));
+    MACA_CHECK(mcMallocAsync(reinterpret_cast<void **>(&work), static_cast<size_t>(grid) * static_cast<size_t>(K) * sizeof(float), stream));
 
     // Pass 1: per-block histogram accumulation.
     const size_t smem_bytes = static_cast<size_t>(K + (K >> 5)) * sizeof(float);
@@ -439,7 +439,7 @@ void BinaryBackwardBhistLaunch(FuncA fn_a, FuncB fn_b, T *outA, T *outB, const T
     int dev = 0;
     int sm_count = 0;
     MACA_CHECK(mcGetDevice(&dev));
-    MACA_CHECK(mcDeviceGetAttribute(&sm_count, mcDevAttrMultiProcessorCount, dev));
+    MACA_CHECK(mcDeviceGetAttribute(&sm_count, mcDeviceAttributeMultiProcessorCount, dev));
 
     const int RED_THREADS = 256;
     const int oneD_blocks = (K + RED_THREADS - 1) / RED_THREADS;
@@ -457,7 +457,7 @@ void BinaryBackwardBhistLaunch(FuncA fn_a, FuncB fn_b, T *outA, T *outB, const T
         // 2D tiling path: slice the workspace and accumulate using float atomics.
         constexpr int kTileHeight = 128; // rows per CTA; tune between 128 and 256 if needed
         float *outB_accum = nullptr;
-        MACA_CHECK(mcMallocAsync(&outB_accum, static_cast<size_t>(K) * sizeof(float), stream));
+        MACA_CHECK(mcMallocAsync(reinterpret_cast<void **>(&outB_accum), static_cast<size_t>(K) * sizeof(float), stream));
         MACA_CHECK(mcMemsetAsync(outB_accum, 0, static_cast<size_t>(K) * sizeof(float), stream));
 
         const dim3 rblock(RED_THREADS, 1, 1);
diff --git a/infini_train/src/kernels/maca/gather.maca b/infini_train/src/kernels/maca/gather.maca
index a7a6d04b..90aba330 100644
--- a/infini_train/src/kernels/maca/gather.maca
+++ b/infini_train/src/kernels/maca/gather.maca
@@ -84,7 +84,7 @@ std::shared_ptr<Tensor> IndexGatherForward(const std::shared_ptr<Tensor> &input,
     const int64_t gather_dim_size = in_dims[dim];
 
     int64_t *dev_buf = nullptr;
-    MACA_CHECK(mcMallocAsync(&dev_buf, (3 * num_dims) * sizeof(int64_t), stream));
+    MACA_CHECK(mcMallocAsync(reinterpret_cast<void **>(&dev_buf), (3 * num_dims) * sizeof(int64_t), stream));
     int64_t *out_dims_dev = dev_buf + 0 * num_dims;
     int64_t *in_strides_dev = dev_buf + 1 * num_dims;
     int64_t *out_strides_dev = dev_buf + 2 * num_dims;
@@ -193,7 +193,7 @@ std::shared_ptr<Tensor> IndexGatherBackward(const std::shared_ptr<Tensor> &grad_
                              infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
                              ->maca_stream();
 
-    MACA_CHECK(mcMallocAsync(&dev_buf, total_i64 * sizeof(int64_t), stream));
+    MACA_CHECK(mcMallocAsync(reinterpret_cast<void **>(&dev_buf), total_i64 * sizeof(int64_t), stream));
     int64_t *out_dims_dev = dev_buf;
     int64_t *in_strides_dev = out_dims_dev + n_out;
     int64_t *out_strides_dev = in_strides_dev + n_in_strides;
diff --git a/infini_train/src/kernels/maca/slice.maca b/infini_train/src/kernels/maca/slice.maca
index 18b2a0c3..d7b3697a 100644
--- a/infini_train/src/kernels/maca/slice.maca
+++ b/infini_train/src/kernels/maca/slice.maca
@@ -73,7 +73,7 @@ std::shared_ptr<Tensor> SliceForward(const std::shared_ptr<Tensor> &input, const
                              infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
                              ->maca_stream();
 
-    mcMallocAsync(&new_dims_dev,
+    mcMallocAsync(reinterpret_cast<void **>(&new_dims_dev),
                     (ends.size() + starts.size() + steps.size() + dims.size() + new_dims.size()) * sizeof(int64_t),
                     stream);
     starts_dev = new_dims_dev + ends.size();
@@ -167,7 +167,7 @@ std::shared_ptr<Tensor> SliceBackward(const std::shared_ptr<Tensor> &grad_output
     const auto &stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
                              infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
                              ->maca_stream();
-    mcMallocAsync(&new_dims_dev,
+    mcMallocAsync(reinterpret_cast<void **>(&new_dims_dev),
                     (ends.size() + starts.size() + steps.size() + dims.size() + new_dims.size()) * sizeof(int64_t),
                     stream);
     starts_dev = new_dims_dev + ends.size();
diff --git a/infini_train/src/kernels/maca/split.maca b/infini_train/src/kernels/maca/split.maca
index 3480a19d..fee7a8ca 100644
--- a/infini_train/src/kernels/maca/split.maca
+++ b/infini_train/src/kernels/maca/split.maca
@@ -133,7 +133,7 @@ std::shared_ptr<Tensor> LaunchSplitBackward(const std::vector<int64_t> &input_di
     void *device_ptr;
     const T **device_grad_output_ptrs;
     int64_t *device_H_outs;
-    mcMallocAsync(&device_ptr, (sizeof(T *) + sizeof(int64_t)) * num_splits, stream);
+    mcMallocAsync(reinterpret_cast<void **>(&device_ptr), (sizeof(T *) + sizeof(int64_t)) * num_splits, stream);
     device_grad_output_ptrs = (const T **)(device_ptr);
     device_H_outs = reinterpret_cast<int64_t *>(device_grad_output_ptrs + num_splits);
 
diff --git a/infini_train/src/kernels/maca/stack.maca b/infini_train/src/kernels/maca/stack.maca
index 8c34f5f1..e6c67039 100644
--- a/infini_train/src/kernels/maca/stack.maca
+++ b/infini_train/src/kernels/maca/stack.maca
@@ -67,7 +67,7 @@ std::shared_ptr<Tensor> StackForward(const std::vector<std::shared_ptr<Tensor>>
             for (const auto &t : inputs) { host_input_ptrs.push_back(static_cast<const T *>(t->DataPtr())); }
 
             const T **device_input_ptrs;
-            mcMallocAsync(&device_input_ptrs, sizeof(T *) * num_inputs, stream);
+            mcMallocAsync(reinterpret_cast<void **>(&device_input_ptrs), sizeof(T *) * num_inputs, stream);
             mcMemcpyAsync(device_input_ptrs, host_input_ptrs.data(), sizeof(T *) * num_inputs, mcMemcpyHostToDevice,
                             stream);
 
@@ -136,7 +136,7 @@ std::vector<std::shared_ptr<Tensor>> StackBackward(const std::vector<int64_t> &i
             for (auto &t : grads) { host_ptrs.push_back(static_cast<T *>(t->DataPtr())); }
 
             T **device_ptrs;
-            mcMallocAsync(&device_ptrs, sizeof(T *) * num_inputs, stream);
+            mcMallocAsync(reinterpret_cast<void **>(&device_ptrs), sizeof(T *) * num_inputs, stream);
             mcMemcpyAsync(device_ptrs, host_ptrs.data(), sizeof(T *) * num_inputs, mcMemcpyHostToDevice, stream);
 
             StackBackwardKernel<<<num_blocks, threads_per_block, 0, stream>>>(
diff --git a/infini_train/src/kernels/maca/transform.maca b/infini_train/src/kernels/maca/transform.maca
index 78628015..d092a859 100644
--- a/infini_train/src/kernels/maca/transform.maca
+++ b/infini_train/src/kernels/maca/transform.maca
@@ -252,7 +252,7 @@ std::shared_ptr<Tensor> TransposeForward(const std::shared_ptr<Tensor> &input, i
     // Allocate device memory for dims and strides
     // TODO(zbl): avoid using mcMalloc?
     int64_t *device_buffer;
-    mcMallocAsync(&device_buffer, 3 * ndim * sizeof(int64_t), stream);
+    mcMallocAsync(reinterpret_cast<void **>(&device_buffer), 3 * ndim * sizeof(int64_t), stream);
 
     int64_t *in_dims_dev = device_buffer;
     int64_t *in_strides_dev = device_buffer + ndim;

From 91d6ad38df06053a3cf21ca03e7443601af47e09 Mon Sep 17 00:00:00 2001
From: kilinchange <kilinchange@163.com>
Date: Thu, 16 Apr 2026 08:06:21 +0000
Subject: [PATCH 04/12] refactor: simplify CMakeLists changes and temporarily
 bypass the hardcoded distributed logic in main.cc

---
 CMakeLists.txt         | 51 +++++-------------------------------------
 example/gpt2/main.cc   |  4 +++-
 example/llama3/main.cc | 18 +++++++++++----
 3 files changed, 22 insertions(+), 51 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2de651e8..7c9fea7b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,23 +9,19 @@ option(USE_OMP "Use OpenMP as backend for Eigen" ON)
 option(USE_NCCL "Build project for distributed running on CUDA using NCCL" ON)
 option(USE_MCCL "Build project for distributed running on MACA using MCCL" ON)
 
-# ------------------------------------------------------------------------------
-# MACA toolchain override (must happen before project())
-# ------------------------------------------------------------------------------
-# When targeting MetaX MACA, the C/C++ compiler must be mxcc so that .maca
-# sources and device code can be compiled by the MACA toolchain.
+project(infini_train VERSION 0.5.0 LANGUAGES CXX)
+
+# Switch to mxcc after project() so that third-party libs (glog, gflags) are
+# configured with the host compiler and their feature-detection checks pass.
 if(USE_MACA)
   set(MACA_PATH $ENV{MACA_PATH})
   if(NOT MACA_PATH)
-    message(FATAL_ERROR "USE_MACA=ON but environment variable MACA_PATH is not set. "
-                        "Please export MACA_PATH (e.g. /opt/maca) before configuring.")
+    message(FATAL_ERROR "USE_MACA=ON but environment variable MACA_PATH is not set.")
   endif()
   set(CMAKE_C_COMPILER   "${MACA_PATH}/mxgpu_llvm/bin/mxcc")
   set(CMAKE_CXX_COMPILER "${MACA_PATH}/mxgpu_llvm/bin/mxcc")
 endif()
 
-project(infini_train VERSION 0.5.0 LANGUAGES CXX)
-
 set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
@@ -45,45 +41,8 @@ include_directories(${gflags_SOURCE_DIR}/include)
 set(WITH_GFLAGS OFF CACHE BOOL "Disable glog finding system gflags" FORCE)
 set(WITH_GTEST OFF CACHE BOOL "Disable glog finding system gtest" FORCE)
 set(BUILD_TESTING OFF CACHE BOOL "Disable glog unit tests" FORCE)
-# Build glog as a static lib so its symbols are always visible at link time.
-# Under mxcc the default symbol visibility is hidden, which causes the shared
-# libglog.so to export no symbols and produces "undefined reference" errors.
 set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build glog as static library" FORCE)
 
-# Under MACA/mxcc, cmake's feature-detection test compilations do not find
-# standard POSIX system headers (mxcc has a non-standard sysroot probe path).
-# Pre-set glog's HAVE_* cache variables so that glog skips its fallback type /
-# symbol definitions, which would otherwise conflict with the real system
-# headers during the actual build.
-if(USE_MACA)
-  set(HAVE_SYS_TYPES_H    1 CACHE INTERNAL "")
-  set(HAVE_UNISTD_H        1 CACHE INTERNAL "")
-  set(HAVE_DLFCN_H         1 CACHE INTERNAL "")
-  set(HAVE_GLOB_H          1 CACHE INTERNAL "")
-  set(HAVE_PWD_H           1 CACHE INTERNAL "")
-  set(HAVE_SYS_TIME_H      1 CACHE INTERNAL "")
-  set(HAVE_SYS_UTSNAME_H   1 CACHE INTERNAL "")
-  set(HAVE_SYS_WAIT_H      1 CACHE INTERNAL "")
-  set(HAVE_SYS_SYSCALL_H   1 CACHE INTERNAL "")
-  set(HAVE_SYSLOG_H        1 CACHE INTERNAL "")
-  set(HAVE_UCONTEXT_H      1 CACHE INTERNAL "")
-  # check_type_size() uses two internal variables: the size value and a sentinel
-  # "HAVE_HAVE_<VAR>" that marks the check as done. Pre-setting only the value
-  # is insufficient — the sentinel must also be set so the check skips entirely.
-  set(HAVE_MODE_T          4    CACHE INTERNAL "")  # 4 bytes on Linux
-  set(HAVE_HAVE_MODE_T     TRUE CACHE INTERNAL "")
-  set(HAVE_SSIZE_T         8    CACHE INTERNAL "")  # 8 bytes on 64-bit Linux
-  set(HAVE_HAVE_SSIZE_T    TRUE CACHE INTERNAL "")
-  set(HAVE_PREAD           1 CACHE INTERNAL "")
-  set(HAVE_PWRITE          1 CACHE INTERNAL "")
-  set(HAVE_POSIX_FADVISE   1 CACHE INTERNAL "")
-  set(HAVE_SIGACTION       1 CACHE INTERNAL "")
-  set(HAVE_SIGALTSTACK     1 CACHE INTERNAL "")
-  set(HAVE_FCNTL           1 CACHE INTERNAL "")
-  set(HAVE_DLADDR          1 CACHE INTERNAL "")
-  set(HAVE___CXA_DEMANGLE  1 CACHE INTERNAL "")
-endif()
-
 add_subdirectory(third_party/glog)
 include_directories(${glog_SOURCE_DIR}/src)
 
diff --git a/example/gpt2/main.cc b/example/gpt2/main.cc
index 5b70a57b..2c5bad5c 100644
--- a/example/gpt2/main.cc
+++ b/example/gpt2/main.cc
@@ -146,7 +146,9 @@ void Train(const nn::parallel::Rank &rank) {
     const ProcessGroup *pp_pg = nullptr;
 
     if (rank.IsParallel()) {
-        device = Device(Device::DeviceType::kCUDA, rank.thread_rank());
+        auto parallel_device_type
+            = (FLAGS_device == kDeviceMACA) ? Device::DeviceType::kMACA : Device::DeviceType::kCUDA;
+        device = Device(parallel_device_type, rank.thread_rank());
         auto *pg_factory = ProcessGroupFactory::Instance(device.type());
 
         if (ddp_world_size > 1) {
diff --git a/example/llama3/main.cc b/example/llama3/main.cc
index da9a1027..8949bda3 100644
--- a/example/llama3/main.cc
+++ b/example/llama3/main.cc
@@ -93,13 +93,15 @@ namespace {
 const std::unordered_set<std::string> kSupportedModels = {"llama3"};
 constexpr char kDeviceCPU[] = "cpu";
 constexpr char kDeviceCUDA[] = "cuda";
+constexpr char kDeviceMACA[] = "maca";
 constexpr char kDtypeFP32[] = "float32";
 constexpr char kDtypeBF16[] = "bfloat16";
 } // namespace
 
 DEFINE_validator(model, [](const char *, const std::string &value) { return kSupportedModels.contains(value); });
-DEFINE_validator(device,
-                 [](const char *, const std::string &value) { return value == kDeviceCPU || value == kDeviceCUDA; });
+DEFINE_validator(device, [](const char *, const std::string &value) {
+    return value == kDeviceCPU || value == kDeviceCUDA || value == kDeviceMACA;
+});
 
 void Train(const nn::parallel::Rank &rank) {
     using namespace nn::parallel;
@@ -129,7 +131,9 @@ void Train(const nn::parallel::Rank &rank) {
     const ProcessGroup *pp_pg = nullptr;
 
     if (rank.IsParallel()) {
-        device = Device(Device::DeviceType::kCUDA, rank.thread_rank());
+        auto parallel_device_type
+            = (FLAGS_device == kDeviceMACA) ? Device::DeviceType::kMACA : Device::DeviceType::kCUDA;
+        device = Device(parallel_device_type, rank.thread_rank());
         auto *pg_factory = ProcessGroupFactory::Instance(device.type());
 
         if (ddp_world_size > 1) {
@@ -154,7 +158,13 @@ void Train(const nn::parallel::Rank &rank) {
             nn::parallel::pp_rank = pp_rank;
         }
     } else {
-        device = FLAGS_device == kDeviceCPU ? Device() : Device(Device::DeviceType::kCUDA, 0);
+        if (FLAGS_device == kDeviceCPU) {
+            device = Device();
+        } else if (FLAGS_device == kDeviceMACA) {
+            device = Device(Device::DeviceType::kMACA, 0);
+        } else {
+            device = Device(Device::DeviceType::kCUDA, 0);
+        }
     }
 
     // calculate gradient accumulation from the desired total batch size and the current run configuration

From d170454a1b6c9d3226dbab6deb047787ad23721d Mon Sep 17 00:00:00 2001
From: kilinchange <kilinchange@163.com>
Date: Mon, 20 Apr 2026 06:55:42 +0000
Subject: [PATCH 05/12] refactor(maca): adapt MACA kernels to new dtype
 dispatch and Scalar APIs

Port MACA backend to master's backend-explicit dtype registration:

- Add src/core/runtime/maca/maca_dispatch.h: register __half / __maca_bfloat16
  via BackendTypeMap<kMACA, kFLOAT16/kBFLOAT16>, declare
  INFINI_REGISTER_STANDARD_BACKEND_TYPES(kMACA), and expose DispatchMacaFunc /
  MacaTypeMap mirroring the CUDA side.
- Replace every DispatchFunc<...>/WidestType_t/DataTypeMap_v site across 18
  MACA kernels with DispatchMacaFunc / PromoteDataTypes.
- Replace Tensor::Fill<T>(0) template calls with Fill(0) to match the new
  Scalar-taking Tensor::Fill API.
- fill.maca: route Scalar::to<T> through common::maca::Cast<T>(scalar.to<float>())
  for __maca_bfloat16/__half to avoid ambiguous static_cast from integer
  Scalar kinds (see scalar.h TODO).
---
 CMakeLists.txt                                | 51 +++++++++++++++++--
 example/gpt2/main.cc                          | 19 +++++++
 example/llama3/main.cc                        | 16 ++++++
 .../src/core/runtime/maca/maca_dispatch.h     | 51 +++++++++++++++++++
 .../src/kernels/maca/accumulate_grad.maca     |  5 +-
 infini_train/src/kernels/maca/cast.maca       |  3 +-
 infini_train/src/kernels/maca/comm.maca       |  2 +-
 infini_train/src/kernels/maca/concat.maca     |  9 ++--
 .../src/kernels/maca/cross_entropy.maca       |  7 +--
 .../src/kernels/maca/elementwise.maca         | 25 ++++-----
 infini_train/src/kernels/maca/embedding.maca  |  7 +--
 infini_train/src/kernels/maca/fill.maca       | 21 ++++++--
 infini_train/src/kernels/maca/gather.maca     |  9 ++--
 infini_train/src/kernels/maca/layernorm.maca  | 15 +++---
 infini_train/src/kernels/maca/linear.maca     | 15 +++---
 infini_train/src/kernels/maca/outer.maca      | 11 ++--
 infini_train/src/kernels/maca/reduction.maca  |  7 +--
 infini_train/src/kernels/maca/slice.maca      | 13 ++---
 infini_train/src/kernels/maca/softmax.maca    | 10 ++--
 infini_train/src/kernels/maca/split.maca      |  7 +--
 infini_train/src/kernels/maca/stack.maca      |  9 ++--
 infini_train/src/kernels/maca/transform.maca  | 35 ++++++-------
 .../maca/vocab_parallel_cross_entropy.maca    |  3 +-
 scripts/run_models_and_profile.bash           | 23 +++++++--
 24 files changed, 265 insertions(+), 108 deletions(-)
 create mode 100644 infini_train/src/core/runtime/maca/maca_dispatch.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7c9fea7b..2de651e8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,19 +9,23 @@ option(USE_OMP "Use OpenMP as backend for Eigen" ON)
 option(USE_NCCL "Build project for distributed running on CUDA using NCCL" ON)
 option(USE_MCCL "Build project for distributed running on MACA using MCCL" ON)
 
-project(infini_train VERSION 0.5.0 LANGUAGES CXX)
-
-# Switch to mxcc after project() so that third-party libs (glog, gflags) are
-# configured with the host compiler and their feature-detection checks pass.
+# ------------------------------------------------------------------------------
+# MACA toolchain override (must happen before project())
+# ------------------------------------------------------------------------------
+# When targeting MetaX MACA, the C/C++ compiler must be mxcc so that .maca
+# sources and device code can be compiled by the MACA toolchain.
 if(USE_MACA)
   set(MACA_PATH $ENV{MACA_PATH})
   if(NOT MACA_PATH)
-    message(FATAL_ERROR "USE_MACA=ON but environment variable MACA_PATH is not set.")
+    message(FATAL_ERROR "USE_MACA=ON but environment variable MACA_PATH is not set. "
+                        "Please export MACA_PATH (e.g. /opt/maca) before configuring.")
   endif()
   set(CMAKE_C_COMPILER   "${MACA_PATH}/mxgpu_llvm/bin/mxcc")
   set(CMAKE_CXX_COMPILER "${MACA_PATH}/mxgpu_llvm/bin/mxcc")
 endif()
 
+project(infini_train VERSION 0.5.0 LANGUAGES CXX)
+
 set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
@@ -41,8 +45,45 @@ include_directories(${gflags_SOURCE_DIR}/include)
 set(WITH_GFLAGS OFF CACHE BOOL "Disable glog finding system gflags" FORCE)
 set(WITH_GTEST OFF CACHE BOOL "Disable glog finding system gtest" FORCE)
 set(BUILD_TESTING OFF CACHE BOOL "Disable glog unit tests" FORCE)
+# Build glog as a static lib so its symbols are always visible at link time.
+# Under mxcc the default symbol visibility is hidden, which causes the shared
+# libglog.so to export no symbols and produces "undefined reference" errors.
 set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build glog as static library" FORCE)
 
+# Under MACA/mxcc, cmake's feature-detection test compilations do not find
+# standard POSIX system headers (mxcc has a non-standard sysroot probe path).
+# Pre-set glog's HAVE_* cache variables so that glog skips its fallback type /
+# symbol definitions, which would otherwise conflict with the real system
+# headers during the actual build.
+if(USE_MACA)
+  set(HAVE_SYS_TYPES_H    1 CACHE INTERNAL "")
+  set(HAVE_UNISTD_H        1 CACHE INTERNAL "")
+  set(HAVE_DLFCN_H         1 CACHE INTERNAL "")
+  set(HAVE_GLOB_H          1 CACHE INTERNAL "")
+  set(HAVE_PWD_H           1 CACHE INTERNAL "")
+  set(HAVE_SYS_TIME_H      1 CACHE INTERNAL "")
+  set(HAVE_SYS_UTSNAME_H   1 CACHE INTERNAL "")
+  set(HAVE_SYS_WAIT_H      1 CACHE INTERNAL "")
+  set(HAVE_SYS_SYSCALL_H   1 CACHE INTERNAL "")
+  set(HAVE_SYSLOG_H        1 CACHE INTERNAL "")
+  set(HAVE_UCONTEXT_H      1 CACHE INTERNAL "")
+  # check_type_size() uses two internal variables: the size value and a sentinel
+  # "HAVE_HAVE_<VAR>" that marks the check as done. Pre-setting only the value
+  # is insufficient — the sentinel must also be set so the check skips entirely.
+  set(HAVE_MODE_T          4    CACHE INTERNAL "")  # 4 bytes on Linux
+  set(HAVE_HAVE_MODE_T     TRUE CACHE INTERNAL "")
+  set(HAVE_SSIZE_T         8    CACHE INTERNAL "")  # 8 bytes on 64-bit Linux
+  set(HAVE_HAVE_SSIZE_T    TRUE CACHE INTERNAL "")
+  set(HAVE_PREAD           1 CACHE INTERNAL "")
+  set(HAVE_PWRITE          1 CACHE INTERNAL "")
+  set(HAVE_POSIX_FADVISE   1 CACHE INTERNAL "")
+  set(HAVE_SIGACTION       1 CACHE INTERNAL "")
+  set(HAVE_SIGALTSTACK     1 CACHE INTERNAL "")
+  set(HAVE_FCNTL           1 CACHE INTERNAL "")
+  set(HAVE_DLADDR          1 CACHE INTERNAL "")
+  set(HAVE___CXA_DEMANGLE  1 CACHE INTERNAL "")
+endif()
+
 add_subdirectory(third_party/glog)
 include_directories(${glog_SOURCE_DIR}/src)
 
diff --git a/example/gpt2/main.cc b/example/gpt2/main.cc
index 2c5bad5c..b9eac0ff 100644
--- a/example/gpt2/main.cc
+++ b/example/gpt2/main.cc
@@ -29,6 +29,9 @@
 #ifdef PROFILE_MODE
 #include "infini_train/include/profiler.h"
 #endif
+#ifdef USE_MACA
+#include "infini_train/src/core/runtime/maca/maca_guard_impl.h"
+#endif
 #include "infini_train/include/nn/parallel/utils.h"
 #include "infini_train/include/utils/global_module_hook_registry.h"
 #include "infini_train/include/utils/precision_check_config.h"
@@ -452,12 +455,28 @@ void Train(const nn::parallel::Rank &rank) {
     Profiler::Instance().Report("gpt2.report", Profiler::SortBy::DeviceTimePercentage);
     Profiler::Instance().PrintRecords("gpt2.records.log");
 #endif
+
+    // On MACA, flush all pending mcFreeAsync operations so that ATU entries for
+    // activation/gradient tensors from this step are released before the next
+    // forward pass begins.  Without this, the ATU (address-translation unit)
+    // accumulates deferred frees across steps and becomes full, causing
+    // xnack(0x8) ATU-fault crashes in CastKernel and other large-tensor kernels.
+    if (device.type() == Device::DeviceType::kMACA) {
+        impl->SynchronizeDevice(device);
+    }
 }
 
 int main(int argc, char *argv[]) {
     gflags::ParseCommandLineFlags(&argc, &argv, true);
     google::InitGoogleLogging(argv[0]);
 
+    // On MACA, when TP > 1 disable P2P to prevent MCCL communication-ordering
+    // deadlocks and P2P teardown crashes.  Must be set before any mcclCommInitAll
+    // call (i.e. before threads that create ProcessGroups are spawned).
+    if (FLAGS_device == kDeviceMACA && FLAGS_tensor_parallel > 1) {
+        setenv("MACA_P2P_DISABLE", "1", 1);
+    }
+
     auto precision_config = utils::PrecisionCheckConfig::Parse(FLAGS_precision_check);
     nn::parallel::global::InitAllEnv(FLAGS_nthread_per_process, FLAGS_tensor_parallel, FLAGS_sequence_parallel,
                                      FLAGS_pipeline_parallel, FLAGS_virtual_pipeline_parallel);
diff --git a/example/llama3/main.cc b/example/llama3/main.cc
index 8949bda3..f4773829 100644
--- a/example/llama3/main.cc
+++ b/example/llama3/main.cc
@@ -427,12 +427,28 @@ void Train(const nn::parallel::Rank &rank) {
     Profiler::Instance().Report("llama3.report", Profiler::SortBy::DeviceTimePercentage);
     Profiler::Instance().PrintRecords("llama3.records.log");
 #endif
+
+    // On MACA, flush all pending mcFreeAsync operations so that ATU entries for
+    // activation/gradient tensors from this step are released before the next
+    // forward pass begins.  Without this, the ATU (address-translation unit)
+    // accumulates deferred frees across steps and becomes full, causing
+    // xnack(0x8) ATU-fault crashes in CastKernel and other large-tensor kernels.
+    if (device.type() == Device::DeviceType::kMACA) {
+        impl->SynchronizeDevice(device);
+    }
 }
 
 int main(int argc, char *argv[]) {
     gflags::ParseCommandLineFlags(&argc, &argv, true);
     google::InitGoogleLogging(argv[0]);
 
+    // On MACA, when TP > 1 disable P2P to prevent MCCL communication-ordering
+    // deadlocks and P2P teardown crashes.  Must be set before any mcclCommInitAll
+    // call (i.e. before threads that create ProcessGroups are spawned).
+    if (FLAGS_device == kDeviceMACA && FLAGS_tensor_parallel > 1) {
+        setenv("MACA_P2P_DISABLE", "1", 1);
+    }
+
     auto precision_config = utils::PrecisionCheckConfig::Parse(FLAGS_precision_check);
     nn::parallel::global::InitAllEnv(FLAGS_nthread_per_process, FLAGS_tensor_parallel, FLAGS_sequence_parallel,
                                      FLAGS_pipeline_parallel, FLAGS_virtual_pipeline_parallel);
diff --git a/infini_train/src/core/runtime/maca/maca_dispatch.h b/infini_train/src/core/runtime/maca/maca_dispatch.h
new file mode 100644
index 00000000..32783a86
--- /dev/null
+++ b/infini_train/src/core/runtime/maca/maca_dispatch.h
@@ -0,0 +1,51 @@
+#pragma once
+
+#include <utility>
+#include <vector>
+
+#include <common/maca_bfloat16.h>
+#include <common/maca_fp16.h>
+
+#include "infini_train/include/core/backend_type_map.h"
+#include "infini_train/include/dtype_dispatch.h"
+
+// -----------------------------------------------------------------------------
+// MACA low-precision BackendTypeMap specializations:
+//   FP16 -> __half, BF16 -> __maca_bfloat16
+// -----------------------------------------------------------------------------
+namespace infini_train::core {
+template <> struct BackendTypeMap<Device::DeviceType::kMACA, DataType::kFLOAT16> {
+    using type = __half;
+};
+
+template <> struct BackendTypeMap<Device::DeviceType::kMACA, DataType::kBFLOAT16> {
+    using type = __maca_bfloat16;
+};
+} // namespace infini_train::core
+
+// Register all standard (non-low-precision) dtypes for the MACA backend.
+// FP16/BF16 are registered explicitly above with their MACA-native scalar types.
+INFINI_REGISTER_STANDARD_BACKEND_TYPES(infini_train::Device::DeviceType::kMACA)
+
+namespace infini_train::core::maca {
+
+template <DataType DType> struct MacaTypeMap : BackendTypeMap<Device::DeviceType::kMACA, DType> {};
+
+// -----------------------------------------------------------------------------
+// MACA dispatch helpers
+// -----------------------------------------------------------------------------
+
+template <DataType... AllowedDTypes, typename Functor, typename... Args>
+auto DispatchMacaFunc(DataType dtype, Functor &&func, std::string_view context_identifier = "", Args &&...args) {
+    return infini_train::DispatchByTypeMap<MacaTypeMap, AllowedDTypes...>(
+        dtype, std::forward<Functor>(func), context_identifier, std::forward<Args>(args)...);
+}
+
+template <typename... AllowedTypeLists, typename Functor, typename... Args>
+auto DispatchMacaFunc(const std::vector<DataType> &dtypes, Functor &&func, std::string_view context_identifier = "",
+                      Args &&...args) {
+    return infini_train::DispatchByTypeMap<MacaTypeMap, AllowedTypeLists...>(
+        dtypes, std::forward<Functor>(func), context_identifier, std::forward<Args>(args)...);
+}
+
+} // namespace infini_train::core::maca
diff --git a/infini_train/src/kernels/maca/accumulate_grad.maca b/infini_train/src/kernels/maca/accumulate_grad.maca
index 1bda88db..11aa93ca 100644
--- a/infini_train/src/kernels/maca/accumulate_grad.maca
+++ b/infini_train/src/kernels/maca/accumulate_grad.maca
@@ -6,6 +6,7 @@
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/tensor.h"
 
+#include "infini_train/src/core/runtime/maca/maca_dispatch.h"
 #include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
 
 namespace infini_train::kernels::maca {
@@ -29,7 +30,7 @@ void AccumulateGrad(const std::shared_ptr<Tensor> &gradient, float rate, const s
                                   infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
                                   ->maca_stream();
 
-    DispatchFunc<INFINI_ALL_FLOATING_TYPES>(
+    core::maca::DispatchMacaFunc<INFINI_ALL_FLOATING_TYPES>(
         gradient->Dtype(),
         [=]<typename T>() {
             AccumulateGradKernel<<<num_blocks, threads_per_block, 0, maca_stream>>>(
@@ -73,7 +74,7 @@ void AdamAccumulateGrad(const std::shared_ptr<Tensor> &grad, const std::shared_p
                                   infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
                                   ->maca_stream();
 
-    DispatchFunc<INFINI_ALL_FLOATING_TYPES>(
+    core::maca::DispatchMacaFunc<INFINI_ALL_FLOATING_TYPES>(
         grad->Dtype(),
         [=]<typename T>() {
             AdamAccumulateGradKernel<<<num_blocks, threads_per_block, 0, maca_stream>>>(
diff --git a/infini_train/src/kernels/maca/cast.maca b/infini_train/src/kernels/maca/cast.maca
index 2c26a0d8..9e97cef8 100644
--- a/infini_train/src/kernels/maca/cast.maca
+++ b/infini_train/src/kernels/maca/cast.maca
@@ -8,6 +8,7 @@
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/tensor.h"
 
+#include "infini_train/src/core/runtime/maca/maca_dispatch.h"
 #include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
 
 namespace infini_train::kernels::maca {
@@ -33,7 +34,7 @@ std::shared_ptr<Tensor> Cast(std::shared_ptr<Tensor> input, DataType dtype) {
     dim3 grid_dims(CEIL_DIV(num_elements, block_dims.x));
     const size_t step = grid_dims.x * block_dims.x;
 
-    DispatchFunc<DataTypeList<INFINI_ALL_TYPES>, DataTypeList<INFINI_ALL_TYPES>>(
+    core::maca::DispatchMacaFunc<DataTypeList<INFINI_ALL_TYPES>, DataTypeList<INFINI_ALL_TYPES>>(
         {dtype, input->Dtype()},
         [=]<typename Tdst, typename Tsrc>() {
             auto dst = static_cast<Tdst *>(dst_tensor->DataPtr());
diff --git a/infini_train/src/kernels/maca/comm.maca b/infini_train/src/kernels/maca/comm.maca
index c7fdace9..1627cdfb 100644
--- a/infini_train/src/kernels/maca/comm.maca
+++ b/infini_train/src/kernels/maca/comm.maca
@@ -29,7 +29,7 @@ std::vector<std::shared_ptr<Tensor>> ReduceAddCoalesced(const std::vector<std::v
     std::vector<std::vector<std::shared_ptr<Tensor>>> to_destination_grads;
     for (int i = 0; i < grads[0].size(); ++i) {
         outputs.emplace_back(std::make_shared<Tensor>(grads[0][i]->Dims(), grads[0][i]->Dtype(), destination));
-        outputs[i]->Fill<float>(0.0);
+        outputs[i]->Fill(0.0);
     }
     for (int i = 0; i < grads.size(); ++i) {
         to_destination_grads.push_back(std::vector<std::shared_ptr<Tensor>>());
diff --git a/infini_train/src/kernels/maca/concat.maca b/infini_train/src/kernels/maca/concat.maca
index baa82346..8b84e835 100644
--- a/infini_train/src/kernels/maca/concat.maca
+++ b/infini_train/src/kernels/maca/concat.maca
@@ -11,6 +11,7 @@
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/tensor.h"
 
+#include "infini_train/src/core/runtime/maca/maca_dispatch.h"
 #include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
 
 namespace infini_train::kernels::maca {
@@ -102,7 +103,7 @@ std::shared_ptr<Tensor> ConcatForward(const std::vector<std::shared_ptr<Tensor>>
     int threads_per_block = 256;
     int num_blocks = static_cast<int>((total + threads_per_block - 1) / threads_per_block);
 
-    DispatchFunc<INFINI_ALL_TYPES>(
+    core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
         dtype,
         [=, &inputs, &host_offsets]<typename T>() {
             std::vector<const T *> host_input_ptrs;
@@ -185,8 +186,8 @@ std::vector<std::shared_ptr<Tensor>> ConcatBackward(const std::shared_ptr<Tensor
     grads.reserve(input_dims_list.size());
     for (const auto &dvec : input_dims_list) {
         auto t = std::make_shared<Tensor>(dvec, dtype, device);
-        DispatchFunc<INFINI_ALL_TYPES>(
-            dtype, [=]<typename T>() { t->Fill<T>(0); }, "MACA ConcatBackward");
+        core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
+            dtype, [=]<typename T>() { t->Fill(0); }, "MACA ConcatBackward");
         grads.push_back(t);
     }
 
@@ -208,7 +209,7 @@ std::vector<std::shared_ptr<Tensor>> ConcatBackward(const std::shared_ptr<Tensor
     int threads_per_block = 256;
     int num_blocks = static_cast<int>((total + threads_per_block - 1) / threads_per_block);
 
-    DispatchFunc<INFINI_ALL_TYPES>(
+    core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
         dtype,
         [=, &grads, &host_offsets]<typename T>() {
             std::vector<T *> host_ptrs;
diff --git a/infini_train/src/kernels/maca/cross_entropy.maca b/infini_train/src/kernels/maca/cross_entropy.maca
index 6e839ab3..29eac058 100644
--- a/infini_train/src/kernels/maca/cross_entropy.maca
+++ b/infini_train/src/kernels/maca/cross_entropy.maca
@@ -12,6 +12,7 @@
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/tensor.h"
 
+#include "infini_train/src/core/runtime/maca/maca_dispatch.h"
 #include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
 
 namespace infini_train::kernels::maca {
@@ -91,7 +92,7 @@ std::shared_ptr<Tensor> CrossEntropyForward(const std::shared_ptr<Tensor> &input
                                   infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
                                   ->maca_stream();
 
-    return DispatchFunc<DataTypeList<DataType::kUINT8, DataType::kINT64>, DataTypeList<INFINI_ALL_FLOATING_TYPES>>(
+    return core::maca::DispatchMacaFunc<DataTypeList<DataType::kUINT8, DataType::kINT64>, DataTypeList<INFINI_ALL_FLOATING_TYPES>>(
         {target->Dtype(), input->Dtype()},
         [=]<typename Ttarget, typename Tinput>() {
             const Ttarget *target_ptr = static_cast<const Ttarget *>(target->DataPtr());
@@ -198,10 +199,10 @@ std::shared_ptr<Tensor> CrossEntropyBackward(const std::shared_ptr<Tensor> &inpu
                                   infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
                                   ->maca_stream();
 
-    DispatchFunc<DataTypeList<DataType::kUINT8, DataType::kINT64>, DataTypeList<INFINI_ALL_FLOATING_TYPES>>(
+    core::maca::DispatchMacaFunc<DataTypeList<DataType::kUINT8, DataType::kINT64>, DataTypeList<INFINI_ALL_FLOATING_TYPES>>(
         {target->Dtype(), input_casted->Dtype()},
         [=]<typename Ttarget, typename Tinput>() {
-            grad_input->Fill<Tinput>(0);
+            grad_input->Fill(0);
             const Tinput *output_grad_ptr = static_cast<const Tinput *>(grad_output->DataPtr());
             const Ttarget *target_ptr = static_cast<const Ttarget *>(target->DataPtr());
             const Tinput *input_ptr = static_cast<const Tinput *>(input_casted->DataPtr());
diff --git a/infini_train/src/kernels/maca/elementwise.maca b/infini_train/src/kernels/maca/elementwise.maca
index c760af74..54b5ab36 100644
--- a/infini_train/src/kernels/maca/elementwise.maca
+++ b/infini_train/src/kernels/maca/elementwise.maca
@@ -8,6 +8,7 @@
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/tensor.h"
 
+#include "infini_train/src/core/runtime/maca/maca_dispatch.h"
 #include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
 
 namespace infini_train::kernels::maca {
@@ -766,9 +767,7 @@ std::shared_ptr<Tensor> UnaryBackward(const std::shared_ptr<Tensor> &grad_output
                                       Func unary_fn) {
     auto dtype = grad_output->Dtype();
     auto a_dtype = a ? a->Dtype() : dtype;
-    DataType promoted_type = DispatchFunc<DataTypeList<INFINI_ALL_TYPES>, DataTypeList<INFINI_ALL_TYPES>>(
-        {dtype, a_dtype}, [=]<typename Tgrad, typename Ta>() { return DataTypeMap_v<WidestType_t<Tgrad, Ta>>; },
-        "MACA UnaryBackward");
+    DataType promoted_type = PromoteDataTypes(dtype, a_dtype);
 
     auto grad_output_promoted
         = dtype == promoted_type ? grad_output : std::make_shared<Tensor>(grad_output->To(promoted_type));
@@ -795,9 +794,7 @@ std::shared_ptr<Tensor> BinaryForward(const std::shared_ptr<Tensor> &a, const st
     auto a_dtype = a->Dtype();
     auto b_dtype = b->Dtype();
 
-    DataType promoted_type = DispatchFunc<DataTypeList<INFINI_ALL_TYPES>, DataTypeList<INFINI_ALL_TYPES>>(
-        {a_dtype, b_dtype}, [=]<typename Ta, typename Tb>() { return DataTypeMap_v<WidestType_t<Ta, Tb>>; },
-        "MACA BinaryForward");
+    DataType promoted_type = PromoteDataTypes(a_dtype, b_dtype);
 
     auto a_promoted = a_dtype == promoted_type ? a : std::make_shared<Tensor>(a->To(promoted_type));
     auto b_promoted = b_dtype == promoted_type ? b : std::make_shared<Tensor>(b->To(promoted_type));
@@ -837,9 +834,7 @@ BinaryBackward(const std::shared_ptr<Tensor> &grad_output, const std::shared_ptr
     auto a_dtype = a_promoted ? a_promoted->Dtype() : dtype;
     auto b_dtype = b_promoted ? b_promoted->Dtype() : dtype;
     // Compute dtype determined by saved tensors (forward compute dtype), not grad_output
-    DataType promoted_type = DispatchFunc<DataTypeList<INFINI_ALL_TYPES>, DataTypeList<INFINI_ALL_TYPES>>(
-        {a_dtype, b_dtype}, [=]<typename Ta, typename Tb>() { return DataTypeMap_v<WidestType_t<Ta, Tb>>; },
-        "MACA BinaryBackward");
+    DataType promoted_type = PromoteDataTypes(a_dtype, b_dtype);
 
     CHECK(a_num_elements >= b_num_elements && a_num_elements % b_num_elements == 0);
 
@@ -867,8 +862,8 @@ BinaryBackward(const std::shared_ptr<Tensor> &grad_output, const std::shared_ptr
     switch (promoted_type) {
         DISPATCH_CASE(WRAP({
                           if (needs_broadcast) {
-                              grad_a->Fill<float>(0.0f);
-                              grad_b->Fill<float>(0.0f);
+                              grad_a->Fill(0.0f);
+                              grad_b->Fill(0.0f);
                           }
                           LaunchBackward<256, float>(fn_a, fn_b, grad_a, grad_b, a_dims, b_dims, grad_output_promoted,
                                                      a_promoted, b_promoted);
@@ -876,8 +871,8 @@ BinaryBackward(const std::shared_ptr<Tensor> &grad_output, const std::shared_ptr
                       DataType::kFLOAT32)
         DISPATCH_CASE(WRAP({
                           if (needs_broadcast) {
-                              grad_a->Fill<__maca_bfloat16>(0);
-                              grad_b->Fill<__maca_bfloat16>(0);
+                              grad_a->Fill(0);
+                              grad_b->Fill(0);
                           }
                           LaunchBackward<256, __maca_bfloat16>(fn_a, fn_b, grad_a, grad_b, a_dims, b_dims,
                                                            grad_output_promoted, a_promoted, b_promoted);
@@ -885,8 +880,8 @@ BinaryBackward(const std::shared_ptr<Tensor> &grad_output, const std::shared_ptr
                       DataType::kBFLOAT16)
         // FIXME(zbl): AtomicAdd does not support int64_t
         // DISPATCH_CASE(WRAP({
-        //                   grad_a->Fill<int64_t>(0);
-        //                   grad_b->Fill<int64_t>(0);
+        //                   grad_a->Fill(0);
+        //                   grad_b->Fill(0);
         //                   LaunchBackward<256, int64_t>(fn_a, fn_b, grad_a, grad_b, a_dims, b_dims, grad_output, a,
         //                   b);
         //               }),
diff --git a/infini_train/src/kernels/maca/embedding.maca b/infini_train/src/kernels/maca/embedding.maca
index d0472b72..cc33eb19 100644
--- a/infini_train/src/kernels/maca/embedding.maca
+++ b/infini_train/src/kernels/maca/embedding.maca
@@ -5,6 +5,7 @@
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/tensor.h"
 
+#include "infini_train/src/core/runtime/maca/maca_dispatch.h"
 #include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
 
 namespace infini_train::kernels::maca {
@@ -50,7 +51,7 @@ std::shared_ptr<Tensor> EmbeddingForward(const std::shared_ptr<Tensor> &input, c
     int threads_per_block = 256;
     int num_blocks = (batch_size * max_seqlen * embed_dim + threads_per_block - 1) / threads_per_block;
 
-    DispatchFunc<INFINI_ALL_FLOATING_TYPES>(
+    core::maca::DispatchMacaFunc<INFINI_ALL_FLOATING_TYPES>(
         dtype,
         [=]<typename T>() {
             EmbeddingForwardKernel<<<num_blocks, threads_per_block, 0, maca_stream>>>(
@@ -101,10 +102,10 @@ std::shared_ptr<Tensor> EmbeddingBackward(const std::shared_ptr<Tensor> &input,
     const int threads_per_block = 256;
     const int num_blocks = (num_tokens + threads_per_block - 1) / threads_per_block;
 
-    DispatchFunc<INFINI_ALL_FLOATING_TYPES>(
+    core::maca::DispatchMacaFunc<INFINI_ALL_FLOATING_TYPES>(
         dtype,
         [=]<typename T>() {
-            grad_weight->Fill<T>(0);
+            grad_weight->Fill(0);
             EmbeddingBackwardKernel<<<num_blocks, threads_per_block, 0, maca_stream>>>(
                 static_cast<const int64_t *>(input->DataPtr()), static_cast<const T *>(grad_output->DataPtr()),
                 static_cast<T *>(grad_weight->DataPtr()), num_tokens, embedding_dim, vocab_size);
diff --git a/infini_train/src/kernels/maca/fill.maca b/infini_train/src/kernels/maca/fill.maca
index accdac0f..d747cd26 100644
--- a/infini_train/src/kernels/maca/fill.maca
+++ b/infini_train/src/kernels/maca/fill.maca
@@ -1,11 +1,14 @@
 #include <cstddef>
 #include <memory>
+#include <type_traits>
 
+#include "infini_train/include/common/maca/kernel_helper.cuh"
 #include "infini_train/include/core/runtime/device_guard.h"
 #include "infini_train/include/device.h"
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/tensor.h"
 
+#include "infini_train/src/core/runtime/maca/maca_dispatch.h"
 #include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
 
 namespace infini_train::kernels::maca {
@@ -18,7 +21,7 @@ template <typename T> __global__ void FillKernel(T *data, T value, size_t size)
 }
 
 // TODO(dcj): refactor Fill kernel with elementwise template
-void Fill(std::shared_ptr<Tensor> tensor, void *value_ptr) {
+void Fill(std::shared_ptr<Tensor> tensor, Scalar scalar) {
     const int num_tokens = tensor->NumElements();
     const int threads_per_block = 256;
     const int num_blocks = (num_tokens + threads_per_block - 1) / threads_per_block;
@@ -27,11 +30,21 @@ void Fill(std::shared_ptr<Tensor> tensor, void *value_ptr) {
                                   infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
                                   ->maca_stream();
 
-    DispatchFunc<INFINI_ALL_TYPES>(
+    core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
         tensor->Dtype(),
         [=]<typename T>() {
-            FillKernel<T><<<num_blocks, threads_per_block, 0, maca_stream>>>(
-                static_cast<T *>(tensor->DataPtr()), *(static_cast<T *>(value_ptr)), tensor->NumElements());
+            // Scalar::to<T> relies on static_cast, which is ambiguous when T is a
+            // MACA native half/bf16 type constructed from integer scalars. Route
+            // half/bf16 through float via common::maca::Cast to guarantee a
+            // single-candidate conversion path.
+            T casted_value;
+            if constexpr (std::is_same_v<T, __maca_bfloat16> || std::is_same_v<T, __half>) {
+                casted_value = common::maca::Cast<T>(scalar.to<float>());
+            } else {
+                casted_value = scalar.to<T>();
+            }
+            FillKernel<T><<<num_blocks, threads_per_block, 0, maca_stream>>>(static_cast<T *>(tensor->DataPtr()),
+                                                                             casted_value, tensor->NumElements());
         },
         "MACA Fill");
 }
diff --git a/infini_train/src/kernels/maca/gather.maca b/infini_train/src/kernels/maca/gather.maca
index 90aba330..549f82f6 100644
--- a/infini_train/src/kernels/maca/gather.maca
+++ b/infini_train/src/kernels/maca/gather.maca
@@ -5,6 +5,7 @@
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/tensor.h"
 
+#include "infini_train/src/core/runtime/maca/maca_dispatch.h"
 #include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
 
 namespace infini_train::kernels::maca {
@@ -99,7 +100,7 @@ std::shared_ptr<Tensor> IndexGatherForward(const std::shared_ptr<Tensor> &input,
     const int threads = 256;
     const int blocks = (total_elements + threads - 1) / threads;
 
-    DispatchFunc<INFINI_ALL_FLOATING_TYPES>(
+    core::maca::DispatchMacaFunc<INFINI_ALL_FLOATING_TYPES>(
         dtype,
         [=]<typename T>() {
             IndexGatherForwardKernel<T><<<blocks, threads, 0, stream>>>(
@@ -173,8 +174,8 @@ std::shared_ptr<Tensor> IndexGatherBackward(const std::shared_ptr<Tensor> &grad_
 
     auto dtype = grad_output->Dtype();
     auto grad_input = std::make_shared<Tensor>(in_dims, dtype, grad_output->GetDevice());
-    DispatchFunc<INFINI_ALL_TYPES>(
-        dtype, [=]<typename T>() { grad_input->Fill<T>(0); }, "MACA IndexGatherBackwardZero");
+    core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
+        dtype, [=]<typename T>() { grad_input->Fill(0); }, "MACA IndexGatherBackwardZero");
 
     auto in_strides = ComputeStrides(in_dims);
     auto out_strides = ComputeStrides(idx_dims);
@@ -207,7 +208,7 @@ std::shared_ptr<Tensor> IndexGatherBackward(const std::shared_ptr<Tensor> &grad_
     const int threads = 256;
     const int blocks = (int)((total_elements + threads - 1) / threads);
 
-    DispatchFunc<INFINI_ALL_FLOATING_TYPES>(
+    core::maca::DispatchMacaFunc<INFINI_ALL_FLOATING_TYPES>(
         dtype,
         [=]<typename T>() {
             IndexGatherBackwardKernel<T><<<blocks, threads, 0, stream>>>(
diff --git a/infini_train/src/kernels/maca/layernorm.maca b/infini_train/src/kernels/maca/layernorm.maca
index 53b8f339..22f31446 100644
--- a/infini_train/src/kernels/maca/layernorm.maca
+++ b/infini_train/src/kernels/maca/layernorm.maca
@@ -7,6 +7,7 @@
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/tensor.h"
 
+#include "infini_train/src/core/runtime/maca/maca_dispatch.h"
 #include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
 
 namespace infini_train::kernels::maca {
@@ -85,11 +86,11 @@ LayerNormForward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Ten
                                   infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
                                   ->maca_stream();
 
-    DispatchFunc<INFINI_ALL_FLOATING_TYPES>(
+    core::maca::DispatchMacaFunc<INFINI_ALL_FLOATING_TYPES>(
         dtype,
         [=]<typename T>() {
-            mean->Fill<float>(0);
-            rstd->Fill<float>(0);
+            mean->Fill(0);
+            rstd->Fill(0);
             LayerNormForwardKernel<BLOCK_SIZE><<<num_blocks, threads_per_block, 0, maca_stream>>>(
                 static_cast<const T *>(input->DataPtr()), static_cast<const T *>(weight->DataPtr()),
                 static_cast<const T *>(bias->DataPtr()), static_cast<float *>(mean->DataPtr()),
@@ -179,12 +180,12 @@ LayerNormBackward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Te
     const auto &maca_stream = dynamic_cast<infini_train::core::maca::MacaStream *>(
                                   infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
                                   ->maca_stream();
-    DispatchFunc<INFINI_ALL_FLOATING_TYPES>(
+    core::maca::DispatchMacaFunc<INFINI_ALL_FLOATING_TYPES>(
         dtype,
         [=]<typename T>() {
-            grad_input->Fill<T>(0);
-            grad_weight->Fill<T>(0);
-            grad_bias->Fill<T>(0);
+            grad_input->Fill(0);
+            grad_weight->Fill(0);
+            grad_bias->Fill(0);
             LayerNormBackwardKernel<BLOCK_SIZE><<<num_blocks, threads_per_block, 0, maca_stream>>>(
                 static_cast<const T *>(input->DataPtr()), static_cast<const T *>(grad_output->DataPtr()),
                 static_cast<const float *>(mean->DataPtr()), static_cast<const float *>(rstd->DataPtr()),
diff --git a/infini_train/src/kernels/maca/linear.maca b/infini_train/src/kernels/maca/linear.maca
index accbec9f..6da36357 100644
--- a/infini_train/src/kernels/maca/linear.maca
+++ b/infini_train/src/kernels/maca/linear.maca
@@ -13,6 +13,7 @@
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/tensor.h"
 
+#include "infini_train/src/core/runtime/maca/maca_dispatch.h"
 #include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
 
 namespace infini_train::kernels::maca {
@@ -93,9 +94,7 @@ MatmulBackward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tenso
     auto other_dtype = other->Dtype();
     auto grad_output_dtype = grad_output->Dtype();
     // Compute dtype determined by saved tensors (forward compute dtype), not grad_output
-    DataType compute_dtype = DispatchFunc<DataTypeList<INFINI_ALL_TYPES>, DataTypeList<INFINI_ALL_TYPES>>(
-        {input_dtype, other_dtype}, [=]<typename Tin, typename To>() { return DataTypeMap_v<WidestType_t<Tin, To>>; },
-        "MACA MatmulBackward");
+    DataType compute_dtype = PromoteDataTypes(input_dtype, other_dtype);
 
     auto input_promoted = input_dtype == compute_dtype ? input : std::make_shared<Tensor>(input->To(compute_dtype));
     auto other_promoted = other_dtype == compute_dtype ? other : std::make_shared<Tensor>(other->To(compute_dtype));
@@ -242,7 +241,7 @@ std::shared_ptr<Tensor> LinearForward(const std::shared_ptr<Tensor> &input, cons
         int threads_per_block = 256;
         int num_blocks = (bs * out_features + threads_per_block - 1) / threads_per_block;
 
-        DispatchFunc<DataType::kFLOAT32, DataType::kBFLOAT16>(
+        core::maca::DispatchMacaFunc<DataType::kFLOAT32, DataType::kBFLOAT16>(
             dtype,
             [=]<typename T>() {
                 BiasCopyKernel<<<num_blocks, threads_per_block, 0, maca_stream>>>(
@@ -250,8 +249,8 @@ std::shared_ptr<Tensor> LinearForward(const std::shared_ptr<Tensor> &input, cons
             },
             "MACA LinearForward");
     } else {
-        DispatchFunc<DataType::kFLOAT32, DataType::kBFLOAT16>(
-            input->Dtype(), [=]<typename T>() { output->Fill<T>(0); }, "MACA LinearForward");
+        core::maca::DispatchMacaFunc<DataType::kFLOAT32, DataType::kBFLOAT16>(
+            input->Dtype(), [=]<typename T>() { output->Fill(0); }, "MACA LinearForward");
     }
 
     const float alpha = 1.0f;
@@ -338,9 +337,7 @@ LinearBackward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tenso
     DataType input_dtype = input ? input->Dtype() : (weight ? weight->Dtype() : dtype);
     DataType weight_dtype = weight ? weight->Dtype() : (input ? input->Dtype() : dtype);
     // Compute dtype determined by saved tensors (forward compute dtype), not grad_output
-    DataType compute_dtype = DispatchFunc<DataTypeList<INFINI_ALL_TYPES>, DataTypeList<INFINI_ALL_TYPES>>(
-        {input_dtype, weight_dtype}, [=]<typename Tin, typename Tw>() { return DataTypeMap_v<WidestType_t<Tin, Tw>>; },
-        "MACA LinearBackward");
+    DataType compute_dtype = PromoteDataTypes(input_dtype, weight_dtype);
 
     auto grad_output_promoted
         = dtype == compute_dtype ? grad_output : std::make_shared<Tensor>(grad_output->To(compute_dtype));
diff --git a/infini_train/src/kernels/maca/outer.maca b/infini_train/src/kernels/maca/outer.maca
index 14fe0388..d4a4ffe8 100644
--- a/infini_train/src/kernels/maca/outer.maca
+++ b/infini_train/src/kernels/maca/outer.maca
@@ -11,6 +11,7 @@
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/tensor.h"
 
+#include "infini_train/src/core/runtime/maca/maca_dispatch.h"
 #include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
 
 namespace infini_train::kernels::maca {
@@ -81,9 +82,7 @@ std::tuple<std::shared_ptr<Tensor>, std::shared_ptr<Tensor>> OuterBackward(const
     auto grad_output_dtype = grad_output->Dtype();
 
     // Compute dtype determined by saved tensors (forward compute dtype), not grad_output
-    DataType promoted_type = DispatchFunc<DataTypeList<INFINI_ALL_TYPES>, DataTypeList<INFINI_ALL_TYPES>>(
-        {input_dtype, other_dtype}, [=]<typename Tin, typename To>() { return DataTypeMap_v<WidestType_t<Tin, To>>; },
-        "MACA OuterBackward");
+    DataType promoted_type = PromoteDataTypes(input_dtype, other_dtype);
 
     auto input_promoted = input_dtype == promoted_type ? input : std::make_shared<Tensor>(input->To(promoted_type));
     auto other_promoted = other_dtype == promoted_type ? other : std::make_shared<Tensor>(other->To(promoted_type));
@@ -95,11 +94,11 @@ std::tuple<std::shared_ptr<Tensor>, std::shared_ptr<Tensor>> OuterBackward(const
     auto grad_input = std::make_shared<Tensor>(std::vector<int64_t>{M}, output_dtype, grad_output->GetDevice());
     auto grad_other = std::make_shared<Tensor>(std::vector<int64_t>{N}, output_dtype, grad_output->GetDevice());
 
-    DispatchFunc<DataType::kFLOAT32, DataType::kBFLOAT16>(
+    core::maca::DispatchMacaFunc<DataType::kFLOAT32, DataType::kBFLOAT16>(
         promoted_type,
         [=]<typename T>() {
-            grad_input->Fill<T>(0);
-            grad_other->Fill<T>(0);
+            grad_input->Fill(0);
+            grad_other->Fill(0);
         },
         "MACA OuterBackward");
 
diff --git a/infini_train/src/kernels/maca/reduction.maca b/infini_train/src/kernels/maca/reduction.maca
index ee453e61..9d5756f8 100644
--- a/infini_train/src/kernels/maca/reduction.maca
+++ b/infini_train/src/kernels/maca/reduction.maca
@@ -7,6 +7,7 @@
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/tensor.h"
 
+#include "infini_train/src/core/runtime/maca/maca_dispatch.h"
 #include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
 
 namespace infini_train::kernels::maca {
@@ -141,7 +142,7 @@ std::shared_ptr<Tensor> ReduceOpForward(const std::shared_ptr<Tensor> &input, co
                                   infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
                                   ->maca_stream();
 
-    DispatchFunc<INFINI_ALL_FLOATING_TYPES>(
+    core::maca::DispatchMacaFunc<INFINI_ALL_FLOATING_TYPES>(
         dtype,
         [=]<typename T>() {
             GenericReduceKernel<T, ReduceFunc, FinalizeOp<T>, BLOCK_SIZE>
@@ -177,10 +178,10 @@ std::shared_ptr<Tensor> ReduceOpBackward(const std::shared_ptr<Tensor> &grad_out
                                   infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
                                   ->maca_stream();
 
-    DispatchFunc<INFINI_ALL_FLOATING_TYPES>(
+    core::maca::DispatchMacaFunc<INFINI_ALL_FLOATING_TYPES>(
         dtype,
         [=]<typename T>() {
-            grad_input->Fill<T>(0);
+            grad_input->Fill(0);
             GenericReduceBackwardKernel<<<num_blocks, threads_per_block, 0, maca_stream>>>(
                 static_cast<T *>(grad_input->DataPtr()), static_cast<const T *>(grad_output->DataPtr()),
                 input ? static_cast<const T *>(input->DataPtr()) : nullptr,
diff --git a/infini_train/src/kernels/maca/slice.maca b/infini_train/src/kernels/maca/slice.maca
index d7b3697a..74572c1b 100644
--- a/infini_train/src/kernels/maca/slice.maca
+++ b/infini_train/src/kernels/maca/slice.maca
@@ -8,6 +8,7 @@
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/tensor.h"
 
+#include "infini_train/src/core/runtime/maca/maca_dispatch.h"
 #include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
 
 namespace infini_train::kernels::maca {
@@ -48,8 +49,8 @@ std::shared_ptr<Tensor> SliceForward(const std::shared_ptr<Tensor> &input, const
     auto dtype = input->Dtype();
     auto new_tensor = std::make_shared<Tensor>(new_dims, dtype, input->GetDevice());
     // NOTE(zbl): must initialize with 0
-    DispatchFunc<INFINI_ALL_TYPES>(
-        dtype, [=]<typename T>() { new_tensor->Fill<T>(0); }, "MACA SliceForward");
+    core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
+        dtype, [=]<typename T>() { new_tensor->Fill(0); }, "MACA SliceForward");
 
     std::vector<int64_t> src_strides(dims.size(), 0), dst_strides(new_dims.size(), 0);
     int64_t stride = 1;
@@ -92,7 +93,7 @@ std::shared_ptr<Tensor> SliceForward(const std::shared_ptr<Tensor> &input, const
     int threads_per_block = 256;
     int num_blocks = (total_elements + threads_per_block - 1) / threads_per_block;
 
-    DispatchFunc<INFINI_ALL_TYPES>(
+    core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
         dtype,
         [=]<typename T>() {
             SliceForwardKernel<<<num_blocks, threads_per_block, 0, stream>>>(
@@ -141,8 +142,8 @@ std::shared_ptr<Tensor> SliceBackward(const std::shared_ptr<Tensor> &grad_output
 
     auto grad_output_dtype = grad_output->Dtype();
     auto grad_input = std::make_shared<Tensor>(input->Dims(), grad_output_dtype, grad_output->GetDevice());
-    DispatchFunc<INFINI_ALL_TYPES>(
-        grad_output_dtype, [=]<typename T>() { grad_input->Fill<T>(0); }, "MACA SliceBackward");
+    core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
+        grad_output_dtype, [=]<typename T>() { grad_input->Fill(0); }, "MACA SliceBackward");
 
     std::vector<int64_t> src_strides(dims.size());
     int64_t stride = 1;
@@ -186,7 +187,7 @@ std::shared_ptr<Tensor> SliceBackward(const std::shared_ptr<Tensor> &grad_output
     int threads_per_block = 256;
     int num_blocks = (total_elements + threads_per_block - 1) / threads_per_block;
 
-    DispatchFunc<INFINI_ALL_TYPES>(
+    core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
         grad_output_dtype,
         [=]<typename T>() {
             SliceBackwardKernel<<<num_blocks, threads_per_block, 0, stream>>>(
diff --git a/infini_train/src/kernels/maca/softmax.maca b/infini_train/src/kernels/maca/softmax.maca
index 28bce0ed..07adc147 100644
--- a/infini_train/src/kernels/maca/softmax.maca
+++ b/infini_train/src/kernels/maca/softmax.maca
@@ -12,6 +12,7 @@
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/tensor.h"
 
+#include "infini_train/src/core/runtime/maca/maca_dispatch.h"
 #include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
 
 namespace infini_train::kernels::maca {
@@ -186,10 +187,7 @@ std::shared_ptr<Tensor> SoftmaxBackward(const std::shared_ptr<Tensor> &grad_outp
                                         const std::shared_ptr<Tensor> &output, int64_t dim) {
     auto grad_output_dtype = grad_output->Dtype();
     auto output_dtype = output->Dtype();
-    DataType promoted_type = DispatchFunc<DataTypeList<INFINI_ALL_TYPES>, DataTypeList<INFINI_ALL_TYPES>>(
-        {grad_output_dtype, output_dtype},
-        [=]<typename Tgrad, typename Tout>() { return DataTypeMap_v<WidestType_t<Tgrad, Tout>>; },
-        "MACA SoftmaxBackward");
+    DataType promoted_type = PromoteDataTypes(grad_output_dtype, output_dtype);
 
     auto grad_output_promoted
         = grad_output_dtype == promoted_type ? grad_output : std::make_shared<Tensor>(grad_output->To(promoted_type));
@@ -200,8 +198,8 @@ std::shared_ptr<Tensor> SoftmaxBackward(const std::shared_ptr<Tensor> &grad_outp
     CHECK(dim >= 0 && dim < output->Dims().size());
 
     auto grad_input = std::make_shared<Tensor>(output_dims, promoted_type, output->GetDevice());
-    DispatchFunc<INFINI_ALL_TYPES>(
-        promoted_type, [=]<typename T>() { grad_input->Fill<T>(0); }, "MACA SoftmaxBackward");
+    core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
+        promoted_type, [=]<typename T>() { grad_input->Fill(0); }, "MACA SoftmaxBackward");
 
     switch (promoted_type) {
         DISPATCH_CASE(WRAP(LaunchBackward<256, float>(grad_input, grad_output_promoted, output_promoted, dim);),
diff --git a/infini_train/src/kernels/maca/split.maca b/infini_train/src/kernels/maca/split.maca
index fee7a8ca..409796bf 100644
--- a/infini_train/src/kernels/maca/split.maca
+++ b/infini_train/src/kernels/maca/split.maca
@@ -7,6 +7,7 @@
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/tensor.h"
 
+#include "infini_train/src/core/runtime/maca/maca_dispatch.h"
 #include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
 
 namespace infini_train::kernels::maca {
@@ -58,7 +59,7 @@ std::vector<std::shared_ptr<Tensor>> SplitForward(const std::shared_ptr<Tensor>
                                       infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
                                       ->maca_stream();
 
-        DispatchFunc<INFINI_ALL_TYPES>(
+        core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
             dtype,
             [=]<typename T>() {
                 SplitForwardKernel<<<num_blocks, threads_per_block, 0, maca_stream>>>(
@@ -113,7 +114,7 @@ std::shared_ptr<Tensor> LaunchSplitBackward(const std::vector<int64_t> &input_di
     const auto &grad = grad_outputs[0];
     auto dtype = grad->Dtype();
     auto grad_input = std::make_shared<Tensor>(input_dims, dtype, grad->GetDevice());
-    grad_input->Fill<T>(0);
+    grad_input->Fill(0);
 
     int64_t N = std::accumulate(input_dims.begin(), input_dims.begin() + dim, 1, std::multiplies<int64_t>());
     int64_t W = std::accumulate(input_dims.begin() + dim + 1, input_dims.end(), 1, std::multiplies<int64_t>());
@@ -165,7 +166,7 @@ std::shared_ptr<Tensor> SplitBackward(const std::vector<int64_t> &input_dims, in
     CHECK_GE(dim, 0) << "Currently we do not support negative dimension";
     CHECK_LT(dim, input_dims.size());
 
-    return DispatchFunc<INFINI_ALL_TYPES>(
+    return core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
         grad_outputs[0]->Dtype(),
         [=]<typename T>() { return LaunchSplitBackward<T>(input_dims, split_size, dim, grad_outputs); },
         "MACA SplitBackward");
diff --git a/infini_train/src/kernels/maca/stack.maca b/infini_train/src/kernels/maca/stack.maca
index e6c67039..09112eea 100644
--- a/infini_train/src/kernels/maca/stack.maca
+++ b/infini_train/src/kernels/maca/stack.maca
@@ -11,6 +11,7 @@
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/tensor.h"
 
+#include "infini_train/src/core/runtime/maca/maca_dispatch.h"
 #include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
 
 namespace infini_train::kernels::maca {
@@ -60,7 +61,7 @@ std::shared_ptr<Tensor> StackForward(const std::vector<std::shared_ptr<Tensor>>
     int threads_per_block = 256;
     int num_blocks = (total + threads_per_block - 1) / threads_per_block;
 
-    DispatchFunc<INFINI_ALL_TYPES>(
+    core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
         dtype,
         [=]<typename T>() {
             std::vector<const T *> host_input_ptrs;
@@ -112,8 +113,8 @@ std::vector<std::shared_ptr<Tensor>> StackBackward(const std::vector<int64_t> &i
     std::vector<std::shared_ptr<Tensor>> grads;
     for (int i = 0; i < num_inputs; ++i) {
         auto t = std::make_shared<Tensor>(base_dims, dtype, grad_output->GetDevice());
-        DispatchFunc<INFINI_ALL_TYPES>(
-            dtype, [=]<typename T>() { t->Fill<T>(0); }, "MACA StackBackward");
+        core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
+            dtype, [=]<typename T>() { t->Fill(0); }, "MACA StackBackward");
         grads.push_back(t);
     }
 
@@ -129,7 +130,7 @@ std::vector<std::shared_ptr<Tensor>> StackBackward(const std::vector<int64_t> &i
     int threads_per_block = 256;
     int num_blocks = (total + threads_per_block - 1) / threads_per_block;
 
-    DispatchFunc<INFINI_ALL_TYPES>(
+    core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
         dtype,
         [=]<typename T>() {
             std::vector<T *> host_ptrs;
diff --git a/infini_train/src/kernels/maca/transform.maca b/infini_train/src/kernels/maca/transform.maca
index d092a859..35ccb6f5 100644
--- a/infini_train/src/kernels/maca/transform.maca
+++ b/infini_train/src/kernels/maca/transform.maca
@@ -10,6 +10,7 @@
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/tensor.h"
 
+#include "infini_train/src/core/runtime/maca/maca_dispatch.h"
 #include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
 
 namespace infini_train::kernels::maca {
@@ -46,7 +47,7 @@ std::shared_ptr<Tensor> TrilForward(const std::shared_ptr<Tensor> &input, int64_
                                   infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
                                   ->maca_stream();
 
-    DispatchFunc<INFINI_ALL_TYPES>(
+    core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
         input->Dtype(),
         [=]<typename T>() {
             TrilForwardKernel<<<num_blocks, threads_per_block, 0, maca_stream>>>(
@@ -89,10 +90,10 @@ std::shared_ptr<Tensor> TrilBackward(const std::shared_ptr<Tensor> &grad_output,
                                   infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
                                   ->maca_stream();
 
-    DispatchFunc<INFINI_ALL_TYPES>(
+    core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
         dtype,
         [=]<typename T>() {
-            grad_input->Fill<T>(0);
+            grad_input->Fill(0);
             TrilBackwardKernel<<<num_blocks, threads_per_block, 0, maca_stream>>>(
                 static_cast<const T *>(grad_output->DataPtr()), static_cast<T *>(grad_input->DataPtr()), rows, cols,
                 diagonal);
@@ -134,7 +135,7 @@ std::shared_ptr<Tensor> TriuForward(const std::shared_ptr<Tensor> &input, int64_
                                   infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
                                   ->maca_stream();
 
-    DispatchFunc<INFINI_ALL_TYPES>(
+    core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
         input->Dtype(),
         [=]<typename T>() {
             TriuForwardKernel<<<num_blocks, threads_per_block, 0, maca_stream>>>(
@@ -176,10 +177,10 @@ std::shared_ptr<Tensor> TriuBackward(const std::shared_ptr<Tensor> &grad_output,
                                   infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
                                   ->maca_stream();
 
-    DispatchFunc<INFINI_ALL_TYPES>(
+    core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
         dtype,
         [=]<typename T>() {
-            grad_input->Fill<T>(0);
+            grad_input->Fill(0);
             TriuBackwardKernel<<<num_blocks, threads_per_block, 0, maca_stream>>>(
                 static_cast<const T *>(grad_output->DataPtr()), static_cast<T *>(grad_input->DataPtr()), rows, cols,
                 diagonal);
@@ -268,10 +269,10 @@ std::shared_ptr<Tensor> TransposeForward(const std::shared_ptr<Tensor> &input, i
     int threads_per_block = 256;
     int num_blocks = (num_elements + threads_per_block - 1) / threads_per_block;
 
-    DispatchFunc<INFINI_ALL_TYPES>(
+    core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
         dtype,
         [=]<typename T>() {
-            output->Fill<T>(0);
+            output->Fill(0);
             TransposeForwardKernel<<<num_blocks, threads_per_block, 0, stream>>>(
                 static_cast<const T *>(input->DataPtr()), static_cast<T *>(output->DataPtr()), in_dims_dev,
                 in_strides_dev, out_strides_dev, ndim, dim0, dim1, num_elements);
@@ -370,7 +371,7 @@ std::shared_ptr<Tensor> MaskForward(const std::shared_ptr<Tensor> &input, const
         int64_t inner = input->NumElements() / rows;
         int num_blocks = static_cast<int>((input->NumElements() + threads_per_block - 1) / threads_per_block);
 
-        DispatchFunc<INFINI_ALL_TYPES>(
+        core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
             dtype,
             [=]<typename T>() {
                 MaskLeadsForwardKernel<T><<<num_blocks, threads_per_block, 0, maca_stream>>>(
@@ -383,7 +384,7 @@ std::shared_ptr<Tensor> MaskForward(const std::shared_ptr<Tensor> &input, const
         int64_t batch_size = input->NumElements() / mask_size;
         int num_blocks = static_cast<int>((input->NumElements() + threads_per_block - 1) / threads_per_block);
 
-        DispatchFunc<INFINI_ALL_TYPES>(
+        core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
             dtype,
             [=]<typename T>() {
                 MaskForwardKernel<T><<<num_blocks, threads_per_block, 0, maca_stream>>>(
@@ -434,10 +435,10 @@ std::shared_ptr<Tensor> MaskBackward(const std::shared_ptr<Tensor> &grad_output,
         int64_t inner = grad_output->NumElements() / rows;
         int num_blocks = static_cast<int>((grad_output->NumElements() + threads_per_block - 1) / threads_per_block);
 
-        DispatchFunc<INFINI_ALL_TYPES>(
+        core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
             dtype,
             [=]<typename T>() {
-                grad_input->Fill<T>(0);
+                grad_input->Fill(0);
                 MaskLeadsBackwardKernel<T><<<num_blocks, threads_per_block, 0, maca_stream>>>(
                     static_cast<const T *>(grad_output->DataPtr()), static_cast<const T *>(mask_casted->DataPtr()),
                     static_cast<T *>(grad_input->DataPtr()), rows, inner);
@@ -448,10 +449,10 @@ std::shared_ptr<Tensor> MaskBackward(const std::shared_ptr<Tensor> &grad_output,
         int64_t batch_size = grad_output->NumElements() / mask_size;
         int num_blocks = static_cast<int>((grad_output->NumElements() + threads_per_block - 1) / threads_per_block);
 
-        DispatchFunc<INFINI_ALL_TYPES>(
+        core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
             dtype,
             [=]<typename T>() {
-                grad_input->Fill<T>(0);
+                grad_input->Fill(0);
                 MaskBackwardKernel<T><<<num_blocks, threads_per_block, 0, maca_stream>>>(
                     static_cast<const T *>(grad_output->DataPtr()), static_cast<const T *>(mask_casted->DataPtr()),
                     static_cast<T *>(grad_input->DataPtr()), static_cast<int>(batch_size), static_cast<int>(mask_size));
@@ -503,7 +504,7 @@ std::shared_ptr<Tensor> RepeatInterleaveForward(const std::shared_ptr<Tensor> &i
                                   infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
                                   ->maca_stream();
 
-    DispatchFunc<INFINI_ALL_TYPES>(
+    core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
         input->Dtype(),
         [=]<typename T>() {
             RepeatInterleaveForwardKernel<<<num_blocks, threads_per_block, 0, maca_stream>>>(
@@ -561,10 +562,10 @@ std::shared_ptr<Tensor> RepeatInterleaveBackward(const std::shared_ptr<Tensor> &
                                   infini_train::core::GetDeviceGuardImpl(device.type())->GetStream(device))
                                   ->maca_stream();
 
-    DispatchFunc<INFINI_ALL_TYPES>(
+    core::maca::DispatchMacaFunc<INFINI_ALL_TYPES>(
         grad_output->Dtype(),
         [=]<typename T>() {
-            grad_input->Fill<T>(0);
+            grad_input->Fill(0);
             RepeatInterleaveBackwardKernel<<<num_blocks, threads_per_block, 0, maca_stream>>>(
                 static_cast<const T *>(grad_output->DataPtr()), static_cast<T *>(grad_input->DataPtr()), outer,
                 dim_size, inner, repeat);
diff --git a/infini_train/src/kernels/maca/vocab_parallel_cross_entropy.maca b/infini_train/src/kernels/maca/vocab_parallel_cross_entropy.maca
index d79780b7..d561f6ab 100644
--- a/infini_train/src/kernels/maca/vocab_parallel_cross_entropy.maca
+++ b/infini_train/src/kernels/maca/vocab_parallel_cross_entropy.maca
@@ -8,6 +8,7 @@
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/tensor.h"
 
+#include "infini_train/src/core/runtime/maca/maca_dispatch.h"
 #include "infini_train/src/core/runtime/maca/maca_runtime_common.h"
 
 namespace infini_train::kernels::maca {
@@ -93,7 +94,7 @@ VocabParallelCrossEntropyBackward(const std::shared_ptr<Tensor> &grad_output,
     constexpr int threads_per_block = 256;
     const int num_blocks = static_cast<int>(rows);
 
-    DispatchFunc<DataTypeList<DataType::kUINT8, DataType::kINT64>, DataTypeList<INFINI_ALL_FLOATING_TYPES>>(
+    core::maca::DispatchMacaFunc<DataTypeList<DataType::kUINT8, DataType::kINT64>, DataTypeList<INFINI_ALL_FLOATING_TYPES>>(
         {masked_target->Dtype(), softmax_local->Dtype()},
         [=]<typename Tindex, typename Tinput>() {
             using Tmask = Tinput;
diff --git a/scripts/run_models_and_profile.bash b/scripts/run_models_and_profile.bash
index 06589904..84beebba 100755
--- a/scripts/run_models_and_profile.bash
+++ b/scripts/run_models_and_profile.bash
@@ -1,6 +1,5 @@
 #!/bin/bash
 
-set -e
 set -o pipefail
 
 usage() {
@@ -70,6 +69,7 @@ BUILD_DIR="$(read_var BUILD_DIR)";              : "${BUILD_DIR:=../build}"
 LOG_DIR="$(read_var LOG_DIR)";                  : "${LOG_DIR:=logs}"
 PROFILE_LOG_DIR="$(read_var PROFILE_LOG_DIR)";  : "${PROFILE_LOG_DIR:=./profile_logs}"
 COMPARE_LOG_DIR="$(read_var COMPARE_LOG_DIR)";  : "${COMPARE_LOG_DIR:=}"
+DEVICE_BACKEND="$(read_var DEVICE_BACKEND)";    : "${DEVICE_BACKEND:=cuda}"
 
 mkdir -p "$BUILD_DIR" "$LOG_DIR" "$PROFILE_LOG_DIR"
 
@@ -83,6 +83,8 @@ done < <(jq -r '.variables | to_entries[] | "\(.key)=\(.value)"' "$CONFIG_FILE")
 # Global variable to save the last cmake command
 LAST_CMAKE_CMD=""
 declare -A SELECTED_TAGS=()
+# Track test failures: array of "<log_name>: <cmd>"
+FAILED_TESTS=()
 
 normalize_tag() {
     local raw="$1"
@@ -166,7 +168,9 @@ run_and_log() {
         echo ""
         echo "[ERROR] Last 20 lines of log:"
         tail -20 "$log_path"
-        exit 1
+        FAILED_TESTS+=("${log_name}: ${cmd}")
+        popd > /dev/null
+        return 1
     fi
 
     popd > /dev/null
@@ -267,16 +271,27 @@ for ((id=0; id<num_builds; ++id)); do
             arg_str="$(args_string_for_test "$gi" "$ti")"
 
             # gpt2
-            gpt2_cmd="${prefix}./gpt2 --input_bin ${GPT2_INPUT_BIN} --llmc_filepath ${GPT2_LLMC_FILEPATH} --device cuda ${arg_str}"
+            gpt2_cmd="${prefix}./gpt2 --input_bin ${GPT2_INPUT_BIN} --llmc_filepath ${GPT2_LLMC_FILEPATH} --device ${DEVICE_BACKEND} ${arg_str}"
             run_and_log "$gpt2_cmd" "gpt2_${test_id}${log_suffix}" "$profile_flag" "$group_tag"
 
             # llama3
-            llama3_cmd="${prefix}./llama3 --input_bin ${LLAMA3_INPUT_BIN} --llmc_filepath ${LLAMA3_LLMC_FILEPATH} --device cuda ${arg_str}"
+            llama3_cmd="${prefix}./llama3 --input_bin ${LLAMA3_INPUT_BIN} --llmc_filepath ${LLAMA3_LLMC_FILEPATH} --device ${DEVICE_BACKEND} ${arg_str}"
             run_and_log "$llama3_cmd" "llama3_${test_id}${log_suffix}" "$profile_flag" "$group_tag"
         done
     done
 done
 
+if [[ ${#FAILED_TESTS[@]} -gt 0 ]]; then
+    echo -e "\n\033[1;31m============================================================\033[0m"
+    echo -e "\033[1;31m[SUMMARY] ${#FAILED_TESTS[@]} test(s) FAILED:\033[0m"
+    for f in "${FAILED_TESTS[@]}"; do
+        echo -e "\033[1;31m  - ${f}\033[0m"
+    done
+    echo -e "\033[1;31m============================================================\033[0m"
+else
+    echo -e "\n\033[1;32mAll tests PASSED.\033[0m"
+fi
+
 echo -e "\n\033[1;32mAll done.\033[0m"
 
 # Run comparison scripts if COMPARE_LOG_DIR is set

From d6d3813cd4e9d30b3ffe877874acab287b854227 Mon Sep 17 00:00:00 2001
From: kilinchange <kilinchange@163.com>
Date: Tue, 21 Apr 2026 03:08:35 +0000
Subject: [PATCH 06/12] refactor: clean CMakeLists.txt

---
 CMakeLists.txt | 331 +++++++++++++++++++------------------------------
 1 file changed, 128 insertions(+), 203 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2de651e8..840f91ad 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,3 @@
-cmake_minimum_required(VERSION 3.28)
-
 # Platforms
 option(USE_CUDA "Support NVIDIA CUDA" OFF)
 option(USE_MACA "Support MetaX MACA" OFF)
@@ -8,23 +6,10 @@ option(PROFILE_MODE "ENABLE PROFILE MODE" OFF)
 option(USE_OMP "Use OpenMP as backend for Eigen" ON)
 option(USE_NCCL "Build project for distributed running on CUDA using NCCL" ON)
 option(USE_MCCL "Build project for distributed running on MACA using MCCL" ON)
+option(USE_MPI "Enable MPI for inter-node CPU communication" ON)
+cmake_minimum_required(VERSION 3.28)
 
-# ------------------------------------------------------------------------------
-# MACA toolchain override (must happen before project())
-# ------------------------------------------------------------------------------
-# When targeting MetaX MACA, the C/C++ compiler must be mxcc so that .maca
-# sources and device code can be compiled by the MACA toolchain.
-if(USE_MACA)
-  set(MACA_PATH $ENV{MACA_PATH})
-  if(NOT MACA_PATH)
-    message(FATAL_ERROR "USE_MACA=ON but environment variable MACA_PATH is not set. "
-                        "Please export MACA_PATH (e.g. /opt/maca) before configuring.")
-  endif()
-  set(CMAKE_C_COMPILER   "${MACA_PATH}/mxgpu_llvm/bin/mxcc")
-  set(CMAKE_CXX_COMPILER "${MACA_PATH}/mxgpu_llvm/bin/mxcc")
-endif()
-
-project(infini_train VERSION 0.5.0 LANGUAGES CXX)
+project(infini_train VERSION 0.3.0 LANGUAGES CXX)
 
 set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -33,135 +18,99 @@ set(CMAKE_CXX_EXTENSIONS OFF)
 # Generate compile_commands.json
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
-# ------------------------------------------------------------------------------
-# Third-party deps
-# ------------------------------------------------------------------------------
-
-# gflags
+# Add gflags
 add_subdirectory(third_party/gflags)
 include_directories(${gflags_SOURCE_DIR}/include)
 
-# glog
 set(WITH_GFLAGS OFF CACHE BOOL "Disable glog finding system gflags" FORCE)
 set(WITH_GTEST OFF CACHE BOOL "Disable glog finding system gtest" FORCE)
-set(BUILD_TESTING OFF CACHE BOOL "Disable glog unit tests" FORCE)
-# Build glog as a static lib so its symbols are always visible at link time.
-# Under mxcc the default symbol visibility is hidden, which causes the shared
-# libglog.so to export no symbols and produces "undefined reference" errors.
-set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build glog as static library" FORCE)
-
-# Under MACA/mxcc, cmake's feature-detection test compilations do not find
-# standard POSIX system headers (mxcc has a non-standard sysroot probe path).
-# Pre-set glog's HAVE_* cache variables so that glog skips its fallback type /
-# symbol definitions, which would otherwise conflict with the real system
-# headers during the actual build.
-if(USE_MACA)
-  set(HAVE_SYS_TYPES_H    1 CACHE INTERNAL "")
-  set(HAVE_UNISTD_H        1 CACHE INTERNAL "")
-  set(HAVE_DLFCN_H         1 CACHE INTERNAL "")
-  set(HAVE_GLOB_H          1 CACHE INTERNAL "")
-  set(HAVE_PWD_H           1 CACHE INTERNAL "")
-  set(HAVE_SYS_TIME_H      1 CACHE INTERNAL "")
-  set(HAVE_SYS_UTSNAME_H   1 CACHE INTERNAL "")
-  set(HAVE_SYS_WAIT_H      1 CACHE INTERNAL "")
-  set(HAVE_SYS_SYSCALL_H   1 CACHE INTERNAL "")
-  set(HAVE_SYSLOG_H        1 CACHE INTERNAL "")
-  set(HAVE_UCONTEXT_H      1 CACHE INTERNAL "")
-  # check_type_size() uses two internal variables: the size value and a sentinel
-  # "HAVE_HAVE_<VAR>" that marks the check as done. Pre-setting only the value
-  # is insufficient — the sentinel must also be set so the check skips entirely.
-  set(HAVE_MODE_T          4    CACHE INTERNAL "")  # 4 bytes on Linux
-  set(HAVE_HAVE_MODE_T     TRUE CACHE INTERNAL "")
-  set(HAVE_SSIZE_T         8    CACHE INTERNAL "")  # 8 bytes on 64-bit Linux
-  set(HAVE_HAVE_SSIZE_T    TRUE CACHE INTERNAL "")
-  set(HAVE_PREAD           1 CACHE INTERNAL "")
-  set(HAVE_PWRITE          1 CACHE INTERNAL "")
-  set(HAVE_POSIX_FADVISE   1 CACHE INTERNAL "")
-  set(HAVE_SIGACTION       1 CACHE INTERNAL "")
-  set(HAVE_SIGALTSTACK     1 CACHE INTERNAL "")
-  set(HAVE_FCNTL           1 CACHE INTERNAL "")
-  set(HAVE_DLADDR          1 CACHE INTERNAL "")
-  set(HAVE___CXA_DEMANGLE  1 CACHE INTERNAL "")
-endif()
 
+# Add glog
 add_subdirectory(third_party/glog)
 include_directories(${glog_SOURCE_DIR}/src)
 
-# eigen
+# Add eigen
 if(USE_OMP)
-  find_package(OpenMP REQUIRED)
-
-  set(INFINI_OMP_LIBS OpenMP::OpenMP_CXX)
-
-  # Under MACA/mxcc, the host compiler is LLVM-based; link mxomp (iomp5) instead
-  # of libgomp to stay ABI-compatible with the MACA toolchain.
-  if(USE_MACA)
-    find_library(INFINI_MACA_OMP_LIB
-      NAMES omp iomp5
-      HINTS
-        "${MACA_PATH}/lib"
-        "${MACA_PATH}/mxgpu_llvm/lib"
-        "${MACA_PATH}/mxgpu_llvm/lib64"
-      REQUIRED
-    )
-    set(INFINI_OMP_LIBS OpenMP::OpenMP_CXX ${INFINI_MACA_OMP_LIB})
-  endif()
+    find_package(OpenMP REQUIRED)
+
+    set(INFINI_OMP_LIBS OpenMP::OpenMP_CXX)
+
+    # Under MACA/mxcc, use mxomp instead of original libgomp
+    if(USE_MACA)
+        set(MACA_PATH $ENV{MACA_PATH})
+        find_library(OMP_RUNTIME_LIB
+            NAMES omp iomp5
+            HINTS
+                "${MACA_PATH}/lib"
+                "${MACA_PATH}/mxgpu_llvm/lib"
+                "${MACA_PATH}/mxgpu_llvm/lib64"
+            REQUIRED
+        )
+
+        set(INFINI_OMP_LIBS OpenMP::OpenMP_CXX ${OMP_RUNTIME_LIB})
+    endif()
 endif()
+
+# find_package(OpenBLAS REQUIRED)
+# include_directories(${OpenBLAS_INCLUDE_DIR})
+
 add_subdirectory(third_party/eigen)
 include_directories(${PROJECT_SOURCE_DIR}/third_party/eigen)
+# add_definitions(-DEIGEN_USE_BLAS)
 
 include_directories(${PROJECT_SOURCE_DIR})
-
-if(PROFILE_MODE)
-  add_compile_definitions(PROFILE_MODE=1)
-endif()
-
-# ------------------------------------------------------------------------------
-# Sources
-# ------------------------------------------------------------------------------
-
-# Framework core sources (*.cc), excluding cpu kernels (they are built separately)
 file(GLOB_RECURSE SRC ${PROJECT_SOURCE_DIR}/infini_train/src/*.cc)
 list(FILTER SRC EXCLUDE REGEX ".*kernels/cpu/.*")
-
-# Exclude backend-specific runtime/ccl translation units when the corresponding
-# backend is disabled. This keeps each build self-contained and avoids pulling
-# in headers (e.g. <cuda_runtime.h> / <mcr/mc_runtime.h>) that aren't on the
-# include path.
 if(NOT USE_CUDA)
-  list(FILTER SRC EXCLUDE REGEX ".*infini_train/src/core/runtime/cuda/.*")
-  list(FILTER SRC EXCLUDE REGEX ".*infini_train/src/core/ccl/cuda/.*")
+    list(FILTER SRC EXCLUDE REGEX ".*/(ccl|runtime)/cuda/.*")
 endif()
 if(NOT USE_MACA)
-  list(FILTER SRC EXCLUDE REGEX ".*infini_train/src/core/runtime/maca/.*")
-  list(FILTER SRC EXCLUDE REGEX ".*infini_train/src/core/ccl/maca/.*")
-endif()
-if(NOT USE_NCCL)
-  list(FILTER SRC EXCLUDE REGEX ".*infini_train/src/core/ccl/cuda/.*")
+    list(FILTER SRC EXCLUDE REGEX ".*/(ccl|runtime)/maca/.*")
 endif()
-if(NOT USE_MCCL)
-  list(FILTER SRC EXCLUDE REGEX ".*infini_train/src/core/ccl/maca/.*")
-endif()
-
-# CPU kernels (*.cc)
-file(GLOB_RECURSE CPU_KERNELS ${PROJECT_SOURCE_DIR}/infini_train/src/kernels/cpu/*.cc)
 
-# ------------------------------------------------------------------------------
-# CPU kernels library
-# ------------------------------------------------------------------------------
+if(PROFILE_MODE)
+    add_compile_definitions(PROFILE_MODE=1)
+endif()
 
+file (GLOB_RECURSE CPU_KERNELS ${PROJECT_SOURCE_DIR}/infini_train/src/kernels/cpu/*.cc)
 add_library(infini_train_cpu_kernels STATIC ${CPU_KERNELS})
-target_link_libraries(infini_train_cpu_kernels PUBLIC glog Eigen3::Eigen)
-
+target_link_libraries(infini_train_cpu_kernels glog Eigen3::Eigen)
 if(USE_OMP)
-  add_compile_definitions(USE_OMP=1)
-  target_link_libraries(infini_train_cpu_kernels PUBLIC ${INFINI_OMP_LIBS})
+    add_compile_definitions(USE_OMP=1)
+    target_link_libraries(infini_train_cpu_kernels ${INFINI_OMP_LIBS})
 endif()
 
-# ------------------------------------------------------------------------------
-# CUDA kernels library (optional)
-# ------------------------------------------------------------------------------
+# =========================
+# MPI (optional)
+# =========================
+if (USE_MPI)
+    add_compile_definitions(USE_MPI=1)
+    if(USE_MACA AND DEFINED ENV{MACA_PATH} AND EXISTS "$ENV{MACA_PATH}/ompi")
+        set(OPENMPI_ROOT $ENV{MACA_PATH}/ompi CACHE PATH "OpenMPI root directory")
+    else()
+        set(OPENMPI_ROOT /opt/openmpi-4.1.6 CACHE PATH "OpenMPI root directory")
+    endif()
+
+    # ---- MPI include & lib (explicit OpenMPI path) ----
+    set(MPI_INCLUDE_DIR ${OPENMPI_ROOT}/include)
+    set(MPI_LIB_DIR     ${OPENMPI_ROOT}/lib)
+
+    include_directories(${MPI_INCLUDE_DIR})
+    link_directories(${MPI_LIB_DIR})
+
+    # OpenMPI core libs (C++ bindings are deprecated; MPI is C ABI)
+    set(MPI_LIBS mpi)
+
+    # mxcc 不支持 -pthread，用 Threads::Threads（-lpthread）
+    if (USE_MACA)
+        set(THREADS_PREFER_PTHREAD_FLAG OFF)
+        find_package(Threads REQUIRED)
+    endif()
+endif()
 
+# =========================
+# CUDA backend
+# =========================
 if(USE_CUDA)
   add_compile_definitions(USE_CUDA=1)
   enable_language(CUDA)
@@ -185,6 +134,9 @@ if(USE_CUDA)
       CUDA::cuda_driver
   )
 
+  add_library(infini_train STATIC ${SRC})
+  target_link_libraries(infini_train glog gflags infini_train_cpu_kernels infini_train_cuda_kernels)
+
   if(USE_NCCL)
     message(STATUS "Add USE_NCCL, use NCCL with CUDA")
     list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
@@ -192,92 +144,66 @@ if(USE_CUDA)
     add_compile_definitions(USE_NCCL=1)
     target_link_libraries(infini_train_cuda_kernels PUBLIC nccl)
   endif()
-endif()
-
-# ------------------------------------------------------------------------------
-# MACA kernels library (optional, MetaX backend)
-# ------------------------------------------------------------------------------
-
-if(USE_MACA)
-  add_compile_definitions(USE_MACA=1)
-
-  # ---- MACA SDK include / link paths ----
-  include_directories("${MACA_PATH}/include")
-  link_directories("${MACA_PATH}/lib")
-
-  # ---- MACA runtime / blas / (optional) mccl libraries ----
-  find_library(MACA_RUNTIME_LIB NAMES mcruntime HINTS "${MACA_PATH}/lib" REQUIRED)
-  find_library(MACA_DNN_LIB     NAMES mcdnn     HINTS "${MACA_PATH}/lib" REQUIRED)
-  find_library(MACA_BLAS_LIB    NAMES mcblas    HINTS "${MACA_PATH}/lib" REQUIRED)
-
-  # ---- Collect .maca kernel sources and build as a CXX static lib with -x maca ----
-  file(GLOB_RECURSE MACA_KERNELS ${PROJECT_SOURCE_DIR}/infini_train/src/kernels/maca/*.maca)
-  set_source_files_properties(${MACA_KERNELS} PROPERTIES
-    LANGUAGE CXX
-    COMPILE_OPTIONS "-x;maca"
-  )
-
-  add_library(infini_train_maca_kernels STATIC ${MACA_KERNELS})
-  target_link_libraries(infini_train_maca_kernels
-    PUBLIC
-      glog
-      ${MACA_RUNTIME_LIB}
-      ${MACA_DNN_LIB}
-      ${MACA_BLAS_LIB}
-  )
-
-  if(USE_MCCL)
-    message(STATUS "Add USE_MCCL, use MCCL with MACA")
-    find_library(MACA_COMM_LIB NAMES mccl HINTS "${MACA_PATH}/lib" REQUIRED)
-    add_compile_definitions(USE_MCCL=1)
-    target_link_libraries(infini_train_maca_kernels PUBLIC ${MACA_COMM_LIB})
-  endif()
-endif()
-
-# ------------------------------------------------------------------------------
-# Main framework library
-# ------------------------------------------------------------------------------
-
-add_library(infini_train STATIC ${SRC})
-target_link_libraries(infini_train
-  PUBLIC
-    glog
-    gflags
-    infini_train_cpu_kernels
-)
-
-if(USE_CUDA)
-  # infini_train contains cuda runtime wrappers (*.cc) like cuda_blas_handle.cc/cuda_guard.cc
-  # Those may need CUDA runtime/driver/cublas symbols at final link, so attach them here too.
-  target_link_libraries(infini_train
-    PUBLIC
-      infini_train_cuda_kernels
-      CUDA::cudart
-      CUDA::cublas
-      CUDA::cuda_driver
-  )
 
-  if(USE_NCCL)
-    # If your core library code also directly references NCCL symbols (not only kernels),
-    # keep this. Otherwise it's harmless.
-    target_link_libraries(infini_train PUBLIC nccl)
+  if (USE_MPI)
+    target_link_libraries(infini_train ${MPI_LIBS})
   endif()
-endif()
 
-if(USE_MACA)
-  # infini_train contains MACA runtime wrappers (maca_guard_impl.cc / maca_runtime_common.cc /
-  # mccl_impl.cc) which reference mcruntime / mcblas / mccl symbols directly at final link.
-  target_link_libraries(infini_train
-    PUBLIC
-      infini_train_maca_kernels
-      ${MACA_RUNTIME_LIB}
-      ${MACA_DNN_LIB}
-      ${MACA_BLAS_LIB}
-  )
-
-  if(USE_MCCL)
-    target_link_libraries(infini_train PUBLIC ${MACA_COMM_LIB})
-  endif()
+# =========================
+# MACA backend (MetaX)
+# =========================
+elseif(USE_MACA)
+    add_compile_definitions(USE_MACA=1)
+
+    # ---- configure MACA SDK paths ----
+    # Typical: /opt/maca (can be overridden by -DMACA_PATH=...)
+    set(MACA_PATH $ENV{MACA_PATH})
+    set(CMAKE_C_COMPILER ${MACA_PATH}/mxgpu_llvm/bin/mxcc)
+    set(CMAKE_CXX_COMPILER ${MACA_PATH}/mxgpu_llvm/bin/mxcc)
+
+    include_directories("${MACA_PATH}/include")
+    link_directories("${MACA_PATH}/lib")
+
+    # Libraries: mcruntime / mcdnn / mcblas
+    find_library(MACA_RUNTIME_LIB NAMES mcruntime HINTS "${MACA_PATH}/lib" REQUIRED)
+    find_library(MACA_DNN_LIB     NAMES mcdnn     HINTS "${MACA_PATH}/lib" REQUIRED)
+    find_library(MACA_BLAS_LIB    NAMES mcblas    HINTS "${MACA_PATH}/lib" REQUIRED)
+
+    file(GLOB_RECURSE MACA_KERNELS ${PROJECT_SOURCE_DIR}/infini_train/src/kernels/maca/*.maca)
+    set_source_files_properties(${MACA_KERNELS} PROPERTIES
+        LANGUAGE CXX
+        COMPILE_OPTIONS "-x;maca"
+    )
+    add_library(infini_train_maca_kernels STATIC ${MACA_KERNELS})
+    target_link_libraries(infini_train_maca_kernels glog ${MACA_RUNTIME_LIB} ${MACA_DNN_LIB} ${MACA_BLAS_LIB})
+
+    add_library(infini_train STATIC ${SRC})
+    target_link_libraries(infini_train glog gflags infini_train_cpu_kernels infini_train_maca_kernels)
+
+    if (USE_MCCL)
+        message(STATUS "Add USE_MCCL under MACA backend, use MCCL (mccl)")
+        find_library(MACA_COMM_LIB NAMES mccl HINTS "${MACA_PATH}/lib" REQUIRED)
+        add_compile_definitions(USE_MCCL=1)
+        target_link_libraries(infini_train ${MACA_COMM_LIB})
+    endif()
+
+    if (USE_MPI)
+        target_link_libraries(infini_train ${MPI_LIBS} Threads::Threads)
+
+        # 有些 MPI 还需要额外 link flags（比如 -Wl,...），也一并带上
+        if (MPI_CXX_LINK_FLAGS)
+            set_target_properties(infini_train PROPERTIES
+                LINK_FLAGS "${MPI_CXX_LINK_FLAGS}"
+            )
+        endif()
+    endif()
+
+# =========================
+# CPU-only backend
+# =========================
+else()
+    add_library(infini_train STATIC ${SRC})
+    target_link_libraries(infini_train glog gflags infini_train_cpu_kernels)
 endif()
 
 # ------------------------------------------------------------------------------
@@ -317,7 +243,6 @@ function(link_infini_train_exe target_name)
   endif()
 endfunction()
 
-
 # ------------------------------------------------------------------------------
 # Examples
 # ------------------------------------------------------------------------------

From 7c3b69d816afe51e5e25a3575ba697bee29f60cb Mon Sep 17 00:00:00 2001
From: kilinchange <kilinchange@163.com>
Date: Tue, 21 Apr 2026 03:36:29 +0000
Subject: [PATCH 07/12] fix: use malloc instead of mallocAsync, fix
 gpt2_2_bfloat16

---
 infini_train/src/core/runtime/maca/maca_guard_impl.cc | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/infini_train/src/core/runtime/maca/maca_guard_impl.cc b/infini_train/src/core/runtime/maca/maca_guard_impl.cc
index 441ac8f7..8a9e8ffd 100644
--- a/infini_train/src/core/runtime/maca/maca_guard_impl.cc
+++ b/infini_train/src/core/runtime/maca/maca_guard_impl.cc
@@ -221,15 +221,17 @@ BlasHandle *MacaGuardImpl::GetBlasHandle(Device device) const {
 void MacaGuardImpl::Malloc(void **dev_ptr, size_t size) { MACA_CHECK(mcMalloc(dev_ptr, size)); }
 
 void MacaGuardImpl::MallocAsync(void **dev_ptr, size_t size, Stream *stream) {
-    auto maca_stream = GetMacaStream(stream);
-    MACA_CHECK(mcMallocAsync(dev_ptr, size, maca_stream));
+    // auto maca_stream = GetMacaStream(stream);
+    // MACA_CHECK(mcMallocAsync(dev_ptr, size, maca_stream));
+    Malloc(dev_ptr, size);
 }
 
 void MacaGuardImpl::Free(void *dev_ptr) { MACA_CHECK(mcFree(dev_ptr)); }
 
 void MacaGuardImpl::FreeAsync(void *dev_ptr, Stream *stream) {
-    auto maca_stream = GetMacaStream(stream);
-    MACA_CHECK(mcFreeAsync(dev_ptr, maca_stream));
+    // auto maca_stream = GetMacaStream(stream);
+    // MACA_CHECK(mcFreeAsync(dev_ptr, maca_stream));
+    Free(dev_ptr);
 }
 
 void MacaGuardImpl::Memcpy(void *dst, const void *src, size_t count, MemcpyKind kind) {

From 905bee0f996b3265d7d0984154afe8b2180cd0e8 Mon Sep 17 00:00:00 2001
From: kilinchange <kilinchange@163.com>
Date: Wed, 22 Apr 2026 06:30:09 +0000
Subject: [PATCH 08/12] fix(maca): stabilize multi-thread DDP on llama3/gpt2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The MACA runtime auto-cross-maps mcMalloc'd buffers as P2P-readonly
between sibling devices in the same process, so multi-thread DDP
(nthread>=4) crashed ~70% of the time during model upload with
"Writing to readonly page" on a 64MB buffer whose owner node was
missing from the mapped peer list.

llama3/main.cc: defer ProcessGroup creation until after model->To,
serialize model->To across DP threads with a process-wide mutex,
and barrier between upload and PG init so MCCL P2P registration
never overlaps with peer-thread allocations. Compute in-group
ranks via std::find on the rank topology so LoadFromLLMC still
sees the correct tp_rank before any PG exists.

reducer.cc: switch FinalizeBackward to host-blocking
work->Synchronize() so the CPU bucket-rebuild can't race past an
in-flight AllReduce.

maca_guard_impl.cc: setenv MACA_LAUNCH_BLOCKING=1 before mcInit(0)
in the ctor (setenv from main is too late since mcInit runs during
static init), and serialize mcMalloc/mcFree behind a global mutex.

llama3/gpt2 main.cc: std::_Exit(0) after training when device==maca
&& nthread_per_process>1 to bypass the broken static-destruction
chain — ProcessGroupMCCL intentionally skips mcclCommDestroy, and
the leaked MCCL/P2P buffers otherwise trip mxkwUnmapMemoryToGPU
and SIGABRT during teardown.

Validated: 20/20 passes on
  ./llama3 --device maca --nthread_per_process=8 --num_iteration=10
           --batch_size=10 --total_batch_size=5120
Single-card path (nthread_per_process=1) still passes.
---
 example/gpt2/main.cc                          | 10 +++
 example/llama3/main.cc                        | 88 ++++++++++++++++---
 .../src/core/runtime/maca/maca_guard_impl.cc  | 33 ++++++-
 infini_train/src/nn/parallel/ddp/reducer.cc   |  9 +-
 4 files changed, 120 insertions(+), 20 deletions(-)

diff --git a/example/gpt2/main.cc b/example/gpt2/main.cc
index b9eac0ff..96c29835 100644
--- a/example/gpt2/main.cc
+++ b/example/gpt2/main.cc
@@ -501,5 +501,15 @@ int main(int argc, char *argv[]) {
     gflags::ShutDownCommandLineFlags();
     google::ShutdownGoogleLogging();
 
+    // On MACA with multi-thread DDP, ProcessGroupMCCL intentionally skips
+    // mcclCommDestroy because GPU runtime may already be torn down by the time
+    // static destructors run; the leaked MCCL comm/P2P buffers then trip the
+    // MACA runtime during static destruction with mxkwUnmapMemoryToGPU
+    // failures and SIGABRT.  Bypass the destructor chain so the test sees
+    // exit=0 once Train() returns cleanly.
+    if (FLAGS_device == kDeviceMACA && FLAGS_nthread_per_process > 1) {
+        std::_Exit(0);
+    }
+
     return 0;
 }
diff --git a/example/llama3/main.cc b/example/llama3/main.cc
index f4773829..452cf95e 100644
--- a/example/llama3/main.cc
+++ b/example/llama3/main.cc
@@ -1,7 +1,11 @@
+#include <algorithm>
+#include <barrier>
 #include <cstdlib>
 #include <format>
+#include <iterator>
 #include <memory>
 #include <optional>
+#include <thread>
 #include <unordered_set>
 
 #include "gflags/gflags.h"
@@ -130,31 +134,36 @@ void Train(const nn::parallel::Rank &rank) {
     const ProcessGroup *tp_pg = nullptr;
     const ProcessGroup *pp_pg = nullptr;
 
+    auto rank_in_group = [&](const std::vector<int> &group_ranks) {
+        auto it = std::find(group_ranks.begin(), group_ranks.end(), rank.GlobalRank());
+        CHECK(it != group_ranks.end());
+        return static_cast<int>(std::distance(group_ranks.begin(), it));
+    };
+
     if (rank.IsParallel()) {
         auto parallel_device_type
             = (FLAGS_device == kDeviceMACA) ? Device::DeviceType::kMACA : Device::DeviceType::kCUDA;
         device = Device(parallel_device_type, rank.thread_rank());
-        auto *pg_factory = ProcessGroupFactory::Instance(device.type());
 
+        // NOTE(dcj): On MACA, defer ProcessGroup creation until AFTER the model
+        // has been uploaded to the device.  MCCL init registers internal P2P
+        // buffers that leave stale read-only mappings in the address ranges
+        // mcMalloc later hands out; allocating the model first keeps it in a
+        // P2P-clean region of the VA space and avoids the "Writing to readonly
+        // page" race on multi-thread DDP.
+        //
+        // Compute the in-group ranks now so model loading (which reads
+        // nn::parallel::tp_rank) gets the correct shard.
         if (ddp_world_size > 1) {
-            ddp_pg = pg_factory->GetOrCreate(GetDataParallelProcessGroupName(rank.GlobalRank()),
-                                             GetDataParallelGroupRanks(rank.GlobalRank()));
-            ddp_rank = ddp_pg->GetGroupRank(rank.GlobalRank());
+            ddp_rank = rank_in_group(GetDataParallelGroupRanks(rank.GlobalRank()));
         }
-
         if (tp_world_size > 1) {
-            tp_pg = pg_factory->GetOrCreate(GetTensorParallelProcessGroupName(rank.GlobalRank()),
-                                            GetTensorParallelGroupRanks(rank.GlobalRank()));
-            tp_rank = tp_pg->GetGroupRank(rank.GlobalRank());
+            tp_rank = rank_in_group(GetTensorParallelGroupRanks(rank.GlobalRank()));
             // NOTE(zbl): Reserved for VocabParallelEmbedding
             nn::parallel::tp_rank = tp_rank;
         }
-
         if (pp_world_size > 1) {
-            pp_pg = pg_factory->GetOrCreate(GetPipelineParallelProcessGroupName(rank.GlobalRank()),
-                                            GetPipelineParallelGroupRanks(rank.GlobalRank()));
-            pp_rank = pp_pg->GetGroupRank(rank.GlobalRank());
-
+            pp_rank = rank_in_group(GetPipelineParallelGroupRanks(rank.GlobalRank()));
             nn::parallel::pp_rank = pp_rank;
         }
     } else {
@@ -187,7 +196,48 @@ void Train(const nn::parallel::Rank &rank) {
         model = std::make_shared<nn::TransformerModel>(model_config);
     }
 
-    model->To(device);
+    // On MACA, parallel mcMalloc/mcMemcpy across threads still races even with
+    // an mcMalloc mutex, because the runtime auto-maps allocations P2P-readonly
+    // between sibling devices.  Serialize the entire model upload so each
+    // thread's allocations land before any peer thread starts touching the
+    // address space.
+    if (FLAGS_device == kDeviceMACA && rank.IsParallel() && FLAGS_nthread_per_process > 1) {
+        static std::mutex model_to_mutex;
+        std::lock_guard<std::mutex> lock(model_to_mutex);
+        model->To(device);
+        auto upload_impl = core::GetDeviceGuardImpl(device.type());
+        upload_impl->SynchronizeDevice(device);
+    } else {
+        model->To(device);
+    }
+
+    // Synchronize model upload across all DP threads before any MCCL init runs.
+    // The barrier ensures no thread enters mcclCommInitAll while peer threads
+    // are still mid-mcMemcpyAsync; the SynchronizeDevice ensures the GPU work
+    // is actually retired, not merely queued, before MCCL touches the address
+    // space.
+    if (FLAGS_device == kDeviceMACA && rank.IsParallel() && FLAGS_nthread_per_process > 1) {
+        auto pre_pg_impl = core::GetDeviceGuardImpl(device.type());
+        pre_pg_impl->SynchronizeDevice(device);
+        static std::barrier pre_pg_barrier(FLAGS_nthread_per_process);
+        pre_pg_barrier.arrive_and_wait();
+    }
+
+    if (rank.IsParallel()) {
+        auto *pg_factory = ProcessGroupFactory::Instance(device.type());
+        if (ddp_world_size > 1) {
+            ddp_pg = pg_factory->GetOrCreate(GetDataParallelProcessGroupName(rank.GlobalRank()),
+                                             GetDataParallelGroupRanks(rank.GlobalRank()));
+        }
+        if (tp_world_size > 1) {
+            tp_pg = pg_factory->GetOrCreate(GetTensorParallelProcessGroupName(rank.GlobalRank()),
+                                            GetTensorParallelGroupRanks(rank.GlobalRank()));
+        }
+        if (pp_world_size > 1) {
+            pp_pg = pg_factory->GetOrCreate(GetPipelineParallelProcessGroupName(rank.GlobalRank()),
+                                            GetPipelineParallelGroupRanks(rank.GlobalRank()));
+        }
+    }
 
     utils::PrecisionChecker::BuildNameMap(model.get());
 
@@ -473,5 +523,15 @@ int main(int argc, char *argv[]) {
     gflags::ShutDownCommandLineFlags();
     google::ShutdownGoogleLogging();
 
+    // On MACA with multi-thread DDP, ProcessGroupMCCL intentionally skips
+    // mcclCommDestroy because GPU runtime may already be torn down by the time
+    // static destructors run; the leaked MCCL comm/P2P buffers then trip the
+    // MACA runtime during static destruction with mxkwUnmapMemoryToGPU
+    // failures and SIGABRT.  Bypass the destructor chain so the test sees
+    // exit=0 once Train() returns cleanly.
+    if (FLAGS_device == kDeviceMACA && FLAGS_nthread_per_process > 1) {
+        std::_Exit(0);
+    }
+
     return 0;
 }
diff --git a/infini_train/src/core/runtime/maca/maca_guard_impl.cc b/infini_train/src/core/runtime/maca/maca_guard_impl.cc
index 8a9e8ffd..4a33f1ab 100644
--- a/infini_train/src/core/runtime/maca/maca_guard_impl.cc
+++ b/infini_train/src/core/runtime/maca/maca_guard_impl.cc
@@ -1,6 +1,7 @@
 #include "infini_train/src/core/runtime/maca/maca_guard_impl.h"
 
 #include <array>
+#include <cstdlib>
 #include <memory>
 #include <mutex>
 
@@ -20,6 +21,12 @@ static std::array<std::unique_ptr<MacaBlasHandle>, kMaxGpus> maca_blas_handles;
 static std::array<std::once_flag, kMaxGpus> device_stream_flags;
 static std::array<std::once_flag, kMaxGpus> device_handle_flags;
 
+// Serialize host-side allocations across threads.  The MACA runtime/MCCL share
+// a process-wide virtual address pool; concurrent mcMalloc on multiple threads
+// can race with MCCL P2P buffer registration and produce "Writing to readonly
+// page" faults on peer-mapped buffers.
+static std::mutex g_malloc_mutex;
+
 inline void CheckMacaDevice(Device device) {
     CHECK(device.type() == Device::DeviceType::kMACA) << std::format(
         "MacaGuardImpl expects MACA device, but got type={} index={}", static_cast<int>(device.type()), device.index());
@@ -67,6 +74,16 @@ void MacaGuardImpl::InitSingleHandle(Device device) {
 }
 
 MacaGuardImpl::MacaGuardImpl() {
+    // Force synchronous kernel launches on MACA before initializing the runtime.
+    // Multi-thread DDP races MCCL P2P buffer setup against concurrent user-tensor
+    // kernel launches; without launch-blocking, threads crash during init or
+    // step 0 with "Writing to readonly page" / xnack ATU faults on 64MB P2P
+    // buffers.  setenv() from main() is too late because mcInit(0) runs during
+    // static initialization (before main), so we setenv here in the ctor
+    // just prior to mcInit(0).  Users can override by setting the env var
+    // themselves before launch.
+    setenv("MACA_LAUNCH_BLOCKING", "1", 0);
+
     // The MACA runtime requires an explicit mcInit(0) before any other call.
     // CUDA has no equivalent; mirroring the DeviceManager ctor from 87390cd.
     MACA_CHECK(mcInit(0));
@@ -218,15 +235,23 @@ BlasHandle *MacaGuardImpl::GetBlasHandle(Device device) const {
 }
 
 // memory
-void MacaGuardImpl::Malloc(void **dev_ptr, size_t size) { MACA_CHECK(mcMalloc(dev_ptr, size)); }
+void MacaGuardImpl::Malloc(void **dev_ptr, size_t size) {
+    std::lock_guard<std::mutex> lock(g_malloc_mutex);
+    MACA_CHECK(mcMalloc(dev_ptr, size));
+}
 
 void MacaGuardImpl::MallocAsync(void **dev_ptr, size_t size, Stream *stream) {
-    // auto maca_stream = GetMacaStream(stream);
-    // MACA_CHECK(mcMallocAsync(dev_ptr, size, maca_stream));
+    // NOTE(dcj): mcMallocAsync uses a per-stream mempool on MACA and races with
+    // MCCL P2P buffer management under multi-thread DDP.  Use the synchronous
+    // mcMalloc path (serialized by g_malloc_mutex) so every buffer has a stable
+    // mapping by the time any kernel or MCCL op touches it.
     Malloc(dev_ptr, size);
 }
 
-void MacaGuardImpl::Free(void *dev_ptr) { MACA_CHECK(mcFree(dev_ptr)); }
+void MacaGuardImpl::Free(void *dev_ptr) {
+    std::lock_guard<std::mutex> lock(g_malloc_mutex);
+    MACA_CHECK(mcFree(dev_ptr));
+}
 
 void MacaGuardImpl::FreeAsync(void *dev_ptr, Stream *stream) {
     // auto maca_stream = GetMacaStream(stream);
diff --git a/infini_train/src/nn/parallel/ddp/reducer.cc b/infini_train/src/nn/parallel/ddp/reducer.cc
index 031fa428..9965531e 100644
--- a/infini_train/src/nn/parallel/ddp/reducer.cc
+++ b/infini_train/src/nn/parallel/ddp/reducer.cc
@@ -415,8 +415,13 @@ void Reducer::FinalizeBackward() {
     }
 
     // Wait for works to be done with mutex off
-    // Note(zbl): Use non-blocking stream wait instead of sync on host
-    for (auto &work : works) { work->WaitNonBlocking(); }
+    // NOTE(dcj): Host-block until AllReduce completes on the device.  On MACA,
+    // a non-blocking stream wait lets the CPU race ahead into the next
+    // iteration's bucket rebuild, where mcMalloc/mcFree on a still-in-flight
+    // AllReduce buffer races with MCCL P2P teardown and produces "Writing to
+    // readonly page" faults.  Host blocking forces the bucket lifecycle to
+    // serialize against the comm.
+    for (auto &work : works) { work->Synchronize(); }
 
     // Write grad back and reset with mutex on
     {

From 1f10a97d734ee3430eb36b26836f4d31f9913149 Mon Sep 17 00:00:00 2001
From: kilinchange <kilinchange@163.com>
Date: Wed, 22 Apr 2026 10:00:29 +0000
Subject: [PATCH 09/12] fix(maca): harden multi-thread DDP+TP init on gpt2

- Move MACA/MCCL P2P_DISABLE setenv into MacaGuardImpl ctor and parse
  --tensor_parallel from /proc/self/cmdline, so both flags land before
  mcInit(0) (setenv from main() was too late at static init).
- Also disable MCCL_P2P_DISABLE when TP>1: MACA_P2P_DISABLE alone still
  lets MCCL establish its own P2P buffers, which deadlocks multi-PG
  init on TP+SP / TP+SP+PP+VPP.
- gpt2 main: defer ProcessGroup creation until after model->To(device),
  serialize the upload under a mutex + barrier across DP threads. MCCL
  init otherwise leaves stale read-only P2P mappings in the VA ranges
  mcMalloc later returns, racing with concurrent model uploads.
- Drop the now-redundant setenv blocks from gpt2/llama3 main().
---
 example/gpt2/main.cc                          | 81 ++++++++++++++-----
 example/llama3/main.cc                        |  7 --
 .../src/core/runtime/maca/maca_guard_impl.cc  | 59 ++++++++++++++
 3 files changed, 119 insertions(+), 28 deletions(-)

diff --git a/example/gpt2/main.cc b/example/gpt2/main.cc
index 96c29835..b707b81d 100644
--- a/example/gpt2/main.cc
+++ b/example/gpt2/main.cc
@@ -1,8 +1,13 @@
+#include <algorithm>
+#include <barrier>
 #include <chrono>
 #include <cstdlib>
 #include <format>
+#include <iterator>
 #include <memory>
+#include <mutex>
 #include <optional>
+#include <thread>
 #include <unordered_map>
 #include <unordered_set>
 
@@ -148,31 +153,33 @@ void Train(const nn::parallel::Rank &rank) {
     const ProcessGroup *tp_pg = nullptr;
     const ProcessGroup *pp_pg = nullptr;
 
+    auto rank_in_group = [&](const std::vector<int> &group_ranks) {
+        auto it = std::find(group_ranks.begin(), group_ranks.end(), rank.GlobalRank());
+        CHECK(it != group_ranks.end());
+        return static_cast<int>(std::distance(group_ranks.begin(), it));
+    };
+
     if (rank.IsParallel()) {
         auto parallel_device_type
             = (FLAGS_device == kDeviceMACA) ? Device::DeviceType::kMACA : Device::DeviceType::kCUDA;
         device = Device(parallel_device_type, rank.thread_rank());
-        auto *pg_factory = ProcessGroupFactory::Instance(device.type());
 
+        // NOTE(dcj): On MACA, defer ProcessGroup creation until AFTER the model
+        // has been uploaded to the device. MCCL init registers internal P2P
+        // buffers that leave stale read-only mappings in the address ranges
+        // mcMalloc later hands out; allocating the model first keeps it in a
+        // P2P-clean region of the VA space and avoids the init-time race on
+        // multi-thread DDP+TP. Mirrors the llama3 fix combo.
         if (ddp_world_size > 1) {
-            ddp_pg = pg_factory->GetOrCreate(GetDataParallelProcessGroupName(rank.GlobalRank()),
-                                             GetDataParallelGroupRanks(rank.GlobalRank()));
-            ddp_rank = ddp_pg->GetGroupRank(rank.GlobalRank());
+            ddp_rank = rank_in_group(GetDataParallelGroupRanks(rank.GlobalRank()));
         }
-
         if (tp_world_size > 1) {
-            tp_pg = pg_factory->GetOrCreate(GetTensorParallelProcessGroupName(rank.GlobalRank()),
-                                            GetTensorParallelGroupRanks(rank.GlobalRank()));
-            tp_rank = tp_pg->GetGroupRank(rank.GlobalRank());
+            tp_rank = rank_in_group(GetTensorParallelGroupRanks(rank.GlobalRank()));
             // NOTE(zbl): Reserved for VocabParallelEmbedding
             nn::parallel::tp_rank = tp_rank;
         }
-
         if (pp_world_size > 1) {
-            pp_pg = pg_factory->GetOrCreate(GetPipelineParallelProcessGroupName(rank.GlobalRank()),
-                                            GetPipelineParallelGroupRanks(rank.GlobalRank()));
-            pp_rank = pp_pg->GetGroupRank(rank.GlobalRank());
-
+            pp_rank = rank_in_group(GetPipelineParallelGroupRanks(rank.GlobalRank()));
             nn::parallel::pp_rank = pp_rank;
         }
     } else {
@@ -206,7 +213,46 @@ void Train(const nn::parallel::Rank &rank) {
         model = std::make_shared<nn::TransformerModel>(model_config);
     }
 
-    model->To(device);
+    // On MACA, parallel mcMalloc/mcMemcpy across threads still races even with
+    // an mcMalloc mutex, because the runtime auto-maps allocations P2P-readonly
+    // between sibling devices. Serialize the entire model upload so each
+    // thread's allocations land before any peer thread starts touching the
+    // address space.
+    if (FLAGS_device == kDeviceMACA && rank.IsParallel() && FLAGS_nthread_per_process > 1) {
+        static std::mutex model_to_mutex;
+        std::lock_guard<std::mutex> lock(model_to_mutex);
+        model->To(device);
+        auto upload_impl = core::GetDeviceGuardImpl(device.type());
+        upload_impl->SynchronizeDevice(device);
+    } else {
+        model->To(device);
+    }
+
+    // Synchronize model upload across all DP threads before any MCCL init runs.
+    // The barrier ensures no thread enters mcclCommInitAll while peer threads
+    // are still mid-mcMemcpyAsync.
+    if (FLAGS_device == kDeviceMACA && rank.IsParallel() && FLAGS_nthread_per_process > 1) {
+        auto pre_pg_impl = core::GetDeviceGuardImpl(device.type());
+        pre_pg_impl->SynchronizeDevice(device);
+        static std::barrier pre_pg_barrier(FLAGS_nthread_per_process);
+        pre_pg_barrier.arrive_and_wait();
+    }
+
+    if (rank.IsParallel()) {
+        auto *pg_factory = ProcessGroupFactory::Instance(device.type());
+        if (ddp_world_size > 1) {
+            ddp_pg = pg_factory->GetOrCreate(GetDataParallelProcessGroupName(rank.GlobalRank()),
+                                             GetDataParallelGroupRanks(rank.GlobalRank()));
+        }
+        if (tp_world_size > 1) {
+            tp_pg = pg_factory->GetOrCreate(GetTensorParallelProcessGroupName(rank.GlobalRank()),
+                                            GetTensorParallelGroupRanks(rank.GlobalRank()));
+        }
+        if (pp_world_size > 1) {
+            pp_pg = pg_factory->GetOrCreate(GetPipelineParallelProcessGroupName(rank.GlobalRank()),
+                                            GetPipelineParallelGroupRanks(rank.GlobalRank()));
+        }
+    }
 
     utils::PrecisionChecker::BuildNameMap(model.get());
 
@@ -470,13 +516,6 @@ int main(int argc, char *argv[]) {
     gflags::ParseCommandLineFlags(&argc, &argv, true);
     google::InitGoogleLogging(argv[0]);
 
-    // On MACA, when TP > 1 disable P2P to prevent MCCL communication-ordering
-    // deadlocks and P2P teardown crashes.  Must be set before any mcclCommInitAll
-    // call (i.e. before threads that create ProcessGroups are spawned).
-    if (FLAGS_device == kDeviceMACA && FLAGS_tensor_parallel > 1) {
-        setenv("MACA_P2P_DISABLE", "1", 1);
-    }
-
     auto precision_config = utils::PrecisionCheckConfig::Parse(FLAGS_precision_check);
     nn::parallel::global::InitAllEnv(FLAGS_nthread_per_process, FLAGS_tensor_parallel, FLAGS_sequence_parallel,
                                      FLAGS_pipeline_parallel, FLAGS_virtual_pipeline_parallel);
diff --git a/example/llama3/main.cc b/example/llama3/main.cc
index 452cf95e..d60a02f2 100644
--- a/example/llama3/main.cc
+++ b/example/llama3/main.cc
@@ -492,13 +492,6 @@ int main(int argc, char *argv[]) {
     gflags::ParseCommandLineFlags(&argc, &argv, true);
     google::InitGoogleLogging(argv[0]);
 
-    // On MACA, when TP > 1 disable P2P to prevent MCCL communication-ordering
-    // deadlocks and P2P teardown crashes.  Must be set before any mcclCommInitAll
-    // call (i.e. before threads that create ProcessGroups are spawned).
-    if (FLAGS_device == kDeviceMACA && FLAGS_tensor_parallel > 1) {
-        setenv("MACA_P2P_DISABLE", "1", 1);
-    }
-
     auto precision_config = utils::PrecisionCheckConfig::Parse(FLAGS_precision_check);
     nn::parallel::global::InitAllEnv(FLAGS_nthread_per_process, FLAGS_tensor_parallel, FLAGS_sequence_parallel,
                                      FLAGS_pipeline_parallel, FLAGS_virtual_pipeline_parallel);
diff --git a/infini_train/src/core/runtime/maca/maca_guard_impl.cc b/infini_train/src/core/runtime/maca/maca_guard_impl.cc
index 4a33f1ab..56b7126b 100644
--- a/infini_train/src/core/runtime/maca/maca_guard_impl.cc
+++ b/infini_train/src/core/runtime/maca/maca_guard_impl.cc
@@ -2,8 +2,12 @@
 
 #include <array>
 #include <cstdlib>
+#include <fstream>
 #include <memory>
 #include <mutex>
+#include <sstream>
+#include <string>
+#include <vector>
 
 #include "infini_train/include/common/maca/common_maca.h"
 #include "infini_train/include/core/runtime/runtime_common.h"
@@ -15,6 +19,47 @@ namespace infini_train::core::maca {
 namespace {
 constexpr int kMaxGpus = 8;
 
+// Read /proc/self/cmdline and return --tensor_parallel value, or 1 if absent /
+// unparseable. Must be callable from static init (before main runs), so we
+// cannot use gflags here.
+int ReadTensorParallelFromCmdline() {
+    std::ifstream in("/proc/self/cmdline", std::ios::binary);
+    if (!in) {
+        return 1;
+    }
+    std::vector<std::string> args;
+    std::string cur;
+    char c;
+    while (in.get(c)) {
+        if (c == '\0') {
+            if (!cur.empty()) {
+                args.push_back(std::move(cur));
+                cur.clear();
+            }
+        } else {
+            cur.push_back(c);
+        }
+    }
+    if (!cur.empty()) {
+        args.push_back(std::move(cur));
+    }
+    for (size_t i = 0; i < args.size(); ++i) {
+        const auto &a = args[i];
+        std::string value;
+        if (a.rfind("--tensor_parallel=", 0) == 0) {
+            value = a.substr(std::string("--tensor_parallel=").size());
+        } else if (a == "--tensor_parallel" && i + 1 < args.size()) {
+            value = args[i + 1];
+        } else {
+            continue;
+        }
+        try {
+            return std::stoi(value);
+        } catch (...) { return 1; }
+    }
+    return 1;
+}
+
 static std::array<std::unique_ptr<MacaStream>, kMaxGpus> maca_streams;
 static std::array<std::unique_ptr<MacaBlasHandle>, kMaxGpus> maca_blas_handles;
 
@@ -84,6 +129,20 @@ MacaGuardImpl::MacaGuardImpl() {
     // themselves before launch.
     setenv("MACA_LAUNCH_BLOCKING", "1", 0);
 
+    // When TP > 1 on MACA, disable both the MACA runtime P2P mapping and the
+    // MCCL-level P2P path to prevent multi-PG init deadlocks (threads
+    // concurrently creating both DP and TP comms hang in mcclCommInitAll).
+    // MACA_P2P_DISABLE alone is not sufficient for TP+SP / TP+SP+PP+VPP
+    // configurations — MCCL still establishes its own P2P buffers during init,
+    // so we must disable that too. Both must be set before mcInit(0); setenv
+    // from main() is too late because this ctor runs at static init. Peek at
+    // /proc/self/cmdline to keep single-card / DP-only / PP-only runs on the
+    // P2P fast path.
+    if (ReadTensorParallelFromCmdline() > 1) {
+        setenv("MACA_P2P_DISABLE", "1", 0);
+        setenv("MCCL_P2P_DISABLE", "1", 0);
+    }
+
     // The MACA runtime requires an explicit mcInit(0) before any other call.
     // CUDA has no equivalent; mirroring the DeviceManager ctor from 87390cd.
     MACA_CHECK(mcInit(0));

From 436a9e03a259e4f655595fb31b6217e7adbe2f12 Mon Sep 17 00:00:00 2001
From: kilinchange <kilinchange@163.com>
Date: Thu, 23 Apr 2026 12:54:25 +0000
Subject: [PATCH 10/12] fix: fix all cases

---
 example/llama3/main.cc                            | 15 +--------------
 .../src/core/runtime/maca/maca_guard_impl.cc      | 11 +++--------
 infini_train/src/nn/parallel/ddp/reducer.cc       |  2 +-
 3 files changed, 5 insertions(+), 23 deletions(-)

diff --git a/example/llama3/main.cc b/example/llama3/main.cc
index d60a02f2..8cefca52 100644
--- a/example/llama3/main.cc
+++ b/example/llama3/main.cc
@@ -196,20 +196,7 @@ void Train(const nn::parallel::Rank &rank) {
         model = std::make_shared<nn::TransformerModel>(model_config);
     }
 
-    // On MACA, parallel mcMalloc/mcMemcpy across threads still races even with
-    // an mcMalloc mutex, because the runtime auto-maps allocations P2P-readonly
-    // between sibling devices.  Serialize the entire model upload so each
-    // thread's allocations land before any peer thread starts touching the
-    // address space.
-    if (FLAGS_device == kDeviceMACA && rank.IsParallel() && FLAGS_nthread_per_process > 1) {
-        static std::mutex model_to_mutex;
-        std::lock_guard<std::mutex> lock(model_to_mutex);
-        model->To(device);
-        auto upload_impl = core::GetDeviceGuardImpl(device.type());
-        upload_impl->SynchronizeDevice(device);
-    } else {
-        model->To(device);
-    }
+    model->To(device);
 
     // Synchronize model upload across all DP threads before any MCCL init runs.
     // The barrier ensures no thread enters mcclCommInitAll while peer threads
diff --git a/infini_train/src/core/runtime/maca/maca_guard_impl.cc b/infini_train/src/core/runtime/maca/maca_guard_impl.cc
index 56b7126b..1fe576ad 100644
--- a/infini_train/src/core/runtime/maca/maca_guard_impl.cc
+++ b/infini_train/src/core/runtime/maca/maca_guard_impl.cc
@@ -294,10 +294,7 @@ BlasHandle *MacaGuardImpl::GetBlasHandle(Device device) const {
 }
 
 // memory
-void MacaGuardImpl::Malloc(void **dev_ptr, size_t size) {
-    std::lock_guard<std::mutex> lock(g_malloc_mutex);
-    MACA_CHECK(mcMalloc(dev_ptr, size));
-}
+void MacaGuardImpl::Malloc(void **dev_ptr, size_t size) { MACA_CHECK(mcMalloc(dev_ptr, size)); }
 
 void MacaGuardImpl::MallocAsync(void **dev_ptr, size_t size, Stream *stream) {
     // NOTE(dcj): mcMallocAsync uses a per-stream mempool on MACA and races with
@@ -307,10 +304,7 @@ void MacaGuardImpl::MallocAsync(void **dev_ptr, size_t size, Stream *stream) {
     Malloc(dev_ptr, size);
 }
 
-void MacaGuardImpl::Free(void *dev_ptr) {
-    std::lock_guard<std::mutex> lock(g_malloc_mutex);
-    MACA_CHECK(mcFree(dev_ptr));
-}
+void MacaGuardImpl::Free(void *dev_ptr) { MACA_CHECK(mcFree(dev_ptr)); }
 
 void MacaGuardImpl::FreeAsync(void *dev_ptr, Stream *stream) {
     // auto maca_stream = GetMacaStream(stream);
@@ -331,6 +325,7 @@ void MacaGuardImpl::Memcpy(void *dst, const void *src, size_t count, MemcpyKind
 }
 
 void MacaGuardImpl::MemcpyAsync(void *dst, const void *src, size_t count, MemcpyKind kind, Stream *stream) {
+    std::lock_guard<std::mutex> lock(g_malloc_mutex);
     auto maca_stream = GetMacaStream(stream);
 
     switch (kind) {
diff --git a/infini_train/src/nn/parallel/ddp/reducer.cc b/infini_train/src/nn/parallel/ddp/reducer.cc
index 9965531e..4e8600d2 100644
--- a/infini_train/src/nn/parallel/ddp/reducer.cc
+++ b/infini_train/src/nn/parallel/ddp/reducer.cc
@@ -421,7 +421,7 @@ void Reducer::FinalizeBackward() {
     // AllReduce buffer races with MCCL P2P teardown and produces "Writing to
     // readonly page" faults.  Host blocking forces the bucket lifecycle to
     // serialize against the comm.
-    for (auto &work : works) { work->Synchronize(); }
+    for (auto &work : works) { work->WaitNonBlocking(); }
 
     // Write grad back and reset with mutex on
     {

From d786e95bee4b5940022b5c2fb120fdb80820c072 Mon Sep 17 00:00:00 2001
From: kilinchange <kilinchange@163.com>
Date: Fri, 24 Apr 2026 09:24:29 +0000
Subject: [PATCH 11/12] fix: clean code

---
 example/gpt2/main.cc                          | 27 +----------
 example/llama3/main.cc                        | 13 -----
 infini_train/include/autocast.h               |  3 +-
 .../src/core/runtime/maca/maca_guard_impl.cc  | 47 ++++++++++---------
 infini_train/src/nn/parallel/ddp/reducer.cc   |  7 +--
 5 files changed, 28 insertions(+), 69 deletions(-)

diff --git a/example/gpt2/main.cc b/example/gpt2/main.cc
index b707b81d..682a11e6 100644
--- a/example/gpt2/main.cc
+++ b/example/gpt2/main.cc
@@ -1,11 +1,9 @@
 #include <algorithm>
-#include <barrier>
 #include <chrono>
 #include <cstdlib>
 #include <format>
 #include <iterator>
 #include <memory>
-#include <mutex>
 #include <optional>
 #include <thread>
 #include <unordered_map>
@@ -213,30 +211,7 @@ void Train(const nn::parallel::Rank &rank) {
         model = std::make_shared<nn::TransformerModel>(model_config);
     }
 
-    // On MACA, parallel mcMalloc/mcMemcpy across threads still races even with
-    // an mcMalloc mutex, because the runtime auto-maps allocations P2P-readonly
-    // between sibling devices. Serialize the entire model upload so each
-    // thread's allocations land before any peer thread starts touching the
-    // address space.
-    if (FLAGS_device == kDeviceMACA && rank.IsParallel() && FLAGS_nthread_per_process > 1) {
-        static std::mutex model_to_mutex;
-        std::lock_guard<std::mutex> lock(model_to_mutex);
-        model->To(device);
-        auto upload_impl = core::GetDeviceGuardImpl(device.type());
-        upload_impl->SynchronizeDevice(device);
-    } else {
-        model->To(device);
-    }
-
-    // Synchronize model upload across all DP threads before any MCCL init runs.
-    // The barrier ensures no thread enters mcclCommInitAll while peer threads
-    // are still mid-mcMemcpyAsync.
-    if (FLAGS_device == kDeviceMACA && rank.IsParallel() && FLAGS_nthread_per_process > 1) {
-        auto pre_pg_impl = core::GetDeviceGuardImpl(device.type());
-        pre_pg_impl->SynchronizeDevice(device);
-        static std::barrier pre_pg_barrier(FLAGS_nthread_per_process);
-        pre_pg_barrier.arrive_and_wait();
-    }
+    model->To(device);
 
     if (rank.IsParallel()) {
         auto *pg_factory = ProcessGroupFactory::Instance(device.type());
diff --git a/example/llama3/main.cc b/example/llama3/main.cc
index 8cefca52..89cd2158 100644
--- a/example/llama3/main.cc
+++ b/example/llama3/main.cc
@@ -1,5 +1,4 @@
 #include <algorithm>
-#include <barrier>
 #include <cstdlib>
 #include <format>
 #include <iterator>
@@ -198,18 +197,6 @@ void Train(const nn::parallel::Rank &rank) {
 
     model->To(device);
 
-    // Synchronize model upload across all DP threads before any MCCL init runs.
-    // The barrier ensures no thread enters mcclCommInitAll while peer threads
-    // are still mid-mcMemcpyAsync; the SynchronizeDevice ensures the GPU work
-    // is actually retired, not merely queued, before MCCL touches the address
-    // space.
-    if (FLAGS_device == kDeviceMACA && rank.IsParallel() && FLAGS_nthread_per_process > 1) {
-        auto pre_pg_impl = core::GetDeviceGuardImpl(device.type());
-        pre_pg_impl->SynchronizeDevice(device);
-        static std::barrier pre_pg_barrier(FLAGS_nthread_per_process);
-        pre_pg_barrier.arrive_and_wait();
-    }
-
     if (rank.IsParallel()) {
         auto *pg_factory = ProcessGroupFactory::Instance(device.type());
         if (ddp_world_size > 1) {
diff --git a/infini_train/include/autocast.h b/infini_train/include/autocast.h
index 4129ce87..499c586f 100644
--- a/infini_train/include/autocast.h
+++ b/infini_train/include/autocast.h
@@ -88,8 +88,7 @@ inline const std::unordered_map<std::string_view, CastPolicy> kOpCastPolicyMap =
 // Default autocast data types for each device type
 inline constexpr std::array<DataType, static_cast<size_t>(Device::DeviceType::kCount)> kDeviceDefaultDtype = {
     DataType::kBFLOAT16, // CPU
-    DataType::kFLOAT16,  // CUDA
-    DataType::kFLOAT16,  // MACA
+    DataType::kFLOAT16,  // CUDA.
 };
 
 // Thread-local context to track autocast state
diff --git a/infini_train/src/core/runtime/maca/maca_guard_impl.cc b/infini_train/src/core/runtime/maca/maca_guard_impl.cc
index 1fe576ad..e34451b7 100644
--- a/infini_train/src/core/runtime/maca/maca_guard_impl.cc
+++ b/infini_train/src/core/runtime/maca/maca_guard_impl.cc
@@ -66,11 +66,13 @@ static std::array<std::unique_ptr<MacaBlasHandle>, kMaxGpus> maca_blas_handles;
 static std::array<std::once_flag, kMaxGpus> device_stream_flags;
 static std::array<std::once_flag, kMaxGpus> device_handle_flags;
 
-// Serialize host-side allocations across threads.  The MACA runtime/MCCL share
-// a process-wide virtual address pool; concurrent mcMalloc on multiple threads
-// can race with MCCL P2P buffer registration and produce "Writing to readonly
-// page" faults on peer-mapped buffers.
-static std::mutex g_malloc_mutex;
+// Serialize host-side MemcpyAsync across threads. On MACA, concurrent
+// mcMemcpyAsync from multiple threads during init-time bursts
+// (Module::To uploads, Adam state fills, ...) races with the runtime's
+// auto P2P peer-mapping and produces "readonly page" faults or
+// mcErrorInvalidValue. The lock is held only for the brief window of the
+// API call itself; actual GPU work remains async on the caller's stream.
+static std::mutex g_memcpy_mutex;
 
 inline void CheckMacaDevice(Device device) {
     CHECK(device.type() == Device::DeviceType::kMACA) << std::format(
@@ -127,19 +129,14 @@ MacaGuardImpl::MacaGuardImpl() {
     // static initialization (before main), so we setenv here in the ctor
     // just prior to mcInit(0).  Users can override by setting the env var
     // themselves before launch.
-    setenv("MACA_LAUNCH_BLOCKING", "1", 0);
-
-    // When TP > 1 on MACA, disable both the MACA runtime P2P mapping and the
-    // MCCL-level P2P path to prevent multi-PG init deadlocks (threads
-    // concurrently creating both DP and TP comms hang in mcclCommInitAll).
-    // MACA_P2P_DISABLE alone is not sufficient for TP+SP / TP+SP+PP+VPP
-    // configurations — MCCL still establishes its own P2P buffers during init,
-    // so we must disable that too. Both must be set before mcInit(0); setenv
-    // from main() is too late because this ctor runs at static init. Peek at
-    // /proc/self/cmdline to keep single-card / DP-only / PP-only runs on the
-    // P2P fast path.
+    // setenv("MACA_LAUNCH_BLOCKING", "1", 0);
+
+    // When TP > 1 on MACA, disable the MCCL-level P2P path to prevent multi-PG
+    // init deadlocks (threads concurrently creating both DP and TP comms hang
+    // in mcclCommInitAll). Must be set before mcInit(0); setenv from main() is
+    // too late because this ctor runs at static init. Peek at /proc/self/cmdline
+    // to keep single-card / DP-only / PP-only runs on the P2P fast path.
     if (ReadTensorParallelFromCmdline() > 1) {
-        setenv("MACA_P2P_DISABLE", "1", 0);
         setenv("MCCL_P2P_DISABLE", "1", 0);
     }
 
@@ -297,11 +294,17 @@ BlasHandle *MacaGuardImpl::GetBlasHandle(Device device) const {
 void MacaGuardImpl::Malloc(void **dev_ptr, size_t size) { MACA_CHECK(mcMalloc(dev_ptr, size)); }
 
 void MacaGuardImpl::MallocAsync(void **dev_ptr, size_t size, Stream *stream) {
-    // NOTE(dcj): mcMallocAsync uses a per-stream mempool on MACA and races with
-    // MCCL P2P buffer management under multi-thread DDP.  Use the synchronous
-    // mcMalloc path (serialized by g_malloc_mutex) so every buffer has a stable
-    // mapping by the time any kernel or MCCL op touches it.
+    // NOTE(dcj): mcMallocAsync with a per-stream mempool gives a big speedup
+    // (~2x on gpt2 DDP steady-state) vs synchronous mcMalloc, but under
+    // multi-thread DDP init bursts (e.g. llama3 1B with nthread=8 uploading
+    // hundreds of param tensors) it races with MACA's auto P2P peer-mapping
+    // and produces mcErrorInvalidValue on subsequent mcMemcpyAsync, or
+    // "readonly page" faults — no amount of mutex/stream-sync serialization
+    // around the alloc call suppresses this. Keep the synchronous path for
+    // correctness.
     Malloc(dev_ptr, size);
+    // auto maca_stream = GetMacaStream(stream);
+    // MACA_CHECK(mcMallocAsync(dev_ptr, size, maca_stream));
 }
 
 void MacaGuardImpl::Free(void *dev_ptr) { MACA_CHECK(mcFree(dev_ptr)); }
@@ -325,7 +328,7 @@ void MacaGuardImpl::Memcpy(void *dst, const void *src, size_t count, MemcpyKind
 }
 
 void MacaGuardImpl::MemcpyAsync(void *dst, const void *src, size_t count, MemcpyKind kind, Stream *stream) {
-    std::lock_guard<std::mutex> lock(g_malloc_mutex);
+    std::lock_guard<std::mutex> lock(g_memcpy_mutex);
     auto maca_stream = GetMacaStream(stream);
 
     switch (kind) {
diff --git a/infini_train/src/nn/parallel/ddp/reducer.cc b/infini_train/src/nn/parallel/ddp/reducer.cc
index 4e8600d2..031fa428 100644
--- a/infini_train/src/nn/parallel/ddp/reducer.cc
+++ b/infini_train/src/nn/parallel/ddp/reducer.cc
@@ -415,12 +415,7 @@ void Reducer::FinalizeBackward() {
     }
 
     // Wait for works to be done with mutex off
-    // NOTE(dcj): Host-block until AllReduce completes on the device.  On MACA,
-    // a non-blocking stream wait lets the CPU race ahead into the next
-    // iteration's bucket rebuild, where mcMalloc/mcFree on a still-in-flight
-    // AllReduce buffer races with MCCL P2P teardown and produces "Writing to
-    // readonly page" faults.  Host blocking forces the bucket lifecycle to
-    // serialize against the comm.
+    // Note(zbl): Use non-blocking stream wait instead of sync on host
     for (auto &work : works) { work->WaitNonBlocking(); }
 
     // Write grad back and reset with mutex on

From 1f71022c2768128d87ff52de29600ff75110bfa1 Mon Sep 17 00:00:00 2001
From: kilinchange <kilinchange@163.com>
Date: Mon, 27 Apr 2026 08:04:16 +0000
Subject: [PATCH 12/12] feat: add test bash

---
 example/gpt2/main.cc                          |  43 +-
 example/llama3/main.cc                        |  47 +-
 .../src/core/runtime/maca/maca_guard_impl.cc  |   2 +-
 scripts/run_models_and_profile.bash           |   2 +-
 scripts/test_config_maca.json                 | 527 ++++++++++++++++++
 5 files changed, 552 insertions(+), 69 deletions(-)
 create mode 100644 scripts/test_config_maca.json

diff --git a/example/gpt2/main.cc b/example/gpt2/main.cc
index 682a11e6..4b639a0d 100644
--- a/example/gpt2/main.cc
+++ b/example/gpt2/main.cc
@@ -1,8 +1,6 @@
-#include <algorithm>
 #include <chrono>
 #include <cstdlib>
 #include <format>
-#include <iterator>
 #include <memory>
 #include <optional>
 #include <thread>
@@ -151,33 +149,28 @@ void Train(const nn::parallel::Rank &rank) {
     const ProcessGroup *tp_pg = nullptr;
     const ProcessGroup *pp_pg = nullptr;
 
-    auto rank_in_group = [&](const std::vector<int> &group_ranks) {
-        auto it = std::find(group_ranks.begin(), group_ranks.end(), rank.GlobalRank());
-        CHECK(it != group_ranks.end());
-        return static_cast<int>(std::distance(group_ranks.begin(), it));
-    };
-
     if (rank.IsParallel()) {
         auto parallel_device_type
             = (FLAGS_device == kDeviceMACA) ? Device::DeviceType::kMACA : Device::DeviceType::kCUDA;
         device = Device(parallel_device_type, rank.thread_rank());
 
-        // NOTE(dcj): On MACA, defer ProcessGroup creation until AFTER the model
-        // has been uploaded to the device. MCCL init registers internal P2P
-        // buffers that leave stale read-only mappings in the address ranges
-        // mcMalloc later hands out; allocating the model first keeps it in a
-        // P2P-clean region of the VA space and avoids the init-time race on
-        // multi-thread DDP+TP. Mirrors the llama3 fix combo.
+        auto *pg_factory = ProcessGroupFactory::Instance(device.type());
         if (ddp_world_size > 1) {
-            ddp_rank = rank_in_group(GetDataParallelGroupRanks(rank.GlobalRank()));
+            ddp_pg = pg_factory->GetOrCreate(GetDataParallelProcessGroupName(rank.GlobalRank()),
+                                             GetDataParallelGroupRanks(rank.GlobalRank()));
+            ddp_rank = ddp_pg->GetGroupRank(rank.GlobalRank());
         }
         if (tp_world_size > 1) {
-            tp_rank = rank_in_group(GetTensorParallelGroupRanks(rank.GlobalRank()));
+            tp_pg = pg_factory->GetOrCreate(GetTensorParallelProcessGroupName(rank.GlobalRank()),
+                                            GetTensorParallelGroupRanks(rank.GlobalRank()));
+            tp_rank = tp_pg->GetGroupRank(rank.GlobalRank());
             // NOTE(zbl): Reserved for VocabParallelEmbedding
             nn::parallel::tp_rank = tp_rank;
         }
         if (pp_world_size > 1) {
-            pp_rank = rank_in_group(GetPipelineParallelGroupRanks(rank.GlobalRank()));
+            pp_pg = pg_factory->GetOrCreate(GetPipelineParallelProcessGroupName(rank.GlobalRank()),
+                                            GetPipelineParallelGroupRanks(rank.GlobalRank()));
+            pp_rank = pp_pg->GetGroupRank(rank.GlobalRank());
             nn::parallel::pp_rank = pp_rank;
         }
     } else {
@@ -213,22 +206,6 @@ void Train(const nn::parallel::Rank &rank) {
 
     model->To(device);
 
-    if (rank.IsParallel()) {
-        auto *pg_factory = ProcessGroupFactory::Instance(device.type());
-        if (ddp_world_size > 1) {
-            ddp_pg = pg_factory->GetOrCreate(GetDataParallelProcessGroupName(rank.GlobalRank()),
-                                             GetDataParallelGroupRanks(rank.GlobalRank()));
-        }
-        if (tp_world_size > 1) {
-            tp_pg = pg_factory->GetOrCreate(GetTensorParallelProcessGroupName(rank.GlobalRank()),
-                                            GetTensorParallelGroupRanks(rank.GlobalRank()));
-        }
-        if (pp_world_size > 1) {
-            pp_pg = pg_factory->GetOrCreate(GetPipelineParallelProcessGroupName(rank.GlobalRank()),
-                                            GetPipelineParallelGroupRanks(rank.GlobalRank()));
-        }
-    }
-
     utils::PrecisionChecker::BuildNameMap(model.get());
 
     // Get chunk size before wrapping with LoRA (needed for PipelineParallel)
diff --git a/example/llama3/main.cc b/example/llama3/main.cc
index 89cd2158..1ddf9c62 100644
--- a/example/llama3/main.cc
+++ b/example/llama3/main.cc
@@ -1,7 +1,5 @@
-#include <algorithm>
 #include <cstdlib>
 #include <format>
-#include <iterator>
 #include <memory>
 #include <optional>
 #include <thread>
@@ -133,36 +131,28 @@ void Train(const nn::parallel::Rank &rank) {
     const ProcessGroup *tp_pg = nullptr;
     const ProcessGroup *pp_pg = nullptr;
 
-    auto rank_in_group = [&](const std::vector<int> &group_ranks) {
-        auto it = std::find(group_ranks.begin(), group_ranks.end(), rank.GlobalRank());
-        CHECK(it != group_ranks.end());
-        return static_cast<int>(std::distance(group_ranks.begin(), it));
-    };
-
     if (rank.IsParallel()) {
         auto parallel_device_type
             = (FLAGS_device == kDeviceMACA) ? Device::DeviceType::kMACA : Device::DeviceType::kCUDA;
         device = Device(parallel_device_type, rank.thread_rank());
 
-        // NOTE(dcj): On MACA, defer ProcessGroup creation until AFTER the model
-        // has been uploaded to the device.  MCCL init registers internal P2P
-        // buffers that leave stale read-only mappings in the address ranges
-        // mcMalloc later hands out; allocating the model first keeps it in a
-        // P2P-clean region of the VA space and avoids the "Writing to readonly
-        // page" race on multi-thread DDP.
-        //
-        // Compute the in-group ranks now so model loading (which reads
-        // nn::parallel::tp_rank) gets the correct shard.
+        auto *pg_factory = ProcessGroupFactory::Instance(device.type());
         if (ddp_world_size > 1) {
-            ddp_rank = rank_in_group(GetDataParallelGroupRanks(rank.GlobalRank()));
+            ddp_pg = pg_factory->GetOrCreate(GetDataParallelProcessGroupName(rank.GlobalRank()),
+                                             GetDataParallelGroupRanks(rank.GlobalRank()));
+            ddp_rank = ddp_pg->GetGroupRank(rank.GlobalRank());
         }
         if (tp_world_size > 1) {
-            tp_rank = rank_in_group(GetTensorParallelGroupRanks(rank.GlobalRank()));
+            tp_pg = pg_factory->GetOrCreate(GetTensorParallelProcessGroupName(rank.GlobalRank()),
+                                            GetTensorParallelGroupRanks(rank.GlobalRank()));
+            tp_rank = tp_pg->GetGroupRank(rank.GlobalRank());
             // NOTE(zbl): Reserved for VocabParallelEmbedding
             nn::parallel::tp_rank = tp_rank;
         }
         if (pp_world_size > 1) {
-            pp_rank = rank_in_group(GetPipelineParallelGroupRanks(rank.GlobalRank()));
+            pp_pg = pg_factory->GetOrCreate(GetPipelineParallelProcessGroupName(rank.GlobalRank()),
+                                            GetPipelineParallelGroupRanks(rank.GlobalRank()));
+            pp_rank = pp_pg->GetGroupRank(rank.GlobalRank());
             nn::parallel::pp_rank = pp_rank;
         }
     } else {
@@ -197,20 +187,9 @@ void Train(const nn::parallel::Rank &rank) {
 
     model->To(device);
 
-    if (rank.IsParallel()) {
-        auto *pg_factory = ProcessGroupFactory::Instance(device.type());
-        if (ddp_world_size > 1) {
-            ddp_pg = pg_factory->GetOrCreate(GetDataParallelProcessGroupName(rank.GlobalRank()),
-                                             GetDataParallelGroupRanks(rank.GlobalRank()));
-        }
-        if (tp_world_size > 1) {
-            tp_pg = pg_factory->GetOrCreate(GetTensorParallelProcessGroupName(rank.GlobalRank()),
-                                            GetTensorParallelGroupRanks(rank.GlobalRank()));
-        }
-        if (pp_world_size > 1) {
-            pp_pg = pg_factory->GetOrCreate(GetPipelineParallelProcessGroupName(rank.GlobalRank()),
-                                            GetPipelineParallelGroupRanks(rank.GlobalRank()));
-        }
+    if (FLAGS_device == kDeviceMACA) {
+        auto impl = core::GetDeviceGuardImpl(device.type());
+        impl->SynchronizeDevice(device);
     }
 
     utils::PrecisionChecker::BuildNameMap(model.get());
diff --git a/infini_train/src/core/runtime/maca/maca_guard_impl.cc b/infini_train/src/core/runtime/maca/maca_guard_impl.cc
index e34451b7..4a145d8c 100644
--- a/infini_train/src/core/runtime/maca/maca_guard_impl.cc
+++ b/infini_train/src/core/runtime/maca/maca_guard_impl.cc
@@ -129,7 +129,7 @@ MacaGuardImpl::MacaGuardImpl() {
     // static initialization (before main), so we setenv here in the ctor
     // just prior to mcInit(0).  Users can override by setting the env var
     // themselves before launch.
-    // setenv("MACA_LAUNCH_BLOCKING", "1", 0);
+    setenv("MACA_LAUNCH_BLOCKING", "1", 0);
 
     // When TP > 1 on MACA, disable the MCCL-level P2P path to prevent multi-PG
     // init deadlocks (threads concurrently creating both DP and TP comms hang
diff --git a/scripts/run_models_and_profile.bash b/scripts/run_models_and_profile.bash
index 84beebba..f78e7765 100755
--- a/scripts/run_models_and_profile.bash
+++ b/scripts/run_models_and_profile.bash
@@ -13,7 +13,7 @@ Options:
 EOF
 }
 
-CONFIG_FILE="test_config.json"
+CONFIG_FILE="test_config_maca.json"
 ONLY_RUN_TAGS=""
 
 while [[ $# -gt 0 ]]; do
diff --git a/scripts/test_config_maca.json b/scripts/test_config_maca.json
new file mode 100644
index 00000000..cacf261a
--- /dev/null
+++ b/scripts/test_config_maca.json
@@ -0,0 +1,527 @@
+{
+    "variables": {
+        "BUILD_DIR": "../build",
+        "GPT2_INPUT_BIN": "/nfs/InfiniTrain-dev/data/llmc/gpt2/tinyshakespeare/tiny_shakespeare_train.bin",
+        "GPT2_LLMC_FILEPATH": "/nfs/InfiniTrain-dev/data/llmc/gpt2/gpt2_124M.bin",
+        "LLAMA3_INPUT_BIN": "/nfs/InfiniTrain-dev/data/llmc/llama3/tinyshakespeare/tiny_shakespeare_train.bin",
+        "LLAMA3_LLMC_FILEPATH": "/nfs/InfiniTrain-dev/data/llmc/llama3/llama3.2_1B_fp32.bin",
+        "PROFILE_LOG_DIR": "./profile_logs",
+        "LOG_DIR": "./logs",
+        "COMPARE_LOG_DIR": "",
+        "DEVICE_BACKEND": "maca"
+    },
+    "builds": [
+        {
+            "id": "build_maca",
+            "profile": false,
+            "cmd": "cmake -DUSE_MACA=ON -DUSE_MCCL=ON .. && make -j"
+        }
+    ],
+    "test_groups": [
+        {
+            "tag": "basic",
+            "tests": [
+                {
+                    "id": "1",
+                    "args": {
+                        "dtype": "float32"
+                    }
+                },
+                {
+                    "id": "1_bfloat16",
+                    "args": {
+                        "dtype": "bfloat16"
+                    }
+                },
+                {
+                    "id": "2",
+                    "args": {
+                        "dtype": "float32",
+                        "num_iteration": 10,
+                        "batch_size": 80,
+                        "total_batch_size": 5120
+                    }
+                },
+                {
+                    "id": "2_bfloat16",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "num_iteration": 10,
+                        "batch_size": 80,
+                        "total_batch_size": 5120
+                    }
+                },
+                {
+                    "id": "3",
+                    "args": {
+                        "dtype": "float32",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 10,
+                        "total_batch_size": 5120
+                    }
+                },
+                {
+                    "id": "3_bfloat16",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 10,
+                        "total_batch_size": 5120
+                    }
+                },
+                {
+                    "id": "4",
+                    "args": {
+                        "dtype": "float32",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 4
+                    }
+                },
+                {
+                    "id": "4_bfloat16",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 4
+                    }
+                },
+                {
+                    "id": "5",
+                    "args": {
+                        "dtype": "float32",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 4,
+                        "sequence_parallel": true
+                    }
+                },
+                {
+                    "id": "5_bfloat16",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 4,
+                        "sequence_parallel": true
+                    }
+                },
+                {
+                    "id": "6",
+                    "args": {
+                        "dtype": "float32",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 10,
+                        "total_batch_size": 5120,
+                        "pipeline_parallel": 8
+                    }
+                },
+                {
+                    "id": "6_bfloat16",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 10,
+                        "total_batch_size": 5120,
+                        "pipeline_parallel": 8
+                    }
+                },
+                {
+                    "id": "7",
+                    "args": {
+                        "dtype": "float32",
+                        "nthread_per_process": 4,
+                        "num_iteration": 10,
+                        "batch_size": 10,
+                        "total_batch_size": 5120,
+                        "pipeline_parallel": 4,
+                        "virtual_pipeline_parallel": 2
+                    }
+                },
+                {
+                    "id": "7_bfloat16",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "nthread_per_process": 4,
+                        "num_iteration": 10,
+                        "batch_size": 10,
+                        "total_batch_size": 5120,
+                        "pipeline_parallel": 4,
+                        "virtual_pipeline_parallel": 2
+                    }
+                },
+                {
+                    "id": "8",
+                    "args": {
+                        "dtype": "float32",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 2,
+                        "sequence_parallel": true,
+                        "pipeline_parallel": 2,
+                        "virtual_pipeline_parallel": 2
+                    }
+                },
+                {
+                    "id": "8_bfloat16",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 2,
+                        "sequence_parallel": true,
+                        "pipeline_parallel": 2,
+                        "virtual_pipeline_parallel": 2
+                    }
+                }
+            ]
+        },
+        {
+            "tag": "zero",
+            "tests": [
+                {
+                    "id": "3_distopt",
+                    "args": {
+                        "dtype": "float32",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 10,
+                        "total_batch_size": 5120,
+                        "use_distributed_optimizer": true
+                    }
+                },
+                {
+                    "id": "3_bfloat16_distopt",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 10,
+                        "total_batch_size": 5120,
+                        "use_distributed_optimizer": true
+                    }
+                },
+                {
+                    "id": "4_distopt",
+                    "args": {
+                        "dtype": "float32",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 4,
+                        "use_distributed_optimizer": true
+                    }
+                },
+                {
+                    "id": "4_bfloat16_distopt",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 4,
+                        "use_distributed_optimizer": true
+                    }
+                },
+                {
+                    "id": "5_distopt",
+                    "args": {
+                        "dtype": "float32",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 4,
+                        "sequence_parallel": true,
+                        "use_distributed_optimizer": true
+                    }
+                },
+                {
+                    "id": "5_bfloat16_distopt",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 4,
+                        "sequence_parallel": true,
+                        "use_distributed_optimizer": true
+                    }
+                },
+                {
+                    "id": "8_distopt",
+                    "args": {
+                        "dtype": "float32",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 2,
+                        "sequence_parallel": true,
+                        "pipeline_parallel": 2,
+                        "virtual_pipeline_parallel": 2,
+                        "use_distributed_optimizer": true
+                    }
+                },
+                {
+                    "id": "8_bfloat16_distopt",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 2,
+                        "sequence_parallel": true,
+                        "pipeline_parallel": 2,
+                        "virtual_pipeline_parallel": 2,
+                        "use_distributed_optimizer": true
+                    }
+                }
+            ]
+        },
+        {
+            "tag": "lora",
+            "tests": [
+                {
+                    "id": "1_lora",
+                    "args": {
+                        "dtype": "float32",
+                        "lora_rank": 8,
+                        "lora_alpha": 16.0,
+                        "lora_target_modules": "c_attn,attn.c_proj"
+                    }
+                },
+                {
+                    "id": "1_lora_bfloat16",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "lora_rank": 8,
+                        "lora_alpha": 16.0,
+                        "lora_target_modules": "c_attn,attn.c_proj"
+                    }
+                },
+                {
+                    "id": "2_lora",
+                    "args": {
+                        "dtype": "float32",
+                        "num_iteration": 10,
+                        "batch_size": 80,
+                        "total_batch_size": 5120,
+                        "lora_rank": 8,
+                        "lora_alpha": 16.0,
+                        "lora_target_modules": "c_attn,attn.c_proj"
+                    }
+                },
+                {
+                    "id": "2_lora_bfloat16",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "num_iteration": 10,
+                        "batch_size": 80,
+                        "total_batch_size": 5120,
+                        "lora_rank": 8,
+                        "lora_alpha": 16.0,
+                        "lora_target_modules": "c_attn,attn.c_proj"
+                    }
+                },
+                {
+                    "id": "3_lora",
+                    "args": {
+                        "dtype": "float32",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 10,
+                        "total_batch_size": 5120,
+                        "lora_rank": 8,
+                        "lora_alpha": 16.0,
+                        "lora_target_modules": "c_attn,attn.c_proj"
+                    }
+                },
+                {
+                    "id": "3_lora_bfloat16",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 10,
+                        "total_batch_size": 5120,
+                        "lora_rank": 8,
+                        "lora_alpha": 16.0,
+                        "lora_target_modules": "c_attn,attn.c_proj"
+                    }
+                },
+                {
+                    "id": "4_lora",
+                    "args": {
+                        "dtype": "float32",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 4,
+                        "lora_rank": 4,
+                        "lora_alpha": 8.0,
+                        "lora_target_modules": "c_attn,c_fc,c_proj"
+                    }
+                },
+                {
+                    "id": "4_lora_bfloat16",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 4,
+                        "lora_rank": 16,
+                        "lora_alpha": 32.0,
+                        "lora_target_modules": "c_attn,c_fc,c_proj"
+                    }
+                },
+                {
+                    "id": "5_lora",
+                    "args": {
+                        "dtype": "float32",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 4,
+                        "sequence_parallel": true,
+                        "lora_rank": 4,
+                        "lora_alpha": 8.0,
+                        "lora_target_modules": "attn.c_proj,c_fc,c_proj"
+                    }
+                },
+                {
+                    "id": "5_lora_bfloat16",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 4,
+                        "sequence_parallel": true,
+                        "lora_rank": 16,
+                        "lora_alpha": 32.0,
+                        "lora_target_modules": "attn.c_proj,c_fc,c_proj"
+                    }
+                },
+                {
+                    "id": "6_lora",
+                    "args": {
+                        "dtype": "float32",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 10,
+                        "total_batch_size": 5120,
+                        "pipeline_parallel": 8,
+                        "lora_rank": 4,
+                        "lora_alpha": 8.0,
+                        "lora_target_modules": "c_attn,attn.c_proj,c_fc"
+                    }
+                },
+                {
+                    "id": "6_lora_bfloat16",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 10,
+                        "total_batch_size": 5120,
+                        "pipeline_parallel": 8,
+                        "lora_rank": 16,
+                        "lora_alpha": 32.0,
+                        "lora_target_modules": "c_attn,attn.c_proj,c_fc"
+                    }
+                },
+                {
+                    "id": "7_lora",
+                    "args": {
+                        "dtype": "float32",
+                        "nthread_per_process": 4,
+                        "num_iteration": 10,
+                        "batch_size": 10,
+                        "total_batch_size": 5120,
+                        "pipeline_parallel": 4,
+                        "virtual_pipeline_parallel": 2,
+                        "lora_rank": 4,
+                        "lora_alpha": 8.0,
+                        "lora_target_modules": "c_attn,c_proj"
+                    }
+                },
+                {
+                    "id": "7_lora_bfloat16",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "nthread_per_process": 4,
+                        "num_iteration": 10,
+                        "batch_size": 10,
+                        "total_batch_size": 5120,
+                        "pipeline_parallel": 4,
+                        "virtual_pipeline_parallel": 2,
+                        "lora_rank": 16,
+                        "lora_alpha": 32.0,
+                        "lora_target_modules": "c_attn,c_proj"
+                    }
+                },
+                {
+                    "id": "8_lora",
+                    "args": {
+                        "dtype": "float32",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 2,
+                        "sequence_parallel": true,
+                        "pipeline_parallel": 2,
+                        "virtual_pipeline_parallel": 2,
+                        "lora_rank": 4,
+                        "lora_alpha": 8.0,
+                        "lora_target_modules": "c_fc,c_proj"
+                    }
+                },
+                {
+                    "id": "8_lora_bfloat16",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 2,
+                        "sequence_parallel": true,
+                        "pipeline_parallel": 2,
+                        "virtual_pipeline_parallel": 2,
+                        "lora_rank": 16,
+                        "lora_alpha": 32.0,
+                        "lora_target_modules": "c_fc,c_proj"
+                    }
+                }
+            ]
+        }
+    ]
+}