Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
97a3f86
feat(ascend): op-norm-rope group — Swiglu, SiluAndMul, CausalSoftmax,…
Apr 17, 2026
1d62aeb
fix(ascend): norm/swiglu destructors + missing add_rms_norm custom ke…
Apr 22, 2026
f3125b7
style(ascend): rename `AddRmsNorm` parameters to PyTorch-aligned names
Apr 22, 2026
50b7b66
style(ascend): comment + assert message audit for norm/swiglu/softmax…
Apr 22, 2026
b20cfc5
test(silu_and_mul): add `implementation_index` parametrize and stride…
Apr 22, 2026
799e038
refactor(ascend/rotary_embedding): unify RotaryEmbedding and ApplyRot…
Apr 22, 2026
21e5f9d
feat(scripts/generate_wrappers): emit `apply_rotary_pos_emb` Python shim
Apr 22, 2026
dcaa53e
test(rotary_embedding): merge apply_rotary_pos_emb cases + cover MLA/…
Apr 22, 2026
c8e62a9
fix(generate_wrappers): propagate scalar param defaults to pybind sig…
Apr 22, 2026
7f8292f
fix(ascend/rotary_embedding): correct pre-gathered layout + revert si…
Apr 22, 2026
8f1a55e
test(rotary_embedding): fix GPT-J reference for partial rotary
Apr 22, 2026
8759840
refactor(pr66-simplify): correct `rstd_out` semantic name + clarity f…
Apr 22, 2026
dcdc71c
style(tests): ruff format `test_add_rms_norm.py` after `residual_out`…
Apr 22, 2026
2f15274
build(ascend-custom): drive `build.sh` from `pip install` with proper…
Apr 22, 2026
84c129d
refactor(data_type): pin `DataType` enum values explicitly
Apr 22, 2026
3de9dd4
feat(ascend-custom): add bf16 support + Google-style identifier renames
Apr 22, 2026
5a3d267
refactor(base): align Linear/SiluAndMul/AddRmsNorm/RotaryEmbedding wi…
Apr 22, 2026
16222a4
refactor(base): trim narrative comments and collapse CPU Linear ctors
Apr 23, 2026
849b494
fix(pr66-review): address review findings 1-3
Apr 23, 2026
cab5251
refactor(pr66): drop `apply_rotary_pos_emb` wrapper + tests
Apr 23, 2026
e312fb8
test(rotary_embedding): add `pre_gathered=True` coverage
Apr 23, 2026
053c907
chore(pr66): drop unused headers
Apr 23, 2026
b7880e6
style(pr66): sweep assert-message periods + comment backticks
Apr 23, 2026
2375e1b
refactor(pr66): rename AscendC custom kernels to PascalCase + C2 para…
Apr 23, 2026
f303e41
refactor(pr66): trim commit-narration comments
Apr 23, 2026
08a130a
fix(build): gate ops `--whole-archive` link on `WITH_ASCEND`
Apr 25, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 15 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,21 @@ option(WITH_ASCEND "Enable Ascend backend" OFF)

option(WITH_TORCH "Enable PyTorch C++ backend" OFF)

# Default OFF until CANN's `extract_host_stub.py` path handling is fixed for
# `scikit-build-core` temp-dir builds (triggers `KeyError` on the preprocessed
# object path). Enable explicitly with `-DBUILD_CUSTOM_KERNEL=ON` when the
# toolchain is compatible or when building via the standalone
# `src/ascend/custom/build.sh` script.
option(BUILD_CUSTOM_KERNEL "Build custom AscendC kernel PyTorch extension (requires `torch_npu`)" OFF)
# Custom `AscendC` kernels under `src/ascend/custom/`. `ON` by default
# so CI and routine dev builds always exercise `implementation_index=1/2`
# for `RmsNorm` / `AddRmsNorm`. Gated by `WITH_ASCEND` in
# `src/CMakeLists.txt` — non-Ascend builds ignore it. Pass
# `-DBUILD_ASCEND_CUSTOM=OFF` to skip the `ccec` build on Ascend
# machines where the custom kernels aren't needed.
#
# When `ON`, `src/CMakeLists.txt` drives the standalone
# `src/ascend/custom/build.sh` via `execute_process` at configure time
# (sidesteps a `CANN` `extract_host_stub.py` path bug that breaks
# in-tree `ascendc_library()` under `scikit-build-core` temp-dir builds)
# and links the produced `libno_workspace_kernel.a` into the `ops`
# module with `--whole-archive`. Requires `torch_npu` and the
# `AscendC` toolchain (`ccec`).
option(BUILD_ASCEND_CUSTOM "Build custom AscendC kernels" ON)

option(AUTO_DETECT_DEVICES "Automatically detect available devices" OFF)
option(AUTO_DETECT_BACKENDS "Automatically detect available backends" OFF)
Expand Down
9 changes: 9 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@ name = "InfiniOps"
version = "0.1.0"

[project.optional-dependencies]
# TODO: `torch` here is unconstrained. On Ascend hosts, the working
# torch is the Ascend-matched `torch 2.9.0+cpu` paired with
# `torch_npu 2.9.0.post1+…`. A `pip install -e .[dev] --force-reinstall`
# will re-resolve `torch` to the latest PyPI version (currently
# `torch 2.11.0`), which now declares `cuda-toolkit` / `nvidia-cublas` /
# `nvidia-cudnn` / … as hard deps — downloads GBs of CUDA wheels and
# kills the `torch_npu` / `vllm-ascend` pairing. Needs a platform-aware
# split (e.g. `torch; platform_machine != 'aarch64'`, or move `torch`
# out of `dev` and require it pre-installed in the container image).
dev = ["pytest", "pytest-cov", "pytest-xdist", "ruff", "torch", "pyyaml"]

[tool.scikit-build.wheel]
Expand Down
27 changes: 25 additions & 2 deletions scripts/generate_wrappers.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,29 @@ def _find_vector_tensor_params(op_name):
return set(re.findall(r"std::vector<Tensor>\s+(\w+)", source))


def _find_params_with_defaults(op_name):
    """Map base-header parameter names to their ``= <literal>`` defaults.

    The op's base header is regex-scanned because `libclang`'s cursor API
    does not expose default arguments reliably. Only plain scalar defaults
    (e.g. ``bool pre_gathered = false``) are recognized.
    """
    header_text = (_BASE_DIR / f"{op_name}.h").read_text()

    # Scalar type keyword, parameter name, then the default literal up to the
    # next `,` or `)` delimiter.
    default_pattern = (
        r"\b(?:bool|int(?:64_t|32_t|8_t|16_t)?|std::size_t|std::uint\w+_t"
        r"|float|double)\s+(\w+)\s*=\s*([^,\)]+?)\s*(?:,|\))"
    )

    return {
        param: literal.strip()
        for param, literal in re.findall(default_pattern, header_text)
    }


def _generate_pybind11(operator):
optional_tensor_params = _find_optional_tensor_params(operator.name)
vector_tensor_params = _find_vector_tensor_params(operator.name)
params_with_defaults = _find_params_with_defaults(operator.name)

def _is_optional_tensor(arg):
if arg.spelling in optional_tensor_params:
Expand Down Expand Up @@ -186,6 +206,10 @@ def _generate_py_args(node):

if _is_optional(arg):
parts.append(f'py::arg("{arg.spelling}") = py::none()')
elif arg.spelling in params_with_defaults:
parts.append(
f'py::arg("{arg.spelling}") = {params_with_defaults[arg.spelling]}'
)
else:
parts.append(f'py::arg("{arg.spelling}")')

Expand Down Expand Up @@ -257,8 +281,7 @@ def _generate_call(op_name, call, method=True):
}})
.def_static("clear_cache", &Self::clear_cache);

{callers}
}}
{callers}}}

}} // namespace infini::ops

Expand Down
74 changes: 70 additions & 4 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -241,8 +241,66 @@ if(WITH_ASCEND)
list(APPEND DEVICE_LIST "ascend")

# Custom `AscendC` kernels (PyTorch extension, requires `torch_npu`).
if(BUILD_CUSTOM_KERNEL)
add_subdirectory(ascend/custom)
if(BUILD_ASCEND_CUSTOM)
# In-tree `ascendc_library()` trips the `CANN` `extract_host_stub.py`
# path-handling bug under `scikit-build-core`'s temp-dir builds
# (`KeyError` on `/./workspace/...` paths in `$<TARGET_OBJECTS>`).
# Work around it by driving the standalone `src/ascend/custom/build.sh`
# — that script invokes a separate `cmake` with
# `src/ascend/custom/` as its `SOURCE_DIR`, avoiding the buggy
# path shape. The produced `.a` is imported and linked into
# `ops` with `--whole-archive`.
set(_custom_build_dir "${CMAKE_SOURCE_DIR}/build/build_ascend_custom")
set(_custom_lib "${_custom_build_dir}/lib/libno_workspace_kernel.a")

if(NOT DEFINED SOC_VERSION OR "${SOC_VERSION}" STREQUAL "")
include(${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/cmake/detect_soc.cmake)
infiniops_detect_soc(SOC_VERSION)
endif()

# Drive `build.sh` as a build-phase target with explicit source
# dependencies so that editing any `op_host/` or `op_kernel/`
# source re-triggers the build (plain `execute_process` at
# configure time would only gate on file existence and leave
# stale `.a` files in place).
file(GLOB_RECURSE _custom_srcs CONFIGURE_DEPENDS
"${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/*.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/*.h"
"${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/build.sh")

# Scrub env inherited from the outer `scikit-build-core` invocation
# before handing control to `build.sh`:
# * `CMAKE_GENERATOR` / `CMAKE_EXPORT_COMPILE_COMMANDS` leaking
# into the inner `cmake` change the path format passed to
# `ninja`'s `_host_cpp` rule and re-trigger the `CANN`
# `extract_host_stub.py` `KeyError` (`/./workspace/...`) that
# standalone `build.sh` avoids.
# * `PYTHONPATH` from `pip`'s build-isolation overlay makes the
# child `python3` skip the system `site-packages` — child
# `cmake` modules that `import torch` (`config_envs.cmake`)
# then fail with `ModuleNotFoundError` even though `torch` is
# installed.
add_custom_command(
OUTPUT ${_custom_lib}
COMMAND ${CMAKE_COMMAND} -E env
--unset=CMAKE_GENERATOR
--unset=CMAKE_EXPORT_COMPILE_COMMANDS
--unset=CMAKE_BUILD_PARALLEL_LEVEL
--unset=PYTHONPATH
"BUILD_DIR=${_custom_build_dir}"
"CMAKE_EXE=${CMAKE_COMMAND}"
bash ${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/build.sh ${SOC_VERSION}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom
DEPENDS ${_custom_srcs}
COMMENT "Building custom AscendC kernels (SOC_VERSION=${SOC_VERSION})"
VERBATIM)

add_custom_target(no_workspace_kernel_build ALL DEPENDS ${_custom_lib})

add_library(no_workspace_kernel STATIC IMPORTED GLOBAL)
set_target_properties(no_workspace_kernel PROPERTIES
IMPORTED_LOCATION "${_custom_lib}")
add_dependencies(no_workspace_kernel no_workspace_kernel_build)

# Link the compiled `AscendC` kernel objects into `infiniops` so that
# custom kernel implementations (e.g. `RmsNorm` index 1) can call
Expand Down Expand Up @@ -379,9 +437,17 @@ if(GENERATE_PYTHON_BINDINGS)
# The `Operator<..., 1>` template instantiations that call
# `aclrtlaunch_*` live in `ops.cc`, so link here with
# `--whole-archive` to ensure all launch functions are available.
if(BUILD_CUSTOM_KERNEL)
# `$<TARGET_FILE>` works for both real `ascendc_library()` targets and
# `IMPORTED` targets pointing at a pre-built `.a`. The
# `no_workspace_kernel` target is only created inside the
# `WITH_ASCEND` block above, so this branch must mirror that gate;
# otherwise non-Ascend builds error out with "No target
# no_workspace_kernel".
if(WITH_ASCEND AND BUILD_ASCEND_CUSTOM)
target_link_libraries(ops PRIVATE
-Wl,--whole-archive no_workspace_kernel -Wl,--no-whole-archive)
-Wl,--whole-archive $<TARGET_FILE:no_workspace_kernel> -Wl,--no-whole-archive)
# `ops` link step must wait for `build.sh` to produce the `.a`.
add_dependencies(ops no_workspace_kernel_build)
endif()

set_target_properties(infiniops PROPERTIES INSTALL_RPATH "$ORIGIN")
Expand Down
144 changes: 144 additions & 0 deletions src/ascend/add_rms_norm/kernel.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#ifndef INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_H_
#define INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_H_

#include <vector>

#include "acl/acl.h"
#include "aclnn/aclnn_base.h"
#include "aclnn_add.h"
#include "aclnn_rms_norm.h"
#include "ascend/common.h"
#include "ascend/workspace_pool_.h"
#include "base/add_rms_norm.h"
#include "operator.h"

namespace infini::ops {

// Decomposed implementation: `aclnnAdd` + `aclnnRmsNorm`.
//
// The fused `aclnnAddRmsNorm` API has ~200 us host-side launch overhead that
// dominates small-tensor dispatch. Decomposing into two fast ACLNN calls
// reduces host dispatch from ~224 us to ~56 us (4x faster) with negligible
// NPU-side impact for inference tensor sizes.
//
// Dispatch-cost strategy: both `aclOpExecutor`s are created once on the
// first `operator()` call, marked repeatable, and reused afterwards —
// subsequent calls only patch raw device addresses via
// `aclSet{Input,Output,Raw}TensorAddr`. That reuse is why the executors,
// workspace sizes, and `rstd_tensor_` are `mutable` (`operator()` is
// `const`).
template <>
class Operator<AddRmsNorm, Device::Type::kAscend, 0> : public AddRmsNorm {
 public:
  // Caches one ACL tensor descriptor per operand and precomputes the
  // shape / byte size of the `rstd` side output that `aclnnRmsNorm`
  // requires.
  Operator(const Tensor input, const Tensor residual, const Tensor weight,
           float eps, Tensor out, Tensor residual_out)
      : AddRmsNorm(input, residual, weight, eps, out, residual_out),
        input_cache_(input),
        residual_cache_(residual),
        weight_cache_(weight),
        out_cache_(out),
        residual_out_cache_(residual_out) {
    // Alpha scalar for `aclnnAdd` (`residual_out = input + 1.0 * residual`).
    alpha_ = aclCreateScalar(&alpha_storage_, ACL_FLOAT);

    // `aclnnRmsNorm` writes `rstd` as a required side output. Size is
    // computed here; the buffer is obtained from the pool in `operator()`.
    // `batch_size_` / `nhead_` are inherited from the `AddRmsNorm` base.
    rstd_shape_ = {static_cast<int64_t>(batch_size_),
                   static_cast<int64_t>(nhead_)};
    rstd_size_ = batch_size_ * nhead_ * sizeof(float);
  }

  ~Operator() {
    // Once the ACL runtime is gone, `aclDestroy*` calls are unsafe — skip
    // all cleanup and let process exit reclaim the memory.
    if (!ascend::IsAclRuntimeAlive()) return;

    // Null cached descriptors — see `AclTensorCache::release()`.
    input_cache_.release();
    residual_cache_.release();
    weight_cache_.release();
    out_cache_.release();
    residual_out_cache_.release();

    // `rstd_tensor_` leaks with `norm_exec_` at shutdown (see `64c367c`).
    if (alpha_) aclDestroyScalar(alpha_);
  }

  // Enqueues the two-step decomposition on `stream_`:
  //   1. `residual_out = input + residual`           (`aclnnAdd`)
  //   2. `out = rms_norm(residual_out, weight, eps)` (`aclnnRmsNorm`)
  // The first call builds the repeatable executors; later calls only
  // re-point tensor addresses at the current buffers.
  void operator()(const Tensor input, const Tensor residual,
                  const Tensor weight, float eps, Tensor out,
                  Tensor residual_out) const override {
    // Refresh cached descriptors with the current data pointers.
    auto t_input = input_cache_.get(const_cast<void*>(input.data()));
    auto t_residual = residual_cache_.get(const_cast<void*>(residual.data()));
    auto t_weight = weight_cache_.get(const_cast<void*>(weight.data()));
    auto t_out = out_cache_.get(out.data());
    auto t_residual_out = residual_out_cache_.get(residual_out.data());
    auto stream = static_cast<aclrtStream>(stream_);

    // Step 1: `residual_out = input + residual`.
    if (!add_exec_) {
      aclnnAddGetWorkspaceSize(t_input, t_residual, alpha_, t_residual_out,
                               &add_ws_, &add_exec_);
      aclSetAclOpExecutorRepeatable(add_exec_);
    } else {
      // Repeat path: only raw device addresses may have changed.
      aclSetInputTensorAddr(add_exec_, 0, t_input,
                            const_cast<void*>(input.data()));
      aclSetInputTensorAddr(add_exec_, 1, t_residual,
                            const_cast<void*>(residual.data()));
      aclSetOutputTensorAddr(add_exec_, 0, t_residual_out, residual_out.data());
    }
    auto& add_arena = ascend::GetWorkspacePool().Ensure(stream, add_ws_);
    aclnnAdd(add_arena.buf, add_ws_, add_exec_, stream);

    // Obtain shared `rstd` buffer from pool.
    auto& rstd_arena =
        ascend::GetWorkspacePool().Ensure(stream, rstd_size_, "temp");

    // Lazily create the `rstd` tensor descriptor on first call.
    if (!rstd_tensor_) {
      rstd_tensor_ = aclCreateTensor(rstd_shape_.data(), 2, ACL_FLOAT,
                                     /*strides=*/nullptr, 0, ACL_FORMAT_ND,
                                     rstd_shape_.data(), 2, rstd_arena.buf);
    } else {
      // Re-point the descriptor in case the pool returned a different
      // buffer this call.
      aclSetRawTensorAddr(rstd_tensor_, rstd_arena.buf);
    }

    // Step 2: `out = rms_norm(residual_out, weight, eps)`.
    if (!norm_exec_) {
      aclnnRmsNormGetWorkspaceSize(t_residual_out, t_weight, eps, t_out,
                                   rstd_tensor_, &norm_ws_, &norm_exec_);
      aclSetAclOpExecutorRepeatable(norm_exec_);
    } else {
      aclSetInputTensorAddr(norm_exec_, 0, t_residual_out, residual_out.data());
      aclSetInputTensorAddr(norm_exec_, 1, t_weight,
                            const_cast<void*>(weight.data()));
      aclSetOutputTensorAddr(norm_exec_, 0, t_out, out.data());
      aclSetOutputTensorAddr(norm_exec_, 1, rstd_tensor_, rstd_arena.buf);
    }
    auto& norm_arena = ascend::GetWorkspacePool().Ensure(stream, norm_ws_);
    aclnnRmsNorm(norm_arena.buf, norm_ws_, norm_exec_, stream);
  }

 private:
  // ACL tensor descriptor caches, one per operand (refreshed each call).
  mutable ascend::AclTensorCache input_cache_;

  mutable ascend::AclTensorCache residual_cache_;

  mutable ascend::AclTensorCache weight_cache_;

  mutable ascend::AclTensorCache out_cache_;

  mutable ascend::AclTensorCache residual_out_cache_;

  // Backing storage for `alpha_`; must outlive the scalar descriptor.
  float alpha_storage_ = 1.0f;

  // `aclnnAdd` scaling factor, fixed at 1.0.
  aclScalar* alpha_ = nullptr;

  // `[batch_size_, nhead_]` shape of the `rstd` side output.
  std::vector<int64_t> rstd_shape_;

  // Byte size of the `rstd` buffer requested from the workspace pool.
  uint64_t rstd_size_ = 0;

  // Lazily-created `rstd` descriptor — see `operator()`.
  mutable aclTensor* rstd_tensor_ = nullptr;

  // Cached repeatable executor + workspace size for `aclnnAdd`.
  mutable aclOpExecutor* add_exec_ = nullptr;

  mutable uint64_t add_ws_ = 0;

  // Cached repeatable executor + workspace size for `aclnnRmsNorm`.
  mutable aclOpExecutor* norm_exec_ = nullptr;

  mutable uint64_t norm_ws_ = 0;
};

} // namespace infini::ops

#endif
Loading
Loading