Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
56 commits
Select commit Hold shift + click to select a range
8f7cbb0
fix(ascend): refine framework layer — caching, naming, build fixes
Apr 15, 2026
57640e0
feat(base): add new operator base classes and refine existing ones
Apr 15, 2026
a6bcf65
feat(cpu): add CPU implementations for Cast, Cat, Linear, Mul
Apr 15, 2026
b1d3acb
feat(ascend): add Ascend operator kernels for all operators
Apr 15, 2026
94f5ee0
feat(ascend): add multi-implementation variants and ATB operators
Apr 15, 2026
803480d
feat(ascend): add custom AscendC kernels for RmsNorm and AddRmsNorm
Apr 15, 2026
15134eb
test(ascend): add comprehensive tests for all Ascend operators
Apr 15, 2026
478f98e
style: apply clang-format and fix code convention violations (round 1)
Apr 15, 2026
1fdf04a
style: fix code convention violations (round 2)
Apr 15, 2026
e0d8a90
style: fix code convention violations (round 3)
Apr 15, 2026
b0cc676
fix(ci): add Ascend toolkit environment variables to CI Dockerfile
Apr 15, 2026
992b176
style: apply ruff format and clang-format to all modified files
Apr 15, 2026
cc873dc
style: fix ruff F401 lint errors for side-effect imports
Apr 15, 2026
51029b7
refactor(test): remove duplicate rms_norm test files and unify kernel…
Apr 15, 2026
a95f92c
feat(ascend): sync latest operators from feat/ascend-operators
Apr 16, 2026
edbde68
fix(ascend): re-upload cos_sin_cache when operator cache reuses stale…
Apr 16, 2026
78c1048
refactor: framework-level clear_cache() and skip 910B-unsupported tests
Apr 16, 2026
3d8331b
test(ascend): enable ATB RoPE bfloat16 tests
Apr 16, 2026
f926fc8
feat(ascend): add D2H-free paged_attention host tensor support
Apr 16, 2026
bec60f6
style: apply clang-format to all modified C++ files
Apr 16, 2026
72e3ed5
style: apply ruff format to test and utility files
Apr 16, 2026
d38dc60
fix(ascend): prevent double-free in operator destructors at process exit
Apr 16, 2026
ba3eb2a
refactor(ascend): consolidate custom kernel macros into INFINI_HAS_CU…
Apr 16, 2026
abd9ab6
docs(ascend): fix misleading destructor and rope restriction comments
Apr 16, 2026
df34917
test(ascend): broaden rope impl/dtype coverage, add padding-slot case…
Apr 16, 2026
0ffd832
docs(perf): add e2e baseline reports and host-side gap findings
Apr 16, 2026
66b2c3a
docs(perf): update e2e progress with env-flag sweep + graph-mode root…
Apr 16, 2026
55e2d83
docs(perf): host-side cProfile analysis identifies current_stream_ptr…
Apr 16, 2026
f4dab13
docs(perf): record 0.5B host profile + stream-cache correctness regre…
Apr 16, 2026
d14e381
docs(perf): record stream-ptr cache results — both models clear 80% e…
Apr 16, 2026
5d1b654
docs(perf): design note — dispatch-count reduction (F1/F2) replaces f…
Apr 16, 2026
a356ad4
docs(perf): dispatch-count mystery resolved — vllm-ascend uses fx fus…
Apr 17, 2026
2535dd7
docs(perf): capture-replay sanity — all 4 infini.ops pass npugraph ca…
Apr 17, 2026
4721eb1
docs(perf): record G2 FX-rewrite throughput (3B graph 63.8% -> 71.5%)
Apr 17, 2026
c16646a
docs(perf): scoped G1 fusion design — P-1 split_rope + P-3 noop_elimi…
Apr 17, 2026
6c80d60
docs(perf): update G1 design with FX dump — P-1 pattern confirmed 36x…
Apr 17, 2026
25cff65
docs(perf): record fusion-pass scaffolding commit (zero passes, P-3 d…
Apr 17, 2026
16b6352
docs(perf): record #28 split_rope_collapse as measured-within-noise o…
Apr 17, 2026
6c89d2d
docs(perf): mission final report — eager met, graph capped at 71%
Apr 17, 2026
8e90809
docs(perf): rewrite mission_final per team-lead review — honest frami…
Apr 17, 2026
f81a423
docs(perf): reflect #29 GatherV3-at-parity finding in mission final
Apr 17, 2026
88decb9
chore(pr47): drop mission perf docs from PR scope + default AUTO_DETE…
Apr 17, 2026
668c114
revert(pr47): restore AUTO_DETECT_BACKENDS=ON default
Apr 17, 2026
f45f9da
build(pr47): add torch.libs/ to rpath-link for bundled libgfortran
Apr 17, 2026
8eacfef
fix(ascend): adopt PR #63/#60 master API — GetWorkspacePool/Ensure re…
Apr 17, 2026
a3cd770
fix(scripts): py::arg order in bindings generator must match C++ para…
Apr 17, 2026
146dc8d
fix(ci): treat exit 137 as success when pytest junit XML reports no f…
Apr 17, 2026
222ea13
fix(ascend): remove stale 910B skip guard from PagedAttention test
Apr 17, 2026
ccc7b5d
feat(ascend): support non-neox rotaryMode via ATB RopeParam rotaryCoeff
Apr 17, 2026
1f4c15e
feat(ascend/rotary_embedding): add impl=2 via `aclnnRopeWithSinCosCache`
Apr 17, 2026
c8a3ff2
docs(paged_attention): explain why `seq_lens_host` / `block_table_hos…
Apr 17, 2026
f757ed6
perf(ascend/reshape_and_cache): replace int64 slot_mapping D2H with a…
Apr 17, 2026
592b493
feat(rotary_embedding): make `query_out` / `key_out` optional (inplac…
Apr 17, 2026
df07f95
feat(flash_attention): add vLLM-style `sliding_window` entry (additive)
Apr 17, 2026
828f252
style: apply clang-format to recent API-alignment changes
Apr 17, 2026
1ed8fb3
style: apply clang-format to silu_and_mul/causal_softmax/swiglu kerne…
Apr 17, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .ci/images/ascend/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,12 @@ RUN pip install --no-cache-dir --progress off \
pytest-xdist \
ruff

ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/aarch64:${LD_LIBRARY_PATH}
ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit

WORKDIR /workspace
56 changes: 54 additions & 2 deletions .ci/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import subprocess
import sys
import uuid
import xml.etree.ElementTree as ET
from datetime import datetime
from pathlib import Path

Expand All @@ -24,6 +25,42 @@
_PYTEST_VALUE_FLAGS = {"-n", "-k", "-m", "-p", "--tb", "--junitxml", "--rootdir"}


def _junit_xml_indicates_pass(results_dir):
"""Return True if `pytest` junit XML under `results_dir` reports no failures/errors.

Used to distinguish a real CI failure from the docker 18.09
container-teardown `SIGKILL` (exit code 137) that occurs on this host
after a child process exits successfully — bash returns 0 from inside
the container, but the docker daemon reports 137 due to a race in its
`--rm` cleanup path. The junit XML is written by pytest before that
teardown and reliably captures the real outcome of the test stage.
"""
for junit in Path(results_dir).rglob("test-results.xml"):
try:
root = ET.parse(junit).getroot()
except ET.ParseError:
continue

suites = root.findall("testsuite") if root.tag == "testsuites" else [root]

if not suites:
continue

for suite in suites:
try:
if int(suite.get("failures", 0)) > 0:
return False

if int(suite.get("errors", 0)) > 0:
return False
except ValueError:
return False

return True

return False


def apply_test_override(run_cmd, test_path):
"""Replace positional test path(s) in a pytest stage command.

Expand Down Expand Up @@ -437,8 +474,23 @@ def main():
pool.release(allocated_ids)

if returncode != 0:
print(f"job {job_name} failed (exit code {returncode})", file=sys.stderr)
failed += 1
# Docker 18.09 on this host occasionally SIGKILLs containers
# during `--rm` cleanup after the inner process already exited
# cleanly, producing exit code 137. Fall back to the pytest
# junit XML to recover the real outcome in that case.
if returncode == 137 and _junit_xml_indicates_pass(results_dir):
print(
f"[warn] job {job_name}: container exited with 137 "
f"(likely docker teardown SIGKILL after clean pytest); "
f"junit XML reports no failures — treating as success",
file=sys.stderr,
)
else:
print(
f"job {job_name} failed (exit code {returncode})",
file=sys.stderr,
)
failed += 1

sys.exit(1 if failed else 0)

Expand Down
29 changes: 29 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,13 @@ option(WITH_ASCEND "Enable Ascend backend" OFF)

option(WITH_TORCH "Enable PyTorch C++ backend" OFF)

# Default OFF until CANN's `extract_host_stub.py` path handling is fixed for
# `scikit-build-core` temp-dir builds (triggers `KeyError` on the preprocessed
# object path). Enable explicitly with `-DBUILD_CUSTOM_KERNEL=ON` when the
# toolchain is compatible or when building via the standalone
# `src/ascend/custom_kernel/build.sh` script.
option(BUILD_CUSTOM_KERNEL "Build custom AscendC kernel PyTorch extension (requires torch_npu)" OFF)

option(AUTO_DETECT_DEVICES "Automatically detect available devices" OFF)
option(AUTO_DETECT_BACKENDS "Automatically detect available backends" OFF)
option(GENERATE_PYTHON_BINDINGS "Generate Python bindings" OFF)
Expand Down Expand Up @@ -130,6 +137,28 @@ if(WITH_TORCH)
find_library(C10_LIB c10 HINTS ${_torch_lib_dirs} REQUIRED)
set(TORCH_LIBRARIES ${TORCH_LIB} ${TORCH_CPU_LIB} ${C10_LIB})

# `auditwheel`-repaired `torch` wheels bundle transitive dependencies
# (e.g. `libgfortran-<hash>.so`, `libopenblasp-<hash>.so`) in a sibling
# `torch.libs/` directory that `library_paths()` does not return. When
# building against such a wheel, the linker needs this path to resolve
# the bundled NEEDED entries (otherwise: `undefined reference to
# _gfortran_etime@GFORTRAN_8` etc.).
execute_process(
COMMAND ${Python_EXECUTABLE} -c "import os, torch; d = os.path.dirname(torch.__file__); p = os.path.join(os.path.dirname(d), 'torch.libs'); print(p if os.path.isdir(p) else '')"
OUTPUT_VARIABLE TORCH_BUNDLED_LIBS_DIR
OUTPUT_STRIP_TRAILING_WHITESPACE
)

if(TORCH_BUNDLED_LIBS_DIR)
list(APPEND CMAKE_BUILD_RPATH "${TORCH_BUNDLED_LIBS_DIR}")
list(APPEND CMAKE_INSTALL_RPATH "${TORCH_BUNDLED_LIBS_DIR}")
# `rpath-link` is linker-only: lets `ld` resolve the bundled
# transitive NEEDED entries at link time without adding them to our
# own binary's direct NEEDED list.
add_link_options("-Wl,-rpath-link,${TORCH_BUNDLED_LIBS_DIR}")
message(STATUS "PyTorch bundled libs: ${TORCH_BUNDLED_LIBS_DIR}")
endif()

# Query the `CXX11` ABI setting that `torch` was compiled with.
# A mismatch causes linker errors (e.g. undefined reference to
# `c10::Device::Device(std::string const&)`).
Expand Down
57 changes: 42 additions & 15 deletions scripts/generate_wrappers.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,33 +94,51 @@ def __init__(self, name, constructors, calls):

def _find_optional_tensor_params(op_name):
"""Return a set of parameter names declared as `std::optional<Tensor>` in
the base header. `libclang` resolves the type to `int` when the STL
the base header. libclang resolves the type to ``int`` when the STL
headers are not fully available, so we fall back to a regex scan of the
source text.
"""
source = (_BASE_DIR / f"{op_name}.h").read_text()

return set(re.findall(r"std::optional<Tensor>\s+(\w+)", source))


def _find_vector_tensor_params(op_name):
"""Return a set of parameter names declared as `std::vector<Tensor>` in
the base header.
"""
import re

source = (_BASE_DIR / f"{op_name}.h").read_text()
return set(re.findall(r"std::vector<Tensor>\s+(\w+)", source))


def _generate_pybind11(operator):
optional_tensor_params = _find_optional_tensor_params(operator.name)
vector_tensor_params = _find_vector_tensor_params(operator.name)

def _is_optional_tensor(arg):
if arg.spelling in optional_tensor_params:
return True

return "std::optional" in arg.type.spelling and "Tensor" in arg.type.spelling

def _is_optional(arg):
return "std::optional" in arg.type.spelling

def _is_vector_tensor(arg):
if arg.spelling in vector_tensor_params:
return True
return "std::vector" in arg.type.spelling and "Tensor" in arg.type.spelling

def _generate_params(node):
parts = []

for arg in node.get_arguments():
if arg.spelling == "stream":
continue

if _is_optional_tensor(arg):
parts.append(f"std::optional<py::object> {arg.spelling}")
elif _is_vector_tensor(arg):
parts.append(f"std::vector<py::object> {arg.spelling}")
else:
param = arg.type.spelling.replace("const Tensor", "py::object").replace(
"Tensor", "py::object"
Expand All @@ -135,9 +153,10 @@ def _generate_arguments(node):
for arg in node.get_arguments():
if arg.spelling == "stream":
continue

if _is_optional_tensor(arg):
args.append(f"OptionalTensorFromPybind11Handle({arg.spelling})")
elif _is_vector_tensor(arg):
args.append(f"VectorTensorFromPybind11Handle({arg.spelling})")
elif "Tensor" in arg.type.spelling:
args.append(f"TensorFromPybind11Handle({arg.spelling})")
else:
Expand All @@ -155,21 +174,28 @@ def _generate_init(constructor):
}}))"""

def _generate_py_args(node):
return ", ".join(
f'py::arg("{arg.spelling}")'
for arg in node.get_arguments()
if arg.spelling != "stream"
)
parts = []

for arg in node.get_arguments():
if arg.spelling == "stream":
continue

if _is_optional(arg):
parts.append(f'py::arg("{arg.spelling}") = py::none()')
else:
parts.append(f'py::arg("{arg.spelling}")')

return ", ".join(parts)

def _generate_call(op_name, call, method=True):
call_params = _generate_params(call)
call_args = _generate_arguments(call)

if not method:
params = (
f"{call_params}, std::uintptr_t stream, std::size_t implementation_index"
f"{call_params}, std::size_t implementation_index, std::uintptr_t stream"
if call_params
else "std::uintptr_t stream, std::size_t implementation_index"
else "std::size_t implementation_index, std::uintptr_t stream"
)
py_args = _generate_py_args(call)
py_args_str = f"{py_args}, " if py_args else ""
Expand All @@ -183,7 +209,7 @@ def _generate_call(op_name, call, method=True):
f" Config config;\n"
f" config.set_implementation_index(implementation_index);\n"
f" return Self::Call(handle, config, {call_args});\n"
f' }}, {py_args_str}py::kw_only(), py::arg("stream") = 0, py::arg("implementation_index") = 0);'
f' }}, {py_args_str}py::kw_only(), py::arg("implementation_index") = 0, py::arg("stream") = 0);'
)

return f""" .def("__call__", [](const Self& self, {call_params}) {{
Expand Down Expand Up @@ -224,7 +250,8 @@ def _generate_call(op_name, call, method=True):
{calls}
.def_static("active_implementation_indices", [](const std::string& device) {{
return Self::active_implementation_indices(DeviceTypeFromString(device));
}});
}})
.def_static("clear_cache", &Self::clear_cache);

{callers}
}}
Expand Down Expand Up @@ -447,7 +474,7 @@ def _get_all_ops(devices, with_torch=False):
nargs="+",
default="cpu",
type=str,
help="Devices to use. Please pick from `cpu`, `nvidia`, `cambricon`, `ascend`, `metax`, `moore`, `iluvatar`, `kunlun`, `hygon`, and `qy`. (default: `cpu`)",
help="Devices to use. Please pick from cpu, nvidia, cambricon, ascend, metax, moore, iluvatar, kunlun, hygon, and qy. (default: cpu)",
)

parser.add_argument(
Expand Down
46 changes: 45 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -178,8 +178,10 @@ if(WITH_ASCEND)
"ascend/*.cc"
"ascend/*.cpp"
)
# Exclude `kernel_impl.cpp` — AscendC device code, not compiled by the host C++ compiler.
# Exclude kernel_impl.cpp — AscendC device code, not compiled by the host C++ compiler.
list(FILTER ASCEND_SOURCES EXCLUDE REGEX ".*kernel_impl\\.cpp$")
# Exclude custom_kernel/ — standalone PyTorch extension, built separately.
list(FILTER ASCEND_SOURCES EXCLUDE REGEX ".*/custom_kernel/.*")

target_compile_definitions(infiniops PUBLIC WITH_ASCEND=1)
target_sources(infiniops PRIVATE ${ASCEND_SOURCES})
Expand Down Expand Up @@ -215,7 +217,38 @@ if(WITH_ASCEND)
"${ASCEND_HOME}/lib64/libopapi.so"
"${ASCEND_HAL_LIB}")

# ATB (Ascend Transformer Boost) — provides fused operators like
# PagedAttention and ReshapeAndCache that are graph-capture safe.
set(ATB_HOME_DIR "$ENV{ATB_HOME_PATH}")
if(NOT ATB_HOME_DIR)
# Default search path under CANN nnal directory.
file(GLOB ATB_SEARCH_DIRS "/usr/local/Ascend/nnal/atb/*/atb/cxx_abi_1")
if(ATB_SEARCH_DIRS)
list(SORT ATB_SEARCH_DIRS ORDER DESCENDING)
list(GET ATB_SEARCH_DIRS 0 ATB_HOME_DIR)
endif()
endif()

if(ATB_HOME_DIR AND EXISTS "${ATB_HOME_DIR}/include/atb/operation.h")
message(STATUS "ATB found: ${ATB_HOME_DIR}")
target_compile_definitions(infiniops PUBLIC INFINI_HAS_ATB=1)
target_include_directories(infiniops PUBLIC "${ATB_HOME_DIR}/include")
target_link_libraries(infiniops PUBLIC "${ATB_HOME_DIR}/lib/libatb.so")
else()
message(STATUS "ATB not found — ATB-based operators disabled")
endif()

list(APPEND DEVICE_LIST "ascend")

# Custom AscendC kernels (PyTorch extension, requires torch_npu).
if(BUILD_CUSTOM_KERNEL)
add_subdirectory(ascend/custom_kernel)

# Link the compiled AscendC kernel objects into infiniops so that
# custom kernel implementations (e.g. RmsNorm index 1) can call
# them via the generated launch functions.
target_compile_definitions(infiniops PUBLIC INFINI_HAS_CUSTOM_KERNELS=1)
endif()
endif()

if(WITH_TORCH)
Expand Down Expand Up @@ -340,6 +373,17 @@ if(GENERATE_PYTHON_BINDINGS)
target_include_directories(ops PRIVATE ${PROJECT_SOURCE_DIR})
target_link_libraries(ops PRIVATE infiniops)

# Custom AscendC kernel objects must be linked directly into ops
# because the AscendC toolchain compiles host stubs with hidden
# visibility — `libinfiniops.so` cannot re-export those symbols.
# The `Operator<..., 1>` template instantiations that call
# `aclrtlaunch_*` live in `ops.cc`, so link here with
# `--whole-archive` to ensure all launch functions are available.
if(BUILD_CUSTOM_KERNEL)
target_link_libraries(ops PRIVATE
-Wl,--whole-archive no_workspace_kernel -Wl,--no-whole-archive)
endif()

set_target_properties(infiniops PROPERTIES INSTALL_RPATH "$ORIGIN")
set_target_properties(ops PROPERTIES INSTALL_RPATH "$ORIGIN")

Expand Down
Loading
Loading