InfiniTensor · zhangyue207 · Apr 15, 2026 · Apr 15, 2026 · Apr 15, 2026 · Apr 15, 2026
diff --git a/.ci/images/ascend/Dockerfile b/.ci/images/ascend/Dockerfile
@@ -18,4 +18,12 @@ RUN pip install --no-cache-dir --progress off \
     pytest-xdist \
     ruff
 
+ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
+ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/aarch64:${LD_LIBRARY_PATH}
+ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
+ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
+ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
+ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
+ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
+
 WORKDIR /workspace
diff --git a/.ci/run.py b/.ci/run.py
@@ -8,6 +8,7 @@
 import subprocess
 import sys
 import uuid
+import xml.etree.ElementTree as ET
 from datetime import datetime
 from pathlib import Path
 
@@ -24,6 +25,42 @@
 _PYTEST_VALUE_FLAGS = {"-n", "-k", "-m", "-p", "--tb", "--junitxml", "--rootdir"}
 
 
+def _junit_xml_indicates_pass(results_dir):
+    """Return True if `pytest` junit XML under `results_dir` reports no failures/errors.
+
+    Used to distinguish a real CI failure from the docker 18.09
+    container-teardown `SIGKILL` (exit code 137) that occurs on this host
+    after a child process exits successfully — bash returns 0 from inside
+    the container, but the docker daemon reports 137 due to a race in its
+    `--rm` cleanup path. The junit XML is written by pytest before that
+    teardown and reliably captures the real outcome of the test stage.
+    """
+    for junit in Path(results_dir).rglob("test-results.xml"):
+        try:
+            root = ET.parse(junit).getroot()
+        except ET.ParseError:
+            continue
+
+        suites = root.findall("testsuite") if root.tag == "testsuites" else [root]
+
+        if not suites:
+            continue
+
+        for suite in suites:
+            try:
+                if int(suite.get("failures", 0)) > 0:
+                    return False
+
+                if int(suite.get("errors", 0)) > 0:
+                    return False
+            except ValueError:
+                return False
+
+        return True
+
+    return False
+
+
 def apply_test_override(run_cmd, test_path):
     """Replace positional test path(s) in a pytest stage command.
 
@@ -437,8 +474,23 @@ def main():
             pool.release(allocated_ids)
 
         if returncode != 0:
-            print(f"job {job_name} failed (exit code {returncode})", file=sys.stderr)
-            failed += 1
+            # Docker 18.09 on this host occasionally SIGKILLs containers
+            # during `--rm` cleanup after the inner process already exited
+            # cleanly, producing exit code 137. Fall back to the pytest
+            # junit XML to recover the real outcome in that case.
+            if returncode == 137 and _junit_xml_indicates_pass(results_dir):
+                print(
+                    f"[warn] job {job_name}: container exited with 137 "
+                    f"(likely docker teardown SIGKILL after clean pytest); "
+                    f"junit XML reports no failures — treating as success",
+                    file=sys.stderr,
+                )
+            else:
+                print(
+                    f"job {job_name} failed (exit code {returncode})",
+                    file=sys.stderr,
+                )
+                failed += 1
 
     sys.exit(1 if failed else 0)
 

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -18,6 +18,13 @@ option(WITH_ASCEND "Enable Ascend backend" OFF)
 
 option(WITH_TORCH "Enable PyTorch C++ backend" OFF)
 
+# Default OFF until CANN's `extract_host_stub.py` path handling is fixed for
+# `scikit-build-core` temp-dir builds (triggers `KeyError` on the preprocessed
+# object path). Enable explicitly with `-DBUILD_CUSTOM_KERNEL=ON` when the
+# toolchain is compatible or when building via the standalone
+# `src/ascend/custom_kernel/build.sh` script.
+option(BUILD_CUSTOM_KERNEL "Build custom AscendC kernel PyTorch extension (requires torch_npu)" OFF)
+
 option(AUTO_DETECT_DEVICES "Automatically detect available devices" OFF)
 option(AUTO_DETECT_BACKENDS "Automatically detect available backends" OFF)
 option(GENERATE_PYTHON_BINDINGS "Generate Python bindings" OFF)
@@ -130,6 +137,28 @@ if(WITH_TORCH)
     find_library(C10_LIB c10 HINTS ${_torch_lib_dirs} REQUIRED)
     set(TORCH_LIBRARIES ${TORCH_LIB} ${TORCH_CPU_LIB} ${C10_LIB})
 
+    # `auditwheel`-repaired `torch` wheels bundle transitive dependencies
+    # (e.g. `libgfortran-<hash>.so`, `libopenblasp-<hash>.so`) in a sibling
+    # `torch.libs/` directory that `library_paths()` does not return.  When
+    # building against such a wheel, the linker needs this path to resolve
+    # the bundled NEEDED entries (otherwise: `undefined reference to
+    # _gfortran_etime@GFORTRAN_8` etc.).
+    execute_process(
+        COMMAND ${Python_EXECUTABLE} -c "import os, torch; d = os.path.dirname(torch.__file__); p = os.path.join(os.path.dirname(d), 'torch.libs'); print(p if os.path.isdir(p) else '')"
+        OUTPUT_VARIABLE TORCH_BUNDLED_LIBS_DIR
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+
+    if(TORCH_BUNDLED_LIBS_DIR)
+        list(APPEND CMAKE_BUILD_RPATH "${TORCH_BUNDLED_LIBS_DIR}")
+        list(APPEND CMAKE_INSTALL_RPATH "${TORCH_BUNDLED_LIBS_DIR}")
+        # `rpath-link` is linker-only: lets `ld` resolve the bundled
+        # transitive NEEDED entries at link time without adding them to our
+        # own binary's direct NEEDED list.
+        add_link_options("-Wl,-rpath-link,${TORCH_BUNDLED_LIBS_DIR}")
+        message(STATUS "PyTorch bundled libs: ${TORCH_BUNDLED_LIBS_DIR}")
+    endif()
+
     # Query the `CXX11` ABI setting that `torch` was compiled with.
     # A mismatch causes linker errors (e.g. undefined reference to
     # `c10::Device::Device(std::string const&)`).

diff --git a/scripts/generate_wrappers.py b/scripts/generate_wrappers.py
@@ -94,33 +94,51 @@ def __init__(self, name, constructors, calls):
 
 def _find_optional_tensor_params(op_name):
     """Return a set of parameter names declared as `std::optional<Tensor>` in
-    the base header. `libclang` resolves the type to `int` when the STL
+    the base header.  libclang resolves the type to ``int`` when the STL
     headers are not fully available, so we fall back to a regex scan of the
     source text.
     """
     source = (_BASE_DIR / f"{op_name}.h").read_text()
-
     return set(re.findall(r"std::optional<Tensor>\s+(\w+)", source))
 
 
+def _find_vector_tensor_params(op_name):
+    """Return a set of parameter names declared as `std::vector<Tensor>` in
+    the base header.
+    """
+    import re
+
+    source = (_BASE_DIR / f"{op_name}.h").read_text()
+    return set(re.findall(r"std::vector<Tensor>\s+(\w+)", source))
+
+
 def _generate_pybind11(operator):
     optional_tensor_params = _find_optional_tensor_params(operator.name)
+    vector_tensor_params = _find_vector_tensor_params(operator.name)
 
     def _is_optional_tensor(arg):
         if arg.spelling in optional_tensor_params:
             return True
-
         return "std::optional" in arg.type.spelling and "Tensor" in arg.type.spelling
 
+    def _is_optional(arg):
+        return "std::optional" in arg.type.spelling
+
+    def _is_vector_tensor(arg):
+        if arg.spelling in vector_tensor_params:
+            return True
+        return "std::vector" in arg.type.spelling and "Tensor" in arg.type.spelling
+
     def _generate_params(node):
         parts = []
 
         for arg in node.get_arguments():
             if arg.spelling == "stream":
                 continue
-
             if _is_optional_tensor(arg):
                 parts.append(f"std::optional<py::object> {arg.spelling}")
+            elif _is_vector_tensor(arg):
+                parts.append(f"std::vector<py::object> {arg.spelling}")
             else:
                 param = arg.type.spelling.replace("const Tensor", "py::object").replace(
                     "Tensor", "py::object"
@@ -135,9 +153,10 @@ def _generate_arguments(node):
         for arg in node.get_arguments():
             if arg.spelling == "stream":
                 continue
-
             if _is_optional_tensor(arg):
                 args.append(f"OptionalTensorFromPybind11Handle({arg.spelling})")
+            elif _is_vector_tensor(arg):
+                args.append(f"VectorTensorFromPybind11Handle({arg.spelling})")
             elif "Tensor" in arg.type.spelling:
                 args.append(f"TensorFromPybind11Handle({arg.spelling})")
             else:
@@ -155,21 +174,28 @@ def _generate_init(constructor):
       }}))"""
 
     def _generate_py_args(node):
-        return ", ".join(
-            f'py::arg("{arg.spelling}")'
-            for arg in node.get_arguments()
-            if arg.spelling != "stream"
-        )
+        parts = []
+
+        for arg in node.get_arguments():
+            if arg.spelling == "stream":
+                continue
+
+            if _is_optional(arg):
+                parts.append(f'py::arg("{arg.spelling}") = py::none()')
+            else:
+                parts.append(f'py::arg("{arg.spelling}")')
+
+        return ", ".join(parts)
 
     def _generate_call(op_name, call, method=True):
         call_params = _generate_params(call)
         call_args = _generate_arguments(call)
 
         if not method:
             params = (
-                f"{call_params}, std::uintptr_t stream, std::size_t implementation_index"
+                f"{call_params}, std::size_t implementation_index, std::uintptr_t stream"
                 if call_params
-                else "std::uintptr_t stream, std::size_t implementation_index"
+                else "std::size_t implementation_index, std::uintptr_t stream"
             )
             py_args = _generate_py_args(call)
             py_args_str = f"{py_args}, " if py_args else ""
@@ -183,7 +209,7 @@ def _generate_call(op_name, call, method=True):
                 f"    Config config;\n"
                 f"    config.set_implementation_index(implementation_index);\n"
                 f"    return Self::Call(handle, config, {call_args});\n"
-                f'  }}, {py_args_str}py::kw_only(), py::arg("stream") = 0, py::arg("implementation_index") = 0);'
+                f'  }}, {py_args_str}py::kw_only(), py::arg("implementation_index") = 0, py::arg("stream") = 0);'
             )
 
         return f"""      .def("__call__", [](const Self& self, {call_params}) {{
@@ -224,7 +250,8 @@ def _generate_call(op_name, call, method=True):
 {calls}
       .def_static("active_implementation_indices", [](const std::string& device) {{
         return Self::active_implementation_indices(DeviceTypeFromString(device));
-      }});
+      }})
+      .def_static("clear_cache", &Self::clear_cache);
 
 {callers}
 }}
@@ -447,7 +474,7 @@ def _get_all_ops(devices, with_torch=False):
         nargs="+",
         default="cpu",
         type=str,
-        help="Devices to use. Please pick from `cpu`, `nvidia`, `cambricon`, `ascend`, `metax`, `moore`, `iluvatar`, `kunlun`, `hygon`, and `qy`. (default: `cpu`)",
+        help="Devices to use. Please pick from cpu, nvidia, cambricon, ascend, metax, moore, iluvatar, kunlun, hygon, and qy. (default: cpu)",
     )
 
     parser.add_argument(

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -178,8 +178,10 @@ if(WITH_ASCEND)
         "ascend/*.cc"
         "ascend/*.cpp"
     )
-    # Exclude `kernel_impl.cpp` — AscendC device code, not compiled by the host C++ compiler.
+    # Exclude kernel_impl.cpp — AscendC device code, not compiled by the host C++ compiler.
     list(FILTER ASCEND_SOURCES EXCLUDE REGEX ".*kernel_impl\\.cpp$")
+    # Exclude custom_kernel/ — standalone PyTorch extension, built separately.
+    list(FILTER ASCEND_SOURCES EXCLUDE REGEX ".*/custom_kernel/.*")
 
     target_compile_definitions(infiniops PUBLIC WITH_ASCEND=1)
     target_sources(infiniops PRIVATE ${ASCEND_SOURCES})
@@ -215,7 +217,38 @@ if(WITH_ASCEND)
         "${ASCEND_HOME}/lib64/libopapi.so"
         "${ASCEND_HAL_LIB}")
 
+    # ATB (Ascend Transformer Boost) — provides fused operators like
+    # PagedAttention and ReshapeAndCache that are graph-capture safe.
+    set(ATB_HOME_DIR "$ENV{ATB_HOME_PATH}")
+    if(NOT ATB_HOME_DIR)
+        # Default search path under CANN nnal directory.
+        file(GLOB ATB_SEARCH_DIRS "/usr/local/Ascend/nnal/atb/*/atb/cxx_abi_1")
+        if(ATB_SEARCH_DIRS)
+            list(SORT ATB_SEARCH_DIRS ORDER DESCENDING)
+            list(GET ATB_SEARCH_DIRS 0 ATB_HOME_DIR)
+        endif()
+    endif()
+
+    if(ATB_HOME_DIR AND EXISTS "${ATB_HOME_DIR}/include/atb/operation.h")
+        message(STATUS "ATB found: ${ATB_HOME_DIR}")
+        target_compile_definitions(infiniops PUBLIC INFINI_HAS_ATB=1)
+        target_include_directories(infiniops PUBLIC "${ATB_HOME_DIR}/include")
+        target_link_libraries(infiniops PUBLIC "${ATB_HOME_DIR}/lib/libatb.so")
+    else()
+        message(STATUS "ATB not found — ATB-based operators disabled")
+    endif()
+
     list(APPEND DEVICE_LIST "ascend")
+
+    # Custom AscendC kernels (PyTorch extension, requires torch_npu).
+    if(BUILD_CUSTOM_KERNEL)
+        add_subdirectory(ascend/custom_kernel)
+
+        # Link the compiled AscendC kernel objects into infiniops so that
+        # custom kernel implementations (e.g. RmsNorm index 1) can call
+        # them via the generated launch functions.
+        target_compile_definitions(infiniops PUBLIC INFINI_HAS_CUSTOM_KERNELS=1)
+    endif()
 endif()
 
 if(WITH_TORCH)
@@ -340,6 +373,17 @@ if(GENERATE_PYTHON_BINDINGS)
     target_include_directories(ops PRIVATE ${PROJECT_SOURCE_DIR})
     target_link_libraries(ops PRIVATE infiniops)
 
+    # Custom AscendC kernel objects must be linked directly into ops
+    # because the AscendC toolchain compiles host stubs with hidden
+    # visibility — `libinfiniops.so` cannot re-export those symbols.
+    # The `Operator<..., 1>` template instantiations that call
+    # `aclrtlaunch_*` live in `ops.cc`, so link here with
+    # `--whole-archive` to ensure all launch functions are available.
+    if(BUILD_CUSTOM_KERNEL)
+        target_link_libraries(ops PRIVATE
+            -Wl,--whole-archive no_workspace_kernel -Wl,--no-whole-archive)
+    endif()
+
     set_target_properties(infiniops PROPERTIES INSTALL_RPATH "$ORIGIN")
     set_target_properties(ops PROPERTIES INSTALL_RPATH "$ORIGIN")