diff --git a/.github/ci_config.yml b/.github/ci_config.yml index b9979570a..65b2b208f 100644 --- a/.github/ci_config.yml +++ b/.github/ci_config.yml @@ -15,7 +15,7 @@ platforms: BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 SKIP_APT: "1" PIP_INDEX_URL: https://pypi.tuna.tsinghua.edu.cn/simple - setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON + setup: pip install scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_NVIDIA=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix jobs: gpu: type: unittest @@ -51,7 +51,7 @@ platforms: - /lib/firmware:/lib/firmware - /usr/src:/usr/src - /lib/modules:/lib/modules - setup: python -m pip install packaging exceptiongroup typing-extensions pygments pybind11 libclang && python -m pip install . --no-build-isolation --no-deps --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON + setup: python -m pip install scikit-build-core packaging exceptiongroup typing-extensions pygments pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_ILUVATAR=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix -DCMAKE_CUDA_ARCHITECTURES=OFF -DCMAKE_CUDA_COMPILER=/usr/local/corex-4.3.0.20250624/bin/clang++ -DCUDAToolkit_ROOT=/usr/local/corex-4.3.0.20250624 && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && python -m pip install . --no-build-isolation --no-deps --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix jobs: gpu: type: unittest @@ -81,7 +81,7 @@ platforms: - "--privileged" - "--ulimit=memlock=-1" - "--ulimit=stack=67108864" - setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON + setup: pip install scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_METAX=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix jobs: gpu: type: unittest @@ -109,7 +109,7 @@ platforms: PIP_INDEX_URL: https://pypi.org/simple docker_args: - "--privileged" - setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON + setup: pip install scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_MOORE=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix jobs: gpu: type: unittest @@ -136,7 +136,7 @@ platforms: PIP_INDEX_URL: https://pypi.org/simple docker_args: - "--privileged" - setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON + setup: pip install scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_CAMBRICON=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix jobs: gpu: type: unittest @@ -172,7 +172,7 @@ platforms: - "--group-add=video" volumes: - /opt/hyhal:/opt/hyhal:ro - setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON + setup: pip install scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_HYGON=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix jobs: gpu: type: unittest @@ -209,7 +209,7 @@ platforms: - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi:ro env: ASCEND_HOME_PATH: /usr/local/Ascend/ascend-toolkit/latest - setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON + setup: pip install scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_ASCEND=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix jobs: npu: type: unittest diff --git a/CMakeLists.txt b/CMakeLists.txt index 2d10efdbe..39c2d25e9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -294,8 +294,6 @@ if(WITH_TORCH) message(STATUS "Found PyTorch: ${TORCH_INCLUDE_DIRS}") endif() -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) - # Only one CUDA-like GPU backend can be enabled at a time. set(_gpu_backend_count 0) foreach(_gpu_backend WITH_NVIDIA WITH_ILUVATAR WITH_HYGON WITH_METAX WITH_MOORE WITH_ASCEND) @@ -502,6 +500,90 @@ if(WITH_HYGON AND NOT EXISTS "${DTK_ROOT}/llvm/lib/LLVMgold.so") set(PYBIND11_ENABLE_EXTRAS OFF) endif() +set(INFINI_RT_ROOT "$ENV{INFINI_RT_ROOT}" CACHE PATH "Installed InfiniRT prefix") +set(INFINI_RT_SOURCE_DIR "" CACHE PATH "InfiniRT source directory for development builds") +set(INFINI_RT_INCLUDE_DIRS "" CACHE STRING "InfiniRT include directories") +set(INFINI_RT_LIBRARY "" CACHE FILEPATH "InfiniRT library") + +function(_infiniops_configure_infinirt) + if(TARGET infinirt) + get_target_property(_infinirt_include_dirs infinirt INTERFACE_INCLUDE_DIRECTORIES) + if(_infinirt_include_dirs) + set(INFINI_RT_INCLUDE_DIRS "${_infinirt_include_dirs}" CACHE STRING "InfiniRT include directories" FORCE) + endif() + return() + endif() + + if(INFINI_RT_SOURCE_DIR) + if(NOT EXISTS "${INFINI_RT_SOURCE_DIR}/CMakeLists.txt") + message(FATAL_ERROR + "InfiniRT source tree not found at `${INFINI_RT_SOURCE_DIR}`. " + "Unset `INFINI_RT_SOURCE_DIR` to use an installed InfiniRT package.") + endif() + + add_subdirectory("${INFINI_RT_SOURCE_DIR}" "${CMAKE_BINARY_DIR}/InfiniRT") + set(INFINI_RT_INCLUDE_DIRS + "${INFINI_RT_SOURCE_DIR}/include;${INFINI_RT_SOURCE_DIR}/generated/include" + CACHE STRING "InfiniRT include directories" FORCE) + return() + endif() + + set(_infinirt_include_hints ${INFINI_RT_INCLUDE_DIRS}) + if(INFINI_RT_ROOT) + list(APPEND _infinirt_include_hints "${INFINI_RT_ROOT}/include") + endif() + + find_path(_INFINI_RT_INCLUDE_DIR + NAMES infini/rt.h + HINTS ${_infinirt_include_hints} + PATH_SUFFIXES include + ) + + if(INFINI_RT_LIBRARY) + set(_INFINI_RT_LIBRARY "${INFINI_RT_LIBRARY}") + else() + set(_infinirt_library_hints "") + if(INFINI_RT_ROOT) + list(APPEND _infinirt_library_hints + "${INFINI_RT_ROOT}/lib" + "${INFINI_RT_ROOT}/lib64") + endif() + find_library(_INFINI_RT_LIBRARY + NAMES infinirt + HINTS ${_infinirt_library_hints} + ) + endif() + + if(NOT _INFINI_RT_INCLUDE_DIR OR NOT _INFINI_RT_LIBRARY) + message(FATAL_ERROR + "InfiniRT installed headers/library were not found. " + "Set `INFINI_RT_ROOT` to an installed InfiniRT prefix, or set " + "`INFINI_RT_INCLUDE_DIRS` and `INFINI_RT_LIBRARY` explicitly. " + "For development builds from source, set `INFINI_RT_SOURCE_DIR`.") + endif() + + set(_infinirt_resolved_include_dirs "") + foreach(_infinirt_include_dir IN LISTS _infinirt_include_hints) + if(_infinirt_include_dir AND EXISTS "${_infinirt_include_dir}") + list(APPEND _infinirt_resolved_include_dirs "${_infinirt_include_dir}") + endif() + endforeach() + list(APPEND _infinirt_resolved_include_dirs "${_INFINI_RT_INCLUDE_DIR}") + list(REMOVE_DUPLICATES _infinirt_resolved_include_dirs) + + add_library(infinirt SHARED IMPORTED GLOBAL) + set_target_properties(infinirt PROPERTIES + IMPORTED_LOCATION "${_INFINI_RT_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${_infinirt_resolved_include_dirs}" + ) + + set(INFINI_RT_INCLUDE_DIRS "${_infinirt_resolved_include_dirs}" CACHE STRING "InfiniRT include directories" FORCE) + set(INFINI_RT_LIBRARY "${_INFINI_RT_LIBRARY}" CACHE FILEPATH "InfiniRT library" FORCE) + message(STATUS "Using installed InfiniRT: ${_INFINI_RT_LIBRARY}") +endfunction() + +_infiniops_configure_infinirt() + add_subdirectory(src) if(NOT GENERATE_PYTHON_BINDINGS) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 53240b134..39b38497f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -69,13 +69,21 @@ Branch names use the format `/xxx-yyyy-zzzz`, where `` matches the P Using Nvidia as an example: ```bash -pip install .[dev] -C cmake.define.WITH_CPU=ON -C cmake.define.WITH_NVIDIA=ON +pip install .[dev] \ + -C cmake.define.INFINI_RT_ROOT=/path/to/infinirt-prefix \ + -C cmake.define.WITH_CPU=ON \ + -C cmake.define.WITH_NVIDIA=ON ``` -Auto-detection is supported for some platforms, so you can also simply run: +`/path/to/infinirt-prefix` is the InfiniRT install prefix, typically the value +used for InfiniRT's `CMAKE_INSTALL_PREFIX`. It should contain +`include/infini/rt.h` and `lib/libinfinirt.so`. + +Auto-detection is supported for some platforms, so you can also let InfiniOps +detect the device backends while still pointing it at the installed InfiniRT: ```bash -pip install .[dev] +pip install .[dev] -C cmake.define.INFINI_RT_ROOT=/path/to/infinirt-prefix ``` > `[dev]` installs optional development dependencies (e.g. `pytest`) that are not needed for production but required for development and testing. After the first install, subsequent installs only need `pip install .`. @@ -94,6 +102,7 @@ For routine development and pull requests, start with a smoke build plus the smo ```bash python -m pip install .[dev] --no-build-isolation --no-deps \ + --config-settings=cmake.define.INFINI_RT_ROOT=/path/to/infinirt-prefix \ --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON python -m pytest tests -m smoke -q ``` @@ -251,5 +260,7 @@ using T = TypeMapType(list_tag)>; 2. **Segmentation fault during tests**: Run `pytest -n 1`. 3. **`Unknown CMake command "pybind11_add_module"`**: Install pybind11 with `pip install pybind11[global]`. See the [pybind11 installation guide](https://pybind11.readthedocs.io/en/stable/installing.html). 4. **Auto-detection (`AUTO_DETECT_DEVICES`) fails**: Some machines may not expose devices in expected paths (e.g. `/dev/nvidia*`). Use explicit CMake defines instead (e.g. `-C cmake.define.WITH_NVIDIA=ON`). -5. **`bash: pytest: command not found`**: Use `python -m pytest`. -6. **`CUBLAS_STATUS_INVALID_VALUE` in `cublasSgemmStridedBatched`**: PyTorch version issue. Downgrade to `torch<=2.9.1`. +5. **InfiniRT headers or library are not found**: Set `INFINI_RT_ROOT` to the InfiniRT install prefix, or set `INFINI_RT_INCLUDE_DIRS` and `INFINI_RT_LIBRARY` explicitly. +6. **`ImportError: libinfinirt.so: cannot open shared object file`**: Reinstall InfiniOps from a build that was configured with the InfiniRT install prefix so the Python wheel can bundle the runtime library. +7. **`bash: pytest: command not found`**: Use `python -m pytest`. +8. **`CUBLAS_STATUS_INVALID_VALUE` in `cublasSgemmStridedBatched`**: PyTorch version issue. Downgrade to `torch<=2.9.1`. diff --git a/README.md b/README.md index 44a8c71f1..4a8c14232 100644 --- a/README.md +++ b/README.md @@ -8,21 +8,33 @@ InfiniOps is a high-performance, cross-platform operator library supporting mult - CMake 3.18+ - Python 3.10+ - Hardware-specific SDKs (e.g. CUDA Toolkit, MUSA Toolkit) +- Installed InfiniRT headers and library ## Installation -Install with pip (recommended): +Install InfiniRT first, then build InfiniOps with the InfiniRT install prefix: ```bash -pip install . +pip install . -C cmake.define.INFINI_RT_ROOT=/path/to/infinirt-prefix ``` -This auto-detects available platforms on supported backends. To specify platforms explicitly: +`/path/to/infinirt-prefix` is the directory passed to InfiniRT as +`CMAKE_INSTALL_PREFIX`; it should contain `include/infini/rt.h` and +`lib/libinfinirt.so`. + +InfiniOps auto-detects available platforms on supported backends. To specify +platforms explicitly: ```bash -pip install . -C cmake.define.WITH_CPU=ON -C cmake.define.WITH_NVIDIA=ON +pip install . \ + -C cmake.define.INFINI_RT_ROOT=/path/to/infinirt-prefix \ + -C cmake.define.WITH_CPU=ON \ + -C cmake.define.WITH_NVIDIA=ON ``` +The Python wheel installs the required InfiniRT shared library next to the +InfiniOps extension so `import infini.ops` can load its runtime dependency. + ### CMake Options | Option | Description | Default | @@ -36,6 +48,10 @@ pip install . -C cmake.define.WITH_CPU=ON -C cmake.define.WITH_NVIDIA=ON | `-DWITH_CAMBRICON=[ON\|OFF]` | Compile the Cambricon implementation | OFF | | `-DWITH_ASCEND=[ON\|OFF]` | Compile the Ascend implementation | OFF | | `-DAUTO_DETECT_DEVICES=[ON\|OFF]` | Auto-detect available platforms | ON | +| `-DINFINI_RT_ROOT=` | InfiniRT install prefix containing `include/` and `lib/` | `$INFINI_RT_ROOT` | +| `-DINFINI_RT_INCLUDE_DIRS=` | Explicit InfiniRT include directories | empty | +| `-DINFINI_RT_LIBRARY=` | Explicit path to `libinfinirt.so` | empty | +| `-DINFINI_RT_SOURCE_DIR=` | InfiniRT source tree for development builds | empty | If no accelerator options are provided and auto-detection finds nothing, `WITH_CPU` is enabled by default. diff --git a/examples/runtime_api.h b/examples/runtime_api.h index 101fcad59..1bbfdc1f3 100644 --- a/examples/runtime_api.h +++ b/examples/runtime_api.h @@ -2,6 +2,7 @@ #define INFINI_OPS_EXAMPLES_RUNTIME_API_H_ #include "device.h" +#include "runtime.h" #ifdef WITH_NVIDIA #include "native/cuda/nvidia/ops/gemm/cublas.h" @@ -23,8 +24,9 @@ #include "native/ascend/ops/gemm/kernel.h" #include "native/ascend/runtime_.h" #elif WITH_CPU +#include + #include "native/cpu/ops/gemm/gemm.h" -#include "native/cpu/runtime_.h" #else #error "One `WITH_*` backend must be enabled for the examples." #endif diff --git a/scripts/generate_wrappers.py b/scripts/generate_wrappers.py index 31e1ab0aa..0384e0423 100644 --- a/scripts/generate_wrappers.py +++ b/scripts/generate_wrappers.py @@ -83,6 +83,38 @@ def _op_relative_type(op_name): return "::".join(parts) +def _get_infini_rt_include_flags(): + include_dirs = [] + + for include_dir in os.environ.get("INFINI_RT_INCLUDE_DIRS", "").split(os.pathsep): + if include_dir: + include_dirs.append(pathlib.Path(include_dir)) + + infini_rt_root = os.environ.get("INFINI_RT_ROOT") + if infini_rt_root: + include_dirs.append(pathlib.Path(infini_rt_root) / "include") + + infini_rt_source_dir = os.environ.get("INFINI_RT_SOURCE_DIR") + if infini_rt_source_dir: + infini_rt_source_path = pathlib.Path(infini_rt_source_dir) + include_dirs.extend( + ( + infini_rt_source_path / "include", + infini_rt_source_path / "generated" / "include", + ) + ) + + flags = [] + seen = set() + for include_dir in include_dirs: + include_dir = include_dir.resolve() + if include_dir.exists() and include_dir not in seen: + flags.extend(("-I", str(include_dir))) + seen.add(include_dir) + + return tuple(flags) + + def _write_text_if_changed(path: pathlib.Path, content: str) -> bool: """Write `content` only when the file's bytes would change.""" path.parent.mkdir(parents=True, exist_ok=True) @@ -185,14 +217,18 @@ def __call__(self, op_name): index = clang.cindex.Index.create() args = ( - "-std=c++17", - "-x", - "c++", - "-I", - "src", - "-I", - str(_GENERATION_DIR), - ) + _get_system_include_flags() + ( + "-std=c++17", + "-x", + "c++", + "-I", + "src", + "-I", + str(_GENERATION_DIR), + ) + + _get_infini_rt_include_flags() + + _get_system_include_flags() + ) translation_unit = index.parse(str(_find_base_header(op_name)), args=args) nodes = tuple(type(self)._find(translation_unit.cursor, op_name)) @@ -407,6 +443,17 @@ def _find_vector_int64_params(op_name): return set(re.findall(r"std::vector\s+(\w+)", source)) +def _find_tensor_params(op_name): + source = _find_base_header(op_name).read_text() + + params = set() + params.update(re.findall(r"(?:^|[,(]\s*)(?:const\s+)?Tensor\s+(\w+)", source)) + params.update(_find_optional_tensor_params(op_name)) + params.update(_find_vector_tensor_params(op_name)) + + return params + + def _generate_pybind11(operator): optional_tensor_params = _find_optional_tensor_params(operator.name) optional_non_tensor_params = _find_optional_non_tensor_params(operator.name) @@ -840,6 +887,7 @@ def _generate_tensor_caster(name, is_data=False): def _generate_generated_dispatch_entries(operator): optional_tensor_params = _find_optional_tensor_params(operator.name) optional_non_tensor_params = _find_optional_non_tensor_params(operator.name) + tensor_params = _find_tensor_params(operator.name) vector_tensor_params = _find_vector_tensor_params(operator.name) vector_int64_params = _find_vector_int64_params(operator.name) @@ -863,6 +911,15 @@ def _is_vector_tensor(arg): def _is_vector_int64(arg): return arg.spelling in vector_int64_params + def _is_tensor(arg): + if arg.spelling in optional_non_tensor_params: + return False + + if arg.spelling in tensor_params: + return True + + return "Tensor" in arg.type.spelling or "TensorView" in arg.type.spelling + def _generate_params(node): parts = [] @@ -876,6 +933,8 @@ def _generate_params(node): parts.append(f"std::vector {arg.spelling}") elif _is_vector_int64(arg): parts.append(f"std::vector {arg.spelling}") + elif _is_tensor(arg): + parts.append(f"Tensor {arg.spelling}") else: parts.append(f"{arg.type.spelling} {arg.spelling}") @@ -1032,16 +1091,71 @@ def _strip_top_level_const(type_spelling): def _generate_operator_call_instantiation_entries(operator): + optional_tensor_params = _find_optional_tensor_params(operator.name) + optional_non_tensor_params = _find_optional_non_tensor_params(operator.name) + tensor_params = _find_tensor_params(operator.name) + vector_tensor_params = _find_vector_tensor_params(operator.name) + vector_int64_params = _find_vector_int64_params(operator.name) + + def _is_optional_tensor(arg): + spelling = arg.type.spelling + + if "std::optional" in spelling: + return "Tensor" in spelling or "TensorView" in spelling + + if arg.spelling in optional_non_tensor_params: + return False + + if arg.spelling in optional_tensor_params: + return True + + return False + + def _is_vector_tensor(arg): + if arg.spelling in vector_tensor_params: + return True + + return "std::vector" in arg.type.spelling and ( + "Tensor" in arg.type.spelling or "TensorView" in arg.type.spelling + ) + + def _is_vector_int64(arg): + return arg.spelling in vector_int64_params + + def _is_tensor(arg): + if arg.spelling in optional_non_tensor_params: + return False + + if arg.spelling in tensor_params: + return True + + return "Tensor" in arg.type.spelling or "TensorView" in arg.type.spelling + + def _normalized_type(arg): + if _is_optional_tensor(arg): + return "std::optional" + + if _is_vector_tensor(arg): + return "std::vector" + + if _is_vector_int64(arg): + return "std::vector" + + if _is_tensor(arg): + return "Tensor" + + return _strip_top_level_const(arg.type.spelling) + def _generate_template_arguments(node): return ", ".join( - _strip_top_level_const(arg.type.spelling) + _normalized_type(arg) for arg in node.get_arguments() if arg.spelling != "stream" ) def _generate_parameters(node): return ", ".join( - f"const {_strip_top_level_const(arg.type.spelling)}& {arg.spelling}" + f"const {_normalized_type(arg)}& {arg.spelling}" for arg in node.get_arguments() if arg.spelling != "stream" ) @@ -1153,13 +1267,13 @@ def _generate_operator_call_instantiation_source(devices, impl_paths, definition def _device_marker_headers(devices): paths = { - "cpu": "native/cpu/device_.h", - "nvidia": "native/cuda/nvidia/device_.h", - "cambricon": "native/cambricon/device_.h", - "ascend": "native/ascend/device_.h", - "metax": "native/cuda/metax/device_.h", - "moore": "native/cuda/moore/device_.h", - "iluvatar": "native/cuda/iluvatar/device_.h", + "cpu": "infini/rt/cpu/device_.h", + "nvidia": "infini/rt/nvidia/device_.h", + "cambricon": "infini/rt/cambricon/device_.h", + "ascend": "infini/rt/ascend/device_.h", + "metax": "infini/rt/metax/device_.h", + "moore": "infini/rt/moore/device_.h", + "iluvatar": "infini/rt/iluvatar/device_.h", } return [paths[device] for device in devices if device in paths] diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 61d3103c0..0a1870906 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,4 +1,5 @@ -add_library(infiniops SHARED) +add_library(infiniops SHARED operator.h) +set_target_properties(infiniops PROPERTIES LINKER_LANGUAGE CXX) function(_infini_ops_write_if_different path content) set(_should_write TRUE) @@ -35,8 +36,17 @@ endfunction() include(GNUInstallDirs) file(GLOB BASE_SRCS CONFIGURE_DEPENDS "*.cc") +list(FILTER BASE_SRCS EXCLUDE REGEX ".*tensor\\.cc$") target_sources(infiniops PRIVATE ${BASE_SRCS}) +target_link_libraries(infiniops PUBLIC infinirt) + +set(INFINI_RT_INCLUDE_FLAGS "") +foreach(_include_dir IN LISTS INFINI_RT_INCLUDE_DIRS) + list(APPEND INFINI_RT_INCLUDE_FLAGS "-I${_include_dir}") +endforeach() +list(JOIN INFINI_RT_INCLUDE_DIRS ":" INFINI_RT_INCLUDE_DIRS_ENV) + set(DEVICE_LIST "") if(WITH_CPU) @@ -46,7 +56,7 @@ if(WITH_CPU) ) file(GLOB_RECURSE CPU_SOURCES CONFIGURE_DEPENDS ${CPU_PATTERNS}) - list(APPEND CORE_SOURCES ${CPU_SOURCES}) + target_sources(infiniops PRIVATE ${CPU_SOURCES}) target_compile_definitions(infiniops PUBLIC WITH_CPU=1) @@ -222,6 +232,7 @@ if(WITH_CAMBRICON) set(MLU_COMPILE_OPTS -c --bang-mlu-arch=mtp_592 -O3 -fPIC -Wall -Werror -std=c++17 -pthread -I${CMAKE_CURRENT_SOURCE_DIR} -I${NEUWARE_HOME}/include + ${INFINI_RT_INCLUDE_FLAGS} -idirafter /usr/local/neuware/lib/clang/11.1.0/include ) function(compile_mlu_file src_file) @@ -528,6 +539,7 @@ if(WITH_TORCH) -std=c++17 -fPIC -O0 "-I${CMAKE_CURRENT_SOURCE_DIR}" "-I${PROJECT_SOURCE_DIR}/generated" + ${INFINI_RT_INCLUDE_FLAGS} ${_torch_include_flags} ${_torch_extra_flags} -c "${_src}" -o "${_obj}" @@ -553,6 +565,7 @@ if(WITH_TORCH) PROPERTIES POSITION_INDEPENDENT_CODE ON) target_include_directories(infini_ops_torch_objs PRIVATE $ + ${INFINI_RT_INCLUDE_DIRS} ${TORCH_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}/generated) target_compile_definitions(infini_ops_torch_objs PRIVATE @@ -597,7 +610,11 @@ if(GENERATE_OPERATOR_CALL_INSTANTIATIONS OR GENERATE_PYTHON_BINDINGS) endif() execute_process( - COMMAND ${Python_EXECUTABLE} ${PROJECT_SOURCE_DIR}/scripts/generate_wrappers.py ${GENERATOR_ARGS} + COMMAND ${CMAKE_COMMAND} -E env + INFINI_RT_INCLUDE_DIRS=${INFINI_RT_INCLUDE_DIRS_ENV} + ${Python_EXECUTABLE} + ${PROJECT_SOURCE_DIR}/scripts/generate_wrappers.py + ${GENERATOR_ARGS} WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} RESULT_VARIABLE script_result ) @@ -622,7 +639,8 @@ if(GENERATE_OPERATOR_CALL_INSTANTIATIONS) "-I${CMAKE_CURRENT_SOURCE_DIR}" "-I${PROJECT_SOURCE_DIR}" "-I${PROJECT_SOURCE_DIR}/generated" - "-I${PROJECT_SOURCE_DIR}/generated/include") + "-I${PROJECT_SOURCE_DIR}/generated/include" + ${INFINI_RT_INCLUDE_FLAGS}) foreach(_dir IN LISTS TORCH_INCLUDE_DIRS CUDAToolkit_INCLUDE_DIRS) list(APPEND _iluvatar_call_instantiation_include_flags "-I${_dir}") endforeach() @@ -796,7 +814,8 @@ if(GENERATE_PYTHON_BINDINGS) set(_iluvatar_dispatch_include_flags "-I${CMAKE_CURRENT_SOURCE_DIR}" "-I${PROJECT_SOURCE_DIR}" - "-I${PROJECT_SOURCE_DIR}/generated") + "-I${PROJECT_SOURCE_DIR}/generated" + ${INFINI_RT_INCLUDE_FLAGS}) foreach(_dir IN LISTS TORCH_INCLUDE_DIRS CUDAToolkit_INCLUDE_DIRS) list(APPEND _iluvatar_dispatch_include_flags "-I${_dir}") endforeach() @@ -924,7 +943,14 @@ if(GENERATE_PYTHON_BINDINGS) set_target_properties(infiniops PROPERTIES INSTALL_RPATH "${_INFINI_OPS_INSTALL_RPATH}") set_target_properties(ops PROPERTIES INSTALL_RPATH "${_INFINI_OPS_INSTALL_RPATH}") - install(TARGETS infiniops ops DESTINATION .) + get_target_property(_infinirt_imported infinirt IMPORTED) + if(_infinirt_imported) + install(TARGETS infiniops ops DESTINATION .) + get_target_property(_infinirt_imported_location infinirt IMPORTED_LOCATION) + install(FILES "${_infinirt_imported_location}" DESTINATION .) + else() + install(TARGETS infinirt infiniops ops DESTINATION .) + endif() file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/__init__.py" "") install(FILES "${CMAKE_CURRENT_BINARY_DIR}/__init__.py" DESTINATION .) diff --git a/src/data_type.h b/src/data_type.h index 75483d2b8..4c7329cdc 100644 --- a/src/data_type.h +++ b/src/data_type.h @@ -1,192 +1,34 @@ #ifndef INFINI_OPS_DATA_TYPE_H_ #define INFINI_OPS_DATA_TYPE_H_ -#include -#include -#include +#include -#include "common/constexpr_map.h" #include "common/traits.h" #include "device.h" namespace infini::ops { -enum class DataType : std::int8_t { - kInt8, - kInt16, - kInt32, - kInt64, - kUInt8, - kUInt16, - kUInt32, - kUInt64, - kFloat16, - kBFloat16, - kFloat32, - kFloat64 -}; +using infini::rt::DataType; -constexpr ConstexprMap kDataTypeToSize{{{ - {DataType::kInt8, 1}, - {DataType::kInt16, 2}, - {DataType::kInt32, 4}, - {DataType::kInt64, 8}, - {DataType::kUInt8, 1}, - {DataType::kUInt16, 2}, - {DataType::kUInt32, 4}, - {DataType::kUInt64, 8}, - {DataType::kFloat16, 2}, - {DataType::kBFloat16, 2}, - {DataType::kFloat32, 4}, - {DataType::kFloat64, 8}, -}}}; +using infini::rt::BFloat16; +using infini::rt::Float16; -constexpr ConstexprMap kDataTypeToDesc{{{ - {DataType::kInt8, "int8"}, - {DataType::kInt16, "int16"}, - {DataType::kInt32, "int32"}, - {DataType::kInt64, "int64"}, - {DataType::kUInt8, "uint8"}, - {DataType::kUInt16, "uint16"}, - {DataType::kUInt32, "uint32"}, - {DataType::kUInt64, "uint64"}, - {DataType::kFloat16, "float16"}, - {DataType::kBFloat16, "bfloat16"}, - {DataType::kFloat32, "float32"}, - {DataType::kFloat64, "float64"}, -}}}; - -constexpr ConstexprMap kStringToDataType{{{ - {"int8", DataType::kInt8}, - {"int16", DataType::kInt16}, - {"int32", DataType::kInt32}, - {"int64", DataType::kInt64}, - {"uint8", DataType::kUInt8}, - {"uint16", DataType::kUInt16}, - {"uint32", DataType::kUInt32}, - {"uint64", DataType::kUInt64}, - {"float16", DataType::kFloat16}, - {"bfloat16", DataType::kBFloat16}, - {"float32", DataType::kFloat32}, - {"float64", DataType::kFloat64}, -}}}; - -struct Float16 { - std::uint16_t bits; - - static inline Float16 FromFloat(float val) { - std::uint32_t f32; - std::memcpy(&f32, &val, sizeof(f32)); - std::uint16_t sign = (f32 >> 16) & 0x8000; - std::int32_t exponent = ((f32 >> 23) & 0xFF) - 127; - std::uint32_t mantissa = f32 & 0x7FFFFF; - - if (exponent >= 16) { - // NaN - if (exponent == 128 && mantissa != 0) { - return {static_cast(sign | 0x7E00)}; - } - // Inf - return {static_cast(sign | 0x7C00)}; - } else if (exponent >= -14) { - return {static_cast(sign | ((exponent + 15) << 10) | - (mantissa >> 13))}; - } else if (exponent >= -24) { - mantissa |= 0x800000; - mantissa >>= (-14 - exponent); - return {static_cast(sign | (mantissa >> 13))}; - } - // Too small for subnormal: return signed zero. - return {sign}; - } - - inline float ToFloat() const { - std::uint32_t sign = (bits & 0x8000) << 16; - std::int32_t exponent = (bits >> 10) & 0x1F; - std::uint32_t mantissa = bits & 0x3FF; - std::uint32_t f32_bits; - - if (exponent == 31) { - f32_bits = sign | 0x7F800000 | (mantissa << 13); - } else if (exponent == 0) { - if (mantissa == 0) { - f32_bits = sign; - } else { - exponent = -14; - while ((mantissa & 0x400) == 0) { - mantissa <<= 1; - exponent--; - } - mantissa &= 0x3FF; - f32_bits = sign | ((exponent + 127) << 23) | (mantissa << 13); - } - } else { - f32_bits = sign | ((exponent + 127 - 15) << 23) | (mantissa << 13); - } - - float result; - std::memcpy(&result, &f32_bits, sizeof(result)); - return result; - } -}; - -struct BFloat16 { - std::uint16_t bits; - - static inline BFloat16 FromFloat(float val) { - std::uint32_t bits32; - std::memcpy(&bits32, &val, sizeof(bits32)); - - const std::uint32_t rounding_bias = 0x00007FFF + ((bits32 >> 16) & 1); - std::uint16_t bf16_bits = - static_cast((bits32 + rounding_bias) >> 16); - return {bf16_bits}; - } - - inline float ToFloat() const { - std::uint32_t bits32 = static_cast(bits) << 16; - float result; - std::memcpy(&result, &bits32, sizeof(result)); - return result; - } -}; +using infini::rt::kDataTypeToDesc; +using infini::rt::kDataTypeToSize; +using infini::rt::kStringToDataType; template -struct TypeMap; +using TypeMap = infini::rt::TypeMap; template -using TypeMapType = typename TypeMap::type; - -#define DEFINE_DATA_TYPE_MAPPING(ENUM_VALUE, CPP_TYPE) \ - template \ - struct TypeMap { \ - using type = CPP_TYPE; \ - }; - -DEFINE_DATA_TYPE_MAPPING(kUInt8, std::uint8_t) -DEFINE_DATA_TYPE_MAPPING(kInt8, std::int8_t) -DEFINE_DATA_TYPE_MAPPING(kUInt16, std::uint16_t) -DEFINE_DATA_TYPE_MAPPING(kInt16, std::int16_t) -DEFINE_DATA_TYPE_MAPPING(kUInt32, std::uint32_t) -DEFINE_DATA_TYPE_MAPPING(kInt32, std::int32_t) -DEFINE_DATA_TYPE_MAPPING(kUInt64, std::uint64_t) -DEFINE_DATA_TYPE_MAPPING(kInt64, std::int64_t) -DEFINE_DATA_TYPE_MAPPING(kFloat32, float) -DEFINE_DATA_TYPE_MAPPING(kFloat64, double) -#undef DEFINE_DATA_TYPE_MAPPING +using TypeMapType = infini::rt::TypeMapType; -// Checks whether a C++ type is the bfloat16 or float16 type for the given -// device. Full specializations for each device's float16/bfloat16 types are -// provided in the corresponding platform-specific device type headers. template -inline constexpr bool IsBFloat16 = - std::is_same_v>; +inline constexpr bool IsBFloat16 = infini::rt::IsBFloat16; template -inline constexpr bool IsFP16 = - std::is_same_v>; +inline constexpr bool IsFP16 = infini::rt::IsFP16; -// Defines the common categories of data types using List. using FloatTypes = List; using ReducedFloatTypes = List; using IntTypes = diff --git a/src/device.h b/src/device.h index 688cd0dc2..41a3af3b1 100644 --- a/src/device.h +++ b/src/device.h @@ -1,110 +1,23 @@ #ifndef INFINI_OPS_DEVICE_H_ #define INFINI_OPS_DEVICE_H_ -#include -#include +#include -#include "common/constexpr_map.h" #include "common/traits.h" -#include "hash.h" namespace infini::ops { -class Device { - public: - enum class Type { - kCpu = 0, - kNvidia = 1, - kCambricon = 2, - kAscend = 3, - kMetax = 4, - kMoore = 5, - kIluvatar = 6, - kKunlun = 7, - kHygon = 8, - kQy = 9, - kCount - }; - - Device() = default; - - Device(const Type& type, const int& index = 0) : type_{type}, index_{index} {} - - static const Type TypeFromString(const std::string& name) { - return kDescToDevice.at(name); - } - - static const std::string_view StringFromType(const Type& type) { - return kDeviceToDesc.at(type); - } - - const Type& type() const { return type_; } - - const int& index() const { return index_; } - - std::string ToString() const { - return std::string{StringFromType(type_)} + ":" + std::to_string(index_); - } - - bool operator==(const Device& other) const { - return type_ == other.type_ && index_ == other.index_; - } - - bool operator!=(const Device& other) const { return !(*this == other); } +using Device = infini::rt::Device; - private: - Type type_{Type::kCpu}; +template +using DeviceEnabled = infini::rt::DeviceEnabled; - static constexpr ConstexprMap(Device::Type::kCount)> - kDeviceToDesc{{{ - {Type::kCpu, "cpu"}, - {Type::kNvidia, "nvidia"}, - {Type::kCambricon, "cambricon"}, - {Type::kAscend, "ascend"}, - {Type::kMetax, "metax"}, - {Type::kMoore, "moore"}, - {Type::kIluvatar, "iluvatar"}, - {Type::kKunlun, "kunlun"}, - {Type::kHygon, "hygon"}, - {Type::kQy, "qy"}, - }}}; - - static constexpr ConstexprMap(Device::Type::kCount)> - kDescToDevice{{{ - {"cpu", Type::kCpu}, - {"nvidia", Type::kNvidia}, - {"cambricon", Type::kCambricon}, - {"ascend", Type::kAscend}, - {"metax", Type::kMetax}, - {"moore", Type::kMoore}, - {"iluvatar", Type::kIluvatar}, - {"kunlun", Type::kKunlun}, - {"hygon", Type::kHygon}, - {"qy", Type::kQy}, - }}}; - - int index_{0}; -}; - -// Primary template: Devices are disabled by default. Platform-specific -// headers (e.g. `cpu/device_.h`) specialize this to `std::true_type`. -template -struct DeviceEnabled : std::false_type {}; - -// Defines the common categories of devices using List. using AllDeviceTypes = List; -// Deferred computation of active devices. The `Filter` and `FilterList` -// evaluation are nested inside a class template so that `DeviceEnabled` -// specializations from platform `device_.h` headers are visible at -// instantiation time. Use with a dependent type parameter -// (e.g. `ActiveDevices`) to ensure deferred instantiation. template struct ActiveDevicesImpl { struct Filter { @@ -121,17 +34,4 @@ using ActiveDevices = typename ActiveDevicesImpl::type; } // namespace infini::ops -template <> -struct std::hash { - std::size_t operator()(const infini::ops::Device& device) const { - std::size_t seed{0}; - - HashCombine(seed, device.type()); - - HashCombine(seed, device.index()); - - return seed; - } -}; - #endif diff --git a/src/native/ascend/device_.h b/src/native/ascend/device_.h index 1b246ad38..8d69ab95e 100644 --- a/src/native/ascend/device_.h +++ b/src/native/ascend/device_.h @@ -1,13 +1,6 @@ #ifndef INFINI_OPS_ASCEND_DEVICE__H_ #define INFINI_OPS_ASCEND_DEVICE__H_ -#include "device.h" - -namespace infini::ops { - -template <> -struct DeviceEnabled : std::true_type {}; - -} // namespace infini::ops +#include #endif diff --git a/src/native/ascend/runtime_.h b/src/native/ascend/runtime_.h index 2b9e14136..ed21ad687 100644 --- a/src/native/ascend/runtime_.h +++ b/src/native/ascend/runtime_.h @@ -1,44 +1,8 @@ #ifndef INFINI_OPS_ASCEND_RUNTIME__H_ #define INFINI_OPS_ASCEND_RUNTIME__H_ -// clang-format off -#include "acl/acl.h" -// clang-format on +#include -#include "native/ascend/device_.h" #include "runtime.h" -namespace infini::ops { - -template <> -struct Runtime - : DeviceRuntime> { - using Stream = aclrtStream; - - static constexpr Device::Type kDeviceType = Device::Type::kAscend; - - static constexpr auto Malloc = [](void** ptr, size_t size) { - return aclrtMalloc(ptr, size, ACL_MEM_MALLOC_HUGE_FIRST); - }; - - static constexpr auto Free = aclrtFree; - - static constexpr auto Memcpy = [](void* dst, const void* src, size_t count, - aclrtMemcpyKind kind) { - return aclrtMemcpy(dst, count, src, count, kind); - }; - - static constexpr auto MemcpyHostToDevice = ACL_MEMCPY_HOST_TO_DEVICE; - - static constexpr auto MemcpyDeviceToHost = ACL_MEMCPY_DEVICE_TO_HOST; - - static constexpr auto Memset = [](void* ptr, int value, size_t count) { - return aclrtMemset(ptr, count, value, count); - }; -}; - -static_assert(Runtime::Validate()); - -} // namespace infini::ops - #endif diff --git a/src/native/cambricon/data_type_.h b/src/native/cambricon/data_type_.h index f4ca82da8..805b311ed 100644 --- a/src/native/cambricon/data_type_.h +++ b/src/native/cambricon/data_type_.h @@ -1,23 +1,6 @@ #ifndef INFINI_OPS_CAMBRICON_DATA_TYPE__H_ #define INFINI_OPS_CAMBRICON_DATA_TYPE__H_ -#include "bang_bf16.h" -#include "bang_fp16.h" -#include "data_type.h" -#include "native/cambricon/device_.h" - -namespace infini::ops { - -template <> -struct TypeMap { - using type = __half; -}; - -template <> -struct TypeMap { - using type = __bang_bfloat16; -}; - -} // namespace infini::ops +#include #endif diff --git a/src/native/cambricon/device_.h b/src/native/cambricon/device_.h index f168e4f25..d59054af8 100644 --- a/src/native/cambricon/device_.h +++ b/src/native/cambricon/device_.h @@ -1,13 +1,6 @@ #ifndef INFINI_OPS_CAMBRICON_DEVICE__H_ #define INFINI_OPS_CAMBRICON_DEVICE__H_ -#include "device.h" - -namespace infini::ops { - -template <> -struct DeviceEnabled : std::true_type {}; - -} // namespace infini::ops +#include #endif diff --git a/src/native/cambricon/runtime_.h b/src/native/cambricon/runtime_.h index 7ff30fe9c..1ca395d09 100644 --- a/src/native/cambricon/runtime_.h +++ b/src/native/cambricon/runtime_.h @@ -1,35 +1,8 @@ #ifndef INFINI_OPS_CAMBRICON_RUNTIME_H_ #define INFINI_OPS_CAMBRICON_RUNTIME_H_ -#include +#include -#include "native/cambricon/device_.h" #include "runtime.h" -namespace infini::ops { - -template <> -struct Runtime - : DeviceRuntime> { - using Stream = cnrtQueue_t; - - static constexpr Device::Type kDeviceType = Device::Type::kCambricon; - - static constexpr auto Malloc = cnrtMalloc; - - static constexpr auto Free = cnrtFree; - - static constexpr auto Memcpy = cnrtMemcpy; - - static constexpr auto MemcpyHostToDevice = cnrtMemcpyHostToDev; - - static constexpr auto MemcpyDeviceToHost = cnrtMemcpyDevToHost; - - static constexpr auto Memset = cnrtMemset; -}; - -static_assert(Runtime::Validate()); - -} // namespace infini::ops - #endif diff --git a/src/native/cpu/caster_.h b/src/native/cpu/caster_.h index 7da1bf365..3766191ed 100644 --- a/src/native/cpu/caster_.h +++ b/src/native/cpu/caster_.h @@ -1,10 +1,11 @@ #ifndef INFINI_OPS_COMMON_CPU_CASTER_H_ #define INFINI_OPS_COMMON_CPU_CASTER_H_ +#include + #include #include "caster.h" -#include "native/cpu/data_type_.h" namespace infini::ops { diff --git a/src/native/cpu/data_type_.h b/src/native/cpu/data_type_.h index 36231db51..815de89a4 100644 --- a/src/native/cpu/data_type_.h +++ b/src/native/cpu/data_type_.h @@ -1,21 +1,6 @@ #ifndef INFINI_OPS_CPU_DATA_TYPE__H_ #define INFINI_OPS_CPU_DATA_TYPE__H_ -#include "data_type.h" -#include "native/cpu/device_.h" - -namespace infini::ops { - -template <> -struct TypeMap { - using type = Float16; -}; - -template <> -struct TypeMap { - using type = BFloat16; -}; - -} // namespace infini::ops +#include #endif diff --git a/src/native/cpu/device_.h b/src/native/cpu/device_.h index e5e7d85a3..534e4dcd2 100644 --- a/src/native/cpu/device_.h +++ b/src/native/cpu/device_.h @@ -1,13 +1,6 @@ #ifndef INFINI_OPS_CPU_DEVICE__H_ #define INFINI_OPS_CPU_DEVICE__H_ -#include "device.h" - -namespace infini::ops { - -template <> -struct DeviceEnabled : std::true_type {}; - -} // namespace infini::ops +#include #endif diff --git a/src/native/cpu/runtime_.h b/src/native/cpu/runtime_.h index cb6176ba1..4cfb213ae 100644 --- a/src/native/cpu/runtime_.h +++ b/src/native/cpu/runtime_.h @@ -1,34 +1,8 @@ #ifndef INFINI_OPS_CPU_RUNTIME_H_ #define INFINI_OPS_CPU_RUNTIME_H_ -#include -#include +#include #include "runtime.h" -namespace infini::ops { - -template <> -struct Runtime : RuntimeBase> { - static constexpr Device::Type kDeviceType = Device::Type::kCpu; - - static void Malloc(void** ptr, std::size_t size) { *ptr = std::malloc(size); } - - static void Free(void* ptr) { std::free(ptr); } - - static void Memcpy(void* dst, const void* src, std::size_t size, int) { - std::memcpy(dst, src, size); - } - - static constexpr auto Memset = std::memset; - - static constexpr int MemcpyHostToDevice = 0; - - static constexpr int MemcpyDeviceToHost = 1; -}; - -static_assert(Runtime::Validate()); - -} // namespace infini::ops - #endif diff --git a/src/native/cuda/iluvatar/data_type_.h b/src/native/cuda/iluvatar/data_type_.h index ac997f992..b9e3ed8a3 100644 --- a/src/native/cuda/iluvatar/data_type_.h +++ b/src/native/cuda/iluvatar/data_type_.h @@ -1,29 +1,12 @@ #ifndef INFINI_OPS_ILUVATAR_DATA_TYPE__H_ #define INFINI_OPS_ILUVATAR_DATA_TYPE__H_ -// clang-format off -#include -#include -// clang-format on - -#include "data_type.h" -#include "native/cuda/iluvatar/device_.h" +#include namespace infini::ops { -using cuda_bfloat16 = nv_bfloat16; - -using cuda_bfloat162 = nv_bfloat162; - -template <> -struct TypeMap { - using type = half; -}; - -template <> -struct TypeMap { - using type = __nv_bfloat16; -}; +using infini::rt::cuda_bfloat16; +using infini::rt::cuda_bfloat162; } // namespace infini::ops diff --git a/src/native/cuda/iluvatar/device_.h b/src/native/cuda/iluvatar/device_.h index 4773201da..96743518f 100644 --- a/src/native/cuda/iluvatar/device_.h +++ b/src/native/cuda/iluvatar/device_.h @@ -1,13 +1,6 @@ #ifndef INFINI_OPS_ILUVATAR_DEVICE__H_ #define INFINI_OPS_ILUVATAR_DEVICE__H_ -#include "device.h" - -namespace infini::ops { - -template <> -struct DeviceEnabled : std::true_type {}; - -} // namespace infini::ops +#include #endif diff --git a/src/native/cuda/iluvatar/runtime_.h b/src/native/cuda/iluvatar/runtime_.h index 1e41b68e9..5983e5bbb 100644 --- a/src/native/cuda/iluvatar/runtime_.h +++ b/src/native/cuda/iluvatar/runtime_.h @@ -1,42 +1,9 @@ #ifndef INFINI_OPS_ILUVATAR_RUNTIME_H_ #define INFINI_OPS_ILUVATAR_RUNTIME_H_ -#include +#include -// clang-format off -#include -// clang-format on - -#include "native/cuda/iluvatar/device_.h" #include "native/cuda/iluvatar/runtime_utils.h" -#include "native/cuda/runtime_.h" - -namespace infini::ops { - -template <> -struct Runtime - : CudaRuntime> { - using Stream = cudaStream_t; - - static constexpr Device::Type kDeviceType = Device::Type::kIluvatar; - - static constexpr auto Malloc = [](auto&&... args) { - return cudaMalloc(std::forward(args)...); - }; - - static constexpr auto Memcpy = cudaMemcpy; - - static constexpr auto Free = cudaFree; - - static constexpr auto MemcpyHostToDevice = cudaMemcpyHostToDevice; - - static constexpr auto MemcpyDeviceToHost = cudaMemcpyDeviceToHost; - - static constexpr auto Memset = cudaMemset; -}; - -static_assert(Runtime::Validate()); - -} // namespace infini::ops +#include "runtime.h" #endif diff --git a/src/native/cuda/metax/data_type_.h b/src/native/cuda/metax/data_type_.h index 73e498774..003dd48c0 100644 --- a/src/native/cuda/metax/data_type_.h +++ b/src/native/cuda/metax/data_type_.h @@ -1,28 +1,12 @@ #ifndef INFINI_OPS_METAX_DATA_TYPE__H_ #define INFINI_OPS_METAX_DATA_TYPE__H_ -#include -#include -#include - -#include "data_type.h" -#include "native/cuda/metax/device_.h" +#include namespace infini::ops { -using cuda_bfloat16 = maca_bfloat16; - -using cuda_bfloat162 = maca_bfloat162; - -template <> -struct TypeMap { - using type = __half; -}; - -template <> -struct TypeMap { - using type = __maca_bfloat16; -}; +using infini::rt::cuda_bfloat16; +using infini::rt::cuda_bfloat162; } // namespace infini::ops diff --git a/src/native/cuda/metax/device_.h b/src/native/cuda/metax/device_.h index 6e7c677d7..7aa554ea2 100644 --- a/src/native/cuda/metax/device_.h +++ b/src/native/cuda/metax/device_.h @@ -1,13 +1,6 @@ #ifndef INFINI_OPS_METAX_DEVICE__H_ #define INFINI_OPS_METAX_DEVICE__H_ -#include "device.h" - -namespace infini::ops { - -template <> -struct DeviceEnabled : std::true_type {}; - -} // namespace infini::ops +#include #endif diff --git a/src/native/cuda/metax/runtime_.h b/src/native/cuda/metax/runtime_.h index 6563b6b41..01e9ae74e 100644 --- a/src/native/cuda/metax/runtime_.h +++ b/src/native/cuda/metax/runtime_.h @@ -1,36 +1,9 @@ #ifndef INFINI_OPS_METAX_RUNTIME_H_ #define INFINI_OPS_METAX_RUNTIME_H_ -#include +#include -#include "native/cuda/metax/device_.h" #include "native/cuda/metax/runtime_utils.h" -#include "native/cuda/runtime_.h" - -namespace infini::ops { - -template <> -struct Runtime - : CudaRuntime> { - using Stream = mcStream_t; - - static constexpr Device::Type kDeviceType = Device::Type::kMetax; - - static constexpr auto Malloc = mcMalloc; - - static constexpr auto Memcpy = mcMemcpy; - - static constexpr auto Free = mcFree; - - static constexpr auto MemcpyHostToDevice = mcMemcpyHostToDevice; - - static constexpr auto MemcpyDeviceToHost = mcMemcpyDeviceToHost; - - static constexpr auto Memset = mcMemset; -}; - -static_assert(Runtime::Validate()); - -} // namespace infini::ops +#include "runtime.h" #endif diff --git a/src/native/cuda/moore/data_type_.h b/src/native/cuda/moore/data_type_.h index c97cb33dd..7bc17d047 100644 --- a/src/native/cuda/moore/data_type_.h +++ b/src/native/cuda/moore/data_type_.h @@ -1,27 +1,12 @@ #ifndef INFINI_OPS_MOORE_DATA_TYPE__H_ #define INFINI_OPS_MOORE_DATA_TYPE__H_ -#include -#include - -#include "data_type.h" -#include "native/cuda/moore/device_.h" +#include namespace infini::ops { -using cuda_bfloat16 = __mt_bfloat16; - -using cuda_bfloat162 = __mt_bfloat162; - -template <> -struct TypeMap { - using type = half; -}; - -template <> -struct TypeMap { - using type = __mt_bfloat16; -}; +using infini::rt::cuda_bfloat16; +using infini::rt::cuda_bfloat162; } // namespace infini::ops diff --git a/src/native/cuda/moore/device_.h b/src/native/cuda/moore/device_.h index 472b9acf0..2b23db2dd 100644 --- a/src/native/cuda/moore/device_.h +++ b/src/native/cuda/moore/device_.h @@ -1,13 +1,6 @@ #ifndef INFINI_OPS_MOORE_DEVICE__H_ #define INFINI_OPS_MOORE_DEVICE__H_ -#include "device.h" - -namespace infini::ops { - -template <> -struct DeviceEnabled : std::true_type {}; - -} // namespace infini::ops +#include #endif diff --git a/src/native/cuda/moore/runtime_.h b/src/native/cuda/moore/runtime_.h index bc519b41e..14f3411b1 100644 --- a/src/native/cuda/moore/runtime_.h +++ b/src/native/cuda/moore/runtime_.h @@ -1,44 +1,9 @@ #ifndef INFINI_OPS_MOORE_RUNTIME_H_ #define INFINI_OPS_MOORE_RUNTIME_H_ -#include +#include -#include - -#include "native/cuda/moore/device_.h" #include "native/cuda/moore/runtime_utils.h" -#include "native/cuda/runtime_.h" - -namespace infini::ops { - -template <> -struct Runtime - : CudaRuntime> { - using Stream = musaStream_t; - - static constexpr Device::Type kDeviceType = Device::Type::kMoore; - - static constexpr auto Malloc = [](auto&&... args) { - return musaMalloc(std::forward(args)...); - }; - - static constexpr auto Memcpy = [](auto&&... args) { - return musaMemcpy(std::forward(args)...); - }; - - static constexpr auto Free = [](auto&&... args) { - return musaFree(std::forward(args)...); - }; - - static constexpr auto MemcpyHostToDevice = musaMemcpyHostToDevice; - - static constexpr auto MemcpyDeviceToHost = musaMemcpyDeviceToHost; - - static constexpr auto Memset = musaMemset; -}; - -static_assert(Runtime::Validate()); - -} // namespace infini::ops +#include "runtime.h" #endif diff --git a/src/native/cuda/nvidia/data_type_.h b/src/native/cuda/nvidia/data_type_.h index 1266c0c8a..7f1fdc9e8 100644 --- a/src/native/cuda/nvidia/data_type_.h +++ b/src/native/cuda/nvidia/data_type_.h @@ -1,29 +1,12 @@ #ifndef INFINI_OPS_NVIDIA_DATA_TYPE__H_ #define INFINI_OPS_NVIDIA_DATA_TYPE__H_ -// clang-format off -#include -#include -// clang-format on - -#include "data_type.h" -#include "native/cuda/nvidia/device_.h" +#include namespace infini::ops { -using cuda_bfloat16 = nv_bfloat16; - -using cuda_bfloat162 = nv_bfloat162; - -template <> -struct TypeMap { - using type = half; -}; - -template <> -struct TypeMap { - using type = __nv_bfloat16; -}; +using infini::rt::cuda_bfloat16; +using infini::rt::cuda_bfloat162; } // namespace infini::ops diff --git a/src/native/cuda/nvidia/device_.h b/src/native/cuda/nvidia/device_.h index 52cbb2b84..b46caf035 100644 --- a/src/native/cuda/nvidia/device_.h +++ b/src/native/cuda/nvidia/device_.h @@ -1,13 +1,6 @@ #ifndef INFINI_OPS_NVIDIA_DEVICE__H_ #define INFINI_OPS_NVIDIA_DEVICE__H_ -#include "device.h" - -namespace infini::ops { - -template <> -struct DeviceEnabled : std::true_type {}; - -} // namespace infini::ops +#include #endif diff --git a/src/native/cuda/nvidia/runtime_.h b/src/native/cuda/nvidia/runtime_.h index 326ecdb24..d3474881d 100644 --- a/src/native/cuda/nvidia/runtime_.h +++ b/src/native/cuda/nvidia/runtime_.h @@ -1,42 +1,9 @@ #ifndef INFINI_OPS_NVIDIA_RUNTIME_H_ #define INFINI_OPS_NVIDIA_RUNTIME_H_ -#include +#include -// clang-format off -#include -// clang-format on - -#include "native/cuda/nvidia/device_.h" #include "native/cuda/nvidia/runtime_utils.h" -#include "native/cuda/runtime_.h" - -namespace infini::ops { - -template <> -struct Runtime - : CudaRuntime> { - using Stream = cudaStream_t; - - static constexpr Device::Type kDeviceType = Device::Type::kNvidia; - - static constexpr auto Malloc = [](auto&&... args) { - return cudaMalloc(std::forward(args)...); - }; - - static constexpr auto Memcpy = cudaMemcpy; - - static constexpr auto Free = cudaFree; - - static constexpr auto MemcpyHostToDevice = cudaMemcpyHostToDevice; - - static constexpr auto MemcpyDeviceToHost = cudaMemcpyDeviceToHost; - - static constexpr auto Memset = cudaMemset; -}; - -static_assert(Runtime::Validate()); - -} // namespace infini::ops +#include "runtime.h" #endif diff --git a/src/runtime.h b/src/runtime.h index 38257893c..251be195d 100644 --- a/src/runtime.h +++ b/src/runtime.h @@ -1,54 +1,15 @@ #ifndef INFINI_OPS_RUNTIME_H_ #define INFINI_OPS_RUNTIME_H_ -#include - -#include "device.h" +#include namespace infini::ops { template -struct Runtime; - -/// ## Interface enforcement via CRTP. -/// -/// Inherit from the appropriate base to declare which interface level a -/// `Runtime` specialization implements. After the struct is fully defined, call -/// `static_assert(Runtime<...>::Validate())`. The chained `Validate()` checks -/// every required member's existence and signature at compile time, analogous -/// to how `override` catches signature mismatches for virtual functions. -/// -/// - `RuntimeBase`: `kDeviceType` only (e.g. CPU). -/// - `DeviceRuntime`: adds `Stream`, `Malloc`, and `Free` (e.g. Cambricon). - -/// Every Runtime must provide `static constexpr Device::Type kDeviceType`. -template -struct RuntimeBase { - static constexpr bool Validate() { - static_assert( - std::is_same_v, - Device::Type>, - "`Runtime` must define `static constexpr Device::Type kDeviceType`."); - return true; - } -}; +using Runtime = infini::rt::Runtime; -/// Runtimes with device memory must additionally provide `Stream`, `Malloc`, -/// and `Free`. template -struct DeviceRuntime : RuntimeBase { - static constexpr bool Validate() { - RuntimeBase::Validate(); - static_assert(sizeof(typename Derived::Stream) > 0, - "`Runtime` must define a `Stream` type alias."); - static_assert( - std::is_invocable_v, - "`Runtime::Malloc` must be callable with `(void**, size_t)`."); - static_assert(std::is_invocable_v, - "`Runtime::Free` must be callable with `(void*)`."); - return true; - } -}; +using DeviceRuntime = infini::rt::DeviceRuntime; } // namespace infini::ops diff --git a/src/tensor.h b/src/tensor.h index 290e3cf96..48dc21d54 100644 --- a/src/tensor.h +++ b/src/tensor.h @@ -1,157 +1,12 @@ #ifndef INFINI_OPS_TENSOR_H_ #define INFINI_OPS_TENSOR_H_ -#include -#include -#include - -#include "data_type.h" -#include "device.h" -#include "hash.h" +#include namespace infini::ops { -class Tensor { - public: - using Size = std::size_t; - - using Stride = std::ptrdiff_t; - - using Index = Stride; - - using Shape = std::vector; - - using Strides = std::vector; - - template - Tensor(void* data, const Shape& shape) - : data_{data}, - shape_{shape}, - dtype_{DefaultDataType()}, - device_{DefaultDevice()}, - strides_{DefaultStrides(shape)} {} - - template - Tensor(void* data, const Shape& shape, const DataType& dtype) - : data_{data}, - shape_{shape}, - dtype_{dtype}, - device_{DefaultDevice()}, - strides_{DefaultStrides(shape)} {} - - template - Tensor(void* data, const Shape& shape, const Device& device) - : data_{data}, - shape_{shape}, - dtype_{DefaultDataType()}, - device_{device}, - strides_{DefaultStrides(shape)} {} - - template - Tensor(void* data, const Shape& shape, const DataType& dtype, - const Device& device) - : data_{data}, - shape_{shape}, - dtype_{dtype}, - device_{device}, - strides_{DefaultStrides(shape)} {} - - template - Tensor(void* data, const Shape& shape, const DataType& dtype, - const Device& device, const Strides& strides) - : data_{data}, - shape_{shape}, - dtype_{dtype}, - device_{device}, - strides_{strides} {} - - Tensor(void* data, std::initializer_list shape, const DataType& dtype, - const Device& device, std::initializer_list strides); - - Tensor operator[](const Index& index) const; - - void*& data(); - - const void* data() const; - - const DataType& dtype() const; - - const Device& device() const; - - const Shape& shape() const; - - const Strides& strides() const; - - Size size(const Index& index) const; - - Stride stride(const Index& index) const; - - Size ndim() const; - - Size element_size() const; - - Size numel() const; - - Tensor T() const; - - std::string ToString() const; - - bool HasBroadcastDim() const; - - bool IsContiguous() const; - - private: - static const DataType DefaultDataType(); - - static Device DefaultDevice(); - - static Strides DefaultStrides(const Shape& shape); - - std::string ToStringHelper() const; - - bool IsMergeable(Size dim_start, Size dim_end) const; - - void* data_{nullptr}; - - Shape shape_; - - const DataType dtype_; - - Device device_; - - Strides strides_; -}; +using Tensor = infini::rt::TensorView; } // namespace infini::ops -template <> -struct std::hash { - std::size_t operator()(const infini::ops::Tensor& tensor) const { - std::size_t seed{0}; - - for (const auto& size : tensor.shape()) { - HashCombine(seed, size); - } - - HashCombine(seed, tensor.dtype()); - - HashCombine(seed, tensor.device()); - - for (const auto& stride : tensor.strides()) { - HashCombine(seed, stride); - } - - return seed; - } -}; - -template <> -struct std::equal_to { - bool operator()(const infini::ops::Tensor& a, - const infini::ops::Tensor& b) const { - return a.dtype() == b.dtype() && a.device() == b.device() && - a.shape() == b.shape() && a.strides() == b.strides(); - } -}; - #endif diff --git a/tests/test_generate_wrappers.py b/tests/test_generate_wrappers.py index b4c97f868..931677553 100644 --- a/tests/test_generate_wrappers.py +++ b/tests/test_generate_wrappers.py @@ -62,16 +62,69 @@ class Clamp { text = "\n".join(declarations) assert ( - "MakeClamp(const Config& config, const Tensor input, " + "MakeClamp(const Config& config, Tensor input, " "const std::optional min, const std::optional max, " "Tensor out)" ) in text assert ( - "MakeClamp(const Config& config, const Tensor input, " + "MakeClamp(const Config& config, Tensor input, " "std::optional min, std::optional max, Tensor out)" ) in text +def test_operator_call_instantiations_keep_optional_scalar_and_tensor_overloads_distinct( + monkeypatch, tmp_path +): + module = _load_generator_module() + base_header = tmp_path / "clamp.h" + base_header.write_text( + """ +class Clamp { + public: + virtual void operator()(const Tensor input, const std::optional min, + const std::optional max, Tensor out) const = 0; + virtual void operator()(const Tensor input, const std::optional min, + const std::optional max, Tensor out) const = 0; +}; +""" + ) + monkeypatch.setattr(module, "_find_base_header", lambda op_name: base_header) + + operator = module._Operator( + "clamp", + constructors=[], + calls=[ + module._ParsedFunction( + [ + module._ParsedArgument("const Tensor", "input"), + module._ParsedArgument("const std::optional", "min"), + module._ParsedArgument("const std::optional", "max"), + module._ParsedArgument("Tensor", "out"), + ] + ), + module._ParsedFunction( + [ + module._ParsedArgument("const Tensor", "input"), + module._ParsedArgument("const std::optional", "min"), + module._ParsedArgument("const std::optional", "max"), + module._ParsedArgument("Tensor", "out"), + ] + ), + ], + ) + + declarations, _ = module._generate_operator_call_instantiation_entries(operator) + + text = "\n".join(declarations) + + assert ( + "Call, std::optional, Tensor>" + ) in text + assert ( + "Call, std::optional, Tensor>" + ) in text + + def test_pybind_default_implementation_uses_first_active_index(monkeypatch, tmp_path): module = _load_generator_module() base_header = tmp_path / "mul.h"