diff --git a/.github/ci_config.yml b/.github/ci_config.yml
index b9979570a..65b2b208f 100644
--- a/.github/ci_config.yml
+++ b/.github/ci_config.yml
@@ -15,7 +15,7 @@ platforms:
         BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3
         SKIP_APT: "1"
         PIP_INDEX_URL: https://pypi.tuna.tsinghua.edu.cn/simple
-    setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON
+    setup: pip install scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_NVIDIA=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix
     jobs:
       gpu:
         type: unittest
@@ -51,7 +51,7 @@ platforms:
       - /lib/firmware:/lib/firmware
       - /usr/src:/usr/src
       - /lib/modules:/lib/modules
-    setup: python -m pip install packaging exceptiongroup typing-extensions pygments pybind11 libclang && python -m pip install . --no-build-isolation --no-deps --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON
+    setup: python -m pip install scikit-build-core packaging exceptiongroup typing-extensions pygments pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_ILUVATAR=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix -DCMAKE_CUDA_ARCHITECTURES=OFF -DCMAKE_CUDA_COMPILER=/usr/local/corex-4.3.0.20250624/bin/clang++ -DCUDAToolkit_ROOT=/usr/local/corex-4.3.0.20250624 && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && python -m pip install . --no-build-isolation --no-deps --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix
     jobs:
       gpu:
         type: unittest
@@ -81,7 +81,7 @@ platforms:
       - "--privileged"
       - "--ulimit=memlock=-1"
       - "--ulimit=stack=67108864"
-    setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON
+    setup: pip install scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_METAX=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix
     jobs:
       gpu:
         type: unittest
@@ -109,7 +109,7 @@ platforms:
         PIP_INDEX_URL: https://pypi.org/simple
     docker_args:
       - "--privileged"
-    setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON
+    setup: pip install scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_MOORE=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix
     jobs:
       gpu:
         type: unittest
@@ -136,7 +136,7 @@ platforms:
         PIP_INDEX_URL: https://pypi.org/simple
     docker_args:
       - "--privileged"
-    setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON
+    setup: pip install scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_CAMBRICON=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix
     jobs:
       gpu:
         type: unittest
@@ -172,7 +172,7 @@ platforms:
       - "--group-add=video"
     volumes:
       - /opt/hyhal:/opt/hyhal:ro
-    setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON
+    setup: pip install scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_HYGON=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix
     jobs:
       gpu:
         type: unittest
@@ -209,7 +209,7 @@ platforms:
       - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi:ro
     env:
       ASCEND_HOME_PATH: /usr/local/Ascend/ascend-toolkit/latest
-    setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON
+    setup: pip install scikit-build-core pybind11 libclang pyyaml && rm -rf /tmp/infinirt-src /tmp/infinirt-build /tmp/infinirt-prefix && git clone --depth 1 ${INFINI_RT_GIT_URL:-https://gh-proxy.com/https://github.com/InfiniTensor/InfiniRT.git} /tmp/infinirt-src && cmake -S /tmp/infinirt-src -B /tmp/infinirt-build -DWITH_CPU=ON -DWITH_ASCEND=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/infinirt-prefix && cmake --build /tmp/infinirt-build -j$(nproc) && cmake --install /tmp/infinirt-build && pip install .[dev] --no-build-isolation --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON --config-settings=cmake.define.INFINI_RT_ROOT=/tmp/infinirt-prefix
     jobs:
       npu:
         type: unittest
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2d10efdbe..39c2d25e9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -294,8 +294,6 @@ if(WITH_TORCH)
     message(STATUS "Found PyTorch: ${TORCH_INCLUDE_DIRS}")
 endif()
 
-include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)
-
 # Only one CUDA-like GPU backend can be enabled at a time.
 set(_gpu_backend_count 0)
 foreach(_gpu_backend WITH_NVIDIA WITH_ILUVATAR WITH_HYGON WITH_METAX WITH_MOORE WITH_ASCEND)
@@ -502,6 +500,90 @@ if(WITH_HYGON AND NOT EXISTS "${DTK_ROOT}/llvm/lib/LLVMgold.so")
     set(PYBIND11_ENABLE_EXTRAS OFF)
 endif()
 
+set(INFINI_RT_ROOT "$ENV{INFINI_RT_ROOT}" CACHE PATH "Installed InfiniRT prefix")
+set(INFINI_RT_SOURCE_DIR "" CACHE PATH "InfiniRT source directory for development builds")
+set(INFINI_RT_INCLUDE_DIRS "" CACHE STRING "InfiniRT include directories")
+set(INFINI_RT_LIBRARY "" CACHE FILEPATH "InfiniRT library")
+
+function(_infiniops_configure_infinirt)
+    if(TARGET infinirt)
+        get_target_property(_infinirt_include_dirs infinirt INTERFACE_INCLUDE_DIRECTORIES)
+        if(_infinirt_include_dirs)
+            set(INFINI_RT_INCLUDE_DIRS "${_infinirt_include_dirs}" CACHE STRING "InfiniRT include directories" FORCE)
+        endif()
+        return()
+    endif()
+
+    if(INFINI_RT_SOURCE_DIR)
+        if(NOT EXISTS "${INFINI_RT_SOURCE_DIR}/CMakeLists.txt")
+            message(FATAL_ERROR
+                "InfiniRT source tree not found at `${INFINI_RT_SOURCE_DIR}`. "
+                "Unset `INFINI_RT_SOURCE_DIR` to use an installed InfiniRT package.")
+        endif()
+
+        add_subdirectory("${INFINI_RT_SOURCE_DIR}" "${CMAKE_BINARY_DIR}/InfiniRT")
+        set(INFINI_RT_INCLUDE_DIRS
+            "${INFINI_RT_SOURCE_DIR}/include;${INFINI_RT_SOURCE_DIR}/generated/include"
+            CACHE STRING "InfiniRT include directories" FORCE)
+        return()
+    endif()
+
+    set(_infinirt_include_hints ${INFINI_RT_INCLUDE_DIRS})
+    if(INFINI_RT_ROOT)
+        list(APPEND _infinirt_include_hints "${INFINI_RT_ROOT}/include")
+    endif()
+
+    find_path(_INFINI_RT_INCLUDE_DIR
+        NAMES infini/rt.h
+        HINTS ${_infinirt_include_hints}
+        PATH_SUFFIXES include
+    )
+
+    if(INFINI_RT_LIBRARY)
+        set(_INFINI_RT_LIBRARY "${INFINI_RT_LIBRARY}")
+    else()
+        set(_infinirt_library_hints "")
+        if(INFINI_RT_ROOT)
+            list(APPEND _infinirt_library_hints
+                "${INFINI_RT_ROOT}/lib"
+                "${INFINI_RT_ROOT}/lib64")
+        endif()
+        find_library(_INFINI_RT_LIBRARY
+            NAMES infinirt
+            HINTS ${_infinirt_library_hints}
+        )
+    endif()
+
+    if(NOT _INFINI_RT_INCLUDE_DIR OR NOT _INFINI_RT_LIBRARY)
+        message(FATAL_ERROR
+            "InfiniRT installed headers/library were not found. "
+            "Set `INFINI_RT_ROOT` to an installed InfiniRT prefix, or set "
+            "`INFINI_RT_INCLUDE_DIRS` and `INFINI_RT_LIBRARY` explicitly. "
+            "For development builds from source, set `INFINI_RT_SOURCE_DIR`.")
+    endif()
+
+    set(_infinirt_resolved_include_dirs "")
+    foreach(_infinirt_include_dir IN LISTS _infinirt_include_hints)
+        if(_infinirt_include_dir AND EXISTS "${_infinirt_include_dir}")
+            list(APPEND _infinirt_resolved_include_dirs "${_infinirt_include_dir}")
+        endif()
+    endforeach()
+    list(APPEND _infinirt_resolved_include_dirs "${_INFINI_RT_INCLUDE_DIR}")
+    list(REMOVE_DUPLICATES _infinirt_resolved_include_dirs)
+
+    add_library(infinirt SHARED IMPORTED GLOBAL)
+    set_target_properties(infinirt PROPERTIES
+        IMPORTED_LOCATION "${_INFINI_RT_LIBRARY}"
+        INTERFACE_INCLUDE_DIRECTORIES "${_infinirt_resolved_include_dirs}"
+    )
+
+    set(INFINI_RT_INCLUDE_DIRS "${_infinirt_resolved_include_dirs}" CACHE STRING "InfiniRT include directories" FORCE)
+    set(INFINI_RT_LIBRARY "${_INFINI_RT_LIBRARY}" CACHE FILEPATH "InfiniRT library" FORCE)
+    message(STATUS "Using installed InfiniRT: ${_INFINI_RT_LIBRARY}")
+endfunction()
+
+_infiniops_configure_infinirt()
+
 add_subdirectory(src)
 
 if(NOT GENERATE_PYTHON_BINDINGS)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 53240b134..39b38497f 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -69,13 +69,21 @@ Branch names use the format `<type>/xxx-yyyy-zzzz`, where `<type>` matches the P
 Using Nvidia as an example:
 
 ```bash
-pip install .[dev] -C cmake.define.WITH_CPU=ON -C cmake.define.WITH_NVIDIA=ON
+pip install .[dev] \
+  -C cmake.define.INFINI_RT_ROOT=/path/to/infinirt-prefix \
+  -C cmake.define.WITH_CPU=ON \
+  -C cmake.define.WITH_NVIDIA=ON
 ```
 
-Auto-detection is supported for some platforms, so you can also simply run:
+`/path/to/infinirt-prefix` is the InfiniRT install prefix, typically the value
+used for InfiniRT's `CMAKE_INSTALL_PREFIX`. It should contain
+`include/infini/rt.h` and `lib/libinfinirt.so`.
+
+Auto-detection is supported for some platforms, so you can also let InfiniOps
+detect the device backends while still pointing it at the installed InfiniRT:
 
 ```bash
-pip install .[dev]
+pip install .[dev] -C cmake.define.INFINI_RT_ROOT=/path/to/infinirt-prefix
 ```
 
 > `[dev]` installs optional development dependencies (e.g. `pytest`) that are not needed for production but required for development and testing. After the first install, subsequent installs only need `pip install .`.
@@ -94,6 +102,7 @@ For routine development and pull requests, start with a smoke build plus the smo
 
 ```bash
 python -m pip install .[dev] --no-build-isolation --no-deps \
+  --config-settings=cmake.define.INFINI_RT_ROOT=/path/to/infinirt-prefix \
   --config-settings=cmake.define.INFINI_OPS_SMOKE_BUILD=ON
 python -m pytest tests -m smoke -q
 ```
@@ -251,5 +260,7 @@ using T = TypeMapType<Device::Type::kCpu, ListGet<0>(list_tag)>;
 2. **Segmentation fault during tests**: Run `pytest -n 1`.
 3. **`Unknown CMake command "pybind11_add_module"`**: Install pybind11 with `pip install pybind11[global]`. See the [pybind11 installation guide](https://pybind11.readthedocs.io/en/stable/installing.html).
 4. **Auto-detection (`AUTO_DETECT_DEVICES`) fails**: Some machines may not expose devices in expected paths (e.g. `/dev/nvidia*`). Use explicit CMake defines instead (e.g. `-C cmake.define.WITH_NVIDIA=ON`).
-5. **`bash: pytest: command not found`**: Use `python -m pytest`.
-6. **`CUBLAS_STATUS_INVALID_VALUE` in `cublasSgemmStridedBatched`**: PyTorch version issue. Downgrade to `torch<=2.9.1`.
+5. **InfiniRT headers or library are not found**: Set `INFINI_RT_ROOT` to the InfiniRT install prefix, or set `INFINI_RT_INCLUDE_DIRS` and `INFINI_RT_LIBRARY` explicitly.
+6. **`ImportError: libinfinirt.so: cannot open shared object file`**: Reinstall InfiniOps from a build that was configured with the InfiniRT install prefix so the Python wheel can bundle the runtime library.
+7. **`bash: pytest: command not found`**: Use `python -m pytest`.
+8. **`CUBLAS_STATUS_INVALID_VALUE` in `cublasSgemmStridedBatched`**: PyTorch version issue. Downgrade to `torch<=2.9.1`.
diff --git a/README.md b/README.md
index 44a8c71f1..4a8c14232 100644
--- a/README.md
+++ b/README.md
@@ -8,21 +8,33 @@ InfiniOps is a high-performance, cross-platform operator library supporting mult
 - CMake 3.18+
 - Python 3.10+
 - Hardware-specific SDKs (e.g. CUDA Toolkit, MUSA Toolkit)
+- Installed InfiniRT headers and library
 
 ## Installation
 
-Install with pip (recommended):
+Install InfiniRT first, then build InfiniOps with the InfiniRT install prefix:
 
 ```bash
-pip install .
+pip install . -C cmake.define.INFINI_RT_ROOT=/path/to/infinirt-prefix
 ```
 
-This auto-detects available platforms on supported backends. To specify platforms explicitly:
+`/path/to/infinirt-prefix` is the directory passed to InfiniRT as
+`CMAKE_INSTALL_PREFIX`; it should contain `include/infini/rt.h` and
+`lib/libinfinirt.so`.
+
+InfiniOps auto-detects available platforms on supported backends. To specify
+platforms explicitly:
 
 ```bash
-pip install . -C cmake.define.WITH_CPU=ON -C cmake.define.WITH_NVIDIA=ON
+pip install . \
+  -C cmake.define.INFINI_RT_ROOT=/path/to/infinirt-prefix \
+  -C cmake.define.WITH_CPU=ON \
+  -C cmake.define.WITH_NVIDIA=ON
 ```
 
+The Python wheel installs the required InfiniRT shared library next to the
+InfiniOps extension so `import infini.ops` can load its runtime dependency.
+
 ### CMake Options
 
 | Option | Description | Default |
@@ -36,6 +48,10 @@ pip install . -C cmake.define.WITH_CPU=ON -C cmake.define.WITH_NVIDIA=ON
 | `-DWITH_CAMBRICON=[ON\|OFF]` | Compile the Cambricon implementation | OFF |
 | `-DWITH_ASCEND=[ON\|OFF]` | Compile the Ascend implementation | OFF |
 | `-DAUTO_DETECT_DEVICES=[ON\|OFF]` | Auto-detect available platforms | ON |
+| `-DINFINI_RT_ROOT=<path>` | InfiniRT install prefix containing `include/` and `lib/` | `$INFINI_RT_ROOT` |
+| `-DINFINI_RT_INCLUDE_DIRS=<paths>` | Explicit InfiniRT include directories | empty |
+| `-DINFINI_RT_LIBRARY=<path>` | Explicit path to `libinfinirt.so` | empty |
+| `-DINFINI_RT_SOURCE_DIR=<path>` | InfiniRT source tree for development builds | empty |
 
 If no accelerator options are provided and auto-detection finds nothing, `WITH_CPU` is enabled by default.
 
diff --git a/examples/runtime_api.h b/examples/runtime_api.h
index 101fcad59..1bbfdc1f3 100644
--- a/examples/runtime_api.h
+++ b/examples/runtime_api.h
@@ -2,6 +2,7 @@
 #define INFINI_OPS_EXAMPLES_RUNTIME_API_H_
 
 #include "device.h"
+#include "runtime.h"
 
 #ifdef WITH_NVIDIA
 #include "native/cuda/nvidia/ops/gemm/cublas.h"
@@ -23,8 +24,9 @@
 #include "native/ascend/ops/gemm/kernel.h"
 #include "native/ascend/runtime_.h"
 #elif WITH_CPU
+#include <infini/rt/cpu/runtime_.h>
+
 #include "native/cpu/ops/gemm/gemm.h"
-#include "native/cpu/runtime_.h"
 #else
 #error "One `WITH_*` backend must be enabled for the examples."
 #endif
diff --git a/scripts/generate_wrappers.py b/scripts/generate_wrappers.py
index 31e1ab0aa..0384e0423 100644
--- a/scripts/generate_wrappers.py
+++ b/scripts/generate_wrappers.py
@@ -83,6 +83,38 @@ def _op_relative_type(op_name):
     return "::".join(parts)
 
 
+def _get_infini_rt_include_flags():
+    include_dirs = []
+
+    for include_dir in os.environ.get("INFINI_RT_INCLUDE_DIRS", "").split(os.pathsep):
+        if include_dir:
+            include_dirs.append(pathlib.Path(include_dir))
+
+    infini_rt_root = os.environ.get("INFINI_RT_ROOT")
+    if infini_rt_root:
+        include_dirs.append(pathlib.Path(infini_rt_root) / "include")
+
+    infini_rt_source_dir = os.environ.get("INFINI_RT_SOURCE_DIR")
+    if infini_rt_source_dir:
+        infini_rt_source_path = pathlib.Path(infini_rt_source_dir)
+        include_dirs.extend(
+            (
+                infini_rt_source_path / "include",
+                infini_rt_source_path / "generated" / "include",
+            )
+        )
+
+    flags = []
+    seen = set()
+    for include_dir in include_dirs:
+        include_dir = include_dir.resolve()
+        if include_dir.exists() and include_dir not in seen:
+            flags.extend(("-I", str(include_dir)))
+            seen.add(include_dir)
+
+    return tuple(flags)
+
+
 def _write_text_if_changed(path: pathlib.Path, content: str) -> bool:
     """Write `content` only when the file's bytes would change."""
     path.parent.mkdir(parents=True, exist_ok=True)
@@ -185,14 +217,18 @@ def __call__(self, op_name):
 
         index = clang.cindex.Index.create()
         args = (
-            "-std=c++17",
-            "-x",
-            "c++",
-            "-I",
-            "src",
-            "-I",
-            str(_GENERATION_DIR),
-        ) + _get_system_include_flags()
+            (
+                "-std=c++17",
+                "-x",
+                "c++",
+                "-I",
+                "src",
+                "-I",
+                str(_GENERATION_DIR),
+            )
+            + _get_infini_rt_include_flags()
+            + _get_system_include_flags()
+        )
         translation_unit = index.parse(str(_find_base_header(op_name)), args=args)
 
         nodes = tuple(type(self)._find(translation_unit.cursor, op_name))
@@ -407,6 +443,17 @@ def _find_vector_int64_params(op_name):
     return set(re.findall(r"std::vector<int64_t>\s+(\w+)", source))
 
 
+def _find_tensor_params(op_name):
+    source = _find_base_header(op_name).read_text()
+
+    params = set()
+    params.update(re.findall(r"(?:^|[,(]\s*)(?:const\s+)?Tensor\s+(\w+)", source))
+    params.update(_find_optional_tensor_params(op_name))
+    params.update(_find_vector_tensor_params(op_name))
+
+    return params
+
+
 def _generate_pybind11(operator):
     optional_tensor_params = _find_optional_tensor_params(operator.name)
     optional_non_tensor_params = _find_optional_non_tensor_params(operator.name)
@@ -840,6 +887,7 @@ def _generate_tensor_caster(name, is_data=False):
 def _generate_generated_dispatch_entries(operator):
     optional_tensor_params = _find_optional_tensor_params(operator.name)
     optional_non_tensor_params = _find_optional_non_tensor_params(operator.name)
+    tensor_params = _find_tensor_params(operator.name)
     vector_tensor_params = _find_vector_tensor_params(operator.name)
     vector_int64_params = _find_vector_int64_params(operator.name)
 
@@ -863,6 +911,15 @@ def _is_vector_tensor(arg):
     def _is_vector_int64(arg):
         return arg.spelling in vector_int64_params
 
+    def _is_tensor(arg):
+        if arg.spelling in optional_non_tensor_params:
+            return False
+
+        if arg.spelling in tensor_params:
+            return True
+
+        return "Tensor" in arg.type.spelling or "TensorView" in arg.type.spelling
+
     def _generate_params(node):
         parts = []
 
@@ -876,6 +933,8 @@ def _generate_params(node):
                 parts.append(f"std::vector<Tensor> {arg.spelling}")
             elif _is_vector_int64(arg):
                 parts.append(f"std::vector<int64_t> {arg.spelling}")
+            elif _is_tensor(arg):
+                parts.append(f"Tensor {arg.spelling}")
             else:
                 parts.append(f"{arg.type.spelling} {arg.spelling}")
 
@@ -1032,16 +1091,71 @@ def _strip_top_level_const(type_spelling):
 
 
 def _generate_operator_call_instantiation_entries(operator):
+    optional_tensor_params = _find_optional_tensor_params(operator.name)
+    optional_non_tensor_params = _find_optional_non_tensor_params(operator.name)
+    tensor_params = _find_tensor_params(operator.name)
+    vector_tensor_params = _find_vector_tensor_params(operator.name)
+    vector_int64_params = _find_vector_int64_params(operator.name)
+
+    def _is_optional_tensor(arg):
+        spelling = arg.type.spelling
+
+        if "std::optional" in spelling:
+            return "Tensor" in spelling or "TensorView" in spelling
+
+        if arg.spelling in optional_non_tensor_params:
+            return False
+
+        if arg.spelling in optional_tensor_params:
+            return True
+
+        return False
+
+    def _is_vector_tensor(arg):
+        if arg.spelling in vector_tensor_params:
+            return True
+
+        return "std::vector" in arg.type.spelling and (
+            "Tensor" in arg.type.spelling or "TensorView" in arg.type.spelling
+        )
+
+    def _is_vector_int64(arg):
+        return arg.spelling in vector_int64_params
+
+    def _is_tensor(arg):
+        if arg.spelling in optional_non_tensor_params:
+            return False
+
+        if arg.spelling in tensor_params:
+            return True
+
+        return "Tensor" in arg.type.spelling or "TensorView" in arg.type.spelling
+
+    def _normalized_type(arg):
+        if _is_optional_tensor(arg):
+            return "std::optional<Tensor>"
+
+        if _is_vector_tensor(arg):
+            return "std::vector<Tensor>"
+
+        if _is_vector_int64(arg):
+            return "std::vector<int64_t>"
+
+        if _is_tensor(arg):
+            return "Tensor"
+
+        return _strip_top_level_const(arg.type.spelling)
+
     def _generate_template_arguments(node):
         return ", ".join(
-            _strip_top_level_const(arg.type.spelling)
+            _normalized_type(arg)
             for arg in node.get_arguments()
             if arg.spelling != "stream"
         )
 
     def _generate_parameters(node):
         return ", ".join(
-            f"const {_strip_top_level_const(arg.type.spelling)}& {arg.spelling}"
+            f"const {_normalized_type(arg)}& {arg.spelling}"
             for arg in node.get_arguments()
             if arg.spelling != "stream"
         )
@@ -1153,13 +1267,13 @@ def _generate_operator_call_instantiation_source(devices, impl_paths, definition
 
 def _device_marker_headers(devices):
     paths = {
-        "cpu": "native/cpu/device_.h",
-        "nvidia": "native/cuda/nvidia/device_.h",
-        "cambricon": "native/cambricon/device_.h",
-        "ascend": "native/ascend/device_.h",
-        "metax": "native/cuda/metax/device_.h",
-        "moore": "native/cuda/moore/device_.h",
-        "iluvatar": "native/cuda/iluvatar/device_.h",
+        "cpu": "infini/rt/cpu/device_.h",
+        "nvidia": "infini/rt/nvidia/device_.h",
+        "cambricon": "infini/rt/cambricon/device_.h",
+        "ascend": "infini/rt/ascend/device_.h",
+        "metax": "infini/rt/metax/device_.h",
+        "moore": "infini/rt/moore/device_.h",
+        "iluvatar": "infini/rt/iluvatar/device_.h",
     }
 
     return [paths[device] for device in devices if device in paths]
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 61d3103c0..0a1870906 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,4 +1,5 @@
-add_library(infiniops SHARED)
+add_library(infiniops SHARED operator.h)
+set_target_properties(infiniops PROPERTIES LINKER_LANGUAGE CXX)
 
 function(_infini_ops_write_if_different path content)
     set(_should_write TRUE)
@@ -35,8 +36,17 @@ endfunction()
 include(GNUInstallDirs)
 
 file(GLOB BASE_SRCS CONFIGURE_DEPENDS "*.cc")
+list(FILTER BASE_SRCS EXCLUDE REGEX ".*tensor\\.cc$")
 target_sources(infiniops PRIVATE ${BASE_SRCS})
 
+target_link_libraries(infiniops PUBLIC infinirt)
+
+set(INFINI_RT_INCLUDE_FLAGS "")
+foreach(_include_dir IN LISTS INFINI_RT_INCLUDE_DIRS)
+    list(APPEND INFINI_RT_INCLUDE_FLAGS "-I${_include_dir}")
+endforeach()
+list(JOIN INFINI_RT_INCLUDE_DIRS ":" INFINI_RT_INCLUDE_DIRS_ENV)
+
 set(DEVICE_LIST "")
 
 if(WITH_CPU)
@@ -46,7 +56,7 @@ if(WITH_CPU)
     )
 
     file(GLOB_RECURSE CPU_SOURCES CONFIGURE_DEPENDS ${CPU_PATTERNS})
-    list(APPEND CORE_SOURCES ${CPU_SOURCES})
+    target_sources(infiniops PRIVATE ${CPU_SOURCES})
 
     target_compile_definitions(infiniops PUBLIC WITH_CPU=1)
 
@@ -222,6 +232,7 @@ if(WITH_CAMBRICON)
         set(MLU_COMPILE_OPTS
             -c --bang-mlu-arch=mtp_592 -O3 -fPIC -Wall -Werror -std=c++17 -pthread
             -I${CMAKE_CURRENT_SOURCE_DIR} -I${NEUWARE_HOME}/include
+            ${INFINI_RT_INCLUDE_FLAGS}
             -idirafter /usr/local/neuware/lib/clang/11.1.0/include
         )
         function(compile_mlu_file src_file)
@@ -528,6 +539,7 @@ if(WITH_TORCH)
                     -std=c++17 -fPIC -O0
                     "-I${CMAKE_CURRENT_SOURCE_DIR}"
                     "-I${PROJECT_SOURCE_DIR}/generated"
+                    ${INFINI_RT_INCLUDE_FLAGS}
                     ${_torch_include_flags}
                     ${_torch_extra_flags}
                     -c "${_src}" -o "${_obj}"
@@ -553,6 +565,7 @@ if(WITH_TORCH)
             PROPERTIES POSITION_INDEPENDENT_CODE ON)
         target_include_directories(infini_ops_torch_objs PRIVATE
             $<TARGET_PROPERTY:infiniops,INCLUDE_DIRECTORIES>
+            ${INFINI_RT_INCLUDE_DIRS}
             ${TORCH_INCLUDE_DIRS}
             ${PROJECT_SOURCE_DIR}/generated)
         target_compile_definitions(infini_ops_torch_objs PRIVATE
@@ -597,7 +610,11 @@ if(GENERATE_OPERATOR_CALL_INSTANTIATIONS OR GENERATE_PYTHON_BINDINGS)
     endif()
 
     execute_process(
-        COMMAND ${Python_EXECUTABLE} ${PROJECT_SOURCE_DIR}/scripts/generate_wrappers.py ${GENERATOR_ARGS}
+        COMMAND ${CMAKE_COMMAND} -E env
+                INFINI_RT_INCLUDE_DIRS=${INFINI_RT_INCLUDE_DIRS_ENV}
+                ${Python_EXECUTABLE}
+                ${PROJECT_SOURCE_DIR}/scripts/generate_wrappers.py
+                ${GENERATOR_ARGS}
         WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
         RESULT_VARIABLE script_result
     )
@@ -622,7 +639,8 @@ if(GENERATE_OPERATOR_CALL_INSTANTIATIONS)
             "-I${CMAKE_CURRENT_SOURCE_DIR}"
             "-I${PROJECT_SOURCE_DIR}"
             "-I${PROJECT_SOURCE_DIR}/generated"
-            "-I${PROJECT_SOURCE_DIR}/generated/include")
+            "-I${PROJECT_SOURCE_DIR}/generated/include"
+            ${INFINI_RT_INCLUDE_FLAGS})
         foreach(_dir IN LISTS TORCH_INCLUDE_DIRS CUDAToolkit_INCLUDE_DIRS)
             list(APPEND _iluvatar_call_instantiation_include_flags "-I${_dir}")
         endforeach()
@@ -796,7 +814,8 @@ if(GENERATE_PYTHON_BINDINGS)
         set(_iluvatar_dispatch_include_flags
             "-I${CMAKE_CURRENT_SOURCE_DIR}"
             "-I${PROJECT_SOURCE_DIR}"
-            "-I${PROJECT_SOURCE_DIR}/generated")
+            "-I${PROJECT_SOURCE_DIR}/generated"
+            ${INFINI_RT_INCLUDE_FLAGS})
         foreach(_dir IN LISTS TORCH_INCLUDE_DIRS CUDAToolkit_INCLUDE_DIRS)
             list(APPEND _iluvatar_dispatch_include_flags "-I${_dir}")
         endforeach()
@@ -924,7 +943,14 @@ if(GENERATE_PYTHON_BINDINGS)
     set_target_properties(infiniops PROPERTIES INSTALL_RPATH "${_INFINI_OPS_INSTALL_RPATH}")
     set_target_properties(ops PROPERTIES INSTALL_RPATH "${_INFINI_OPS_INSTALL_RPATH}")
 
-    install(TARGETS infiniops ops DESTINATION .)
+    get_target_property(_infinirt_imported infinirt IMPORTED)
+    if(_infinirt_imported)
+        install(TARGETS infiniops ops DESTINATION .)
+        get_target_property(_infinirt_imported_location infinirt IMPORTED_LOCATION)
+        install(FILES "${_infinirt_imported_location}" DESTINATION .)
+    else()
+        install(TARGETS infinirt infiniops ops DESTINATION .)
+    endif()
 
     file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/__init__.py" "")
     install(FILES "${CMAKE_CURRENT_BINARY_DIR}/__init__.py" DESTINATION .)
diff --git a/src/data_type.h b/src/data_type.h
index 75483d2b8..4c7329cdc 100644
--- a/src/data_type.h
+++ b/src/data_type.h
@@ -1,192 +1,34 @@
 #ifndef INFINI_OPS_DATA_TYPE_H_
 #define INFINI_OPS_DATA_TYPE_H_
 
-#include <cstdint>
-#include <cstring>
-#include <string>
+#include <infini/rt.h>
 
-#include "common/constexpr_map.h"
 #include "common/traits.h"
 #include "device.h"
 
 namespace infini::ops {
 
-enum class DataType : std::int8_t {
-  kInt8,
-  kInt16,
-  kInt32,
-  kInt64,
-  kUInt8,
-  kUInt16,
-  kUInt32,
-  kUInt64,
-  kFloat16,
-  kBFloat16,
-  kFloat32,
-  kFloat64
-};
+using infini::rt::DataType;
 
-constexpr ConstexprMap<DataType, std::size_t, 12> kDataTypeToSize{{{
-    {DataType::kInt8, 1},
-    {DataType::kInt16, 2},
-    {DataType::kInt32, 4},
-    {DataType::kInt64, 8},
-    {DataType::kUInt8, 1},
-    {DataType::kUInt16, 2},
-    {DataType::kUInt32, 4},
-    {DataType::kUInt64, 8},
-    {DataType::kFloat16, 2},
-    {DataType::kBFloat16, 2},
-    {DataType::kFloat32, 4},
-    {DataType::kFloat64, 8},
-}}};
+using infini::rt::BFloat16;
+using infini::rt::Float16;
 
-constexpr ConstexprMap<DataType, std::string_view, 12> kDataTypeToDesc{{{
-    {DataType::kInt8, "int8"},
-    {DataType::kInt16, "int16"},
-    {DataType::kInt32, "int32"},
-    {DataType::kInt64, "int64"},
-    {DataType::kUInt8, "uint8"},
-    {DataType::kUInt16, "uint16"},
-    {DataType::kUInt32, "uint32"},
-    {DataType::kUInt64, "uint64"},
-    {DataType::kFloat16, "float16"},
-    {DataType::kBFloat16, "bfloat16"},
-    {DataType::kFloat32, "float32"},
-    {DataType::kFloat64, "float64"},
-}}};
-
-constexpr ConstexprMap<std::string_view, DataType, 12> kStringToDataType{{{
-    {"int8", DataType::kInt8},
-    {"int16", DataType::kInt16},
-    {"int32", DataType::kInt32},
-    {"int64", DataType::kInt64},
-    {"uint8", DataType::kUInt8},
-    {"uint16", DataType::kUInt16},
-    {"uint32", DataType::kUInt32},
-    {"uint64", DataType::kUInt64},
-    {"float16", DataType::kFloat16},
-    {"bfloat16", DataType::kBFloat16},
-    {"float32", DataType::kFloat32},
-    {"float64", DataType::kFloat64},
-}}};
-
-struct Float16 {
-  std::uint16_t bits;
-
-  static inline Float16 FromFloat(float val) {
-    std::uint32_t f32;
-    std::memcpy(&f32, &val, sizeof(f32));
-    std::uint16_t sign = (f32 >> 16) & 0x8000;
-    std::int32_t exponent = ((f32 >> 23) & 0xFF) - 127;
-    std::uint32_t mantissa = f32 & 0x7FFFFF;
-
-    if (exponent >= 16) {
-      // NaN
-      if (exponent == 128 && mantissa != 0) {
-        return {static_cast<std::uint16_t>(sign | 0x7E00)};
-      }
-      // Inf
-      return {static_cast<std::uint16_t>(sign | 0x7C00)};
-    } else if (exponent >= -14) {
-      return {static_cast<std::uint16_t>(sign | ((exponent + 15) << 10) |
-                                         (mantissa >> 13))};
-    } else if (exponent >= -24) {
-      mantissa |= 0x800000;
-      mantissa >>= (-14 - exponent);
-      return {static_cast<std::uint16_t>(sign | (mantissa >> 13))};
-    }
-    // Too small for subnormal: return signed zero.
-    return {sign};
-  }
-
-  inline float ToFloat() const {
-    std::uint32_t sign = (bits & 0x8000) << 16;
-    std::int32_t exponent = (bits >> 10) & 0x1F;
-    std::uint32_t mantissa = bits & 0x3FF;
-    std::uint32_t f32_bits;
-
-    if (exponent == 31) {
-      f32_bits = sign | 0x7F800000 | (mantissa << 13);
-    } else if (exponent == 0) {
-      if (mantissa == 0) {
-        f32_bits = sign;
-      } else {
-        exponent = -14;
-        while ((mantissa & 0x400) == 0) {
-          mantissa <<= 1;
-          exponent--;
-        }
-        mantissa &= 0x3FF;
-        f32_bits = sign | ((exponent + 127) << 23) | (mantissa << 13);
-      }
-    } else {
-      f32_bits = sign | ((exponent + 127 - 15) << 23) | (mantissa << 13);
-    }
-
-    float result;
-    std::memcpy(&result, &f32_bits, sizeof(result));
-    return result;
-  }
-};
-
-struct BFloat16 {
-  std::uint16_t bits;
-
-  static inline BFloat16 FromFloat(float val) {
-    std::uint32_t bits32;
-    std::memcpy(&bits32, &val, sizeof(bits32));
-
-    const std::uint32_t rounding_bias = 0x00007FFF + ((bits32 >> 16) & 1);
-    std::uint16_t bf16_bits =
-        static_cast<std::uint16_t>((bits32 + rounding_bias) >> 16);
-    return {bf16_bits};
-  }
-
-  inline float ToFloat() const {
-    std::uint32_t bits32 = static_cast<std::uint32_t>(bits) << 16;
-    float result;
-    std::memcpy(&result, &bits32, sizeof(result));
-    return result;
-  }
-};
+using infini::rt::kDataTypeToDesc;
+using infini::rt::kDataTypeToSize;
+using infini::rt::kStringToDataType;
 
 template <Device::Type dev, DataType dtype>
-struct TypeMap;
+using TypeMap = infini::rt::TypeMap<dev, dtype>;
 
 template <Device::Type dev, DataType dtype>
-using TypeMapType = typename TypeMap<dev, dtype>::type;
-
-#define DEFINE_DATA_TYPE_MAPPING(ENUM_VALUE, CPP_TYPE) \
-  template <Device::Type dev>                          \
-  struct TypeMap<dev, DataType::ENUM_VALUE> {          \
-    using type = CPP_TYPE;                             \
-  };
-
-DEFINE_DATA_TYPE_MAPPING(kUInt8, std::uint8_t)
-DEFINE_DATA_TYPE_MAPPING(kInt8, std::int8_t)
-DEFINE_DATA_TYPE_MAPPING(kUInt16, std::uint16_t)
-DEFINE_DATA_TYPE_MAPPING(kInt16, std::int16_t)
-DEFINE_DATA_TYPE_MAPPING(kUInt32, std::uint32_t)
-DEFINE_DATA_TYPE_MAPPING(kInt32, std::int32_t)
-DEFINE_DATA_TYPE_MAPPING(kUInt64, std::uint64_t)
-DEFINE_DATA_TYPE_MAPPING(kInt64, std::int64_t)
-DEFINE_DATA_TYPE_MAPPING(kFloat32, float)
-DEFINE_DATA_TYPE_MAPPING(kFloat64, double)
-#undef DEFINE_DATA_TYPE_MAPPING
+using TypeMapType = infini::rt::TypeMapType<dev, dtype>;
 
-// Checks whether a C++ type is the bfloat16 or float16 type for the given
-// device. Full specializations for each device's float16/bfloat16 types are
-// provided in the corresponding platform-specific device type headers.
 template <Device::Type dev, typename T>
-inline constexpr bool IsBFloat16 =
-    std::is_same_v<T, TypeMapType<dev, DataType::kBFloat16>>;
+inline constexpr bool IsBFloat16 = infini::rt::IsBFloat16<dev, T>;
 
 template <Device::Type dev, typename T>
-inline constexpr bool IsFP16 =
-    std::is_same_v<T, TypeMapType<dev, DataType::kFloat16>>;
+inline constexpr bool IsFP16 = infini::rt::IsFP16<dev, T>;
 
-// Defines the common categories of data types using List.
 using FloatTypes = List<DataType::kFloat32, DataType::kFloat64>;
 using ReducedFloatTypes = List<DataType::kFloat16, DataType::kBFloat16>;
 using IntTypes =
diff --git a/src/device.h b/src/device.h
index 688cd0dc2..41a3af3b1 100644
--- a/src/device.h
+++ b/src/device.h
@@ -1,110 +1,23 @@
 #ifndef INFINI_OPS_DEVICE_H_
 #define INFINI_OPS_DEVICE_H_
 
-#include <string>
-#include <string_view>
+#include <infini/rt.h>
 
-#include "common/constexpr_map.h"
 #include "common/traits.h"
-#include "hash.h"
 
 namespace infini::ops {
 
-class Device {
- public:
-  enum class Type {
-    kCpu = 0,
-    kNvidia = 1,
-    kCambricon = 2,
-    kAscend = 3,
-    kMetax = 4,
-    kMoore = 5,
-    kIluvatar = 6,
-    kKunlun = 7,
-    kHygon = 8,
-    kQy = 9,
-    kCount
-  };
-
-  Device() = default;
-
-  Device(const Type& type, const int& index = 0) : type_{type}, index_{index} {}
-
-  static const Type TypeFromString(const std::string& name) {
-    return kDescToDevice.at(name);
-  }
-
-  static const std::string_view StringFromType(const Type& type) {
-    return kDeviceToDesc.at(type);
-  }
-
-  const Type& type() const { return type_; }
-
-  const int& index() const { return index_; }
-
-  std::string ToString() const {
-    return std::string{StringFromType(type_)} + ":" + std::to_string(index_);
-  }
-
-  bool operator==(const Device& other) const {
-    return type_ == other.type_ && index_ == other.index_;
-  }
-
-  bool operator!=(const Device& other) const { return !(*this == other); }
+using Device = infini::rt::Device;
 
- private:
-  Type type_{Type::kCpu};
+template <Device::Type device_type>
+using DeviceEnabled = infini::rt::DeviceEnabled<device_type>;
 
-  static constexpr ConstexprMap<Device::Type, std::string_view,
-                                static_cast<std::size_t>(Device::Type::kCount)>
-      kDeviceToDesc{{{
-          {Type::kCpu, "cpu"},
-          {Type::kNvidia, "nvidia"},
-          {Type::kCambricon, "cambricon"},
-          {Type::kAscend, "ascend"},
-          {Type::kMetax, "metax"},
-          {Type::kMoore, "moore"},
-          {Type::kIluvatar, "iluvatar"},
-          {Type::kKunlun, "kunlun"},
-          {Type::kHygon, "hygon"},
-          {Type::kQy, "qy"},
-      }}};
-
-  static constexpr ConstexprMap<std::string_view, Device::Type,
-                                static_cast<std::size_t>(Device::Type::kCount)>
-      kDescToDevice{{{
-          {"cpu", Type::kCpu},
-          {"nvidia", Type::kNvidia},
-          {"cambricon", Type::kCambricon},
-          {"ascend", Type::kAscend},
-          {"metax", Type::kMetax},
-          {"moore", Type::kMoore},
-          {"iluvatar", Type::kIluvatar},
-          {"kunlun", Type::kKunlun},
-          {"hygon", Type::kHygon},
-          {"qy", Type::kQy},
-      }}};
-
-  int index_{0};
-};
-
-// Primary template: Devices are disabled by default. Platform-specific
-// headers (e.g. `cpu/device_.h`) specialize this to `std::true_type`.
-template <Device::Type>
-struct DeviceEnabled : std::false_type {};
-
-// Defines the common categories of devices using List.
 using AllDeviceTypes =
     List<Device::Type::kCpu, Device::Type::kNvidia, Device::Type::kCambricon,
          Device::Type::kAscend, Device::Type::kMetax, Device::Type::kMoore,
          Device::Type::kIluvatar, Device::Type::kKunlun, Device::Type::kHygon,
          Device::Type::kQy>;
 
-// Deferred computation of active devices. The `Filter` and `FilterList`
-// evaluation are nested inside a class template so that `DeviceEnabled`
-// specializations from platform `device_.h` headers are visible at
-// instantiation time. Use with a dependent type parameter
-// (e.g. `ActiveDevices<Key>`) to ensure deferred instantiation.
 template <typename>
 struct ActiveDevicesImpl {
   struct Filter {
@@ -121,17 +34,4 @@ using ActiveDevices = typename ActiveDevicesImpl<T>::type;
 
 }  // namespace infini::ops
 
-template <>
-struct std::hash<infini::ops::Device> {
-  std::size_t operator()(const infini::ops::Device& device) const {
-    std::size_t seed{0};
-
-    HashCombine(seed, device.type());
-
-    HashCombine(seed, device.index());
-
-    return seed;
-  }
-};
-
 #endif
diff --git a/src/native/ascend/device_.h b/src/native/ascend/device_.h
index 1b246ad38..8d69ab95e 100644
--- a/src/native/ascend/device_.h
+++ b/src/native/ascend/device_.h
@@ -1,13 +1,6 @@
 #ifndef INFINI_OPS_ASCEND_DEVICE__H_
 #define INFINI_OPS_ASCEND_DEVICE__H_
 
-#include "device.h"
-
-namespace infini::ops {
-
-template <>
-struct DeviceEnabled<Device::Type::kAscend> : std::true_type {};
-
-}  // namespace infini::ops
+#include <infini/rt/ascend/device_.h>
 
 #endif
diff --git a/src/native/ascend/runtime_.h b/src/native/ascend/runtime_.h
index 2b9e14136..ed21ad687 100644
--- a/src/native/ascend/runtime_.h
+++ b/src/native/ascend/runtime_.h
@@ -1,44 +1,8 @@
 #ifndef INFINI_OPS_ASCEND_RUNTIME__H_
 #define INFINI_OPS_ASCEND_RUNTIME__H_
 
-// clang-format off
-#include "acl/acl.h"
-// clang-format on
+#include <infini/rt/ascend/runtime_.h>
 
-#include "native/ascend/device_.h"
 #include "runtime.h"
 
-namespace infini::ops {
-
-template <>
-struct Runtime<Device::Type::kAscend>
-    : DeviceRuntime<Runtime<Device::Type::kAscend>> {
-  using Stream = aclrtStream;
-
-  static constexpr Device::Type kDeviceType = Device::Type::kAscend;
-
-  static constexpr auto Malloc = [](void** ptr, size_t size) {
-    return aclrtMalloc(ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
-  };
-
-  static constexpr auto Free = aclrtFree;
-
-  static constexpr auto Memcpy = [](void* dst, const void* src, size_t count,
-                                    aclrtMemcpyKind kind) {
-    return aclrtMemcpy(dst, count, src, count, kind);
-  };
-
-  static constexpr auto MemcpyHostToDevice = ACL_MEMCPY_HOST_TO_DEVICE;
-
-  static constexpr auto MemcpyDeviceToHost = ACL_MEMCPY_DEVICE_TO_HOST;
-
-  static constexpr auto Memset = [](void* ptr, int value, size_t count) {
-    return aclrtMemset(ptr, count, value, count);
-  };
-};
-
-static_assert(Runtime<Device::Type::kAscend>::Validate());
-
-}  // namespace infini::ops
-
 #endif
diff --git a/src/native/cambricon/data_type_.h b/src/native/cambricon/data_type_.h
index f4ca82da8..805b311ed 100644
--- a/src/native/cambricon/data_type_.h
+++ b/src/native/cambricon/data_type_.h
@@ -1,23 +1,6 @@
 #ifndef INFINI_OPS_CAMBRICON_DATA_TYPE__H_
 #define INFINI_OPS_CAMBRICON_DATA_TYPE__H_
 
-#include "bang_bf16.h"
-#include "bang_fp16.h"
-#include "data_type.h"
-#include "native/cambricon/device_.h"
-
-namespace infini::ops {
-
-template <>
-struct TypeMap<Device::Type::kCambricon, DataType::kFloat16> {
-  using type = __half;
-};
-
-template <>
-struct TypeMap<Device::Type::kCambricon, DataType::kBFloat16> {
-  using type = __bang_bfloat16;
-};
-
-}  // namespace infini::ops
+#include <infini/rt/cambricon/data_type_.h>
 
 #endif
diff --git a/src/native/cambricon/device_.h b/src/native/cambricon/device_.h
index f168e4f25..d59054af8 100644
--- a/src/native/cambricon/device_.h
+++ b/src/native/cambricon/device_.h
@@ -1,13 +1,6 @@
 #ifndef INFINI_OPS_CAMBRICON_DEVICE__H_
 #define INFINI_OPS_CAMBRICON_DEVICE__H_
 
-#include "device.h"
-
-namespace infini::ops {
-
-template <>
-struct DeviceEnabled<Device::Type::kCambricon> : std::true_type {};
-
-}  // namespace infini::ops
+#include <infini/rt/cambricon/device_.h>
 
 #endif
diff --git a/src/native/cambricon/runtime_.h b/src/native/cambricon/runtime_.h
index 7ff30fe9c..1ca395d09 100644
--- a/src/native/cambricon/runtime_.h
+++ b/src/native/cambricon/runtime_.h
@@ -1,35 +1,8 @@
 #ifndef INFINI_OPS_CAMBRICON_RUNTIME_H_
 #define INFINI_OPS_CAMBRICON_RUNTIME_H_
 
-#include <cnrt.h>
+#include <infini/rt/cambricon/runtime_.h>
 
-#include "native/cambricon/device_.h"
 #include "runtime.h"
 
-namespace infini::ops {
-
-template <>
-struct Runtime<Device::Type::kCambricon>
-    : DeviceRuntime<Runtime<Device::Type::kCambricon>> {
-  using Stream = cnrtQueue_t;
-
-  static constexpr Device::Type kDeviceType = Device::Type::kCambricon;
-
-  static constexpr auto Malloc = cnrtMalloc;
-
-  static constexpr auto Free = cnrtFree;
-
-  static constexpr auto Memcpy = cnrtMemcpy;
-
-  static constexpr auto MemcpyHostToDevice = cnrtMemcpyHostToDev;
-
-  static constexpr auto MemcpyDeviceToHost = cnrtMemcpyDevToHost;
-
-  static constexpr auto Memset = cnrtMemset;
-};
-
-static_assert(Runtime<Device::Type::kCambricon>::Validate());
-
-}  // namespace infini::ops
-
 #endif
diff --git a/src/native/cpu/caster_.h b/src/native/cpu/caster_.h
index 7da1bf365..3766191ed 100644
--- a/src/native/cpu/caster_.h
+++ b/src/native/cpu/caster_.h
@@ -1,10 +1,11 @@
 #ifndef INFINI_OPS_COMMON_CPU_CASTER_H_
 #define INFINI_OPS_COMMON_CPU_CASTER_H_
 
+#include <infini/rt/cpu/data_type_.h>
+
 #include <type_traits>
 
 #include "caster.h"
-#include "native/cpu/data_type_.h"
 
 namespace infini::ops {
 
diff --git a/src/native/cpu/data_type_.h b/src/native/cpu/data_type_.h
index 36231db51..815de89a4 100644
--- a/src/native/cpu/data_type_.h
+++ b/src/native/cpu/data_type_.h
@@ -1,21 +1,6 @@
 #ifndef INFINI_OPS_CPU_DATA_TYPE__H_
 #define INFINI_OPS_CPU_DATA_TYPE__H_
 
-#include "data_type.h"
-#include "native/cpu/device_.h"
-
-namespace infini::ops {
-
-template <>
-struct TypeMap<Device::Type::kCpu, DataType::kFloat16> {
-  using type = Float16;
-};
-
-template <>
-struct TypeMap<Device::Type::kCpu, DataType::kBFloat16> {
-  using type = BFloat16;
-};
-
-}  // namespace infini::ops
+#include <infini/rt/cpu/data_type_.h>
 
 #endif
diff --git a/src/native/cpu/device_.h b/src/native/cpu/device_.h
index e5e7d85a3..534e4dcd2 100644
--- a/src/native/cpu/device_.h
+++ b/src/native/cpu/device_.h
@@ -1,13 +1,6 @@
 #ifndef INFINI_OPS_CPU_DEVICE__H_
 #define INFINI_OPS_CPU_DEVICE__H_
 
-#include "device.h"
-
-namespace infini::ops {
-
-template <>
-struct DeviceEnabled<Device::Type::kCpu> : std::true_type {};
-
-}  // namespace infini::ops
+#include <infini/rt/cpu/device_.h>
 
 #endif
diff --git a/src/native/cpu/runtime_.h b/src/native/cpu/runtime_.h
index cb6176ba1..4cfb213ae 100644
--- a/src/native/cpu/runtime_.h
+++ b/src/native/cpu/runtime_.h
@@ -1,34 +1,8 @@
 #ifndef INFINI_OPS_CPU_RUNTIME_H_
 #define INFINI_OPS_CPU_RUNTIME_H_
 
-#include <cstdlib>
-#include <cstring>
+#include <infini/rt/cpu/runtime_.h>
 
 #include "runtime.h"
 
-namespace infini::ops {
-
-template <>
-struct Runtime<Device::Type::kCpu> : RuntimeBase<Runtime<Device::Type::kCpu>> {
-  static constexpr Device::Type kDeviceType = Device::Type::kCpu;
-
-  static void Malloc(void** ptr, std::size_t size) { *ptr = std::malloc(size); }
-
-  static void Free(void* ptr) { std::free(ptr); }
-
-  static void Memcpy(void* dst, const void* src, std::size_t size, int) {
-    std::memcpy(dst, src, size);
-  }
-
-  static constexpr auto Memset = std::memset;
-
-  static constexpr int MemcpyHostToDevice = 0;
-
-  static constexpr int MemcpyDeviceToHost = 1;
-};
-
-static_assert(Runtime<Device::Type::kCpu>::Validate());
-
-}  // namespace infini::ops
-
 #endif
diff --git a/src/native/cuda/iluvatar/data_type_.h b/src/native/cuda/iluvatar/data_type_.h
index ac997f992..b9e3ed8a3 100644
--- a/src/native/cuda/iluvatar/data_type_.h
+++ b/src/native/cuda/iluvatar/data_type_.h
@@ -1,29 +1,12 @@
 #ifndef INFINI_OPS_ILUVATAR_DATA_TYPE__H_
 #define INFINI_OPS_ILUVATAR_DATA_TYPE__H_
 
-// clang-format off
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-// clang-format on
-
-#include "data_type.h"
-#include "native/cuda/iluvatar/device_.h"
+#include <infini/rt/iluvatar/data_type_.h>
 
 namespace infini::ops {
 
-using cuda_bfloat16 = nv_bfloat16;
-
-using cuda_bfloat162 = nv_bfloat162;
-
-template <>
-struct TypeMap<Device::Type::kIluvatar, DataType::kFloat16> {
-  using type = half;
-};
-
-template <>
-struct TypeMap<Device::Type::kIluvatar, DataType::kBFloat16> {
-  using type = __nv_bfloat16;
-};
+using infini::rt::cuda_bfloat16;
+using infini::rt::cuda_bfloat162;
 
 }  // namespace infini::ops
 
diff --git a/src/native/cuda/iluvatar/device_.h b/src/native/cuda/iluvatar/device_.h
index 4773201da..96743518f 100644
--- a/src/native/cuda/iluvatar/device_.h
+++ b/src/native/cuda/iluvatar/device_.h
@@ -1,13 +1,6 @@
 #ifndef INFINI_OPS_ILUVATAR_DEVICE__H_
 #define INFINI_OPS_ILUVATAR_DEVICE__H_
 
-#include "device.h"
-
-namespace infini::ops {
-
-template <>
-struct DeviceEnabled<Device::Type::kIluvatar> : std::true_type {};
-
-}  // namespace infini::ops
+#include <infini/rt/iluvatar/device_.h>
 
 #endif
diff --git a/src/native/cuda/iluvatar/runtime_.h b/src/native/cuda/iluvatar/runtime_.h
index 1e41b68e9..5983e5bbb 100644
--- a/src/native/cuda/iluvatar/runtime_.h
+++ b/src/native/cuda/iluvatar/runtime_.h
@@ -1,42 +1,9 @@
 #ifndef INFINI_OPS_ILUVATAR_RUNTIME_H_
 #define INFINI_OPS_ILUVATAR_RUNTIME_H_
 
-#include <utility>
+#include <infini/rt/iluvatar/runtime_.h>
 
-// clang-format off
-#include <cuda_runtime.h>
-// clang-format on
-
-#include "native/cuda/iluvatar/device_.h"
 #include "native/cuda/iluvatar/runtime_utils.h"
-#include "native/cuda/runtime_.h"
-
-namespace infini::ops {
-
-template <>
-struct Runtime<Device::Type::kIluvatar>
-    : CudaRuntime<Runtime<Device::Type::kIluvatar>> {
-  using Stream = cudaStream_t;
-
-  static constexpr Device::Type kDeviceType = Device::Type::kIluvatar;
-
-  static constexpr auto Malloc = [](auto&&... args) {
-    return cudaMalloc(std::forward<decltype(args)>(args)...);
-  };
-
-  static constexpr auto Memcpy = cudaMemcpy;
-
-  static constexpr auto Free = cudaFree;
-
-  static constexpr auto MemcpyHostToDevice = cudaMemcpyHostToDevice;
-
-  static constexpr auto MemcpyDeviceToHost = cudaMemcpyDeviceToHost;
-
-  static constexpr auto Memset = cudaMemset;
-};
-
-static_assert(Runtime<Device::Type::kIluvatar>::Validate());
-
-}  // namespace infini::ops
+#include "runtime.h"
 
 #endif
diff --git a/src/native/cuda/metax/data_type_.h b/src/native/cuda/metax/data_type_.h
index 73e498774..003dd48c0 100644
--- a/src/native/cuda/metax/data_type_.h
+++ b/src/native/cuda/metax/data_type_.h
@@ -1,28 +1,12 @@
 #ifndef INFINI_OPS_METAX_DATA_TYPE__H_
 #define INFINI_OPS_METAX_DATA_TYPE__H_
 
-#include <common/maca_bfloat16.h>
-#include <common/maca_fp16.h>
-#include <mcr/mc_runtime.h>
-
-#include "data_type.h"
-#include "native/cuda/metax/device_.h"
+#include <infini/rt/metax/data_type_.h>
 
 namespace infini::ops {
 
-using cuda_bfloat16 = maca_bfloat16;
-
-using cuda_bfloat162 = maca_bfloat162;
-
-template <>
-struct TypeMap<Device::Type::kMetax, DataType::kFloat16> {
-  using type = __half;
-};
-
-template <>
-struct TypeMap<Device::Type::kMetax, DataType::kBFloat16> {
-  using type = __maca_bfloat16;
-};
+using infini::rt::cuda_bfloat16;
+using infini::rt::cuda_bfloat162;
 
 }  // namespace infini::ops
 
diff --git a/src/native/cuda/metax/device_.h b/src/native/cuda/metax/device_.h
index 6e7c677d7..7aa554ea2 100644
--- a/src/native/cuda/metax/device_.h
+++ b/src/native/cuda/metax/device_.h
@@ -1,13 +1,6 @@
 #ifndef INFINI_OPS_METAX_DEVICE__H_
 #define INFINI_OPS_METAX_DEVICE__H_
 
-#include "device.h"
-
-namespace infini::ops {
-
-template <>
-struct DeviceEnabled<Device::Type::kMetax> : std::true_type {};
-
-}  // namespace infini::ops
+#include <infini/rt/metax/device_.h>
 
 #endif
diff --git a/src/native/cuda/metax/runtime_.h b/src/native/cuda/metax/runtime_.h
index 6563b6b41..01e9ae74e 100644
--- a/src/native/cuda/metax/runtime_.h
+++ b/src/native/cuda/metax/runtime_.h
@@ -1,36 +1,9 @@
 #ifndef INFINI_OPS_METAX_RUNTIME_H_
 #define INFINI_OPS_METAX_RUNTIME_H_
 
-#include <mcr/mc_runtime.h>
+#include <infini/rt/metax/runtime_.h>
 
-#include "native/cuda/metax/device_.h"
 #include "native/cuda/metax/runtime_utils.h"
-#include "native/cuda/runtime_.h"
-
-namespace infini::ops {
-
-template <>
-struct Runtime<Device::Type::kMetax>
-    : CudaRuntime<Runtime<Device::Type::kMetax>> {
-  using Stream = mcStream_t;
-
-  static constexpr Device::Type kDeviceType = Device::Type::kMetax;
-
-  static constexpr auto Malloc = mcMalloc;
-
-  static constexpr auto Memcpy = mcMemcpy;
-
-  static constexpr auto Free = mcFree;
-
-  static constexpr auto MemcpyHostToDevice = mcMemcpyHostToDevice;
-
-  static constexpr auto MemcpyDeviceToHost = mcMemcpyDeviceToHost;
-
-  static constexpr auto Memset = mcMemset;
-};
-
-static_assert(Runtime<Device::Type::kMetax>::Validate());
-
-}  // namespace infini::ops
+#include "runtime.h"
 
 #endif
diff --git a/src/native/cuda/moore/data_type_.h b/src/native/cuda/moore/data_type_.h
index c97cb33dd..7bc17d047 100644
--- a/src/native/cuda/moore/data_type_.h
+++ b/src/native/cuda/moore/data_type_.h
@@ -1,27 +1,12 @@
 #ifndef INFINI_OPS_MOORE_DATA_TYPE__H_
 #define INFINI_OPS_MOORE_DATA_TYPE__H_
 
-#include <musa_bf16.h>
-#include <musa_fp16.h>
-
-#include "data_type.h"
-#include "native/cuda/moore/device_.h"
+#include <infini/rt/moore/data_type_.h>
 
 namespace infini::ops {
 
-using cuda_bfloat16 = __mt_bfloat16;
-
-using cuda_bfloat162 = __mt_bfloat162;
-
-template <>
-struct TypeMap<Device::Type::kMoore, DataType::kFloat16> {
-  using type = half;
-};
-
-template <>
-struct TypeMap<Device::Type::kMoore, DataType::kBFloat16> {
-  using type = __mt_bfloat16;
-};
+using infini::rt::cuda_bfloat16;
+using infini::rt::cuda_bfloat162;
 
 }  // namespace infini::ops
 
diff --git a/src/native/cuda/moore/device_.h b/src/native/cuda/moore/device_.h
index 472b9acf0..2b23db2dd 100644
--- a/src/native/cuda/moore/device_.h
+++ b/src/native/cuda/moore/device_.h
@@ -1,13 +1,6 @@
 #ifndef INFINI_OPS_MOORE_DEVICE__H_
 #define INFINI_OPS_MOORE_DEVICE__H_
 
-#include "device.h"
-
-namespace infini::ops {
-
-template <>
-struct DeviceEnabled<Device::Type::kMoore> : std::true_type {};
-
-}  // namespace infini::ops
+#include <infini/rt/moore/device_.h>
 
 #endif
diff --git a/src/native/cuda/moore/runtime_.h b/src/native/cuda/moore/runtime_.h
index bc519b41e..14f3411b1 100644
--- a/src/native/cuda/moore/runtime_.h
+++ b/src/native/cuda/moore/runtime_.h
@@ -1,44 +1,9 @@
 #ifndef INFINI_OPS_MOORE_RUNTIME_H_
 #define INFINI_OPS_MOORE_RUNTIME_H_
 
-#include <musa_runtime.h>
+#include <infini/rt/moore/runtime_.h>
 
-#include <utility>
-
-#include "native/cuda/moore/device_.h"
 #include "native/cuda/moore/runtime_utils.h"
-#include "native/cuda/runtime_.h"
-
-namespace infini::ops {
-
-template <>
-struct Runtime<Device::Type::kMoore>
-    : CudaRuntime<Runtime<Device::Type::kMoore>> {
-  using Stream = musaStream_t;
-
-  static constexpr Device::Type kDeviceType = Device::Type::kMoore;
-
-  static constexpr auto Malloc = [](auto&&... args) {
-    return musaMalloc(std::forward<decltype(args)>(args)...);
-  };
-
-  static constexpr auto Memcpy = [](auto&&... args) {
-    return musaMemcpy(std::forward<decltype(args)>(args)...);
-  };
-
-  static constexpr auto Free = [](auto&&... args) {
-    return musaFree(std::forward<decltype(args)>(args)...);
-  };
-
-  static constexpr auto MemcpyHostToDevice = musaMemcpyHostToDevice;
-
-  static constexpr auto MemcpyDeviceToHost = musaMemcpyDeviceToHost;
-
-  static constexpr auto Memset = musaMemset;
-};
-
-static_assert(Runtime<Device::Type::kMoore>::Validate());
-
-}  // namespace infini::ops
+#include "runtime.h"
 
 #endif
diff --git a/src/native/cuda/nvidia/data_type_.h b/src/native/cuda/nvidia/data_type_.h
index 1266c0c8a..7f1fdc9e8 100644
--- a/src/native/cuda/nvidia/data_type_.h
+++ b/src/native/cuda/nvidia/data_type_.h
@@ -1,29 +1,12 @@
 #ifndef INFINI_OPS_NVIDIA_DATA_TYPE__H_
 #define INFINI_OPS_NVIDIA_DATA_TYPE__H_
 
-// clang-format off
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-// clang-format on
-
-#include "data_type.h"
-#include "native/cuda/nvidia/device_.h"
+#include <infini/rt/nvidia/data_type_.h>
 
 namespace infini::ops {
 
-using cuda_bfloat16 = nv_bfloat16;
-
-using cuda_bfloat162 = nv_bfloat162;
-
-template <>
-struct TypeMap<Device::Type::kNvidia, DataType::kFloat16> {
-  using type = half;
-};
-
-template <>
-struct TypeMap<Device::Type::kNvidia, DataType::kBFloat16> {
-  using type = __nv_bfloat16;
-};
+using infini::rt::cuda_bfloat16;
+using infini::rt::cuda_bfloat162;
 
 }  // namespace infini::ops
 
diff --git a/src/native/cuda/nvidia/device_.h b/src/native/cuda/nvidia/device_.h
index 52cbb2b84..b46caf035 100644
--- a/src/native/cuda/nvidia/device_.h
+++ b/src/native/cuda/nvidia/device_.h
@@ -1,13 +1,6 @@
 #ifndef INFINI_OPS_NVIDIA_DEVICE__H_
 #define INFINI_OPS_NVIDIA_DEVICE__H_
 
-#include "device.h"
-
-namespace infini::ops {
-
-template <>
-struct DeviceEnabled<Device::Type::kNvidia> : std::true_type {};
-
-}  // namespace infini::ops
+#include <infini/rt/nvidia/device_.h>
 
 #endif
diff --git a/src/native/cuda/nvidia/runtime_.h b/src/native/cuda/nvidia/runtime_.h
index 326ecdb24..d3474881d 100644
--- a/src/native/cuda/nvidia/runtime_.h
+++ b/src/native/cuda/nvidia/runtime_.h
@@ -1,42 +1,9 @@
 #ifndef INFINI_OPS_NVIDIA_RUNTIME_H_
 #define INFINI_OPS_NVIDIA_RUNTIME_H_
 
-#include <utility>
+#include <infini/rt/nvidia/runtime_.h>
 
-// clang-format off
-#include <cuda_runtime.h>
-// clang-format on
-
-#include "native/cuda/nvidia/device_.h"
 #include "native/cuda/nvidia/runtime_utils.h"
-#include "native/cuda/runtime_.h"
-
-namespace infini::ops {
-
-template <>
-struct Runtime<Device::Type::kNvidia>
-    : CudaRuntime<Runtime<Device::Type::kNvidia>> {
-  using Stream = cudaStream_t;
-
-  static constexpr Device::Type kDeviceType = Device::Type::kNvidia;
-
-  static constexpr auto Malloc = [](auto&&... args) {
-    return cudaMalloc(std::forward<decltype(args)>(args)...);
-  };
-
-  static constexpr auto Memcpy = cudaMemcpy;
-
-  static constexpr auto Free = cudaFree;
-
-  static constexpr auto MemcpyHostToDevice = cudaMemcpyHostToDevice;
-
-  static constexpr auto MemcpyDeviceToHost = cudaMemcpyDeviceToHost;
-
-  static constexpr auto Memset = cudaMemset;
-};
-
-static_assert(Runtime<Device::Type::kNvidia>::Validate());
-
-}  // namespace infini::ops
+#include "runtime.h"
 
 #endif
diff --git a/src/runtime.h b/src/runtime.h
index 38257893c..251be195d 100644
--- a/src/runtime.h
+++ b/src/runtime.h
@@ -1,54 +1,15 @@
 #ifndef INFINI_OPS_RUNTIME_H_
 #define INFINI_OPS_RUNTIME_H_
 
-#include <type_traits>
-
-#include "device.h"
+#include <infini/rt.h>
 
 namespace infini::ops {
 
 template <Device::Type device_type>
-struct Runtime;
-
-/// ## Interface enforcement via CRTP.
-///
-/// Inherit from the appropriate base to declare which interface level a
-/// `Runtime` specialization implements. After the struct is fully defined, call
-/// `static_assert(Runtime<...>::Validate())`. The chained `Validate()` checks
-/// every required member's existence and signature at compile time, analogous
-/// to how `override` catches signature mismatches for virtual functions.
-///
-/// - `RuntimeBase`: `kDeviceType` only (e.g. CPU).
-/// - `DeviceRuntime`: adds `Stream`, `Malloc`, and `Free` (e.g. Cambricon).
-
-/// Every Runtime must provide `static constexpr Device::Type kDeviceType`.
-template <typename Derived>
-struct RuntimeBase {
-  static constexpr bool Validate() {
-    static_assert(
-        std::is_same_v<std::remove_cv_t<decltype(Derived::kDeviceType)>,
-                       Device::Type>,
-        "`Runtime` must define `static constexpr Device::Type kDeviceType`.");
-    return true;
-  }
-};
+using Runtime = infini::rt::Runtime<device_type>;
 
-/// Runtimes with device memory must additionally provide `Stream`, `Malloc`,
-/// and `Free`.
 template <typename Derived>
-struct DeviceRuntime : RuntimeBase<Derived> {
-  static constexpr bool Validate() {
-    RuntimeBase<Derived>::Validate();
-    static_assert(sizeof(typename Derived::Stream) > 0,
-                  "`Runtime` must define a `Stream` type alias.");
-    static_assert(
-        std::is_invocable_v<decltype(Derived::Malloc), void**, size_t>,
-        "`Runtime::Malloc` must be callable with `(void**, size_t)`.");
-    static_assert(std::is_invocable_v<decltype(Derived::Free), void*>,
-                  "`Runtime::Free` must be callable with `(void*)`.");
-    return true;
-  }
-};
+using DeviceRuntime = infini::rt::DeviceRuntime<Derived>;
 
 }  // namespace infini::ops
 
diff --git a/src/tensor.h b/src/tensor.h
index 290e3cf96..48dc21d54 100644
--- a/src/tensor.h
+++ b/src/tensor.h
@@ -1,157 +1,12 @@
 #ifndef INFINI_OPS_TENSOR_H_
 #define INFINI_OPS_TENSOR_H_
 
-#include <cstdint>
-#include <string>
-#include <vector>
-
-#include "data_type.h"
-#include "device.h"
-#include "hash.h"
+#include <infini/rt.h>
 
 namespace infini::ops {
 
-class Tensor {
- public:
-  using Size = std::size_t;
-
-  using Stride = std::ptrdiff_t;
-
-  using Index = Stride;
-
-  using Shape = std::vector<Size>;
-
-  using Strides = std::vector<Stride>;
-
-  template <typename Shape>
-  Tensor(void* data, const Shape& shape)
-      : data_{data},
-        shape_{shape},
-        dtype_{DefaultDataType()},
-        device_{DefaultDevice()},
-        strides_{DefaultStrides(shape)} {}
-
-  template <typename Shape>
-  Tensor(void* data, const Shape& shape, const DataType& dtype)
-      : data_{data},
-        shape_{shape},
-        dtype_{dtype},
-        device_{DefaultDevice()},
-        strides_{DefaultStrides(shape)} {}
-
-  template <typename Shape>
-  Tensor(void* data, const Shape& shape, const Device& device)
-      : data_{data},
-        shape_{shape},
-        dtype_{DefaultDataType()},
-        device_{device},
-        strides_{DefaultStrides(shape)} {}
-
-  template <typename Shape>
-  Tensor(void* data, const Shape& shape, const DataType& dtype,
-         const Device& device)
-      : data_{data},
-        shape_{shape},
-        dtype_{dtype},
-        device_{device},
-        strides_{DefaultStrides(shape)} {}
-
-  template <typename Shape, typename Strides>
-  Tensor(void* data, const Shape& shape, const DataType& dtype,
-         const Device& device, const Strides& strides)
-      : data_{data},
-        shape_{shape},
-        dtype_{dtype},
-        device_{device},
-        strides_{strides} {}
-
-  Tensor(void* data, std::initializer_list<Size> shape, const DataType& dtype,
-         const Device& device, std::initializer_list<Stride> strides);
-
-  Tensor operator[](const Index& index) const;
-
-  void*& data();
-
-  const void* data() const;
-
-  const DataType& dtype() const;
-
-  const Device& device() const;
-
-  const Shape& shape() const;
-
-  const Strides& strides() const;
-
-  Size size(const Index& index) const;
-
-  Stride stride(const Index& index) const;
-
-  Size ndim() const;
-
-  Size element_size() const;
-
-  Size numel() const;
-
-  Tensor T() const;
-
-  std::string ToString() const;
-
-  bool HasBroadcastDim() const;
-
-  bool IsContiguous() const;
-
- private:
-  static const DataType DefaultDataType();
-
-  static Device DefaultDevice();
-
-  static Strides DefaultStrides(const Shape& shape);
-
-  std::string ToStringHelper() const;
-
-  bool IsMergeable(Size dim_start, Size dim_end) const;
-
-  void* data_{nullptr};
-
-  Shape shape_;
-
-  const DataType dtype_;
-
-  Device device_;
-
-  Strides strides_;
-};
+using Tensor = infini::rt::TensorView;
 
 }  // namespace infini::ops
 
-template <>
-struct std::hash<infini::ops::Tensor> {
-  std::size_t operator()(const infini::ops::Tensor& tensor) const {
-    std::size_t seed{0};
-
-    for (const auto& size : tensor.shape()) {
-      HashCombine(seed, size);
-    }
-
-    HashCombine(seed, tensor.dtype());
-
-    HashCombine(seed, tensor.device());
-
-    for (const auto& stride : tensor.strides()) {
-      HashCombine(seed, stride);
-    }
-
-    return seed;
-  }
-};
-
-template <>
-struct std::equal_to<infini::ops::Tensor> {
-  bool operator()(const infini::ops::Tensor& a,
-                  const infini::ops::Tensor& b) const {
-    return a.dtype() == b.dtype() && a.device() == b.device() &&
-           a.shape() == b.shape() && a.strides() == b.strides();
-  }
-};
-
 #endif
diff --git a/tests/test_generate_wrappers.py b/tests/test_generate_wrappers.py
index b4c97f868..931677553 100644
--- a/tests/test_generate_wrappers.py
+++ b/tests/test_generate_wrappers.py
@@ -62,16 +62,69 @@ class Clamp {
     text = "\n".join(declarations)
 
     assert (
-        "MakeClamp(const Config& config, const Tensor input, "
+        "MakeClamp(const Config& config, Tensor input, "
         "const std::optional<double> min, const std::optional<double> max, "
         "Tensor out)"
     ) in text
     assert (
-        "MakeClamp(const Config& config, const Tensor input, "
+        "MakeClamp(const Config& config, Tensor input, "
         "std::optional<Tensor> min, std::optional<Tensor> max, Tensor out)"
     ) in text
 
 
+def test_operator_call_instantiations_keep_optional_scalar_and_tensor_overloads_distinct(
+    monkeypatch, tmp_path
+):
+    module = _load_generator_module()
+    base_header = tmp_path / "clamp.h"
+    base_header.write_text(
+        """
+class Clamp {
+ public:
+  virtual void operator()(const Tensor input, const std::optional<double> min,
+                          const std::optional<double> max, Tensor out) const = 0;
+  virtual void operator()(const Tensor input, const std::optional<Tensor> min,
+                          const std::optional<Tensor> max, Tensor out) const = 0;
+};
+"""
+    )
+    monkeypatch.setattr(module, "_find_base_header", lambda op_name: base_header)
+
+    operator = module._Operator(
+        "clamp",
+        constructors=[],
+        calls=[
+            module._ParsedFunction(
+                [
+                    module._ParsedArgument("const Tensor", "input"),
+                    module._ParsedArgument("const std::optional<double>", "min"),
+                    module._ParsedArgument("const std::optional<double>", "max"),
+                    module._ParsedArgument("Tensor", "out"),
+                ]
+            ),
+            module._ParsedFunction(
+                [
+                    module._ParsedArgument("const Tensor", "input"),
+                    module._ParsedArgument("const std::optional<Tensor>", "min"),
+                    module._ParsedArgument("const std::optional<Tensor>", "max"),
+                    module._ParsedArgument("Tensor", "out"),
+                ]
+            ),
+        ],
+    )
+
+    declarations, _ = module._generate_operator_call_instantiation_entries(operator)
+
+    text = "\n".join(declarations)
+
+    assert (
+        "Call<Tensor, std::optional<double>, std::optional<double>, Tensor>"
+    ) in text
+    assert (
+        "Call<Tensor, std::optional<Tensor>, std::optional<Tensor>, Tensor>"
+    ) in text
+
+
 def test_pybind_default_implementation_uses_first_active_index(monkeypatch, tmp_path):
     module = _load_generator_module()
     base_header = tmp_path / "mul.h"