diff --git a/simpler_setup/insight_trace/__init__.py b/simpler_setup/insight_trace/__init__.py new file mode 100644 index 000000000..8c1e47a54 --- /dev/null +++ b/simpler_setup/insight_trace/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""MindStudio Insight trace workspace generation.""" diff --git a/simpler_setup/insight_trace/arg_resolver.py b/simpler_setup/insight_trace/arg_resolver.py new file mode 100644 index 000000000..665badbd3 --- /dev/null +++ b/simpler_setup/insight_trace/arg_resolver.py @@ -0,0 +1,40 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +from __future__ import annotations + +from pathlib import Path + +from .args_pkg import _load_arg_spec, load_kernel_dump_args, resolve_builtin_args, resolve_scene_test_args +from .models import KernelSpec, SceneCaseContext, TraceArg + + +def resolve_args( + context: SceneCaseContext, + kernel: KernelSpec, + arg_spec: Path | None = None, + dump_dir: Path | None = None, + dispatch_id: int | None = None, +) -> tuple[TraceArg, ...]: + if arg_spec is not None: + return _load_arg_spec(arg_spec) + if dump_dir is not None: + if dispatch_id is None: + raise ValueError("--dispatch-id is required with --dump-dir") + return load_kernel_dump_args(dump_dir, kernel.func_id, dispatch_id) + if _should_use_builtin_fallback(context): + return resolve_builtin_args(context, kernel) + try: + return resolve_scene_test_args(context) + except Exception: + return resolve_builtin_args(context, kernel) + + +def _should_use_builtin_fallback(context: SceneCaseContext) -> bool: + module_path = context.module_dir.as_posix() + return "paged_attention" in module_path or "spmd_multiblock_mix" in module_path diff --git a/simpler_setup/insight_trace/args_pkg/__init__.py b/simpler_setup/insight_trace/args_pkg/__init__.py new file mode 100644 index 000000000..07d7dc5dd --- /dev/null +++ b/simpler_setup/insight_trace/args_pkg/__init__.py @@ -0,0 +1,11 @@ +from .from_dump import load_kernel_dump_args +from .from_scene_test import resolve_scene_test_args +from .from_spec import _load_arg_spec +from .recipes import resolve_builtin_args + +__all__ = [ + "_load_arg_spec", + "load_kernel_dump_args", + "resolve_builtin_args", + "resolve_scene_test_args", +] diff --git a/simpler_setup/insight_trace/args_pkg/from_dump.py b/simpler_setup/insight_trace/args_pkg/from_dump.py new file mode 100644 index 000000000..8ed5033d3 --- /dev/null +++ b/simpler_setup/insight_trace/args_pkg/from_dump.py @@ -0,0 +1,65 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +from __future__ import annotations + +import json +from pathlib import Path + +from ..models import TraceArg, TraceScalarArg, TraceTensorArg + + +def load_kernel_dump_args(dump_dir: Path, func_id: int, dispatch_id: int) -> tuple[TraceArg, ...]: + dump_path = _kernel_dump_path(dump_dir) + raw = json.loads(dump_path.read_text()) + dispatch = None + for item in raw.get("dispatches", []): + if int(item.get("func_id", -1)) == func_id and int(item.get("dispatch_id", -1)) == dispatch_id: + dispatch = item + break + if dispatch is None: + raise ValueError(f"No kernel args dump dispatch matches func_id={func_id}, dispatch_id={dispatch_id}") + + result: list[TraceArg] = [] + for item in dispatch.get("args", []): + index = int(item["arg_index"]) + kind = item["kind"] + if kind == "tensor": + result.append( + TraceTensorArg( + index=index, + name=f"arg{index}", + dtype=item["dtype"], + shape=tuple(int(dim) for dim in item["shape"]), + ) + ) + elif kind == "scalar": + value = item["value"] + pack_mode = item.get("pack_mode", "value") + if (pack_mode == "bits" or item["dtype"] == "FLOAT32_BITS") and isinstance(value, float): + value = _f32_bits(value) + result.append(TraceScalarArg(index, f"arg{index}", item["dtype"], value, pack_mode)) + elif kind in {"local_context", "global_context"}: + continue + else: + raise ValueError(f"Unknown kernel dump arg kind: {kind}") + return tuple(sorted(result, key=lambda arg: arg.index)) + + +def _kernel_dump_path(dump_dir: Path) -> Path: + candidates = (dump_dir / "kernel_args_dump.json", dump_dir / "tensor_dump" / "kernel_args_dump.json") + for path in candidates: + if path.is_file(): + return path + raise ValueError(f"kernel_args_dump.json not found under {dump_dir}") + + +def _f32_bits(value: float) -> int: + import struct + + return struct.unpack("I", struct.pack("f", value))[0] \ No newline at end of file diff --git a/simpler_setup/insight_trace/args_pkg/from_scene_test.py b/simpler_setup/insight_trace/args_pkg/from_scene_test.py new file mode 100644 index 000000000..af6bd7c42 --- /dev/null +++ b/simpler_setup/insight_trace/args_pkg/from_scene_test.py @@ -0,0 +1,142 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +from __future__ import annotations + +import ctypes + +from simpler.task_interface import ArgDirection +from simpler_setup.scene_test import Scalar, Tensor + +from ..models import SceneCaseContext, TraceArg, TraceScalarArg, TraceTensorArg + +_TORCH_DTYPE_MAP = { + "torch.float32": "FLOAT32", + "torch.float16": "FLOAT16", + "torch.bfloat16": "BFLOAT16", + "torch.int32": "INT32", + "torch.int64": "INT64", + "torch.uint8": "UINT8", + "torch.int8": "INT8", + "torch.bool": "BOOL", +} + +_CTYPES_DTYPE_MAP = { + ctypes.c_bool: "BOOL", + ctypes.c_int8: "INT8", + ctypes.c_uint8: "UINT8", + ctypes.c_int16: "INT16", + ctypes.c_uint16: "UINT16", + ctypes.c_int32: "INT32", + ctypes.c_uint32: "UINT32", + ctypes.c_int64: "INT64", + ctypes.c_uint64: "UINT64", + ctypes.c_float: "FLOAT32_BITS", + ctypes.c_double: "FLOAT64_BITS", +} + +_ROLE_MAP = { + ArgDirection.IN: "input", + ArgDirection.OUT: "output", + ArgDirection.INOUT: "inout", +} + + +def resolve_scene_test_args(context: SceneCaseContext) -> tuple[TraceArg, ...]: + builder = context.test_class().generate_args(context.case.get("params", {})) + orch_signature = context.callable_spec.get("orchestration", {}).get("signature") + if orch_signature is None: + raise ValueError("No orchestration signature available for generic insight trace arg inference") + + result: list[TraceArg] = [] + tensor_index = 0 + arg_index = 0 + for spec in builder.specs: + if isinstance(spec, Tensor): + if tensor_index >= len(orch_signature): + raise ValueError( + f"Tensor '{spec.name}' at index {tensor_index} has no matching orchestration signature entry" + ) + result.append( + TraceTensorArg( + index=arg_index, + name=spec.name, + dtype=_tensor_dtype_name(spec.value), + shape=tuple(int(dim) for dim in spec.value.shape), + role=_ROLE_MAP.get(orch_signature[tensor_index], "input"), + ) + ) + tensor_index += 1 + arg_index += 1 + continue + + if isinstance(spec, Scalar): + result.append( + TraceScalarArg( + index=arg_index, + name=spec.name, + dtype=_scalar_dtype_name(spec.value), + value=_scalar_value(spec.value), + pack_mode=_scalar_pack_mode(spec.value), + ) + ) + arg_index += 1 + continue + + raise ValueError(f"Unsupported TaskArgsBuilder spec type: {type(spec).__name__}") + + if tensor_index != len(orch_signature): + raise ValueError( + f"Orchestration signature length {len(orch_signature)} does not match tensor count {tensor_index}" + ) + return tuple(result) + + +def _tensor_dtype_name(value) -> str: + key = str(value.dtype) + try: + return _TORCH_DTYPE_MAP[key] + except KeyError as exc: + raise ValueError(f"Unsupported tensor dtype for insight trace: {key}") from exc + + +def _scalar_dtype_name(value) -> str: + if type(value) in _CTYPES_DTYPE_MAP: + return _CTYPES_DTYPE_MAP[type(value)] + if isinstance(value, bool): + return "BOOL" + if isinstance(value, int): + return "INT64" + if isinstance(value, float): + return "FLOAT32_BITS" + raise ValueError(f"Unsupported scalar type for insight trace: {type(value).__name__}") + + +def _scalar_value(value): + if isinstance(value, ctypes._SimpleCData): + raw = value.value + else: + raw = value + if isinstance(raw, bool): + return int(raw) + if isinstance(raw, float): + return _f32_bits(raw) + return raw + + +def _scalar_pack_mode(value) -> str: + dtype = _scalar_dtype_name(value) + if dtype.endswith("_BITS"): + return "bits" + return "value" + + +def _f32_bits(value: float) -> int: + import struct + + return struct.unpack("I", struct.pack("f", value))[0] diff --git a/simpler_setup/insight_trace/args_pkg/from_spec.py b/simpler_setup/insight_trace/args_pkg/from_spec.py new file mode 100644 index 000000000..cd681943f --- /dev/null +++ b/simpler_setup/insight_trace/args_pkg/from_spec.py @@ -0,0 +1,54 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +from __future__ import annotations + +import json +from pathlib import Path + +from ..models import TraceArg, TraceScalarArg, TraceTensorArg + + +def _load_arg_spec(path: Path) -> tuple[TraceArg, ...]: + raw = json.loads(path.read_text()) + result: list[TraceArg] = [] + for item in raw.get("args", raw): + if item["kind"] == "tensor": + result.append( + TraceTensorArg( + index=int(item["index"]), + name=item["name"], + dtype=item["dtype"], + shape=tuple(int(dim) for dim in item["shape"]), + role=item.get("role", "input"), + fill=item.get("fill", "zero"), + ) + ) + elif item["kind"] == "scalar": + value = item["value"] + pack_mode = item.get("pack_mode", "value") + if (pack_mode == "bits" or item["dtype"] == "FLOAT32_BITS") and isinstance(value, float): + value = _f32_bits(value) + result.append( + TraceScalarArg( + index=int(item["index"]), + name=item["name"], + dtype=item["dtype"], + value=value, + pack_mode=pack_mode, + ) + ) + else: + raise ValueError(f"Unknown arg kind: {item['kind']}") + return tuple(sorted(result, key=lambda arg: arg.index)) + + +def _f32_bits(value: float) -> int: + import struct + + return struct.unpack("I", struct.pack("f", value))[0] \ No newline at end of file diff --git a/simpler_setup/insight_trace/args_pkg/recipes.py b/simpler_setup/insight_trace/args_pkg/recipes.py new file mode 100644 index 000000000..94de6d4a9 --- /dev/null +++ b/simpler_setup/insight_trace/args_pkg/recipes.py @@ -0,0 +1,131 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +from __future__ import annotations + +import ctypes + +from ..kernel_analyzer import read_arg_indices +from ..models import KernelShape, KernelSpec, SceneCaseContext, TraceArg, TraceScalarArg, TraceTensorArg + + +def resolve_builtin_args(context: SceneCaseContext, kernel: KernelSpec) -> tuple[TraceArg, ...]: + args = _paged_attention_recipe(context, kernel) + read_indices = read_arg_indices(kernel.source_path) + missing = sorted(index for index in read_indices if index not in {arg.index for arg in args}) + if missing: + raise ValueError(f"Argument recipe for {kernel.name} does not cover args indices: {missing}") + return args + + +def _paged_attention_recipe(context: SceneCaseContext, kernel: KernelSpec) -> tuple[TraceArg, ...]: + module_path = context.module_dir.as_posix() + if "spmd_multiblock_mix" in module_path: + total_cl = sum(int(blocks) * 3 for blocks, _ in ((2, 0), (8, 6), (12, 30), (24, 66), (48, 138))) + return ( + TraceTensorArg(0, "output", "FLOAT32", (total_cl * 16,), role="inout"), + TraceScalarArg(1, "base_cl", "UINT64", 0), + ) + if "paged_attention" not in module_path: + raise ValueError("No built-in insight trace recipe for this test module; pass --arg-spec") + params = context.case.get("params", {}) + q_tile = 16 + block_size = int(params["block_size"]) + head_dim = int(params["head_dim"]) + scale = _scalar_value(context, "scale", default=1.0) + + if kernel.shape == KernelShape.SPMD_MIX: + batch = int(params["batch"]) + num_heads = int(params["num_heads"]) + kv_head_num = int(params["kv_head_num"]) + context_len = int(params["context_len"]) + max_model_len = int(params["max_model_len"]) + max_num_blocks_per_req = max_model_len // block_size + q_tile = 16 if num_heads <= 16 else 64 + q_loop = (num_heads + q_tile - 1) // q_tile + total_logical_blocks = batch * q_loop + total_blocks = batch * ((context_len + block_size - 1) // block_size) + return ( + TraceTensorArg(0, "query", "BFLOAT16", (batch, num_heads, head_dim)), + TraceTensorArg(1, "key_cache", "BFLOAT16", (total_blocks, block_size, kv_head_num, head_dim)), + TraceTensorArg(2, "value_cache", "BFLOAT16", (total_blocks, block_size, kv_head_num, head_dim)), + TraceTensorArg(3, "block_table", "INT32", (batch, max_num_blocks_per_req)), + TraceTensorArg(4, "context_lens", "INT32", (batch,)), + TraceTensorArg(5, "out", "FLOAT32", (batch, num_heads, head_dim)), + TraceTensorArg(6, "sij_fifo", "FLOAT32", (1,)), + TraceTensorArg(7, "pij_fifo", "BFLOAT16", (1,)), + TraceTensorArg(8, "oi_fifo", "FLOAT32", (1,)), + TraceScalarArg(9, "scale_value", "FLOAT32_BITS", _f32_bits(float(scale)), "bits"), + TraceScalarArg(10, "num_heads", "UINT64", num_heads), + TraceScalarArg(11, "head_dim", "UINT64", head_dim), + TraceScalarArg(12, "block_size", "UINT64", block_size), + TraceScalarArg(13, "max_num_blocks_per_req", "UINT64", max_num_blocks_per_req), + TraceScalarArg(14, "q_loop", "UINT64", q_loop), + TraceScalarArg(15, "total_logical_blocks", "UINT64", total_logical_blocks), + TraceScalarArg(16, "q_tile", "UINT64", q_tile), + ) + + recipes: dict[str, tuple[TraceArg, ...]] = { + "QK": ( + TraceTensorArg(0, "qi", "BFLOAT16", (q_tile, head_dim)), + TraceTensorArg(1, "kj", "BFLOAT16", (block_size, head_dim)), + TraceTensorArg(2, "sij", "FLOAT32", (q_tile, block_size)), + TraceScalarArg(4, "head_dim", "UINT64", head_dim), + TraceScalarArg(5, "block_size", "UINT64", block_size), + ), + "SF": ( + TraceTensorArg(0, "sij", "FLOAT32", (q_tile, block_size)), + TraceTensorArg(1, "pij", "BFLOAT16", (q_tile, block_size)), + TraceTensorArg(2, "mij", "FLOAT32", (q_tile,)), + TraceTensorArg(3, "lij", "FLOAT32", (q_tile,)), + TraceScalarArg(4, "scale", "FLOAT32_BITS", _f32_bits(float(scale)), "bits"), + ), + "PV": ( + TraceTensorArg(0, "pij", "BFLOAT16", (q_tile, block_size)), + TraceTensorArg(1, "vj", "BFLOAT16", (block_size, head_dim)), + TraceTensorArg(2, "oi_new", "FLOAT32", (q_tile, head_dim)), + TraceScalarArg(4, "block_size", "UINT64", block_size), + TraceScalarArg(5, "head_dim", "UINT64", head_dim), + ), + "UP": ( + TraceTensorArg(0, "mij", "FLOAT32", (q_tile,)), + TraceTensorArg(1, "lij", "FLOAT32", (q_tile,)), + TraceTensorArg(2, "oi_new", "FLOAT32", (q_tile, head_dim)), + TraceTensorArg(3, "mi", "FLOAT32", (q_tile,)), + TraceTensorArg(4, "li", "FLOAT32", (q_tile,)), + TraceTensorArg(5, "oi", "FLOAT32", (q_tile, head_dim)), + TraceTensorArg(6, "dst", "FLOAT32", (q_tile, head_dim)), + TraceScalarArg(7, "is_first", "UINT64", 1), + TraceScalarArg(8, "is_last", "UINT64", 1), + TraceScalarArg(10, "head_dim", "UINT64", head_dim), + ), + } + if kernel.name not in recipes: + raise ValueError(f"No paged_attention recipe for kernel {kernel.name}") + return recipes[kernel.name] + + +def _scalar_value(context: SceneCaseContext, name: str, default): + try: + builder = context.test_class().generate_args(context.case.get("params", {})) + except Exception: # noqa: BLE001 + return default + for spec in getattr(builder, "specs", []): + if getattr(spec, "name", None) != name: + continue + value = spec.value + if isinstance(value, ctypes._SimpleCData): + return value.value + return value + return default + + +def _f32_bits(value: float) -> int: + import struct + + return struct.unpack("I", struct.pack("f", value))[0] \ No newline at end of file diff --git a/simpler_setup/insight_trace/case_loader.py b/simpler_setup/insight_trace/case_loader.py new file mode 100644 index 000000000..eb0b84a1d --- /dev/null +++ b/simpler_setup/insight_trace/case_loader.py @@ -0,0 +1,59 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +from __future__ import annotations + +import importlib.util +import inspect +import sys +from pathlib import Path +from types import ModuleType + +from .models import SceneCaseContext + + +def load_module(path: Path) -> ModuleType: + path = path.resolve() + spec = importlib.util.spec_from_file_location(path.stem, str(path)) + if spec is None or spec.loader is None: + raise ValueError(f"Cannot load test module: {path}") + module = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = module + spec.loader.exec_module(module) + return module + + +def find_scene_test_class(module: ModuleType) -> type: + candidates = [] + for obj in module.__dict__.values(): + if inspect.isclass(obj) and hasattr(obj, "CALLABLE") and hasattr(obj, "CASES"): + if getattr(obj, "_st_level", None) == 2: + candidates.append(obj) + if not candidates: + raise ValueError("No level-2 SceneTestCase class found") + if len(candidates) > 1: + names = ", ".join(cls.__name__ for cls in candidates) + raise ValueError(f"Multiple SceneTestCase classes found: {names}") + return candidates[0] + + +def load_scene_case(test_module: Path, case_name: str) -> SceneCaseContext: + module = load_module(test_module) + test_class = find_scene_test_class(module) + case = next((case for case in test_class.CASES if case.get("name") == case_name), None) + if case is None: + available = ", ".join(case.get("name", "") for case in test_class.CASES) + raise ValueError(f"Unknown case {case_name!r}; available cases: {available}") + return SceneCaseContext( + test_class=test_class, + case=case, + callable_spec=test_class.CALLABLE, + test_module=test_module.resolve(), + module_dir=test_module.resolve().parent, + runtime=getattr(test_class, "_st_runtime", ""), + ) diff --git a/simpler_setup/insight_trace/cli.py b/simpler_setup/insight_trace/cli.py new file mode 100644 index 000000000..b08146add --- /dev/null +++ b/simpler_setup/insight_trace/cli.py @@ -0,0 +1,44 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +from __future__ import annotations + +from .arg_resolver import resolve_args +from .case_loader import load_scene_case +from .cli_pkg import build_parser as _build_parser +from .cli_pkg import hw_block_num as _hw_block_num +from .cli_pkg import run_ptoas, run_simpler +from .cli_pkg import spmd_meta as _spmd_meta +from .kernel_analyzer import select_kernel, validate_single_task_kernel +from .models import TraceBackend +from .runner import run_workspace +from .workspace import create_workspace + +_run_simpler = run_simpler +_run_ptoas = run_ptoas + + +def main(argv: list[str] | None = None) -> int: + parser = _build_parser() + args = parser.parse_args(argv) + try: + if args.backend == TraceBackend.PTOAS.value: + result = run_ptoas(args) + else: + result = run_simpler(args) + except Exception as exc: # noqa: BLE001 + print(f"insight trace failed: {exc}") + return 1 + print(f"Insight trace workspace: {result.workspace_dir}") + if result.simulator_dir is not None: + print(f"MindStudio Insight input: {result.simulator_dir}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/simpler_setup/insight_trace/cli_pkg/__init__.py b/simpler_setup/insight_trace/cli_pkg/__init__.py new file mode 100644 index 000000000..da65cf2e8 --- /dev/null +++ b/simpler_setup/insight_trace/cli_pkg/__init__.py @@ -0,0 +1,18 @@ +from .config import build_trace_config, default_output_dir, hw_block_num, pto_isa_root, resolve_platform_arch, spmd_meta, with_soc_version_override +from .entry_ptoas import build_ptoas_config, run_ptoas +from .entry_simpler import run_simpler +from .parser import build_parser + +__all__ = [ + "build_parser", + "build_ptoas_config", + "build_trace_config", + "default_output_dir", + "hw_block_num", + "pto_isa_root", + "resolve_platform_arch", + "run_ptoas", + "run_simpler", + "spmd_meta", + "with_soc_version_override", +] diff --git a/simpler_setup/insight_trace/cli_pkg/config.py b/simpler_setup/insight_trace/cli_pkg/config.py new file mode 100644 index 000000000..0ff659dd9 --- /dev/null +++ b/simpler_setup/insight_trace/cli_pkg/config.py @@ -0,0 +1,137 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +from __future__ import annotations + +from datetime import datetime +from pathlib import Path + +from simpler_setup.environment import PROJECT_ROOT +from simpler_setup.pto_isa import ensure_pto_isa_root + +from ..models import ( + KernelShape, + PlatformArch, + PlatformFamily, + SPMDDispatch, + SPMDReplayMeta, + TraceBackend, + TraceConfig, +) + + +def build_trace_config(args, context, kernel, trace_args): + output_dir = args.output_dir or default_output_dir(args.case, kernel.name, datetime.now()) + + platform = resolve_platform_arch(args.platform, context) + if args.soc_version: + platform = with_soc_version_override(platform, args.soc_version) + + return TraceConfig( + backend=TraceBackend.SIMPLER, + test_module=args.test_module.resolve(), + case_name=args.case, + kernel_spec=kernel, + args=trace_args, + output_dir=output_dir, + repo_root=PROJECT_ROOT, + cann_home=args.cann_home, + pto_isa_root=pto_isa_root(args.pto_isa_root), + platform_arch=platform, + device=args.device, + launch_count=args.launch_count, + timeout=args.timeout, + hw_block_num=hw_block_num(args, kernel), + dry_run=args.dry_run, + spmd_meta=spmd_meta(kernel, platform), + ) + + +def default_output_dir(case_name: str, kernel_name: str, now: datetime) -> Path: + timestamp = now.strftime("%Y%m%d_%H%M%S") + safe_case = case_name.replace("/", "_") + safe_kernel = kernel_name.replace("/", "_") + return PROJECT_ROOT / "outputs" / f"insight_trace_{safe_case}_{safe_kernel}_{timestamp}" + + +def resolve_platform_arch(platform_arg: str, context) -> PlatformArch: + if platform_arg not in ("a2a3", "a5"): + raise ValueError(f"Unknown platform family: {platform_arg!r}") + family = PlatformFamily.A5 if platform_arg == "a5" else PlatformFamily.A2A3 + + if context is not None: + case_platforms = context.case.get("platforms", []) + if case_platforms and not any(p.startswith(platform_arg) for p in case_platforms): + raise ValueError( + f"Platform family {platform_arg!r} does not match case platforms {case_platforms}" + ) + + return PlatformArch.for_family(family) + + +def with_soc_version_override(platform: PlatformArch, soc_version: str) -> PlatformArch: + return PlatformArch( + family=platform.family, + soc_version=soc_version, + cce_aicore_number=platform.cce_aicore_number, + pto_arch_macro=platform.pto_arch_macro, + cce_aicore_arch=platform.cce_aicore_arch, + prologue_event_id7=platform.prologue_event_id7, + prologue_pipe_fix=platform.prologue_pipe_fix, + runtime_include_roots=platform.runtime_include_roots, + platform_include_roots=platform.platform_include_roots, + ) + + +def hw_block_num(args, kernel) -> int: + if kernel.shape == KernelShape.SPMD_MIX: + if args.platform == PlatformFamily.A5.value and kernel.source_path.name == "kernel_spmd_mix.cpp": + return 2 + 8 + 12 + 24 + 48 + return 24 + return args.hw_block_num + + +def spmd_meta(kernel, platform: PlatformArch | None = None): + if kernel.shape != KernelShape.SPMD_MIX: + return None + if platform is not None and platform.family == PlatformFamily.A5: + if kernel.source_path.name != "kernel_spmd_mix.cpp": + return None + return SPMDReplayMeta( + hw_block_dim=24, + aiv_lanes_per_core=2, + dispatches=( + SPMDDispatch(logical_block_num=2, scalar_overrides=((1, 0),)), + SPMDDispatch(logical_block_num=8, scalar_overrides=((1, 6),)), + SPMDDispatch(logical_block_num=12, scalar_overrides=((1, 30),)), + SPMDDispatch(logical_block_num=24, scalar_overrides=((1, 66),)), + SPMDDispatch(logical_block_num=48, scalar_overrides=((1, 138),)), + ), + ) + if kernel.source_path.name != "paged_attention_parallel.cpp": + raise ValueError(f"No SPMD replay metadata for kernel {kernel.name}") + fifo_depth = 2 + max_q_tile = 64 + max_block_size = 128 + head_dim = 128 + hw_block_dim = 24 + return SPMDReplayMeta( + hw_block_dim=hw_block_dim, + aiv_lanes_per_core=2, + fifo_sizes=( + max_q_tile * max_block_size * 4 * fifo_depth, + max_q_tile * max_block_size * 2 * fifo_depth, + max_q_tile * head_dim * 4 * fifo_depth, + ), + ) + + +def pto_isa_root(path: Path | None) -> Path: + if path is not None: + return path.resolve() + return Path(ensure_pto_isa_root()).resolve() diff --git a/simpler_setup/insight_trace/cli_pkg/entry_ptoas.py b/simpler_setup/insight_trace/cli_pkg/entry_ptoas.py new file mode 100644 index 000000000..62f74d573 --- /dev/null +++ b/simpler_setup/insight_trace/cli_pkg/entry_ptoas.py @@ -0,0 +1,43 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +from __future__ import annotations + +from datetime import datetime + +from ..models import PtoasTraceConfig +from ..ptoas_backend import collect_ptoas_trace, generate_ptoas_workspace +from .config import default_output_dir, pto_isa_root + + +def build_ptoas_config(args): + testcase = f"{args.kernel_base_name}_msprof" + return PtoasTraceConfig( + ptoas_root=args.ptoas_root.resolve(), + source_cpp=args.source_cpp.resolve(), + testcase_name=testcase, + kernel_base_name=args.kernel_base_name, + aicore_arch=args.aicore_arch, + output_dir=args.output_dir or default_output_dir("ptoas", args.kernel_base_name, datetime.now()), + cann_home=args.cann_home, + pto_isa_root=pto_isa_root(args.pto_isa_root), + soc_version=args.soc_version, + timeout=args.timeout, + launch_count=args.launch_count, + kernel_symbol=args.kernel_symbol, + ) + + +def run_ptoas(args): + if args.ptoas_root is None or args.source_cpp is None or args.kernel_base_name is None or args.aicore_arch is None: + raise ValueError("ptoas backend requires --ptoas-root, --source-cpp, --kernel-base-name, and --aicore-arch") + config = build_ptoas_config(args) + if args.dry_run: + workspace = generate_ptoas_workspace(config) + return type("DryRunResult", (), {"workspace_dir": workspace.run_root, "simulator_dir": None})() + return collect_ptoas_trace(config) diff --git a/simpler_setup/insight_trace/cli_pkg/entry_simpler.py b/simpler_setup/insight_trace/cli_pkg/entry_simpler.py new file mode 100644 index 000000000..2f3f70659 --- /dev/null +++ b/simpler_setup/insight_trace/cli_pkg/entry_simpler.py @@ -0,0 +1,29 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +from __future__ import annotations + +from .config import build_trace_config + + +def run_simpler(args): + from .. import cli as cli_module + + if args.test_module is None or args.case is None: + raise ValueError("simpler backend requires test_module and --case") + context = cli_module.load_scene_case(args.test_module, args.case) + kernel = cli_module.select_kernel(context, args.kernel, args.func_id, args.kernel_source) + cli_module.validate_single_task_kernel(kernel) + if args.dump_dir is not None and (args.func_id is None or args.dispatch_id is None): + raise ValueError("simpler backend requires --func-id and --dispatch-id with --dump-dir") + trace_args = cli_module.resolve_args(context, kernel, args.arg_spec, args.dump_dir, args.dispatch_id) + config = build_trace_config(args, context, kernel, trace_args) + result = cli_module.create_workspace(config) + if args.dry_run: + return result + return cli_module.run_workspace(config) diff --git a/simpler_setup/insight_trace/cli_pkg/parser.py b/simpler_setup/insight_trace/cli_pkg/parser.py new file mode 100644 index 000000000..a66239f8e --- /dev/null +++ b/simpler_setup/insight_trace/cli_pkg/parser.py @@ -0,0 +1,53 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +from __future__ import annotations + +import argparse +import os +from pathlib import Path + +from ..models import TraceBackend + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Generate MindStudio Insight replay workspaces and final trace artifacts for an incore kernel" + ) + parser.add_argument("test_module", nargs="?", type=Path) + parser.add_argument("--backend", choices=[item.value for item in TraceBackend], default=TraceBackend.SIMPLER.value) + parser.add_argument("--case") + selector = parser.add_mutually_exclusive_group() + selector.add_argument("--kernel") + selector.add_argument("--func-id", type=int) + selector.add_argument("--kernel-source") + parser.add_argument("--platform", default="a2a3") + parser.add_argument("--runtime", default="tensormap_and_ringbuffer") + parser.add_argument("--output-dir", type=Path, help="Workspace directory; final Insight artifacts are exported under /insight_export") + parser.add_argument("--cann-home", type=Path, default=_default_cann_home()) + parser.add_argument("--pto-isa-root", type=Path) + parser.add_argument("--soc-version") + parser.add_argument("--device", type=int, default=0) + parser.add_argument("--launch-count", type=int, default=1) + parser.add_argument("--timeout", type=int, default=120) + parser.add_argument("--hw-block-num", type=int, default=1) + parser.add_argument("--arg-spec", type=Path) + parser.add_argument("--dump-dir", type=Path) + parser.add_argument("--dispatch-id", type=int) + parser.add_argument("--dry-run", action="store_true", help="Generate the replay workspace only and skip build/collect/export") + parser.add_argument("--ptoas-root", type=Path) + parser.add_argument("--source-cpp", type=Path) + parser.add_argument("--kernel-base-name") + parser.add_argument("--aicore-arch") + parser.add_argument("--kernel-symbol") + return parser + + +def _default_cann_home() -> Path | None: + value = os.environ.get("CANN_HOME") or os.environ.get("ASCEND_HOME_PATH") + return Path(value) if value else None diff --git a/simpler_setup/insight_trace/kernel_analyzer.py b/simpler_setup/insight_trace/kernel_analyzer.py new file mode 100644 index 000000000..e27b56707 --- /dev/null +++ b/simpler_setup/insight_trace/kernel_analyzer.py @@ -0,0 +1,92 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +from __future__ import annotations + +import re +from functools import lru_cache +from pathlib import Path + +from .models import KernelShape, KernelSpec, SceneCaseContext + +_ARG_READ_RE = re.compile(r"args\[(\d+)\]") + + +def select_kernel( + context: SceneCaseContext, + kernel_name: str | None = None, + func_id: int | None = None, + kernel_source: str | None = None, +) -> KernelSpec: + selectors = [kernel_name is not None, func_id is not None, kernel_source is not None] + if sum(selectors) != 1: + raise ValueError("Exactly one of --kernel, --func-id, or --kernel-source is required") + + incores = context.callable_spec.get("incores", []) + selected = None + for entry in incores: + source = Path(entry.get("source", "")) + if kernel_name is not None and entry.get("name") == kernel_name: + selected = entry + break + if func_id is not None and entry.get("func_id") == func_id: + selected = entry + break + if kernel_source is not None: + requested = Path(kernel_source) + if source == requested or source.name == requested.name or source.as_posix().endswith(requested.as_posix()): + selected = entry + break + if selected is None: + raise ValueError("Kernel selector did not match any CALLABLE['incores'] entry") + + source_path = Path(selected["source"]).resolve() + shape = classify_kernel(selected.get("core_type", ""), source_path) + return KernelSpec( + name=selected.get("name", source_path.stem), + func_id=int(selected["func_id"]), + core_type=selected.get("core_type", ""), + source_path=source_path, + shape=shape, + ) + + +def classify_kernel(core_type: str, source_path: Path) -> KernelShape: + source = _read_source(source_path) + if "/kernels/mix/" in source_path.as_posix(): + return KernelShape.SPMD_MIX + if "SPMD_LOCAL_CONTEXT_INDEX" in source or "SPMD_GLOBAL_CONTEXT_INDEX" in source: + return KernelShape.SPMD_MIX + if "get_block_idx(args)" in source or "get_sub_block_id(args)" in source or "get_block_num(args)" in source: + return KernelShape.SPMD_MIX + if "args[48]" in source or "args[49]" in source: + return KernelShape.SPMD_MIX + if core_type == "aic": + return KernelShape.AIC_ONLY + if core_type == "aiv": + return KernelShape.AIV_ONLY + if "/kernels/aic/" in source_path.as_posix(): + return KernelShape.AIC_ONLY + if "/kernels/aiv/" in source_path.as_posix(): + return KernelShape.AIV_ONLY + raise ValueError(f"Cannot classify kernel core type: {core_type!r}") + + +def read_arg_indices(source_path: Path) -> set[int]: + return {int(match.group(1)) for match in _ARG_READ_RE.finditer(_read_source(source_path))} + + +def validate_single_task_kernel(kernel: KernelSpec) -> None: + source = _read_source(kernel.source_path) + if "kernel_entry" not in source or "int64_t *args" not in source: + raise ValueError(f"Kernel does not look like kernel_entry(args): {kernel.source_path}") + + +@lru_cache(maxsize=None) +def _read_source(source_path: Path) -> str: + return source_path.resolve().read_text() diff --git a/simpler_setup/insight_trace/models.py b/simpler_setup/insight_trace/models.py new file mode 100644 index 000000000..0980be848 --- /dev/null +++ b/simpler_setup/insight_trace/models.py @@ -0,0 +1,193 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +from __future__ import annotations + +from dataclasses import dataclass, field +from enum import Enum +from pathlib import Path +from typing import Any, Optional, Union + + +class PlatformFamily(str, Enum): + A2A3 = "a2a3" + A5 = "a5" + + +@dataclass(frozen=True) +class PlatformArch: + family: PlatformFamily = PlatformFamily.A2A3 + soc_version: str = "dav_2201" + cce_aicore_number: int = 220 + pto_arch_macro: str = "PTO_NPU_ARCH_A2A3" + cce_aicore_arch: str = "dav-c220" + prologue_event_id7: str = "((event_t)7)" + prologue_pipe_fix: str = "((pipe_t)10)" + runtime_include_roots: tuple[str, ...] = ( + "src/a2a3/runtime/tensormap_and_ringbuffer/runtime", + "src/a2a3/runtime/tensormap_and_ringbuffer/common", + "src/a2a3/runtime/tensormap_and_ringbuffer/orchestration", + ) + platform_include_roots: tuple[str, ...] = ( + "src/a2a3/platform/include", + ) + + @staticmethod + def for_family(family: PlatformFamily) -> "PlatformArch": + if family == PlatformFamily.A5: + return _A5_PLATFORM + return _A2A3_PLATFORM + + +_A2A3_PLATFORM = PlatformArch( + family=PlatformFamily.A2A3, + soc_version="dav_2201", + cce_aicore_number=220, + pto_arch_macro="PTO_NPU_ARCH_A2A3", + cce_aicore_arch="dav-c220", +) + +_A5_PLATFORM = PlatformArch( + family=PlatformFamily.A5, + soc_version="dav_3510", + cce_aicore_number=310, + pto_arch_macro="PTO_NPU_ARCH_A5", + cce_aicore_arch="dav-c310", + prologue_event_id7="((::event_t)7)", + prologue_pipe_fix="((::pipe_t)10)", + runtime_include_roots=( + "src/a5/runtime/tensormap_and_ringbuffer/runtime", + "src/a5/runtime/tensormap_and_ringbuffer/common", + "src/a5/runtime/tensormap_and_ringbuffer/orchestration", + ), + platform_include_roots=( + "src/a5/platform/include", + ), +) + + +class KernelShape(str, Enum): + AIC_ONLY = "aic-only" + AIV_ONLY = "aiv-only" + SPMD_MIX = "spmd-mix" + + +class TraceBackend(str, Enum): + SIMPLER = "simpler" + PTOAS = "ptoas" + + +@dataclass(frozen=True) +class KernelSpec: + name: str + func_id: int + core_type: str + source_path: Path + shape: KernelShape + + +@dataclass(frozen=True) +class TraceTensorArg: + index: int + name: str + dtype: str + shape: tuple[int, ...] + role: str = "input" + fill: str = "zero" + + +@dataclass(frozen=True) +class TraceScalarArg: + index: int + name: str + dtype: str + value: Union[int, float] + pack_mode: str = "value" + + +TraceArg = Union[TraceTensorArg, TraceScalarArg] + + +@dataclass(frozen=True) +class SPMDDispatch: + logical_block_num: int + scalar_overrides: tuple[tuple[int, int], ...] = () + + +@dataclass(frozen=True) +class SPMDReplayMeta: + hw_block_dim: int = 24 + aiv_lanes_per_core: int = 2 + fifo_sizes: tuple[int, int, int] = (0, 0, 0) # (sij_bytes, pij_bytes, oi_bytes) per hw block + dispatches: tuple[SPMDDispatch, ...] = () + + +@dataclass(frozen=True) +class TraceConfig: + backend: TraceBackend + test_module: Optional[Path] + case_name: Optional[str] + kernel_spec: Optional[KernelSpec] + args: tuple[TraceArg, ...] + output_dir: Path + repo_root: Path + cann_home: Optional[Path] + pto_isa_root: Optional[Path] + platform_arch: PlatformArch = _A2A3_PLATFORM + device: int = 0 + launch_count: int = 1 + timeout: int = 120 + hw_block_num: int = 1 + dry_run: bool = False + spmd_meta: Optional[SPMDReplayMeta] = None + metadata: dict[str, Any] = field(default_factory=dict) + + +@dataclass(frozen=True) +class TraceResult: + workspace_dir: Path + simulator_dir: Optional[Path] + collect_log: Optional[Path] + export_log: Optional[Path] + + +@dataclass(frozen=True) +class SceneCaseContext: + test_class: type + case: dict[str, Any] + callable_spec: dict[str, Any] + test_module: Path + module_dir: Path + runtime: str + + +@dataclass(frozen=True) +class PtoasTraceConfig: + ptoas_root: Path + source_cpp: Path + testcase_name: str + kernel_base_name: str + aicore_arch: str + output_dir: Path + cann_home: Optional[Path] + pto_isa_root: Optional[Path] + soc_version: str = "dav_2201" + timeout: int = 120 + launch_count: int = 1 + kernel_symbol: Optional[str] = None + + +@dataclass(frozen=True) +class PtoasWorkspace: + run_root: Path + case_root: Path + case_dir: Path + build_dir: Path + application: Path + kernel_lib: Path + kernel_symbol: str diff --git a/simpler_setup/insight_trace/ptoas_backend.py b/simpler_setup/insight_trace/ptoas_backend.py new file mode 100644 index 000000000..6898e1626 --- /dev/null +++ b/simpler_setup/insight_trace/ptoas_backend.py @@ -0,0 +1,202 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +from __future__ import annotations + +import os +import shutil +import subprocess +from pathlib import Path + +from .models import PtoasTraceConfig, PtoasWorkspace, TraceResult +from .runner import find_simulator_dir, validate_simulator_dir + + +def generate_ptoas_workspace(config: PtoasTraceConfig) -> PtoasWorkspace: + case_root = config.output_dir / "cases" + case_dir = case_root / "ptoas" / config.testcase_name + build_dir = config.output_dir / "build" + generator = config.ptoas_root / "test" / "npu_validation" / "scripts" / "generate_testcase.py" + if not generator.is_file(): + raise FileNotFoundError(f"PTOAS testcase generator not found: {generator}") + + _run( + [ + "python3", + str(generator), + "--input", + str(config.source_cpp), + "--testcase", + config.testcase_name, + "--output-root", + str(case_root), + "--run-mode", + "sim", + "--soc-version", + config.soc_version, + "--aicore-arch", + config.aicore_arch, + ], + cwd=config.ptoas_root, + ) + env = _env(config, build_dir) + _run( + [ + "cmake", + "-G", + "Ninja", + "-S", + str(case_dir), + "-B", + str(build_dir), + f"-DSOC_VERSION={config.soc_version}", + f"-DPTO_ISA_ROOT={config.pto_isa_root or ''}", + ], + env=env, + ) + _run(["cmake", "--build", str(build_dir), "--target", f"{config.testcase_name}_sim"], env=env) + + kernel_lib = build_dir / f"lib{config.testcase_name}_kernel.so" + symbol = config.kernel_symbol or resolve_kernel_symbol(kernel_lib, config.kernel_base_name) + return PtoasWorkspace( + run_root=config.output_dir, + case_root=case_root, + case_dir=case_dir, + build_dir=build_dir, + application=build_dir / f"{config.testcase_name}_sim", + kernel_lib=kernel_lib, + kernel_symbol=symbol, + ) + + +def collect_ptoas_trace(config: PtoasTraceConfig) -> TraceResult: + workspace = generate_ptoas_workspace(config) + _prepare_inputs(workspace.case_dir) + collect_root = config.output_dir / "msprof_collect" + export_root = config.output_dir / "insight_export" + collect_root.mkdir(parents=True, exist_ok=True) + export_root.mkdir(parents=True, exist_ok=True) + collect_root.chmod(0o700) + export_root.chmod(0o700) + env = _env(config, workspace.build_dir) + collect_log = collect_root / "msprof_collect.log" + with collect_log.open("w") as log: + result = subprocess.run( + [ + "msprof", + "op", + "simulator", + f"--application={workspace.application}", + f"--kernel-name={workspace.kernel_symbol}", + f"--launch-count={config.launch_count}", + f"--soc-version={config.soc_version}", + f"--timeout={config.timeout}", + f"--output={collect_root / 'out'}", + ], + cwd=workspace.case_dir, + env=env, + stdout=log, + stderr=subprocess.STDOUT, + text=True, + check=False, + ) + if result.returncode != 0: + raise RuntimeError(f"PTOAS msprof collect failed; see {collect_log}") + export_src = _export_source(collect_root / "out") + export_log = export_root / "msprof_export.log" + with export_log.open("w") as log: + result = subprocess.run( + ["msprof", "op", "simulator", f"--export={export_src}", f"--output={export_root}"], + env=env, + stdout=log, + stderr=subprocess.STDOUT, + text=True, + check=False, + ) + if result.returncode != 0: + raise RuntimeError(f"PTOAS msprof export failed; see {export_log}") + simulator_dir = find_simulator_dir(export_root) + validate_simulator_dir(simulator_dir) + return TraceResult(config.output_dir, simulator_dir, collect_log, export_log) + + +def resolve_kernel_symbol(kernel_lib: Path, kernel_base_name: str) -> str: + result = subprocess.run(["nm", "-D", str(kernel_lib)], check=True, capture_output=True, text=True) + symbols = [] + for line in result.stdout.splitlines(): + parts = line.split() + if len(parts) >= 3 and parts[-2] in {"T", "W"}: + symbols.append(parts[-1]) + if not symbols: + raise ValueError(f"No exported text symbols found in {kernel_lib}") + + proc = subprocess.run( + ["c++filt"], + input="\n".join(symbols), + check=True, + capture_output=True, + text=True, + ) + demangled = proc.stdout.splitlines() + if len(symbols) != len(demangled): + raise ValueError( + f"Mismatched symbol count for {kernel_lib}: {len(symbols)} symbols, {len(demangled)} demangled names" + ) + candidates = [symbol for symbol, name in zip(symbols, demangled) if name.startswith(f"{kernel_base_name}(")] + if len(candidates) != 1: + matches = [f"{symbol}: {name}" for symbol, name in zip(symbols, demangled) if kernel_base_name in name] + raise ValueError(f"Expected one symbol for {kernel_base_name}, found {len(candidates)}: {matches}") + return candidates[0] + + +def _prepare_inputs(case_dir: Path) -> None: + golden = case_dir / "golden.py" + if not golden.is_file(): + raise FileNotFoundError(f"PTOAS golden input generator not found: {golden}") + result = subprocess.run(["python3", str(golden)], cwd=case_dir, check=False) + if result.returncode != 0: + raise RuntimeError("golden.py failed; PTOAS zero-input fallback is not implemented") + + +def _export_source(out_dir: Path) -> Path: + opprofs = sorted(out_dir.glob("OPPROF_*")) + if not opprofs: + raise FileNotFoundError(f"No OPPROF dir under {out_dir}") + opprof = opprofs[-1] + tmp_dump = opprof / "device0" / "tmp_dump" + if tmp_dump.is_dir(): + pc_start = next((opprof / "device0").glob("**/dump/pc_start_addr.txt"), None) + if pc_start and not (tmp_dump / "pc_start_addr.txt").exists(): + shutil.copy2(pc_start, tmp_dump / "pc_start_addr.txt") + return tmp_dump + dump = opprof / "dump" + if dump.is_dir(): + return dump + raise FileNotFoundError(f"No tmp_dump or dump under {opprof}") + + +def _env(config: PtoasTraceConfig, build_dir: Path) -> dict[str, str]: + env = os.environ.copy() + if config.cann_home is not None: + env["CANN_HOME"] = str(config.cann_home) + env["ASCEND_HOME_PATH"] = str(config.cann_home) + if config.pto_isa_root is not None: + env["PTO_ISA_ROOT"] = str(config.pto_isa_root) + cann_home = env.get("ASCEND_HOME_PATH") or env.get("CANN_HOME") + if cann_home: + sim_lib = Path(cann_home) / "aarch64-linux" / "simulator" / config.soc_version / "lib" + env["LD_LIBRARY_PATH"] = ":".join( + [str(build_dir), str(sim_lib), str(Path(cann_home) / "lib64"), env.get("LD_LIBRARY_PATH", "")] + ) + return env + + +def _run(cmd: list[str], cwd: Path | None = None, env: dict[str, str] | None = None) -> None: + result = subprocess.run(cmd, cwd=cwd, env=env, check=False, text=True) + if result.returncode != 0: + raise RuntimeError(f"Command failed: {' '.join(cmd)}") diff --git a/simpler_setup/insight_trace/runner.py b/simpler_setup/insight_trace/runner.py new file mode 100644 index 000000000..349b9efc9 --- /dev/null +++ b/simpler_setup/insight_trace/runner.py @@ -0,0 +1,53 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +from __future__ import annotations + +import os +import subprocess +from pathlib import Path + +from .models import TraceConfig, TraceResult + + +def run_workspace(config: TraceConfig) -> TraceResult: + script = config.output_dir / "run_collect.sh" + env = os.environ.copy() + if config.cann_home is not None: + env["CANN_HOME"] = str(config.cann_home) + if config.pto_isa_root is not None: + env["PTO_ISA_ROOT"] = str(config.pto_isa_root) + env["REPO_ROOT"] = str(config.repo_root) + result = subprocess.run(["bash", str(script)], cwd=str(config.output_dir), env=env, check=False, text=True) + if result.returncode != 0: + raise RuntimeError(f"insight trace collection failed; see {config.output_dir / 'msprof_collect'}") + simulator_dir = find_simulator_dir(config.output_dir / "insight_export") + validate_simulator_dir(simulator_dir) + return TraceResult( + workspace_dir=config.output_dir, + simulator_dir=simulator_dir, + collect_log=config.output_dir / "msprof_collect" / "msprof_collect.log", + export_log=config.output_dir / "insight_export" / "msprof_export.log", + ) + + +def find_simulator_dir(export_root: Path) -> Path: + candidates = sorted(export_root.glob("OPPROF_*/simulator")) + if not candidates: + raise FileNotFoundError(f"No OPPROF simulator directory under {export_root}") + return candidates[-1] + + +def validate_simulator_dir(simulator_dir: Path) -> None: + required = [simulator_dir / "trace.json", simulator_dir / "visualize_data.bin"] + missing = [path for path in required if not path.is_file()] + if missing: + names = ", ".join(str(path) for path in missing) + raise FileNotFoundError(f"Missing Insight trace artifacts: {names}") + if not list(simulator_dir.glob("core*/*instr_exe*.csv")): + raise FileNotFoundError(f"No instr_exe CSV files under {simulator_dir}") diff --git a/simpler_setup/insight_trace/templates.py b/simpler_setup/insight_trace/templates.py new file mode 100644 index 000000000..e3b4514f1 --- /dev/null +++ b/simpler_setup/insight_trace/templates.py @@ -0,0 +1,9 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +from .templates_pkg import render_cmake, render_config, render_host, render_kernel, render_launch, render_run_collect diff --git a/simpler_setup/insight_trace/templates_pkg/__init__.py b/simpler_setup/insight_trace/templates_pkg/__init__.py new file mode 100644 index 000000000..2f26d120a --- /dev/null +++ b/simpler_setup/insight_trace/templates_pkg/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +from .host import render_host +from .kernel import render_kernel, render_launch +from .project import render_cmake, render_config, render_run_collect + +__all__ = [ + "render_kernel", + "render_launch", + "render_host", + "render_cmake", + "render_run_collect", + "render_config", +] diff --git a/simpler_setup/insight_trace/templates_pkg/common.py b/simpler_setup/insight_trace/templates_pkg/common.py new file mode 100644 index 000000000..669cba46d --- /dev/null +++ b/simpler_setup/insight_trace/templates_pkg/common.py @@ -0,0 +1,73 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +from __future__ import annotations + +from functools import reduce +from operator import mul + +from ..models import TraceConfig, TraceScalarArg, TraceTensorArg + +_DTYPE_CPP = { + "FLOAT32": "DataType::FLOAT32", + "BFLOAT16": "DataType::BFLOAT16", + "INT32": "DataType::INT32", + "UINT64": "DataType::UINT64", +} + +_DTYPE_SIZE = { + "FLOAT32": 4, + "BFLOAT16": 2, + "INT32": 4, + "UINT64": 8, +} + +def _prologue(config: TraceConfig) -> str: + arch = config.platform_arch + return f"""#ifndef __CCE_AICORE__ +#define __CCE_AICORE__ {arch.cce_aicore_number} +#endif +#include +#ifndef {arch.pto_arch_macro} +#define {arch.pto_arch_macro} +#endif +#ifndef EVENT_ID7 +#define EVENT_ID7 {arch.prologue_event_id7} +#endif +#ifndef PIPE_FIX +#define PIPE_FIX {arch.prologue_pipe_fix} +#endif +""" + +def _arg_to_json(arg: TraceTensorArg | TraceScalarArg) -> dict: + if isinstance(arg, TraceTensorArg): + return {"index": arg.index, "kind": "tensor", "dtype": arg.dtype, "shape": list(arg.shape), "name": arg.name} + return {"index": arg.index, "kind": "scalar", "dtype": arg.dtype, "value": arg.value, "name": arg.name} + +def _validate_args(args: tuple[TraceTensorArg | TraceScalarArg, ...]) -> None: + seen = set() + for arg in args: + if arg.index < 0 or arg.index >= 50: + raise ValueError(f"Arg index {arg.index} exceeds max slots (50)") + if arg.index in seen: + raise ValueError(f"Duplicate arg index {arg.index}") + seen.add(arg.index) + +def _require_kernel(config: TraceConfig): + if config.kernel_spec is None: + raise ValueError("simpler backend requires a kernel spec") + return config.kernel_spec + +def _require_spmd_meta(config: TraceConfig): + if config.spmd_meta is None: + raise ValueError("SPMD mix kernels require spmd_meta") + return config.spmd_meta + +def _camel(name: str) -> str: + return "".join(part.capitalize() for part in name.split("_")) + diff --git a/simpler_setup/insight_trace/templates_pkg/host.py b/simpler_setup/insight_trace/templates_pkg/host.py new file mode 100644 index 000000000..594d0b793 --- /dev/null +++ b/simpler_setup/insight_trace/templates_pkg/host.py @@ -0,0 +1,563 @@ +from __future__ import annotations + +from functools import reduce +from operator import mul + +from ..models import KernelShape, TraceConfig, TraceScalarArg, TraceTensorArg +from .common import _DTYPE_CPP, _DTYPE_SIZE, _camel, _require_kernel, _require_spmd_meta, _validate_args + +def render_host(config: TraceConfig) -> str: + _validate_args(config.args) + kernel = _require_kernel(config) + tensors = [arg for arg in config.args if isinstance(arg, TraceTensorArg)] + scalars = [arg for arg in config.args if isinstance(arg, TraceScalarArg)] + constants = [] + allocs = [] + tensor_inits = [] + frees = [] + for i, arg in enumerate(tensors): + elements = reduce(mul, arg.shape, 1) + size = elements * _DTYPE_SIZE[arg.dtype] + shape_name = f"shape_{arg.name}" + constants.append(f"constexpr int k{_camel(arg.name)}Bytes = {size};") + constants.append(f"uint32_t {shape_name}[{len(arg.shape)}] = {{{', '.join(str(dim) for dim in arg.shape)}}};") + allocs.append(f" void *d_{arg.name} = nullptr;") + allocs.append(f" ACL_CHECK(aclrtMalloc(&d_{arg.name}, k{_camel(arg.name)}Bytes, ACL_MEM_MALLOC_HUGE_FIRST));") + allocs.append(f" ACL_CHECK(aclrtMemset(d_{arg.name}, k{_camel(arg.name)}Bytes, 0, k{_camel(arg.name)}Bytes));") + tensor_inits.append( + f" tensors[{i}] = make_tensor_external(d_{arg.name}, {shape_name}, {len(arg.shape)}, {_DTYPE_CPP[arg.dtype]});" + ) + frees.append(f" ACL_CHECK(aclrtFree(d_{arg.name}));") + + if kernel.shape == KernelShape.SPMD_MIX: + if config.platform_arch.family.value == "a5" and config.spmd_meta is not None: + return _render_a5_spmd_host(config, tensors, scalars, constants, allocs, tensor_inits, frees) + if config.platform_arch.family.value == "a5": + return _render_a5_mixed_host(config, tensors, scalars, constants, allocs, tensor_inits, frees) + return _render_spmd_host(config, tensors, scalars, constants, allocs, tensor_inits, frees) + return _render_single_task_host(tensors, scalars, constants, allocs, tensor_inits, frees) +def _render_single_task_host(tensors, scalars, constants, allocs, tensor_inits, frees) -> str: + arg_assigns = [] + for i, arg in enumerate(tensors): + arg_assigns.append( + f" args[{arg.index}] = static_cast(reinterpret_cast(d_tensors) + {i} * sizeof(Tensor));" + ) + for arg in scalars: + arg_assigns.append(f" args[{arg.index}] = static_cast({int(arg.value)});") + + return f"""#include +#include +#include +#include + +#include "acl/acl.h" +#include "pto_orchestration_api.h" + +constexpr int kArgsSlots = 50; +constexpr int kNumTensors = {len(tensors)}; +{chr(10).join(constants)} + +#define ACL_CHECK(expr) \ + do {{ \ + aclError _err = (expr); \ + if (_err != ACL_SUCCESS) {{ \ + fprintf(stderr, \"ACL error %d at %s:%d\\n\", _err, __FILE__, __LINE__); \ + exit(1); \ + }} \ + }} while (0) + +extern \"C\" void launch_replay(void *args, void *stream); + +int main() {{ + ACL_CHECK(aclInit(nullptr)); + int device_id = 0; + if (const char *env = getenv(\"ACL_DEVICE_ID\")) {{ + device_id = atoi(env); + }} + ACL_CHECK(aclrtSetDevice(device_id)); + + aclrtStream stream; + ACL_CHECK(aclrtCreateStream(&stream)); + +{chr(10).join(allocs)} + + void *tensors_mem = malloc(kNumTensors * sizeof(Tensor)); + Tensor *tensors = static_cast(tensors_mem); +{chr(10).join(tensor_inits)} + + void *d_tensors = nullptr; + ACL_CHECK(aclrtMalloc(&d_tensors, kNumTensors * sizeof(Tensor), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMemcpy(d_tensors, kNumTensors * sizeof(Tensor), tensors, kNumTensors * sizeof(Tensor), ACL_MEMCPY_HOST_TO_DEVICE)); + + std::array args{{}}; +{chr(10).join(arg_assigns)} + + void *d_args = nullptr; + ACL_CHECK(aclrtMalloc(&d_args, sizeof(args), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMemcpy(d_args, sizeof(args), args.data(), sizeof(args), ACL_MEMCPY_HOST_TO_DEVICE)); + + launch_replay(d_args, stream); + ACL_CHECK(aclrtSynchronizeStream(stream)); + + ACL_CHECK(aclrtFree(d_args)); + ACL_CHECK(aclrtFree(d_tensors)); + free(tensors_mem); +{chr(10).join(reversed(frees))} + ACL_CHECK(aclrtDestroyStream(stream)); + ACL_CHECK(aclrtResetDevice(device_id)); + ACL_CHECK(aclFinalize()); + + printf(\"Replay completed successfully.\\n\"); + return 0; +}} +""" +def _render_a5_mixed_host(config, tensors, scalars, constants, allocs, tensor_inits, frees) -> str: + if config.spmd_meta is not None: + raise ValueError("A5 mixed host replay does not support SPMD context synthesis in the first pass") + + aic_assigns = [] + aiv_assigns = [] + for i, arg in enumerate(tensors): + expr = f"static_cast(reinterpret_cast(d_tensors) + {i} * sizeof(Tensor))" + aic_assigns.append(f" row[{arg.index}] = {expr};") + aiv_assigns.append(f" row[{arg.index}] = {expr};") + for arg in scalars: + expr = f"static_cast({int(arg.value)})" + aic_assigns.append(f" row[{arg.index}] = {expr};") + aiv_assigns.append(f" row[{arg.index}] = {expr};") + + return f'''#include +#include +#include +#include + +#include "acl/acl.h" +#include "pto_orchestration_api.h" + +constexpr int kArgsSlots = 50; +constexpr int kNumTensors = {len(tensors)}; +constexpr int kHwBlocks = {config.hw_block_num}; +constexpr int kAivLanesPerCore = 2; +constexpr int kAivRows = kHwBlocks * kAivLanesPerCore; +{chr(10).join(constants)} + +#define ACL_CHECK(expr) \ + do {{ \ + aclError _err = (expr); \ + if (_err != ACL_SUCCESS) {{ \ + fprintf(stderr, "ACL error %d at %s:%d\\n", _err, __FILE__, __LINE__); \ + exit(1); \ + }} \ + }} while (0) + +extern "C" void launch_replay(void *aic_args, void *aiv_args, void *stream); + +int main() {{ + ACL_CHECK(aclInit(nullptr)); + int device_id = 0; + if (const char *env = getenv("ACL_DEVICE_ID")) {{ + device_id = atoi(env); + }} + ACL_CHECK(aclrtSetDevice(device_id)); + + aclrtStream stream; + ACL_CHECK(aclrtCreateStream(&stream)); + +{chr(10).join(allocs)} + + void *tensors_mem = malloc(kNumTensors * sizeof(Tensor)); + Tensor *tensors = static_cast(tensors_mem); +{chr(10).join(tensor_inits)} + + void *d_tensors = nullptr; + ACL_CHECK(aclrtMalloc(&d_tensors, kNumTensors * sizeof(Tensor), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMemcpy(d_tensors, kNumTensors * sizeof(Tensor), tensors, kNumTensors * sizeof(Tensor), ACL_MEMCPY_HOST_TO_DEVICE)); + + std::vector aic_args(kHwBlocks * kArgsSlots, 0); + std::vector aiv_args(kAivRows * kArgsSlots, 0); + for (int r = 0; r < kHwBlocks; ++r) {{ + int64_t *row = aic_args.data() + static_cast(r) * kArgsSlots; +{chr(10).join(aic_assigns)} + }} + for (int r = 0; r < kAivRows; ++r) {{ + int64_t *row = aiv_args.data() + static_cast(r) * kArgsSlots; +{chr(10).join(aiv_assigns)} + }} + + void *d_aic_args = nullptr; + void *d_aiv_args = nullptr; + ACL_CHECK(aclrtMalloc(&d_aic_args, aic_args.size() * sizeof(int64_t), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc(&d_aiv_args, aiv_args.size() * sizeof(int64_t), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMemcpy(d_aic_args, aic_args.size() * sizeof(int64_t), aic_args.data(), aic_args.size() * sizeof(int64_t), ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_aiv_args, aiv_args.size() * sizeof(int64_t), aiv_args.data(), aiv_args.size() * sizeof(int64_t), ACL_MEMCPY_HOST_TO_DEVICE)); + + launch_replay(d_aic_args, d_aiv_args, stream); + ACL_CHECK(aclrtSynchronizeStream(stream)); + + ACL_CHECK(aclrtFree(d_aiv_args)); + ACL_CHECK(aclrtFree(d_aic_args)); + ACL_CHECK(aclrtFree(d_tensors)); + free(tensors_mem); +{chr(10).join(reversed(frees))} + ACL_CHECK(aclrtDestroyStream(stream)); + ACL_CHECK(aclrtResetDevice(device_id)); + ACL_CHECK(aclFinalize()); + + printf("Replay completed successfully.\\n"); + return 0; +}} +''' +def _render_a5_spmd_host(config, tensors, scalars, constants, allocs, tensor_inits, frees) -> str: + meta = _require_spmd_meta(config) + + aic_assigns = [] + aiv_assigns = [] + for i, arg in enumerate(tensors): + expr = f"static_cast(reinterpret_cast(d_tensors) + {i} * sizeof(Tensor))" + aic_assigns.append(f" row[{arg.index}] = {expr};") + aiv_assigns.append(f" row[{arg.index}] = {expr};") + for arg in scalars: + expr = f"static_cast({int(arg.value)})" + aic_assigns.append(f" row[{arg.index}] = {expr};") + aiv_assigns.append(f" row[{arg.index}] = {expr};") + + dispatch_inits = [] + for dispatch in meta.dispatches: + scalar_pairs = ", ".join(f"{{{index}, {value}}}" for index, value in dispatch.scalar_overrides) + dispatch_inits.append(f" {{{dispatch.logical_block_num}, {{{scalar_pairs}}}}}") + + aiv_lane_assigns = chr(10).join(line.replace("row[", "lane_row[") for line in aiv_assigns) + + return f'''#include +#include +#include +#include +#include + +#include "acl/acl.h" +#include "pto_orchestration_api.h" +#include "intrinsic.h" + +constexpr int kArgsSlots = 50; +constexpr int kNumTensors = {len(tensors)}; +constexpr int kAivLanesPerCore = {meta.aiv_lanes_per_core}; +constexpr int kHwBlocks = {config.hw_block_num}; +{chr(10).join(constants)} + +#define ACL_CHECK(expr) \ + do {{ \ + aclError _err = (expr); \ + if (_err != ACL_SUCCESS) {{ \ + fprintf(stderr, "ACL error %d at %s:%d\\n", _err, __FILE__, __LINE__); \ + exit(1); \ + }} \ + }} while (0) + +struct ReplayDispatch {{ + int logical_block_num; + std::vector> scalar_overrides; +}}; + +extern "C" void launch_replay(void *aic_args, void *aiv_args, void *stream); + +int main() {{ + ACL_CHECK(aclInit(nullptr)); + int device_id = 0; + if (const char *env = getenv("ACL_DEVICE_ID")) {{ + device_id = atoi(env); + }} + ACL_CHECK(aclrtSetDevice(device_id)); + + aclrtStream stream; + ACL_CHECK(aclrtCreateStream(&stream)); + +{chr(10).join(allocs)} + + void *tensors_mem = malloc(kNumTensors * sizeof(Tensor)); + Tensor *tensors = static_cast(tensors_mem); +{chr(10).join(tensor_inits)} + + void *d_tensors = nullptr; + ACL_CHECK(aclrtMalloc(&d_tensors, kNumTensors * sizeof(Tensor), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMemcpy(d_tensors, kNumTensors * sizeof(Tensor), tensors, kNumTensors * sizeof(Tensor), ACL_MEMCPY_HOST_TO_DEVICE)); + + const std::vector dispatches = {{ +{',\n'.join(dispatch_inits)} + }}; + int total_rows = 0; + for (const auto &dispatch : dispatches) {{ + total_rows += dispatch.logical_block_num; + }} + + std::vector aic_local(total_rows); + std::vector aic_global(total_rows); + std::vector aiv_local(total_rows * kAivLanesPerCore); + std::vector aiv_global(total_rows * kAivLanesPerCore); + + int row_base = 0; + for (const auto &dispatch : dispatches) {{ + for (int block_idx = 0; block_idx < dispatch.logical_block_num; ++block_idx) {{ + int row_index = row_base + block_idx; + aic_local[row_index].s_block_idx = block_idx; + aic_local[row_index].s_block_num = dispatch.logical_block_num; + aic_global[row_index].sub_block_id = 0; + for (int lane = 0; lane < kAivLanesPerCore; ++lane) {{ + int lane_row_index = row_index * kAivLanesPerCore + lane; + aiv_local[lane_row_index].s_block_idx = block_idx; + aiv_local[lane_row_index].s_block_num = dispatch.logical_block_num; + aiv_global[lane_row_index].sub_block_id = lane; + }} + }} + row_base += dispatch.logical_block_num; + }} + + void *d_aic_local = nullptr; + void *d_aic_global = nullptr; + void *d_aiv_local = nullptr; + void *d_aiv_global = nullptr; + ACL_CHECK(aclrtMalloc(&d_aic_local, aic_local.size() * sizeof(LocalContext), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc(&d_aic_global, aic_global.size() * sizeof(GlobalContext), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc(&d_aiv_local, aiv_local.size() * sizeof(LocalContext), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc(&d_aiv_global, aiv_global.size() * sizeof(GlobalContext), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMemcpy(d_aic_local, aic_local.size() * sizeof(LocalContext), aic_local.data(), aic_local.size() * sizeof(LocalContext), ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_aic_global, aic_global.size() * sizeof(GlobalContext), aic_global.data(), aic_global.size() * sizeof(GlobalContext), ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_aiv_local, aiv_local.size() * sizeof(LocalContext), aiv_local.data(), aiv_local.size() * sizeof(LocalContext), ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_aiv_global, aiv_global.size() * sizeof(GlobalContext), aiv_global.data(), aiv_global.size() * sizeof(GlobalContext), ACL_MEMCPY_HOST_TO_DEVICE)); + + std::vector aic_args(total_rows * kArgsSlots, 0); + std::vector aiv_args(total_rows * kAivLanesPerCore * kArgsSlots, 0); + row_base = 0; + for (const auto &dispatch : dispatches) {{ + for (int block_idx = 0; block_idx < dispatch.logical_block_num; ++block_idx) {{ + int row_index = row_base + block_idx; + int64_t *row = aic_args.data() + static_cast(row_index) * kArgsSlots; +{chr(10).join(aic_assigns)} + for (const auto &[arg_index, value] : dispatch.scalar_overrides) {{ + row[arg_index] = value; + }} + row[48] = static_cast(reinterpret_cast(d_aic_local) + row_index * sizeof(LocalContext)); + row[49] = static_cast(reinterpret_cast(d_aic_global) + row_index * sizeof(GlobalContext)); + + for (int lane = 0; lane < kAivLanesPerCore; ++lane) {{ + int lane_row_index = row_index * kAivLanesPerCore + lane; + int64_t *lane_row = aiv_args.data() + static_cast(lane_row_index) * kArgsSlots; +{aiv_lane_assigns} + for (const auto &[arg_index, value] : dispatch.scalar_overrides) {{ + lane_row[arg_index] = value; + }} + lane_row[48] = static_cast(reinterpret_cast(d_aiv_local) + lane_row_index * sizeof(LocalContext)); + lane_row[49] = static_cast(reinterpret_cast(d_aiv_global) + lane_row_index * sizeof(GlobalContext)); + }} + }} + row_base += dispatch.logical_block_num; + }} + + void *d_aic_args = nullptr; + void *d_aiv_args = nullptr; + ACL_CHECK(aclrtMalloc(&d_aic_args, aic_args.size() * sizeof(int64_t), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc(&d_aiv_args, aiv_args.size() * sizeof(int64_t), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMemcpy(d_aic_args, aic_args.size() * sizeof(int64_t), aic_args.data(), aic_args.size() * sizeof(int64_t), ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_aiv_args, aiv_args.size() * sizeof(int64_t), aiv_args.data(), aiv_args.size() * sizeof(int64_t), ACL_MEMCPY_HOST_TO_DEVICE)); + + launch_replay(d_aic_args, d_aiv_args, stream); + ACL_CHECK(aclrtSynchronizeStream(stream)); + + ACL_CHECK(aclrtFree(d_aiv_args)); + ACL_CHECK(aclrtFree(d_aic_args)); + ACL_CHECK(aclrtFree(d_aiv_global)); + ACL_CHECK(aclrtFree(d_aiv_local)); + ACL_CHECK(aclrtFree(d_aic_global)); + ACL_CHECK(aclrtFree(d_aic_local)); + ACL_CHECK(aclrtFree(d_tensors)); + free(tensors_mem); +{chr(10).join(reversed(frees))} + ACL_CHECK(aclrtDestroyStream(stream)); + ACL_CHECK(aclrtResetDevice(device_id)); + ACL_CHECK(aclFinalize()); + + printf("Replay completed successfully.\\n"); + return 0; +}} +''' +def _render_spmd_host(config, tensors, scalars, constants, allocs, tensor_inits, frees) -> str: + if config.platform_arch.family.value == "a5": + raise ValueError("A5 SPMD mix host replay is not supported in the first pass; use a single-task A5 kernel or an A5 mixed kernel without context synthesis") + meta = _require_spmd_meta(config) + spmd_fifo_names = {"sij_fifo", "pij_fifo", "oi_fifo"} + + filtered_constants = [] + filtered_allocs = [] + filtered_tensor_inits = [] + filtered_frees = [] + spmd_tensor_lines = [] + filtered_tensors: list[TraceTensorArg] = [] + fifo_tensor_map: dict[str, TraceTensorArg] = {} + + for i, arg in enumerate(tensors): + if arg.name in spmd_fifo_names: + fifo_tensor_map[arg.name] = arg + continue + filtered_tensors.append(arg) + filtered_constants.append(constants[i * 2]) + filtered_constants.append(constants[i * 2 + 1]) + filtered_allocs.extend(allocs[i * 3 : i * 3 + 3]) + filtered_tensor_inits.append(tensor_inits[i]) + filtered_frees.append(frees[i]) + + fifo_specs = ( + ("sij_fifo", "kSpmdSijFifoBytes", "d_sij_fifo", "shape_sij_fifo", "DataType::FLOAT32", meta.fifo_sizes[0]), + ("pij_fifo", "kSpmdPijFifoBytes", "d_pij_fifo", "shape_pij_fifo", "DataType::BFLOAT16", meta.fifo_sizes[1]), + ("oi_fifo", "kSpmdOiFifoBytes", "d_oi_fifo", "shape_oi_fifo", "DataType::FLOAT32", meta.fifo_sizes[2]), + ) + for name, const_name, dev_name, shape_name, dtype_cpp, fifo_bytes in fifo_specs: + arg = fifo_tensor_map.get(name) + if arg is None: + continue + filtered_constants.append(f"constexpr int {const_name} = {fifo_bytes};") + filtered_constants.append(f"uint32_t {shape_name}[1] = {{{fifo_bytes}}};") + spmd_tensor_lines.append( + f" tensors[{len(filtered_tensors)}] = make_tensor_external({dev_name}, {shape_name}, 1, {dtype_cpp});" + ) + filtered_tensors.append(arg) + + aic_assigns = [] + aiv_assigns = [] + for i, arg in enumerate(filtered_tensors): + expr = f"static_cast(reinterpret_cast(d_tensors) + {i} * sizeof(Tensor))" + aic_assigns.append(f" row[{arg.index}] = {expr};") + aiv_assigns.append(f" row[{arg.index}] = {expr};") + for arg in scalars: + aic_assigns.append(f" row[{arg.index}] = static_cast({int(arg.value)});") + aiv_assigns.append(f" row[{arg.index}] = static_cast({int(arg.value)});") + + return f"""#include +#include +#include +#include + +#include \"acl/acl.h\" +#include \"pto_orchestration_api.h\" +#include \"intrinsic.h\" + +constexpr int kArgsSlots = 50; +constexpr int kNumTensors = {len(filtered_tensors)}; +constexpr int kHwBlocks = {meta.hw_block_dim}; +constexpr int kAivLanesPerCore = {meta.aiv_lanes_per_core}; +constexpr int kAivRows = kHwBlocks * kAivLanesPerCore; +{chr(10).join(filtered_constants)} + +#define ACL_CHECK(expr) \ + do {{ \ + aclError _err = (expr); \ + if (_err != ACL_SUCCESS) {{ \ + fprintf(stderr, \"ACL error %d at %s:%d\\n\", _err, __FILE__, __LINE__); \ + exit(1); \ + }} \ + }} while (0) + +extern \"C\" void launch_replay(void *aic_args, void *aiv_args, void *stream); + +int main() {{ + ACL_CHECK(aclInit(nullptr)); + int device_id = 0; + if (const char *env = getenv(\"ACL_DEVICE_ID\")) {{ + device_id = atoi(env); + }} + ACL_CHECK(aclrtSetDevice(device_id)); + + aclrtStream stream; + ACL_CHECK(aclrtCreateStream(&stream)); + +{chr(10).join(filtered_allocs)} + void *d_sij_fifo = nullptr; + void *d_pij_fifo = nullptr; + void *d_oi_fifo = nullptr; + ACL_CHECK(aclrtMalloc(&d_sij_fifo, kSpmdSijFifoBytes, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMemset(d_sij_fifo, kSpmdSijFifoBytes, 0, kSpmdSijFifoBytes)); + ACL_CHECK(aclrtMalloc(&d_pij_fifo, kSpmdPijFifoBytes, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMemset(d_pij_fifo, kSpmdPijFifoBytes, 0, kSpmdPijFifoBytes)); + ACL_CHECK(aclrtMalloc(&d_oi_fifo, kSpmdOiFifoBytes, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMemset(d_oi_fifo, kSpmdOiFifoBytes, 0, kSpmdOiFifoBytes)); + + void *tensors_mem = malloc(kNumTensors * sizeof(Tensor)); + Tensor *tensors = static_cast(tensors_mem); +{chr(10).join(filtered_tensor_inits)} +{chr(10).join(spmd_tensor_lines)} + + void *d_tensors = nullptr; + ACL_CHECK(aclrtMalloc(&d_tensors, kNumTensors * sizeof(Tensor), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMemcpy(d_tensors, kNumTensors * sizeof(Tensor), tensors, kNumTensors * sizeof(Tensor), ACL_MEMCPY_HOST_TO_DEVICE)); + + std::vector aic_local(kHwBlocks); + std::vector aic_global(kHwBlocks); + std::vector aiv_local(kAivRows); + std::vector aiv_global(kAivRows); + for (int r = 0; r < kHwBlocks; ++r) {{ + aic_local[r].block_idx = r; + aic_local[r].block_num = kHwBlocks; + aic_global[r].sub_block_id = 0; + }} + for (int r = 0; r < kAivRows; ++r) {{ + aiv_local[r].block_idx = r / kAivLanesPerCore; + aiv_local[r].block_num = kHwBlocks; + aiv_global[r].sub_block_id = r % kAivLanesPerCore; + }} + + void *d_aic_local = nullptr; + void *d_aic_global = nullptr; + void *d_aiv_local = nullptr; + void *d_aiv_global = nullptr; + ACL_CHECK(aclrtMalloc(&d_aic_local, aic_local.size() * sizeof(LocalContext), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc(&d_aic_global, aic_global.size() * sizeof(GlobalContext), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc(&d_aiv_local, aiv_local.size() * sizeof(LocalContext), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc(&d_aiv_global, aiv_global.size() * sizeof(GlobalContext), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMemcpy(d_aic_local, aic_local.size() * sizeof(LocalContext), aic_local.data(), aic_local.size() * sizeof(LocalContext), ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_aic_global, aic_global.size() * sizeof(GlobalContext), aic_global.data(), aic_global.size() * sizeof(GlobalContext), ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_aiv_local, aiv_local.size() * sizeof(LocalContext), aiv_local.data(), aiv_local.size() * sizeof(LocalContext), ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_aiv_global, aiv_global.size() * sizeof(GlobalContext), aiv_global.data(), aiv_global.size() * sizeof(GlobalContext), ACL_MEMCPY_HOST_TO_DEVICE)); + + std::vector aic_args(kHwBlocks * kArgsSlots, 0); + std::vector aiv_args(kAivRows * kArgsSlots, 0); + for (int r = 0; r < kHwBlocks; ++r) {{ + int64_t *row = aic_args.data() + static_cast(r) * kArgsSlots; +{chr(10).join(aic_assigns)} + row[48] = static_cast(reinterpret_cast(d_aic_local) + r * sizeof(LocalContext)); + row[49] = static_cast(reinterpret_cast(d_aic_global) + r * sizeof(GlobalContext)); + }} + for (int r = 0; r < kAivRows; ++r) {{ + int64_t *row = aiv_args.data() + static_cast(r) * kArgsSlots; +{chr(10).join(aiv_assigns)} + row[48] = static_cast(reinterpret_cast(d_aiv_local) + r * sizeof(LocalContext)); + row[49] = static_cast(reinterpret_cast(d_aiv_global) + r * sizeof(GlobalContext)); + }} + + void *d_aic_args = nullptr; + void *d_aiv_args = nullptr; + ACL_CHECK(aclrtMalloc(&d_aic_args, aic_args.size() * sizeof(int64_t), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc(&d_aiv_args, aiv_args.size() * sizeof(int64_t), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMemcpy(d_aic_args, aic_args.size() * sizeof(int64_t), aic_args.data(), aic_args.size() * sizeof(int64_t), ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_aiv_args, aiv_args.size() * sizeof(int64_t), aiv_args.data(), aiv_args.size() * sizeof(int64_t), ACL_MEMCPY_HOST_TO_DEVICE)); + + launch_replay(d_aic_args, d_aiv_args, stream); + ACL_CHECK(aclrtSynchronizeStream(stream)); + + ACL_CHECK(aclrtFree(d_aiv_args)); + ACL_CHECK(aclrtFree(d_aic_args)); + ACL_CHECK(aclrtFree(d_aiv_global)); + ACL_CHECK(aclrtFree(d_aiv_local)); + ACL_CHECK(aclrtFree(d_aic_global)); + ACL_CHECK(aclrtFree(d_aic_local)); + ACL_CHECK(aclrtFree(d_tensors)); + free(tensors_mem); + ACL_CHECK(aclrtFree(d_oi_fifo)); + ACL_CHECK(aclrtFree(d_pij_fifo)); + ACL_CHECK(aclrtFree(d_sij_fifo)); +{chr(10).join(reversed(filtered_frees))} + ACL_CHECK(aclrtDestroyStream(stream)); + ACL_CHECK(aclrtResetDevice(device_id)); + ACL_CHECK(aclFinalize()); + + printf(\"Replay completed successfully.\\n\"); + return 0; +}} +""" diff --git a/simpler_setup/insight_trace/templates_pkg/kernel.py b/simpler_setup/insight_trace/templates_pkg/kernel.py new file mode 100644 index 000000000..01493b402 --- /dev/null +++ b/simpler_setup/insight_trace/templates_pkg/kernel.py @@ -0,0 +1,80 @@ +from __future__ import annotations + +from ..models import KernelShape, TraceConfig +from .common import _prologue, _require_kernel + +def render_kernel(config: TraceConfig) -> str: + kernel = _require_kernel(config) + prologue = _prologue(config) + if kernel.shape == KernelShape.SPMD_MIX: + return f"""#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +#if defined(__DAV_CUBE__) || defined(__DAV_VEC__) +{prologue}#include \"{kernel.source_path}\" +#endif + +extern \"C\" __global__ AICORE void replay_entry( + __gm__ int64_t *aic_args, __gm__ int64_t *aiv_args +) {{ +#if defined(__DAV_CUBE__) + int32_t hw_idx = get_block_idx(); + kernel_entry(aic_args + static_cast(hw_idx) * 50); +#endif +#if defined(__DAV_VEC__) + int32_t lane_idx = static_cast( + get_block_idx() * get_subblockdim() + get_subblockid()); + kernel_entry(aiv_args + static_cast(lane_idx) * 50); +#endif +}} +""" + + include_guard = "__DAV_CUBE__" if kernel.shape == KernelShape.AIC_ONLY else "__DAV_VEC__" + return f"""#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +#if defined({include_guard}) +{prologue}#include \"{kernel.source_path}\" +#endif + +extern \"C\" __global__ AICORE void replay_entry(__gm__ int64_t *args) {{ +#if defined({include_guard}) + kernel_entry(args); +#endif +}} +""" + +def render_launch(config: TraceConfig) -> str: + kernel = _require_kernel(config) + if kernel.shape == KernelShape.SPMD_MIX: + return f"""#include +#ifndef AICORE +#define AICORE [aicore] +#endif + +extern \"C\" __global__ AICORE void replay_entry( + __gm__ int64_t *aic_args, __gm__ int64_t *aiv_args); + +extern \"C\" void launch_replay(void *aic_args, void *aiv_args, void *stream) {{ + replay_entry<<<{config.hw_block_num}, nullptr, stream>>>( + (__gm__ int64_t *)aic_args, (__gm__ int64_t *)aiv_args); +}} +""" + + return f"""#include +#ifndef AICORE +#define AICORE [aicore] +#endif + +extern \"C\" __global__ AICORE void replay_entry(__gm__ int64_t *args); + +extern \"C\" void launch_replay(void *args, void *stream) {{ + replay_entry<<<{config.hw_block_num}, nullptr, stream>>>((__gm__ int64_t *)args); +}} +""" diff --git a/simpler_setup/insight_trace/templates_pkg/project.py b/simpler_setup/insight_trace/templates_pkg/project.py new file mode 100644 index 000000000..e338c7dbc --- /dev/null +++ b/simpler_setup/insight_trace/templates_pkg/project.py @@ -0,0 +1,175 @@ +from __future__ import annotations + +from ..models import TraceConfig +from .common import _arg_to_json, _require_kernel + +def render_cmake(config: TraceConfig) -> str: + arch = config.platform_arch + runtime_includes = "\n".join(f" ${{REPO_ROOT}}/{r}" for r in arch.runtime_include_roots) + platform_includes = "\n".join(f" ${{REPO_ROOT}}/{p}" for p in arch.platform_include_roots) + return f"""cmake_minimum_required(VERSION 3.16) + +set(CMAKE_C_COMPILER bisheng) +set(CMAKE_CXX_COMPILER bisheng) + +project(insight_trace_replay) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +if(NOT DEFINED ENV{{ASCEND_HOME_PATH}}) + message(FATAL_ERROR "ASCEND_HOME_PATH is not set (source CANN set_env.sh first)") +endif() +set(ASCEND_HOME_PATH $ENV{{ASCEND_HOME_PATH}}) +set(SOC_VERSION {arch.soc_version} CACHE STRING "Simulator SoC version") +set(PTO_ISA_ROOT $ENV{{PTO_ISA_ROOT}} CACHE PATH "PTO ISA root") +set(REPO_ROOT $ENV{{REPO_ROOT}} CACHE PATH "simpler repo root") + +add_compile_options( + -D_FORTIFY_SOURCE=2 -O2 -std=c++17 + -Wno-macro-redefined -Wno-ignored-attributes + -fstack-protector-strong -fPIC +) +add_link_options(-s -Wl,-z,relro -Wl,-z,now) + +set(CMAKE_CCE_COMPILE_OPTIONS + -xcce -fenable-matrix --cce-aicore-enable-tl -fPIC + -Xhost-start -Xhost-end + "SHELL:-mllvm -cce-aicore-stack-size=0x8000" + "SHELL:-mllvm -cce-aicore-function-stack-size=0x8000" + "SHELL:-mllvm -cce-aicore-record-overflow=true" + "SHELL:-mllvm -cce-aicore-addr-transform" + "SHELL:-mllvm -cce-aicore-dcci-insert-for-scalar=false" +) +set(CMAKE_CPP_COMPILE_OPTIONS + -xc++ + "SHELL:-include stdint.h" + "SHELL:-include stddef.h" +) + +set(COMMON_INCLUDES + ${{PTO_ISA_ROOT}}/include + ${{PTO_ISA_ROOT}}/include/pto +{runtime_includes} + ${{REPO_ROOT}}/src/common/task_interface +{platform_includes} + ${{REPO_ROOT}}/simpler_setup/incore + ${{ASCEND_HOME_PATH}}/pkg_inc + ${{ASCEND_HOME_PATH}}/pkg_inc/profiling + ${{ASCEND_HOME_PATH}}/pkg_inc/runtime/runtime + ${{ASCEND_HOME_PATH}}/include +) + +add_library(replay_kernel SHARED replay_kernel.cpp replay_launch.cpp) +target_compile_options(replay_kernel PRIVATE + ${{CMAKE_CCE_COMPILE_OPTIONS}} + --cce-aicore-arch={arch.cce_aicore_arch} + -DREGISTER_BASE -std=c++17) +target_include_directories(replay_kernel PRIVATE ${{COMMON_INCLUDES}}) +target_link_options(replay_kernel PRIVATE --cce-fatobj-link) + +add_executable(replay_host replay_host.cpp) +target_compile_options(replay_host PRIVATE ${{CMAKE_CPP_COMPILE_OPTIONS}}) +target_include_directories(replay_host PRIVATE ${{COMMON_INCLUDES}}) +target_link_directories(replay_host PUBLIC + ${{ASCEND_HOME_PATH}}/lib64 + ${{ASCEND_HOME_PATH}}/aarch64-linux/simulator/${{SOC_VERSION}}/lib +) +target_link_libraries(replay_host PRIVATE + replay_kernel + runtime_camodel + stdc++ ascendcl m tiling_api platform c_sec dl nnopbase +) +""" +def render_run_collect(config: TraceConfig) -> str: + cann_default = str(config.cann_home) if config.cann_home else "" + pto_default = str(config.pto_isa_root) if config.pto_isa_root else "" + return f"""#!/usr/bin/env bash +set -euo pipefail + +CANN_HOME="${{CANN_HOME:-{cann_default}}}" +PTO_ISA_ROOT="${{PTO_ISA_ROOT:-{pto_default}}}" +REPO_ROOT="${{REPO_ROOT:-{config.repo_root}}}" +: "${{CANN_HOME:?CANN_HOME must be set}}" +: "${{PTO_ISA_ROOT:?PTO_ISA_ROOT must be set}}" +: "${{REPO_ROOT:?REPO_ROOT must be set}}" + +WS="$(cd "$(dirname "$(readlink -f "$0")")" && pwd)" +SOC_VERSION="${{SOC_VERSION:-{config.platform_arch.soc_version}}}" +DEVICE_ID="${{TARGET_DEVICE_ID:-${{NPU_LOCKED_DEVICE:-{config.device}}}}}" +BUILD_DIR="$WS/build" +COLLECT_DIR="$WS/msprof_collect" +EXPORT_ROOT="$WS/insight_export" + +source "$CANN_HOME/../cann/set_env.sh" 2>/dev/null \ + || source "$CANN_HOME/set_env.sh" +export ASCEND_HOME_PATH="$CANN_HOME" +SIM_LIB_DIR="$CANN_HOME/aarch64-linux/simulator/$SOC_VERSION/lib" +export LD_LIBRARY_PATH="$BUILD_DIR:$SIM_LIB_DIR:$CANN_HOME/lib64:$CANN_HOME/aarch64-linux/devlib:$CANN_HOME/devlib:${{LD_LIBRARY_PATH:-}}" +export ACL_DEVICE_ID="$DEVICE_ID" +mkdir -p "$BUILD_DIR" "$COLLECT_DIR" "$EXPORT_ROOT" +chmod 700 "$COLLECT_DIR" "$EXPORT_ROOT" + +cmake -G Ninja -S "$WS" -B "$BUILD_DIR" \ + -DSOC_VERSION="$SOC_VERSION" \ + -DPTO_ISA_ROOT="$PTO_ISA_ROOT" \ + -DREPO_ROOT="$REPO_ROOT" +cmake --build "$BUILD_DIR" --target replay_host + +msprof op simulator \ + --application="$BUILD_DIR/replay_host" \ + --kernel-name="replay_entry" \ + --launch-count={config.launch_count} \ + --soc-version="$SOC_VERSION" \ + --timeout={config.timeout} \ + --output="$COLLECT_DIR/out" \ + 2>&1 | tee "$COLLECT_DIR/msprof_collect.log" + +OPPROF_DIR="$(find "$COLLECT_DIR/out" -maxdepth 1 -mindepth 1 -type d -name 'OPPROF_*' | sort | tail -n 1)" +test -n "$OPPROF_DIR" +if [[ -d "$OPPROF_DIR/device0/tmp_dump" ]]; then + EXPORT_SRC="$OPPROF_DIR/device0/tmp_dump" +else + EXPORT_SRC="$OPPROF_DIR/dump" +fi + +msprof op simulator --export="$EXPORT_SRC" --output="$EXPORT_ROOT" \ + 2>&1 | tee "$EXPORT_ROOT/msprof_export.log" +""" +def render_config(config: TraceConfig) -> dict: + kernel = _require_kernel(config) + result = { + "backend": config.backend.value, + "test_module": str(config.test_module) if config.test_module else None, + "case": config.case_name, + "kernel": { + "name": kernel.name, + "func_id": kernel.func_id, + "core_type": kernel.core_type, + "source": str(kernel.source_path), + "shape": kernel.shape.value, + }, + "replay": { + "hw_block_num": config.hw_block_num, + "soc_version": config.platform_arch.soc_version, + "platform": config.platform_arch.family.value, + "timeout": config.timeout, + "launch_count": config.launch_count, + }, + "args": [_arg_to_json(arg) for arg in config.args], + } + if config.spmd_meta is not None: + result["spmd"] = { + "hw_block_dim": config.spmd_meta.hw_block_dim, + "aiv_lanes_per_core": config.spmd_meta.aiv_lanes_per_core, + "fifo_sizes": list(config.spmd_meta.fifo_sizes), + "dispatches": [ + { + "logical_block_num": dispatch.logical_block_num, + "scalar_overrides": [[index, value] for index, value in dispatch.scalar_overrides], + } + for dispatch in config.spmd_meta.dispatches + ], + } + return result diff --git a/simpler_setup/insight_trace/workspace.py b/simpler_setup/insight_trace/workspace.py new file mode 100644 index 000000000..57937f593 --- /dev/null +++ b/simpler_setup/insight_trace/workspace.py @@ -0,0 +1,39 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +from __future__ import annotations + +import json +import stat +from pathlib import Path + +from .models import TraceConfig, TraceResult +from .templates import render_cmake, render_config, render_host, render_kernel, render_launch, render_run_collect + + +def create_workspace(config: TraceConfig) -> TraceResult: + config.output_dir.mkdir(parents=True, exist_ok=True) + files = { + "replay_kernel.cpp": render_kernel(config), + "replay_launch.cpp": render_launch(config), + "replay_host.cpp": render_host(config), + "CMakeLists.txt": render_cmake(config), + "run_collect.sh": render_run_collect(config), + "insight_trace_config.json": json.dumps(render_config(config), indent=2) + "\n", + } + for name, content in files.items(): + path = config.output_dir / name + path.write_text(content) + if name == "run_collect.sh": + path.chmod(path.stat().st_mode | stat.S_IXUSR) + return TraceResult( + workspace_dir=config.output_dir, + simulator_dir=None, + collect_log=config.output_dir / "msprof_collect" / "msprof_collect.log", + export_log=config.output_dir / "insight_export" / "msprof_export.log", + ) diff --git a/simpler_setup/tools/insight_trace.py b/simpler_setup/tools/insight_trace.py new file mode 100644 index 000000000..61af69179 --- /dev/null +++ b/simpler_setup/tools/insight_trace.py @@ -0,0 +1,12 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +from simpler_setup.insight_trace.cli import main + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/ut/py/test_insight_trace_core.py b/tests/ut/py/test_insight_trace_core.py new file mode 100644 index 000000000..e66486af6 --- /dev/null +++ b/tests/ut/py/test_insight_trace_core.py @@ -0,0 +1,672 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +import os +import subprocess +import sys +import textwrap +from argparse import Namespace +from pathlib import Path +from unittest.mock import Mock + +import pytest + +from simpler_setup.insight_trace import cli as insight_trace_cli +from simpler_setup.insight_trace.arg_resolver import load_kernel_dump_args, resolve_args +from simpler_setup.insight_trace.case_loader import load_scene_case +from simpler_setup.insight_trace.kernel_analyzer import select_kernel +from simpler_setup.insight_trace.models import ( + KernelShape, + KernelSpec, + PlatformArch, + PlatformFamily, + SPMDDispatch, + SPMDReplayMeta, + TraceBackend, + TraceConfig, + TraceScalarArg, + TraceTensorArg, +) +from simpler_setup.insight_trace.templates import render_host, render_kernel, render_launch +from simpler_setup.insight_trace.workspace import create_workspace + + +def _a5_bgemm_module() -> Path: + return Path(__file__).resolve().parents[3] / "examples/a5/tensormap_and_ringbuffer/bgemm/test_bgemm.py" + + +def _a5_platform() -> PlatformArch: + return PlatformArch.for_family(PlatformFamily.A5) + + +def _paged_attention_module() -> Path: + return Path(__file__).resolve().parents[3] / "examples/a2a3/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py" + + +def _a5_paged_attention_module() -> Path: + return Path(__file__).resolve().parents[3] / "examples/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py" + + +def _vector_example_module() -> Path: + return Path(__file__).resolve().parents[3] / "tests/st/a2a3/host_build_graph/vector_example/test_vector_example.py" + + +def _a5_spmd_multiblock_mix_module() -> Path: + return Path(__file__).resolve().parents[3] / "tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py" + + +def _spmd_kernel() -> KernelSpec: + path = ( + Path(__file__).resolve().parents[3] + / "tests/st/a2a3/tensormap_and_ringbuffer/spmd_paged_attention/kernels/mix/paged_attention_parallel.cpp" + ) + return KernelSpec("MixedKernels", 0, "mix", path, KernelShape.SPMD_MIX) + + +def _spmd_config(tmp_path: Path) -> TraceConfig: + return TraceConfig( + backend=TraceBackend.SIMPLER, + test_module=None, + case_name="spmd", + kernel_spec=_spmd_kernel(), + args=( + TraceTensorArg(0, "query", "BFLOAT16", (1, 128)), + TraceTensorArg(6, "sij_fifo", "FLOAT32", (1,)), + TraceScalarArg(9, "scale_value", "FLOAT32_BITS", 1065353216, "bits"), + TraceScalarArg(10, "num_heads", "UINT64", 16), + ), + output_dir=tmp_path, + repo_root=tmp_path, + cann_home=None, + pto_isa_root=None, + platform_arch=PlatformArch.for_family(PlatformFamily.A2A3), + hw_block_num=24, + spmd_meta=SPMDReplayMeta( + hw_block_dim=24, + aiv_lanes_per_core=2, + fifo_sizes=(65536, 32768, 65536), + ), + ) + + +def _a5_spmd_config(tmp_path: Path) -> TraceConfig: + context = load_scene_case(_a5_spmd_multiblock_mix_module(), "Case1") + kernel = select_kernel(context, kernel_name="SPMD_MIX_AIC") + return TraceConfig( + backend=TraceBackend.SIMPLER, + test_module=_a5_spmd_multiblock_mix_module(), + case_name="Case1", + kernel_spec=kernel, + args=( + TraceTensorArg(0, "output", "FLOAT32", (4512,), role="inout"), + TraceScalarArg(1, "base_cl", "UINT64", 0), + ), + output_dir=tmp_path, + repo_root=tmp_path, + cann_home=None, + pto_isa_root=None, + platform_arch=_a5_platform(), + hw_block_num=94, + spmd_meta=SPMDReplayMeta( + hw_block_dim=24, + aiv_lanes_per_core=2, + dispatches=( + SPMDDispatch(2, ((1, 0),)), + SPMDDispatch(8, ((1, 6),)), + SPMDDispatch(12, ((1, 30),)), + SPMDDispatch(24, ((1, 66),)), + SPMDDispatch(48, ((1, 138),)), + ), + ), + ) + + +def _a5_bgemm_config(tmp_path: Path) -> TraceConfig: + context = load_scene_case(_a5_bgemm_module(), "default") + kernel = select_kernel(context, kernel_name="GEMM") + args = resolve_args( + None, + None, + arg_spec=_write_arg_spec( + tmp_path, + '{"args":[' + '{"kind":"tensor","index":0,"name":"A","dtype":"FLOAT32","shape":[131072]},' + '{"kind":"tensor","index":1,"name":"B","dtype":"FLOAT32","shape":[131072]},' + '{"kind":"tensor","index":2,"name":"C","dtype":"FLOAT32","shape":[131072]}' + ']}' + ), + ) + return TraceConfig( + backend=TraceBackend.SIMPLER, + test_module=_a5_bgemm_module(), + case_name="default", + kernel_spec=kernel, + args=args, + output_dir=tmp_path, + repo_root=tmp_path, + cann_home=None, + pto_isa_root=None, + platform_arch=_a5_platform(), + hw_block_num=1, + ) + + +def _write_arg_spec(tmp_path: Path, contents: str) -> Path: + spec = tmp_path / "args.json" + spec.write_text(contents) + return spec + + +def test_selects_a5_bgemm_as_spmd_mix(): + context = load_scene_case(_a5_bgemm_module(), "default") + kernel = select_kernel(context, kernel_name="GEMM") + assert kernel.func_id == 0 + assert kernel.shape == KernelShape.SPMD_MIX + + +def test_selects_a5_spmd_multiblock_mix_as_spmd_mix(): + context = load_scene_case(_a5_spmd_multiblock_mix_module(), "Case1") + kernel = select_kernel(context, kernel_name="SPMD_MIX_AIC") + assert kernel.func_id == 0 + assert kernel.shape == KernelShape.SPMD_MIX + + +def test_run_simpler_calls_workspace_and_runner_for_non_dry_run(monkeypatch, tmp_path): + context = load_scene_case(_a5_bgemm_module(), "default") + kernel = select_kernel(context, kernel_name="GEMM") + trace_args = resolve_args( + None, + None, + _write_arg_spec( + tmp_path, + '{"args":[' + '{"kind":"tensor","index":0,"name":"A","dtype":"FLOAT32","shape":[131072]},' + '{"kind":"tensor","index":1,"name":"B","dtype":"FLOAT32","shape":[131072]},' + '{"kind":"tensor","index":2,"name":"C","dtype":"FLOAT32","shape":[131072]}' + ']}' + ), + ) + args = Namespace( + test_module=_a5_bgemm_module(), + case="default", + kernel="GEMM", + func_id=None, + kernel_source=None, + platform="a5", + runtime="tensormap_and_ringbuffer", + output_dir=tmp_path, + cann_home=None, + pto_isa_root=None, + soc_version=None, + device=0, + launch_count=1, + timeout=120, + hw_block_num=1, + arg_spec=None, + dump_dir=None, + dispatch_id=None, + dry_run=False, + ptoas_root=None, + source_cpp=None, + kernel_base_name=None, + aicore_arch=None, + kernel_symbol=None, + backend=TraceBackend.SIMPLER.value, + ) + + create_mock = Mock(return_value="workspace") + run_result = type( + "Result", + (), + {"workspace_dir": tmp_path, "simulator_dir": tmp_path / "insight_export" / "OPPROF_x" / "simulator"}, + )() + run_mock = Mock(return_value=run_result) + monkeypatch.setattr(insight_trace_cli, "create_workspace", create_mock) + monkeypatch.setattr(insight_trace_cli, "run_workspace", run_mock) + monkeypatch.setattr(insight_trace_cli, "load_scene_case", lambda test_module, case: context) + monkeypatch.setattr( + insight_trace_cli, + "select_kernel", + lambda selected_context, kernel_name, func_id, kernel_source: kernel, + ) + monkeypatch.setattr(insight_trace_cli, "validate_single_task_kernel", lambda selected_kernel: None) + monkeypatch.setattr( + insight_trace_cli, + "resolve_args", + lambda selected_context, selected_kernel, arg_spec, dump_dir, dispatch_id: trace_args, + ) + + result = insight_trace_cli._run_simpler(args) + + assert result == run_result + create_mock.assert_called_once() + run_mock.assert_called_once() + + +def test_run_simpler_stops_after_workspace_for_dry_run(monkeypatch, tmp_path): + context = load_scene_case(_a5_bgemm_module(), "default") + kernel = select_kernel(context, kernel_name="GEMM") + trace_args = resolve_args( + None, + None, + _write_arg_spec( + tmp_path, + '{"args":[' + '{"kind":"tensor","index":0,"name":"A","dtype":"FLOAT32","shape":[131072]},' + '{"kind":"tensor","index":1,"name":"B","dtype":"FLOAT32","shape":[131072]},' + '{"kind":"tensor","index":2,"name":"C","dtype":"FLOAT32","shape":[131072]}' + ']}' + ), + ) + args = Namespace( + test_module=_a5_bgemm_module(), + case="default", + kernel="GEMM", + func_id=None, + kernel_source=None, + platform="a5", + runtime="tensormap_and_ringbuffer", + output_dir=tmp_path, + cann_home=None, + pto_isa_root=None, + soc_version=None, + device=0, + launch_count=1, + timeout=120, + hw_block_num=1, + arg_spec=None, + dump_dir=None, + dispatch_id=None, + dry_run=True, + ptoas_root=None, + source_cpp=None, + kernel_base_name=None, + aicore_arch=None, + kernel_symbol=None, + backend=TraceBackend.SIMPLER.value, + ) + + create_result = type("Result", (), {"workspace_dir": tmp_path, "simulator_dir": None})() + create_mock = Mock(return_value=create_result) + run_mock = Mock() + monkeypatch.setattr(insight_trace_cli, "create_workspace", create_mock) + monkeypatch.setattr(insight_trace_cli, "run_workspace", run_mock) + monkeypatch.setattr(insight_trace_cli, "load_scene_case", lambda test_module, case: context) + monkeypatch.setattr( + insight_trace_cli, + "select_kernel", + lambda selected_context, kernel_name, func_id, kernel_source: kernel, + ) + monkeypatch.setattr(insight_trace_cli, "validate_single_task_kernel", lambda selected_kernel: None) + monkeypatch.setattr( + insight_trace_cli, + "resolve_args", + lambda selected_context, selected_kernel, arg_spec, dump_dir, dispatch_id: trace_args, + ) + + result = insight_trace_cli._run_simpler(args) + + assert result == create_result + create_mock.assert_called_once() + run_mock.assert_not_called() + + +def test_loads_paged_attention_case(): + context = load_scene_case(_paged_attention_module(), "CaseSmall1") + assert context.case["params"]["block_size"] == 16 + assert context.runtime == "tensormap_and_ringbuffer" + + +def test_selects_sf_as_aiv_only(): + context = load_scene_case(_paged_attention_module(), "CaseSmall1") + kernel = select_kernel(context, kernel_name="SF") + assert kernel.func_id == 1 + assert kernel.shape == KernelShape.AIV_ONLY + + +def test_resolves_a5_qk_recipe_includes_shape_scalars(): + context = load_scene_case(_a5_paged_attention_module(), "SmallCase1") + kernel = select_kernel(context, kernel_name="QK") + args = resolve_args(context, kernel) + assert args[0] == TraceTensorArg(0, "qi", "BFLOAT16", (16, 16)) + assert args[1] == TraceTensorArg(1, "kj", "BFLOAT16", (16, 16)) + assert args[2] == TraceTensorArg(2, "sij", "FLOAT32", (16, 16)) + assert args[3] == TraceScalarArg(4, "head_dim", "UINT64", 16) + assert args[4] == TraceScalarArg(5, "block_size", "UINT64", 16) + + +def test_resolves_generic_vector_example_args(): + context = load_scene_case(_vector_example_module(), "default") + kernel = select_kernel(context, func_id=0) + args = resolve_args(context, kernel) + assert args == ( + TraceTensorArg(0, "a", "FLOAT32", (128 * 128,), role="input"), + TraceTensorArg(1, "b", "FLOAT32", (128 * 128,), role="input"), + TraceTensorArg(2, "f", "FLOAT32", (128 * 128,), role="output"), + ) + + +def test_load_kernel_dump_args_skips_context_slots(tmp_path): + dump = tmp_path / "tensor_dump" + dump.mkdir() + (dump / "kernel_args_dump.json").write_text( + """ + { + "dispatches": [ + {"dispatch_id": 7, "func_id": 3, "args": [ + {"arg_index": 2, "kind": "tensor", "dtype": "FLOAT32", "shape": [16]}, + {"arg_index": 48, "kind": "local_context", "dtype": "UINT64"}, + {"arg_index": 0, "kind": "tensor", "dtype": "BFLOAT16", "shape": [16, 16]}, + {"arg_index": 3, "kind": "scalar", "dtype": "UINT64", "value": 1, "pack_mode": "bits"}, + {"arg_index": 49, "kind": "global_context", "dtype": "UINT64"} + ]} + ] + } + """ + ) + args = load_kernel_dump_args(tmp_path, func_id=3, dispatch_id=7) + assert args == ( + TraceTensorArg(0, "arg0", "BFLOAT16", (16, 16)), + TraceTensorArg(2, "arg2", "FLOAT32", (16,)), + TraceScalarArg(3, "arg3", "UINT64", 1, "bits"), + ) + + +def test_load_kernel_dump_args_requires_matching_func_and_dispatch(tmp_path): + (tmp_path / "kernel_args_dump.json").write_text('{"dispatches":[{"dispatch_id":1,"func_id":2,"args":[]}]}') + with pytest.raises(ValueError, match="func_id=3, dispatch_id=1"): + load_kernel_dump_args(tmp_path, func_id=3, dispatch_id=1) + + +def test_resolves_spmd_mix_recipe(tmp_path): + context = load_scene_case( + Path(__file__).resolve().parents[3] + / "tests/st/a2a3/tensormap_and_ringbuffer/spmd_paged_attention/test_spmd_paged_attention.py", + "Case1", + ) + kernel = select_kernel(context, kernel_name="PA_AIC") + args = resolve_args(context, kernel) + assert args[0] == TraceTensorArg(0, "query", "BFLOAT16", (256, 16, 128)) + assert args[6] == TraceTensorArg(6, "sij_fifo", "FLOAT32", (1,)) + assert args[9] == TraceScalarArg(9, "scale_value", "FLOAT32_BITS", 1065353216, "bits") + assert args[15] == TraceScalarArg(15, "total_logical_blocks", "UINT64", 256) + assert args[16] == TraceScalarArg(16, "q_tile", "UINT64", 16) + + +def test_resolves_a5_spmd_multiblock_mix_recipe(): + context = load_scene_case(_a5_spmd_multiblock_mix_module(), "Case1") + kernel = select_kernel(context, kernel_name="SPMD_MIX_AIC") + args = resolve_args(context, kernel) + assert args == ( + TraceTensorArg(0, "output", "FLOAT32", (4512,), role="inout"), + TraceScalarArg(1, "base_cl", "UINT64", 0), + ) + + +def test_render_a5_bgemm_uses_a5_platform_values(tmp_path): + config = _a5_bgemm_config(tmp_path) + kernel_rendered = render_kernel(config) + launch_rendered = render_launch(config) + host_rendered = render_host(config) + workspace = create_workspace(config) + + assert "#define __CCE_AICORE__ 310" in kernel_rendered + assert "#define PTO_NPU_ARCH_A5" in kernel_rendered + assert "#define EVENT_ID7 ((::event_t)7)" in kernel_rendered + assert "__gm__ int64_t *aic_args, __gm__ int64_t *aiv_args" in kernel_rendered + assert "launch_replay(void *aic_args, void *aiv_args, void *stream)" in launch_rendered + assert "std::vector aic_args(kHwBlocks * kArgsSlots, 0);" in host_rendered + assert "std::vector aiv_args(kAivRows * kArgsSlots, 0);" in host_rendered + assert "launch_replay(d_aic_args, d_aiv_args, stream);" in host_rendered + assert "dav-c310" in (workspace.workspace_dir / "CMakeLists.txt").read_text() + assert "src/a5/runtime/tensormap_and_ringbuffer/runtime" in (workspace.workspace_dir / "CMakeLists.txt").read_text() + assert 'SOC_VERSION="${SOC_VERSION:-dav_3510}"' in (workspace.workspace_dir / "run_collect.sh").read_text() + assert '"platform": "a5"' in (workspace.workspace_dir / "insight_trace_config.json").read_text() + + +def test_render_spmd_launch_uses_dual_arg_signature(tmp_path): + rendered = render_launch(_spmd_config(tmp_path)) + assert "launch_replay(void *aic_args, void *aiv_args, void *stream)" in rendered + assert "(__gm__ int64_t *)aic_args, (__gm__ int64_t *)aiv_args" in rendered + + +def test_render_spmd_host_includes_context_and_dual_rows(tmp_path): + rendered = render_host(_spmd_config(tmp_path)) + assert "#include \"intrinsic.h\"" in rendered + assert "std::vector aic_local(kHwBlocks);" in rendered + assert "std::vector aiv_global(kAivRows);" in rendered + assert "std::vector aic_args(kHwBlocks * kArgsSlots, 0);" in rendered + assert "std::vector aiv_args(kAivRows * kArgsSlots, 0);" in rendered + assert "row[48] = static_cast(reinterpret_cast(d_aic_local) + r * sizeof(LocalContext));" in rendered + assert "row[49] = static_cast(reinterpret_cast(d_aiv_global) + r * sizeof(GlobalContext));" in rendered + + +def test_render_a5_spmd_host_uses_dispatch_rows_and_a5_context_fields(tmp_path): + rendered = render_host(_a5_spmd_config(tmp_path)) + assert "total_rows += dispatch.logical_block_num;" in rendered + assert "aic_local[row_index].s_block_idx = block_idx;" in rendered + assert "aic_local[row_index].s_block_num = dispatch.logical_block_num;" in rendered + assert "aiv_global[lane_row_index].sub_block_id = lane;" in rendered + assert "std::vector aic_args(total_rows * kArgsSlots, 0);" in rendered + assert "std::vector aiv_args(total_rows * kAivLanesPerCore * kArgsSlots, 0);" in rendered + assert "row[arg_index] = value;" in rendered + assert "lane_row[arg_index] = value;" in rendered + + +def test_render_host_rejects_duplicate_arg_index(tmp_path): + config = TraceConfig( + backend=TraceBackend.SIMPLER, + test_module=None, + case_name="case", + kernel_spec=None, + args=(TraceScalarArg(1, "a", "UINT64", 1), TraceScalarArg(1, "b", "UINT64", 2)), + output_dir=tmp_path, + repo_root=tmp_path, + cann_home=None, + pto_isa_root=None, + ) + with pytest.raises(ValueError, match="Duplicate arg index"): + render_host(config) + + +def test_render_spmd_kernel_uses_dual_arg_wrapper(tmp_path): + rendered = render_kernel(_spmd_config(tmp_path)) + assert "__gm__ int64_t *aic_args, __gm__ int64_t *aiv_args" in rendered + assert "get_block_idx() * get_subblockdim() + get_subblockid()" in rendered + assert "static_cast(hw_idx) * 50" in rendered + + +def test_a5_spmd_meta_and_hw_block_num_are_configured(): + context = load_scene_case(_a5_spmd_multiblock_mix_module(), "Case1") + kernel = select_kernel(context, kernel_name="SPMD_MIX_AIC") + meta = insight_trace_cli._spmd_meta(kernel, _a5_platform()) + assert meta is not None + assert meta.dispatches == ( + SPMDDispatch(2, ((1, 0),)), + SPMDDispatch(8, ((1, 6),)), + SPMDDispatch(12, ((1, 30),)), + SPMDDispatch(24, ((1, 66),)), + SPMDDispatch(48, ((1, 138),)), + ) + hw_block_num = insight_trace_cli._hw_block_num(Namespace(platform="a5", hw_block_num=1), kernel) + assert hw_block_num == 94 + + +@pytest.mark.skipif( + os.environ.get("SIMPLER_RUN_INSIGHT_E2E") != "1", + reason="Set SIMPLER_RUN_INSIGHT_E2E=1 to run full SPMD E2E (build + collect + export)", +) +def test_spmd_e2e_full(tmp_path): + """End-to-end SPMD mix insight trace: workspace generation -> collect -> export -> artifact validation. + + Skipped by default. Enable with: + SIMPLER_RUN_INSIGHT_E2E=1 pytest tests/ut/py/test_insight_trace_core.py::test_spmd_e2e_full + + Requires: CANN installed, msprof available, PTO_ISA_ROOT set. + """ + import os + import subprocess + import sys + + # Absolute path to the real kernel source (matches existing _spmd_kernel) + repo_root = Path(__file__).resolve().parents[3] + kernel_source_rel = ( + "tests/st/a2a3/tensormap_and_ringbuffer/spmd_paged_attention/kernels/mix/paged_attention_parallel.cpp" + ) + kernel_source = repo_root / kernel_source_rel + + # Minimal SceneTestCase module with a tiny SPMD case (batch=1, num_heads=1, head_dim=128, block_size=128) + # so the simulator finishes quickly. + module_content = textwrap.dedent(f"""\ + # Auto-generated for SPMD E2E test + import torch + from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test + + @scene_test(level=2, runtime="tensormap_and_ringbuffer") + class TestSPMDE2E(SceneTestCase): + CALLABLE = {{ + "incores": [ + {{ + "func_id": 0, + "name": "PA_AIC", + "source": "{kernel_source}", + "core_type": "aic", + }}, + {{ + "func_id": 1, + "name": "PA_AIV", + "source": "{kernel_source}", + "core_type": "aiv", + }}, + ], + }} + CASES = [ + {{ + "name": "Tiny", + "platforms": ["a2a3"], + "config": {{"aicpu_thread_num": 1, "block_dim": 2}}, + "params": {{ + "batch": 1, + "num_heads": 1, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 128, + "context_len": 128, + "max_model_len": 128, + "dtype": "bfloat16", + }}, + }}, + ] + def generate_args(self, params): + result = [ + ("query", torch.empty(1, 1, 128, dtype=torch.bfloat16)), + ("key_cache", torch.empty(1, 128, 1, 128, dtype=torch.bfloat16)), + ("value_cache", torch.empty(1, 128, 1, 128, dtype=torch.bfloat16)), + ("block_table", torch.zeros(1, 1, dtype=torch.int32)), + ("context_lens", torch.tensor([128], dtype=torch.int32)), + ("out", torch.zeros(1, 1, 128, dtype=torch.float32)), + ("scale", torch.tensor(1.0)), + ] + specs = [] + for name, value in result: + if isinstance(value, torch.Tensor): + specs.append(Tensor(name, value)) + else: + specs.append(Scalar(name, value)) + return TaskArgsBuilder(*specs) + """) + module_file = tmp_path / "test_spmd_e2e_module.py" + module_file.write_text(module_content) + + arg_spec = tmp_path / "spmd_args.json" + arg_spec.write_text( + textwrap.dedent( + """\ + { + "args": [ + {"kind": "tensor", "index": 0, "name": "query", "dtype": "BFLOAT16", "shape": [1, 1, 128]}, + {"kind": "tensor", "index": 1, "name": "key_cache", "dtype": "BFLOAT16", "shape": [1, 128, 1, 128]}, + {"kind": "tensor", "index": 2, "name": "value_cache", "dtype": "BFLOAT16", "shape": [1, 128, 1, 128]}, + {"kind": "tensor", "index": 3, "name": "block_table", "dtype": "INT32", "shape": [1, 1]}, + {"kind": "tensor", "index": 4, "name": "context_lens", "dtype": "INT32", "shape": [1]}, + {"kind": "tensor", "index": 5, "name": "out", "dtype": "FLOAT32", "shape": [1, 1, 128]}, + {"kind": "tensor", "index": 6, "name": "sij_fifo", "dtype": "FLOAT32", "shape": [1]}, + {"kind": "tensor", "index": 7, "name": "pij_fifo", "dtype": "BFLOAT16", "shape": [1]}, + {"kind": "tensor", "index": 8, "name": "oi_fifo", "dtype": "FLOAT32", "shape": [1]}, + {"kind": "scalar", "index": 9, "name": "scale_value", "dtype": "FLOAT32_BITS", "value": 1065353216, "pack_mode": "bits"}, + {"kind": "scalar", "index": 10, "name": "num_heads", "dtype": "UINT64", "value": 1}, + {"kind": "scalar", "index": 11, "name": "head_dim", "dtype": "UINT64", "value": 128}, + {"kind": "scalar", "index": 12, "name": "block_size", "dtype": "UINT64", "value": 128}, + {"kind": "scalar", "index": 13, "name": "max_num_blocks_per_req", "dtype": "UINT64", "value": 1}, + {"kind": "scalar", "index": 14, "name": "q_loop", "dtype": "UINT64", "value": 1}, + {"kind": "scalar", "index": 15, "name": "total_logical_blocks", "dtype": "UINT64", "value": 1}, + {"kind": "scalar", "index": 16, "name": "q_tile", "dtype": "UINT64", "value": 1} + ] + } + """ + ) + ) + + ws_dir = tmp_path / "ws" + ws_dir.mkdir() + + pto_isa_root = os.environ.get("PTO_ISA_ROOT") + cann_home = os.environ.get("CANN_HOME") + + # Step 1: generate workspace + env = os.environ.copy() + env["PYTHONPATH"] = str(repo_root) + os.pathsep + env.get("PYTHONPATH", "") + gen_result = subprocess.run( + [ + sys.executable, "-m", "simpler_setup.tools.insight_trace", + str(module_file), + "--case", "Tiny", + "--kernel", "PA_AIC", + "--output-dir", str(ws_dir), + "--arg-spec", str(arg_spec), + ], + capture_output=True, + text=True, + env=env, + ) + assert gen_result.returncode == 0, f"workspace generation failed: {gen_result.stderr}" + + assert (ws_dir / "replay_kernel.cpp").is_file() + assert (ws_dir / "replay_launch.cpp").is_file() + assert (ws_dir / "replay_host.cpp").is_file() + assert (ws_dir / "run_collect.sh").is_file() + + # Step 2: run_collect.sh (build + collect + export) + env = os.environ.copy() + env["PYTHONPATH"] = str(repo_root) + os.pathsep + env.get("PYTHONPATH", "") + if pto_isa_root: + env["PTO_ISA_ROOT"] = pto_isa_root + if cann_home: + env["CANN_HOME"] = cann_home + env["REPO_ROOT"] = str(repo_root) + + collect_result = subprocess.run( + ["bash", str(ws_dir / "run_collect.sh")], + capture_output=True, + text=True, + timeout=900, + env=env, + ) + assert collect_result.returncode == 0, f"collect failed: {collect_result.stderr}" + + # Step 3: validate exported artifacts + export_root = ws_dir / "insight_export" + opp_dirs = sorted((export_root).glob("OPPROF_*/simulator")) + assert opp_dirs, f"No OPPROF simulator dir under {export_root}" + sim_dir = opp_dirs[-1] + + assert (sim_dir / "trace.json").is_file(), "missing simulator/trace.json" + assert (sim_dir / "visualize_data.bin").is_file(), "missing simulator/visualize_data.bin" + csv_files = list(sim_dir.glob("core*/*instr_exe*.csv")) + assert csv_files, "no instr_exe CSV files" + assert len(csv_files) >= 3, f"Expected at least 3 CSV files (AIC + 2 AIV lanes), got {len(csv_files)}" \ No newline at end of file