diff --git a/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/_index.md b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/_index.md new file mode 100644 index 0000000000..c4cd8b1889 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/_index.md @@ -0,0 +1,68 @@ +--- +title: Profile GPT-2 instruction mix with Arm Performix + +description: Learn how to profile GPT-2 inference on Arm Neoverse with the Arm Performix Instruction Mix recipe, identify scalar versus vector execution patterns, and improve throughput with NEON, SVE, and KleidiAI kernels. + +minutes_to_complete: 45 + +who_is_this_for: This is an introductory topic for developers who want to get started using the instruction mix recipe in Arm Performix through a practical example. + +learning_objectives: + - Explain how the Instruction Mix recipe combines static disassembly with runtime sampling to show execution behavior + - Build and run the GPT-2 inference example on an Arm Linux server + - Identify why matrix multiplication dominates runtime and how vectorization changes the instruction mix + - Compare throughput and instruction mix across scalar, NEON, SVE, and KleidiAI implementations + +prerequisites: + - Access to Arm Performix configured with a remote Arm Linux target. For setup, see the [Arm Performix install guide](/install-guides/performix/) + - Basic understanding of C++ and compiler optimization + - Basic understanding of matrix multiplication + - Basic understanding of writing SIMD code with Neon and/or SVE. + +author: + - Kieran Hejmadi + - Oliver Grainge + +### Tags +skilllevels: Introductory +subjects: Performance and Architecture +armips: + - Neoverse +tools_software_languages: + - Arm Performix + - C++ + - LLM + - NEON + - SVE +operatingsystems: + - Linux +further_reading: + - resource: + title: Arm Performix User Guide + link: https://developer.arm.com/documentation/110163/latest + type: documentation + - resource: + title: Find code hotspots with Arm Performix + link: /learning-paths/servers-and-cloud-computing/cpu_hotspot_performix/ + type: learning-path + - resource: + title: Identify code hotspots using Arm Performix through the Arm MCP Server + link: /learning-paths/servers-and-cloud-computing/performix-mcp-agent/ + type: learning-path + - resource: + title: Arm MCP Server GitHub Repository + link: https://github.com/arm/mcp + type: website + - resource: + title: GPT-2 Example repository + link: https://github.com/arm-education/GPT-2-Example + type: website + + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/_next-steps.md new file mode 100644 index 0000000000..727b395ddd --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # The weight controls the order of the pages. _index.md always has weight 1. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/code_hotspot.webp b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/code_hotspot.webp new file mode 100644 index 0000000000..27b9929efc Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/code_hotspot.webp differ diff --git a/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/code_hotspot_results.webp b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/code_hotspot_results.webp new file mode 100644 index 0000000000..996530fa2e Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/code_hotspot_results.webp differ diff --git a/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/configuring-performix.webp b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/configuring-performix.webp new file mode 100644 index 0000000000..ec339e933c Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/configuring-performix.webp differ diff --git a/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/dynamic-functions.webp b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/dynamic-functions.webp new file mode 100644 index 0000000000..ec6db728c7 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/dynamic-functions.webp differ diff --git a/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/gpt2-baseline.gif b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/gpt2-baseline.gif new file mode 100644 index 0000000000..5725fccfee Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/gpt2-baseline.gif differ diff --git a/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/hotspot.webp b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/hotspot.webp new file mode 100644 index 0000000000..3d1b929392 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/hotspot.webp differ diff --git a/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/how-to-1.md b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/how-to-1.md new file mode 100644 index 0000000000..da66d39212 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/how-to-1.md @@ -0,0 +1,44 @@ +--- +title: Background +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## What the instruction mix recipe shows + +The Arm Performix Instruction Mix recipe shows the types and proportions of machine instructions your workload executes at runtime and in static analysis, so you can see how efficiently your code uses Arm CPU hardware resources. + +The Instruction Mix recipe classifies each instruction into a group. The available groups depend on the Neoverse architecture version you are profiling. Therefore the categories you see may vary depending on the version of Arm Neoverse you are using. Typical categories include: + +- integer and floating-point arithmetic +- memory loads and stores (including exclusive operations) +- control flow instructions, such as branches and loops +- specialized instructions, such as cryptographic operations +- SIMD (Single Instruction, Multiple Data) instructions, including NEON (fixed 128-bit) and SVE (scalable vector length) + +The instruction mix result gives you two complementary views: + +- static analysis, which inspects compiled machine code without running it +- dynamic analysis, which measures instruction usage during real execution + +Together, these views help you verify whether architecture-specific features are actually active in hot code paths. + +## Why instruction mix is useful + +Instruction mix is useful when you need to confirm that performance-critical code uses Arm CPU features effectively. This is especially helpful when you are, for example, validating the effectiveness of compiler autovectorization. + +For example, if a hot function is mostly scalar at runtime when you expected NEON or SVE activity, that often indicates missed vectorization opportunities. You can then focus optimization work on compiler flags, data layout, loop structure, and kernel implementation to improve throughput where it matters most. + +## Why use a GPT-2 workload + +In this Learning Path, you run the [GPT-2 Medium](https://huggingface.co/openai-community/gpt2-medium) model on a minimal C++ inference engine to analyze instruction mix and throughput. This model is available under a [modified MIT License](https://github.com/openai/gpt-2/blob/master/LICENSE). You will confirm that matrix multiplication (`matmul`) is the hot path, then compare how scalar, NEON, and SVE implementations change instruction behavior and token generation speed. + +This example implements only the forward inference path, with no back propagation or training. You do not need to understand the full transformer architecture to complete this Learning Path. Familiarity with matrix multiplication is enough. For background on GPT-2, see the original 2019 paper, [Language Models are Unsupervised Multitask Learners](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) + +You will also try implementing your own `matmul` kernels that target NEON and SVE, then use instruction mix data to verify that these vector paths are active and improving throughput. + +## What you've learned and what's next + +In this section, you learned what instruction mix represents and why it is useful for LLM inference optimization on Arm. Next, you will set up the GPT-2 example, build the binaries, and run a baseline test. diff --git a/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/how-to-2.md b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/how-to-2.md new file mode 100644 index 0000000000..e5f848b2f5 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/how-to-2.md @@ -0,0 +1,112 @@ +--- +title: Set up and run GPT-2 baseline +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Prepare the environment + +Use an Arm Linux target, such as an Arm Neoverse cloud instance. The results in this Learning Path were collected on a Graviton 3 instance based on Neoverse V1 running Ubuntu 24.04 LTS. If you have not configured Arm Performix yet, complete setup and target connection using the [Arm Performix install guide](/install-guides/performix/). + +Install build prerequisites and clone the GPT-2 example repository: + +```bash +sudo apt update +sudo apt install -y git g++ cmake python3 python3-venv +git clone --recurse-submodules https://github.com/arm-education/GPT-2-Example.git +cd GPT-2-Example +git checkout tags/v0.0.2 +``` + +## Export GPT-2 model assets + +The C++ runtime expects exported model binaries. Create a Python virtual environment, install dependencies, and export GPT-2 Medium weights and vocabulary: + +This Learning Path uses [openai-community/gpt2-medium on Hugging Face](https://huggingface.co/openai-community/gpt2-medium), which corresponds to the GPT-2 Medium model from the original OpenAI GPT-2 release in 2019. The model has 355 million parameters, and in this workflow it runs with unquantized FP32 (32-bit floating-point) weights. + +```bash +python3 -m venv venv +source venv/bin/activate +pip install -r src/requirements.txt +python3 src/export_gpt2.py --model gpt2-medium +``` + +This creates: + +- `models/gpt2-medium/weights.bin` +- `models/gpt2-medium/vocab.bin` + +## Review the source code + +The `src/gpt2.cpp` file implements the end-to-end GPT-2 inference loop. Each generated token triggers a forward pass over all 24 transformer layers. Inside each layer, `matmul` is called multiple times: for the query/key/value projection, the attention output projection, and both feed-forward layers. It is called once more at the end for logits projection over the vocabulary: + +```cpp +// Attention QKV projection +matmul(s.qkv.data(), s.xb.data(), + w.c_attn_w.data()+(size_t)l*3*E*E, + w.c_attn_b.data()+(size_t)l*3*E, E, 3*E); + +// FFN expand +matmul(s.mlp_h.data(), s.xb.data(), + w.mlp_fc_w.data()+(size_t)l*4*E*E, + w.mlp_fc_b.data()+(size_t)l*4*E, E, 4*E); + +// Logits projection (vocab_size x n_embd) +matmul(s.logits.data(), s.x.data(), w.wte.data(), nullptr, E, cfg.vocab_size); +``` + +The `matmul` dispatch in `gpt2.cpp` selects a kernel at compile time based on a preprocessor flag: + +```cpp +static void matmul(float *out, const float *x, const float *W, const float *b, + int n_in, int n_out) { +#if defined(GPT2_KERNEL_NEON) + kernels::matmul_neon(out, x, W, b, n_in, n_out); +#elif defined(GPT2_KERNEL_SVE) + kernels::matmul_sve(out, x, W, b, n_in, n_out); +#elif defined(GPT2_KERNEL_USER) + kernels::matmul_user(out, x, W, b, n_in, n_out); +#else + kernels::matmul_ref(out, x, W, b, n_in, n_out); +#endif +} +``` + +The baseline kernel (`src/kernels/matmul_ref.cpp`) is a straightforward scalar nested for loop: for each output row, it walks the weight matrix row and accumulates a dot product with the input vector: + +```cpp +void matmul_ref(float *out, const float *x, const float *W, const float *b, + int n_in, int n_out) { + for (int i = 0; i < n_out; i++) { + float acc = b ? b[i] : 0.f; + const float *row = W + (size_t)i * n_in; + for (int j = 0; j < n_in; j++) acc += row[j] * x[j]; + out[i] = acc; + } +} +``` + +This scalar implementation can leave NEON and SVE vector units underused if the compiler cannot efficiently autovectorize it. Because `matmul` is called hundreds of times per token, explicitly optimizing this kernel guarantees SIMD execution where most of the available compute is spent. + +## Build and run the baseline + +Configure and build the project with CMake. The project uses `-O2 -g`, which keeps optimization enabled while preserving debug symbols for profiling. + +```bash +cmake -S . -B build -DBUILD_USER_MATMUL=ON +cmake --build build --parallel +``` + +Run the scalar baseline binary: + +```bash +./build/gpt2 --model gpt2-medium "Once upon a time" -n 20 +``` + +![Animated terminal output showing GPT-2 baseline inference running on Arm Linux, including generated text and the final tokens-per-second summary used for baseline comparison.#center](./gpt2-baseline.gif "GPT-2 baseline runtime output on Arm Linux") + +## What you've learned and what's next + +You now have a working baseline binary and model files. Next, you will use the Instruction Mix recipe in Arm Performix to inspect static disassembly and dynamic runtime behavior. diff --git a/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/how-to-3.md b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/how-to-3.md new file mode 100644 index 0000000000..bb7b153c4a --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/how-to-3.md @@ -0,0 +1,65 @@ +--- +title: Profile with instruction mix +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Find the code hotspot + +Before you optimize, identify where the application spends most of its time. Use the Code Hotspots recipe to periodically sample the running application and build a profile of the functions that execute most often. + +Open Arm Performix and select the **Code Hotspots** recipe. If this is your first run on the target, complete tool deployment as prompted. + +Set the launch command to your baseline binary with the number of tokens (`-n`) set to 150. This value keeps startup overhead small compared to inference time, so the profile minimizes the time taken to load the model weights: + +![Arm Performix Code Hotspots recipe configuration showing launch arguments for the GPT-2 baseline run with -n 150 to emphasize inference runtime.#center](./code_hotspot.webp "Code Hotspots recipe configuration for GPT-2 baseline") + +The results show that `kernels::matmul_ref()` is the hottest function. Double-clicking on the function with show which lines of source code the samples are mostly attributed to the accumulate step of `kernels::matmul_ref()`. + +![Arm Performix hotspot results table showing matmul_ref as the dominant runtime function during GPT-2 baseline inference.#center](./code_hotspot_results.webp "Hotspot results highlighting matmul_ref") + +This confirms that matrix multiplication is the highest-impact optimization target. + +## Assess compiler output + +We can use online tools such as [Compiler Explorer](https://godbolt.org/) to conveniently see how this function is being compiled with the `-O2 -g` flags. + + +{{< godbolt width="100%" height="400px" mode="assembly" opt="-O2 -g" src="void matmul_ref(float *out, const float *x, const float *W, const float *b, int n_in, int n_out)\n{\n for (int i = 0; i < n_out; i++) {\n float acc = b ? b[i] : 0.f;\n const float *row = W + (unsigned long long)i * (unsigned long long)n_in;\n for (int j = 0; j < n_in; j++) {\n acc += row[j] * x[j];\n }\n out[i] = acc;\n }\n}" >}} + +This view helps you spot missed vectorization opportunities. In an optimized build, you would expect the accumulation step to use SIMD instructions, for example `fmla v0.4s, v3.4s, v2.4s` with use of the vector register (`v0->v3`). However, assembly inspection has limitations. First, you need familiarity with SIMD mnemonics to recognize vectorized code. Second, this narrow snippet does not show whether changing compiler flags introduces regressions in other parts of the codebase. Third, and most importantly, this static view does not show which instructions in this function run most often on the CPU. + +The Instruction Mix recipe helps fill this gap. + +## Configure the Instruction Mix recipe + +Open Arm Performix and select the **Instruction Mix** recipe. If this is your first run on the target, complete tool deployment as prompted. +Set the launch command to your baseline binary with the same runtime arguments used for baseline testing: + +```output +/build/gpt2 --model gpt2-medium "Once upon a time" -n 150` +``` + +Use the same model and prompt arguments as your baseline terminal run so the measurements are comparable. + +![Arm Performix recipe setup screen showing Instruction Mix recipe selected with launch settings configured for the GPT-2 baseline executable.#center](./configuring-performix.webp "Configure Arm Performix Instruction Mix recipe") + +### Analyze static disassembly + +After the run completes, review static disassembly first. This view is ordered by percentage contribution and provides a high-level profile of the application’s generated instruction stream. It can help you identify broad characteristics, such as whether the code is branch-heavy, dominated by memory operations, or making effective use of SIMD instructions. Use this static view to understand overall code generation patterns rather than to attribute performance to specific functions or source lines. Dynamic analysis is typically more relevant for optimization because it reflects the instructions that are actually executed at runtime. + +![Arm Performix static disassembly view showing instruction category breakdown for GPT-2 hot paths, highlighting scalar-heavy sections in baseline matmul code.#center](./static_disassembly.webp "Static disassembly instruction classification") + +### Dynamic analysis + +Then inspect dynamic analysis bar chart to see where sampled runtime work is concentrated. Dynamic data is typically more useful for optimization because it reflects actual execution behavior for your input, runtime settings, and call frequencies. + +![Arm Performix dynamic functions table showing most runtime samples in matmul-related functions for baseline GPT-2 inference.#center](./instruction_mix_dynamic_analysis.webp "Dynamic function sample distribution") + +Finally, in dynamic functions, you can break down operation types to individual functions. This is particularly useful when no single function dominates the profile, allowing you to inspect dynamic instruction patterns for specific functions. + +## What you've learned and what's next + +You used Instruction Mix to confirm that baseline runtime is dominated by scalar-heavy `matmul` execution. Next, you will compare updated instruction mix and throughput across scalar, NEON, SVE, and KleidiAI variants. diff --git a/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/how-to-4.md b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/how-to-4.md new file mode 100644 index 0000000000..2939443d3d --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/how-to-4.md @@ -0,0 +1,71 @@ +--- +title: Optimize +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Complete the challenge (optional) + +In this project, `src/kernels/matmul_user.cpp` is your editable implementation file. The baseline behavior in this file is scalar, and the build uses `-O2 -g`, so compiler optimization is enabled but vector hardware is still underused in the hot loop. + +Use the profiling evidence from Performix to implement your own NEON or SVE intrinsics in `src/kernels/matmul_user.cpp`, then rebuild and profile `gpt2_user`. + +{{% notice Hint %}} + +Focus on the accumulation loop in `matmul_user` (`acc += row[j] * x[j];`). Think about lane utilization, loop unrolling, and handling the tail when the input width is not an exact multiple of the vector width. + +{{% /notice %}} + +Rebuild after your edits: + +```bash +cmake -S . -B build -DBUILD_USER_MATMUL=ON +cmake --build build --parallel +``` + +Then profile the `build/gpt2_user` binary with the same runtime arguments and compare the Instruction Mix and throughput against baseline. + +Example solutions are available in: + +- `src/kernels/matmul_neon.cpp` +- `src/kernels/matmul_sve.cpp` + +You can use `AGENTS.md` in the GPT-2 example repository for guided learning support. + +### Use the Arm MCP Server with Performix (optional) + +You can also use an MCP-compatible coding assistant, such as GitHub Copilot or Codex, with the Arm MCP Server. This gives the assistant direct tool access to run Performix recipes on your remote Arm target and create a faster feedback loop while you iterate on `matmul_user`. + +For setup details, see [Automate x86-to-Arm application migration using Arm MCP Server](/learning-paths/servers-and-cloud-computing/arm-mcp-server/). + +Install Docker if needed, then pull the MCP server image: + +```bash +docker pull armlimited/arm-mcp:latest +``` + +To allow Performix access to remote targets from inside the container, mount your workspace plus SSH key and known hosts in your Codex MCP configuration (example `~/.codex/config.toml`): + +```output +[mcp_servers.arm-mcp] +command = "docker" +args = [ + "run", + "--rm", + "-i", + "-v", "/path/to/your/workspace:/workspace", + "-v", "/path/to/your/ssh/private_key:/run/keys/ssh-key.pem:ro", + "-v", "/path/to/your/ssh/known_hosts:/run/keys/known_hosts:ro", + "armlimited/arm-mcp" +] +``` + +Restart your coding assistant, then prompt it to run Performix Instruction Mix and Code Hotspots on your `gpt2_user` binary and suggest Arm intrinsics improvements. + +![Screenshot of a coding assistant prompt configured to use Arm MCP Server tools for running Performix recipes and analyzing matmul_user optimization opportunities in the GPT-2 workload.#center](./mcp-performix-prompt.webp "Coding assistant prompt for Performix analysis through Arm MCP Server") + +## What you've learned and what's next + +In this optional section, you implemented and profiled a custom `matmul_user` kernel using the same workflow you used for baseline analysis. Next, you will compare instruction mix and throughput across scalar, NEON, SVE, and KleidiAI variants. diff --git a/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/how-to-5.md b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/how-to-5.md new file mode 100644 index 0000000000..f1fd169753 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/how-to-5.md @@ -0,0 +1,75 @@ +--- +title: Compare Neon and SVE +weight: 7 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Confirm instruction-mix changes after vectorization + +In the earlier build step, you created `gpt2_neon` and `gpt2_sve`. These binaries use the reference solutions in `matmul_neon.cpp` and `matmul_sve.cpp`, respectively. + +Rerun the Instruction Mix recipe with the `gpt2_neon` binary using the same recipe settings and workload arguments as baseline. After the run completes, select both runs and click **Compare** to open a comparison view. + +{{% notice Tip %}} + +Rename each run with a descriptive name, such as baseline and Neon, so you can identify and compare results quickly. + +{{% /notice %}} + +The baseline profile is mostly scalar instructions. After you add Neon intrinsics, the instruction mix shifts toward Advanced SIMD (Neon) instructions, showing that the code is using Arm Neon hardware more effectively. + +![Instruction Mix comparison view showing scalar-dominant baseline versus NEON variant with increased ASIMD instruction share in the hot matmul path.#center](./neon_scalar_instruction_mix.webp "NEON versus scalar instruction mix") + +You can also compare SVE variants in the same way. The increase in SVE operations shows that this path is now utilizing SVE hardware. + +![Instruction Mix comparison between baseline and SVE variant showing increased vector instruction usage and reduced scalar share in hot execution paths.#center](./sve_vs_baseline.webp "SVE versus baseline instruction mix") + +## Compare throughput across kernels + +#### Neon kernel + +You can also inspect the Neon intrinsic implementation using Compiler Explorer, where the hot accumulation step (`vacc`) runs in aSIMD (Neon) registers such as `v0`: + +{{< godbolt width="100%" height="400px" mode="assembly" opt="-O2 -g -march=armv8.2-a+simd" src="#include \n\nvoid matmul_neon(float *out, const float *x, const float *W, const float *b,\n int n_in, int n_out) {\n for (int i = 0; i < n_out; i++) {\n float acc = b ? b[i] : 0.f;\n const float *row = W + (unsigned long long)i * (unsigned long long)n_in;\n int j = 0;\n float32x4_t vacc = vdupq_n_f32(0.f);\n for (; j + 4 <= n_in; j += 4) {\n const float32x4_t vw = vld1q_f32(row + j);\n const float32x4_t vx = vld1q_f32(x + j);\n vacc = vfmaq_f32(vacc, vw, vx);\n }\n acc += vaddvq_f32(vacc);\n for (; j < n_in; j++) acc += row[j] * x[j];\n out[i] = acc;\n }\n}" >}} + + +### SVE kernel + +For variable-length vectorization, compare with an explicit SVE implementation that assumes SVE support, where the hot accumulation step (`vacc`) runs in SVE z registers with predicate-controlled loads and multiply-accumulate: + +{{< godbolt width="100%" height="400px" mode="assembly" opt="-O2 -g -march=armv8.2-a+sve" src="#include \n#include \n\nvoid matmul_sve(float *out, const float *x, const float *W, const float *b,\n int n_in, int n_out) {\n for (int i = 0; i < n_out; i++) {\n float acc = b ? b[i] : 0.f;\n const float *row = W + (size_t)i * n_in;\n svfloat32_t vacc = svdup_f32(0.f);\n int j = 0;\n while (j < n_in) {\n svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n_in);\n svfloat32_t vw = svld1(pg, row + j);\n svfloat32_t vx = svld1(pg, x + j);\n vacc = svmla_f32_m(pg, vacc, vw, vx);\n j += svcntw();\n }\n acc += svaddv_f32(svptrue_b32(), vacc);\n out[i] = acc;\n }\n}" >}} + + +For a full-page view, open [Godbolt session with all three matmul kernels](https://godbolt.org/z/E4a7Wxh8K). + +## Speed up + +Run the provided comparison script to measure tokens per second across all available binaries: + +```bash bash { command_line="user@host | 2-30"} +./compare_gpt2_variants.sh +Model: gpt2-medium +Prompt: Once upon a time +Tokens: 20 +Runs: 1 + +== gpt2 == +run 1: 3.04976 tok/s +avg: 3.049760 tok/s + +== gpt2_neon == +run 1: 11.3649 tok/s +avg: 11.364900 tok/s + +== gpt2_sve == +run 1: 13.907 tok/s +avg: 13.907000 tok/s + +== gpt2_user == +run 1: 3.04859 tok/s +avg: 3.048590 tok/s +``` + +These results show that intrinsics increase throughput from about 3 tok/s in the scalar baseline to about 13.9 tok/s with SVE. Next, you will use optimized libraries to push performance further. diff --git a/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/how-to-6.md b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/how-to-6.md new file mode 100644 index 0000000000..860987d583 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/how-to-6.md @@ -0,0 +1,142 @@ +--- +title: Accelerate with KleidiAI +weight: 8 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Understand why KleidiAI improves matmul + +Getting close to peak matmul performance is complex, especially in transformer inference where one kernel runs many times per token. Raw SIMD intrinsics can accelerate execution, but higher-level algorithms can improve matrix multiplication through techniques such as packing and tiling. + +- *Packing* is a preprocessing step which rearranges matrix data into a layout that matches the compute kernel, which reduces cache misses and improves contiguous vector loads. + +- *Tiling* breaks large matrices into smaller blocks that fit better in cache, so data is reused more often and memory bandwidth pressure is lower. +Arm created [KleidiAI](https://github.com/ARM-software/kleidiai) to provide the fastest Arm CPU microkernels on packing and tiled matrix multiplication so you can use these optimizations without writing and tuning every low-level kernel yourself. + +## Trace the KleidiAI path in this GPT-2 example + +The file `src/kernels/matmul_kai_sve.cpp` is the runtime bridge between your model code and a KleidiAI float32 SVE microkernel. Since packing is a preprocessing step, this workflow uses a slightly modified inference engine, `gpt2_kai_sve.cpp`. It builds a `kai_matmul_clamp_f32_f32_f32p_ukernel` function table and binds function pointers such as: + +- `kai_get_n_step_matmul_clamp_f32_f32_f32p4vlx1b_6x4vl_sve_mla` +- `kai_get_rhs_packed_offset_matmul_clamp_f32_f32_f32p4vlx1b_6x4vl_sve_mla` +- `kai_run_matmul_clamp_f32_f32_f32p4vlx1b_6x4vl_sve_mla` + +The `kai_run_matmul_clamp_f32_f32_f32p4vlx1b_6x4vl_sve_mla` is our entry. As per [the naming convention](https://github.com/ARM-software/kleidiai/blob/main/kai/ukernels/matmul/README.md), this kernel performs an FP32 matrix multiplication, computing output tiles of 6 × 4VL (6 rows by four SVE vector lengths of columns) using SVE (multiply–accumulate) MLA instructions on prepacked RHS weights for efficient cache and SIMD utilization. + +For more details on KleidiAI, please refer to the [official GitLab repository](https://gitlab.arm.com/kleidi/kleidiai). + +## Read the source code: key steps + +All matmul implementations in `src/kernels/` follow the same high-level pattern from `matmul.h`: they compute float32 output from float32 input vectors and weights. The KleidiAI variant keeps this behavior but changes the RHS argument to a packed buffer (`const uint8_t* rhs_packed`) so the compute path can consume prepacked tiles. + +The header interface in KleidiAI defines the core run method through the ukernel function table, and this path uses the float32-to-float32 kernel family (`f32_f32_f32p`). + +```cpp +static const kai_matmul_clamp_f32_f32_f32p_ukernel ukernel = { + /* geometry helpers */ + kai_get_n_step_matmul_clamp_f32_f32_f32p4vlx1b_6x4vl_sve_mla, + kai_get_rhs_packed_offset_matmul_clamp_f32_f32_f32p4vlx1b_6x4vl_sve_mla, + /* core compute */ + kai_run_matmul_clamp_f32_f32_f32p4vlx1b_6x4vl_sve_mla, +}; + +const size_t n_step = ukernel.get_n_step(); +const size_t n_blocks = ((size_t)n_out + n_step - 1) / n_step; +const size_t rhs_offset = ukernel.get_rhs_packed_offset(n_start, k); + +ukernel.run_matmul(m, n_step, k, x, lhs_stride, + rhs_packed + rhs_offset, + out + n_start, dst_stride_row, dst_stride_col, + -FLT_MAX, FLT_MAX); +``` + +The execution flow is: + +1. Query kernel tile size (`n_step`) from the ukernel interface. +2. Split output columns into `n_step` blocks, which are smaller sub-matrices. +3. Use `get_rhs_packed_offset(...)` to locate the packed RHS chunk for each block. +4. Call `run_matmul(...)` for each block, with optional thread-level parallelism. + +In `src/gpt2_kai_sve.cpp`, runtime code prepares packed weights once with `kai_run_rhs_pack_kxn_x32p4vlx1b_x32_x32_sve`, stores them in `PackedWeights`, and calls `kernels::matmul_kai_sve(...)` at runtime. This is why the runtime file and `matmul_kai_sve.cpp` must match: the runtime produces packed buffers in the format expected by the same f32 ukernel family. + + +## Compare SVE intrinsics and KleidiAI + +Run the comparison script from the repository root with the following command. + +```bash { command_line="user@host | 2-30"} +./compare_gpt2_variants.sh kai +Model: gpt2-medium +Prompt: Once upon a time +Tokens: 20 +Runs: 1 +KleidiAI matmul threads (--matmul-threads): 1 + +== gpt2 == +run 1: 3.04907 tok/s +avg: 3.049070 tok/s + +== gpt2_neon == +run 1: 11.4139 tok/s +avg: 11.413900 tok/s + +== gpt2_sve == +run 1: 13.7321 tok/s +avg: 13.732100 tok/s + +== gpt2_kai_sve == +run 1: 15.9847 tok/s +avg: 15.984700 tok/s + +== gpt2_user == +run 1: 3.04784 tok/s +avg: 3.047840 tok/s +``` + +`gpt2_sve` uses SVE intrinsics directly but does not use KleidiAI packing and microkernel dispatch. `gpt2_kai_sve` adds those optimized packing and ukernel paths, which is why throughput is higher in this workload. Compared to the non-vectorized baseline (`gpt2`), this FP32 KleidiAI microkernel path achieves about a 5.4x speedup in this example, without quantization. + + +### Run with multiple threads + +You can increase throughput by running the KleidiAI path with multiple matmul threads. For this 355M model, tune `--matmul-threads` heuristically on your target system to find the optimal value. For our Graviton 3 instance, we observe a max token generation speed of 34.5 token/s with 4 threads. + + +``` bash { command_line="user@host | 3-50"} +cd build +./gpt2_kai_sve --model gpt2-medium "Once upon a time" -n 150 --matmul-threads 4 +Weights path: /home/ubuntu/GPT-2-DEMO/GPT-2-Example/models/gpt2-medium/weights.bin +Vocab path: /home/ubuntu/GPT-2-DEMO/GPT-2-Example/models/gpt2-medium/vocab.bin +Matmul threads: 4 +GPT-2 embd=1024 layers=24 heads=16 vocab=50257 + loaded wte (51463168) + loaded wpe (1048576) + loaded ln1_w (24576) + loaded ln1_b (24576) + loaded c_attn_w (75497472) + loaded c_attn_b (73728) + loaded c_proj_w (25165824) + loaded c_proj_b (24576) + loaded ln2_w (24576) + loaded ln2_b (24576) + loaded mlp_fc_w (100663296) + loaded mlp_fc_b (98304) + loaded mlp_pj_w (100663296) + loaded mlp_pj_b (24576) + loaded ln_f_w (1024) + loaded ln_f_b (1024) +Packed weights for 24 layers + logit projection +Tokeniser: 50257 tokens, 50000 merges +[4 prompt tokens] +Once upon a time, there was a village with three boys running around. They were starting to want to be strong, go out into the town and fight. As the boys were growing up, they wanted to become the strongest of all the boys in the village. The trouble was that the village elders told them that if they didn't go out into the village to participate in the weekly martial arts training, they would suffer the wrath of God. + +They would be kicked out of the village, and they would suffer and die as punishment. So the boys created a forest village, then entered the forest to start training. They, one by one, went out into the forest, and from day to day, they put more and more effort into their martial arts. + +[150 tokens, 34.5787 tok/s] + +``` + +## Summary + +In this Learning Path, you used Arm Performix Instruction Mix to detect scalar-heavy hot paths, validated vectorization changes with static and dynamic evidence, and compared baseline, Neon, SVE, and KleidiAI-backed matmul implementations. This workflow is transferable to your own codebase: use instruction mix to detect missed vectorization and other unexpected instruction-balance patterns, validate changes with static and runtime evidence, and then tune to meet your performance requirements. \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/instruction_mix_dynamic_analysis.webp b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/instruction_mix_dynamic_analysis.webp new file mode 100644 index 0000000000..c36cc2d178 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/instruction_mix_dynamic_analysis.webp differ diff --git a/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/mcp-performix-prompt.webp b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/mcp-performix-prompt.webp new file mode 100644 index 0000000000..4e0b6d8edb Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/mcp-performix-prompt.webp differ diff --git a/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/neon_scalar_instruction_mix.webp b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/neon_scalar_instruction_mix.webp new file mode 100644 index 0000000000..6cfe3e8757 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/neon_scalar_instruction_mix.webp differ diff --git a/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/static_disassembly.webp b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/static_disassembly.webp new file mode 100644 index 0000000000..d823afd8c9 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/static_disassembly.webp differ diff --git a/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/sve_vs_baseline.webp b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/sve_vs_baseline.webp new file mode 100644 index 0000000000..80b84f603b Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/performix-instruction-mix/sve_vs_baseline.webp differ