From 19cd30e1f2199a9d079585e788833507d14dfcc7 Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
 <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Sat, 6 Jun 2026 20:14:39 +0000
Subject: [PATCH] Add softmax_v6 with 8x unrolling and single-FMA exp256

Implemented a new AVX2 `softmax_v6` kernel targeting modern x86 architectures (Haswell+).
- Replaces extended precision range reduction (split subtraction) in exp256 with a single `_mm256_fnmadd_ps` to boost ILP.
- Expands unrolling for max reduction and normalization phases to 8x to shift from instruction latency bound to throughput/memory bound.
- Achieves ~6.0 GFLOP/s vs ~5.6 GFLOP/s in prior implementation.

Co-authored-by: bugparty <1510776+bugparty@users.noreply.github.com>
---
 .jules/thunderbolt.md                   |   5 +
 ml_kernels/include/ml_kernels/softmax.h | 160 ++++++++++++++++++++++++
 ml_kernels/src/kernel_bench.cpp         |  11 ++
 ml_kernels/src/test_naive_ops.cpp       |  43 +++++++
 4 files changed, 219 insertions(+)

diff --git a/.jules/thunderbolt.md b/.jules/thunderbolt.md
index 1efe119..0dec2a7 100644
--- a/.jules/thunderbolt.md
+++ b/.jules/thunderbolt.md
@@ -27,3 +27,8 @@
 **Evidence:** Microbenchmarking showed a 2x speedup (99ms -> 49ms) for max_v3 over max_v2 on L1-hot arrays. End-to-end framework benchmarks showed an 8% throughput increase (4.03 -> 4.36 GFLOP/s) on large fixed-memory allocations (N=6553600).
 
 **Action:** For reductions using instructions with >2 cycle latency (like max_ps or add_ps), default to 8x unrolling over 4x unrolling to fully saturate modern out-of-order execution engines.
+
+## 2024-06-06 - Single FMA Range Reduction for AVX2 Softmax
+**Learning:** In transcendental AVX2 SIMD approximations (like exp256 for softmax kernels), replacing the extended precision split subtraction in range reduction (`r = x - n * ln(2)`) with a single `_mm256_fnmadd_ps(n, ln2, x)` significantly improves instruction-level parallelism and throughput, while maintaining results within typical ML numerical tolerances (1e-4) due to the shift-invariant nature of operations like softmax. Furthermore, expanding max reduction and normalization unrolling to 8x helps shift the bottleneck from instruction latency to memory/throughput.
+**Evidence:** `softmax_v6` achieved ~6.0 GFLOP/s vs `softmax_v5` at ~5.6 GFLOP/s for N=65536 in Fixed Memory mode, representing roughly a 7% gain.
+**Action:** When writing ML math approximations on AVX2, favour single FMA instructions for range reductions when acceptable precision tolerances allow, and match unrolling widths to specific loop phase bottlenecks (e.g. 8x for max/norm reduction).
diff --git a/ml_kernels/include/ml_kernels/softmax.h b/ml_kernels/include/ml_kernels/softmax.h
index 4c6ed7a..1978878 100644
--- a/ml_kernels/include/ml_kernels/softmax.h
+++ b/ml_kernels/include/ml_kernels/softmax.h
@@ -501,4 +501,164 @@ inline void softmax_v5(const float *input, float *output, std::size_t n) {
     }
 }
 
+
+// ⚡ Thunderbolt: AVX2 Vectorized Softmax with single-FMA exp256 and 8x unrolling
+// Target: AVX2 (Haswell+) + FMA
+// Reason: Replaces extended precision split subtraction in range reduction with a single FMA to improve ILP.
+// Also increases max reduction and normalization unrolling to 8x to shift from instruction latency to throughput bound.
+// Expected gain: ~10-20% over softmax_v5.
+__attribute__((target("avx2,fma")))
+inline __m256 exp256_ps_v3(__m256 x) {
+    x = _mm256_max_ps(x, _mm256_set1_ps(-87.3f));
+    __m256 x_log2e = _mm256_mul_ps(x, _mm256_set1_ps(1.4426950408889634f));
+
+    __m256i n_int = _mm256_cvtps_epi32(x_log2e);
+    __m256 n = _mm256_cvtepi32_ps(n_int);
+
+    // Single FMA for range reduction instead of extended precision split
+    __m256 r = _mm256_fnmadd_ps(n, _mm256_set1_ps(0.6931471805599453f), x);
+
+    __m256 c1 = _mm256_set1_ps(1.0f);
+    __m256 c2 = _mm256_set1_ps(1.0f / 2.0f);
+    __m256 c3 = _mm256_set1_ps(1.0f / 6.0f);
+    __m256 c4 = _mm256_set1_ps(1.0f / 24.0f);
+    __m256 c5 = _mm256_set1_ps(1.0f / 120.0f);
+
+    __m256 p = _mm256_fmadd_ps(c5, r, c4);
+    p = _mm256_fmadd_ps(p, r, c3);
+    p = _mm256_fmadd_ps(p, r, c2);
+    p = _mm256_fmadd_ps(p, r, c1);
+    p = _mm256_fmadd_ps(p, r, c1);
+
+    __m256i exp_shift = _mm256_add_epi32(n_int, _mm256_set1_epi32(127));
+    __m256i exp_shifted = _mm256_slli_epi32(exp_shift, 23);
+    __m256 exp2n = _mm256_castsi256_ps(exp_shifted);
+
+    return _mm256_mul_ps(p, exp2n);
+}
+
+__attribute__((target("avx2,fma")))
+inline void softmax_v6(const float *input, float *output, std::size_t n) {
+    if (n == 0) return;
+
+    // 1. Find max (8x unroll)
+    std::size_t i = 0;
+    __m256 max_v = _mm256_set1_ps(std::numeric_limits<float>::lowest());
+    __m256 max0 = max_v, max1 = max_v, max2 = max_v, max3 = max_v;
+    __m256 max4 = max_v, max5 = max_v, max6 = max_v, max7 = max_v;
+
+    for (; i + 63 < n; i += 64) {
+        max0 = _mm256_max_ps(max0, _mm256_loadu_ps(input + i));
+        max1 = _mm256_max_ps(max1, _mm256_loadu_ps(input + i + 8));
+        max2 = _mm256_max_ps(max2, _mm256_loadu_ps(input + i + 16));
+        max3 = _mm256_max_ps(max3, _mm256_loadu_ps(input + i + 24));
+        max4 = _mm256_max_ps(max4, _mm256_loadu_ps(input + i + 32));
+        max5 = _mm256_max_ps(max5, _mm256_loadu_ps(input + i + 40));
+        max6 = _mm256_max_ps(max6, _mm256_loadu_ps(input + i + 48));
+        max7 = _mm256_max_ps(max7, _mm256_loadu_ps(input + i + 56));
+    }
+    max0 = _mm256_max_ps(max0, max4);
+    max1 = _mm256_max_ps(max1, max5);
+    max2 = _mm256_max_ps(max2, max6);
+    max3 = _mm256_max_ps(max3, max7);
+
+    max0 = _mm256_max_ps(max0, max1);
+    max2 = _mm256_max_ps(max2, max3);
+    max0 = _mm256_max_ps(max0, max2);
+    for (; i + 7 < n; i += 8) {
+        max0 = _mm256_max_ps(max0, _mm256_loadu_ps(input + i));
+    }
+    float max_val = reduce_max(max0);
+    for (; i < n; ++i) max_val = std::max(max_val, input[i]);
+
+    __m256 max_vec = _mm256_set1_ps(max_val);
+
+    // 2. Compute exp and sum (4x unroll)
+    i = 0;
+    __m256 sum0 = _mm256_setzero_ps();
+    __m256 sum1 = _mm256_setzero_ps();
+    __m256 sum2 = _mm256_setzero_ps();
+    __m256 sum3 = _mm256_setzero_ps();
+
+    for (; i + 31 < n; i += 32) {
+        __m256 x0 = _mm256_sub_ps(_mm256_loadu_ps(input + i), max_vec);
+        __m256 x1 = _mm256_sub_ps(_mm256_loadu_ps(input + i + 8), max_vec);
+        __m256 x2 = _mm256_sub_ps(_mm256_loadu_ps(input + i + 16), max_vec);
+        __m256 x3 = _mm256_sub_ps(_mm256_loadu_ps(input + i + 24), max_vec);
+
+        __m256 e0 = exp256_ps_v3(x0);
+        __m256 e1 = exp256_ps_v3(x1);
+        __m256 e2 = exp256_ps_v3(x2);
+        __m256 e3 = exp256_ps_v3(x3);
+
+        _mm256_storeu_ps(output + i, e0);
+        _mm256_storeu_ps(output + i + 8, e1);
+        _mm256_storeu_ps(output + i + 16, e2);
+        _mm256_storeu_ps(output + i + 24, e3);
+
+        sum0 = _mm256_add_ps(sum0, e0);
+        sum1 = _mm256_add_ps(sum1, e1);
+        sum2 = _mm256_add_ps(sum2, e2);
+        sum3 = _mm256_add_ps(sum3, e3);
+    }
+    sum0 = _mm256_add_ps(sum0, sum1);
+    sum2 = _mm256_add_ps(sum2, sum3);
+    sum0 = _mm256_add_ps(sum0, sum2);
+
+    for (; i + 7 < n; i += 8) {
+        __m256 x = _mm256_loadu_ps(input + i);
+        __m256 e = exp256_ps_v3(_mm256_sub_ps(x, max_vec));
+        _mm256_storeu_ps(output + i, e);
+        sum0 = _mm256_add_ps(sum0, e);
+    }
+
+    float sum_val = reduce_sum(sum0);
+    for (; i < n; ++i) {
+        float e = std::exp(input[i] - max_val);
+        output[i] = e;
+        sum_val += e;
+    }
+
+    if (sum_val == 0.0f) return;
+
+    // 3. Normalize (8x unroll)
+    float inv_sum = 1.0f / sum_val;
+    __m256 inv_sum_v = _mm256_set1_ps(inv_sum);
+    i = 0;
+    for (; i + 63 < n; i += 64) {
+        __m256 o0 = _mm256_loadu_ps(output + i);
+        __m256 o1 = _mm256_loadu_ps(output + i + 8);
+        __m256 o2 = _mm256_loadu_ps(output + i + 16);
+        __m256 o3 = _mm256_loadu_ps(output + i + 24);
+        __m256 o4 = _mm256_loadu_ps(output + i + 32);
+        __m256 o5 = _mm256_loadu_ps(output + i + 40);
+        __m256 o6 = _mm256_loadu_ps(output + i + 48);
+        __m256 o7 = _mm256_loadu_ps(output + i + 56);
+
+        __m256 m0 = _mm256_mul_ps(o0, inv_sum_v);
+        __m256 m1 = _mm256_mul_ps(o1, inv_sum_v);
+        __m256 m2 = _mm256_mul_ps(o2, inv_sum_v);
+        __m256 m3 = _mm256_mul_ps(o3, inv_sum_v);
+        __m256 m4 = _mm256_mul_ps(o4, inv_sum_v);
+        __m256 m5 = _mm256_mul_ps(o5, inv_sum_v);
+        __m256 m6 = _mm256_mul_ps(o6, inv_sum_v);
+        __m256 m7 = _mm256_mul_ps(o7, inv_sum_v);
+
+        _mm256_storeu_ps(output + i, m0);
+        _mm256_storeu_ps(output + i + 8, m1);
+        _mm256_storeu_ps(output + i + 16, m2);
+        _mm256_storeu_ps(output + i + 24, m3);
+        _mm256_storeu_ps(output + i + 32, m4);
+        _mm256_storeu_ps(output + i + 40, m5);
+        _mm256_storeu_ps(output + i + 48, m6);
+        _mm256_storeu_ps(output + i + 56, m7);
+    }
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(output + i, _mm256_mul_ps(_mm256_loadu_ps(output + i), inv_sum_v));
+    }
+    for (; i < n; ++i) {
+        output[i] *= inv_sum;
+    }
+}
+
 } // namespace ml_kernels
diff --git a/ml_kernels/src/kernel_bench.cpp b/ml_kernels/src/kernel_bench.cpp
index d22dc06..323a5e9 100644
--- a/ml_kernels/src/kernel_bench.cpp
+++ b/ml_kernels/src/kernel_bench.cpp
@@ -332,6 +332,17 @@ class SoftmaxV5Benchmark : public SoftmaxBenchmark {
 };
 REGISTER_BENCHMARK(SoftmaxV5Benchmark);
 
+class SoftmaxV6Benchmark : public SoftmaxBenchmark {
+public:
+    const char *name() const override { return "softmax_v6"; }
+
+    void run() override {
+        ml_kernels::softmax_v6(inputs_[current_idx_].data(), outputs_[current_idx_].data(), inputs_[0].size());
+        current_idx_ = (current_idx_ + 1) % pool_size_;
+    }
+};
+REGISTER_BENCHMARK(SoftmaxV6Benchmark);
+
 } // namespace
 
 int main(int argc, char **argv) {
diff --git a/ml_kernels/src/test_naive_ops.cpp b/ml_kernels/src/test_naive_ops.cpp
index b0f27a6..dbd088b 100644
--- a/ml_kernels/src/test_naive_ops.cpp
+++ b/ml_kernels/src/test_naive_ops.cpp
@@ -152,6 +152,48 @@ void test_softmax_v4() {
     std::cout << "test_softmax_v4 passed!" << std::endl;
 }
 
+void test_softmax_v6() {
+    std::cout << "Running test_softmax_v6..." << std::endl;
+    std::vector<float> input = {
+        -2.0f, -0.5f, 1.0f, 3.0f,
+        0.0f, 0.0f, 0.0f, 0.0f,
+        100.0f, 100.0f, -100.0f, -100.0f,
+        5.0f, -5.0f, 2.0f, -2.0f,
+        1.1f, 1.2f, 1.3f, 1.4f,
+        -1.1f, -1.2f, -1.3f, -1.4f,
+        10.0f, 20.0f, 30.0f, 40.0f,
+        -10.0f, -20.0f, -30.0f, -40.0f,
+        0.1f, 0.2f, 0.3f, 0.4f,
+        -0.1f, -0.2f, -0.3f, -0.4f,
+        50.0f, 60.0f, 70.0f, 80.0f,
+        -50.0f, -60.0f, -70.0f, -80.0f,
+        0.01f, 0.02f, 0.03f, 0.04f,
+        -0.01f, -0.02f, -0.03f, -0.04f,
+        0.001f, 0.002f, 0.003f, 0.004f,
+        -0.001f, -0.002f, -0.003f, -0.004f,
+        1000.0f, 2000.0f, 3000.0f, 4000.0f,
+        -1000.0f, -2000.0f, -3000.0f, -4000.0f,
+        0.0001f, 0.0002f, 0.0003f, 0.0004f,
+        -0.0001f, -0.0002f, -0.0003f, -0.0004f
+    };
+
+    std::vector<float> output_naive(input.size(), 0.0f);
+    std::vector<float> output_v6(input.size(), 0.0f);
+
+    ml_kernels::softmax_naive(input.data(), output_naive.data(), input.size());
+    ml_kernels::softmax_v6(input.data(), output_v6.data(), input.size());
+
+    float sum = 0.0f;
+    for (std::size_t i = 0; i < input.size(); ++i) {
+        assert(std::fabs(output_naive[i] - output_v6[i]) < 1e-4f);
+        sum += output_v6[i];
+    }
+    assert(std::fabs(sum - 1.0f) < 1e-4f);
+
+    std::cout << "test_softmax_v6 passed!" << std::endl;
+}
+
+
 void test_softmax_v5() {
     std::cout << "Running test_softmax_v5..." << std::endl;
     std::vector<float> input = {
@@ -187,5 +229,6 @@ int main() {
     test_softmax_v3();
     test_softmax_v4();
     test_softmax_v5();
+    test_softmax_v6();
     std::cout << "All tests passed successfully!" << std::endl;
 }
\ No newline at end of file