diff --git a/benchmark/Project.toml b/benchmark/Project.toml
new file mode 100644
index 00000000..4e3050c2
--- /dev/null
+++ b/benchmark/Project.toml
@@ -0,0 +1,11 @@
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
+DoubleFloats = "497a8b3b-efae-58df-a0af-a86822472b78"
+GenericLinearAlgebra = "14197337-ba66-59df-a3e3-ca00e7dcff7a"
+GenericSchur = "c145ed77-6b09-5dd9-b285-bf645a82121e"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+MatrixAlgebraKit = "6c742aac-3347-4629-af66-fc926824e5e4"
+Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
diff --git a/benchmark/README.md b/benchmark/README.md
new file mode 100644
index 00000000..123aed36
--- /dev/null
+++ b/benchmark/README.md
@@ -0,0 +1,47 @@
+# Matrix exponential benchmarks
+
+`bench_exponential.jl` times and accuracy-tests every applicable `MatrixAlgebraKit.exponential`
+algorithm (`MatrixFunctionViaTaylor`, `MatrixFunctionViaLA`, `MatrixFunctionViaEig`,
+`MatrixFunctionViaEigh`) for `Float64`, `ComplexF64`, `Double64`, `Complex{Double64}`,
+`BigFloat` and `Complex{BigFloat}`.
+The `Double64` (double-double, ~106-bit) types sit between the hardware BLAS floats and
+arbitrary-precision `BigFloat`, and exercise the `Eig`/`Eigh` paths through the generic
+GenericSchur / GenericLinearAlgebra drivers (the `LA` LAPACK path applies to BLAS types only).
+
+Accuracy inputs are built from a *known* spectrum as `A = V · Diag(λ) · V⁻¹`, so the exact
+`exp(A) = V · Diag(exp λ) · V⁻¹` is available analytically (computed at high precision).
+Eigenvalue families (`small`, `wide`, `imaginary`, `stiff`, `illconditioned`, `hermitian`)
+and eigenvector families (`unitary`, `general`, `illconditioned`) are chosen to stress or
+favor particular algorithms.
+
+## Setup (once)
+
+From the repository root:
+
+```sh
+julia --project=benchmark -e 'using Pkg; Pkg.develop(path=pwd()); Pkg.instantiate()'
+```
+
+This `dev`s the in-tree `MatrixAlgebraKit` into the benchmark's own environment.
+
+## Run
+
+```sh
+julia --project=benchmark benchmark/bench_exponential.jl            # full run
+julia --project=benchmark benchmark/bench_exponential.jl --quick    # fast smoke run
+julia --project=benchmark benchmark/bench_exponential.jl --csv out.csv
+```
+
+Results are printed as per-type accuracy and timing tables and written to two CSVs derived
+from the `--csv` path (default base `benchmark/results.csv`): `*-accuracy.csv` and
+`*-timing.csv`. They are separate files because the accuracy and timing passes report
+disjoint quantities — a single combined table would leave the timing columns empty on the
+accuracy rows and vice-versa.
+
+## Accuracy columns
+
+- `analytic_err` — relative error vs the analytic `V·Diag(exp λ)·V⁻¹` reference (primary).
+- `recomp_err` — relative error vs an independent high-precision `exp` of the exact input.
+- `expexp_err` — `‖exp(A)·exp(−A) − I‖₁` (reference-free consistency check).
+- `det_err` — relative error of `det(exp A)` against `exp(tr A)`.
+- `condV` — designed condition number of the eigenvector basis (context for non-normal cases).
diff --git a/benchmark/bench_exponential.jl b/benchmark/bench_exponential.jl
new file mode 100644
index 00000000..93cf8a50
--- /dev/null
+++ b/benchmark/bench_exponential.jl
@@ -0,0 +1,509 @@
+# Benchmark and accuracy harness for `MatrixAlgebraKit.exponential`.
+#
+# For each element type it times every applicable exponential algorithm over a range of
+# matrix sizes, and measures each algorithm's accuracy on matrices with a *known* spectrum.
+#
+# Accuracy inputs are built as `A = V · Diag(λ) · V⁻¹`, so the exact exponential
+# `exp(A) = V · Diag(exp λ) · V⁻¹` is known analytically.  The eigenvalues `λ` are drawn
+# from families chosen to help or hurt particular algorithms (small/wide/imaginary/stiff/
+# ill-conditioned), and the eigenvectors `V` are either unitary (→ normal / Hermitian `A`)
+# or a general basis with a prescribed condition number (→ non-normal `A`).  The analytic
+# reference is formed at high precision (`Complex{BigFloat}` at `HIGH_BITS` bits).
+#
+# Usage:
+#   julia --project=benchmark benchmark/bench_exponential.jl [--quick] [--csv path]
+#
+#   --quick     small sizes / short timing budget, for a fast smoke run
+#   --csv PATH  write the flat results table to PATH (default: benchmark/results.csv)
+
+using LinearAlgebra
+using LinearAlgebra: BLAS
+using Random
+using Printf
+using DelimitedFiles
+using StableRNGs
+using BenchmarkTools
+using DoubleFloats
+using MatrixAlgebraKit
+using MatrixAlgebraKit: MatrixFunctionViaTaylor, MatrixFunctionViaLA,
+    MatrixFunctionViaEig, MatrixFunctionViaEigh, QRIteration, GS, GLA
+# Loading these makes the generic (BigFloat / Double64) eig / eigh drivers available.
+using GenericSchur
+using GenericLinearAlgebra
+
+# ----------------------------------------------------------------------------------------
+# Configuration
+# ----------------------------------------------------------------------------------------
+const QUICK = "--quick" in ARGS
+const CSV_PATH = let i = findfirst(==("--csv"), ARGS)
+    i === nothing ? joinpath(@__DIR__, "results.csv") : ARGS[i + 1]
+end
+
+# Ordered by increasing precision: hardware BLAS floats, then double-double (~106 bits,
+# software but fast), then arbitrary-precision BigFloat.
+const ELTYPES = (
+    Float64, ComplexF64, Double64, Complex{Double64}, BigFloat, Complex{BigFloat},
+)
+
+_isblas(::Type{T}) where {T} = T <: Union{Float32, Float64, ComplexF32, ComplexF64}
+_isdouble(::Type{T}) where {T} = T <: Union{Double64, Complex{Double64}}
+
+# Sizes for the timing sweep (generic types are far slower, so use fewer / smaller).
+# Double64 is software but only ~10-30× slower than BLAS, so it gets an intermediate range;
+# BigFloat is far slower still.
+blas_sizes() = QUICK ? [8, 16, 32] : [8, 16, 32, 64, 128, 256]
+double_sizes() = QUICK ? [8, 16] : [8, 16, 32, 64, 128]
+generic_sizes() = QUICK ? [8, 16] : [8, 16, 32, 64]
+perf_sizes(T) = _isblas(T) ? blas_sizes() : (_isdouble(T) ? double_sizes() : generic_sizes())
+
+# Sizes for the accuracy sweep (kept small: high-precision reference is expensive).
+acc_sizes() = QUICK ? [12] : [16, 48]
+
+# Timing budget per (matrix, algorithm), in seconds.
+perf_seconds(T) = QUICK ? 0.1 : (_isblas(T) ? 0.3 : 0.8)
+
+# Precision of the analytic reference and its tolerance (must beat the tested precision).
+const HIGH_BITS = QUICK ? 512 : 1024
+const REF_TOL = QUICK ? big"1e-50" : big"1e-90"
+
+# Designed condition number of the eigenvector basis `V` per family.
+const GENCOND = 1.0e2
+const ILLCOND = 1.0e8
+
+# Column order for the printed tables.
+const ALG_ORDER = ["Taylor", "LA", "Eig", "Eigh"]
+
+# Accuracy cases: (eigenvalue family, eigenvector family).
+const ACC_CASES = [
+    (:hermitian, :unitary),       # real spectrum, unitary V  -> Hermitian / symmetric
+    (:small, :unitary),           # tiny normal matrix        -> few squarings
+    (:imaginary, :unitary),       # oscillatory normal matrix
+    (:wide, :unitary),            # large-norm normal matrix  -> many squarings
+    (:stiff, :general),           # widely varying real parts -> over/underflow
+    (:wide, :general),            # non-normal, large norm
+    (:small, :illconditioned),    # near-defective basis      -> eig-path cancellation
+]
+
+# Performance cases: (label, eigenvalue family, eigenvector family).
+const PERF_CASES = [
+    ("general", :wide, :general),   # non-normal, non-trivial norm
+    ("hermitian", :hermitian, :unitary),  # exercises the Eigh path
+]
+
+# ----------------------------------------------------------------------------------------
+# Random matrix helpers (support Complex{BigFloat}, which `randn` does not directly)
+# ----------------------------------------------------------------------------------------
+_randn(rng, ::Type{T}, dims...) where {T <: Real} = randn(rng, T, dims...)
+function _randn(rng, ::Type{Complex{T}}, dims...) where {T <: Real}
+    return randn(rng, T, dims...) .+ im .* randn(rng, T, dims...)
+end
+
+unitary_matrix(rng, ::Type{RT}, n) where {RT} = Matrix(qr(_randn(rng, RT, n, n)).Q)
+
+# General basis with singular values geometrically spread from 1 to `targetcond`.
+function general_matrix(rng, ::Type{RT}, n, targetcond) where {RT}
+    U = unitary_matrix(rng, RT, n)
+    W = unitary_matrix(rng, RT, n)
+    s = exp.(range(0.0, log(targetcond); length = n))
+    S = Diagonal(real(RT).(s))
+    return U * S * W
+end
+
+# ----------------------------------------------------------------------------------------
+# Spectra
+# ----------------------------------------------------------------------------------------
+function sample_eig(family, rng)::ComplexF64
+    if family === :small
+        return 0.15 * randn(rng) + 0.15im * (0.2 + abs(randn(rng)))
+    elseif family === :wide
+        r = exp(log(0.05) + (log(30.0) - log(0.05)) * rand(rng))
+        θ = 0.1 + (π - 0.2) * rand(rng)
+        return r * cis(θ)
+    elseif family === :imaginary
+        return im * (0.5 + 8.0 * rand(rng))
+    elseif family === :stiff
+        return (-40.0 + 44.0 * rand(rng)) + im * (0.3 + 0.5 * abs(randn(rng)))
+    elseif family === :illconditioned
+        return 0.5 * randn(rng) + im * (0.2 + 0.3 * abs(randn(rng)))
+    else
+        error("unknown eigenvalue family $family")
+    end
+end
+
+# A conjugate-closed spectrum of length `n` (so a real matrix with this spectrum exists).
+function make_spectrum(family, n, rng)::Vector{ComplexF64}
+    if family === :hermitian
+        return complex.(2 .* randn(rng, n))
+    end
+    half = fld(n, 2)
+    μ = ComplexF64[]
+    for _ in 1:half
+        z = sample_eig(family, rng)
+        push!(μ, z)
+        push!(μ, conj(z))
+    end
+    if isodd(n)
+        push!(μ, complex(real(sample_eig(family, rng))))
+    end
+    return μ
+end
+
+# Real block form of a conjugate-closed spectrum, together with its analytic exponential.
+# Real eigenvalues sit on the diagonal; a complex pair a±bi becomes a 2×2 block [a b; -b a].
+# Must be called inside a `setprecision` block so the BigFloat ops use `HIGH_BITS`.
+function real_canonical(spectrum::Vector{ComplexF64})
+    tol = 1.0e-9
+    reals = BigFloat[]
+    pairs = Tuple{BigFloat, BigFloat}[]
+    for z in spectrum
+        if abs(imag(z)) < tol
+            push!(reals, BigFloat(real(z)))
+        elseif imag(z) > 0
+            push!(pairs, (BigFloat(real(z)), BigFloat(imag(z))))
+        end
+    end
+    n = length(reals) + 2 * length(pairs)
+    D = zeros(BigFloat, n, n)
+    E = zeros(BigFloat, n, n)
+    i = 1
+    for a in reals
+        D[i, i] = a
+        E[i, i] = exp(a)
+        i += 1
+    end
+    for (a, b) in pairs
+        D[i, i] = a
+        D[i, i + 1] = b
+        D[i + 1, i] = -b
+        D[i + 1, i + 1] = a
+        ea, cb, sb = exp(a), cos(b), sin(b)
+        E[i, i] = ea * cb
+        E[i, i + 1] = ea * sb
+        E[i + 1, i] = -ea * sb
+        E[i + 1, i + 1] = ea * cb
+        i += 2
+    end
+    return D, E
+end
+
+# ----------------------------------------------------------------------------------------
+# Case construction: returns the input matrix at type `T` and the analytic reference.
+# ----------------------------------------------------------------------------------------
+function build_case(::Type{T}, n, eigfam, evecfam, rng) where {T}
+    RT_hi = (T <: Complex) ? Complex{BigFloat} : BigFloat
+    targetcond = evecfam === :illconditioned ? ILLCOND :
+        (evecfam === :general ? GENCOND : 1.0)
+    spectrum = make_spectrum(eigfam, n, rng)
+
+    A_hi, expA_hi = setprecision(BigFloat, HIGH_BITS) do
+        V = evecfam === :unitary ? unitary_matrix(rng, RT_hi, n) :
+            general_matrix(rng, RT_hi, n, targetcond)
+        Vi = inv(V)
+        if T <: Complex
+            λ = Complex{BigFloat}.(spectrum)
+            A = V * Diagonal(λ) * Vi
+            expA = V * Diagonal(exp.(λ)) * Vi
+            return Complex{BigFloat}.(A), Complex{BigFloat}.(expA)
+        else
+            D, E = real_canonical(spectrum)
+            A = V * D * Vi
+            expA = V * E * Vi
+            return A, Complex{BigFloat}.(expA)
+        end
+    end
+
+    # Round the input down to the tested precision (default BigFloat precision for generic T).
+    A_T = (T <: Complex) ? Complex{real(T)}.(A_hi) : real(T).(A_hi)
+    herm = (eigfam === :hermitian && evecfam === :unitary)
+    herm && (A_T = (A_T + A_T') / 2)  # enforce exact Hermiticity for the Eigh path
+    return (A = Matrix(A_T), ref = expA_hi, condV = targetcond, hermitian = herm)
+end
+
+# ----------------------------------------------------------------------------------------
+# Algorithm selection per input
+# ----------------------------------------------------------------------------------------
+_trydefault(f, A) = try
+    f(typeof(A))
+catch
+    nothing
+end
+
+# The library only registers default eig / eigh algorithms for BLAS and BigFloat element
+# types, but GenericSchur / GenericLinearAlgebra actually drive any `AbstractFloat`.  For
+# types without a registered default (e.g. Double64) fall back to those drivers directly,
+# using the same `QRIteration` the BigFloat default resolves to.
+function eig_algorithm_for(A)
+    eig = _trydefault(MatrixAlgebraKit.default_eig_algorithm, A)
+    return eig !== nothing ? eig : QRIteration(; driver = GS())
+end
+function eigh_algorithm_for(A)
+    eigh = _trydefault(MatrixAlgebraKit.default_eigh_algorithm, A)
+    return eigh !== nothing ? eigh : QRIteration(; driver = GLA())
+end
+
+function algorithms_for(A, hermitian::Bool)
+    T = eltype(A)
+    algs = Tuple{String, Any}[("Taylor", MatrixFunctionViaTaylor())]
+    if T <: LinearAlgebra.BlasFloat
+        push!(algs, ("LA", MatrixFunctionViaLA()))
+    end
+    eig = eig_algorithm_for(A)
+    eig !== nothing && push!(algs, ("Eig", MatrixFunctionViaEig(eig)))
+    if hermitian
+        eigh = eigh_algorithm_for(A)
+        eigh !== nothing && push!(algs, ("Eigh", MatrixFunctionViaEigh(eigh)))
+    end
+    return algs
+end
+
+# ----------------------------------------------------------------------------------------
+# Accuracy metrics
+# ----------------------------------------------------------------------------------------
+function relerr(X, ref)
+    return setprecision(BigFloat, HIGH_BITS) do
+        Xc = Complex{BigFloat}.(X)
+        Float64(opnorm(Xc .- ref, 1) / opnorm(ref, 1))
+    end
+end
+
+# Independent high-precision recomputation of exp of the *actual* input, so algorithms are
+# also compared to a common reference for the exact matrix they were handed.
+function highprec_exp(A_T)
+    return setprecision(BigFloat, HIGH_BITS) do
+        Ahi = Complex{BigFloat}.(A_T)
+        exponential(Ahi, MatrixFunctionViaTaylor(; tol = REF_TOL, balance = true))
+    end
+end
+
+# Reference-free consistency checks.
+function expexp_err(A, alg)
+    E = exponential(A, alg)
+    Em = exponential(-A, alg)
+    return Float64(opnorm(E * Em - I, 1))
+end
+function det_err(A, E)
+    return try
+        d = det(E)
+        target = exp(tr(A))
+        Float64(abs(d - target) / abs(target))
+    catch
+        NaN
+    end
+end
+
+# ----------------------------------------------------------------------------------------
+# Result rows
+# ----------------------------------------------------------------------------------------
+mutable struct Row
+    pass::String
+    size::Int
+    eltype::String
+    eigfam::String
+    evecfam::String
+    alg::String
+    time_s::Float64
+    allocs::Float64
+    analytic_err::Float64
+    recomp_err::Float64
+    expexp_err::Float64
+    det_err::Float64
+    condV::Float64
+    norm1::Float64
+    status::String
+end
+
+const ROWS = Row[]
+
+# ----------------------------------------------------------------------------------------
+# Passes
+# ----------------------------------------------------------------------------------------
+function run_accuracy!()
+    println("\n", "="^80)
+    println("ACCURACY PASS  (analytic reference at $HIGH_BITS bits)")
+    println("="^80)
+    for T in ELTYPES
+        for n in acc_sizes()
+            for (eigfam, evecfam) in ACC_CASES
+                rng = StableRNG(123)
+                case = build_case(T, n, eigfam, evecfam, rng)
+                A = case.A
+                norm1 = Float64(opnorm(A, 1))
+                recomp = highprec_exp(A)
+                for (name, alg) in algorithms_for(A, case.hermitian)
+                    row = Row(
+                        "acc", n, string(T), string(eigfam), string(evecfam), name,
+                        NaN, NaN, NaN, NaN, NaN, NaN, case.condV, norm1, "ok",
+                    )
+                    try
+                        E = exponential(A, alg)
+                        row.analytic_err = relerr(E, case.ref)
+                        row.recomp_err = relerr(E, recomp)
+                        row.expexp_err = expexp_err(A, alg)
+                        row.det_err = det_err(A, E)
+                    catch e
+                        row.status = "error: $(sprint(showerror, e))"
+                    end
+                    push!(ROWS, row)
+                end
+            end
+        end
+    end
+    return
+end
+
+function run_performance!()
+    println("\n", "="^80)
+    println("PERFORMANCE PASS")
+    println("="^80)
+    for T in ELTYPES
+        sec = perf_seconds(T)
+        BenchmarkTools.DEFAULT_PARAMETERS.seconds = sec
+        for n in perf_sizes(T)
+            for (label, eigfam, evecfam) in PERF_CASES
+                rng = StableRNG(123)
+                case = build_case(T, n, eigfam, evecfam, rng)
+                A = case.A
+                norm1 = Float64(opnorm(A, 1))
+                for (name, alg) in algorithms_for(A, case.hermitian)
+                    row = Row(
+                        "perf", n, string(T), label, string(evecfam), name,
+                        NaN, NaN, NaN, NaN, NaN, NaN, case.condV, norm1, "ok",
+                    )
+                    try
+                        b = @benchmark exponential($A, $alg)
+                        tr_ = minimum(b)
+                        row.time_s = tr_.time / 1.0e9
+                        row.allocs = Float64(tr_.allocs)
+                    catch e
+                        row.status = "error: $(sprint(showerror, e))"
+                    end
+                    push!(ROWS, row)
+                end
+            end
+        end
+    end
+    return
+end
+
+# ----------------------------------------------------------------------------------------
+# Reporting
+# ----------------------------------------------------------------------------------------
+fmt_time(x) = isnan(x) ? "     -   " : (
+        x < 1.0e-3 ? @sprintf("%7.1fµs", x * 1.0e6) :
+        x < 1.0 ? @sprintf("%7.2fms", x * 1.0e3) : @sprintf("%7.3fs ", x)
+    )
+fmt_err(x) = isnan(x) ? "    -    " : @sprintf("%9.2e", x)
+
+function print_perf_tables()
+    for T in ELTYPES
+        for (label, _, _) in PERF_CASES
+            rows = filter(r -> r.pass == "perf" && r.eltype == string(T) && r.eigfam == label, ROWS)
+            isempty(rows) && continue
+            algs = filter(a -> any(r -> r.alg == a, rows), ALG_ORDER)
+            println("\n[$T]  timing — $label matrix   (min time per call)")
+            @printf("  %5s", "n")
+            for a in algs
+                @printf("  %10s", a)
+            end
+            println()
+            for n in sort(unique(r.size for r in rows))
+                @printf("  %5d", n)
+                for a in algs
+                    r = findfirst(x -> x.size == n && x.alg == a, rows)
+                    @printf("  %10s", r === nothing ? "-" : strip(fmt_time(rows[r].time_s)))
+                end
+                println()
+            end
+        end
+    end
+    return
+end
+
+function print_acc_tables()
+    for T in ELTYPES
+        rows = filter(r -> r.pass == "acc" && r.eltype == string(T), ROWS)
+        isempty(rows) && continue
+        algs = filter(a -> any(r -> r.alg == a, rows), ALG_ORDER)
+        println("\n[$T]  accuracy — relative error vs analytic reference")
+        @printf("  %-26s %5s", "case (eig/evec)", "n")
+        for a in algs
+            @printf("  %9s", a)
+        end
+        println()
+        for n in sort(unique(r.size for r in rows))
+            for (eigfam, evecfam) in ACC_CASES
+                sel = filter(r -> r.size == n && r.eigfam == string(eigfam) && r.evecfam == string(evecfam), rows)
+                isempty(sel) && continue
+                @printf("  %-26s %5d", "$eigfam/$evecfam", n)
+                for a in algs
+                    r = findfirst(x -> x.alg == a, sel)
+                    @printf("  %9s", r === nothing ? "-" : strip(fmt_err(sel[r].analytic_err)))
+                end
+                println()
+            end
+        end
+    end
+    return
+end
+
+# The accuracy and timing passes produce disjoint columns, so they are written to two
+# separate files rather than one flat table with half the columns empty.
+function _write_csv(path, header, rows, cols)
+    return open(path, "w") do io
+        writedlm(io, permutedims(header), ',')
+        for r in rows
+            writedlm(io, permutedims(Any[getfield(r, c) for c in cols]), ',')
+        end
+    end
+end
+
+function write_csv(path)
+    base, ext = splitext(path)
+    isempty(ext) && (ext = ".csv")
+    acc_path = base * "-accuracy" * ext
+    perf_path = base * "-timing" * ext
+
+    acc = filter(r -> r.pass == "acc", ROWS)
+    perf = filter(r -> r.pass == "perf", ROWS)
+
+    _write_csv(
+        acc_path,
+        [
+            "size", "eltype", "eig_family", "evec_family", "algorithm",
+            "analytic_err", "recomp_err", "expexp_err", "det_err", "condV", "norm1", "status",
+        ],
+        acc,
+        (
+            :size, :eltype, :eigfam, :evecfam, :alg,
+            :analytic_err, :recomp_err, :expexp_err, :det_err, :condV, :norm1, :status,
+        ),
+    )
+    _write_csv(
+        perf_path,
+        [
+            "size", "eltype", "case", "evec_family", "algorithm",
+            "time_s", "allocs", "condV", "norm1", "status",
+        ],
+        perf,
+        (:size, :eltype, :eigfam, :evecfam, :alg, :time_s, :allocs, :condV, :norm1, :status),
+    )
+    println("\nWrote $(length(acc)) accuracy rows to $acc_path")
+    return println("Wrote $(length(perf)) timing rows to $perf_path")
+end
+
+# ----------------------------------------------------------------------------------------
+# Main
+# ----------------------------------------------------------------------------------------
+function main()
+    # Single-threaded BLAS for a clean, reproducible algorithm-to-algorithm comparison.
+    BLAS.set_num_threads(1)
+    @info "matrix-exponential benchmark" QUICK HIGH_BITS blas_threads = BLAS.get_num_threads() types = ELTYPES
+    run_accuracy!()
+    run_performance!()
+    print_acc_tables()
+    print_perf_tables()
+    write_csv(CSV_PATH)
+    return nothing
+end
+
+main()
diff --git a/docs/src/user_interface/algorithms.md b/docs/src/user_interface/algorithms.md
index e6030787..d90b7607 100644
--- a/docs/src/user_interface/algorithms.md
+++ b/docs/src/user_interface/algorithms.md
@@ -102,6 +102,7 @@ The following algorithms for matrix functions are available.
 
 | Algorithm | Applicable matrix functions | Key keyword arguments |
 |:----------|:--------------------------|:----------------------|
+| [`MatrixFunctionViaTaylor`](@ref) | exponential | `tol`, `balance` |
 | [`MatrixFunctionViaLA`](@ref) | exponential | |
 | [`MatrixFunctionViaEig`](@ref) | exponential | `eig_alg` |
 | [`MatrixFunctionViaEigh`](@ref) | exponential | `eigh_alg` |
diff --git a/docs/src/user_interface/matrix_functions.md b/docs/src/user_interface/matrix_functions.md
index 20564773..6153f2e6 100644
--- a/docs/src/user_interface/matrix_functions.md
+++ b/docs/src/user_interface/matrix_functions.md
@@ -23,14 +23,16 @@ Additionally, the `f!` method typically assumes that it is allowed to destroy th
 ## Exponential
 
 The [exponential](https://en.wikipedia.org/wiki/Matrix_exponential) of a square matrix `A` is used in many scientific applications, as it arises in the solution of an autonomous linear differential equation.
-An implementation for the matrix exponential based on a Padé approximation is available in `LinearAlgebra`, and can be accessed by the algorithm [`MatrixFunctionViaLA`](@ref).
-For more generic data types, the exponential can be calculated by first calculating the (hermitian) eigenvalue decomposition, and then computing
-the scalar exponential of the diagonal elements.
+The default algorithm [`MatrixFunctionViaTaylor`](@ref) is a pure-Julia scaling-and-squaring evaluation of the Taylor series.
+As it requires no LAPACK support, it also applies to generic data types at arbitrary precision.
+Alternatively, an implementation based on a Padé approximation is available in `LinearAlgebra`, and can be accessed by the algorithm [`MatrixFunctionViaLA`](@ref).
+The exponential can also be calculated by first calculating the (hermitian) eigenvalue decomposition, and then computing the scalar exponential of the diagonal elements.
 This strategy is implemented via the algorithms [`MatrixFunctionViaEig`](@ref) and [`MatrixFunctionViaEigh`](@ref), and call `eig_full` and `eigh_full`, respectively.
 Additionally, in order to calculate `exp(τ * A)`, the function `exponential` can be called with `(τ, A)`, using the same algorithms as before.
 
 ```@docs; canonical=false
 exponential
+MatrixAlgebraKit.MatrixFunctionViaTaylor
 MatrixAlgebraKit.MatrixFunctionViaLA
 MatrixAlgebraKit.MatrixFunctionViaEig
 MatrixAlgebraKit.MatrixFunctionViaEigh
diff --git a/src/MatrixAlgebraKit.jl b/src/MatrixAlgebraKit.jl
index 7b9d5ef1..4d4e0084 100644
--- a/src/MatrixAlgebraKit.jl
+++ b/src/MatrixAlgebraKit.jl
@@ -41,7 +41,7 @@ export LAPACK_HouseholderQR, LAPACK_HouseholderLQ, LAPACK_Simple, LAPACK_Expert,
 export GLA_HouseholderQR, GLA_QRIteration, GS_QRIteration
 export LQViaTransposedQR
 export PolarViaSVD, PolarNewton
-export MatrixFunctionViaLA, MatrixFunctionViaEig, MatrixFunctionViaEigh
+export MatrixFunctionViaLA, MatrixFunctionViaEig, MatrixFunctionViaEigh, MatrixFunctionViaTaylor
 export DefaultAlgorithm
 export DiagonalAlgorithm
 export NativeBlocked
@@ -94,6 +94,7 @@ include("common/safemethods.jl")
 include("common/view.jl")
 include("common/regularinv.jl")
 include("common/matrixproperties.jl")
+include("common/balancing.jl")
 include("common/utility.jl")
 
 include("yalapack.jl")
diff --git a/src/common/balancing.jl b/src/common/balancing.jl
new file mode 100644
index 00000000..8f0a0dac
--- /dev/null
+++ b/src/common/balancing.jl
@@ -0,0 +1,54 @@
+"""
+    balance!(A::AbstractMatrix; radix=2, maxiter=100) -> A, scale
+
+Balance the square matrix `A` in place through a diagonal similarity `A ← D⁻¹ A D` that
+reduces its norm. Each sweep computes, for every index at once, the power-of-`radix` factor
+that best equalizes the off-diagonal row and column norms (a simultaneous, Osborne-style
+variant of the Parlett–Reinsch scaling), and applies all factors together; sweeps repeat
+until no index changes or `maxiter` is reached. Because it is expressed entirely through
+reductions and broadcasts (no scalar indexing), the same code runs on CPU and GPU arrays.
+
+The returned `scale` holds the diagonal of `D`, so that a matrix function of the original
+input can be recovered from a matrix function `f` of the balanced matrix through
+`D f(D⁻¹AD) D⁻¹`, i.e. `expA[i, j] = scale[i] * f(B)[i, j] / scale[j]`.
+"""
+function balance!(A::AbstractMatrix{T}; radix::Integer = 2, maxiter::Integer = 100) where {T}
+    n = LinearAlgebra.checksquare(A)
+    R = real(T)
+    β = convert(R, radix)
+    logβ = log(β)
+    scale = fill!(similar(A, R, n), one(R))
+
+    colnorm = similar(A, R, n)
+    rownorm = similar(A, R, n)
+    f = similar(A, R, n)
+    colsum = reshape(colnorm, 1, n)
+    rowsum = reshape(rownorm, n, 1)
+    d = abs.(diagview(A))
+    fᵀ = transpose(f)
+
+    for _ in 1:maxiter
+        fill!(colsum, zero(R))
+        Base.mapreducedim!(abs, +, colsum, A)
+        fill!(rowsum, zero(R))
+        Base.mapreducedim!(abs, +, rowsum, A)
+        colnorm .-= d
+        rownorm .-= d
+        f .= _balance_factor.(colnorm, rownorm, β, logβ)
+        all(isone, f) && break
+        # apply Aᵢⱼ ← Aᵢⱼ fⱼ / fᵢ (i.e. column j scaled by fⱼ, row i by 1/fᵢ) and accumulate.
+        A .= A .* fᵀ ./ f
+        scale .*= f
+    end
+
+    return A, scale
+end
+
+# Nearest power-of-`radix` factor `f` that equalizes the scaled off-diagonal norms
+# `colnorm·f` and `rownorm/f`, kept only when it reduces their sum (avoids oscillation and
+# leaves degenerate rows/columns untouched).
+@inline function _balance_factor(colnorm::R, rownorm::R, β::R, logβ::R) where {R}
+    (colnorm > 0 && rownorm > 0) || return one(R)
+    f = β^round(log(rownorm / colnorm) / (2 * logβ))
+    return (colnorm * f + rownorm / f) < convert(R, 0.95) * (colnorm + rownorm) ? f : one(R)
+end
diff --git a/src/implementations/exponential.jl b/src/implementations/exponential.jl
index cbaebeed..283d5ce2 100644
--- a/src/implementations/exponential.jl
+++ b/src/implementations/exponential.jl
@@ -107,3 +107,100 @@ function exponential!((τ, A)::Tuple{Number, AbstractMatrix}, expA::AbstractMatr
     check_input(exponential!, (τ, A), expA, alg)
     return map_diagonal!(x -> exp(x * τ), expA, A)
 end
+
+# Taylor logic
+# ------------
+function exponential!(A::AbstractMatrix, expA::AbstractMatrix, alg::MatrixFunctionViaTaylor)
+    check_input(exponential!, A, expA, alg)
+    T = eltype(A)
+    R = real(T)
+    tol = convert(R, get(alg.kwargs, :tol, eps(R)))
+
+    dobalance = get(alg.kwargs, :balance, false)
+    scale = dobalance ? balance!(A)[2] : nothing
+
+    θ = LinearAlgebra.opnorm(A, 1)
+    iszero(θ) && return one!(expA)
+
+    order, squarings = taylor_order_and_squarings(θ, tol)
+    squarings > 0 && rmul!(A, inv(convert(T, 2)^squarings))
+
+    X = taylor_polynomial(A, order)
+    Y = expA # can reuse this memory
+    for _ in 1:squarings
+        mul!(Y, X, X)
+        X, Y = Y, X
+    end
+
+    if dobalance
+        expA .= scale .* X ./ transpose(scale)
+    else
+        X === expA || copyto!(expA, X)
+    end
+    return expA
+end
+
+# Truncation order `m` and number of squarings `s` minimizing the Paterson–Stockmeyer
+# matrix-multiplication count subject to the Taylor remainder bound `(θ/2ˢ)ᵐ⁺¹/(m+1)! ≤ tol`.
+function taylor_order_and_squarings(θ::Real, tol::Real)
+    log2θ = log2(Float64(θ))
+    log2tol = log2(Float64(tol))
+    log2factorial = 0.0
+    best_order = 1
+    best_squarings = 0
+    best_cost = typemax(Int)
+    for order in 1:100
+        log2factorial += log2(order + 1)
+        squarings = max(0, ceil(Int, log2θ - (log2tol + log2factorial) / (order + 1)))
+        blocksize = ceil(Int, sqrt(order + 1))
+        cost = (blocksize - 1) + (cld(order + 1, blocksize) - 1) + squarings
+        if cost < best_cost
+            best_cost = cost
+            best_order = order
+            best_squarings = squarings
+        end
+    end
+    return best_order, best_squarings
+end
+
+# Evaluate ∑ₖ₌₀ᵐ Aᵏ/k! via the Paterson–Stockmeyer scheme, returning a freshly allocated matrix.
+function taylor_polynomial(A::AbstractMatrix, order::Integer)
+    T = eltype(A)
+    blocksize = ceil(Int, sqrt(order + 1))
+
+    invfactorial = Vector{T}(undef, order + 1)
+    invfactorial[1] = one(T)
+    for k in 1:order
+        invfactorial[k + 1] = invfactorial[k] / k
+    end
+
+    powers = Vector{typeof(A)}(undef, blocksize)
+    powers[1] = A
+    for p in 2:blocksize
+        powers[p] = powers[p - 1] * A
+    end
+
+    result = similar(A)
+    block = similar(A)
+    numblocks = fld(order, blocksize)
+    taylor_block!(result, powers, invfactorial, numblocks, blocksize, order)
+    for j in (numblocks - 1):-1:0
+        mul!(block, result, powers[blocksize])
+        taylor_block!(result, powers, invfactorial, j, blocksize, order)
+        result .+= block
+    end
+    return result
+end
+
+# Sub-polynomial ∑ᵢ₌₀ᵇ⁻¹ c_{jb+i} Aⁱ of degree `blocksize - 1`, written into `out`.
+function taylor_block!(out::AbstractMatrix, powers, invfactorial, j::Integer, blocksize::Integer, order::Integer)
+    base = j * blocksize
+    fill!(out, zero(eltype(out)))
+    diagview(out) .= invfactorial[base + 1]
+    for i in 1:(blocksize - 1)
+        k = base + i
+        k > order && break
+        out .+= invfactorial[k + 1] .* powers[i]
+    end
+    return out
+end
diff --git a/src/interface/exponential.jl b/src/interface/exponential.jl
index dd4d00d9..902a4ba4 100644
--- a/src/interface/exponential.jl
+++ b/src/interface/exponential.jl
@@ -24,7 +24,7 @@ Compute the exponential of the square matrix `A` or `τ * A`,
 # -------------------
 default_exponential_algorithm(A; kwargs...) = default_exponential_algorithm(typeof(A); kwargs...)
 function default_exponential_algorithm(T::Type; kwargs...)
-    return MatrixFunctionViaLA(; kwargs...)
+    return MatrixFunctionViaTaylor(; kwargs...)
 end
 function default_exponential_algorithm(::Type{T}; kwargs...) where {T <: Diagonal}
     return DiagonalAlgorithm(; kwargs...)
diff --git a/src/interface/matrixfunctions.jl b/src/interface/matrixfunctions.jl
index 8474f9bc..35ad4407 100644
--- a/src/interface/matrixfunctions.jl
+++ b/src/interface/matrixfunctions.jl
@@ -8,6 +8,18 @@ Algorithm type to denote finding the exponential of `A` via the implementation o
 """
 @algdef MatrixFunctionViaLA
 
+"""
+    MatrixFunctionViaTaylor(; tol=eps, balance=true)
+
+Algorithm type to denote finding the exponential of `A` through a pure-Julia scaling-and-squaring
+evaluation of its Taylor series, following Fasi & Higham (2018).
+The truncation order and the number of squarings are chosen to reach a relative accuracy `tol`,
+and the Taylor polynomial is evaluated with the Paterson–Stockmeyer scheme.
+When `balance` is `true`, `A` is first balanced by a diagonal similarity.
+As this algorithm requires no LAPACK support, it also applies at arbitrary precision.
+"""
+@algdef MatrixFunctionViaTaylor
+
 """
     MatrixFunctionViaEigh(eigh_alg)
 
diff --git a/test/exponential.jl b/test/exponential.jl
index ad7628c3..c71c1edc 100644
--- a/test/exponential.jl
+++ b/test/exponential.jl
@@ -5,6 +5,7 @@ using StableRNGs
 using MatrixAlgebraKit: diagview
 using LinearAlgebra
 using LinearAlgebra: exp
+using CUDA, AMDGPU
 
 BLASFloats = (Float32, Float64, ComplexF32, ComplexF64)
 GenericFloats = (Float16, ComplexF16, BigFloat, Complex{BigFloat})
@@ -21,7 +22,7 @@ GenericFloats = (Float16, ComplexF16, BigFloat, Complex{BigFloat})
     @test expA ≈ expA2
     @test A == Ac
 
-    algs = (MatrixFunctionViaLA(), MatrixFunctionViaEig(LAPACK_Simple()))
+    algs = (MatrixFunctionViaLA(), MatrixFunctionViaEig(LAPACK_Simple()), MatrixFunctionViaTaylor())
     @testset "algorithm $alg" for alg in algs
         expA2 = @constinferred exponential(A, alg)
         @test expA ≈ expA2
@@ -46,7 +47,7 @@ end
     @test expAτ ≈ expAτ2
     @test A == Ac
 
-    algs = (MatrixFunctionViaLA(), MatrixFunctionViaEig(LAPACK_Simple()))
+    algs = (MatrixFunctionViaLA(), MatrixFunctionViaEig(LAPACK_Simple()), MatrixFunctionViaTaylor())
     @testset "algorithm $alg" for alg in algs
         expAτ2 = @constinferred exponential((τ, A), alg)
         @test expAτ ≈ expAτ2
@@ -86,3 +87,60 @@ end
     @test expAτ ≈ expAτ2
     @test A == Ac
 end
+
+@testset "exponential via Taylor for T = $T" for T in GenericFloats
+    rng = StableRNG(123)
+    m = 54
+
+    A = randn(rng, T, m, m) ./ (2 * m)
+    τ = randn(rng, T)
+    Ac = copy(A)
+    alg = MatrixFunctionViaTaylor()
+    reltol = max(1.0e-7, sqrt(eps(real(T))))
+
+    expA = @constinferred exponential(A, alg)
+    @test A == Ac
+    @test isapprox(expA * exponential(-A, alg), I; rtol = reltol)
+    @test isapprox(ComplexF64.(expA), exp(ComplexF64.(A)); rtol = reltol)
+
+    expτA = @constinferred exponential((τ, A), alg)
+    @test isapprox(ComplexF64.(expτA), exp(ComplexF64(τ) .* ComplexF64.(A)); rtol = reltol)
+end
+
+# GPU tests
+# ---------
+# The Taylor exponential is backend-generic, so the same code runs on GPU. Compare device
+# results against the CPU reference, exercising both `balance` settings (fix 1 & 3) and the
+# scaled `(τ, A)` entrypoint. A badly-scaled matrix exercises the balancing path. If any step
+# fell back to scalar indexing these would error under GPUArrays' scalar-indexing guard.
+function test_exponential_gpu(ArrayT, T)
+    rng = StableRNG(123)
+    m = 54
+    A = randn(rng, T, m, m) ./ (2 * m)
+    τ = randn(rng, T)
+
+    # badly-scaled similarity transform Aᵢⱼ ← Aᵢⱼ sᵢ / sⱼ, to give balancing work to do
+    s = exp10.(range(-real(T)(3), real(T)(3), length = m))
+    Abad = A .* s ./ transpose(s)
+
+    for M in (A, Abad)
+        M_gpu = ArrayT(M)
+        for alg in (MatrixFunctionViaTaylor(), MatrixFunctionViaTaylor(; balance = false))
+            @test Array(exponential(M_gpu, alg)) ≈ exponential(M, alg)
+            @test Array(exponential((τ, M_gpu), alg)) ≈ exponential((τ, M), alg)
+        end
+    end
+    return nothing
+end
+
+if CUDA.functional()
+    @testset "exponential on CUDA for T = $T" for T in BLASFloats
+        test_exponential_gpu(CuArray, T)
+    end
+end
+
+if AMDGPU.functional()
+    @testset "exponential on AMDGPU for T = $T" for T in BLASFloats
+        test_exponential_gpu(ROCArray, T)
+    end
+end