diff --git a/benchmark/Project.toml b/benchmark/Project.toml new file mode 100644 index 00000000..4e3050c2 --- /dev/null +++ b/benchmark/Project.toml @@ -0,0 +1,11 @@ +[deps] +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" +DoubleFloats = "497a8b3b-efae-58df-a0af-a86822472b78" +GenericLinearAlgebra = "14197337-ba66-59df-a3e3-ca00e7dcff7a" +GenericSchur = "c145ed77-6b09-5dd9-b285-bf645a82121e" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +MatrixAlgebraKit = "6c742aac-3347-4629-af66-fc926824e5e4" +Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" diff --git a/benchmark/README.md b/benchmark/README.md new file mode 100644 index 00000000..123aed36 --- /dev/null +++ b/benchmark/README.md @@ -0,0 +1,47 @@ +# Matrix exponential benchmarks + +`bench_exponential.jl` times and accuracy-tests every applicable `MatrixAlgebraKit.exponential` +algorithm (`MatrixFunctionViaTaylor`, `MatrixFunctionViaLA`, `MatrixFunctionViaEig`, +`MatrixFunctionViaEigh`) for `Float64`, `ComplexF64`, `Double64`, `Complex{Double64}`, +`BigFloat` and `Complex{BigFloat}`. +The `Double64` (double-double, ~106-bit) types sit between the hardware BLAS floats and +arbitrary-precision `BigFloat`, and exercise the `Eig`/`Eigh` paths through the generic +GenericSchur / GenericLinearAlgebra drivers (the `LA` LAPACK path applies to BLAS types only). + +Accuracy inputs are built from a *known* spectrum as `A = V · Diag(λ) · V⁻¹`, so the exact +`exp(A) = V · Diag(exp λ) · V⁻¹` is available analytically (computed at high precision). +Eigenvalue families (`small`, `wide`, `imaginary`, `stiff`, `illconditioned`, `hermitian`) +and eigenvector families (`unitary`, `general`, `illconditioned`) are chosen to stress or +favor particular algorithms. + +## Setup (once) + +From the repository root: + +```sh +julia --project=benchmark -e 'using Pkg; Pkg.develop(path=pwd()); Pkg.instantiate()' +``` + +This `dev`s the in-tree `MatrixAlgebraKit` into the benchmark's own environment. + +## Run + +```sh +julia --project=benchmark benchmark/bench_exponential.jl # full run +julia --project=benchmark benchmark/bench_exponential.jl --quick # fast smoke run +julia --project=benchmark benchmark/bench_exponential.jl --csv out.csv +``` + +Results are printed as per-type accuracy and timing tables and written to two CSVs derived +from the `--csv` path (default base `benchmark/results.csv`): `*-accuracy.csv` and +`*-timing.csv`. They are separate files because the accuracy and timing passes report +disjoint quantities — a single combined table would leave the timing columns empty on the +accuracy rows and vice-versa. + +## Accuracy columns + +- `analytic_err` — relative error vs the analytic `V·Diag(exp λ)·V⁻¹` reference (primary). +- `recomp_err` — relative error vs an independent high-precision `exp` of the exact input. +- `expexp_err` — `‖exp(A)·exp(−A) − I‖₁` (reference-free consistency check). +- `det_err` — relative error of `det(exp A)` against `exp(tr A)`. +- `condV` — designed condition number of the eigenvector basis (context for non-normal cases). diff --git a/benchmark/bench_exponential.jl b/benchmark/bench_exponential.jl new file mode 100644 index 00000000..93cf8a50 --- /dev/null +++ b/benchmark/bench_exponential.jl @@ -0,0 +1,509 @@ +# Benchmark and accuracy harness for `MatrixAlgebraKit.exponential`. +# +# For each element type it times every applicable exponential algorithm over a range of +# matrix sizes, and measures each algorithm's accuracy on matrices with a *known* spectrum. +# +# Accuracy inputs are built as `A = V · Diag(λ) · V⁻¹`, so the exact exponential +# `exp(A) = V · Diag(exp λ) · V⁻¹` is known analytically. The eigenvalues `λ` are drawn +# from families chosen to help or hurt particular algorithms (small/wide/imaginary/stiff/ +# ill-conditioned), and the eigenvectors `V` are either unitary (→ normal / Hermitian `A`) +# or a general basis with a prescribed condition number (→ non-normal `A`). The analytic +# reference is formed at high precision (`Complex{BigFloat}` at `HIGH_BITS` bits). +# +# Usage: +# julia --project=benchmark benchmark/bench_exponential.jl [--quick] [--csv path] +# +# --quick small sizes / short timing budget, for a fast smoke run +# --csv PATH write the flat results table to PATH (default: benchmark/results.csv) + +using LinearAlgebra +using LinearAlgebra: BLAS +using Random +using Printf +using DelimitedFiles +using StableRNGs +using BenchmarkTools +using DoubleFloats +using MatrixAlgebraKit +using MatrixAlgebraKit: MatrixFunctionViaTaylor, MatrixFunctionViaLA, + MatrixFunctionViaEig, MatrixFunctionViaEigh, QRIteration, GS, GLA +# Loading these makes the generic (BigFloat / Double64) eig / eigh drivers available. +using GenericSchur +using GenericLinearAlgebra + +# ---------------------------------------------------------------------------------------- +# Configuration +# ---------------------------------------------------------------------------------------- +const QUICK = "--quick" in ARGS +const CSV_PATH = let i = findfirst(==("--csv"), ARGS) + i === nothing ? joinpath(@__DIR__, "results.csv") : ARGS[i + 1] +end + +# Ordered by increasing precision: hardware BLAS floats, then double-double (~106 bits, +# software but fast), then arbitrary-precision BigFloat. +const ELTYPES = ( + Float64, ComplexF64, Double64, Complex{Double64}, BigFloat, Complex{BigFloat}, +) + +_isblas(::Type{T}) where {T} = T <: Union{Float32, Float64, ComplexF32, ComplexF64} +_isdouble(::Type{T}) where {T} = T <: Union{Double64, Complex{Double64}} + +# Sizes for the timing sweep (generic types are far slower, so use fewer / smaller). +# Double64 is software but only ~10-30× slower than BLAS, so it gets an intermediate range; +# BigFloat is far slower still. +blas_sizes() = QUICK ? [8, 16, 32] : [8, 16, 32, 64, 128, 256] +double_sizes() = QUICK ? [8, 16] : [8, 16, 32, 64, 128] +generic_sizes() = QUICK ? [8, 16] : [8, 16, 32, 64] +perf_sizes(T) = _isblas(T) ? blas_sizes() : (_isdouble(T) ? double_sizes() : generic_sizes()) + +# Sizes for the accuracy sweep (kept small: high-precision reference is expensive). +acc_sizes() = QUICK ? [12] : [16, 48] + +# Timing budget per (matrix, algorithm), in seconds. +perf_seconds(T) = QUICK ? 0.1 : (_isblas(T) ? 0.3 : 0.8) + +# Precision of the analytic reference and its tolerance (must beat the tested precision). +const HIGH_BITS = QUICK ? 512 : 1024 +const REF_TOL = QUICK ? big"1e-50" : big"1e-90" + +# Designed condition number of the eigenvector basis `V` per family. +const GENCOND = 1.0e2 +const ILLCOND = 1.0e8 + +# Column order for the printed tables. +const ALG_ORDER = ["Taylor", "LA", "Eig", "Eigh"] + +# Accuracy cases: (eigenvalue family, eigenvector family). +const ACC_CASES = [ + (:hermitian, :unitary), # real spectrum, unitary V -> Hermitian / symmetric + (:small, :unitary), # tiny normal matrix -> few squarings + (:imaginary, :unitary), # oscillatory normal matrix + (:wide, :unitary), # large-norm normal matrix -> many squarings + (:stiff, :general), # widely varying real parts -> over/underflow + (:wide, :general), # non-normal, large norm + (:small, :illconditioned), # near-defective basis -> eig-path cancellation +] + +# Performance cases: (label, eigenvalue family, eigenvector family). +const PERF_CASES = [ + ("general", :wide, :general), # non-normal, non-trivial norm + ("hermitian", :hermitian, :unitary), # exercises the Eigh path +] + +# ---------------------------------------------------------------------------------------- +# Random matrix helpers (support Complex{BigFloat}, which `randn` does not directly) +# ---------------------------------------------------------------------------------------- +_randn(rng, ::Type{T}, dims...) where {T <: Real} = randn(rng, T, dims...) +function _randn(rng, ::Type{Complex{T}}, dims...) where {T <: Real} + return randn(rng, T, dims...) .+ im .* randn(rng, T, dims...) +end + +unitary_matrix(rng, ::Type{RT}, n) where {RT} = Matrix(qr(_randn(rng, RT, n, n)).Q) + +# General basis with singular values geometrically spread from 1 to `targetcond`. +function general_matrix(rng, ::Type{RT}, n, targetcond) where {RT} + U = unitary_matrix(rng, RT, n) + W = unitary_matrix(rng, RT, n) + s = exp.(range(0.0, log(targetcond); length = n)) + S = Diagonal(real(RT).(s)) + return U * S * W +end + +# ---------------------------------------------------------------------------------------- +# Spectra +# ---------------------------------------------------------------------------------------- +function sample_eig(family, rng)::ComplexF64 + if family === :small + return 0.15 * randn(rng) + 0.15im * (0.2 + abs(randn(rng))) + elseif family === :wide + r = exp(log(0.05) + (log(30.0) - log(0.05)) * rand(rng)) + θ = 0.1 + (π - 0.2) * rand(rng) + return r * cis(θ) + elseif family === :imaginary + return im * (0.5 + 8.0 * rand(rng)) + elseif family === :stiff + return (-40.0 + 44.0 * rand(rng)) + im * (0.3 + 0.5 * abs(randn(rng))) + elseif family === :illconditioned + return 0.5 * randn(rng) + im * (0.2 + 0.3 * abs(randn(rng))) + else + error("unknown eigenvalue family $family") + end +end + +# A conjugate-closed spectrum of length `n` (so a real matrix with this spectrum exists). +function make_spectrum(family, n, rng)::Vector{ComplexF64} + if family === :hermitian + return complex.(2 .* randn(rng, n)) + end + half = fld(n, 2) + μ = ComplexF64[] + for _ in 1:half + z = sample_eig(family, rng) + push!(μ, z) + push!(μ, conj(z)) + end + if isodd(n) + push!(μ, complex(real(sample_eig(family, rng)))) + end + return μ +end + +# Real block form of a conjugate-closed spectrum, together with its analytic exponential. +# Real eigenvalues sit on the diagonal; a complex pair a±bi becomes a 2×2 block [a b; -b a]. +# Must be called inside a `setprecision` block so the BigFloat ops use `HIGH_BITS`. +function real_canonical(spectrum::Vector{ComplexF64}) + tol = 1.0e-9 + reals = BigFloat[] + pairs = Tuple{BigFloat, BigFloat}[] + for z in spectrum + if abs(imag(z)) < tol + push!(reals, BigFloat(real(z))) + elseif imag(z) > 0 + push!(pairs, (BigFloat(real(z)), BigFloat(imag(z)))) + end + end + n = length(reals) + 2 * length(pairs) + D = zeros(BigFloat, n, n) + E = zeros(BigFloat, n, n) + i = 1 + for a in reals + D[i, i] = a + E[i, i] = exp(a) + i += 1 + end + for (a, b) in pairs + D[i, i] = a + D[i, i + 1] = b + D[i + 1, i] = -b + D[i + 1, i + 1] = a + ea, cb, sb = exp(a), cos(b), sin(b) + E[i, i] = ea * cb + E[i, i + 1] = ea * sb + E[i + 1, i] = -ea * sb + E[i + 1, i + 1] = ea * cb + i += 2 + end + return D, E +end + +# ---------------------------------------------------------------------------------------- +# Case construction: returns the input matrix at type `T` and the analytic reference. +# ---------------------------------------------------------------------------------------- +function build_case(::Type{T}, n, eigfam, evecfam, rng) where {T} + RT_hi = (T <: Complex) ? Complex{BigFloat} : BigFloat + targetcond = evecfam === :illconditioned ? ILLCOND : + (evecfam === :general ? GENCOND : 1.0) + spectrum = make_spectrum(eigfam, n, rng) + + A_hi, expA_hi = setprecision(BigFloat, HIGH_BITS) do + V = evecfam === :unitary ? unitary_matrix(rng, RT_hi, n) : + general_matrix(rng, RT_hi, n, targetcond) + Vi = inv(V) + if T <: Complex + λ = Complex{BigFloat}.(spectrum) + A = V * Diagonal(λ) * Vi + expA = V * Diagonal(exp.(λ)) * Vi + return Complex{BigFloat}.(A), Complex{BigFloat}.(expA) + else + D, E = real_canonical(spectrum) + A = V * D * Vi + expA = V * E * Vi + return A, Complex{BigFloat}.(expA) + end + end + + # Round the input down to the tested precision (default BigFloat precision for generic T). + A_T = (T <: Complex) ? Complex{real(T)}.(A_hi) : real(T).(A_hi) + herm = (eigfam === :hermitian && evecfam === :unitary) + herm && (A_T = (A_T + A_T') / 2) # enforce exact Hermiticity for the Eigh path + return (A = Matrix(A_T), ref = expA_hi, condV = targetcond, hermitian = herm) +end + +# ---------------------------------------------------------------------------------------- +# Algorithm selection per input +# ---------------------------------------------------------------------------------------- +_trydefault(f, A) = try + f(typeof(A)) +catch + nothing +end + +# The library only registers default eig / eigh algorithms for BLAS and BigFloat element +# types, but GenericSchur / GenericLinearAlgebra actually drive any `AbstractFloat`. For +# types without a registered default (e.g. Double64) fall back to those drivers directly, +# using the same `QRIteration` the BigFloat default resolves to. +function eig_algorithm_for(A) + eig = _trydefault(MatrixAlgebraKit.default_eig_algorithm, A) + return eig !== nothing ? eig : QRIteration(; driver = GS()) +end +function eigh_algorithm_for(A) + eigh = _trydefault(MatrixAlgebraKit.default_eigh_algorithm, A) + return eigh !== nothing ? eigh : QRIteration(; driver = GLA()) +end + +function algorithms_for(A, hermitian::Bool) + T = eltype(A) + algs = Tuple{String, Any}[("Taylor", MatrixFunctionViaTaylor())] + if T <: LinearAlgebra.BlasFloat + push!(algs, ("LA", MatrixFunctionViaLA())) + end + eig = eig_algorithm_for(A) + eig !== nothing && push!(algs, ("Eig", MatrixFunctionViaEig(eig))) + if hermitian + eigh = eigh_algorithm_for(A) + eigh !== nothing && push!(algs, ("Eigh", MatrixFunctionViaEigh(eigh))) + end + return algs +end + +# ---------------------------------------------------------------------------------------- +# Accuracy metrics +# ---------------------------------------------------------------------------------------- +function relerr(X, ref) + return setprecision(BigFloat, HIGH_BITS) do + Xc = Complex{BigFloat}.(X) + Float64(opnorm(Xc .- ref, 1) / opnorm(ref, 1)) + end +end + +# Independent high-precision recomputation of exp of the *actual* input, so algorithms are +# also compared to a common reference for the exact matrix they were handed. +function highprec_exp(A_T) + return setprecision(BigFloat, HIGH_BITS) do + Ahi = Complex{BigFloat}.(A_T) + exponential(Ahi, MatrixFunctionViaTaylor(; tol = REF_TOL, balance = true)) + end +end + +# Reference-free consistency checks. +function expexp_err(A, alg) + E = exponential(A, alg) + Em = exponential(-A, alg) + return Float64(opnorm(E * Em - I, 1)) +end +function det_err(A, E) + return try + d = det(E) + target = exp(tr(A)) + Float64(abs(d - target) / abs(target)) + catch + NaN + end +end + +# ---------------------------------------------------------------------------------------- +# Result rows +# ---------------------------------------------------------------------------------------- +mutable struct Row + pass::String + size::Int + eltype::String + eigfam::String + evecfam::String + alg::String + time_s::Float64 + allocs::Float64 + analytic_err::Float64 + recomp_err::Float64 + expexp_err::Float64 + det_err::Float64 + condV::Float64 + norm1::Float64 + status::String +end + +const ROWS = Row[] + +# ---------------------------------------------------------------------------------------- +# Passes +# ---------------------------------------------------------------------------------------- +function run_accuracy!() + println("\n", "="^80) + println("ACCURACY PASS (analytic reference at $HIGH_BITS bits)") + println("="^80) + for T in ELTYPES + for n in acc_sizes() + for (eigfam, evecfam) in ACC_CASES + rng = StableRNG(123) + case = build_case(T, n, eigfam, evecfam, rng) + A = case.A + norm1 = Float64(opnorm(A, 1)) + recomp = highprec_exp(A) + for (name, alg) in algorithms_for(A, case.hermitian) + row = Row( + "acc", n, string(T), string(eigfam), string(evecfam), name, + NaN, NaN, NaN, NaN, NaN, NaN, case.condV, norm1, "ok", + ) + try + E = exponential(A, alg) + row.analytic_err = relerr(E, case.ref) + row.recomp_err = relerr(E, recomp) + row.expexp_err = expexp_err(A, alg) + row.det_err = det_err(A, E) + catch e + row.status = "error: $(sprint(showerror, e))" + end + push!(ROWS, row) + end + end + end + end + return +end + +function run_performance!() + println("\n", "="^80) + println("PERFORMANCE PASS") + println("="^80) + for T in ELTYPES + sec = perf_seconds(T) + BenchmarkTools.DEFAULT_PARAMETERS.seconds = sec + for n in perf_sizes(T) + for (label, eigfam, evecfam) in PERF_CASES + rng = StableRNG(123) + case = build_case(T, n, eigfam, evecfam, rng) + A = case.A + norm1 = Float64(opnorm(A, 1)) + for (name, alg) in algorithms_for(A, case.hermitian) + row = Row( + "perf", n, string(T), label, string(evecfam), name, + NaN, NaN, NaN, NaN, NaN, NaN, case.condV, norm1, "ok", + ) + try + b = @benchmark exponential($A, $alg) + tr_ = minimum(b) + row.time_s = tr_.time / 1.0e9 + row.allocs = Float64(tr_.allocs) + catch e + row.status = "error: $(sprint(showerror, e))" + end + push!(ROWS, row) + end + end + end + end + return +end + +# ---------------------------------------------------------------------------------------- +# Reporting +# ---------------------------------------------------------------------------------------- +fmt_time(x) = isnan(x) ? " - " : ( + x < 1.0e-3 ? @sprintf("%7.1fµs", x * 1.0e6) : + x < 1.0 ? @sprintf("%7.2fms", x * 1.0e3) : @sprintf("%7.3fs ", x) + ) +fmt_err(x) = isnan(x) ? " - " : @sprintf("%9.2e", x) + +function print_perf_tables() + for T in ELTYPES + for (label, _, _) in PERF_CASES + rows = filter(r -> r.pass == "perf" && r.eltype == string(T) && r.eigfam == label, ROWS) + isempty(rows) && continue + algs = filter(a -> any(r -> r.alg == a, rows), ALG_ORDER) + println("\n[$T] timing — $label matrix (min time per call)") + @printf(" %5s", "n") + for a in algs + @printf(" %10s", a) + end + println() + for n in sort(unique(r.size for r in rows)) + @printf(" %5d", n) + for a in algs + r = findfirst(x -> x.size == n && x.alg == a, rows) + @printf(" %10s", r === nothing ? "-" : strip(fmt_time(rows[r].time_s))) + end + println() + end + end + end + return +end + +function print_acc_tables() + for T in ELTYPES + rows = filter(r -> r.pass == "acc" && r.eltype == string(T), ROWS) + isempty(rows) && continue + algs = filter(a -> any(r -> r.alg == a, rows), ALG_ORDER) + println("\n[$T] accuracy — relative error vs analytic reference") + @printf(" %-26s %5s", "case (eig/evec)", "n") + for a in algs + @printf(" %9s", a) + end + println() + for n in sort(unique(r.size for r in rows)) + for (eigfam, evecfam) in ACC_CASES + sel = filter(r -> r.size == n && r.eigfam == string(eigfam) && r.evecfam == string(evecfam), rows) + isempty(sel) && continue + @printf(" %-26s %5d", "$eigfam/$evecfam", n) + for a in algs + r = findfirst(x -> x.alg == a, sel) + @printf(" %9s", r === nothing ? "-" : strip(fmt_err(sel[r].analytic_err))) + end + println() + end + end + end + return +end + +# The accuracy and timing passes produce disjoint columns, so they are written to two +# separate files rather than one flat table with half the columns empty. +function _write_csv(path, header, rows, cols) + return open(path, "w") do io + writedlm(io, permutedims(header), ',') + for r in rows + writedlm(io, permutedims(Any[getfield(r, c) for c in cols]), ',') + end + end +end + +function write_csv(path) + base, ext = splitext(path) + isempty(ext) && (ext = ".csv") + acc_path = base * "-accuracy" * ext + perf_path = base * "-timing" * ext + + acc = filter(r -> r.pass == "acc", ROWS) + perf = filter(r -> r.pass == "perf", ROWS) + + _write_csv( + acc_path, + [ + "size", "eltype", "eig_family", "evec_family", "algorithm", + "analytic_err", "recomp_err", "expexp_err", "det_err", "condV", "norm1", "status", + ], + acc, + ( + :size, :eltype, :eigfam, :evecfam, :alg, + :analytic_err, :recomp_err, :expexp_err, :det_err, :condV, :norm1, :status, + ), + ) + _write_csv( + perf_path, + [ + "size", "eltype", "case", "evec_family", "algorithm", + "time_s", "allocs", "condV", "norm1", "status", + ], + perf, + (:size, :eltype, :eigfam, :evecfam, :alg, :time_s, :allocs, :condV, :norm1, :status), + ) + println("\nWrote $(length(acc)) accuracy rows to $acc_path") + return println("Wrote $(length(perf)) timing rows to $perf_path") +end + +# ---------------------------------------------------------------------------------------- +# Main +# ---------------------------------------------------------------------------------------- +function main() + # Single-threaded BLAS for a clean, reproducible algorithm-to-algorithm comparison. + BLAS.set_num_threads(1) + @info "matrix-exponential benchmark" QUICK HIGH_BITS blas_threads = BLAS.get_num_threads() types = ELTYPES + run_accuracy!() + run_performance!() + print_acc_tables() + print_perf_tables() + write_csv(CSV_PATH) + return nothing +end + +main() diff --git a/docs/src/user_interface/algorithms.md b/docs/src/user_interface/algorithms.md index e6030787..d90b7607 100644 --- a/docs/src/user_interface/algorithms.md +++ b/docs/src/user_interface/algorithms.md @@ -102,6 +102,7 @@ The following algorithms for matrix functions are available. | Algorithm | Applicable matrix functions | Key keyword arguments | |:----------|:--------------------------|:----------------------| +| [`MatrixFunctionViaTaylor`](@ref) | exponential | `tol`, `balance` | | [`MatrixFunctionViaLA`](@ref) | exponential | | | [`MatrixFunctionViaEig`](@ref) | exponential | `eig_alg` | | [`MatrixFunctionViaEigh`](@ref) | exponential | `eigh_alg` | diff --git a/docs/src/user_interface/matrix_functions.md b/docs/src/user_interface/matrix_functions.md index 20564773..6153f2e6 100644 --- a/docs/src/user_interface/matrix_functions.md +++ b/docs/src/user_interface/matrix_functions.md @@ -23,14 +23,16 @@ Additionally, the `f!` method typically assumes that it is allowed to destroy th ## Exponential The [exponential](https://en.wikipedia.org/wiki/Matrix_exponential) of a square matrix `A` is used in many scientific applications, as it arises in the solution of an autonomous linear differential equation. -An implementation for the matrix exponential based on a Padé approximation is available in `LinearAlgebra`, and can be accessed by the algorithm [`MatrixFunctionViaLA`](@ref). -For more generic data types, the exponential can be calculated by first calculating the (hermitian) eigenvalue decomposition, and then computing -the scalar exponential of the diagonal elements. +The default algorithm [`MatrixFunctionViaTaylor`](@ref) is a pure-Julia scaling-and-squaring evaluation of the Taylor series. +As it requires no LAPACK support, it also applies to generic data types at arbitrary precision. +Alternatively, an implementation based on a Padé approximation is available in `LinearAlgebra`, and can be accessed by the algorithm [`MatrixFunctionViaLA`](@ref). +The exponential can also be calculated by first calculating the (hermitian) eigenvalue decomposition, and then computing the scalar exponential of the diagonal elements. This strategy is implemented via the algorithms [`MatrixFunctionViaEig`](@ref) and [`MatrixFunctionViaEigh`](@ref), and call `eig_full` and `eigh_full`, respectively. Additionally, in order to calculate `exp(τ * A)`, the function `exponential` can be called with `(τ, A)`, using the same algorithms as before. ```@docs; canonical=false exponential +MatrixAlgebraKit.MatrixFunctionViaTaylor MatrixAlgebraKit.MatrixFunctionViaLA MatrixAlgebraKit.MatrixFunctionViaEig MatrixAlgebraKit.MatrixFunctionViaEigh diff --git a/src/MatrixAlgebraKit.jl b/src/MatrixAlgebraKit.jl index 7b9d5ef1..4d4e0084 100644 --- a/src/MatrixAlgebraKit.jl +++ b/src/MatrixAlgebraKit.jl @@ -41,7 +41,7 @@ export LAPACK_HouseholderQR, LAPACK_HouseholderLQ, LAPACK_Simple, LAPACK_Expert, export GLA_HouseholderQR, GLA_QRIteration, GS_QRIteration export LQViaTransposedQR export PolarViaSVD, PolarNewton -export MatrixFunctionViaLA, MatrixFunctionViaEig, MatrixFunctionViaEigh +export MatrixFunctionViaLA, MatrixFunctionViaEig, MatrixFunctionViaEigh, MatrixFunctionViaTaylor export DefaultAlgorithm export DiagonalAlgorithm export NativeBlocked @@ -94,6 +94,7 @@ include("common/safemethods.jl") include("common/view.jl") include("common/regularinv.jl") include("common/matrixproperties.jl") +include("common/balancing.jl") include("common/utility.jl") include("yalapack.jl") diff --git a/src/common/balancing.jl b/src/common/balancing.jl new file mode 100644 index 00000000..8f0a0dac --- /dev/null +++ b/src/common/balancing.jl @@ -0,0 +1,54 @@ +""" + balance!(A::AbstractMatrix; radix=2, maxiter=100) -> A, scale + +Balance the square matrix `A` in place through a diagonal similarity `A ← D⁻¹ A D` that +reduces its norm. Each sweep computes, for every index at once, the power-of-`radix` factor +that best equalizes the off-diagonal row and column norms (a simultaneous, Osborne-style +variant of the Parlett–Reinsch scaling), and applies all factors together; sweeps repeat +until no index changes or `maxiter` is reached. Because it is expressed entirely through +reductions and broadcasts (no scalar indexing), the same code runs on CPU and GPU arrays. + +The returned `scale` holds the diagonal of `D`, so that a matrix function of the original +input can be recovered from a matrix function `f` of the balanced matrix through +`D f(D⁻¹AD) D⁻¹`, i.e. `expA[i, j] = scale[i] * f(B)[i, j] / scale[j]`. +""" +function balance!(A::AbstractMatrix{T}; radix::Integer = 2, maxiter::Integer = 100) where {T} + n = LinearAlgebra.checksquare(A) + R = real(T) + β = convert(R, radix) + logβ = log(β) + scale = fill!(similar(A, R, n), one(R)) + + colnorm = similar(A, R, n) + rownorm = similar(A, R, n) + f = similar(A, R, n) + colsum = reshape(colnorm, 1, n) + rowsum = reshape(rownorm, n, 1) + d = abs.(diagview(A)) + fᵀ = transpose(f) + + for _ in 1:maxiter + fill!(colsum, zero(R)) + Base.mapreducedim!(abs, +, colsum, A) + fill!(rowsum, zero(R)) + Base.mapreducedim!(abs, +, rowsum, A) + colnorm .-= d + rownorm .-= d + f .= _balance_factor.(colnorm, rownorm, β, logβ) + all(isone, f) && break + # apply Aᵢⱼ ← Aᵢⱼ fⱼ / fᵢ (i.e. column j scaled by fⱼ, row i by 1/fᵢ) and accumulate. + A .= A .* fᵀ ./ f + scale .*= f + end + + return A, scale +end + +# Nearest power-of-`radix` factor `f` that equalizes the scaled off-diagonal norms +# `colnorm·f` and `rownorm/f`, kept only when it reduces their sum (avoids oscillation and +# leaves degenerate rows/columns untouched). +@inline function _balance_factor(colnorm::R, rownorm::R, β::R, logβ::R) where {R} + (colnorm > 0 && rownorm > 0) || return one(R) + f = β^round(log(rownorm / colnorm) / (2 * logβ)) + return (colnorm * f + rownorm / f) < convert(R, 0.95) * (colnorm + rownorm) ? f : one(R) +end diff --git a/src/implementations/exponential.jl b/src/implementations/exponential.jl index cbaebeed..283d5ce2 100644 --- a/src/implementations/exponential.jl +++ b/src/implementations/exponential.jl @@ -107,3 +107,100 @@ function exponential!((τ, A)::Tuple{Number, AbstractMatrix}, expA::AbstractMatr check_input(exponential!, (τ, A), expA, alg) return map_diagonal!(x -> exp(x * τ), expA, A) end + +# Taylor logic +# ------------ +function exponential!(A::AbstractMatrix, expA::AbstractMatrix, alg::MatrixFunctionViaTaylor) + check_input(exponential!, A, expA, alg) + T = eltype(A) + R = real(T) + tol = convert(R, get(alg.kwargs, :tol, eps(R))) + + dobalance = get(alg.kwargs, :balance, false) + scale = dobalance ? balance!(A)[2] : nothing + + θ = LinearAlgebra.opnorm(A, 1) + iszero(θ) && return one!(expA) + + order, squarings = taylor_order_and_squarings(θ, tol) + squarings > 0 && rmul!(A, inv(convert(T, 2)^squarings)) + + X = taylor_polynomial(A, order) + Y = expA # can reuse this memory + for _ in 1:squarings + mul!(Y, X, X) + X, Y = Y, X + end + + if dobalance + expA .= scale .* X ./ transpose(scale) + else + X === expA || copyto!(expA, X) + end + return expA +end + +# Truncation order `m` and number of squarings `s` minimizing the Paterson–Stockmeyer +# matrix-multiplication count subject to the Taylor remainder bound `(θ/2ˢ)ᵐ⁺¹/(m+1)! ≤ tol`. +function taylor_order_and_squarings(θ::Real, tol::Real) + log2θ = log2(Float64(θ)) + log2tol = log2(Float64(tol)) + log2factorial = 0.0 + best_order = 1 + best_squarings = 0 + best_cost = typemax(Int) + for order in 1:100 + log2factorial += log2(order + 1) + squarings = max(0, ceil(Int, log2θ - (log2tol + log2factorial) / (order + 1))) + blocksize = ceil(Int, sqrt(order + 1)) + cost = (blocksize - 1) + (cld(order + 1, blocksize) - 1) + squarings + if cost < best_cost + best_cost = cost + best_order = order + best_squarings = squarings + end + end + return best_order, best_squarings +end + +# Evaluate ∑ₖ₌₀ᵐ Aᵏ/k! via the Paterson–Stockmeyer scheme, returning a freshly allocated matrix. +function taylor_polynomial(A::AbstractMatrix, order::Integer) + T = eltype(A) + blocksize = ceil(Int, sqrt(order + 1)) + + invfactorial = Vector{T}(undef, order + 1) + invfactorial[1] = one(T) + for k in 1:order + invfactorial[k + 1] = invfactorial[k] / k + end + + powers = Vector{typeof(A)}(undef, blocksize) + powers[1] = A + for p in 2:blocksize + powers[p] = powers[p - 1] * A + end + + result = similar(A) + block = similar(A) + numblocks = fld(order, blocksize) + taylor_block!(result, powers, invfactorial, numblocks, blocksize, order) + for j in (numblocks - 1):-1:0 + mul!(block, result, powers[blocksize]) + taylor_block!(result, powers, invfactorial, j, blocksize, order) + result .+= block + end + return result +end + +# Sub-polynomial ∑ᵢ₌₀ᵇ⁻¹ c_{jb+i} Aⁱ of degree `blocksize - 1`, written into `out`. +function taylor_block!(out::AbstractMatrix, powers, invfactorial, j::Integer, blocksize::Integer, order::Integer) + base = j * blocksize + fill!(out, zero(eltype(out))) + diagview(out) .= invfactorial[base + 1] + for i in 1:(blocksize - 1) + k = base + i + k > order && break + out .+= invfactorial[k + 1] .* powers[i] + end + return out +end diff --git a/src/interface/exponential.jl b/src/interface/exponential.jl index dd4d00d9..902a4ba4 100644 --- a/src/interface/exponential.jl +++ b/src/interface/exponential.jl @@ -24,7 +24,7 @@ Compute the exponential of the square matrix `A` or `τ * A`, # ------------------- default_exponential_algorithm(A; kwargs...) = default_exponential_algorithm(typeof(A); kwargs...) function default_exponential_algorithm(T::Type; kwargs...) - return MatrixFunctionViaLA(; kwargs...) + return MatrixFunctionViaTaylor(; kwargs...) end function default_exponential_algorithm(::Type{T}; kwargs...) where {T <: Diagonal} return DiagonalAlgorithm(; kwargs...) diff --git a/src/interface/matrixfunctions.jl b/src/interface/matrixfunctions.jl index 8474f9bc..35ad4407 100644 --- a/src/interface/matrixfunctions.jl +++ b/src/interface/matrixfunctions.jl @@ -8,6 +8,18 @@ Algorithm type to denote finding the exponential of `A` via the implementation o """ @algdef MatrixFunctionViaLA +""" + MatrixFunctionViaTaylor(; tol=eps, balance=true) + +Algorithm type to denote finding the exponential of `A` through a pure-Julia scaling-and-squaring +evaluation of its Taylor series, following Fasi & Higham (2018). +The truncation order and the number of squarings are chosen to reach a relative accuracy `tol`, +and the Taylor polynomial is evaluated with the Paterson–Stockmeyer scheme. +When `balance` is `true`, `A` is first balanced by a diagonal similarity. +As this algorithm requires no LAPACK support, it also applies at arbitrary precision. +""" +@algdef MatrixFunctionViaTaylor + """ MatrixFunctionViaEigh(eigh_alg) diff --git a/test/exponential.jl b/test/exponential.jl index ad7628c3..c71c1edc 100644 --- a/test/exponential.jl +++ b/test/exponential.jl @@ -5,6 +5,7 @@ using StableRNGs using MatrixAlgebraKit: diagview using LinearAlgebra using LinearAlgebra: exp +using CUDA, AMDGPU BLASFloats = (Float32, Float64, ComplexF32, ComplexF64) GenericFloats = (Float16, ComplexF16, BigFloat, Complex{BigFloat}) @@ -21,7 +22,7 @@ GenericFloats = (Float16, ComplexF16, BigFloat, Complex{BigFloat}) @test expA ≈ expA2 @test A == Ac - algs = (MatrixFunctionViaLA(), MatrixFunctionViaEig(LAPACK_Simple())) + algs = (MatrixFunctionViaLA(), MatrixFunctionViaEig(LAPACK_Simple()), MatrixFunctionViaTaylor()) @testset "algorithm $alg" for alg in algs expA2 = @constinferred exponential(A, alg) @test expA ≈ expA2 @@ -46,7 +47,7 @@ end @test expAτ ≈ expAτ2 @test A == Ac - algs = (MatrixFunctionViaLA(), MatrixFunctionViaEig(LAPACK_Simple())) + algs = (MatrixFunctionViaLA(), MatrixFunctionViaEig(LAPACK_Simple()), MatrixFunctionViaTaylor()) @testset "algorithm $alg" for alg in algs expAτ2 = @constinferred exponential((τ, A), alg) @test expAτ ≈ expAτ2 @@ -86,3 +87,60 @@ end @test expAτ ≈ expAτ2 @test A == Ac end + +@testset "exponential via Taylor for T = $T" for T in GenericFloats + rng = StableRNG(123) + m = 54 + + A = randn(rng, T, m, m) ./ (2 * m) + τ = randn(rng, T) + Ac = copy(A) + alg = MatrixFunctionViaTaylor() + reltol = max(1.0e-7, sqrt(eps(real(T)))) + + expA = @constinferred exponential(A, alg) + @test A == Ac + @test isapprox(expA * exponential(-A, alg), I; rtol = reltol) + @test isapprox(ComplexF64.(expA), exp(ComplexF64.(A)); rtol = reltol) + + expτA = @constinferred exponential((τ, A), alg) + @test isapprox(ComplexF64.(expτA), exp(ComplexF64(τ) .* ComplexF64.(A)); rtol = reltol) +end + +# GPU tests +# --------- +# The Taylor exponential is backend-generic, so the same code runs on GPU. Compare device +# results against the CPU reference, exercising both `balance` settings (fix 1 & 3) and the +# scaled `(τ, A)` entrypoint. A badly-scaled matrix exercises the balancing path. If any step +# fell back to scalar indexing these would error under GPUArrays' scalar-indexing guard. +function test_exponential_gpu(ArrayT, T) + rng = StableRNG(123) + m = 54 + A = randn(rng, T, m, m) ./ (2 * m) + τ = randn(rng, T) + + # badly-scaled similarity transform Aᵢⱼ ← Aᵢⱼ sᵢ / sⱼ, to give balancing work to do + s = exp10.(range(-real(T)(3), real(T)(3), length = m)) + Abad = A .* s ./ transpose(s) + + for M in (A, Abad) + M_gpu = ArrayT(M) + for alg in (MatrixFunctionViaTaylor(), MatrixFunctionViaTaylor(; balance = false)) + @test Array(exponential(M_gpu, alg)) ≈ exponential(M, alg) + @test Array(exponential((τ, M_gpu), alg)) ≈ exponential((τ, M), alg) + end + end + return nothing +end + +if CUDA.functional() + @testset "exponential on CUDA for T = $T" for T in BLASFloats + test_exponential_gpu(CuArray, T) + end +end + +if AMDGPU.functional() + @testset "exponential on AMDGPU for T = $T" for T in BLASFloats + test_exponential_gpu(ROCArray, T) + end +end