From 976e8076960dc9c18e699c9d4162553b7fc6bf9c Mon Sep 17 00:00:00 2001
From: Matthew Fishman <mfishman@flatironinstitute.org>
Date: Wed, 1 Jul 2026 13:55:51 -0400
Subject: [PATCH 1/3] Speed up named-tensor broadcasting

## Summary

Reworks elementwise broadcasting on named tensors (`a + b`, `a .+ b`, `c .= a .+ b`) to skip Base's named-axis broadcast machinery and lower directly onto TensorAlgebra's linear-combination path over the backing arrays, aligning operands by dimension name.

A named broadcast previously rebuilt `NamedUnitRange` axes through `combine_axes`/`broadcast_shape` on every call and ran a runtime `promote_op` inference call for the result element type, because a named tensor's `eltype` is not inferrable. A 2x2 add allocated 52 times. Now `instantiate` is a no-op for the named style, the destination names come from `dimnames` of the operands rather than `axes(bc)`, and the unnamed work runs behind a function barrier where the result element type is inferrable. The same 2x2 add drops to 10 allocations (3.19 KiB to 464 B) and is several times faster, and larger dense adds improve as well.

`materialize!` is intercepted for the named style so the in-place path no longer reconstructs the broadcast over `axes(dest)` and re-enters the axis machinery. In-place is now cheaper than out-of-place, as it should be. Aligning an operand to the destination names also drops the previous `aligneddims` round trip.

With the axis machinery off every path, the `combine_axes`/`broadcast_shape`/`promote_shape`/`check_broadcast_shape` overloads and the broadcasted `similar` are removed. `axes` and `similar` on a raw lazy named `Broadcasted` are no longer supported, which nothing relies on: materialization goes through `dimnames`, and the non-linear fused fallback runs on the unnamed broadcast.
---
 Project.toml               |   2 +-
 src/abstractnamedtensor.jl |   7 +-
 src/broadcast.jl           | 203 +++++++++++--------------------------
 src/namedtensoroperator.jl |   6 --
 4 files changed, 69 insertions(+), 149 deletions(-)

diff --git a/Project.toml b/Project.toml
index e8ccfe4..8aa9262 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "ITensorBase"
 uuid = "4795dd04-0d67-49bb-8f44-b89c448a1dc7"
-version = "0.10.0"
+version = "0.10.1"
 authors = ["ITensor developers <support@itensor.org> and contributors"]
 
 [workspace]
diff --git a/src/abstractnamedtensor.jl b/src/abstractnamedtensor.jl
index 581fcb2..d0e00dc 100644
--- a/src/abstractnamedtensor.jl
+++ b/src/abstractnamedtensor.jl
@@ -84,7 +84,12 @@ dimnametype(type::Type{<:AbstractNamedTensor}) = Any
 # Unwrapping the names (named-array interface).
 # TODO: Use `IsNamed` trait?
 unnamed(a::AbstractNamedTensor) = throw(MethodError(unnamed, a))
-unnamed(a::AbstractNamedTensor, inds) = unnamed(aligneddims(a, inds))
+function unnamed(a::AbstractNamedTensor, names)
+    array = unnamed(a)
+    # Fast path: already aligned to `names`, so no permutation and no `getperm` allocation.
+    dimnames(a) == names && return array
+    return permuteddims(array, Tuple(getperm(dimnames(a), names)))
+end
 unname(a::AbstractNamedTensor, inds) = unnamed(aligndims(a, inds))
 
 """
diff --git a/src/broadcast.jl b/src/broadcast.jl
index b857dae..be5114a 100644
--- a/src/broadcast.jl
+++ b/src/broadcast.jl
@@ -1,7 +1,5 @@
-using ..ITensorBase: AbstractNamedTensor, ITensorBase, NamedUnitRange, getperm, inds, name,
-    named, nameddims, unname, unnamed
-using Base.Broadcast: Broadcast as BC, Broadcasted, broadcast_shape, broadcasted,
-    check_broadcast_shape, combine_axes
+using ..ITensorBase: AbstractNamedTensor, ITensorBase, dimnames, named, nameddims, unnamed
+using Base.Broadcast: Broadcast as BC, Broadcasted, broadcasted
 using TensorAlgebra: TensorAlgebra as TA
 
 abstract type AbstractNamedTensorStyle{N} <: BC.AbstractArrayStyle{N} end
@@ -22,103 +20,10 @@ end
 # `AbstractArray`); without this the default `broadcastable` wraps it in a `Ref`.
 BC.broadcastable(a::AbstractNamedTensor) = a
 
-function BC.combine_axes(
-        a1::AbstractNamedTensor, a_rest::AbstractNamedTensor...
-    )
-    return broadcast_shape(axes(a1), combine_axes(a_rest...))
-end
-function BC.combine_axes(a1::AbstractNamedTensor, a2::AbstractNamedTensor)
-    return broadcast_shape(axes(a1), axes(a2))
-end
-BC.combine_axes(a::AbstractNamedTensor) = axes(a)
-
-# The named axes are a `Tuple` of `NamedUnitRange`s. Dispatch the
-# name-aware shape combination on that tuple form (the elements are not
-# `AbstractUnitRange`s, so Base's positional tuple-shape methods do not apply).
-function BC.broadcast_shape(
-        ax1::Tuple{NamedUnitRange, Vararg{NamedUnitRange}},
-        ax2::Tuple{NamedUnitRange, Vararg{NamedUnitRange}},
-        ax_rest::Tuple{NamedUnitRange, Vararg{NamedUnitRange}}...
-    )
-    return broadcast_shape(broadcast_shape(ax1, ax2), ax_rest...)
-end
-
-function BC.broadcast_shape(
-        ax1::Tuple{NamedUnitRange, Vararg{NamedUnitRange}},
-        ax2::Tuple{NamedUnitRange, Vararg{NamedUnitRange}}
-    )
-    return promote_shape(ax1, ax2)
-end
-
-# Handle scalar values.
-function BC.broadcast_shape(
-        ax1::Tuple{}, ax2::Tuple{NamedUnitRange, Vararg{NamedUnitRange}}
-    )
-    return ax2
-end
-function BC.broadcast_shape(
-        ax1::Tuple{NamedUnitRange, Vararg{NamedUnitRange}}, ax2::Tuple{}
-    )
-    return ax1
-end
-
-function Base.promote_shape(
-        ax1::Tuple{NamedUnitRange, Vararg{NamedUnitRange}},
-        ax2::Tuple{NamedUnitRange, Vararg{NamedUnitRange}}
-    )
-    return set_promote_shape(ax1, ax2)
-end
-
-function set_promote_shape(
-        ax1::Tuple{NamedUnitRange, Vararg{NamedUnitRange, N}},
-        ax2::Tuple{NamedUnitRange, Vararg{NamedUnitRange, N}}
-    ) where {N}
-    perm = getperm(ax2, ax1)
-    ax2_aligned = map(i -> ax2[i], perm)
-    ax_promoted = promote_shape(unnamed.(ax1), unnamed.(ax2_aligned))
-    return named.(ax_promoted, name.(ax1))
-end
-
-# Handle operations like `randn() + randn(2, 2)[i, j]``.
-# TODO: Decide if this should be a general definition for `AbstractNamedTensor`,
-# or just for `AbstractNamedTensor`.
-function set_promote_shape(
-        ax1::Tuple{}, ax2::Tuple{NamedUnitRange, Vararg{NamedUnitRange}}
-    )
-    return ax2
-end
-
-# Handle operations like `randn(2, 2)[i, j] + randn()`.
-# TODO: Decide if this should be a general definition for `AbstractNamedTensor`,
-# or just for `AbstractNamedTensor`.
-function set_promote_shape(
-        ax1::Tuple{NamedUnitRange, Vararg{NamedUnitRange}}, ax2::Tuple{}
-    )
-    return ax1
-end
-
-function BC.check_broadcast_shape(
-        ax1::Tuple{NamedUnitRange, Vararg{NamedUnitRange}},
-        ax2::Tuple{NamedUnitRange, Vararg{NamedUnitRange}}
-    )
-    return set_check_broadcast_shape(ax1, ax2)
-end
-
-function set_check_broadcast_shape(
-        ax1::Tuple{Any, Vararg{Any, N}},
-        ax2::Tuple{Any, Vararg{Any, N}}
-    ) where {N}
-    perm = getperm(ax2, ax1)
-    ax2_aligned = map(i -> ax2[i], perm)
-    check_broadcast_shape(unnamed.(ax1), unnamed.(ax2_aligned))
-    return nothing
-end
-set_check_broadcast_shape(ax1::Tuple{}, ax2::Tuple{}) = nothing
-
-broadcasted_unnamed(x::Number, inds) = x
-broadcasted_unnamed(a::AbstractNamedTensor, inds) = unnamed(a, inds)
-function broadcasted_unnamed(bc::Broadcasted, inds)
-    return broadcasted(bc.f, Base.Fix2(broadcasted_unnamed, inds).(bc.args)...)
+broadcasted_unnamed(x::Number, names) = x
+broadcasted_unnamed(a::AbstractNamedTensor, names) = unnamed(a, names)
+function broadcasted_unnamed(bc::Broadcasted, names)
+    return broadcasted(bc.f, Base.Fix2(broadcasted_unnamed, names).(bc.args)...)
 end
 
 # A bare (unnamed) array operand, used as an allocation prototype so a broadcast
@@ -129,58 +34,74 @@ unnamed_prototype(arg::AbstractNamedTensor, args...) = unnamed(arg)
 unnamed_prototype(arg::Broadcasted, args...) = unnamed_prototype(arg.args..., args...)
 unnamed_prototype(arg, args...) = unnamed_prototype(args...)
 
-function Base.similar(bc::Broadcasted{<:AbstractNamedTensorStyle}, elt::Type, ax)
-    inds_a = name.(ax)
-    bc_unnamed = broadcasted_unnamed(bc, inds_a)
-    a_unnamed = similar(bc_unnamed, elt)
-    return nameddims(a_unnamed, inds_a)
+# Skip Base's shape-combination step: named broadcasts don't need the `NamedUnitRange` axis
+# machinery. Name compatibility is handled by the per-operand alignment in `broadcasted_unnamed`
+# (via `getperm`), and unnamed-shape compatibility by TensorAlgebra.
+BC.instantiate(bc::Broadcasted{<:AbstractNamedTensorStyle}) = bc
+
+# The destination dimension names of a broadcast are those of its first named operand.
+# Sourcing them here (rather than from `axes(bc)`) keeps the named axes off the hot path.
+_dimnames(a::AbstractNamedTensor, args...) = dimnames(a)
+_dimnames(bc::Broadcasted, args...) = _dimnames(bc.args..., args...)
+_dimnames(_, args...) = _dimnames(args...)
+dimnames(bc::Broadcasted) = _dimnames(bc.args...)
+
+# The result element type of a linear combination, from the concrete unnamed leaves at runtime.
+# `eltype(::LinearBroadcasted)` uses `Base.promote_op`, which runs a live inference call here
+# because the leaves wrap a named tensor's (non-inferrable) backing array, so promote the
+# concrete `eltype`s instead.
+_lineareltype(a::AbstractArray) = eltype(a)
+function _lineareltype(s::TA.ScaledBroadcasted)
+    return promote_type(typeof(TA.coeff(s)), _lineareltype(TA.unscaled(s)))
 end
+_lineareltype(s::TA.AddBroadcasted) = promote_type(map(_lineareltype, TA.addends(s))...)
 
-inds(bc::Broadcasted) = name.(axes(bc))
 function Base.copy(bc::Broadcasted{<:AbstractNamedTensorStyle})
-    # We could use:
-    # ```julia
-    # elt = combine_eltypes(bc.f, bc.args)
-    # copyto!(similar(bc, elt), bc)
-    # ```
-    # but `combine_eltypes` is based on type inference, which might fail.
-    # Calling broadcasted on the unnamed arrays reuses the code logic in
-    # Base.Broadcast for handling cases where type inference fails by determining
-    # the output element type at runtime with widening.
-    inds_dest = inds(bc)
-    bc_unnamed = broadcasted_unnamed(bc, inds_dest)
+    nms = dimnames(bc)
+    dest_unnamed = _copy_unnamed(broadcasted_unnamed(bc, nms), unnamed_prototype(bc))
+    return nameddims(dest_unnamed, nms)
+end
+
+# Function barrier: `broadcasted_unnamed` and `unnamed_prototype` produce concretely-typed
+# values whose *inferred* types are abstract (the named backing array is abstract), so this
+# call re-specializes on the concrete runtime types and everything below is type-stable
+# (`eltype(lb)` is now inferrable, no runtime `promote_op`). Inlining the body into `copy`
+# instead costs one extra allocation per call.
+function _copy_unnamed(bc_unnamed, prototype)
     lb = TA.tryflattenlinear(bc_unnamed)
-    if isnothing(lb)
-        # Not a linear combination: ordinary fused broadcast.
-        dest_unnamed = copy(bc_unnamed)
-    else
-        # Linear: lower to bipermutedimsopadd!. Allocate from an operand so the
-        # result keeps the backend, using the backend's result axes (not `lb`'s).
-        dest_axes = unnamed.(Tuple(axes(bc)))
-        dest_unnamed = similar(unnamed_prototype(bc), eltype(lb), dest_axes)
-        copyto!(dest_unnamed, lb)
-    end
-    return nameddims(dest_unnamed, inds_dest)
+    isnothing(lb) && return copy(bc_unnamed)
+    return copyto!(similar(prototype, eltype(lb)), lb)
+end
+
+# `Base.Broadcast.materialize!` otherwise reconstructs the broadcast over `axes(dest)` and
+# re-runs `instantiate`, forcing the `NamedUnitRange` axis machinery this style's `instantiate`
+# no-op exists to skip (`combine_axes`/`set_promote_shape`). Route straight to `copyto!`, which
+# aligns by dimname instead.
+function BC.materialize!(
+        dest::AbstractNamedTensor,
+        bc::Broadcasted{<:AbstractNamedTensorStyle}
+    )
+    copyto!(dest, bc)
+    return dest
 end
 
 function Base.copyto!(
         dest::AbstractNamedTensor,
         bc::Broadcasted{<:AbstractNamedTensorStyle}
     )
-    dest_unnamed = unnamed(dest)
-    inds_dest = axes(dest)
-    bc_unnamed = broadcasted_unnamed(bc, inds_dest)
-    lb = TA.tryflattenlinear(bc_unnamed)
-    if isnothing(lb)
-        # Not a linear combination: ordinary fused broadcast.
-        copyto!(dest_unnamed, bc_unnamed)
-    else
-        # Linear: lower to bipermutedimsopadd! into the existing dest.
-        copyto!(dest_unnamed, lb)
-    end
+    _copyto_unnamed!(unnamed(dest), broadcasted_unnamed(bc, dimnames(dest)))
     return dest
 end
 
+# Function barrier mirroring `_copy_unnamed`: `unnamed(dest)` and `broadcasted_unnamed`
+# have abstract inferred types (the named backing array is abstract), so this call
+# re-specializes on the concrete runtime types and the flatten/lower below is type-stable.
+function _copyto_unnamed!(dest_unnamed, bc_unnamed)
+    lb = TA.tryflattenlinear(bc_unnamed)
+    isnothing(lb) && return copyto!(dest_unnamed, bc_unnamed)
+    return copyto!(dest_unnamed, lb)
+end
+
 # Operator-preserving broadcasting.
 #
 # An `NamedTensorOperator` broadcasts as itself (it does not peel to its `state`), so
@@ -190,7 +111,7 @@ end
 #   - operator ⊗ scalar → operator (`2 .* op` stays an operator),
 #   - operator ⊗ non-operator tensor → error.
 # The `BroadcastStyle(::Type{<:NamedTensorOperator})` mapping and the operator-specific
-# `copy` / `similar` (which unwrap, delegate to `NamedTensorStyle`, then rewrap) live in
+# `copy` (which unwraps, delegates to `NamedTensorStyle`, then rewraps) live in
 # `itensoroperator.jl`, where `NamedTensorOperator` is defined. `*` (contraction) is
 # unchanged and still decays to `state`.
 
diff --git a/src/namedtensoroperator.jl b/src/namedtensoroperator.jl
index b757140..c5f8ebe 100644
--- a/src/namedtensoroperator.jl
+++ b/src/namedtensoroperator.jl
@@ -342,12 +342,6 @@ function Base.copy(bc::Broadcasted{<:NamedTensorOperatorStyle})
     return operator(result, cod, dom)
 end
 
-function Base.similar(bc::Broadcasted{<:NamedTensorOperatorStyle}, elt::Type, ax)
-    cod, dom = broadcast_operator_codomain_domain(bc)
-    result = similar(statebroadcasted(bc), elt, ax)
-    return operator(result, cod, dom)
-end
-
 for f in MATRIX_FUNCTIONS
     @eval begin
         function Base.$f(a::NamedTensorOperator)

From 7b397aaafd47070283e55bd548328982824568fc Mon Sep 17 00:00:00 2001
From: Matthew Fishman <mfishman@flatironinstitute.org>
Date: Wed, 1 Jul 2026 14:13:56 -0400
Subject: [PATCH 2/3] Move the broadcast alignment fast path into
 broadcasted_unnamed

Keep `unnamed(a, names)` a general permute-to-names method and handle the already-aligned shortcut in `broadcasted_unnamed`, its only caller. The shortcut is load-bearing: without it every aligned operand (including the always-aligned first one) is wrapped in an identity `PermutedDimsArray`, which breaks the clean lowering and makes a small add several times slower.
---
 src/abstractnamedtensor.jl | 5 +----
 src/broadcast.jl           | 8 +++++++-
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/abstractnamedtensor.jl b/src/abstractnamedtensor.jl
index d0e00dc..b819305 100644
--- a/src/abstractnamedtensor.jl
+++ b/src/abstractnamedtensor.jl
@@ -85,10 +85,7 @@ dimnametype(type::Type{<:AbstractNamedTensor}) = Any
 # TODO: Use `IsNamed` trait?
 unnamed(a::AbstractNamedTensor) = throw(MethodError(unnamed, a))
 function unnamed(a::AbstractNamedTensor, names)
-    array = unnamed(a)
-    # Fast path: already aligned to `names`, so no permutation and no `getperm` allocation.
-    dimnames(a) == names && return array
-    return permuteddims(array, Tuple(getperm(dimnames(a), names)))
+    return permuteddims(unnamed(a), Tuple(getperm(dimnames(a), names)))
 end
 unname(a::AbstractNamedTensor, inds) = unnamed(aligndims(a, inds))
 
diff --git a/src/broadcast.jl b/src/broadcast.jl
index be5114a..c5c4527 100644
--- a/src/broadcast.jl
+++ b/src/broadcast.jl
@@ -21,7 +21,13 @@ end
 BC.broadcastable(a::AbstractNamedTensor) = a
 
 broadcasted_unnamed(x::Number, names) = x
-broadcasted_unnamed(a::AbstractNamedTensor, names) = unnamed(a, names)
+function broadcasted_unnamed(a::AbstractNamedTensor, names)
+    # An operand already aligned to the destination names (the first operand always, and the
+    # common case for the rest) needs no permutation, avoiding a `getperm` allocation and the
+    # identity `permuteddims` wrapper. Skipping it makes a small add several times slower.
+    dimnames(a) == names && return unnamed(a)
+    return unnamed(a, names)
+end
 function broadcasted_unnamed(bc::Broadcasted, names)
     return broadcasted(bc.f, Base.Fix2(broadcasted_unnamed, names).(bc.args)...)
 end

From 2061c941e39c4012c4aa8c30479398842568bb9a Mon Sep 17 00:00:00 2001
From: Matthew Fishman <mfishman@flatironinstitute.org>
Date: Wed, 1 Jul 2026 14:36:08 -0400
Subject: [PATCH 3/3] Build the alignment permutation with a static-length
 ntuple

Behind a function barrier that recovers the concrete backing array, `ndims` is a compile-time constant, so the permutation can be built as an `ntuple(..., Val(ndims))` (an `NTuple{N,Int}`) rather than `Tuple(getperm(...))` (a `Tuple{Vararg{Int}}` whose length is not inferrable). That lets `permuteddims` build a concretely-typed wrapper, roughly halving the cost of a permuted add (a 2x2 permuted add goes from about 1130 ns to about 900 ns). Aligned adds are unaffected, taking the alignment fast path in `broadcasted_unnamed`.
---
 src/abstractnamedtensor.jl | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/abstractnamedtensor.jl b/src/abstractnamedtensor.jl
index b819305..1a4a69f 100644
--- a/src/abstractnamedtensor.jl
+++ b/src/abstractnamedtensor.jl
@@ -85,7 +85,14 @@ dimnametype(type::Type{<:AbstractNamedTensor}) = Any
 # TODO: Use `IsNamed` trait?
 unnamed(a::AbstractNamedTensor) = throw(MethodError(unnamed, a))
 function unnamed(a::AbstractNamedTensor, names)
-    return permuteddims(unnamed(a), Tuple(getperm(dimnames(a), names)))
+    return _permuteddims_to(unnamed(a), getperm(dimnames(a), names))
+end
+# Function barrier: `unnamed(a)` is abstractly typed, so dispatching on the concrete array here
+# makes `ndims` a compile-time constant. Building the permutation as an `ntuple(…, Val(ndims))`
+# (an `NTuple{N,Int}`) rather than `Tuple(perm)` (a length-non-inferrable `Tuple{Vararg{Int}}`)
+# lets `permuteddims` build a concretely-typed wrapper, roughly halving the permute cost.
+@noinline function _permuteddims_to(array::AbstractArray, perm)
+    return permuteddims(array, ntuple(i -> perm[i], Val(ndims(array))))
 end
 unname(a::AbstractNamedTensor, inds) = unnamed(aligndims(a, inds))