From ae0d442d2c822c2054feba3eca44cf45ff0123e8 Mon Sep 17 00:00:00 2001
From: Matthew Fishman <mfishman@flatironinstitute.org>
Date: Wed, 1 Jul 2026 18:59:28 -0400
Subject: [PATCH 1/2] Build permuted named-tensor add on the field-based
 PermutedDims

## Summary

Aligns a misaligned operand in named-tensor addition with `TensorAlgebra.PermutedDims` instead of `Base.PermutedDimsArray`. `PermutedDimsArray` encodes the permutation in a type parameter, so a runtime permutation forces a type-unstable, allocating construction on every permuted add. `PermutedDims` instead stores the permutation in a field, so it builds cheaply and type-stably, and it is a broadcast leaf that the linear-combination fold absorbs. A permuted add therefore no longer pays that type-construction cost, which closes most of the gap to an aligned add.

Builds on the broadcast-friendly `PermutedDims` from https://github.com/ITensor/TensorAlgebra.jl/pull/198.
---
 Project.toml               |  8 ++++++--
 src/abstractnamedtensor.jl | 14 +++++++++-----
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/Project.toml b/Project.toml
index 397af2d..cfe0a44 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "ITensorBase"
 uuid = "4795dd04-0d67-49bb-8f44-b89c448a1dc7"
-version = "0.10.2"
+version = "0.10.3"
 authors = ["ITensor developers <support@itensor.org> and contributors"]
 
 [workspace]
@@ -30,6 +30,10 @@ Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6"
 TensorOperations = "6aa20fa7-93e2-5fca-9bc0-fbd0db3c71a2"
 
+[sources.TensorAlgebra]
+rev = "mf/permuteddims-broadcast"
+url = "https://github.com/ITensor/TensorAlgebra.jl"
+
 [extensions]
 ITensorBaseAdaptExt = "Adapt"
 ITensorBaseMooncakeExt = "Mooncake"
@@ -49,7 +53,7 @@ Mooncake = "0.4.202, 0.5"
 OrderedCollections = "1.6"
 Random = "1.10"
 SimpleTraits = "0.9.4"
-TensorAlgebra = "0.15"
+TensorAlgebra = "0.15.1"
 TensorOperations = "5.3.1"
 TermInterface = "2"
 TupleTools = "1.6"
diff --git a/src/abstractnamedtensor.jl b/src/abstractnamedtensor.jl
index 1a4a69f..f500a71 100644
--- a/src/abstractnamedtensor.jl
+++ b/src/abstractnamedtensor.jl
@@ -87,12 +87,16 @@ unnamed(a::AbstractNamedTensor) = throw(MethodError(unnamed, a))
 function unnamed(a::AbstractNamedTensor, names)
     return _permuteddims_to(unnamed(a), getperm(dimnames(a), names))
 end
-# Function barrier: `unnamed(a)` is abstractly typed, so dispatching on the concrete array here
-# makes `ndims` a compile-time constant. Building the permutation as an `ntuple(…, Val(ndims))`
-# (an `NTuple{N,Int}`) rather than `Tuple(perm)` (a length-non-inferrable `Tuple{Vararg{Int}}`)
-# lets `permuteddims` build a concretely-typed wrapper, roughly halving the permute cost.
+# Align a misaligned operand for the elementwise broadcast by wrapping it in a lazy permuted
+# view. `TensorAlgebra.PermutedDims` stores the permutation in a field (unlike
+# `Base.PermutedDimsArray`, which encodes it in a type parameter, so a runtime permutation forces
+# runtime type construction), so it builds cheaply and type-stably; it is a broadcast leaf the
+# linear-combination path absorbs via `bipermutedimsopadd!`. Function barrier: `unnamed(a)` is
+# abstractly typed, so dispatching on the concrete array here makes `ndims` a compile-time
+# constant, and the `ntuple(…, Val(ndims))` builds an inferrable `NTuple{N,Int}` permutation
+# rather than a length-non-inferrable `Tuple(::Vector)`.
 @noinline function _permuteddims_to(array::AbstractArray, perm)
-    return permuteddims(array, ntuple(i -> perm[i], Val(ndims(array))))
+    return TensorAlgebra.PermutedDims(array, ntuple(i -> perm[i], Val(ndims(array))))
 end
 unname(a::AbstractNamedTensor, inds) = unnamed(aligndims(a, inds))
 

From 4ed4c8bbebcd362ff20f6a298418baef96ddc7fd Mon Sep 17 00:00:00 2001
From: Matthew Fishman <mfishman@flatironinstitute.org>
Date: Wed, 1 Jul 2026 19:43:50 -0400
Subject: [PATCH 2/2] Confine the field-based PermutedDims to the broadcast
 alignment path

The public `unnamed(a, names)` goes back to building its result with `permuteddims` (reverting the `_permuteddims_to` change from the previous commit), so it keeps returning a `Base.PermutedDimsArray` (a full array) and callers outside broadcasting are unaffected. Only `broadcasted_unnamed` aligns a misaligned operand with `TensorAlgebra.PermutedDims`, whose minimal array interface stays confined to the broadcast hot path and is never handed to users. Also removes the `[sources]` pin now that TensorAlgebra 0.15.1 is registered, and switches a GPU-unsafe `dot` test to eager `unname`.
---
 Project.toml               |  4 ----
 src/abstractnamedtensor.jl | 14 +++++---------
 src/broadcast.jl           | 16 ++++++++++++++--
 test/test_linearalgebra.jl |  4 ++--
 4 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/Project.toml b/Project.toml
index cfe0a44..dac0c96 100644
--- a/Project.toml
+++ b/Project.toml
@@ -30,10 +30,6 @@ Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6"
 TensorOperations = "6aa20fa7-93e2-5fca-9bc0-fbd0db3c71a2"
 
-[sources.TensorAlgebra]
-rev = "mf/permuteddims-broadcast"
-url = "https://github.com/ITensor/TensorAlgebra.jl"
-
 [extensions]
 ITensorBaseAdaptExt = "Adapt"
 ITensorBaseMooncakeExt = "Mooncake"
diff --git a/src/abstractnamedtensor.jl b/src/abstractnamedtensor.jl
index f500a71..1a4a69f 100644
--- a/src/abstractnamedtensor.jl
+++ b/src/abstractnamedtensor.jl
@@ -87,16 +87,12 @@ unnamed(a::AbstractNamedTensor) = throw(MethodError(unnamed, a))
 function unnamed(a::AbstractNamedTensor, names)
     return _permuteddims_to(unnamed(a), getperm(dimnames(a), names))
 end
-# Align a misaligned operand for the elementwise broadcast by wrapping it in a lazy permuted
-# view. `TensorAlgebra.PermutedDims` stores the permutation in a field (unlike
-# `Base.PermutedDimsArray`, which encodes it in a type parameter, so a runtime permutation forces
-# runtime type construction), so it builds cheaply and type-stably; it is a broadcast leaf the
-# linear-combination path absorbs via `bipermutedimsopadd!`. Function barrier: `unnamed(a)` is
-# abstractly typed, so dispatching on the concrete array here makes `ndims` a compile-time
-# constant, and the `ntuple(…, Val(ndims))` builds an inferrable `NTuple{N,Int}` permutation
-# rather than a length-non-inferrable `Tuple(::Vector)`.
+# Function barrier: `unnamed(a)` is abstractly typed, so dispatching on the concrete array here
+# makes `ndims` a compile-time constant. Building the permutation as an `ntuple(…, Val(ndims))`
+# (an `NTuple{N,Int}`) rather than `Tuple(perm)` (a length-non-inferrable `Tuple{Vararg{Int}}`)
+# lets `permuteddims` build a concretely-typed wrapper, roughly halving the permute cost.
 @noinline function _permuteddims_to(array::AbstractArray, perm)
-    return TensorAlgebra.PermutedDims(array, ntuple(i -> perm[i], Val(ndims(array))))
+    return permuteddims(array, ntuple(i -> perm[i], Val(ndims(array))))
 end
 unname(a::AbstractNamedTensor, inds) = unnamed(aligndims(a, inds))
 
diff --git a/src/broadcast.jl b/src/broadcast.jl
index c5c4527..14b8b0e 100644
--- a/src/broadcast.jl
+++ b/src/broadcast.jl
@@ -1,4 +1,5 @@
-using ..ITensorBase: AbstractNamedTensor, ITensorBase, dimnames, named, nameddims, unnamed
+using ..ITensorBase:
+    AbstractNamedTensor, ITensorBase, dimnames, getperm, named, nameddims, unnamed
 using Base.Broadcast: Broadcast as BC, Broadcasted, broadcasted
 using TensorAlgebra: TensorAlgebra as TA
 
@@ -26,7 +27,18 @@ function broadcasted_unnamed(a::AbstractNamedTensor, names)
     # common case for the rest) needs no permutation, avoiding a `getperm` allocation and the
     # identity `permuteddims` wrapper. Skipping it makes a small add several times slower.
     dimnames(a) == names && return unnamed(a)
-    return unnamed(a, names)
+    return _broadcast_permuteddims_to(unnamed(a), getperm(dimnames(a), names))
+end
+# Broadcasting-only alignment: unlike the public `unnamed(a, names)` (which returns a
+# `Base.PermutedDimsArray`, a full array), this wraps in `TensorAlgebra.PermutedDims`, which stores
+# the permutation in a field rather than a type parameter, so it builds cheaply and type-stably
+# from the runtime permutation and is a broadcast leaf the linear-combination fold absorbs via
+# `bipermutedimsopadd!`. `PermutedDims` has almost no array interface, so it stays confined to this
+# hot path and is never handed back to users. Function barrier: `unnamed(a)` is abstractly typed,
+# so dispatching on the concrete array makes `ndims` a compile-time constant for the inferrable
+# `ntuple(…, Val(ndims))` permutation.
+@noinline function _broadcast_permuteddims_to(array::AbstractArray, perm)
+    return TA.PermutedDims(array, ntuple(i -> perm[i], Val(ndims(array))))
 end
 function broadcasted_unnamed(bc::Broadcasted, names)
     return broadcasted(bc.f, Base.Fix2(broadcasted_unnamed, names).(bc.args)...)
diff --git a/test/test_linearalgebra.jl b/test/test_linearalgebra.jl
index c292760..a360345 100644
--- a/test/test_linearalgebra.jl
+++ b/test/test_linearalgebra.jl
@@ -1,5 +1,5 @@
 import LinearAlgebra as LA
-using ITensorBase: dimnames, named, unnamed
+using ITensorBase: dimnames, named, unname, unnamed
 using Test: @test, @testset
 
 @testset "LinearAlgebra (eltype=$(elt))" for elt in
@@ -14,5 +14,5 @@ using Test: @test, @testset
     @test unnamed(LA.lmul!(2, copy(a))) ≈ 2 * unnamed(a)
     @test unnamed(LA.rdiv!(copy(a), 2)) ≈ unnamed(a) / 2
     @test unnamed(LA.ldiv!(2, copy(a))) ≈ 2 \ unnamed(a)
-    @test LA.dot(a, b) ≈ LA.dot(unnamed(a), unnamed(b, dimnames(a)))
+    @test LA.dot(a, b) ≈ LA.dot(unnamed(a), unname(b, dimnames(a)))
 end