From 479adaf36b815f9ebecd43f19b0f288e39b5f5ac Mon Sep 17 00:00:00 2001 From: lkdvos Date: Tue, 21 Apr 2026 15:30:20 -0400 Subject: [PATCH 01/23] Simplify index manipulation API with unified in-place interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Merge the two-tier `permute!`/`add_permute!` pattern into a single tier: `permute!`, `braid!`, `transpose!`, and `repartition!` now directly accept optional `α`, `β`, `backend`, and `allocator` arguments with sensible defaults (One(), Zero(), DefaultBackend(), DefaultAllocator()), matching TensorOperations convention. The old `add_permute!`, `add_braid!`, and `add_transpose!` are deprecated wrappers that emit `Base.depwarn` and forward to the new functions. The `allocator` kwarg is fully threaded through the internal call chain (`add_transform!`, all kernel functions, and `allocate_buffers`). Mooncake AD rules are updated to use the new function names. Co-Authored-By: Claude Sonnet 4.6 --- .../indexmanipulations.jl | 36 +-- ext/TensorKitMooncakeExt/planaroperations.jl | 2 +- src/planar/planaroperations.jl | 6 +- src/tensors/braidingtensor.jl | 4 +- src/tensors/indexmanipulations.jl | 294 ++++++++++-------- src/tensors/tensoroperations.jl | 4 +- src/tensors/treetransformers.jl | 6 +- test/mooncake/indexmanipulations.jl | 26 +- 8 files changed, 210 insertions(+), 168 deletions(-) diff --git a/ext/TensorKitMooncakeExt/indexmanipulations.jl b/ext/TensorKitMooncakeExt/indexmanipulations.jl index c3bc3e26c..1fccdd9e6 100644 --- a/ext/TensorKitMooncakeExt/indexmanipulations.jl +++ b/ext/TensorKitMooncakeExt/indexmanipulations.jl @@ -1,11 +1,11 @@ for transform in (:permute, :transpose) - add_transform! = Symbol(:add_, transform, :!) - add_transform_pullback = Symbol(add_transform!, :_pullback) + transform! = Symbol(transform, :!) 
+ transform_pullback = Symbol(transform!, :_pullback) @eval @is_primitive( DefaultCtx, ReverseMode, Tuple{ - typeof(TK.$add_transform!), + typeof(TK.$transform!), AbstractTensorMap, AbstractTensorMap, Index2Tuple, Number, Number, Vararg{Any}, @@ -13,7 +13,7 @@ for transform in (:permute, :transpose) ) @eval function Mooncake.rrule!!( - ::CoDual{typeof(TK.$add_transform!)}, + ::CoDual{typeof(TK.$transform!)}, C_ΔC::CoDual{<:AbstractTensorMap}, A_ΔA::CoDual{<:AbstractTensorMap}, p_Δp::CoDual{<:Index2Tuple}, α_Δα::CoDual{<:Number}, β_Δβ::CoDual{<:Number}, @@ -30,17 +30,17 @@ for transform in (:permute, :transpose) # if we need to compute Δa, it is faster to allocate an intermediate permuted A # and store that instead of repeating the permutation in the pullback each time. - # effectively, we replace `add_permute` by `add ∘ permute`. + # effectively, we replace `permute!/transpose!` by `add ∘ permute/transpose`. Ap = if _needs_tangent(α) Ap = $transform(A, p) add!(C, Ap, α, β) Ap else - TK.$add_transform!(C, A, p, α, β, ba...) + TK.$transform!(C, A, p, α, β, ba...) nothing end - function $add_transform_pullback(::NoRData) + function $transform_pullback(::NoRData) copy!(C, C_cache) # ΔA @@ -50,10 +50,10 @@ for transform in (:permute, :transpose) TC = VectorInterface.promote_scale(ΔC, α) if scalartype(ΔA) <: Real && !(TC <: Real) ΔAc = TO.tensoralloc_add(TC, ΔC, pΔA, false, Val(false)) - TK.$add_transform!(ΔAc, ΔC, pΔA, conj(α), Zero(), ba...) + TK.$transform!(ΔAc, ΔC, pΔA, conj(α), Zero(), ba...) add!(ΔA, real(ΔAc)) else - TK.$add_transform!(ΔA, ΔC, pΔA, conj(α), One(), ba...) + TK.$transform!(ΔA, ΔC, pΔA, conj(α), One(), ba...) end ΔAr = NoRData() @@ -64,7 +64,7 @@ for transform in (:permute, :transpose) return NoRData(), ΔCr, ΔAr, NoRData(), Δαr, Δβr, map(Returns(NoRData()), ba)... 
end - return C_ΔC, $add_transform_pullback + return C_ΔC, $transform_pullback end end @@ -72,7 +72,7 @@ end DefaultCtx, ReverseMode, Tuple{ - typeof(TK.add_braid!), + typeof(TK.braid!), AbstractTensorMap, AbstractTensorMap, Index2Tuple, IndexTuple, Number, Number, Vararg{Any}, @@ -80,7 +80,7 @@ end ) function Mooncake.rrule!!( - ::CoDual{typeof(TK.add_braid!)}, + ::CoDual{typeof(TK.braid!)}, C_ΔC::CoDual{<:AbstractTensorMap}, A_ΔA::CoDual{<:AbstractTensorMap}, p_Δp::CoDual{<:Index2Tuple}, levels_Δlevels::CoDual{<:IndexTuple}, α_Δα::CoDual{<:Number}, β_Δβ::CoDual{<:Number}, @@ -98,17 +98,17 @@ function Mooncake.rrule!!( # if we need to compute Δa, it is faster to allocate an intermediate braided A # and store that instead of repeating the permutation in the pullback each time. - # effectively, we replace `add_permute` by `add ∘ permute`. + # effectively, we replace `braid!` by `add ∘ braid`. Ap = if _needs_tangent(α) Ap = braid(A, p, levels) add!(C, Ap, α, β) Ap else - TK.add_braid!(C, A, p, levels, α, β, ba...) + TK.braid!(C, A, p, levels, α, β, ba...) nothing end - function add_braid!_pullback(::NoRData) + function braid!_pullback(::NoRData) copy!(C, C_cache) # ΔA @@ -118,10 +118,10 @@ function Mooncake.rrule!!( TC = VectorInterface.promote_scale(ΔC, α) if scalartype(ΔA) <: Real && !(TC <: Real) ΔAc = TO.tensoralloc_add(TC, ΔC, pΔA, false, Val(false)) - TK.add_braid!(ΔAc, ΔC, pΔA, ilevels, conj(α), Zero(), ba...) + TK.braid!(ΔAc, ΔC, pΔA, ilevels, conj(α), Zero(), ba...) add!(ΔA, real(ΔAc)) else - TK.add_braid!(ΔA, ΔC, pΔA, ilevels, conj(α), One(), ba...) + TK.braid!(ΔA, ΔC, pΔA, ilevels, conj(α), One(), ba...) end ΔAr = NoRData() @@ -132,7 +132,7 @@ function Mooncake.rrule!!( return NoRData(), ΔCr, ΔAr, NoRData(), NoRData(), Δαr, Δβr, map(Returns(NoRData()), ba)... 
end - return C_ΔC, add_braid!_pullback + return C_ΔC, braid!_pullback end # both are needed for correctly capturing every dispatch diff --git a/ext/TensorKitMooncakeExt/planaroperations.jl b/ext/TensorKitMooncakeExt/planaroperations.jl index 3c75fe2da..abbef5004 100644 --- a/ext/TensorKitMooncakeExt/planaroperations.jl +++ b/ext/TensorKitMooncakeExt/planaroperations.jl @@ -60,7 +60,7 @@ # if length(q[1]) == 0 # ip = invperm(linearize(p)) # pΔA = _repartition(ip, A) -# TK.add_transpose!(ΔA, ΔC, pΔA, conj(α), One(), backend, allocator) +# TK.transpose!(ΔA, ΔC, pΔA, conj(α), One(), backend, allocator) # return NoRData() # end # # if length(q[1]) == 1 diff --git a/src/planar/planaroperations.jl b/src/planar/planaroperations.jl index cde772982..758bb708a 100644 --- a/src/planar/planaroperations.jl +++ b/src/planar/planaroperations.jl @@ -32,7 +32,7 @@ function planaradd!( α::Number, β::Number, backend, allocator ) - return add_transpose!(C, A, p, α, β, backend) + return transpose!(C, A, p, α, β, backend) end # insert default backend @@ -173,7 +173,7 @@ function planarcontract!( A′ = TO.tensoralloc_add( scalartype(A), A, (oindA, cindA), false, Val(true), allocator ) - add_transpose!(A′, A, (oindA, cindA), One(), Zero(), backend) + transpose!(A′, A, (oindA, cindA), One(), Zero(), backend) end if cindB == codB && oindB == domB @@ -182,7 +182,7 @@ function planarcontract!( B′ = TensorOperations.tensoralloc_add( scalartype(B), B, (cindB, oindB), false, Val(true), allocator ) - add_transpose!(B′, B, (cindB, oindB), One(), Zero(), backend) + transpose!(B′, B, (cindB, oindB), One(), Zero(), backend) end mul!(C, A′, B′, α, β) (oindA == codA && cindA == domA) || TO.tensorfree!(A′, allocator) diff --git a/src/tensors/braidingtensor.jl b/src/tensors/braidingtensor.jl index f08dd8181..a02d3b884 100644 --- a/src/tensors/braidingtensor.jl +++ b/src/tensors/braidingtensor.jl @@ -256,7 +256,7 @@ function planarcontract!( end if BraidingStyle(sectortype(B)) isa Bosonic - return 
add_permute!(C, B, (reverse(cindB), oindB), α, β, backend) + return permute!(C, B, (reverse(cindB), oindB), α, β, backend) end τ_levels = A.adjoint ? (1, 2, 2, 1) : (2, 1, 1, 2) @@ -313,7 +313,7 @@ function planarcontract!( p = (oindA, reverse(cindA)) N = length(oindA) levels = (ntuple(identity, N)..., (B.adjoint ? (N + 1, N + 2) : (N + 2, N + 1))...) - return add_braid!(C, A, p, levels, α, β, backend) + return braid!(C, A, p, levels, α, β, backend) end # ambiguity fix: diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index 3108abb17..ce998f396 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -47,22 +47,29 @@ function flip(t::AbstractTensorMap, I; inv::Bool = false) end """ - permute!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple) + permute!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple, + α::Number=One(), β::Number=Zero(), backend::AbstractBackend...; + allocator=TO.DefaultAllocator()) -> tdst -Write into `tdst` the result of permuting the indices of `tsrc`. +Write into `tdst` the result of adding `α * tsrc` to `β * tdst` after permuting the indices of `tsrc`. The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively. - -See [`permute`](@ref) for creating a new tensor and [`add_permute!`](@ref) for a more general version. + +See [`permute`](@ref) for creating a new tensor. 
""" @propagate_inbounds function Base.permute!( - tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple + tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, + α::Number=One(), β::Number=Zero(), backend::AbstractBackend...; + allocator=TO.DefaultAllocator() ) - return add_permute!(tdst, tsrc, p, One(), Zero()) + @boundscheck spacecheck_transform(permute, tdst, tsrc, p) + transformer = treepermuter(tdst, tsrc, p) + return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend...; allocator) end """ - permute(tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple; copy::Bool = false) -> tdst::TensorMap + permute(tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple; copy::Bool = false, + allocator=TO.DefaultAllocator()) -> tdst::TensorMap Return tensor `tdst` obtained by permuting the indices of `tsrc`. The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively. @@ -70,9 +77,11 @@ The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` If `copy = false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made. 
-To permute into an existing destination, see [permute!](@ref) and [`add_permute!`](@ref) +To permute into an existing destination, see [permute!](@ref) """ -function permute(t::AbstractTensorMap, p::Index2Tuple; copy::Bool = false) +function permute( + t::AbstractTensorMap, p::Index2Tuple; copy::Bool = false, allocator=TO.DefaultAllocator() + ) # share data if possible if !copy if p == (codomainind(t), domainind(t)) @@ -84,14 +93,15 @@ function permute(t::AbstractTensorMap, p::Index2Tuple; copy::Bool = false) # general case tdst = similar(t, promote_permute(t), permute(space(t), p)) - return @inbounds permute!(tdst, t, p) + return @inbounds permute!(tdst, t, p; allocator) end -function permute(t::AdjointTensorMap, (p₁, p₂)::Index2Tuple; copy::Bool = false) +function permute(t::AdjointTensorMap, (p₁, p₂)::Index2Tuple; copy::Bool = false, allocator=TO.DefaultAllocator()) p₁′ = adjointtensorindices(t, p₂) p₂′ = adjointtensorindices(t, p₁) - return adjoint(permute(adjoint(t), (p₁′, p₂′); copy)) + return adjoint(permute(adjoint(t), (p₁′, p₂′); copy, allocator)) end -permute(t::AbstractTensorMap, p::IndexTuple; copy::Bool = false) = permute(t, (p, ()); copy) +permute(t::AbstractTensorMap, p::IndexTuple; copy::Bool = false, allocator=TO.DefaultAllocator()) = + permute(t, (p, ()); copy, allocator) function has_shared_permute(t::AbstractTensorMap, (p₁, p₂)::Index2Tuple) return (p₁ === codomainind(t) && p₂ === domainind(t)) @@ -118,25 +128,33 @@ end # Braid """ braid!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, - (p₁, p₂)::Index2Tuple, levels::Tuple) + (p₁, p₂)::Index2Tuple, levels::IndexTuple, + α::Number=One(), β::Number=Zero(), backend::AbstractBackend...; + allocator=TO.DefaultAllocator()) -> tdst -Write into `tdst` the result of braiding the indices of `tsrc`. +Write into `tdst` the result of adding `α * tsrc` to `β * tdst` after braiding the indices of `tsrc`. The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively. 
Here, `levels` is a tuple of length `numind(tsrc)` that assigns a level or height to the indices of `tsrc`, which determines whether they will braid over or under any other index with which they have to change places. -See [`braid`](@ref) for creating a new tensor and [`add_braid!`](@ref) for a more general version. +See [`braid`](@ref) for creating a new tensor. """ @propagate_inbounds function braid!( - tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple + tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple, + α::Number=One(), β::Number=Zero(), backend::AbstractBackend...; + allocator=TO.DefaultAllocator() ) - return add_braid!(tdst, tsrc, p, levels, One(), Zero()) + @boundscheck spacecheck_transform(braid, tdst, tsrc, p, levels) + levels1 = TupleTools.getindices(levels, codomainind(tsrc)) + levels2 = TupleTools.getindices(levels, domainind(tsrc)) + transformer = treebraider(tdst, tsrc, p, (levels1, levels2)) + return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend...; allocator) end """ braid(tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple, levels::IndexTuple; - copy::Bool = false) + copy::Bool = false, allocator=TO.DefaultAllocator()) -> tdst::TensorMap Return tensor `tdst` obtained by braiding the indices of `tsrc`. @@ -146,19 +164,20 @@ which determines whether they will braid over or under any other index with whic If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made. 
-To braid into an existing destination, see [braid!](@ref) and [`add_braid!`](@ref) +To braid into an existing destination, see [braid!](@ref) """ function braid( - t::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple; copy::Bool = false + t::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple; + copy::Bool = false, allocator=TO.DefaultAllocator() ) length(levels) == numind(t) || throw(ArgumentError("invalid levels")) - BraidingStyle(sectortype(t)) isa SymmetricBraiding && return permute(t, p; copy) + BraidingStyle(sectortype(t)) isa SymmetricBraiding && return permute(t, p; copy, allocator) (!copy && p == (codomainind(t), domainind(t))) && return t # general case tdst = similar(t, promote_braid(t), permute(space(t), p)) - return @inbounds braid!(tdst, t, p, levels) + return @inbounds braid!(tdst, t, p, levels; allocator) end # TODO: braid for `AdjointTensorMap`; think about how to map the `levels` argument. @@ -167,25 +186,32 @@ _transpose_indices(t::AbstractTensorMap) = (reverse(domainind(t)), reverse(codom """ transpose!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, - (p₁, p₂)::Index2Tuple) + (p₁, p₂)::Index2Tuple, + α::Number=One(), β::Number=Zero(), backend::AbstractBackend...; + allocator=TO.DefaultAllocator()) -> tdst -Write into `tdst` the result of transposing the indices of `tsrc`. +Write into `tdst` the result of adding `α * tsrc` to `β * tdst` after transposing the indices of `tsrc`. The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively. The new index positions should be attainable without any indices crossing each other, i.e., the permutation `(p₁..., reverse(p₂)...)` should constitute a cyclic permutation of `(codomainind(tsrc)..., reverse(domainind(tsrc))...)`. -See [`transpose`](@ref) for creating a new tensor and [`add_transpose!`](@ref) for a more general version. +See [`transpose`](@ref) for creating a new tensor. 
""" @propagate_inbounds function LinearAlgebra.transpose!( - tdst::AbstractTensorMap, tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple = _transpose_indices(tsrc) + tdst::AbstractTensorMap, tsrc::AbstractTensorMap, + p::Index2Tuple = _transpose_indices(tsrc), + α::Number=One(), β::Number=Zero(), backend::AbstractBackend...; + allocator=TO.DefaultAllocator() ) - return add_transpose!(tdst, tsrc, (p₁, p₂), One(), Zero()) + @boundscheck spacecheck_transform(transpose, tdst, tsrc, p) + transformer = treetransposer(tdst, tsrc, p) + return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend...; allocator) end """ transpose(tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple; - copy::Bool=false) + copy::Bool=false, allocator=TO.DefaultAllocator()) -> tdst::TensorMap Return tensor `tdst` obtained by transposing the indices of `tsrc`. @@ -195,50 +221,59 @@ the permutation `(p₁..., reverse(p₂)...)` should constitute a cyclic permuta If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made. 
-To permute into an existing destination, see [permute!](@ref) and [`add_permute!`](@ref) +To transpose into an existing destination, see [transpose!](@ref) """ function LinearAlgebra.transpose( t::AbstractTensorMap, p::Index2Tuple = _transpose_indices(t); - copy::Bool = false + copy::Bool = false, allocator=TO.DefaultAllocator() ) - sectortype(t) === Trivial && return permute(t, p; copy) + sectortype(t) === Trivial && return permute(t, p; copy, allocator) (!copy && p == (codomainind(t), domainind(t))) && return t # general case tdst = similar(t, promote_transpose(t), permute(space(t), p)) - return @inbounds transpose!(tdst, t, p) + return @inbounds transpose!(tdst, t, p; allocator) end function LinearAlgebra.transpose( t::AdjointTensorMap, (p₁, p₂)::Index2Tuple = _transpose_indices(t); - copy::Bool = false + copy::Bool = false, allocator=TO.DefaultAllocator() ) p₁′ = map(n -> adjointtensorindex(t, n), p₂) p₂′ = map(n -> adjointtensorindex(t, n), p₁) - return adjoint(transpose(adjoint(t), (p₁′, p₂′); copy = copy)) + return adjoint(transpose(adjoint(t), (p₁′, p₂′); copy, allocator)) end """ - repartition!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap) -> tdst + repartition!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, + α::Number=One(), β::Number=Zero(), backend::AbstractBackend...; + allocator=TO.DefaultAllocator()) + -> tdst -Write into `tdst` the result of repartitioning the indices of `tsrc`. This is just a special -case of a transposition that only changes the number of in- and outgoing indices. +Write into `tdst` the result of adding `α * tsrc` to `β * tdst` after repartitioning the indices of +`tsrc`. This is just a special case of a transposition that only changes the number of in- and +outgoing indices. See [`repartition`](@ref) for creating a new tensor. 
""" -@propagate_inbounds function repartition!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap) +@propagate_inbounds function repartition!( + tdst::AbstractTensorMap, tsrc::AbstractTensorMap, + α::Number=One(), β::Number=Zero(), backend::AbstractBackend...; + allocator=TO.DefaultAllocator() + ) check_spacetype(tdst, tsrc) numind(tsrc) == numind(tdst) || throw(ArgumentError("tsrc and tdst should have an equal amount of indices")) all_inds = (codomainind(tsrc)..., reverse(domainind(tsrc))...) p₁ = ntuple(i -> all_inds[i], numout(tdst)) p₂ = reverse(ntuple(i -> all_inds[i + numout(tdst)], numin(tdst))) - return transpose!(tdst, tsrc, (p₁, p₂)) + return transpose!(tdst, tsrc, (p₁, p₂), α, β, backend...; allocator) end """ repartition( - tsrc::AbstractTensorMap{T, S}, N₁::Int, N₂::Int; copy::Bool=false + tsrc::AbstractTensorMap{T, S}, N₁::Int, N₂::Int; copy::Bool=false, + allocator=TO.DefaultAllocator() ) where {T, S} -> tdst::AbstractTensorMap{T, S, N₁, N₂} Return tensor `tdst` obtained by repartitioning the indices of `t`. @@ -249,14 +284,15 @@ If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwis To repartition into an existing destination, see [repartition!](@ref). """ @constprop :aggressive function repartition( - t::AbstractTensorMap, N₁::Int, N₂::Int = numind(t) - N₁; copy::Bool = false + t::AbstractTensorMap, N₁::Int, N₂::Int = numind(t) - N₁; + copy::Bool = false, allocator=TO.DefaultAllocator() ) N₁ + N₂ == numind(t) || throw(ArgumentError("Invalid repartition: $(numind(t)) to ($N₁, $N₂)")) all_inds = (codomainind(t)..., reverse(domainind(t))...) p₁ = ntuple(i -> all_inds[i], N₁) p₂ = reverse(ntuple(i -> all_inds[i + N₁], N₂)) - return transpose(t, (p₁, p₂); copy) + return transpose(t, (p₁, p₂); copy, allocator) end # Twist @@ -394,7 +430,7 @@ For this to work, that factor has to be isomorphic to the field of scalars. If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made. 
-This operation undoes the work of [`insertleftunit`](@ref insertleftunit(::AbstractTensorMap, ::Val{i}) where {i}) +This operation undoes the work of [`insertleftunit`](@ref insertleftunit(::AbstractTensorMap, ::Val{i}) where {i}) and [`insertrightunit`](@ref insertrightunit(::AbstractTensorMap, ::Val{i}) where {i}). """ function removeunit(t::AbstractTensorMap, ::Val{i}; copy::Bool = false) where {i} @@ -447,67 +483,56 @@ end return nothing end - +# Deprecated add_*! wrappers +# -------------------------- """ - add_permute!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple, - α::Number, β::Number, backend::AbstractBackend...) - -Return the updated `tdst`, which is the result of adding `α * tsrc` to `tdst` after permuting -the indices of `tsrc` according to `(p₁, p₂)`. + add_permute!(tdst, tsrc, (p₁, p₂)::Index2Tuple, α::Number, β::Number, backend::AbstractBackend...) -See also [`permute`](@ref), [`permute!`](@ref), [`add_braid!`](@ref), [`add_transpose!`](@ref). +!!! warning "Deprecated" + `add_permute!` is deprecated. Use `permute!(tdst, tsrc, p, α, β, backend...)` instead. """ -@propagate_inbounds function add_permute!( +function add_permute!( tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, α::Number, β::Number, backend::AbstractBackend... ) - @boundscheck spacecheck_transform(permute, tdst, tsrc, p) - transformer = treepermuter(tdst, tsrc, p) - return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend...) + Base.depwarn("`add_permute!` is deprecated, use `permute!` instead", :add_permute!) + return @inbounds permute!(tdst, tsrc, p, α, β, backend...) end """ - add_braid!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple, - levels::IndexTuple, α::Number, β::Number, backend::AbstractBackend...) + add_braid!(tdst, tsrc, (p₁, p₂)::Index2Tuple, levels::IndexTuple, α::Number, β::Number, + backend::AbstractBackend...) 
-Return the updated `tdst`, which is the result of adding `α * tsrc` to `tdst` after braiding -the indices of `tsrc` according to `(p₁, p₂)` and `levels`. - -See also [`braid`](@ref), [`braid!`](@ref), [`add_permute!`](@ref), [`add_transpose!`](@ref). +!!! warning "Deprecated" + `add_braid!` is deprecated. Use `braid!(tdst, tsrc, p, levels, α, β, backend...)` instead. """ -@propagate_inbounds function add_braid!( +function add_braid!( tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple, α::Number, β::Number, backend::AbstractBackend... ) - @boundscheck spacecheck_transform(braid, tdst, tsrc, p, levels) - levels1 = TupleTools.getindices(levels, codomainind(tsrc)) - levels2 = TupleTools.getindices(levels, domainind(tsrc)) - # TODO: arg order for tensormaps is different than for fusiontrees - transformer = treebraider(tdst, tsrc, p, (levels1, levels2)) - return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend...) + Base.depwarn("`add_braid!` is deprecated, use `braid!` instead", :add_braid!) + return @inbounds braid!(tdst, tsrc, p, levels, α, β, backend...) end """ - add_transpose!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple, - α::Number, β::Number, backend::AbstractBackend...) + add_transpose!(tdst, tsrc, (p₁, p₂)::Index2Tuple, α::Number, β::Number, + backend::AbstractBackend...) -Return the updated `tdst`, which is the result of adding `α * tsrc` to `tdst` after transposing -the indices of `tsrc` according to `(p₁, p₂)`. - -See also [`transpose`](@ref), [`transpose!`](@ref), [`add_permute!`](@ref), [`add_braid!`](@ref). +!!! warning "Deprecated" + `add_transpose!` is deprecated. Use `transpose!(tdst, tsrc, p, α, β, backend...)` instead. """ -@propagate_inbounds function add_transpose!( +function add_transpose!( tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, α::Number, β::Number, backend::AbstractBackend... 
) - @boundscheck spacecheck_transform(transpose, tdst, tsrc, p) - transformer = treetransposer(tdst, tsrc, p) - return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend...) + Base.depwarn("`add_transpose!` is deprecated, use `transpose!` instead", :add_transpose!) + return @inbounds transpose!(tdst, tsrc, p, α, β, backend...) end @propagate_inbounds function add_transform!( tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, transformer, - α::Number, β::Number, backend::AbstractBackend... + α::Number, β::Number, backend::AbstractBackend...; + allocator=TO.DefaultAllocator() ) @boundscheck spacecheck_transform(permute, tdst, tsrc, p) @@ -515,14 +540,15 @@ end add!(tdst, tsrc, α, β) else I = sectortype(tdst) + _backend = isempty(backend) ? TO.DefaultBackend() : only(backend) if I === Trivial - add_trivial_kernel!(tdst, tsrc, p, transformer, α, β, backend...) + add_trivial_kernel!(tdst, tsrc, p, transformer, α, β, _backend; allocator) else style = FusionStyle(I) if use_threaded_transform(tdst, transformer) - add_kernel_threaded!(style, tdst, tsrc, p, transformer, α, β, backend...) + add_kernel_threaded!(style, tdst, tsrc, p, transformer, α, β, _backend; allocator) else - add_kernel_nonthreaded!(style, tdst, tsrc, p, transformer, α, β, backend...) + add_kernel_nonthreaded!(style, tdst, tsrc, p, transformer, α, β, _backend; allocator) end end end @@ -539,70 +565,75 @@ end # Trivial implementations # ----------------------- -function add_trivial_kernel!(tdst, tsrc, p, transformer, α, β, backend...) - TO.tensoradd!(tdst[], tsrc[], p, false, α, β, backend...) +function add_trivial_kernel!(tdst, tsrc, p, transformer, α, β, backend; allocator=TO.DefaultAllocator()) + TO.tensoradd!(tdst[], tsrc[], p, false, α, β, backend, allocator) return nothing end # Non-threaded implementations # ---------------------------- function add_kernel_nonthreaded!( - ::UniqueFusion, tdst, tsrc, p, transformer, α, β, backend... 
+ ::UniqueFusion, tdst, tsrc, p, transformer, α, β, backend; allocator=TO.DefaultAllocator() ) for (f₁, f₂) in fusiontrees(tsrc) - _add_transform_single!(tdst, tsrc, p, (f₁, f₂), transformer, α, β, backend...) + _add_transform_single!(tdst, tsrc, p, (f₁, f₂), transformer, α, β, backend; allocator) end return nothing end function add_kernel_nonthreaded!( - ::UniqueFusion, tdst, tsrc, p, transformer::AbelianTreeTransformer, α, β, backend... + ::UniqueFusion, tdst, tsrc, p, transformer::AbelianTreeTransformer, α, β, backend; + allocator=TO.DefaultAllocator() ) for subtransformer in transformer.data - _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend...) + _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend; allocator) end return nothing end -function add_kernel_nonthreaded!(::FusionStyle, tdst, tsrc, p, transformer, α, β, backend...) +function add_kernel_nonthreaded!( + ::FusionStyle, tdst, tsrc, p, transformer, α, β, backend; allocator=TO.DefaultAllocator() + ) # preallocate buffers - buffers = allocate_buffers(tdst, tsrc, transformer) + buffers = allocate_buffers(tdst, tsrc, transformer; allocator) for src in fusionblocks(tsrc) if length(src) == 1 - _add_transform_single!(tdst, tsrc, p, src, transformer, α, β, backend...) + _add_transform_single!(tdst, tsrc, p, src, transformer, α, β, backend; allocator) else - _add_transform_multi!(tdst, tsrc, p, src, transformer, buffers, α, β, backend...) + _add_transform_multi!(tdst, tsrc, p, src, transformer, buffers, α, β, backend; allocator) end end return nothing end # specialization in the case of TensorMap function add_kernel_nonthreaded!( - ::FusionStyle, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend... 
+ ::FusionStyle, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend; + allocator=TO.DefaultAllocator() ) # preallocate buffers - buffers = allocate_buffers(tdst, tsrc, transformer) + buffers = allocate_buffers(tdst, tsrc, transformer; allocator) for subtransformer in transformer.data # Special case without intermediate buffers whenever there is only a single block if length(subtransformer[1]) == 1 - _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend...) + _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend; allocator) else - _add_transform_multi!(tdst, tsrc, p, subtransformer, buffers, α, β, backend...) + _add_transform_multi!(tdst, tsrc, p, subtransformer, buffers, α, β, backend; allocator) end end return nothing end # ambiguity resolution function add_kernel_nonthreaded!( - ::UniqueFusion, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend... + ::UniqueFusion, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend; + allocator=TO.DefaultAllocator() ) throw(ArgumentError("Cannot combine `GenericTreeTransformer` with `UniqueFusion`")) end # Threaded implementations # ------------------------ function add_kernel_threaded!( - ::UniqueFusion, tdst, tsrc, p, transformer, α, β, backend...; - ntasks::Int = get_num_transformer_threads() + ::UniqueFusion, tdst, tsrc, p, transformer, α, β, backend; + ntasks::Int = get_num_transformer_threads(), allocator=TO.DefaultAllocator() ) trees = fusiontrees(tsrc) nblocks = length(trees) @@ -613,15 +644,15 @@ function add_kernel_threaded!( local_counter = Threads.atomic_add!(counter, 1) local_counter > nblocks && break @inbounds (f₁, f₂) = trees[local_counter] - _add_transform_single!(tdst, tsrc, p, (f₁, f₂), transformer, α, β, backend...) 
+ _add_transform_single!(tdst, tsrc, p, (f₁, f₂), transformer, α, β, backend; allocator) end end end return nothing end function add_kernel_threaded!( - ::UniqueFusion, tdst, tsrc, p, transformer::AbelianTreeTransformer, α, β, backend...; - ntasks::Int = get_num_transformer_threads() + ::UniqueFusion, tdst, tsrc, p, transformer::AbelianTreeTransformer, α, β, backend; + ntasks::Int = get_num_transformer_threads(), allocator=TO.DefaultAllocator() ) nblocks = length(transformer.data) counter = Threads.Atomic{Int}(1) @@ -631,7 +662,7 @@ function add_kernel_threaded!( local_counter = Threads.atomic_add!(counter, 1) local_counter > nblocks && break @inbounds subtransformer = transformer.data[local_counter] - _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend...) + _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend; allocator) end end end @@ -639,8 +670,8 @@ function add_kernel_threaded!( end function add_kernel_threaded!( - ::FusionStyle, tdst, tsrc, p, transformer, α, β, backend...; - ntasks::Int = get_num_transformer_threads() + ::FusionStyle, tdst, tsrc, p, transformer, α, β, backend; + ntasks::Int = get_num_transformer_threads(), allocator=TO.DefaultAllocator() ) allblocks = fusionblocks(tsrc) nblocks = length(allblocks) @@ -649,16 +680,16 @@ function add_kernel_threaded!( Threads.@sync for _ in 1:min(ntasks, nblocks) Threads.@spawn begin # preallocate buffers for each task - buffers = allocate_buffers(tdst, tsrc, transformer) + buffers = allocate_buffers(tdst, tsrc, transformer; allocator) while true local_counter = Threads.atomic_add!(counter, 1) local_counter > nblocks && break @inbounds src = allblocks[local_counter] if length(src) == 1 - _add_transform_single!(tdst, tsrc, p, src, transformer, α, β, backend...) + _add_transform_single!(tdst, tsrc, p, src, transformer, α, β, backend; allocator) else - _add_transform_multi!(tdst, tsrc, p, src, transformer, buffers, α, β, backend...) 
+ _add_transform_multi!(tdst, tsrc, p, src, transformer, buffers, α, β, backend; allocator) end end end @@ -668,8 +699,8 @@ function add_kernel_threaded!( end # specialization in the case of TensorMap function add_kernel_threaded!( - ::FusionStyle, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend...; - ntasks::Int = get_num_transformer_threads() + ::FusionStyle, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend; + ntasks::Int = get_num_transformer_threads(), allocator=TO.DefaultAllocator() ) nblocks = length(transformer.data) @@ -677,16 +708,16 @@ function add_kernel_threaded!( Threads.@sync for _ in 1:min(ntasks, nblocks) Threads.@spawn begin # preallocate buffers for each task - buffers = allocate_buffers(tdst, tsrc, transformer) + buffers = allocate_buffers(tdst, tsrc, transformer; allocator) while true local_counter = Threads.atomic_add!(counter, 1) local_counter > nblocks && break @inbounds subtransformer = transformer.data[local_counter] if length(subtransformer[1]) == 1 - _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend...) + _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend; allocator) else - _add_transform_multi!(tdst, tsrc, p, subtransformer, buffers, α, β, backend...) 
+ _add_transform_multi!(tdst, tsrc, p, subtransformer, buffers, α, β, backend; allocator) end end end @@ -696,8 +727,8 @@ function add_kernel_threaded!( end # ambiguity resolution function add_kernel_threaded!( - ::UniqueFusion, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend...; - ntasks::Int = get_num_transformer_threads() + ::UniqueFusion, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend; + ntasks::Int = get_num_transformer_threads(), allocator=TO.DefaultAllocator() ) throw(ArgumentError("Cannot combine `GenericTreeTransformer` with `UniqueFusion`")) end @@ -705,40 +736,49 @@ end # Auxiliary methods # ----------------- -function _add_transform_single!(tdst, tsrc, p, (f₁, f₂)::FusionTreePair, transformer, α, β, backend...) +function _add_transform_single!( + tdst, tsrc, p, (f₁, f₂)::FusionTreePair, transformer, α, β, backend; + allocator=TO.DefaultAllocator() + ) (f₁′, f₂′), coeff = transformer((f₁, f₂)) - @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, backend...) + @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, backend, allocator) return nothing end -function _add_transform_single!(tdst, tsrc, p, src::FusionTreeBlock, transformer, α, β, backend...) +function _add_transform_single!( + tdst, tsrc, p, src::FusionTreeBlock, transformer, α, β, backend; + allocator=TO.DefaultAllocator() + ) dst, U = transformer(src) f₁, f₂ = only(fusiontrees(src)) f₁′, f₂′ = only(fusiontrees(dst)) coeff = only(U) - @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, backend...) + @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, backend, allocator) return nothing end function _add_transform_single!( tdst, tsrc, p, (coeff, struct_dst, struct_src)::AbelianTransformerData, - α, β, backend... + α, β, backend; allocator=TO.DefaultAllocator() ) subblock_dst = StridedView(tdst.data, struct_dst...) 
subblock_src = StridedView(tsrc.data, struct_src...) - TO.tensoradd!(subblock_dst, subblock_src, p, false, α * coeff, β, backend...) + TO.tensoradd!(subblock_dst, subblock_src, p, false, α * coeff, β, backend, allocator) return nothing end function _add_transform_single!( tdst, tsrc, p, (basistransform, structs_dst, structs_src)::GenericTransformerData, - α, β, backend... + α, β, backend; allocator=TO.DefaultAllocator() ) struct_dst = (structs_dst[1], only(structs_dst[2])...) struct_src = (structs_src[1], only(structs_src[2])...) coeff = only(basistransform) - _add_transform_single!(tdst, tsrc, p, (coeff, struct_dst, struct_src), α, β, backend...) + _add_transform_single!(tdst, tsrc, p, (coeff, struct_dst, struct_src), α, β, backend; allocator) return nothing end -function _add_transform_multi!(tdst, tsrc, p, src::FusionTreeBlock, transformer, (buffer1, buffer2), α, β, backend...) +function _add_transform_multi!( + tdst, tsrc, p, src::FusionTreeBlock, transformer, (buffer1, buffer2), α, β, backend; + allocator=TO.DefaultAllocator() + ) dst, U = transformer(src) rows, cols = size(U) sz_src = size(tsrc[first(fusiontrees(src))...]) @@ -764,14 +804,14 @@ function _add_transform_multi!(tdst, tsrc, p, src::FusionTreeBlock, transformer, for (i, (f₃, f₄)) in enumerate(fusiontrees(dst)) subblock_dst = tdst[f₃, f₄] bufblock_dst = sreshape(buffer_dst[:, i], sz_src) - TO.tensoradd!(subblock_dst, bufblock_dst, p, false, One(), β, backend...) + TO.tensoradd!(subblock_dst, bufblock_dst, p, false, One(), β, backend, allocator) end return nothing end function _add_transform_multi!( tdst, tsrc, p, (U, (sz_dst, structs_dst), (sz_src, structs_src)), - (buffer1, buffer2), α, β, backend... + (buffer1, buffer2), α, β, backend; allocator=TO.DefaultAllocator() ) rows, cols = size(U) blocksize = prod(sz_src) @@ -796,7 +836,7 @@ function _add_transform_multi!( for (i, struct_dst) in enumerate(structs_dst) subblock_dst = StridedView(tdst.data, sz_dst, struct_dst...) 
bufblock_dst = sreshape(buffer_dst[:, i], sz_src) - TO.tensoradd!(subblock_dst, bufblock_dst, p, false, One(), β, backend...) + TO.tensoradd!(subblock_dst, bufblock_dst, p, false, One(), β, backend, allocator) end return nothing diff --git a/src/tensors/tensoroperations.jl b/src/tensors/tensoroperations.jl index 3fc79cf0c..375b63768 100644 --- a/src/tensors/tensoroperations.jl +++ b/src/tensors/tensoroperations.jl @@ -43,9 +43,9 @@ function TO.tensoradd!( if conjA A′ = adjoint(A) pA′ = adjointtensorindices(A, _canonicalize(pA, C)) - add_permute!(C, A′, pA′, α, β, backend) + permute!(C, A′, pA′, α, β, backend) else - add_permute!(C, A, _canonicalize(pA, C), α, β, backend) + permute!(C, A, _canonicalize(pA, C), α, β, backend) end return C end diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl index 30ec1de0f..d796754c3 100644 --- a/src/tensors/treetransformers.jl +++ b/src/tensors/treetransformers.jl @@ -135,13 +135,15 @@ function buffersize(transformer::GenericTreeTransformer) end function allocate_buffers( - tdst::TensorMap, tsrc::TensorMap, transformer::GenericTreeTransformer + tdst::TensorMap, tsrc::TensorMap, transformer::GenericTreeTransformer; + allocator=TO.DefaultAllocator() ) sz = buffersize(transformer) return similar(tdst.data, sz), similar(tsrc.data, sz) end function allocate_buffers( - tdst::AbstractTensorMap, tsrc::AbstractTensorMap, transformer + tdst::AbstractTensorMap, tsrc::AbstractTensorMap, transformer; + allocator=TO.DefaultAllocator() ) # be pessimistic and assume the worst for now sz = dim(space(tsrc)) diff --git a/test/mooncake/indexmanipulations.jl b/test/mooncake/indexmanipulations.jl index 4dd6413cf..390721d71 100644 --- a/test/mooncake/indexmanipulations.jl +++ b/test/mooncake/indexmanipulations.jl @@ -18,7 +18,7 @@ eltypes = (Float64, ComplexF64) hasbraiding = BraidingStyle(sectortype(eltype(V))) isa HasBraiding symmetricbraiding = BraidingStyle(sectortype(eltype(V))) isa SymmetricBraiding - symmetricbraiding 
&& @timedtestset "add_permute!" begin + symmetricbraiding && @timedtestset "permute!" begin A = randn(T, V[1] ⊗ V[2] ← (V[3] ⊗ V[4] ⊗ V[5])') α = randn(T) β = randn(T) @@ -27,12 +27,12 @@ eltypes = (Float64, ComplexF64) for _ in 1:5 p = randindextuple(numind(A)) C = randn!(permute(A, p)) - Mooncake.TestUtils.test_rule(rng, TensorKit.add_permute!, C, A, p, α, β; atol, rtol, mode) + Mooncake.TestUtils.test_rule(rng, TensorKit.permute!, C, A, p, α, β; atol, rtol, mode) A = C end end - @timedtestset "add_transpose!" begin + @timedtestset "transpose!" begin A = randn(T, V[1] ⊗ V[2] ← (V[3] ⊗ V[4] ⊗ V[5])') α = randn(T) β = randn(T) @@ -41,18 +41,18 @@ eltypes = (Float64, ComplexF64) for _ in 1:2 p = randcircshift(numout(A), numin(A)) C = randn!(transpose(A, p)) - Mooncake.TestUtils.test_rule(rng, TensorKit.add_transpose!, C, A, p, One(), Zero(); atol, rtol, mode) - Mooncake.TestUtils.test_rule(rng, TensorKit.add_transpose!, C, A, p, α, β; atol, rtol, mode) + Mooncake.TestUtils.test_rule(rng, TensorKit.transpose!, C, A, p, One(), Zero(); atol, rtol, mode) + Mooncake.TestUtils.test_rule(rng, TensorKit.transpose!, C, A, p, α, β; atol, rtol, mode) if !(T <: Real) - Mooncake.TestUtils.test_rule(rng, TensorKit.add_transpose!, C, real(A), p, α, β; atol, rtol, mode) - Mooncake.TestUtils.test_rule(rng, TensorKit.add_transpose!, C, A, p, real(α), β; atol, rtol, mode) - Mooncake.TestUtils.test_rule(rng, TensorKit.add_transpose!, C, real(A), p, real(α), β; atol, rtol, mode) + Mooncake.TestUtils.test_rule(rng, TensorKit.transpose!, C, real(A), p, α, β; atol, rtol, mode) + Mooncake.TestUtils.test_rule(rng, TensorKit.transpose!, C, A, p, real(α), β; atol, rtol, mode) + Mooncake.TestUtils.test_rule(rng, TensorKit.transpose!, C, real(A), p, real(α), β; atol, rtol, mode) end A = C end end - hasbraiding && @timedtestset "add_braid!" begin + hasbraiding && @timedtestset "braid!" 
begin A = randn(T, V[1] ⊗ V[2] ← (V[3] ⊗ V[4] ⊗ V[5])') α = randn(T) β = randn(T) @@ -62,11 +62,11 @@ eltypes = (Float64, ComplexF64) p = randcircshift(numout(A), numin(A)) levels = Tuple(randperm(numind(A))) C = randn!(transpose(A, p)) - Mooncake.TestUtils.test_rule(rng, TensorKit.add_braid!, C, A, p, levels, α, β; atol, rtol, mode) + Mooncake.TestUtils.test_rule(rng, TensorKit.braid!, C, A, p, levels, α, β; atol, rtol, mode) if !(T <: Real) - Mooncake.TestUtils.test_rule(rng, TensorKit.add_braid!, C, real(A), p, levels, α, β; atol, rtol, mode) - Mooncake.TestUtils.test_rule(rng, TensorKit.add_braid!, C, A, p, levels, real(α), β; atol, rtol, mode) - Mooncake.TestUtils.test_rule(rng, TensorKit.add_braid!, C, A, p, levels, real(α), real(β); atol, rtol, mode) + Mooncake.TestUtils.test_rule(rng, TensorKit.braid!, C, real(A), p, levels, α, β; atol, rtol, mode) + Mooncake.TestUtils.test_rule(rng, TensorKit.braid!, C, A, p, levels, real(α), β; atol, rtol, mode) + Mooncake.TestUtils.test_rule(rng, TensorKit.braid!, C, A, p, levels, real(α), real(β); atol, rtol, mode) end A = C end From 6a50d11ebbb2cd1b50f759738fb6b0a8f017f15f Mon Sep 17 00:00:00 2001 From: lkdvos Date: Thu, 23 Apr 2026 10:19:52 -0400 Subject: [PATCH 02/23] Align index manipulation API with TensorOperations dispatch convention Replace single-method-with-variadics (`backend::AbstractBackend...`) with the TensorOperations-style dispatch chain: four separate overloads per in-place function inserting One()/Zero(), DefaultBackend(), and DefaultAllocator() successively so that the full 7-arg form is the implementation endpoint. Move `allocator` from keyword to positional argument throughout the internal chain (`add_transform!`, all kernel functions, `allocate_buffers`). Non-inplace functions (`permute`, `braid`, `transpose`, `repartition`) gain `backend` as a new keyword alongside the existing `allocator` keyword, keeping the user-facing API ergonomic. 
Docstrings updated to use TO-style bracket notation showing optional arguments. Co-Authored-By: Claude Sonnet 4.6 --- src/tensors/indexmanipulations.jl | 327 +++++++++++++++++------------- src/tensors/treetransformers.jl | 4 +- 2 files changed, 192 insertions(+), 139 deletions(-) diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index ce998f396..c76fe766f 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -47,40 +47,54 @@ function flip(t::AbstractTensorMap, I; inv::Bool = false) end """ - permute!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple, - α::Number=One(), β::Number=Zero(), backend::AbstractBackend...; - allocator=TO.DefaultAllocator()) - -> tdst + permute!(tdst, tsrc, (p₁, p₂)::Index2Tuple[, α=1[, β=0[, backend[, allocator]]]]) -> tdst -Write into `tdst` the result of adding `α * tsrc` to `β * tdst` after permuting the indices of `tsrc`. +Compute `tdst = β * tdst + α * permute(tsrc, (p₁, p₂))`, writing the result into `tdst`. The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively. +Optionally specify a `backend` and `allocator` for the underlying array operation. -See [`permute`](@ref) for creating a new tensor. +See also [`permute`](@ref) for creating a new tensor. 
""" +function Base.permute!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple) + return permute!(tdst, tsrc, p, One(), Zero()) +end +function Base.permute!( + tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, + α::Number, β::Number + ) + return permute!(tdst, tsrc, p, α, β, TO.DefaultBackend()) +end +function Base.permute!( + tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, + α::Number, β::Number, backend + ) + return permute!(tdst, tsrc, p, α, β, backend, TO.DefaultAllocator()) +end @propagate_inbounds function Base.permute!( tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, - α::Number=One(), β::Number=Zero(), backend::AbstractBackend...; - allocator=TO.DefaultAllocator() + α::Number, β::Number, backend, allocator ) @boundscheck spacecheck_transform(permute, tdst, tsrc, p) transformer = treepermuter(tdst, tsrc, p) - return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend...; allocator) + return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend, allocator) end """ - permute(tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple; copy::Bool = false, - allocator=TO.DefaultAllocator()) -> tdst::TensorMap + permute(tsrc, (p₁, p₂)::Index2Tuple; copy=false, + backend=DefaultBackend(), allocator=DefaultAllocator()) -> tdst::TensorMap Return tensor `tdst` obtained by permuting the indices of `tsrc`. The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively. If `copy = false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made. +Optionally specify a `backend` and `allocator` for the underlying array operation. -To permute into an existing destination, see [permute!](@ref) +See also [`permute!`](@ref) for writing into an existing destination. 
""" function permute( - t::AbstractTensorMap, p::Index2Tuple; copy::Bool = false, allocator=TO.DefaultAllocator() + t::AbstractTensorMap, p::Index2Tuple; + copy::Bool = false, backend=TO.DefaultBackend(), allocator=TO.DefaultAllocator() ) # share data if possible if !copy @@ -93,15 +107,20 @@ function permute( # general case tdst = similar(t, promote_permute(t), permute(space(t), p)) - return @inbounds permute!(tdst, t, p; allocator) + return @inbounds permute!(tdst, t, p, One(), Zero(), backend, allocator) end -function permute(t::AdjointTensorMap, (p₁, p₂)::Index2Tuple; copy::Bool = false, allocator=TO.DefaultAllocator()) +function permute( + t::AdjointTensorMap, (p₁, p₂)::Index2Tuple; + copy::Bool = false, backend=TO.DefaultBackend(), allocator=TO.DefaultAllocator() + ) p₁′ = adjointtensorindices(t, p₂) p₂′ = adjointtensorindices(t, p₁) - return adjoint(permute(adjoint(t), (p₁′, p₂′); copy, allocator)) + return adjoint(permute(adjoint(t), (p₁′, p₂′); copy, backend, allocator)) end -permute(t::AbstractTensorMap, p::IndexTuple; copy::Bool = false, allocator=TO.DefaultAllocator()) = - permute(t, (p, ()); copy, allocator) +permute( + t::AbstractTensorMap, p::IndexTuple; + copy::Bool = false, backend=TO.DefaultBackend(), allocator=TO.DefaultAllocator() +) = permute(t, (p, ()); copy, backend, allocator) function has_shared_permute(t::AbstractTensorMap, (p₁, p₂)::Index2Tuple) return (p₁ === codomainind(t) && p₂ === domainind(t)) @@ -127,35 +146,47 @@ end # Braid """ - braid!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, - (p₁, p₂)::Index2Tuple, levels::IndexTuple, - α::Number=One(), β::Number=Zero(), backend::AbstractBackend...; - allocator=TO.DefaultAllocator()) - -> tdst + braid!(tdst, tsrc, (p₁, p₂)::Index2Tuple, levels::IndexTuple[, α=1[, β=0[, backend[, allocator]]]]) -> tdst -Write into `tdst` the result of adding `α * tsrc` to `β * tdst` after braiding the indices of `tsrc`. 
+Compute `tdst = β * tdst + α * braid(tsrc, (p₁, p₂), levels)`, writing the result into `tdst`. The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively. Here, `levels` is a tuple of length `numind(tsrc)` that assigns a level or height to the indices of `tsrc`, which determines whether they will braid over or under any other index with which they have to change places. +Optionally specify a `backend` and `allocator` for the underlying array operation. -See [`braid`](@ref) for creating a new tensor. +See also [`braid`](@ref) for creating a new tensor. """ +function braid!( + tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple + ) + return braid!(tdst, tsrc, p, levels, One(), Zero()) +end +function braid!( + tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple, + α::Number, β::Number + ) + return braid!(tdst, tsrc, p, levels, α, β, TO.DefaultBackend()) +end +function braid!( + tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple, + α::Number, β::Number, backend + ) + return braid!(tdst, tsrc, p, levels, α, β, backend, TO.DefaultAllocator()) +end @propagate_inbounds function braid!( tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple, - α::Number=One(), β::Number=Zero(), backend::AbstractBackend...; - allocator=TO.DefaultAllocator() + α::Number, β::Number, backend, allocator ) @boundscheck spacecheck_transform(braid, tdst, tsrc, p, levels) levels1 = TupleTools.getindices(levels, codomainind(tsrc)) levels2 = TupleTools.getindices(levels, domainind(tsrc)) transformer = treebraider(tdst, tsrc, p, (levels1, levels2)) - return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend...; allocator) + return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend, allocator) end """ - braid(tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple, levels::IndexTuple; - copy::Bool = false, 
allocator=TO.DefaultAllocator()) - -> tdst::TensorMap + braid(tsrc, (p₁, p₂)::Index2Tuple, levels::IndexTuple; copy=false, + backend=DefaultBackend(), allocator=DefaultAllocator()) -> tdst::TensorMap Return tensor `tdst` obtained by braiding the indices of `tsrc`. The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively. @@ -163,21 +194,22 @@ Here, `levels` is a tuple of length `numind(tsrc)` that assigns a level or heigh which determines whether they will braid over or under any other index with which they have to change places. If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made. +Optionally specify a `backend` and `allocator` for the underlying array operation. -To braid into an existing destination, see [braid!](@ref) +See also [`braid!`](@ref) for writing into an existing destination. """ function braid( t::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple; - copy::Bool = false, allocator=TO.DefaultAllocator() + copy::Bool = false, backend=TO.DefaultBackend(), allocator=TO.DefaultAllocator() ) length(levels) == numind(t) || throw(ArgumentError("invalid levels")) - BraidingStyle(sectortype(t)) isa SymmetricBraiding && return permute(t, p; copy, allocator) + BraidingStyle(sectortype(t)) isa SymmetricBraiding && return permute(t, p; copy, backend, allocator) (!copy && p == (codomainind(t), domainind(t))) && return t # general case tdst = similar(t, promote_braid(t), permute(space(t), p)) - return @inbounds braid!(tdst, t, p, levels; allocator) + return @inbounds braid!(tdst, t, p, levels, One(), Zero(), backend, allocator) end # TODO: braid for `AdjointTensorMap`; think about how to map the `levels` argument. 
@@ -185,81 +217,108 @@ end _transpose_indices(t::AbstractTensorMap) = (reverse(domainind(t)), reverse(codomainind(t))) """ - transpose!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, - (p₁, p₂)::Index2Tuple, - α::Number=One(), β::Number=Zero(), backend::AbstractBackend...; - allocator=TO.DefaultAllocator()) - -> tdst + transpose!(tdst, tsrc, (p₁, p₂)::Index2Tuple[, α=1[, β=0[, backend[, allocator]]]]) -> tdst -Write into `tdst` the result of adding `α * tsrc` to `β * tdst` after transposing the indices of `tsrc`. +Compute `tdst = β * tdst + α * transpose(tsrc, (p₁, p₂))`, writing the result into `tdst`. The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively. The new index positions should be attainable without any indices crossing each other, i.e., -the permutation `(p₁..., reverse(p₂)...)` should constitute a cyclic permutation of `(codomainind(tsrc)..., reverse(domainind(tsrc))...)`. +the permutation `(p₁..., reverse(p₂)...)` should constitute a cyclic permutation of +`(codomainind(tsrc)..., reverse(domainind(tsrc))...)`. +Optionally specify a `backend` and `allocator` for the underlying array operation. -See [`transpose`](@ref) for creating a new tensor. +See also [`transpose`](@ref) for creating a new tensor. 
""" +function LinearAlgebra.transpose!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap) + return transpose!(tdst, tsrc, _transpose_indices(tsrc)) +end +function LinearAlgebra.transpose!( + tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple + ) + return transpose!(tdst, tsrc, p, One(), Zero()) +end +function LinearAlgebra.transpose!( + tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, + α::Number, β::Number + ) + return transpose!(tdst, tsrc, p, α, β, TO.DefaultBackend()) +end +function LinearAlgebra.transpose!( + tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, + α::Number, β::Number, backend + ) + return transpose!(tdst, tsrc, p, α, β, backend, TO.DefaultAllocator()) +end @propagate_inbounds function LinearAlgebra.transpose!( - tdst::AbstractTensorMap, tsrc::AbstractTensorMap, - p::Index2Tuple = _transpose_indices(tsrc), - α::Number=One(), β::Number=Zero(), backend::AbstractBackend...; - allocator=TO.DefaultAllocator() + tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, + α::Number, β::Number, backend, allocator ) @boundscheck spacecheck_transform(transpose, tdst, tsrc, p) transformer = treetransposer(tdst, tsrc, p) - return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend...; allocator) + return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend, allocator) end """ - transpose(tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple; - copy::Bool=false, allocator=TO.DefaultAllocator()) - -> tdst::TensorMap + transpose(tsrc, (p₁, p₂)::Index2Tuple; copy=false, + backend=DefaultBackend(), allocator=DefaultAllocator()) -> tdst::TensorMap Return tensor `tdst` obtained by transposing the indices of `tsrc`. The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively. 
The new index positions should be attainable without any indices crossing each other, i.e., -the permutation `(p₁..., reverse(p₂)...)` should constitute a cyclic permutation of `(codomainind(tsrc)..., reverse(domainind(tsrc))...)`. +the permutation `(p₁..., reverse(p₂)...)` should constitute a cyclic permutation of +`(codomainind(tsrc)..., reverse(domainind(tsrc))...)`. If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made. +Optionally specify a `backend` and `allocator` for the underlying array operation. -To transpose into an existing destination, see [transpose!](@ref) +See also [`transpose!`](@ref) for writing into an existing destination. """ function LinearAlgebra.transpose( t::AbstractTensorMap, p::Index2Tuple = _transpose_indices(t); - copy::Bool = false, allocator=TO.DefaultAllocator() + copy::Bool = false, backend=TO.DefaultBackend(), allocator=TO.DefaultAllocator() ) - sectortype(t) === Trivial && return permute(t, p; copy, allocator) + sectortype(t) === Trivial && return permute(t, p; copy, backend, allocator) (!copy && p == (codomainind(t), domainind(t))) && return t # general case tdst = similar(t, promote_transpose(t), permute(space(t), p)) - return @inbounds transpose!(tdst, t, p; allocator) + return @inbounds transpose!(tdst, t, p, One(), Zero(), backend, allocator) end function LinearAlgebra.transpose( t::AdjointTensorMap, (p₁, p₂)::Index2Tuple = _transpose_indices(t); - copy::Bool = false, allocator=TO.DefaultAllocator() + copy::Bool = false, backend=TO.DefaultBackend(), allocator=TO.DefaultAllocator() ) p₁′ = map(n -> adjointtensorindex(t, n), p₂) p₂′ = map(n -> adjointtensorindex(t, n), p₁) - return adjoint(transpose(adjoint(t), (p₁′, p₂′); copy, allocator)) + return adjoint(transpose(adjoint(t), (p₁′, p₂′); copy, backend, allocator)) end """ - repartition!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, - α::Number=One(), β::Number=Zero(), backend::AbstractBackend...; - 
allocator=TO.DefaultAllocator()) - -> tdst + repartition!(tdst, tsrc[, α=1[, β=0[, backend[, allocator]]]]) -> tdst -Write into `tdst` the result of adding `α * tsrc` to `β * tdst` after repartitioning the indices of -`tsrc`. This is just a special case of a transposition that only changes the number of in- and -outgoing indices. +Compute `tdst = β * tdst + α * repartition(tsrc)`, writing the result into `tdst`. +This is a special case of `transpose!` that only changes the partition of indices between +codomain and domain, without changing their cyclic order. +Optionally specify a `backend` and `allocator` for the underlying array operation. -See [`repartition`](@ref) for creating a new tensor. +See also [`repartition`](@ref) for creating a new tensor. """ +function repartition!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap) + return repartition!(tdst, tsrc, One(), Zero()) +end +function repartition!( + tdst::AbstractTensorMap, tsrc::AbstractTensorMap, α::Number, β::Number + ) + return repartition!(tdst, tsrc, α, β, TO.DefaultBackend()) +end +function repartition!( + tdst::AbstractTensorMap, tsrc::AbstractTensorMap, α::Number, β::Number, backend + ) + return repartition!(tdst, tsrc, α, β, backend, TO.DefaultAllocator()) +end @propagate_inbounds function repartition!( tdst::AbstractTensorMap, tsrc::AbstractTensorMap, - α::Number=One(), β::Number=Zero(), backend::AbstractBackend...; - allocator=TO.DefaultAllocator() + α::Number, β::Number, backend, allocator ) check_spacetype(tdst, tsrc) numind(tsrc) == numind(tdst) || @@ -267,32 +326,32 @@ See [`repartition`](@ref) for creating a new tensor. all_inds = (codomainind(tsrc)..., reverse(domainind(tsrc))...) 
p₁ = ntuple(i -> all_inds[i], numout(tdst)) p₂ = reverse(ntuple(i -> all_inds[i + numout(tdst)], numin(tdst))) - return transpose!(tdst, tsrc, (p₁, p₂), α, β, backend...; allocator) + return transpose!(tdst, tsrc, (p₁, p₂), α, β, backend, allocator) end """ - repartition( - tsrc::AbstractTensorMap{T, S}, N₁::Int, N₂::Int; copy::Bool=false, - allocator=TO.DefaultAllocator() - ) where {T, S} -> tdst::AbstractTensorMap{T, S, N₁, N₂} + repartition(tsrc, N₁::Int, N₂::Int=numind(tsrc)-N₁; copy=false, + backend=DefaultBackend(), allocator=DefaultAllocator()) -> tdst -Return tensor `tdst` obtained by repartitioning the indices of `t`. -The codomain and domain of `tdst` correspond to the first `N₁` and last `N₂` spaces of `t`, respectively. +Return tensor `tdst` obtained by repartitioning the indices of `tsrc`. +The codomain and domain of `tdst` correspond to the first `N₁` and last `N₂` spaces of `tsrc`, +respectively. If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made. +Optionally specify a `backend` and `allocator` for the underlying array operation. -To repartition into an existing destination, see [repartition!](@ref). +See also [`repartition!`](@ref) for writing into an existing destination. """ @constprop :aggressive function repartition( t::AbstractTensorMap, N₁::Int, N₂::Int = numind(t) - N₁; - copy::Bool = false, allocator=TO.DefaultAllocator() + copy::Bool = false, backend=TO.DefaultBackend(), allocator=TO.DefaultAllocator() ) N₁ + N₂ == numind(t) || throw(ArgumentError("Invalid repartition: $(numind(t)) to ($N₁, $N₂)")) all_inds = (codomainind(t)..., reverse(domainind(t))...) p₁ = ntuple(i -> all_inds[i], N₁) p₂ = reverse(ntuple(i -> all_inds[i + N₁], N₂)) - return transpose(t, (p₁, p₂); copy, allocator) + return transpose(t, (p₁, p₂); copy, backend, allocator) end # Twist @@ -486,10 +545,10 @@ end # Deprecated add_*! 
wrappers # -------------------------- """ - add_permute!(tdst, tsrc, (p₁, p₂)::Index2Tuple, α::Number, β::Number, backend::AbstractBackend...) + add_permute!(tdst, tsrc, (p₁, p₂)::Index2Tuple, α::Number, β::Number[, backend]) !!! warning "Deprecated" - `add_permute!` is deprecated. Use `permute!(tdst, tsrc, p, α, β, backend...)` instead. + `add_permute!` is deprecated. Use `permute!(tdst, tsrc, p, α, β[, backend])` instead. """ function add_permute!( tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, @@ -500,11 +559,10 @@ function add_permute!( end """ - add_braid!(tdst, tsrc, (p₁, p₂)::Index2Tuple, levels::IndexTuple, α::Number, β::Number, - backend::AbstractBackend...) + add_braid!(tdst, tsrc, (p₁, p₂)::Index2Tuple, levels::IndexTuple, α::Number, β::Number[, backend]) !!! warning "Deprecated" - `add_braid!` is deprecated. Use `braid!(tdst, tsrc, p, levels, α, β, backend...)` instead. + `add_braid!` is deprecated. Use `braid!(tdst, tsrc, p, levels, α, β[, backend])` instead. """ function add_braid!( tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple, @@ -515,11 +573,10 @@ function add_braid!( end """ - add_transpose!(tdst, tsrc, (p₁, p₂)::Index2Tuple, α::Number, β::Number, - backend::AbstractBackend...) + add_transpose!(tdst, tsrc, (p₁, p₂)::Index2Tuple, α::Number, β::Number[, backend]) !!! warning "Deprecated" - `add_transpose!` is deprecated. Use `transpose!(tdst, tsrc, p, α, β, backend...)` instead. + `add_transpose!` is deprecated. Use `transpose!(tdst, tsrc, p, α, β[, backend])` instead. 
""" function add_transpose!( tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, @@ -531,8 +588,7 @@ end @propagate_inbounds function add_transform!( tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, transformer, - α::Number, β::Number, backend::AbstractBackend...; - allocator=TO.DefaultAllocator() + α::Number, β::Number, backend, allocator ) @boundscheck spacecheck_transform(permute, tdst, tsrc, p) @@ -540,15 +596,14 @@ end add!(tdst, tsrc, α, β) else I = sectortype(tdst) - _backend = isempty(backend) ? TO.DefaultBackend() : only(backend) if I === Trivial - add_trivial_kernel!(tdst, tsrc, p, transformer, α, β, _backend; allocator) + add_trivial_kernel!(tdst, tsrc, p, transformer, α, β, backend, allocator) else style = FusionStyle(I) if use_threaded_transform(tdst, transformer) - add_kernel_threaded!(style, tdst, tsrc, p, transformer, α, β, _backend; allocator) + add_kernel_threaded!(style, tdst, tsrc, p, transformer, α, β, backend, allocator) else - add_kernel_nonthreaded!(style, tdst, tsrc, p, transformer, α, β, _backend; allocator) + add_kernel_nonthreaded!(style, tdst, tsrc, p, transformer, α, β, backend, allocator) end end end @@ -565,7 +620,7 @@ end # Trivial implementations # ----------------------- -function add_trivial_kernel!(tdst, tsrc, p, transformer, α, β, backend; allocator=TO.DefaultAllocator()) +function add_trivial_kernel!(tdst, tsrc, p, transformer, α, β, backend, allocator) TO.tensoradd!(tdst[], tsrc[], p, false, α, β, backend, allocator) return nothing end @@ -573,67 +628,67 @@ end # Non-threaded implementations # ---------------------------- function add_kernel_nonthreaded!( - ::UniqueFusion, tdst, tsrc, p, transformer, α, β, backend; allocator=TO.DefaultAllocator() + ::UniqueFusion, tdst, tsrc, p, transformer, α, β, backend, allocator ) for (f₁, f₂) in fusiontrees(tsrc) - _add_transform_single!(tdst, tsrc, p, (f₁, f₂), transformer, α, β, backend; allocator) + _add_transform_single!(tdst, tsrc, p, (f₁, f₂), 
transformer, α, β, backend, allocator) end return nothing end function add_kernel_nonthreaded!( - ::UniqueFusion, tdst, tsrc, p, transformer::AbelianTreeTransformer, α, β, backend; - allocator=TO.DefaultAllocator() + ::UniqueFusion, tdst, tsrc, p, transformer::AbelianTreeTransformer, α, β, backend, + allocator ) for subtransformer in transformer.data - _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend; allocator) + _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend, allocator) end return nothing end function add_kernel_nonthreaded!( - ::FusionStyle, tdst, tsrc, p, transformer, α, β, backend; allocator=TO.DefaultAllocator() + ::FusionStyle, tdst, tsrc, p, transformer, α, β, backend, allocator ) # preallocate buffers - buffers = allocate_buffers(tdst, tsrc, transformer; allocator) + buffers = allocate_buffers(tdst, tsrc, transformer, allocator) for src in fusionblocks(tsrc) if length(src) == 1 - _add_transform_single!(tdst, tsrc, p, src, transformer, α, β, backend; allocator) + _add_transform_single!(tdst, tsrc, p, src, transformer, α, β, backend, allocator) else - _add_transform_multi!(tdst, tsrc, p, src, transformer, buffers, α, β, backend; allocator) + _add_transform_multi!(tdst, tsrc, p, src, transformer, buffers, α, β, backend, allocator) end end return nothing end # specialization in the case of TensorMap function add_kernel_nonthreaded!( - ::FusionStyle, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend; - allocator=TO.DefaultAllocator() + ::FusionStyle, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend, + allocator ) # preallocate buffers - buffers = allocate_buffers(tdst, tsrc, transformer; allocator) + buffers = allocate_buffers(tdst, tsrc, transformer, allocator) for subtransformer in transformer.data # Special case without intermediate buffers whenever there is only a single block if length(subtransformer[1]) == 1 - _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend; 
allocator) + _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend, allocator) else - _add_transform_multi!(tdst, tsrc, p, subtransformer, buffers, α, β, backend; allocator) + _add_transform_multi!(tdst, tsrc, p, subtransformer, buffers, α, β, backend, allocator) end end return nothing end # ambiguity resolution function add_kernel_nonthreaded!( - ::UniqueFusion, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend; - allocator=TO.DefaultAllocator() + ::UniqueFusion, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend, + allocator ) throw(ArgumentError("Cannot combine `GenericTreeTransformer` with `UniqueFusion`")) end # Threaded implementations # ------------------------ function add_kernel_threaded!( - ::UniqueFusion, tdst, tsrc, p, transformer, α, β, backend; - ntasks::Int = get_num_transformer_threads(), allocator=TO.DefaultAllocator() + ::UniqueFusion, tdst, tsrc, p, transformer, α, β, backend, allocator; + ntasks::Int = get_num_transformer_threads() ) trees = fusiontrees(tsrc) nblocks = length(trees) @@ -644,15 +699,15 @@ function add_kernel_threaded!( local_counter = Threads.atomic_add!(counter, 1) local_counter > nblocks && break @inbounds (f₁, f₂) = trees[local_counter] - _add_transform_single!(tdst, tsrc, p, (f₁, f₂), transformer, α, β, backend; allocator) + _add_transform_single!(tdst, tsrc, p, (f₁, f₂), transformer, α, β, backend, allocator) end end end return nothing end function add_kernel_threaded!( - ::UniqueFusion, tdst, tsrc, p, transformer::AbelianTreeTransformer, α, β, backend; - ntasks::Int = get_num_transformer_threads(), allocator=TO.DefaultAllocator() + ::UniqueFusion, tdst, tsrc, p, transformer::AbelianTreeTransformer, α, β, backend, + allocator; ntasks::Int = get_num_transformer_threads() ) nblocks = length(transformer.data) counter = Threads.Atomic{Int}(1) @@ -662,7 +717,7 @@ function add_kernel_threaded!( local_counter = Threads.atomic_add!(counter, 1) local_counter > nblocks && break @inbounds 
subtransformer = transformer.data[local_counter] - _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend; allocator) + _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend, allocator) end end end @@ -670,8 +725,8 @@ function add_kernel_threaded!( end function add_kernel_threaded!( - ::FusionStyle, tdst, tsrc, p, transformer, α, β, backend; - ntasks::Int = get_num_transformer_threads(), allocator=TO.DefaultAllocator() + ::FusionStyle, tdst, tsrc, p, transformer, α, β, backend, allocator; + ntasks::Int = get_num_transformer_threads() ) allblocks = fusionblocks(tsrc) nblocks = length(allblocks) @@ -680,16 +735,16 @@ function add_kernel_threaded!( Threads.@sync for _ in 1:min(ntasks, nblocks) Threads.@spawn begin # preallocate buffers for each task - buffers = allocate_buffers(tdst, tsrc, transformer; allocator) + buffers = allocate_buffers(tdst, tsrc, transformer, allocator) while true local_counter = Threads.atomic_add!(counter, 1) local_counter > nblocks && break @inbounds src = allblocks[local_counter] if length(src) == 1 - _add_transform_single!(tdst, tsrc, p, src, transformer, α, β, backend; allocator) + _add_transform_single!(tdst, tsrc, p, src, transformer, α, β, backend, allocator) else - _add_transform_multi!(tdst, tsrc, p, src, transformer, buffers, α, β, backend; allocator) + _add_transform_multi!(tdst, tsrc, p, src, transformer, buffers, α, β, backend, allocator) end end end @@ -699,8 +754,8 @@ function add_kernel_threaded!( end # specialization in the case of TensorMap function add_kernel_threaded!( - ::FusionStyle, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend; - ntasks::Int = get_num_transformer_threads(), allocator=TO.DefaultAllocator() + ::FusionStyle, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend, + allocator; ntasks::Int = get_num_transformer_threads() ) nblocks = length(transformer.data) @@ -708,16 +763,16 @@ function add_kernel_threaded!( Threads.@sync for _ in 1:min(ntasks, 
nblocks) Threads.@spawn begin # preallocate buffers for each task - buffers = allocate_buffers(tdst, tsrc, transformer; allocator) + buffers = allocate_buffers(tdst, tsrc, transformer, allocator) while true local_counter = Threads.atomic_add!(counter, 1) local_counter > nblocks && break @inbounds subtransformer = transformer.data[local_counter] if length(subtransformer[1]) == 1 - _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend; allocator) + _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend, allocator) else - _add_transform_multi!(tdst, tsrc, p, subtransformer, buffers, α, β, backend; allocator) + _add_transform_multi!(tdst, tsrc, p, subtransformer, buffers, α, β, backend, allocator) end end end @@ -727,8 +782,8 @@ function add_kernel_threaded!( end # ambiguity resolution function add_kernel_threaded!( - ::UniqueFusion, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend; - ntasks::Int = get_num_transformer_threads(), allocator=TO.DefaultAllocator() + ::UniqueFusion, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend, + allocator; ntasks::Int = get_num_transformer_threads() ) throw(ArgumentError("Cannot combine `GenericTreeTransformer` with `UniqueFusion`")) end @@ -737,16 +792,14 @@ end # Auxiliary methods # ----------------- function _add_transform_single!( - tdst, tsrc, p, (f₁, f₂)::FusionTreePair, transformer, α, β, backend; - allocator=TO.DefaultAllocator() + tdst, tsrc, p, (f₁, f₂)::FusionTreePair, transformer, α, β, backend, allocator ) (f₁′, f₂′), coeff = transformer((f₁, f₂)) @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, backend, allocator) return nothing end function _add_transform_single!( - tdst, tsrc, p, src::FusionTreeBlock, transformer, α, β, backend; - allocator=TO.DefaultAllocator() + tdst, tsrc, p, src::FusionTreeBlock, transformer, α, β, backend, allocator ) dst, U = transformer(src) f₁, f₂ = only(fusiontrees(src)) @@ -757,7 +810,7 @@ function 
_add_transform_single!( end function _add_transform_single!( tdst, tsrc, p, (coeff, struct_dst, struct_src)::AbelianTransformerData, - α, β, backend; allocator=TO.DefaultAllocator() + α, β, backend, allocator ) subblock_dst = StridedView(tdst.data, struct_dst...) subblock_src = StridedView(tsrc.data, struct_src...) @@ -766,18 +819,18 @@ function _add_transform_single!( end function _add_transform_single!( tdst, tsrc, p, (basistransform, structs_dst, structs_src)::GenericTransformerData, - α, β, backend; allocator=TO.DefaultAllocator() + α, β, backend, allocator ) struct_dst = (structs_dst[1], only(structs_dst[2])...) struct_src = (structs_src[1], only(structs_src[2])...) coeff = only(basistransform) - _add_transform_single!(tdst, tsrc, p, (coeff, struct_dst, struct_src), α, β, backend; allocator) + _add_transform_single!(tdst, tsrc, p, (coeff, struct_dst, struct_src), α, β, backend, allocator) return nothing end function _add_transform_multi!( - tdst, tsrc, p, src::FusionTreeBlock, transformer, (buffer1, buffer2), α, β, backend; - allocator=TO.DefaultAllocator() + tdst, tsrc, p, src::FusionTreeBlock, transformer, (buffer1, buffer2), α, β, backend, + allocator ) dst, U = transformer(src) rows, cols = size(U) @@ -811,7 +864,7 @@ function _add_transform_multi!( end function _add_transform_multi!( tdst, tsrc, p, (U, (sz_dst, structs_dst), (sz_src, structs_src)), - (buffer1, buffer2), α, β, backend; allocator=TO.DefaultAllocator() + (buffer1, buffer2), α, β, backend, allocator ) rows, cols = size(U) blocksize = prod(sz_src) diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl index d796754c3..8c63b00a8 100644 --- a/src/tensors/treetransformers.jl +++ b/src/tensors/treetransformers.jl @@ -135,14 +135,14 @@ function buffersize(transformer::GenericTreeTransformer) end function allocate_buffers( - tdst::TensorMap, tsrc::TensorMap, transformer::GenericTreeTransformer; + tdst::TensorMap, tsrc::TensorMap, transformer::GenericTreeTransformer, 
allocator=TO.DefaultAllocator() ) sz = buffersize(transformer) return similar(tdst.data, sz), similar(tsrc.data, sz) end function allocate_buffers( - tdst::AbstractTensorMap, tsrc::AbstractTensorMap, transformer; + tdst::AbstractTensorMap, tsrc::AbstractTensorMap, transformer, allocator=TO.DefaultAllocator() ) # be pessimistic and assume the worst for now From 8b0e3c2db09cda65e59e020834f714abef22ab24 Mon Sep 17 00:00:00 2001 From: lkdvos Date: Thu, 23 Apr 2026 11:02:28 -0400 Subject: [PATCH 03/23] simplify implementation --- src/TensorKit.jl | 3 +- src/tensors/indexmanipulations.jl | 121 ++++++++---------------------- 2 files changed, 34 insertions(+), 90 deletions(-) diff --git a/src/TensorKit.jl b/src/TensorKit.jl index d8361ac79..db3d9c50f 100644 --- a/src/TensorKit.jl +++ b/src/TensorKit.jl @@ -91,8 +91,7 @@ export left_orth, right_orth, left_null, right_null, isisometric, isunitary, project_isometric, project_isometric!, isposdef, isposdef!, sylvester, rank, cond -export braid, braid!, permute, permute!, transpose, transpose!, twist, twist!, repartition, - repartition! +export braid, braid!, permute, permute!, transpose, transpose!, twist, twist!, repartition, repartition! export catdomain, catcodomain, absorb, absorb! # tensor operations diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index c76fe766f..f9eb1b4da 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -46,8 +46,11 @@ function flip(t::AbstractTensorMap, I; inv::Bool = false) return t′ end +# -------------- +# permute(!) +# -------------- """ - permute!(tdst, tsrc, (p₁, p₂)::Index2Tuple[, α=1[, β=0[, backend[, allocator]]]]) -> tdst + permute!(tdst, tsrc, (p₁, p₂)::Index2Tuple, α = 1, β = 0, [backend], [allocator]) -> tdst Compute `tdst = β * tdst + α * permute(tsrc, (p₁, p₂))`, writing the result into `tdst`. The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively. 
@@ -55,24 +58,10 @@ Optionally specify a `backend` and `allocator` for the underlying array operatio See also [`permute`](@ref) for creating a new tensor. """ -function Base.permute!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple) - return permute!(tdst, tsrc, p, One(), Zero()) -end -function Base.permute!( - tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, - α::Number, β::Number - ) - return permute!(tdst, tsrc, p, α, β, TO.DefaultBackend()) -end -function Base.permute!( - tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, - α::Number, β::Number, backend - ) - return permute!(tdst, tsrc, p, α, β, backend, TO.DefaultAllocator()) -end @propagate_inbounds function Base.permute!( tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, - α::Number, β::Number, backend, allocator + α::Number = One(), β::Number = Zero(), + backend::AbstractBackend = TO.DefaultBackend(), allocator = TO.DefaultAllocator() ) @boundscheck spacecheck_transform(permute, tdst, tsrc, p) transformer = treepermuter(tdst, tsrc, p) @@ -80,8 +69,7 @@ end end """ - permute(tsrc, (p₁, p₂)::Index2Tuple; copy=false, - backend=DefaultBackend(), allocator=DefaultAllocator()) -> tdst::TensorMap + permute(tsrc, (p₁, p₂)::Index2Tuple; copy = false, [backend], [allocator]) -> tdst Return tensor `tdst` obtained by permuting the indices of `tsrc`. The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively. @@ -94,7 +82,7 @@ See also [`permute!`](@ref) for writing into an existing destination. 
""" function permute( t::AbstractTensorMap, p::Index2Tuple; - copy::Bool = false, backend=TO.DefaultBackend(), allocator=TO.DefaultAllocator() + copy::Bool = false, backend::AbstractBackend = TO.DefaultBackend(), allocator = TO.DefaultAllocator() ) # share data if possible if !copy @@ -109,18 +97,12 @@ function permute( tdst = similar(t, promote_permute(t), permute(space(t), p)) return @inbounds permute!(tdst, t, p, One(), Zero(), backend, allocator) end -function permute( - t::AdjointTensorMap, (p₁, p₂)::Index2Tuple; - copy::Bool = false, backend=TO.DefaultBackend(), allocator=TO.DefaultAllocator() - ) +function permute(t::AdjointTensorMap, (p₁, p₂)::Index2Tuple; kwargs...) p₁′ = adjointtensorindices(t, p₂) p₂′ = adjointtensorindices(t, p₁) - return adjoint(permute(adjoint(t), (p₁′, p₂′); copy, backend, allocator)) + return adjoint(permute(adjoint(t), (p₁′, p₂′); kwargs...)) end -permute( - t::AbstractTensorMap, p::IndexTuple; - copy::Bool = false, backend=TO.DefaultBackend(), allocator=TO.DefaultAllocator() -) = permute(t, (p, ()); copy, backend, allocator) +permute(t::AbstractTensorMap, p::IndexTuple; kwargs...) = permute(t, (p, ()); kwargs...) function has_shared_permute(t::AbstractTensorMap, (p₁, p₂)::Index2Tuple) return (p₁ === codomainind(t) && p₂ === domainind(t)) @@ -144,9 +126,11 @@ function has_shared_permute(t::AdjointTensorMap, (p₁, p₂)::Index2Tuple) return has_shared_permute(t', (p₁′, p₂′)) end -# Braid +# ------------- +# braid(!) +# ------------- """ - braid!(tdst, tsrc, (p₁, p₂)::Index2Tuple, levels::IndexTuple[, α=1[, β=0[, backend[, allocator]]]]) -> tdst + braid!(tdst, tsrc, (p₁, p₂)::Index2Tuple, levels::IndexTuple, α = 1, β = 0, [backend], [allocator]) -> tdst Compute `tdst = β * tdst + α * braid(tsrc, (p₁, p₂), levels)`, writing the result into `tdst`. The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively. 
@@ -156,26 +140,10 @@ Optionally specify a `backend` and `allocator` for the underlying array operatio See also [`braid`](@ref) for creating a new tensor. """ -function braid!( - tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple - ) - return braid!(tdst, tsrc, p, levels, One(), Zero()) -end -function braid!( - tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple, - α::Number, β::Number - ) - return braid!(tdst, tsrc, p, levels, α, β, TO.DefaultBackend()) -end -function braid!( - tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple, - α::Number, β::Number, backend - ) - return braid!(tdst, tsrc, p, levels, α, β, backend, TO.DefaultAllocator()) -end @propagate_inbounds function braid!( tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple, - α::Number, β::Number, backend, allocator + α::Number = One(), β::Number = Zero(), + backend::AbstractBackend = TO.DefaultBackend(), allocator = TO.DefaultAllocator() ) @boundscheck spacecheck_transform(braid, tdst, tsrc, p, levels) levels1 = TupleTools.getindices(levels, codomainind(tsrc)) @@ -200,7 +168,7 @@ See also [`braid!`](@ref) for writing into an existing destination. """ function braid( t::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple; - copy::Bool = false, backend=TO.DefaultBackend(), allocator=TO.DefaultAllocator() + copy::Bool = false, backend::AbstractBackend = TO.DefaultBackend(), allocator = TO.DefaultAllocator() ) length(levels) == numind(t) || throw(ArgumentError("invalid levels")) @@ -213,11 +181,13 @@ function braid( end # TODO: braid for `AdjointTensorMap`; think about how to map the `levels` argument. -# Transpose +# ---------------- +# transpose(!) 
+# ---------------- _transpose_indices(t::AbstractTensorMap) = (reverse(domainind(t)), reverse(codomainind(t))) """ - transpose!(tdst, tsrc, (p₁, p₂)::Index2Tuple[, α=1[, β=0[, backend[, allocator]]]]) -> tdst + transpose!(tdst, tsrc, (p₁, p₂)::Index2Tuple, α = 1, β = 0, [backend], [allocator]) -> tdst Compute `tdst = β * tdst + α * transpose(tsrc, (p₁, p₂))`, writing the result into `tdst`. The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively. @@ -231,26 +201,10 @@ See also [`transpose`](@ref) for creating a new tensor. function LinearAlgebra.transpose!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap) return transpose!(tdst, tsrc, _transpose_indices(tsrc)) end -function LinearAlgebra.transpose!( - tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple - ) - return transpose!(tdst, tsrc, p, One(), Zero()) -end -function LinearAlgebra.transpose!( - tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, - α::Number, β::Number - ) - return transpose!(tdst, tsrc, p, α, β, TO.DefaultBackend()) -end -function LinearAlgebra.transpose!( - tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, - α::Number, β::Number, backend - ) - return transpose!(tdst, tsrc, p, α, β, backend, TO.DefaultAllocator()) -end @propagate_inbounds function LinearAlgebra.transpose!( tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, - α::Number, β::Number, backend, allocator + α::Number = One(), β::Number = Zero(), + backend::AbstractBackend = TO.DefaultBackend(), allocator = TO.DefaultAllocator() ) @boundscheck spacecheck_transform(transpose, tdst, tsrc, p) transformer = treetransposer(tdst, tsrc, p) @@ -274,7 +228,7 @@ See also [`transpose!`](@ref) for writing into an existing destination. 
""" function LinearAlgebra.transpose( t::AbstractTensorMap, p::Index2Tuple = _transpose_indices(t); - copy::Bool = false, backend=TO.DefaultBackend(), allocator=TO.DefaultAllocator() + copy::Bool = false, backend = TO.DefaultBackend(), allocator = TO.DefaultAllocator() ) sectortype(t) === Trivial && return permute(t, p; copy, backend, allocator) (!copy && p == (codomainind(t), domainind(t))) && return t @@ -286,15 +240,18 @@ end function LinearAlgebra.transpose( t::AdjointTensorMap, (p₁, p₂)::Index2Tuple = _transpose_indices(t); - copy::Bool = false, backend=TO.DefaultBackend(), allocator=TO.DefaultAllocator() + copy::Bool = false, backend = TO.DefaultBackend(), allocator = TO.DefaultAllocator() ) p₁′ = map(n -> adjointtensorindex(t, n), p₂) p₂′ = map(n -> adjointtensorindex(t, n), p₁) return adjoint(transpose(adjoint(t), (p₁′, p₂′); copy, backend, allocator)) end +# ------------------- +# repartition(!) +# ------------------- """ - repartition!(tdst, tsrc[, α=1[, β=0[, backend[, allocator]]]]) -> tdst + repartition!(tdst, tsrc, α = 1, β = 0, [backend], [allocator]) -> tdst Compute `tdst = β * tdst + α * repartition(tsrc)`, writing the result into `tdst`. This is a special case of `transpose!` that only changes the partition of indices between @@ -303,22 +260,10 @@ Optionally specify a `backend` and `allocator` for the underlying array operatio See also [`repartition`](@ref) for creating a new tensor. 
""" -function repartition!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap) - return repartition!(tdst, tsrc, One(), Zero()) -end -function repartition!( - tdst::AbstractTensorMap, tsrc::AbstractTensorMap, α::Number, β::Number - ) - return repartition!(tdst, tsrc, α, β, TO.DefaultBackend()) -end -function repartition!( - tdst::AbstractTensorMap, tsrc::AbstractTensorMap, α::Number, β::Number, backend - ) - return repartition!(tdst, tsrc, α, β, backend, TO.DefaultAllocator()) -end @propagate_inbounds function repartition!( tdst::AbstractTensorMap, tsrc::AbstractTensorMap, - α::Number, β::Number, backend, allocator + α::Number = One(), β::Number = Zero(), + backend::AbstractBackend = TO.DefaultBackend(), allocator = TO.DefaultAllocator() ) check_spacetype(tdst, tsrc) numind(tsrc) == numind(tdst) || @@ -344,7 +289,7 @@ See also [`repartition!`](@ref) for writing into an existing destination. """ @constprop :aggressive function repartition( t::AbstractTensorMap, N₁::Int, N₂::Int = numind(t) - N₁; - copy::Bool = false, backend=TO.DefaultBackend(), allocator=TO.DefaultAllocator() + copy::Bool = false, backend = TO.DefaultBackend(), allocator = TO.DefaultAllocator() ) N₁ + N₂ == numind(t) || throw(ArgumentError("Invalid repartition: $(numind(t)) to ($N₁, $N₂)")) From 79b8c84d593624f507a5f4d82a2599c2a90392a3 Mon Sep 17 00:00:00 2001 From: lkdvos Date: Thu, 23 Apr 2026 11:59:11 -0400 Subject: [PATCH 04/23] minor code improvements --- src/tensors/indexmanipulations.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index f9eb1b4da..f852f156b 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -111,8 +111,8 @@ function has_shared_permute(t::TensorMap, (p₁, p₂)::Index2Tuple) if p₁ === codomainind(t) && p₂ === domainind(t) return true elseif sectortype(t) === Trivial - stridet = i -> stride(t[], i) - sizet = i -> size(t[], i) + stridet = 
Base.Fix1(stride, t[]) + sizet = Base.Fix1(size, t[]) canfuse1, d1, s1 = TO._canfuse(sizet.(p₁), stridet.(p₁)) canfuse2, d2, s2 = TO._canfuse(sizet.(p₂), stridet.(p₂)) return canfuse1 && canfuse2 && s1 == 1 && (d2 == 1 || s2 == d1) @@ -170,7 +170,7 @@ function braid( t::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple; copy::Bool = false, backend::AbstractBackend = TO.DefaultBackend(), allocator = TO.DefaultAllocator() ) - length(levels) == numind(t) || throw(ArgumentError("invalid levels")) + length(levels) == numind(t) || throw(ArgumentError(lazy"length of levels should be $(numind(t)), got $(length(levels))")) BraidingStyle(sectortype(t)) isa SymmetricBraiding && return permute(t, p; copy, backend, allocator) (!copy && p == (codomainind(t), domainind(t))) && return t From 30ca26c1fa059258002ef658c215a3b9f6d27d59 Mon Sep 17 00:00:00 2001 From: lkdvos Date: Fri, 24 Apr 2026 09:20:53 -0400 Subject: [PATCH 05/23] add braid codepath for adjoint tensors --- src/tensors/indexmanipulations.jl | 11 ++++++++++- test/tensors/indexmanipulations.jl | 8 ++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index f852f156b..21af7ce93 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -179,7 +179,16 @@ function braid( tdst = similar(t, promote_braid(t), permute(space(t), p)) return @inbounds braid!(tdst, t, p, levels, One(), Zero(), backend, allocator) end -# TODO: braid for `AdjointTensorMap`; think about how to map the `levels` argument. +function braid( + t::AdjointTensorMap, (p₁, p₂)::Index2Tuple, levels::IndexTuple; + kwargs... + ) + p₁′ = adjointtensorindices(t, p₂) + p₂′ = adjointtensorindices(t, p₁) + perm = adjointtensorindices(adjoint(t), ntuple(identity, numind(t))) + levels′ = TupleTools.getindices(levels, perm) + return adjoint(braid(adjoint(t), (p₁′, p₂′), levels′; kwargs...)) +end # ---------------- # transpose(!) 
diff --git a/test/tensors/indexmanipulations.jl b/test/tensors/indexmanipulations.jl index c38b182a2..836418b3f 100644 --- a/test/tensors/indexmanipulations.jl +++ b/test/tensors/indexmanipulations.jl @@ -129,6 +129,14 @@ for V in spacelist @tensor tb[a, b] := flip(t1, (1, 3))[x, y, a, z] * flip(t2, (2, 4))[y, b, z, x] @test flip(ta, (1, 2)) ≈ tb end + hasbraiding && !symmetricbraiding && @timedtestset "Braid AdjointTensorMap: adjoint identity" begin + t = rand(ComplexF64, V1 ⊗ V2 ← V3) + p = ((2,), (1, 3)) + levels = (1, 3, 2) + t1 = copy(braid(t', p, levels)) + t2 = braid(copy(t'), p, levels) + @test t1 ≈ t2 + end end TensorKit.empty_globalcaches!() end From 820044842c95cbe08b0544ca137e43d0ac948d47 Mon Sep 17 00:00:00 2001 From: lkdvos Date: Fri, 24 Apr 2026 14:47:02 -0400 Subject: [PATCH 06/23] rework `add_transform` kernels for TensorMap to only take data vector --- src/tensors/indexmanipulations.jl | 114 ++++++++++++++++-------------- src/tensors/treetransformers.jl | 7 ++ 2 files changed, 67 insertions(+), 54 deletions(-) diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index 21af7ce93..bb46b1690 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -564,6 +564,34 @@ end return tdst end +@propagate_inbounds function add_transform!( + tdst::TensorMap, tsrc::TensorMap, p::Index2Tuple, transformer::TreeTransformer, + α::Number, β::Number, backend, allocator + ) + @boundscheck spacecheck_transform(permute, tdst, tsrc, p) + if p[1] === codomainind(tsrc) && p[2] === domainind(tsrc) + add!(tdst, tsrc, α, β) + return tdst + end + if use_threaded_transform(tdst, transformer) + add_kernel_threaded!(tdst.data, tsrc.data, p, transformer, α, β, backend, allocator) + else + add_kernel_nonthreaded!(tdst.data, tsrc.data, p, transformer, α, β, backend, allocator) + end + return tdst +end +@propagate_inbounds function add_transform!( + tdst::TensorMap, tsrc::TensorMap, p::Index2Tuple, 
::TrivialTreeTransformer, + α::Number, β::Number, backend, allocator + ) + @boundscheck spacecheck_transform(permute, tdst, tsrc, p) + if p[1] === codomainind(tsrc) && p[2] === domainind(tsrc) + add!(tdst, tsrc, α, β) + return tdst + end + TO.tensoradd!(tdst[], tsrc[], p, false, α, β, backend, allocator) + return tdst +end function use_threaded_transform(t::TensorMap, transformer) return get_num_transformer_threads() > 1 && length(t.data) > Strided.MINTHREADLENGTH @@ -572,13 +600,6 @@ function use_threaded_transform(t::AbstractTensorMap, transformer) return get_num_transformer_threads() > 1 && dim(space(t)) > Strided.MINTHREADLENGTH end -# Trivial implementations -# ----------------------- -function add_trivial_kernel!(tdst, tsrc, p, transformer, α, β, backend, allocator) - TO.tensoradd!(tdst[], tsrc[], p, false, α, β, backend, allocator) - return nothing -end - # Non-threaded implementations # ---------------------------- function add_kernel_nonthreaded!( @@ -590,11 +611,11 @@ function add_kernel_nonthreaded!( return nothing end function add_kernel_nonthreaded!( - ::UniqueFusion, tdst, tsrc, p, transformer::AbelianTreeTransformer, α, β, backend, - allocator + data_dst::DenseVector, data_src::DenseVector, p, transformer::AbelianTreeTransformer, + α, β, backend, allocator ) for subtransformer in transformer.data - _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend, allocator) + _add_transform_single!(data_dst, data_src, p, subtransformer, α, β, backend, allocator) end return nothing end @@ -615,29 +636,21 @@ function add_kernel_nonthreaded!( end # specialization in the case of TensorMap function add_kernel_nonthreaded!( - ::FusionStyle, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend, - allocator + data_dst::DenseVector, data_src::DenseVector, p, transformer::GenericTreeTransformer, + α, β, backend, allocator ) - # preallocate buffers - buffers = allocate_buffers(tdst, tsrc, transformer, allocator) + buffers = 
allocate_buffers(data_dst, data_src, transformer, allocator) for subtransformer in transformer.data # Special case without intermediate buffers whenever there is only a single block if length(subtransformer[1]) == 1 - _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend, allocator) + _add_transform_single!(data_dst, data_src, p, subtransformer, α, β, backend, allocator) else - _add_transform_multi!(tdst, tsrc, p, subtransformer, buffers, α, β, backend, allocator) + _add_transform_multi!(data_dst, data_src, p, subtransformer, buffers, α, β, backend, allocator) end end return nothing end -# ambiguity resolution -function add_kernel_nonthreaded!( - ::UniqueFusion, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend, - allocator - ) - throw(ArgumentError("Cannot combine `GenericTreeTransformer` with `UniqueFusion`")) -end # Threaded implementations # ------------------------ function add_kernel_threaded!( @@ -660,8 +673,8 @@ function add_kernel_threaded!( return nothing end function add_kernel_threaded!( - ::UniqueFusion, tdst, tsrc, p, transformer::AbelianTreeTransformer, α, β, backend, - allocator; ntasks::Int = get_num_transformer_threads() + data_dst::DenseVector, data_src::DenseVector, p, transformer::AbelianTreeTransformer, + α, β, backend, allocator; ntasks::Int = get_num_transformer_threads() ) nblocks = length(transformer.data) counter = Threads.Atomic{Int}(1) @@ -671,7 +684,7 @@ function add_kernel_threaded!( local_counter = Threads.atomic_add!(counter, 1) local_counter > nblocks && break @inbounds subtransformer = transformer.data[local_counter] - _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend, allocator) + _add_transform_single!(data_dst, data_src, p, subtransformer, α, β, backend, allocator) end end end @@ -708,8 +721,8 @@ function add_kernel_threaded!( end # specialization in the case of TensorMap function add_kernel_threaded!( - ::FusionStyle, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, 
backend, - allocator; ntasks::Int = get_num_transformer_threads() + data_dst::DenseVector, data_src::DenseVector, p, transformer::GenericTreeTransformer, + α, β, backend, allocator; ntasks::Int = get_num_transformer_threads() ) nblocks = length(transformer.data) @@ -717,16 +730,16 @@ function add_kernel_threaded!( Threads.@sync for _ in 1:min(ntasks, nblocks) Threads.@spawn begin # preallocate buffers for each task - buffers = allocate_buffers(tdst, tsrc, transformer, allocator) + buffers = allocate_buffers(data_dst, data_src, transformer, allocator) while true local_counter = Threads.atomic_add!(counter, 1) local_counter > nblocks && break @inbounds subtransformer = transformer.data[local_counter] if length(subtransformer[1]) == 1 - _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend, allocator) + _add_transform_single!(data_dst, data_src, p, subtransformer, α, β, backend, allocator) else - _add_transform_multi!(tdst, tsrc, p, subtransformer, buffers, α, β, backend, allocator) + _add_transform_multi!(data_dst, data_src, p, subtransformer, buffers, α, β, backend, allocator) end end end @@ -734,13 +747,6 @@ function add_kernel_threaded!( return nothing end -# ambiguity resolution -function add_kernel_threaded!( - ::UniqueFusion, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend, - allocator; ntasks::Int = get_num_transformer_threads() - ) - throw(ArgumentError("Cannot combine `GenericTreeTransformer` with `UniqueFusion`")) -end # Auxiliary methods @@ -763,22 +769,24 @@ function _add_transform_single!( return nothing end function _add_transform_single!( - tdst, tsrc, p, (coeff, struct_dst, struct_src)::AbelianTransformerData, + data_dst::DenseVector, data_src::DenseVector, p, + (coeff, struct_dst, struct_src)::AbelianTransformerData, α, β, backend, allocator ) - subblock_dst = StridedView(tdst.data, struct_dst...) - subblock_src = StridedView(tsrc.data, struct_src...) + subblock_dst = StridedView(data_dst, struct_dst...) 
+ subblock_src = StridedView(data_src, struct_src...) TO.tensoradd!(subblock_dst, subblock_src, p, false, α * coeff, β, backend, allocator) return nothing end function _add_transform_single!( - tdst, tsrc, p, (basistransform, structs_dst, structs_src)::GenericTransformerData, + data_dst::DenseVector, data_src::DenseVector, p, + (basistransform, structs_dst, structs_src)::GenericTransformerData, α, β, backend, allocator ) struct_dst = (structs_dst[1], only(structs_dst[2])...) struct_src = (structs_src[1], only(structs_src[2])...) coeff = only(basistransform) - _add_transform_single!(tdst, tsrc, p, (coeff, struct_dst, struct_src), α, β, backend, allocator) + _add_transform_single!(data_dst, data_src, p, (coeff, struct_dst, struct_src), α, β, backend, allocator) return nothing end @@ -817,33 +825,31 @@ function _add_transform_multi!( return nothing end function _add_transform_multi!( - tdst, tsrc, p, (U, (sz_dst, structs_dst), (sz_src, structs_src)), + data_dst::DenseVector, data_src::DenseVector, p, + (U, (sz_dst, structs_dst), (sz_src, structs_src))::GenericTransformerData, (buffer1, buffer2), α, β, backend, allocator ) rows, cols = size(U) blocksize = prod(sz_src) - matsize = ( - prod(TupleTools.getindices(sz_src, codomainind(tsrc))), - prod(TupleTools.getindices(sz_src, domainind(tsrc))), - ) # Filling up a buffer with contiguous data buffer_src = StridedView(buffer2, (blocksize, cols), (1, blocksize), 0) + ptriv = (ntuple(identity, length(sz_src)), ()) for (i, struct_src) in enumerate(structs_src) - subblock_src = sreshape(StridedView(tsrc.data, sz_src, struct_src...), matsize) - bufblock_src = sreshape(buffer_src[:, i], matsize) - copy!(bufblock_src, subblock_src) + subblock_src = StridedView(data_src, sz_src, struct_src...) 
+ bufblock_src = sreshape(buffer_src[:, i], sz_src) + TO.tensoradd!(bufblock_src, subblock_src, ptriv, false, One(), Zero(), backend, allocator) end # Resummation into a second buffer using BLAS buffer_dst = StridedView(buffer1, (blocksize, rows), (1, blocksize), 0) - mul!(buffer_dst, buffer_src, transpose(StridedView(U)), α, Zero()) + mul!(buffer_dst, buffer_src, transpose(StridedView(U))) # Filling up the output for (i, struct_dst) in enumerate(structs_dst) - subblock_dst = StridedView(tdst.data, sz_dst, struct_dst...) + subblock_dst = StridedView(data_dst, sz_dst, struct_dst...) bufblock_dst = sreshape(buffer_dst[:, i], sz_src) - TO.tensoradd!(subblock_dst, bufblock_dst, p, false, One(), β, backend, allocator) + TO.tensoradd!(subblock_dst, bufblock_dst, p, false, α, β, backend, allocator) end return nothing diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl index 8c63b00a8..82032b067 100644 --- a/src/tensors/treetransformers.jl +++ b/src/tensors/treetransformers.jl @@ -141,6 +141,13 @@ function allocate_buffers( sz = buffersize(transformer) return similar(tdst.data, sz), similar(tsrc.data, sz) end +function allocate_buffers( + data_dst::DenseVector, data_src::DenseVector, transformer::GenericTreeTransformer, + allocator=TO.DefaultAllocator() + ) + sz = buffersize(transformer) + return similar(data_dst, sz), similar(data_src, sz) +end function allocate_buffers( tdst::AbstractTensorMap, tsrc::AbstractTensorMap, transformer, allocator=TO.DefaultAllocator() From 383d9184ebd5972d5e7ecf95bd87453a3f3aa07b Mon Sep 17 00:00:00 2001 From: lkdvos Date: Fri, 24 Apr 2026 20:05:58 -0400 Subject: [PATCH 07/23] inline transform helpers into add_transform_kernel! 
Co-Authored-By: Claude Sonnet 4.6 --- src/tensors/indexmanipulations.jl | 334 ++++++------------------------ 1 file changed, 65 insertions(+), 269 deletions(-) diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index bb46b1690..1909e5d1f 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -551,47 +551,20 @@ end else I = sectortype(tdst) if I === Trivial - add_trivial_kernel!(tdst, tsrc, p, transformer, α, β, backend, allocator) + TO.tensoradd!(tdst[], tsrc[], p, false, α, β, backend, allocator) else - style = FusionStyle(I) - if use_threaded_transform(tdst, transformer) - add_kernel_threaded!(style, tdst, tsrc, p, transformer, α, β, backend, allocator) + ntasks = use_threaded_transform(tdst, transformer) ? get_num_transformer_threads() : 1 + scheduler = ntasks == 1 ? SerialScheduler() : DynamicScheduler(; ntasks, split = :roundrobin) + if tdst isa TensorMap && tsrc isa TensorMap # unpack data fields to avoid specializing + add_transform_kernel!(tdst.data, tsrc.data, p, transformer, α, β, backend, allocator, scheduler) else - add_kernel_nonthreaded!(style, tdst, tsrc, p, transformer, α, β, backend, allocator) + add_transform_kernel!(tdst, tsrc, p, transformer, α, β, backend, allocator, scheduler) end end end return tdst end -@propagate_inbounds function add_transform!( - tdst::TensorMap, tsrc::TensorMap, p::Index2Tuple, transformer::TreeTransformer, - α::Number, β::Number, backend, allocator - ) - @boundscheck spacecheck_transform(permute, tdst, tsrc, p) - if p[1] === codomainind(tsrc) && p[2] === domainind(tsrc) - add!(tdst, tsrc, α, β) - return tdst - end - if use_threaded_transform(tdst, transformer) - add_kernel_threaded!(tdst.data, tsrc.data, p, transformer, α, β, backend, allocator) - else - add_kernel_nonthreaded!(tdst.data, tsrc.data, p, transformer, α, β, backend, allocator) - end - return tdst -end -@propagate_inbounds function add_transform!( - tdst::TensorMap, tsrc::TensorMap, 
p::Index2Tuple, ::TrivialTreeTransformer, - α::Number, β::Number, backend, allocator - ) - @boundscheck spacecheck_transform(permute, tdst, tsrc, p) - if p[1] === codomainind(tsrc) && p[2] === domainind(tsrc) - add!(tdst, tsrc, α, β) - return tdst - end - TO.tensoradd!(tdst[], tsrc[], p, false, α, β, backend, allocator) - return tdst -end function use_threaded_transform(t::TensorMap, transformer) return get_num_transformer_threads() > 1 && length(t.data) > Strided.MINTHREADLENGTH @@ -600,257 +573,80 @@ function use_threaded_transform(t::AbstractTensorMap, transformer) return get_num_transformer_threads() > 1 && dim(space(t)) > Strided.MINTHREADLENGTH end -# Non-threaded implementations -# ---------------------------- -function add_kernel_nonthreaded!( - ::UniqueFusion, tdst, tsrc, p, transformer, α, β, backend, allocator +function add_transform_kernel!( + tdst, tsrc, p, transformer, α, β, backend, allocator, scheduler ) - for (f₁, f₂) in fusiontrees(tsrc) - _add_transform_single!(tdst, tsrc, p, (f₁, f₂), transformer, α, β, backend, allocator) - end - return nothing -end -function add_kernel_nonthreaded!( - data_dst::DenseVector, data_src::DenseVector, p, transformer::AbelianTreeTransformer, - α, β, backend, allocator - ) - for subtransformer in transformer.data - _add_transform_single!(data_dst, data_src, p, subtransformer, α, β, backend, allocator) - end - return nothing -end -function add_kernel_nonthreaded!( - ::FusionStyle, tdst, tsrc, p, transformer, α, β, backend, allocator - ) - # preallocate buffers - buffers = allocate_buffers(tdst, tsrc, transformer, allocator) - - for src in fusionblocks(tsrc) - if length(src) == 1 - _add_transform_single!(tdst, tsrc, p, src, transformer, α, β, backend, allocator) - else - _add_transform_multi!(tdst, tsrc, p, src, transformer, buffers, α, β, backend, allocator) + I = sectortype(tdst) + if FusionStyle(I) === UniqueFusion() + tforeach(fusiontrees(tsrc); scheduler) do (f₁, f₂) + (f₁′, f₂′), coeff = transformer((f₁, f₂)) + 
@inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, backend, allocator) end - end - return nothing -end -# specialization in the case of TensorMap -function add_kernel_nonthreaded!( - data_dst::DenseVector, data_src::DenseVector, p, transformer::GenericTreeTransformer, - α, β, backend, allocator - ) - buffers = allocate_buffers(data_dst, data_src, transformer, allocator) - - for subtransformer in transformer.data - # Special case without intermediate buffers whenever there is only a single block - if length(subtransformer[1]) == 1 - _add_transform_single!(data_dst, data_src, p, subtransformer, α, β, backend, allocator) - else - _add_transform_multi!(data_dst, data_src, p, subtransformer, buffers, α, β, backend, allocator) - end - end - return nothing -end -# Threaded implementations -# ------------------------ -function add_kernel_threaded!( - ::UniqueFusion, tdst, tsrc, p, transformer, α, β, backend, allocator; - ntasks::Int = get_num_transformer_threads() - ) - trees = fusiontrees(tsrc) - nblocks = length(trees) - counter = Threads.Atomic{Int}(1) - Threads.@sync for _ in 1:min(ntasks, nblocks) - Threads.@spawn begin - while true - local_counter = Threads.atomic_add!(counter, 1) - local_counter > nblocks && break - @inbounds (f₁, f₂) = trees[local_counter] - _add_transform_single!(tdst, tsrc, p, (f₁, f₂), transformer, α, β, backend, allocator) - end - end - end - return nothing -end -function add_kernel_threaded!( - data_dst::DenseVector, data_src::DenseVector, p, transformer::AbelianTreeTransformer, - α, β, backend, allocator; ntasks::Int = get_num_transformer_threads() - ) - nblocks = length(transformer.data) - counter = Threads.Atomic{Int}(1) - Threads.@sync for _ in 1:min(ntasks, nblocks) - Threads.@spawn begin - while true - local_counter = Threads.atomic_add!(counter, 1) - local_counter > nblocks && break - @inbounds subtransformer = transformer.data[local_counter] - _add_transform_single!(data_dst, data_src, p, subtransformer, α, 
β, backend, allocator) + else + tl_buffers = OhMyThreads.TaskLocalValue(() -> allocate_buffers(tdst, tsrc, transformer, allocator)) + tforeach(fusionblocks(tsrc); scheduler) do src + dst, U = transformer(src) + if length(src) == 1 + (f₁, f₂) = only(fusiontrees(src)) + (f₁′, f₂′) = only(fusiontrees(dst)) + @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * only(U), β, backend, allocator) + else + buffer1, buffer2 = tl_buffers[] + rows, cols = size(U) + sz_src = size(tsrc[first(fusiontrees(src))...]) + blocksize = prod(sz_src) + ptriv = (ntuple(identity, length(sz_src)), ()) + buffer_src = StridedView(buffer2, (blocksize, cols), (1, blocksize), 0) + for (i, (f₁, f₂)) in enumerate(fusiontrees(src)) + TO.tensoradd!(sreshape(buffer_src[:, i], sz_src), tsrc[f₁, f₂], ptriv, false, One(), Zero(), backend, allocator) + end + buffer_dst = StridedView(buffer1, (blocksize, rows), (1, blocksize), 0) + mul!(buffer_dst, buffer_src, transpose(StridedView(U))) + for (i, (f₃, f₄)) in enumerate(fusiontrees(dst)) + TO.tensoradd!(tdst[f₃, f₄], sreshape(buffer_dst[:, i], sz_src), p, false, α, β, backend, allocator) + end end end end return nothing end -function add_kernel_threaded!( - ::FusionStyle, tdst, tsrc, p, transformer, α, β, backend, allocator; - ntasks::Int = get_num_transformer_threads() +# specialization in the case of TensorMap +function add_transform_kernel!( + data_dst::DenseVector, data_src::DenseVector, p, transformer::AbelianTreeTransformer, + α, β, backend, allocator, scheduler ) - allblocks = fusionblocks(tsrc) - nblocks = length(allblocks) - - counter = Threads.Atomic{Int}(1) - Threads.@sync for _ in 1:min(ntasks, nblocks) - Threads.@spawn begin - # preallocate buffers for each task - buffers = allocate_buffers(tdst, tsrc, transformer, allocator) - - while true - local_counter = Threads.atomic_add!(counter, 1) - local_counter > nblocks && break - @inbounds src = allblocks[local_counter] - if length(src) == 1 - _add_transform_single!(tdst, tsrc, p, 
src, transformer, α, β, backend, allocator) - else - _add_transform_multi!(tdst, tsrc, p, src, transformer, buffers, α, β, backend, allocator) - end - end - end + tforeach(transformer.data; scheduler) do (coeff, struct_dst, struct_src) + TO.tensoradd!(StridedView(data_dst, struct_dst...), StridedView(data_src, struct_src...), p, false, α * coeff, β, backend, allocator) end - return nothing end -# specialization in the case of TensorMap -function add_kernel_threaded!( +function add_transform_kernel!( data_dst::DenseVector, data_src::DenseVector, p, transformer::GenericTreeTransformer, - α, β, backend, allocator; ntasks::Int = get_num_transformer_threads() - ) - nblocks = length(transformer.data) - - counter = Threads.Atomic{Int}(1) - Threads.@sync for _ in 1:min(ntasks, nblocks) - Threads.@spawn begin - # preallocate buffers for each task - buffers = allocate_buffers(data_dst, data_src, transformer, allocator) - - while true - local_counter = Threads.atomic_add!(counter, 1) - local_counter > nblocks && break - @inbounds subtransformer = transformer.data[local_counter] - if length(subtransformer[1]) == 1 - _add_transform_single!(data_dst, data_src, p, subtransformer, α, β, backend, allocator) - else - _add_transform_multi!(data_dst, data_src, p, subtransformer, buffers, α, β, backend, allocator) - end + α, β, backend, allocator, scheduler + ) + tl_buffers = OhMyThreads.TaskLocalValue(() -> allocate_buffers(data_dst, data_src, transformer, allocator)) + tforeach(transformer.data; scheduler) do (U, (sz_dst, structs_dst), (sz_src, structs_src)) + if length(U) == 1 + coeff = only(U) + TO.tensoradd!(StridedView(data_dst, sz_dst, only(structs_dst)...), + StridedView(data_src, sz_src, only(structs_src)...), + p, false, α * coeff, β, backend, allocator) + else + buffer1, buffer2 = tl_buffers[] + rows, cols = size(U) + blocksize = prod(sz_src) + ptriv = (ntuple(identity, length(sz_src)), ()) + buffer_src = StridedView(buffer2, (blocksize, cols), (1, blocksize), 0) + for (i, 
struct_src_i) in enumerate(structs_src) + TO.tensoradd!(sreshape(buffer_src[:, i], sz_src), StridedView(data_src, sz_src, struct_src_i...), ptriv, false, One(), Zero(), backend, allocator) + end + buffer_dst = StridedView(buffer1, (blocksize, rows), (1, blocksize), 0) + mul!(buffer_dst, buffer_src, transpose(StridedView(U))) + for (i, struct_dst_i) in enumerate(structs_dst) + TO.tensoradd!(StridedView(data_dst, sz_dst, struct_dst_i...), sreshape(buffer_dst[:, i], sz_src), p, false, α, β, backend, allocator) end end end - - return nothing -end - - -# Auxiliary methods -# ----------------- -function _add_transform_single!( - tdst, tsrc, p, (f₁, f₂)::FusionTreePair, transformer, α, β, backend, allocator - ) - (f₁′, f₂′), coeff = transformer((f₁, f₂)) - @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, backend, allocator) - return nothing -end -function _add_transform_single!( - tdst, tsrc, p, src::FusionTreeBlock, transformer, α, β, backend, allocator - ) - dst, U = transformer(src) - f₁, f₂ = only(fusiontrees(src)) - f₁′, f₂′ = only(fusiontrees(dst)) - coeff = only(U) - @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, backend, allocator) - return nothing -end -function _add_transform_single!( - data_dst::DenseVector, data_src::DenseVector, p, - (coeff, struct_dst, struct_src)::AbelianTransformerData, - α, β, backend, allocator - ) - subblock_dst = StridedView(data_dst, struct_dst...) - subblock_src = StridedView(data_src, struct_src...) - TO.tensoradd!(subblock_dst, subblock_src, p, false, α * coeff, β, backend, allocator) - return nothing -end -function _add_transform_single!( - data_dst::DenseVector, data_src::DenseVector, p, - (basistransform, structs_dst, structs_src)::GenericTransformerData, - α, β, backend, allocator - ) - struct_dst = (structs_dst[1], only(structs_dst[2])...) - struct_src = (structs_src[1], only(structs_src[2])...) 
- coeff = only(basistransform) - _add_transform_single!(data_dst, data_src, p, (coeff, struct_dst, struct_src), α, β, backend, allocator) - return nothing -end - -function _add_transform_multi!( - tdst, tsrc, p, src::FusionTreeBlock, transformer, (buffer1, buffer2), α, β, backend, - allocator - ) - dst, U = transformer(src) - rows, cols = size(U) - sz_src = size(tsrc[first(fusiontrees(src))...]) - blocksize = prod(sz_src) - matsize = ( - prod(TupleTools.getindices(sz_src, codomainind(tsrc))), - prod(TupleTools.getindices(sz_src, domainind(tsrc))), - ) - - # Filling up a buffer with contiguous data - buffer_src = StridedView(buffer2, (blocksize, cols), (1, blocksize), 0) - for (i, (f₁, f₂)) in enumerate(fusiontrees(src)) - subblock_src = sreshape(tsrc[f₁, f₂], matsize) - bufblock_src = sreshape(buffer_src[:, i], matsize) - copy!(bufblock_src, subblock_src) - end - - # Resummation into a second buffer using BLAS - buffer_dst = StridedView(buffer1, (blocksize, rows), (1, blocksize), 0) - mul!(buffer_dst, buffer_src, transpose(StridedView(U)), α, Zero()) - - # Filling up the output - for (i, (f₃, f₄)) in enumerate(fusiontrees(dst)) - subblock_dst = tdst[f₃, f₄] - bufblock_dst = sreshape(buffer_dst[:, i], sz_src) - TO.tensoradd!(subblock_dst, bufblock_dst, p, false, One(), β, backend, allocator) - end - - return nothing -end -function _add_transform_multi!( - data_dst::DenseVector, data_src::DenseVector, p, - (U, (sz_dst, structs_dst), (sz_src, structs_src))::GenericTransformerData, - (buffer1, buffer2), α, β, backend, allocator - ) - rows, cols = size(U) - blocksize = prod(sz_src) - - # Filling up a buffer with contiguous data - buffer_src = StridedView(buffer2, (blocksize, cols), (1, blocksize), 0) - ptriv = (ntuple(identity, length(sz_src)), ()) - for (i, struct_src) in enumerate(structs_src) - subblock_src = StridedView(data_src, sz_src, struct_src...) 
- bufblock_src = sreshape(buffer_src[:, i], sz_src) - TO.tensoradd!(bufblock_src, subblock_src, ptriv, false, One(), Zero(), backend, allocator) - end - - # Resummation into a second buffer using BLAS - buffer_dst = StridedView(buffer1, (blocksize, rows), (1, blocksize), 0) - mul!(buffer_dst, buffer_src, transpose(StridedView(U))) - - # Filling up the output - for (i, struct_dst) in enumerate(structs_dst) - subblock_dst = StridedView(data_dst, sz_dst, struct_dst...) - bufblock_dst = sreshape(buffer_dst[:, i], sz_src) - TO.tensoradd!(subblock_dst, bufblock_dst, p, false, α, β, backend, allocator) - end - return nothing end From 9155b28524560966969d3dfaf0ce8f2726c86a22 Mon Sep 17 00:00:00 2001 From: lkdvos Date: Sat, 25 Apr 2026 21:15:23 -0400 Subject: [PATCH 08/23] make deprecations simpler --- src/tensors/indexmanipulations.jl | 61 +++++++++---------------------- 1 file changed, 17 insertions(+), 44 deletions(-) diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index 1909e5d1f..91ded3140 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -498,47 +498,18 @@ end # Deprecated add_*! wrappers # -------------------------- -""" - add_permute!(tdst, tsrc, (p₁, p₂)::Index2Tuple, α::Number, β::Number[, backend]) - -!!! warning "Deprecated" - `add_permute!` is deprecated. Use `permute!(tdst, tsrc, p, α, β[, backend])` instead. -""" -function add_permute!( - tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, - α::Number, β::Number, backend::AbstractBackend... - ) - Base.depwarn("`add_permute!` is deprecated, use `permute!` instead", :add_permute!) - return @inbounds permute!(tdst, tsrc, p, α, β, backend...) -end - -""" - add_braid!(tdst, tsrc, (p₁, p₂)::Index2Tuple, levels::IndexTuple, α::Number, β::Number[, backend]) - -!!! warning "Deprecated" - `add_braid!` is deprecated. Use `braid!(tdst, tsrc, p, levels, α, β[, backend])` instead. 
-""" -function add_braid!( - tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple, - α::Number, β::Number, backend::AbstractBackend... - ) - Base.depwarn("`add_braid!` is deprecated, use `braid!` instead", :add_braid!) - return @inbounds braid!(tdst, tsrc, p, levels, α, β, backend...) -end - -""" - add_transpose!(tdst, tsrc, (p₁, p₂)::Index2Tuple, α::Number, β::Number[, backend]) - -!!! warning "Deprecated" - `add_transpose!` is deprecated. Use `transpose!(tdst, tsrc, p, α, β[, backend])` instead. -""" -function add_transpose!( - tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, - α::Number, β::Number, backend::AbstractBackend... - ) - Base.depwarn("`add_transpose!` is deprecated, use `transpose!` instead", :add_transpose!) - return @inbounds transpose!(tdst, tsrc, p, α, β, backend...) -end +Base.@deprecate( + add_permute!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, α::Number, β::Number, backend::AbstractBackend...), + permute!(tdst, tsrc, p, α, β, backend...) +) +Base.@deprecate( + add_braid!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple, α::Number, β::Number, backend::AbstractBackend...), + braid!(tdst, tsrc, p, levels, α, β, backend...) +) +Base.@deprecate( + add_transpose!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, α::Number, β::Number, backend::AbstractBackend...), + transpose!(tdst, tsrc, p, α, β, backend...) 
+) @propagate_inbounds function add_transform!( tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, transformer, @@ -629,9 +600,11 @@ function add_transform_kernel!( tforeach(transformer.data; scheduler) do (U, (sz_dst, structs_dst), (sz_src, structs_src)) if length(U) == 1 coeff = only(U) - TO.tensoradd!(StridedView(data_dst, sz_dst, only(structs_dst)...), - StridedView(data_src, sz_src, only(structs_src)...), - p, false, α * coeff, β, backend, allocator) + TO.tensoradd!( + StridedView(data_dst, sz_dst, only(structs_dst)...), + StridedView(data_src, sz_src, only(structs_src)...), + p, false, α * coeff, β, backend, allocator + ) else buffer1, buffer2 = tl_buffers[] rows, cols = size(U) From bbffece247ae7dd35e529eedb38681a6822b499c Mon Sep 17 00:00:00 2001 From: lkdvos Date: Sat, 25 Apr 2026 21:25:24 -0400 Subject: [PATCH 09/23] refactor and explain --- src/tensors/indexmanipulations.jl | 85 ++++++++++++++++++++++++++----- 1 file changed, 73 insertions(+), 12 deletions(-) diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index 91ded3140..4170f5fab 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -515,8 +515,14 @@ Base.@deprecate( tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, transformer, α::Number, β::Number, backend, allocator ) + # `permute` is used as a stand-in for all index rearrangements here: permute, braid, and + # transpose all produce the same destination space for a given permutation tuple `p`. @boundscheck spacecheck_transform(permute, tdst, tsrc, p) + # Three cases, from cheapest to most expensive: + # 1. trivial permutation: delegate to `add!` which handles α/β scaling directly + # 2. Trivial sector type: no fusion tree bookkeeping, call tensoradd! on the raw array + # 3. 
general case: iterate over (blocks of) fusion trees, potentially multi-threaded if p[1] === codomainind(tsrc) && p[2] === domainind(tsrc) add!(tdst, tsrc, α, β) else @@ -549,32 +555,57 @@ function add_transform_kernel!( ) I = sectortype(tdst) if FusionStyle(I) === UniqueFusion() + # Abelian / unique-fusion: each source fusion tree pair (f₁, f₂) maps to exactly + # one destination pair (f₁′, f₂′) with a scalar coefficient. No mixing occurs. tforeach(fusiontrees(tsrc); scheduler) do (f₁, f₂) (f₁′, f₂′), coeff = transformer((f₁, f₂)) - @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, backend, allocator) + @inbounds TO.tensoradd!( + tdst[f₁′, f₂′], tsrc[f₁, f₂], + p, false, α * coeff, β, backend, allocator + ) end else + # Non-Abelian fusion: trees sharing the same set of uncoupled (external) sectors + # form a *fusion block* and mix under the transformation via a recoupling matrix U + # (rows = destination trees, columns = source trees). We iterate over blocks. tl_buffers = OhMyThreads.TaskLocalValue(() -> allocate_buffers(tdst, tsrc, transformer, allocator)) tforeach(fusionblocks(tsrc); scheduler) do src dst, U = transformer(src) if length(src) == 1 + # Degenerate block: single tree, U is a 1×1 scalar — skip the buffer + matmul. (f₁, f₂) = only(fusiontrees(src)) (f₁′, f₂′) = only(fusiontrees(dst)) - @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * only(U), β, backend, allocator) + @inbounds TO.tensoradd!( + tdst[f₁′, f₂′], tsrc[f₁, f₂], + p, false, α * only(U), β, backend, allocator + ) else + # Multi-tree block: apply recoupling via a three-step pack → matmul → unpack. + # 1. Extract: flatten each source block into a column of buffer_src + # (shape blocksize × cols), using a trivial permutation so that the + # index layout is canonical before the matmul. + # 2. Recoupling: buffer_dst = buffer_src * U^T (blocksize × rows) + # 3. 
Insert: scatter columns of buffer_dst to destination blocks, + # applying the actual permutation p in the same step. buffer1, buffer2 = tl_buffers[] rows, cols = size(U) sz_src = size(tsrc[first(fusiontrees(src))...]) blocksize = prod(sz_src) ptriv = (ntuple(identity, length(sz_src)), ()) buffer_src = StridedView(buffer2, (blocksize, cols), (1, blocksize), 0) - for (i, (f₁, f₂)) in enumerate(fusiontrees(src)) - TO.tensoradd!(sreshape(buffer_src[:, i], sz_src), tsrc[f₁, f₂], ptriv, false, One(), Zero(), backend, allocator) + @inbounds for (i, (f₁, f₂)) in enumerate(fusiontrees(src)) + TO.tensoradd!( + sreshape(buffer_src[:, i], sz_src), tsrc[f₁, f₂], + ptriv, false, One(), Zero(), backend, allocator + ) end buffer_dst = StridedView(buffer1, (blocksize, rows), (1, blocksize), 0) mul!(buffer_dst, buffer_src, transpose(StridedView(U))) - for (i, (f₃, f₄)) in enumerate(fusiontrees(dst)) - TO.tensoradd!(tdst[f₃, f₄], sreshape(buffer_dst[:, i], sz_src), p, false, α, β, backend, allocator) + @inbounds for (i, (f₃, f₄)) in enumerate(fusiontrees(dst)) + TO.tensoradd!( + tdst[f₃, f₄], sreshape(buffer_dst[:, i], sz_src), + p, false, α, β, backend, allocator + ) end end end @@ -582,13 +613,21 @@ function add_transform_kernel!( return nothing end -# specialization in the case of TensorMap +# TensorMap specializations: operate directly on the flat data vector to avoid +# repeated dictionary lookups into t.data. The transformer has precomputed all +# StridedView descriptors (size, offset, strides) for each fusion tree block. +# No symmetry types left -- no repeated specialization needed function add_transform_kernel!( data_dst::DenseVector, data_src::DenseVector, p, transformer::AbelianTreeTransformer, α, β, backend, allocator, scheduler ) + # Each entry is (coeff, struct_dst, struct_src) where struct_{dst,src} = (size, offset, strides) + # locating the block for one fusion tree pair inside the flat data vector. 
tforeach(transformer.data; scheduler) do (coeff, struct_dst, struct_src) - TO.tensoradd!(StridedView(data_dst, struct_dst...), StridedView(data_src, struct_src...), p, false, α * coeff, β, backend, allocator) + TO.tensoradd!( + StridedView(data_dst, struct_dst...), StridedView(data_src, struct_src...), + p, false, α * coeff, β, backend, allocator + ) end return nothing end @@ -596,9 +635,14 @@ function add_transform_kernel!( data_dst::DenseVector, data_src::DenseVector, p, transformer::GenericTreeTransformer, α, β, backend, allocator, scheduler ) + # Each entry covers one fusion block: + # U — recoupling matrix (rows = dst trees, cols = src trees) + # sz_{dst,src} — array shape of each block (same for all trees in the block) + # structs_{dst,src}[i] — (offset, strides) into the flat data vector for tree i tl_buffers = OhMyThreads.TaskLocalValue(() -> allocate_buffers(data_dst, data_src, transformer, allocator)) tforeach(transformer.data; scheduler) do (U, (sz_dst, structs_dst), (sz_src, structs_src)) if length(U) == 1 + # Degenerate block with a single tree: no matmul needed. coeff = only(U) TO.tensoradd!( StridedView(data_dst, sz_dst, only(structs_dst)...), @@ -606,18 +650,35 @@ function add_transform_kernel!( p, false, α * coeff, β, backend, allocator ) else + # Multi-tree block: pack → recoupling matmul → unpack. + # buffer2 = source staging area, buffer1 = destination staging area. buffer1, buffer2 = tl_buffers[] rows, cols = size(U) blocksize = prod(sz_src) ptriv = (ntuple(identity, length(sz_src)), ()) + + # 1. Extract: copy each source block into column i of buffer_src as a flat vector, + # using a trivial permutation so the layout is canonical before the matmul. 
buffer_src = StridedView(buffer2, (blocksize, cols), (1, blocksize), 0) - for (i, struct_src_i) in enumerate(structs_src) - TO.tensoradd!(sreshape(buffer_src[:, i], sz_src), StridedView(data_src, sz_src, struct_src_i...), ptriv, false, One(), Zero(), backend, allocator) + @inbounds for (i, struct_src_i) in enumerate(structs_src) + TO.tensoradd!( + sreshape(buffer_src[:, i], sz_src), StridedView(data_src, sz_src, struct_src_i...), + ptriv, false, One(), Zero(), backend, allocator + ) end + + # 2. Recoupling: buffer_dst = buffer_src * U^T (each output tree is a linear + # combination of input trees weighted by the recoupling coefficients). buffer_dst = StridedView(buffer1, (blocksize, rows), (1, blocksize), 0) mul!(buffer_dst, buffer_src, transpose(StridedView(U))) - for (i, struct_dst_i) in enumerate(structs_dst) - TO.tensoradd!(StridedView(data_dst, sz_dst, struct_dst_i...), sreshape(buffer_dst[:, i], sz_src), p, false, α, β, backend, allocator) + + # 3. Insert: scatter column i of buffer_dst into the destination, applying the + # actual index permutation p in the same tensoradd! call. 
+ @inbounds for (i, struct_dst_i) in enumerate(structs_dst) + TO.tensoradd!( + StridedView(data_dst, sz_dst, struct_dst_i...), sreshape(buffer_dst[:, i], sz_src), + p, false, α, β, backend, allocator + ) end end end From f7cd89a1e08d50d92e466f19a4164f92abf1463e Mon Sep 17 00:00:00 2001 From: lkdvos Date: Sat, 25 Apr 2026 21:37:03 -0400 Subject: [PATCH 10/23] use updated scalar types --- src/tensors/indexmanipulations.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index 4170f5fab..10bc331e8 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -15,11 +15,8 @@ for (operation, manipulation) in ( $promote_op(::Type{T}) where {T <: AbstractTensorMap} = $promote_op(scalartype(T), sectortype(T)) $promote_op(::Type{T}, ::Type{I}) where {T <: Number, I <: Sector} = - sectorscalartype(I) <: Integer ? T : - sectorscalartype(I) <: Real ? float(T) : complex(T) - # TODO: currently the manipulations all use sectorscalartype, change to: - # $manipulation_scalartype(I) <: Integer ? T : - # $manipulation_scalartype(I) <: Real ? float(T) : complex(T) + $manipulation_scalartype(I) <: Integer ? T : + $manipulation_scalartype(I) <: Real ? float(T) : complex(T) end end @@ -369,7 +366,10 @@ If `copy = false`, `tdst` might share data with `tsrc` whenever possible. Otherw See [`twist!`](@ref) for storing the result in place. 
""" function twist(t::AbstractTensorMap, inds; inv::Bool = false, copy::Bool = false) - !copy && has_shared_twist(t, inds) && return t + if has_shared_twist(t, inds) + copy || return t + return copy!(similar(t), t) + end tdst = similar(t, promote_twist(t)) copy!(tdst, t) return twist!(tdst, inds; inv) From 27844a09950311b326777331c53a9a8447eedf86 Mon Sep 17 00:00:00 2001 From: lkdvos Date: Sat, 25 Apr 2026 21:56:40 -0400 Subject: [PATCH 11/23] reorganization --- src/tensors/indexmanipulations.jl | 374 ++++++++++++++++-------------- 1 file changed, 194 insertions(+), 180 deletions(-) diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index 10bc331e8..bfeeb6f82 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -1,25 +1,10 @@ -# Index manipulations -#--------------------- - -# find the scalartype after applying operations: take into account fusion and/or braiding -# might need to become Float or Complex to capture complex recoupling coefficients but don't alter precision -for (operation, manipulation) in ( - :flip => :sector, :twist => :braiding, - :transpose => :fusion, :permute => :sector, :braid => :sector, - ) - promote_op = Symbol(:promote_, operation) - manipulation_scalartype = Symbol(manipulation, :scalartype) - - @eval begin - $promote_op(t::AbstractTensorMap) = $promote_op(typeof(t)) - $promote_op(::Type{T}) where {T <: AbstractTensorMap} = - $promote_op(scalartype(T), sectortype(T)) - $promote_op(::Type{T}, ::Type{I}) where {T <: Number, I <: Sector} = - $manipulation_scalartype(I) <: Integer ? T : - $manipulation_scalartype(I) <: Real ? float(T) : complex(T) - end -end +# ============= +# Reweighting +# ============= +# ------ +# flip +# ------ """ flip(t::AbstractTensorMap, I) -> t′::AbstractTensorMap @@ -43,6 +28,172 @@ function flip(t::AbstractTensorMap, I; inv::Bool = false) return t′ end +# --------- +# twist(!) 
+# ---------
+function has_shared_twist(t, inds)
+    I = sectortype(t)
+    if BraidingStyle(I) == NoBraiding()
+        for i in inds
+            cs = sectors(space(t, i))
+            all(isunit, cs) || throw(SectorMismatch(lazy"Cannot twist sectors $cs"))
+        end
+        return true
+    elseif BraidingStyle(I) == Bosonic()
+        return true
+    else
+        for i in inds
+            cs = sectors(space(t, i))
+            all(isone ∘ twist, cs) || return false
+        end
+        return true
+    end
+end
+
+"""
+    twist!(t::AbstractTensorMap, i::Int; inv::Bool=false) -> t
+    twist!(t::AbstractTensorMap, inds; inv::Bool=false) -> t
+
+Apply a twist to the `i`th index of `t`, or all indices in `inds`, storing the result in `t`.
+If `inv=true`, use the inverse twist.
+
+See [`twist`](@ref) for creating a new tensor.
+"""
+function twist!(t::AbstractTensorMap, inds; inv::Bool = false)
+    if !all(in(allind(t)), inds)
+        msg = "Can't twist indices $inds of a tensor with only $(numind(t)) indices."
+        throw(ArgumentError(msg))
+    end
+    (scalartype(t) <: Real && !(sectorscalartype(sectortype(t)) <: Real)) &&
+        throw(ArgumentError("Can't in-place twist a real tensor with complex sector type"))
+    has_shared_twist(t, inds) && return t
+
+    N₁ = numout(t)
+    for (f₁, f₂) in fusiontrees(t)
+        θ = prod(i -> i <= N₁ ? twist(f₁.uncoupled[i]) : twist(f₂.uncoupled[i - N₁]), inds)
+        inv && (θ = θ')
+        scale!(t[f₁, f₂], θ)
+    end
+    return t
+end
+
+"""
+    twist(tsrc::AbstractTensorMap, i::Int; inv::Bool = false, copy::Bool = false) -> tdst
+    twist(tsrc::AbstractTensorMap, inds; inv::Bool = false, copy::Bool = false) -> tdst
+
+Apply a twist to the `i`th index of `tsrc` and return the result as a new tensor.
+If `inv = true`, use the inverse twist.
+If `copy = false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made.
+
+See [`twist!`](@ref) for storing the result in place. 
+""" +function twist(t::AbstractTensorMap, inds; inv::Bool = false, copy::Bool = false) + if has_shared_twist(t, inds) + copy || return t + return copy!(similar(t), t) + end + tdst = similar(t, promote_twist(t)) + copy!(tdst, t) + return twist!(tdst, inds; inv) +end + +# ========================= +# Space insertion/removal +# ========================= + +# Methods which change the number of indices, implement using `Val(i)` for type inference +""" + insertleftunit(tsrc::AbstractTensorMap, i=numind(t) + 1; + conj=false, dual=false, copy=false) -> tdst + +Insert a trivial vector space, isomorphic to the underlying field, at position `i`, +which can be specified as an `Int` or as `Val(i)` for improved type stability. +More specifically, adds a left monoidal unit or its dual. + +If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made. + +See also [`insertrightunit`](@ref insertrightunit(::AbstractTensorMap, ::Val{i}) where {i}), +[`removeunit`](@ref removeunit(::AbstractTensorMap, ::Val{i}) where {i}). +""" +function insertleftunit( + t::AbstractTensorMap, ::Val{i} = Val(numind(t) + 1); + copy::Bool = false, conj::Bool = false, dual::Bool = false + ) where {i} + W = insertleftunit(space(t), Val(i); conj, dual) + if t isa TensorMap + return TensorMap{scalartype(t)}(copy ? Base.copy(t.data) : t.data, W) + else + tdst = similar(t, W) + for (c, b) in blocks(t) + copy!(block(tdst, c), b) + end + return tdst + end +end + +""" + insertrightunit(tsrc::AbstractTensorMap, i=numind(t); + conj=false, dual=false, copy=false) -> tdst + +Insert a trivial vector space, isomorphic to the underlying field, after position `i`, +which can be specified as an `Int` or as `Val(i)` for improved type stability. +More specifically, adds a right monoidal unit or its dual. + +If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made. 
+ +See also [`insertleftunit`](@ref insertleftunit(::AbstractTensorMap, ::Val{i}) where {i}), +[`removeunit`](@ref removeunit(::AbstractTensorMap, ::Val{i}) where {i}). +""" +function insertrightunit( + t::AbstractTensorMap, ::Val{i} = Val(numind(t)); + copy::Bool = false, conj::Bool = false, dual::Bool = false + ) where {i} + W = insertrightunit(space(t), Val(i); conj, dual) + if t isa TensorMap + return TensorMap{scalartype(t)}(copy ? Base.copy(t.data) : t.data, W) + else + tdst = similar(t, W) + for (c, b) in blocks(t) + copy!(block(tdst, c), b) + end + return tdst + end +end + +""" + removeunit(tsrc::AbstractTensorMap, i; copy=false) -> tdst + +This removes a trivial tensor product factor at position `1 ≤ i ≤ N`, where `i` +can be specified as an `Int` or as `Val(i)` for improved type stability. +For this to work, that factor has to be isomorphic to the field of scalars. + +If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made. + +This operation undoes the work of [`insertleftunit`](@ref insertleftunit(::AbstractTensorMap, ::Val{i}) where {i}) +and [`insertrightunit`](@ref insertrightunit(::AbstractTensorMap, ::Val{i}) where {i}). +""" +function removeunit(t::AbstractTensorMap, ::Val{i}; copy::Bool = false) where {i} + W = removeunit(space(t), Val(i)) + if t isa TensorMap + return TensorMap{scalartype(t)}(copy ? Base.copy(t.data) : t.data, W) + else + tdst = similar(t, W) + for (c, b) in blocks(t) + copy!(block(tdst, c), b) + end + return tdst + end +end + +# TODO: fusion/splitting of indices + +# ============================ +# Index rearrangements +# ============================ + # -------------- # permute(!) # -------------- @@ -61,8 +212,8 @@ See also [`permute`](@ref) for creating a new tensor. 
backend::AbstractBackend = TO.DefaultBackend(), allocator = TO.DefaultAllocator() ) @boundscheck spacecheck_transform(permute, tdst, tsrc, p) - transformer = treepermuter(tdst, tsrc, p) - return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend, allocator) + levels = ntuple(identity, numind(tsrc)) + return @inbounds braid!(tdst, tsrc, p, levels, α, β, backend, allocator) end """ @@ -92,7 +243,8 @@ function permute( # general case tdst = similar(t, promote_permute(t), permute(space(t), p)) - return @inbounds permute!(tdst, t, p, One(), Zero(), backend, allocator) + levels = ntuple(identity, numind(t)) + return @inbounds braid!(tdst, t, p, levels, One(), Zero(), backend, allocator) end function permute(t::AdjointTensorMap, (p₁, p₂)::Index2Tuple; kwargs...) p₁′ = adjointtensorindices(t, p₂) @@ -169,7 +321,6 @@ function braid( ) length(levels) == numind(t) || throw(ArgumentError(lazy"length of levels should be $(numind(t)), got $(length(levels))")) - BraidingStyle(sectortype(t)) isa SymmetricBraiding && return permute(t, p; copy, backend, allocator) (!copy && p == (codomainind(t), domainind(t))) && return t # general case @@ -305,166 +456,29 @@ See also [`repartition!`](@ref) for writing into an existing destination. return transpose(t, (p₁, p₂); copy, backend, allocator) end -# Twist -function has_shared_twist(t, inds) - I = sectortype(t) - if BraidingStyle(I) == NoBraiding() - for i in inds - cs = sectors(space(t, i)) - all(isunit, cs) || throw(SectorMismatch(lazy"Cannot twist sectors $cs")) - end - return true - elseif BraidingStyle(I) == Bosonic() - return true - else - for i in inds - cs = sectors(space(t, i)) - all(isone ∘ twist, cs) || return false - end - return true - end -end - -""" - twist!(t::AbstractTensorMap, i::Int; inv::Bool=false) -> t - twist!(t::AbstractTensorMap, inds; inv::Bool=false) -> t - -Apply a twist to the `i`th index of `t`, or all indices in `inds`, storing the result in `t`. -If `inv=true`, use the inverse twist. 
- -See [`twist`](@ref) for creating a new tensor. -""" -function twist!(t::AbstractTensorMap, inds; inv::Bool = false) - if !all(in(allind(t)), inds) - msg = "Can't twist indices $inds of a tensor with only $(numind(t)) indices." - throw(ArgumentError(msg)) - end - (scalartype(t) <: Real && !(sectorscalartype(sectortype(t)) <: Real)) && - throw(ArgumentError("Can't in-place twist a real tensor with complex sector type")) - has_shared_twist(t, inds) && return t - - (scalartype(t) <: Real && !(sectorscalartype(sectortype(t)) <: Real)) && - throw(ArgumentError("No in-place `twist!` for a real tensor with complex sector type")) - - N₁ = numout(t) - for (f₁, f₂) in fusiontrees(t) - θ = prod(i -> i <= N₁ ? twist(f₁.uncoupled[i]) : twist(f₂.uncoupled[i - N₁]), inds) - inv && (θ = θ') - scale!(t[f₁, f₂], θ) - end - return t -end - -""" - twist(tsrc::AbstractTensorMap, i::Int; inv::Bool = false, copy::Bool = false) -> tdst - twist(tsrc::AbstractTensorMap, inds; inv::Bool = false, copy::Bool = false) -> tdst - -Apply a twist to the `i`th index of `tsrc` and return the result as a new tensor. -If `inv = true`, use the inverse twist. -If `copy = false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made. - -See [`twist!`](@ref) for storing the result in place. -""" -function twist(t::AbstractTensorMap, inds; inv::Bool = false, copy::Bool = false) - if has_shared_twist(t, inds) - copy || return t - return copy!(similar(t), t) - end - tdst = similar(t, promote_twist(t)) - copy!(tdst, t) - return twist!(tdst, inds; inv) -end - -# Methods which change the number of indices, implement using `Val(i)` for type inference -""" - insertleftunit(tsrc::AbstractTensorMap, i=numind(t) + 1; - conj=false, dual=false, copy=false) -> tdst - -Insert a trivial vector space, isomorphic to the underlying field, at position `i`, -which can be specified as an `Int` or as `Val(i)` for improved type stability. 
-More specifically, adds a left monoidal unit or its dual. - -If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made. - -See also [`insertrightunit`](@ref insertrightunit(::AbstractTensorMap, ::Val{i}) where {i}), -[`removeunit`](@ref removeunit(::AbstractTensorMap, ::Val{i}) where {i}). -""" -function insertleftunit( - t::AbstractTensorMap, ::Val{i} = Val(numind(t) + 1); - copy::Bool = false, conj::Bool = false, dual::Bool = false - ) where {i} - W = insertleftunit(space(t), Val(i); conj, dual) - if t isa TensorMap - return TensorMap{scalartype(t)}(copy ? Base.copy(t.data) : t.data, W) - else - tdst = similar(t, W) - for (c, b) in blocks(t) - copy!(block(tdst, c), b) - end - return tdst - end -end - -""" - insertrightunit(tsrc::AbstractTensorMap, i=numind(t); - conj=false, dual=false, copy=false) -> tdst - -Insert a trivial vector space, isomorphic to the underlying field, after position `i`, -which can be specified as an `Int` or as `Val(i)` for improved type stability. -More specifically, adds a right monoidal unit or its dual. - -If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made. - -See also [`insertleftunit`](@ref insertleftunit(::AbstractTensorMap, ::Val{i}) where {i}), -[`removeunit`](@ref removeunit(::AbstractTensorMap, ::Val{i}) where {i}). -""" -function insertrightunit( - t::AbstractTensorMap, ::Val{i} = Val(numind(t)); - copy::Bool = false, conj::Bool = false, dual::Bool = false - ) where {i} - W = insertrightunit(space(t), Val(i); conj, dual) - if t isa TensorMap - return TensorMap{scalartype(t)}(copy ? 
Base.copy(t.data) : t.data, W) - else - tdst = similar(t, W) - for (c, b) in blocks(t) - copy!(block(tdst, c), b) - end - return tdst - end -end - -""" - removeunit(tsrc::AbstractTensorMap, i; copy=false) -> tdst - -This removes a trivial tensor product factor at position `1 ≤ i ≤ N`, where `i` -can be specified as an `Int` or as `Val(i)` for improved type stability. -For this to work, that factor has to be isomorphic to the field of scalars. +#------------------------------------- +# Internal implementations +#------------------------------------- -If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made. +# find the scalartype after applying operations: take into account fusion and/or braiding +# might need to become Float or Complex to capture complex recoupling coefficients but don't alter precision +for (operation, manipulation) in ( + :flip => :sector, :twist => :braiding, + :transpose => :fusion, :permute => :sector, :braid => :sector, + ) + promote_op = Symbol(:promote_, operation) + manipulation_scalartype = Symbol(manipulation, :scalartype) -This operation undoes the work of [`insertleftunit`](@ref insertleftunit(::AbstractTensorMap, ::Val{i}) where {i}) -and [`insertrightunit`](@ref insertrightunit(::AbstractTensorMap, ::Val{i}) where {i}). -""" -function removeunit(t::AbstractTensorMap, ::Val{i}; copy::Bool = false) where {i} - W = removeunit(space(t), Val(i)) - if t isa TensorMap - return TensorMap{scalartype(t)}(copy ? Base.copy(t.data) : t.data, W) - else - tdst = similar(t, W) - for (c, b) in blocks(t) - copy!(block(tdst, c), b) - end - return tdst + @eval begin + $promote_op(t::AbstractTensorMap) = $promote_op(typeof(t)) + $promote_op(::Type{T}) where {T <: AbstractTensorMap} = + $promote_op(scalartype(T), sectortype(T)) + $promote_op(::Type{T}, ::Type{I}) where {T <: Number, I <: Sector} = + $manipulation_scalartype(I) <: Integer ? T : + $manipulation_scalartype(I) <: Real ? 
float(T) : complex(T) end end -# Fusing and splitting -# TODO: add functionality for easy fusing and splitting of tensor indices - -#------------------------------------- -# Full implementations based on `add` -#------------------------------------- spacecheck_transform(f, tdst::AbstractTensorMap, tsrc::AbstractTensorMap, args...) = spacecheck_transform(f, space(tdst), space(tsrc), args...) @noinline function spacecheck_transform(f, Vdst::TensorMapSpace, Vsrc::TensorMapSpace, p::Index2Tuple) From 5fe0a6e176a26b9cde48efb92952e77753053a4d Mon Sep 17 00:00:00 2001 From: lkdvos Date: Sat, 25 Apr 2026 21:56:52 -0400 Subject: [PATCH 12/23] all permutes go through braid --- src/tensors/treetransformers.jl | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl index 82032b067..b602e11ea 100644 --- a/src/tensors/treetransformers.jl +++ b/src/tensors/treetransformers.jl @@ -136,21 +136,21 @@ end function allocate_buffers( tdst::TensorMap, tsrc::TensorMap, transformer::GenericTreeTransformer, - allocator=TO.DefaultAllocator() + allocator = TO.DefaultAllocator() ) sz = buffersize(transformer) return similar(tdst.data, sz), similar(tsrc.data, sz) end function allocate_buffers( data_dst::DenseVector, data_src::DenseVector, transformer::GenericTreeTransformer, - allocator=TO.DefaultAllocator() + allocator = TO.DefaultAllocator() ) sz = buffersize(transformer) return similar(data_dst, sz), similar(data_src, sz) end function allocate_buffers( tdst::AbstractTensorMap, tsrc::AbstractTensorMap, transformer, - allocator=TO.DefaultAllocator() + allocator = TO.DefaultAllocator() ) # be pessimistic and assume the worst for now sz = dim(space(tsrc)) @@ -194,22 +194,17 @@ end return TreeTransformer(fusiontreebraider, p, Vdst, Vsrc) end -for (transform, treetransformer) in - ((:permute, :treepermuter), (:transpose, :treetransposer)) - @eval begin - function 
$treetransformer(::AbstractTensorMap, ::AbstractTensorMap, p::Index2Tuple) - return fusiontreetransform(f) = $transform(f, p) - end - function $treetransformer(tdst::TensorMap, tsrc::TensorMap, p::Index2Tuple) - return $treetransformer(space(tdst), space(tsrc), p) - end - @cached function $treetransformer( - Vdst::TensorMapSpace, Vsrc::TensorMapSpace, p::Index2Tuple - )::treetransformertype(Vdst, Vsrc) - fusiontreetransform(f) = $transform(f, p) - return TreeTransformer(fusiontreetransform, p, Vdst, Vsrc) - end - end +function treetransposer(::AbstractTensorMap, ::AbstractTensorMap, p::Index2Tuple) + return fusiontreetransform(f) = transpose(f, p) +end +function treetransposer(tdst::TensorMap, tsrc::TensorMap, p::Index2Tuple) + return treetransposer(space(tdst), space(tsrc), p) +end +@cached function treetransposer( + Vdst::TensorMapSpace, Vsrc::TensorMapSpace, p::Index2Tuple + )::treetransformertype(Vdst, Vsrc) + fusiontreetransform(f) = transpose(f, p) + return TreeTransformer(fusiontreetransform, p, Vdst, Vsrc) end # default cachestyle is GlobalLRUCache From 6ebca1e340184c42a267fc4bb2cfa06bd1469bc7 Mon Sep 17 00:00:00 2001 From: lkdvos Date: Sat, 25 Apr 2026 21:59:46 -0400 Subject: [PATCH 13/23] format docstrings --- src/tensors/indexmanipulations.jl | 36 ++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index bfeeb6f82..4a1cfc997 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -51,8 +51,8 @@ function has_shared_twist(t, inds) end """ - twist!(t::AbstractTensorMap, i::Int; inv::Bool=false) -> t - twist!(t::AbstractTensorMap, inds; inv::Bool=false) -> t + twist!(t::AbstractTensorMap, i::Int; inv::Bool = false) -> t + twist!(t::AbstractTensorMap, inds; inv::Bool = false) -> t Apply a twist to the `i`th index of `t`, or all indices in `inds`, storing the result in `t`. If `inv=true`, use the inverse twist. 
@@ -106,8 +106,10 @@ end # Methods which change the number of indices, implement using `Val(i)` for type inference """ - insertleftunit(tsrc::AbstractTensorMap, i=numind(t) + 1; - conj=false, dual=false, copy=false) -> tdst + insertleftunit( + tsrc::AbstractTensorMap, i = numind(t) + 1; + conj = false, dual = false, copy = false + ) -> tdst Insert a trivial vector space, isomorphic to the underlying field, at position `i`, which can be specified as an `Int` or as `Val(i)` for improved type stability. @@ -135,8 +137,10 @@ function insertleftunit( end """ - insertrightunit(tsrc::AbstractTensorMap, i=numind(t); - conj=false, dual=false, copy=false) -> tdst + insertrightunit( + tsrc::AbstractTensorMap, i = numind(t); + conj = false, dual = false, copy = false + ) -> tdst Insert a trivial vector space, isomorphic to the underlying field, after position `i`, which can be specified as an `Int` or as `Val(i)` for improved type stability. @@ -164,7 +168,7 @@ function insertrightunit( end """ - removeunit(tsrc::AbstractTensorMap, i; copy=false) -> tdst + removeunit(tsrc::AbstractTensorMap, i; copy = false) -> tdst This removes a trivial tensor product factor at position `1 ≤ i ≤ N`, where `i` can be specified as an `Int` or as `Val(i)` for improved type stability. @@ -302,8 +306,10 @@ See also [`braid`](@ref) for creating a new tensor. end """ - braid(tsrc, (p₁, p₂)::Index2Tuple, levels::IndexTuple; copy=false, - backend=DefaultBackend(), allocator=DefaultAllocator()) -> tdst::TensorMap + braid( + tsrc, (p₁, p₂)::Index2Tuple, levels::IndexTuple; copy = false, + backend = DefaultBackend(), allocator = DefaultAllocator() + ) -> tdst::TensorMap Return tensor `tdst` obtained by braiding the indices of `tsrc`. The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively. 
@@ -369,8 +375,10 @@ end end """ - transpose(tsrc, (p₁, p₂)::Index2Tuple; copy=false, - backend=DefaultBackend(), allocator=DefaultAllocator()) -> tdst::TensorMap + transpose( + tsrc, (p₁, p₂)::Index2Tuple; copy = false, + backend = DefaultBackend(), allocator = DefaultAllocator() + ) -> tdst::TensorMap Return tensor `tdst` obtained by transposing the indices of `tsrc`. The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively. @@ -432,8 +440,10 @@ See also [`repartition`](@ref) for creating a new tensor. end """ - repartition(tsrc, N₁::Int, N₂::Int=numind(tsrc)-N₁; copy=false, - backend=DefaultBackend(), allocator=DefaultAllocator()) -> tdst + repartition( + tsrc, N₁::Int, N₂::Int = numind(tsrc) - N₁; copy = false, + backend = DefaultBackend(), allocator = DefaultAllocator() + ) -> tdst Return tensor `tdst` obtained by repartitioning the indices of `tsrc`. The codomain and domain of `tdst` correspond to the first `N₁` and last `N₂` spaces of `tsrc`, From 9eee29ac2d48c1b2ac5809b7028557dafdcf6388 Mon Sep 17 00:00:00 2001 From: lkdvos Date: Sat, 25 Apr 2026 22:22:19 -0400 Subject: [PATCH 14/23] update docs --- docs/make.jl | 4 +- docs/src/lib/tensors.md | 6 -- docs/src/man/indexmanipulations.md | 107 ++++++++++++++++++++++++++++ docs/src/man/tensormanipulations.md | 103 +------------------------- 4 files changed, 111 insertions(+), 109 deletions(-) create mode 100644 docs/src/man/indexmanipulations.md diff --git a/docs/make.jl b/docs/make.jl index 34b025580..5517755ab 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -9,7 +9,7 @@ end using Documenter using Random using TensorKit -using TensorKit: FusionTreePair, FusionTreeBlock, Index2Tuple +using TensorKit: FusionTreePair, FusionTreeBlock, Index2Tuple, IndexTuple using TensorKit.TensorKitSectors using TensorKit.MatrixAlgebraKit using DocumenterInterLinks @@ -27,7 +27,7 @@ pages = [ "man/spaces.md", "man/symmetries.md", "man/sectors.md", "man/gradedspaces.md", 
"man/fusiontrees.md", "man/tensors.md", - "man/tensormanipulations.md", + "man/indexmanipulations.md", "man/tensormanipulations.md", ], "Library" => [ "lib/sectors.md", "lib/fusiontrees.md", diff --git a/docs/src/lib/tensors.md b/docs/src/lib/tensors.md index b19537e3f..22e4c2c1b 100644 --- a/docs/src/lib/tensors.md +++ b/docs/src/lib/tensors.md @@ -184,12 +184,6 @@ repartition! twist! ``` -```@docs -TensorKit.add_permute! -TensorKit.add_braid! -TensorKit.add_transpose! -``` - ### Tensor map composition, traces, contractions and tensor products ```@docs diff --git a/docs/src/man/indexmanipulations.md b/docs/src/man/indexmanipulations.md new file mode 100644 index 000000000..15908d02f --- /dev/null +++ b/docs/src/man/indexmanipulations.md @@ -0,0 +1,107 @@ +# [Index manipulations](@id s_indexmanipulations) + +```@setup indexmanip +using TensorKit +using LinearAlgebra +``` + +Tensor maps have a bipartition of their indices into a codomain and a domain. +Index manipulations are operations that reorganize this structure: reordering indices, moving them between domain and codomain, flipping arrows, applying twists, or inserting and removing trivial factors. + +Throughout this page, index positions are specified using `Index2Tuple{N₁,N₂}`, i.e. a pair `(p₁, p₂)` of tuples. +The indices in `p₁` form the new codomain, and those in `p₂` form the new domain. +The helper functions [`codomainind`](@ref), [`domainind`](@ref), [`allind`](@ref), [`numout`](@ref) and [`numin`](@ref) are available to retrieve the current index structure of a tensor. + +## Permuting and braiding + +For sector types with a symmetric braiding (`BraidingStyle(I) isa SymmetricBraiding`), use [`permute`](@ref): + +```@docs; canonical=false +permute(::AbstractTensorMap, ::Index2Tuple) +permute!(::AbstractTensorMap, ::AbstractTensorMap, ::Index2Tuple) +``` + +For general braiding, use [`braid`](@ref), which requires an additional `levels` argument that assigns a height to each index. 
+When two indices need to exchange places, the index with the higher level crosses over the index with the lower level. + +```@docs; canonical=false +braid(::AbstractTensorMap, ::Index2Tuple, ::IndexTuple) +braid! +``` + +For plain tensors (`sectortype(t) == Trivial`), `permute` acts exactly like `permutedims` on the underlying array data: + +```@repl indexmanip +V = ℂ^2; +t = randn(V ⊗ V ← V ⊗ V); +ta = convert(Array, t); +t′ = permute(t, ((4, 2, 3), (1,))); +convert(Array, t′) ≈ permutedims(ta, (4, 2, 3, 1)) +``` + +## Transposing and repartitioning + +[`transpose`](@ref) is a special case of braiding restricted to *cyclic permutations*, i.e. permutations where indices do not cross. +Unlike a generic `braid`, it introduces a compensating (inverse) twist, which is necessary to satisfy the categorical definition of transpose. + +```@raw html +transpose +``` + +```@docs; canonical=false +transpose(::AbstractTensorMap, ::Index2Tuple) +transpose! +``` + +[`repartition`](@ref) is a further special case that only changes the codomain/domain split while preserving cyclic order: + +```@docs; canonical=false +repartition(::AbstractTensorMap, ::Int, ::Int) +repartition! +``` + +## Flipping arrows + +[`flip`](@ref) applies an isomorphism to change the arrow direction on selected indices: + +```@docs; canonical=false +flip(t::AbstractTensorMap, I) +``` + +!!! note + `flip` is not involutory: `flip(flip(t, I), I) ≠ t` in general. + Use `flip(flip(t, I), I; inv=true)` to recover the original tensor. + +## Twisting + +[`twist`](@ref) applies the monoidal twist to one or more indices. +For `BraidingStyle(I) == Bosonic()`, all twists are trivial and `twist` returns the tensor unchanged. + +```@docs; canonical=false +twist(::AbstractTensorMap, ::Int) +twist! +``` + +## Inserting and removing unit spaces + +The following functions insert or remove a trivial tensor product factor (a space isomorphic to the scalar field) at a given position. 
+Passing `Val(i)` instead of an integer `i` improves type stability. + +```@docs; canonical=false +insertleftunit(::AbstractTensorMap, ::Val{i}) where {i} +insertrightunit(::AbstractTensorMap, ::Val{i}) where {i} +removeunit(::AbstractTensorMap, ::Val{i}) where {i} +``` + +## Fusing and splitting indices + +There is no dedicated function for fusing or splitting indices. +For a plain tensor (`sectortype(t) == Trivial`), this is equivalent to `reshape` on the underlying array. +In the general case, one can construct an explicit isomorphism using [`isomorphism`](@ref) (or [`unitary`](@ref) for Euclidean spaces) and contract it with the tensor: + +```julia +u = unitary(fuse(space(t, i) ⊗ space(t, j)), space(t, i) ⊗ space(t, j)) +# then contract u with indices i and j of t via @tensor +``` + +Note that tensor factorizations (SVD, QR, etc.) can be applied directly to any index bipartition without needing to fuse indices first; see [Tensor factorizations](@ref ss_tensor_factorization). diff --git a/docs/src/man/tensormanipulations.md b/docs/src/man/tensormanipulations.md index 15285fd78..2f238f963 100644 --- a/docs/src/man/tensormanipulations.md +++ b/docs/src/man/tensormanipulations.md @@ -83,107 +83,8 @@ norm(t8) ≈ norm(t4)*norm(t6) ## [Index manipulations](@id ss_indexmanipulation) -In many cases, the bipartition of tensor indices (i.e. `ElementarySpace` instances) between the codomain and domain is not fixed throughout the different operations that need to be performed on that tensor map, i.e. we want to use the duality to move spaces from domain to codomain and vice versa. -Furthermore, we want to use the braiding to reshuffle the order of the indices. 
- -For this, we use an interface that is closely related to that for manipulating splitting- fusion tree pairs, namely [`braid`](@ref) and [`permute`](@ref), with the interface - -```julia -braid(t::AbstractTensorMap{T,S,N₁,N₂}, (p1, p2)::Index2Tuple{N₁′,N₂′}, levels::IndexTuple{N₁+N₂,Int}) -``` - -and - -```julia -permute(t::AbstractTensorMap{T,S,N₁,N₂}, (p1, p2)::Index2Tuple{N₁′,N₂′}; copy = false) -``` - -both of which return an instance of `AbstractTensorMap{T, S, N₁′, N₂′}`. - -In these methods, `p1` and `p2` specify which of the original tensor indices ranging from `1` to `N₁ + N₂` make up the new codomain (with `N₁′` spaces) and new domain (with `N₂′` spaces). -Hence, `(p1..., p2...)` should be a valid permutation of `1:(N₁ + N₂)`. -Note that, throughout TensorKit.jl, permutations are always specified using tuples of `Int`s, for reasons of type stability. -For `braid`, we also need to specify `levels` or depths for each of the indices of the original tensor, which determine whether indices will braid over or underneath each other (use the braiding or its inverse). -We refer to the section on [manipulating fusion trees](@ref ss_fusiontrees) for more details. - -When `BraidingStyle(sectortype(t)) isa SymmetricBraiding`, we can use the simpler interface of `permute`, which does not require the argument `levels`. -`permute` accepts a keyword argument `copy`. -When `copy == true`, the result will be a tensor with newly allocated data that can independently be modified from that of the input tensor `t`. -When `copy` takes the default value `false`, `permute` can try to return the result in a way that it shares its data with the input tensor `t`, though this is only possible in specific cases (e.g. when `sectortype(S) == Trivial` and `(p1..., p2...) = (1:(N₁+N₂)...)`). - -Both `braid` and `permute` come in a version where the result is stored in an already existing tensor, i.e. 
[`braid!(tdst, tsrc, (p1, p2), levels)`](@ref) and [`permute!(tdst, tsrc, (p1, p2))`](@ref). - -Another operation that belongs under index manipulations is taking the `transpose` of a tensor, i.e. `LinearAlgebra.transpose(t)` and `LinearAlgebra.transpose!(tdst, tsrc)`, both of which are reexported by TensorKit.jl. -Note that `transpose(t)` is not simply equal to reshuffling domain and codomain with `braid(t, (1:(N₁+N₂)...), reverse(domainind(tsrc)), reverse(codomainind(tsrc))))`. -Indeed, the graphical representation (where we draw the codomain and domain as a single object), makes clear that this introduces an additional (inverse) twist, which is then compensated in the `transpose` implementation. - -```@raw html -transpose -``` - -In categorical language, the reason for this extra twist is that we use the left coevaluation ``η``, but the right evaluation ``\tilde{ϵ}``, when repartitioning the indices between domain and codomain. - -There are a number of other index related manipulations. -We can apply a twist (or inverse twist) to one of the tensor map indices via [`twist(t, i; inv = false)`](@ref) or [`twist!(t, i; inv = false)`](@ref). -Note that the latter method does not store the result in a new destination tensor, but just modifies the tensor `t` in place. -Twisting several indices simultaneously can be obtained by using the defining property - -```math -θ_{V⊗W} = τ_{W,V} ∘ (θ_W ⊗ θ_V) ∘ τ_{V,W} = (θ_V ⊗ θ_W) ∘ τ_{W,V} ∘ τ_{V,W}, -``` - -but is currently not implemented explicitly. - -For all sector types `I` with `BraidingStyle(I) == Bosonic()`, all twists are `1` and thus have no effect. 
-Let us start with some examples, in which we illustrate that, albeit `permute` might act highly non-trivial on the fusion trees and on the corresponding data, after conversion to a regular `Array` (when possible), it just acts like `permutedims` - -```@repl tensors -domain(t) → codomain(t) -ta = convert(Array, t); -t′ = permute(t, (1, 2, 3, 4)); -domain(t′) → codomain(t′) -convert(Array, t′) ≈ ta -t′′ = permute(t, ((4, 2, 3), (1,))); -domain(t′′) → codomain(t′′) -convert(Array, t′′) ≈ permutedims(ta, (4, 2, 3, 1)) -transpose(t) -convert(Array, transpose(t)) ≈ permutedims(ta, (4, 3, 2, 1)) -dot(t2, t) ≈ dot(transpose(t2), transpose(t)) -transpose(transpose(t)) ≈ t -twist(t, 3) ≈ t -``` - -Note that `transpose` acts like one would expect on a `TensorMap{T, S, 1, 1}`. -On a `TensorMap{T, S, N₁, N₂}`, because `transpose` replaces the codomain with the dual of the domain, which has its tensor product operation reversed, this in the end amounts in a complete reversal of all tensor indices when representing it as a plain multi-dimensional `Array`. -Also, note that we have not defined the conjugation of `TensorMap` instances. -One definition that one could think of is `conj(t) = adjoint(transpose(t))`. -However note that `codomain(adjoint(tranpose(t))) == domain(transpose(t)) == dual(codomain(t))` and similarly `domain(adjoint(tranpose(t))) == dual(domain(t))`, where `dual` of a `ProductSpace` is composed of the dual of the `ElementarySpace` instances, in reverse order of tensor product. -This might be very confusing, and as such we leave tensor conjugation undefined. -However, note that we have a conjugation syntax within the context of [tensor contractions](@ref ss_tensor_contraction). - -To show the effect of `twist`, we now consider a type of sector `I` for which `BraidingStyle(I) != Bosonic()`. -In particular, we use `FibonacciAnyon`. -We cannot convert the resulting `TensorMap` to an `Array`, so we have to rely on indirect tests to verify our results. 
- -```@repl tensors -V1 = GradedSpace{FibonacciAnyon}(:I => 3, :τ => 2) -V2 = GradedSpace{FibonacciAnyon}(:I => 2, :τ => 1) -m = randn(Float32, V1, V2) -transpose(m) -twist(braid(m, ((2,), (1,)), (1, 2)), 1) -t1 = randn(V1 * V2', V2 * V1); -t2 = randn(ComplexF64, V1 * V2', V2 * V1); -dot(t1, t2) ≈ dot(transpose(t1), transpose(t2)) -transpose(transpose(t1)) ≈ t1 -``` - -A final operation that one might expect in this section is to fuse or join indices, and its inverse, to split a given index into two or more indices. -For a plain tensor (i.e. with `sectortype(t) == Trivial`) amount to the equivalent of `reshape` on the multidimensional data. -However, this represents only one possibility, as there is no canonically unique way to embed the tensor product of two spaces `V1 ⊗ V2` in a new space `V = fuse(V1 ⊗ V2)`. -Such a mapping can always be accompagnied by a basis transform. -However, one particular choice is created by the function `isomorphism`, or for `EuclideanProduct` spaces, `unitary`. -Hence, we can join or fuse two indices of a tensor by first constructing `u = unitary(fuse(space(t, i) ⊗ space(t, j)), space(t, i) ⊗ space(t, j))` and then contracting this map with indices `i` and `j` of `t`, as explained in the section on [contracting tensors](@ref ss_tensor_contraction). -Note, however, that a typical algorithm is not expected to often need to fuse and split indices, as e.g. tensor factorizations can easily be applied without needing to `reshape` or fuse indices first, as explained in the next section. +Index manipulations are operations that reorganize the bipartition of indices between the codomain and domain, possibly also reordering them or applying braiding isomorphisms. +They are covered in detail on a dedicated page: [Index manipulations](@ref s_indexmanipulations). 
## [Tensor factorizations](@id ss_tensor_factorization) From e210bb164ce5ee394446797eddb0558b174b330c Mon Sep 17 00:00:00 2001 From: lkdvos Date: Sat, 25 Apr 2026 22:38:48 -0400 Subject: [PATCH 15/23] try to improve on docs --- docs/src/man/indexmanipulations.md | 132 +++++++++++++++-------------- 1 file changed, 70 insertions(+), 62 deletions(-) diff --git a/docs/src/man/indexmanipulations.md b/docs/src/man/indexmanipulations.md index 15908d02f..97de61974 100644 --- a/docs/src/man/indexmanipulations.md +++ b/docs/src/man/indexmanipulations.md @@ -5,103 +5,111 @@ using TensorKit using LinearAlgebra ``` -Tensor maps have a bipartition of their indices into a codomain and a domain. -Index manipulations are operations that reorganize this structure: reordering indices, moving them between domain and codomain, flipping arrows, applying twists, or inserting and removing trivial factors. +A `TensorMap{T, S, N₁, N₂}` is a linear map from a domain (a `ProductSpace{S, N₂}`) to a codomain (a `ProductSpace{S, N₁}`). +In practice, the bipartition of the `N₁ + N₂` indices between domain and codomain is often not fixed: algorithms typically need to reshuffle indices between the two sides, reorder them, or change the arrow direction on individual indices before passing a tensor to a factorization or contraction. -Throughout this page, index positions are specified using `Index2Tuple{N₁,N₂}`, i.e. a pair `(p₁, p₂)` of tuples. -The indices in `p₁` form the new codomain, and those in `p₂` form the new domain. -The helper functions [`codomainind`](@ref), [`domainind`](@ref), [`allind`](@ref), [`numout`](@ref) and [`numin`](@ref) are available to retrieve the current index structure of a tensor. +Index manipulations cover all such operations. +They act on the structure of the tensor data in a way that is fully determined by the categorical data of the `sectortype`, such that TensorKit automatically manipulates the tensor entries accordingly. 
+The operations fall into three groups, which mirror the structure of the source file: -## Permuting and braiding +* **Reweighting**: [`flip`](@ref) and [`twist`](@ref) apply local isomorphisms to individual indices without changing the index structure. +* **Space insertion/removal**: [`insertleftunit`](@ref), [`insertrightunit`](@ref) and [`removeunit`](@ref) add or remove trivial (scalar) index factors. +* **Index rearrangements**: [`permute`](@ref), [`braid`](@ref), [`transpose`](@ref) and [`repartition`](@ref) reorder indices and/or move them between domain and codomain. -For sector types with a symmetric braiding (`BraidingStyle(I) isa SymmetricBraiding`), use [`permute`](@ref): +Throughout this page, new index positions are specified using `Index2Tuple{N₁, N₂}`, i.e. a pair `(p₁, p₂)` of index tuples. +The indices listed in `p₁` form the new codomain and those in `p₂` form the new domain. +The following helpers retrieve the current index structure of a tensor: ```@docs; canonical=false -permute(::AbstractTensorMap, ::Index2Tuple) -permute!(::AbstractTensorMap, ::AbstractTensorMap, ::Index2Tuple) +numout +numin +numind +codomainind +domainind +allind ``` -For general braiding, use [`braid`](@ref), which requires an additional `levels` argument that assigns a height to each index. -When two indices need to exchange places, the index with the higher level crosses over the index with the lower level. +## Reweighting -```@docs; canonical=false -braid(::AbstractTensorMap, ::Index2Tuple, ::IndexTuple) -braid! -``` +Reweighting operations modify the entries of a tensor by applying local isomorphisms to individual indices, without changing the number of indices or their partition between domain and codomain. -For plain tensors (`sectortype(t) == Trivial`), `permute` acts exactly like `permutedims` on the underlying array data: +[`flip`](@ref) changes the arrow direction on selected indices by applying the corresponding isomorphism between a space and its dual. 
+[`twist`](@ref) applies the topological spin (monoidal twist) to selected indices; for `BraidingStyle(I) == Bosonic()` this is always trivial. -```@repl indexmanip -V = ℂ^2; -t = randn(V ⊗ V ← V ⊗ V); -ta = convert(Array, t); -t′ = permute(t, ((4, 2, 3), (1,))); -convert(Array, t′) ≈ permutedims(ta, (4, 2, 3, 1)) +```@docs; canonical=false +flip(t::AbstractTensorMap, I) +twist(::AbstractTensorMap, ::Int) +twist! ``` -## Transposing and repartitioning - -[`transpose`](@ref) is a special case of braiding restricted to *cyclic permutations*, i.e. permutations where indices do not cross. -Unlike a generic `braid`, it introduces a compensating (inverse) twist, which is necessary to satisfy the categorical definition of transpose. +## Inserting and removing unit spaces -```@raw html -transpose -``` +These functions add or remove a trivial tensor product factor at a specified index position, without affecting any other indices. +[`insertleftunit`](@ref) inserts before position `i` and [`insertrightunit`](@ref) inserts after position `i`; [`removeunit`](@ref) undoes either insertion. +Passing `Val(i)` instead of an `Int` for the position may improve type stability. ```@docs; canonical=false -transpose(::AbstractTensorMap, ::Index2Tuple) -transpose! +insertleftunit(::AbstractTensorMap, ::Val{i}) where {i} +insertrightunit(::AbstractTensorMap, ::Val{i}) where {i} +removeunit(::AbstractTensorMap, ::Val{i}) where {i} ``` -[`repartition`](@ref) is a further special case that only changes the codomain/domain split while preserving cyclic order: - -```@docs; canonical=false -repartition(::AbstractTensorMap, ::Int, ::Int) -repartition! -``` +## Index rearrangements -## Flipping arrows +These operations reorder indices and/or move them between domain and codomain by applying the transposing or braiding isomorphisms of the underlying category. 
+They form a hierarchy from most general to most restricted: -[`flip`](@ref) applies an isomorphism to change the arrow direction on selected indices: +- [`braid`](@ref) is the most general: it accepts any permutation and requires a `levels` argument — a tuple of heights, one per index — that determines whether each index crosses over or under the others it has to pass. +- [`permute`](@ref) is a simpler interface for sector types with a symmetric braiding (`BraidingStyle(I) isa SymmetricBraiding`), where over- and under-crossings are equivalent and `levels` is therefore not needed. +- [`transpose`](@ref) is restricted to *cyclic* permutations (indices do not cross). Unlike `braid`, it introduces a compensating (inverse) twist to satisfy the categorical definition of transpose, as illustrated below: -```@docs; canonical=false -flip(t::AbstractTensorMap, I) +```@raw html +transpose ``` -!!! note - `flip` is not involutory: `flip(flip(t, I), I) ≠ t` in general. - Use `flip(flip(t, I), I; inv=true)` to recover the original tensor. +- [`repartition`](@ref) only moves the codomain/domain boundary without reordering the indices at all. -## Twisting +For plain tensors (`sectortype(t) == Trivial`), `permute` and `braid` act like `permutedims` on the underlying array: -[`twist`](@ref) applies the monoidal twist to one or more indices. -For `BraidingStyle(I) == Bosonic()`, all twists are trivial and `twist` returns the tensor unchanged. - -```@docs; canonical=false -twist(::AbstractTensorMap, ::Int) -twist! +```@repl indexmanip +V = ℂ^2; +t = randn(V ⊗ V ← V ⊗ V); +ta = convert(Array, t); +t′ = permute(t, ((4, 2, 3), (1,))); +convert(Array, t′) ≈ permutedims(ta, (4, 2, 3, 1)) ``` -## Inserting and removing unit spaces - -The following functions insert or remove a trivial tensor product factor (a space isomorphic to the scalar field) at a given position. -Passing `Val(i)` instead of an integer `i` improves type stability. 
- ```@docs; canonical=false -insertleftunit(::AbstractTensorMap, ::Val{i}) where {i} -insertrightunit(::AbstractTensorMap, ::Val{i}) where {i} -removeunit(::AbstractTensorMap, ::Val{i}) where {i} +braid(::AbstractTensorMap, ::Index2Tuple, ::IndexTuple) +braid! +permute(::AbstractTensorMap, ::Index2Tuple) +permute!(::AbstractTensorMap, ::AbstractTensorMap, ::Index2Tuple) +transpose(::AbstractTensorMap, ::Index2Tuple) +transpose! +repartition(::AbstractTensorMap, ::Int, ::Int) +repartition! ``` ## Fusing and splitting indices There is no dedicated function for fusing or splitting indices. For a plain tensor (`sectortype(t) == Trivial`), this is equivalent to `reshape` on the underlying array. -In the general case, one can construct an explicit isomorphism using [`isomorphism`](@ref) (or [`unitary`](@ref) for Euclidean spaces) and contract it with the tensor: + +In the general case there is no canonical embedding of `V1 ⊗ V2` into the fused space `V = fuse(V1 ⊗ V2)`: any two such embeddings differ by a basis transform, i.e. there is a gauge freedom. +TensorKit resolves this by requiring the user to construct an explicit isomorphism — the *fuser* — and contract it with the tensor: + +```julia +f = unitary(fuse(space(t, i) ⊗ space(t, j)), space(t, i) ⊗ space(t, j)) +@tensor t_fused[…, a, …] := f[a, i, j] * t[…, i, j, …] +``` + +Splitting is then the adjoint of the same map: ```julia -u = unitary(fuse(space(t, i) ⊗ space(t, j)), space(t, i) ⊗ space(t, j)) -# then contract u with indices i and j of t via @tensor +@tensor t_split[…, i, j, …] := f'[i, j, a] * t_fused[…, a, …] ``` +Using `f'` as the splitter guarantees that the round-trip is the identity, i.e. `t_split == t`. +Using a *different* isomorphism to split would give a physically equivalent but numerically different tensor, so it is important to keep `f` and its adjoint consistent throughout a calculation. + Note that tensor factorizations (SVD, QR, etc.) 
can be applied directly to any index bipartition without needing to fuse indices first; see [Tensor factorizations](@ref ss_tensor_factorization). From ea8155cd57efa6ff26b858abc59560fd848526cc Mon Sep 17 00:00:00 2001 From: lkdvos Date: Sat, 25 Apr 2026 22:59:34 -0400 Subject: [PATCH 16/23] rework buffer interaction --- src/tensors/indexmanipulations.jl | 16 ++++++++-------- src/tensors/treetransformers.jl | 28 ---------------------------- 2 files changed, 8 insertions(+), 36 deletions(-) diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index 4a1cfc997..a5d618872 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -592,7 +592,6 @@ function add_transform_kernel!( # Non-Abelian fusion: trees sharing the same set of uncoupled (external) sectors # form a *fusion block* and mix under the transformation via a recoupling matrix U # (rows = destination trees, columns = source trees). We iterate over blocks. - tl_buffers = OhMyThreads.TaskLocalValue(() -> allocate_buffers(tdst, tsrc, transformer, allocator)) tforeach(fusionblocks(tsrc); scheduler) do src dst, U = transformer(src) if length(src) == 1 @@ -611,19 +610,19 @@ function add_transform_kernel!( # 2. Recoupling: buffer_dst = buffer_src * U^T (blocksize × rows) # 3. Insert: scatter columns of buffer_dst to destination blocks, # applying the actual permutation p in the same step. 
- buffer1, buffer2 = tl_buffers[] rows, cols = size(U) sz_src = size(tsrc[first(fusiontrees(src))...]) blocksize = prod(sz_src) ptriv = (ntuple(identity, length(sz_src)), ()) - buffer_src = StridedView(buffer2, (blocksize, cols), (1, blocksize), 0) + buffer = TO.tensoralloc(storagetype(tdst), blocksize * (rows + cols), Val(true), allocator) + buffer_dst = StridedView(buffer, (blocksize, rows), (1, blocksize), 0) + buffer_src = StridedView(buffer, (blocksize, cols), (1, blocksize), blocksize * rows) @inbounds for (i, (f₁, f₂)) in enumerate(fusiontrees(src)) TO.tensoradd!( sreshape(buffer_src[:, i], sz_src), tsrc[f₁, f₂], ptriv, false, One(), Zero(), backend, allocator ) end - buffer_dst = StridedView(buffer1, (blocksize, rows), (1, blocksize), 0) mul!(buffer_dst, buffer_src, transpose(StridedView(U))) @inbounds for (i, (f₃, f₄)) in enumerate(fusiontrees(dst)) TO.tensoradd!( @@ -631,6 +630,7 @@ function add_transform_kernel!( p, false, α, β, backend, allocator ) end + TO.tensorfree!(buffer, allocator) end end end @@ -663,7 +663,6 @@ function add_transform_kernel!( # U — recoupling matrix (rows = dst trees, cols = src trees) # sz_{dst,src} — array shape of each block (same for all trees in the block) # structs_{dst,src}[i] — (offset, strides) into the flat data vector for tree i - tl_buffers = OhMyThreads.TaskLocalValue(() -> allocate_buffers(data_dst, data_src, transformer, allocator)) tforeach(transformer.data; scheduler) do (U, (sz_dst, structs_dst), (sz_src, structs_src)) if length(U) == 1 # Degenerate block with a single tree: no matmul needed. @@ -676,14 +675,15 @@ function add_transform_kernel!( else # Multi-tree block: pack → recoupling matmul → unpack. # buffer2 = source staging area, buffer1 = destination staging area. 
- buffer1, buffer2 = tl_buffers[] rows, cols = size(U) blocksize = prod(sz_src) ptriv = (ntuple(identity, length(sz_src)), ()) + buffer = TO.tensoralloc(typeof(data_dst), blocksize * (rows + cols), Val(true), allocator) + buffer_dst = StridedView(buffer, (blocksize, rows), (1, blocksize), 0) + buffer_src = StridedView(buffer, (blocksize, cols), (1, blocksize), blocksize * rows) # 1. Extract: copy each source block into column i of buffer_src as a flat vector, # using a trivial permutation so the layout is canonical before the matmul. - buffer_src = StridedView(buffer2, (blocksize, cols), (1, blocksize), 0) @inbounds for (i, struct_src_i) in enumerate(structs_src) TO.tensoradd!( sreshape(buffer_src[:, i], sz_src), StridedView(data_src, sz_src, struct_src_i...), @@ -693,7 +693,6 @@ function add_transform_kernel!( # 2. Recoupling: buffer_dst = buffer_src * U^T (each output tree is a linear # combination of input trees weighted by the recoupling coefficients). - buffer_dst = StridedView(buffer1, (blocksize, rows), (1, blocksize), 0) mul!(buffer_dst, buffer_src, transpose(StridedView(U))) # 3. 
Insert: scatter column i of buffer_dst into the destination, applying the @@ -704,6 +703,7 @@ function add_transform_kernel!( p, false, α, β, backend, allocator ) end + TO.tensorfree!(buffer, allocator) end end return nothing diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl index b602e11ea..aaecbe746 100644 --- a/src/tensors/treetransformers.jl +++ b/src/tensors/treetransformers.jl @@ -128,34 +128,6 @@ function repack_transformer_structure(structures::Dictionary, trees) return sz, strides_offsets end -function buffersize(transformer::GenericTreeTransformer) - return maximum(transformer.data; init = 0) do (basistransform, structures_dst, _) - return prod(structures_dst[1]) * size(basistransform, 1) - end -end - -function allocate_buffers( - tdst::TensorMap, tsrc::TensorMap, transformer::GenericTreeTransformer, - allocator = TO.DefaultAllocator() - ) - sz = buffersize(transformer) - return similar(tdst.data, sz), similar(tsrc.data, sz) -end -function allocate_buffers( - data_dst::DenseVector, data_src::DenseVector, transformer::GenericTreeTransformer, - allocator = TO.DefaultAllocator() - ) - sz = buffersize(transformer) - return similar(data_dst, sz), similar(data_src, sz) -end -function allocate_buffers( - tdst::AbstractTensorMap, tsrc::AbstractTensorMap, transformer, - allocator = TO.DefaultAllocator() - ) - # be pessimistic and assume the worst for now - sz = dim(space(tsrc)) - return similar(storagetype(tdst), sz), similar(storagetype(tsrc), sz) -end function treetransformertype(Vdst, Vsrc) I = sectortype(Vdst) From db29707890822b29a000e12dfc9dcd5474896bd2 Mon Sep 17 00:00:00 2001 From: lkdvos Date: Sat, 25 Apr 2026 23:04:36 -0400 Subject: [PATCH 17/23] insert allocator checkpoints/resets --- Project.toml | 2 +- src/tensors/indexmanipulations.jl | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 07f19efca..5dec19b49 100644 --- a/Project.toml +++ b/Project.toml @@ -55,7 +55,7 @@ 
Random = "1" ScopedValues = "1.3.0" Strided = "2" TensorKitSectors = "0.3.7" -TensorOperations = "5.1" +TensorOperations = "5.5" TupleTools = "1.5" VectorInterface = "0.4.8, 0.5" cuTENSOR = "6" diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index a5d618872..c1cb47671 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -589,6 +589,7 @@ function add_transform_kernel!( ) end else + cp = TO.allocator_checkpoint!(allocator) # Non-Abelian fusion: trees sharing the same set of uncoupled (external) sectors # form a *fusion block* and mix under the transformation via a recoupling matrix U # (rows = destination trees, columns = source trees). We iterate over blocks. @@ -633,6 +634,7 @@ function add_transform_kernel!( TO.tensorfree!(buffer, allocator) end end + TO.allocator_reset!(allocator, cp) end return nothing end @@ -663,6 +665,7 @@ function add_transform_kernel!( # U — recoupling matrix (rows = dst trees, cols = src trees) # sz_{dst,src} — array shape of each block (same for all trees in the block) # structs_{dst,src}[i] — (offset, strides) into the flat data vector for tree i + cp = TO.allocator_checkpoint!(allocator) tforeach(transformer.data; scheduler) do (U, (sz_dst, structs_dst), (sz_src, structs_src)) if length(U) == 1 # Degenerate block with a single tree: no matmul needed. 
@@ -706,5 +709,6 @@ function add_transform_kernel!( TO.tensorfree!(buffer, allocator) end end + TO.allocator_reset!(allocator, cp) return nothing end From 5034aef526adf6926b1bbeed3d31cd4a11e8d9db Mon Sep 17 00:00:00 2001 From: lkdvos Date: Sun, 26 Apr 2026 08:57:29 -0400 Subject: [PATCH 18/23] attempt to improve `cond` precision in test --- test/factorizations/svd.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/factorizations/svd.jl b/test/factorizations/svd.jl index cec5e1dda..bf61d9e5a 100644 --- a/test/factorizations/svd.jl +++ b/test/factorizations/svd.jl @@ -49,9 +49,9 @@ for V in spacelist end for T in eltypes, t in (randn(T, W, W), randn(T, W, W)') project_hermitian!(t) - vals = @constinferred LinearAlgebra.eigvals(t) - λmax = maximum(s -> maximum(abs, s), values(vals)) - λmin = minimum(s -> minimum(abs, s), values(vals)) + vals = @constinferred eigh_vals(t) + λmax = maximum(abs, vals) + λmin = minimum(abs, vals) @test cond(t) ≈ λmax / λmin end end From 3a1f46a256ee88896e8fd87947700fc7d4a59d8f Mon Sep 17 00:00:00 2001 From: lkdvos Date: Sun, 26 Apr 2026 13:06:05 -0400 Subject: [PATCH 19/23] add hook for `adapt_transformer` --- ext/TensorKitAMDGPUExt/roctensormap.jl | 9 +++++++++ ext/TensorKitCUDAExt/cutensormap.jl | 9 +++++++-- src/tensors/indexmanipulations.jl | 1 + src/tensors/treetransformers.jl | 9 +++++++++ 4 files changed, 26 insertions(+), 2 deletions(-) diff --git a/ext/TensorKitAMDGPUExt/roctensormap.jl b/ext/TensorKitAMDGPUExt/roctensormap.jl index f2f094c60..c65f91062 100644 --- a/ext/TensorKitAMDGPUExt/roctensormap.jl +++ b/ext/TensorKitAMDGPUExt/roctensormap.jl @@ -162,3 +162,12 @@ for f in (:sqrt, :log, :asin, :acos, :acosh, :atanh, :acoth) return tf end end + +function TensorKit.adapt_transformer( + t::TensorKit.GenericTreeTransformer, data::ROCVector + ) + new_data = map(t.data) do (U, structs_dst, structs_src) + return AMDGPU.Adapt.adapt(ROCArray, U), structs_dst, structs_src + end + return 
TensorKit.GenericTreeTransformer(new_data) +end diff --git a/ext/TensorKitCUDAExt/cutensormap.jl b/ext/TensorKitCUDAExt/cutensormap.jl index 2fefb3a24..0516d7d15 100644 --- a/ext/TensorKitCUDAExt/cutensormap.jl +++ b/ext/TensorKitCUDAExt/cutensormap.jl @@ -169,6 +169,11 @@ for f in (:sqrt, :log, :asin, :acos, :acosh, :atanh, :acoth) end end -function TensorKit._add_transform_multi!(tdst::CuTensorMap, tsrc, p, (U, structs_dst, structs_src)::Tuple{<:Array, TD, TS}, buffers, alpha, beta, backend...) where {TD, TS} - return TensorKit._add_transform_multi!(tdst, tsrc, p, (CUDA.CUDACore.Adapt.adapt(CuArray, U), structs_dst, structs_src), buffers, alpha, beta, backend...) +function TensorKit.adapt_transformer( + t::TensorKit.GenericTreeTransformer, data::CuVector + ) + new_data = map(t.data) do (U, structs_dst, structs_src) + return CUDA.Adapt.adapt(CuArray, U), structs_dst, structs_src + end + return TensorKit.GenericTreeTransformer(new_data) end diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index c1cb47671..84865d5e4 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -661,6 +661,7 @@ function add_transform_kernel!( data_dst::DenseVector, data_src::DenseVector, p, transformer::GenericTreeTransformer, α, β, backend, allocator, scheduler ) + transformer = adapt_transformer(transformer, data_dst) # Each entry covers one fusion block: # U — recoupling matrix (rows = dst trees, cols = src trees) # sz_{dst,src} — array shape of each block (same for all trees in the block) diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl index aaecbe746..664f8a0d6 100644 --- a/src/tensors/treetransformers.jl +++ b/src/tensors/treetransformers.jl @@ -203,3 +203,12 @@ end function _transformer_weight((mat, structs_dst, structs_src)::GenericTransformerData) return length(mat) * prod(structs_dst[1]) end + +""" + adapt_transformer(transformer::TreeTransformer, data::AbstractVector) + +Return a 
version of `transformer` whose internal arrays are compatible with `data`. +Default is a no-op. Backends (e.g. CUDA, AMDGPU) should overload this for their vector types +to ensure the recoupling matrix `U` inside `GenericTreeTransformer` is on the correct device. +""" +adapt_transformer(t::TreeTransformer, ::AbstractVector) = t From 7cad5031a180a260b6fcc5ac4fb751c05f62713b Mon Sep 17 00:00:00 2001 From: lkdvos Date: Sun, 26 Apr 2026 13:55:59 -0400 Subject: [PATCH 20/23] collapse docstrings --- docs/src/lib/fusiontrees.md | 1 + docs/src/lib/sectors.md | 1 + docs/src/lib/spaces.md | 1 + docs/src/lib/tensors.md | 1 + docs/src/man/indexmanipulations.md | 9 ++++----- 5 files changed, 8 insertions(+), 5 deletions(-) diff --git a/docs/src/lib/fusiontrees.md b/docs/src/lib/fusiontrees.md index 8e037af93..57033eca6 100644 --- a/docs/src/lib/fusiontrees.md +++ b/docs/src/lib/fusiontrees.md @@ -2,6 +2,7 @@ ```@meta CurrentModule = TensorKit +CollapsedDocStrings = true ``` # Type hierarchy diff --git a/docs/src/lib/sectors.md b/docs/src/lib/sectors.md index f56980bf0..8a696c675 100644 --- a/docs/src/lib/sectors.md +++ b/docs/src/lib/sectors.md @@ -2,6 +2,7 @@ ```@meta CurrentModule = TensorKit +CollapsedDocStrings = true ``` ## Type hierarchy diff --git a/docs/src/lib/spaces.md b/docs/src/lib/spaces.md index e5705fe3e..a6cce06a1 100644 --- a/docs/src/lib/spaces.md +++ b/docs/src/lib/spaces.md @@ -2,6 +2,7 @@ ```@meta CurrentModule = TensorKit +CollapsedDocStrings = true ``` ## Type hierarchy diff --git a/docs/src/lib/tensors.md b/docs/src/lib/tensors.md index 22e4c2c1b..1ef3da0fb 100644 --- a/docs/src/lib/tensors.md +++ b/docs/src/lib/tensors.md @@ -2,6 +2,7 @@ ```@meta CurrentModule = TensorKit +CollapsedDocStrings = true ``` ## Type hierarchy diff --git a/docs/src/man/indexmanipulations.md b/docs/src/man/indexmanipulations.md index 97de61974..bc5cc331e 100644 --- a/docs/src/man/indexmanipulations.md +++ b/docs/src/man/indexmanipulations.md @@ -1,5 +1,9 @@ # [Index 
manipulations](@id s_indexmanipulations) +```@meta +CollapsedDocStrings = true +``` + ```@setup indexmanip using TensorKit using LinearAlgebra @@ -62,11 +66,6 @@ They form a hierarchy from most general to most restricted: - [`braid`](@ref) is the most general: it accepts any permutation and requires a `levels` argument — a tuple of heights, one per index — that determines whether each index crosses over or under the others it has to pass. - [`permute`](@ref) is a simpler interface for sector types with a symmetric braiding (`BraidingStyle(I) isa SymmetricBraiding`), where over- and under-crossings are equivalent and `levels` is therefore not needed. - [`transpose`](@ref) is restricted to *cyclic* permutations (indices do not cross). Unlike `braid`, it introduces a compensating (inverse) twist to satisfy the categorical definition of transpose, as illustrated below: - -```@raw html -transpose -``` - - [`repartition`](@ref) only moves the codomain/domain boundary without reordering the indices at all. 
For plain tensors (`sectortype(t) == Trivial`), `permute` and `braid` act like `permutedims` on the underlying array: From e2b1fd98afbc023335609492653abcbfaad0d2a1 Mon Sep 17 00:00:00 2001 From: lkdvos Date: Sun, 26 Apr 2026 14:18:00 -0400 Subject: [PATCH 21/23] update docs structure --- docs/make.jl | 10 +- docs/src/man/contractions.md | 106 +++++++++++++ docs/src/man/factorizations.md | 46 ++++++ docs/src/man/indexmanipulations.md | 2 +- docs/src/man/linearalgebra.md | 85 ++++++++++ docs/src/man/tensormanipulations.md | 238 ---------------------------- docs/src/man/tensors.md | 2 +- 7 files changed, 247 insertions(+), 242 deletions(-) create mode 100644 docs/src/man/contractions.md create mode 100644 docs/src/man/factorizations.md create mode 100644 docs/src/man/linearalgebra.md delete mode 100644 docs/src/man/tensormanipulations.md diff --git a/docs/make.jl b/docs/make.jl index 5517755ab..d72b851a6 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -26,8 +26,14 @@ pages = [ "man/intro.md", "man/tutorial.md", "man/spaces.md", "man/symmetries.md", "man/sectors.md", "man/gradedspaces.md", - "man/fusiontrees.md", "man/tensors.md", - "man/indexmanipulations.md", "man/tensormanipulations.md", + "man/fusiontrees.md", + "Tensors" => [ + "man/tensors.md", + "man/linearalgebra.md", + "man/indexmanipulations.md", + "man/factorizations.md", + "man/contractions.md", + ], ], "Library" => [ "lib/sectors.md", "lib/fusiontrees.md", diff --git a/docs/src/man/contractions.md b/docs/src/man/contractions.md new file mode 100644 index 000000000..d70b5681d --- /dev/null +++ b/docs/src/man/contractions.md @@ -0,0 +1,106 @@ +# [Tensor contractions and tensor networks](@id ss_tensor_contraction) + +One of the most important operation with tensor maps is to compose them, more generally known as contracting them. 
+As mentioned in the section on [category theory](@ref s_categories), a typical composition of maps in a ribbon category can graphically be represented as a planar arrangement of the morphisms (i.e. tensor maps, boxes with lines emanating from top and bottom, corresponding to source and target, i.e. domain and codomain), where the lines connecting the source and targets of the different morphisms should be thought of as ribbons, that can braid over or underneath each other, and that can twist.
+Technically, we can embed this diagram in ``ℝ × [0,1]`` and attach all the unconnected line endings corresponding to objects in the source at some position ``(x,0)`` for ``x∈ℝ``, and all line endings corresponding to objects in the target at some position ``(x,1)``.
+The resulting morphism is then invariant under what is known as *framed three-dimensional isotopy*, i.e. three-dimensional rearrangements of the morphism that respect the rules of boxes connected by ribbons whose open endings are kept fixed.
+Such a two-dimensional diagram cannot easily be encoded in a single line of code.
+
+However, things simplify when the braiding is symmetric (such that over- and under-crossings become equivalent, i.e. just crossings), and when twists, i.e. self-crossings in this case, are trivial.
+This amounts to `BraidingStyle(I) == Bosonic()` in the language of TensorKit.jl, and is true for any subcategory of ``\mathbf{Vect}``, i.e. ordinary tensors, possibly with some symmetry constraint.
+The case of ``\mathbf{SVect}`` and its subcategories, and more general categories, are discussed below.
+
+In the case of trivial twists, we can deform the diagram such that we first combine every morphism with a number of coevaluations ``η`` so as to represent it as a tensor, i.e. with a trivial domain.
+We can then rearrange the morphism to be all lined up horizontally, where the original morphism compositions are now being performed by evaluations ``ϵ``.
+This process will generate a number of crossings and twists, where the latter can be omitted because they act trivially. +Similarly, double crossings can also be omitted. +As a consequence, the diagram, or the morphism it represents, is completely specified by the tensors it is composed of, and which indices between the different tensors are connect, via the evaluation ``ϵ``, and which indices make up the source and target of the resulting morphism. +If we also compose the resulting morphisms with coevaluations so that it has a trivial domain, we just have one type of unconnected lines, henceforth called open indices. +We sketch such a rearrangement in the following picture + +```@raw html +tensor unitary +``` + +Hence, we can now specify such a tensor diagram, henceforth called a tensor contraction or also tensor network, using a one-dimensional syntax that mimicks [abstract index notation](https://en.wikipedia.org/wiki/Abstract_index_notation) and specifies which indices are connected by the evaluation map using Einstein's summation conventation. +Indeed, for `BraidingStyle(I) == Bosonic()`, such a tensor contraction can take the same format as if all tensors were just multi-dimensional arrays. +For this, we rely on the interface provided by the package [TensorOperations.jl](https://github.com/QuantumKitHub/TensorOperations.jl). + +The above picture would be encoded as +```julia +@tensor E[a, b, c, d, e] := A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z] +``` +or +```julia +@tensor E[:] := A[1, 2, -4, 3] * B[4, 5, -3, 3] * C[1, -5, 4, -2] * D[-1, 2, 5] +``` +where the latter syntax is known as NCON-style, and labels the unconnected or outgoing indices with negative integers, and the contracted indices with positive integers. + +A number of remarks are in order. 
+TensorOperations.jl accepts both integers and any valid variable name as dummy label for indices, and everything in between `[ ]` is not resolved in the current context but interpreted as a dummy label.
+Here, we label the indices of a `TensorMap`, like `A::TensorMap{T, S, N₁, N₂}`, in a linear fashion, where the first position corresponds to the first space in `codomain(A)`, and so forth, up to position `N₁`.
+Index `N₁ + 1` then corresponds to the first space in `domain(A)`.
+However, because we have applied the coevaluation ``η``, it actually corresponds to the corresponding dual space, in accordance with the interface of [`space(A, i)`](@ref) that we introduced [above](@ref ss_tensor_properties), and as indicated by the dotted box around ``A`` in the above picture.
+The same holds for the other tensor maps.
+Note that our convention also requires that we braid indices that we brought from the domain to the codomain, and so this is only unambiguous for a symmetric braiding, where there is a unique way to permute the indices.
+
+With the current syntax, we create a new object `E` because we use the definition operator `:=`.
+Furthermore, with the current syntax, it will be a `Tensor`, i.e. it will have a trivial domain, and correspond to the dotted box in the picture above, rather than the actual morphism `E`.
+We can also directly define `E` with the correct codomain and domain by rather using
+```julia
+@tensor E[a b c;d e] := A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z]
+```
+or
+```julia
+@tensor E[(a, b, c);(d, e)] := A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z]
+```
+where the latter syntax can also be used when the codomain is empty.
+When using the assignment operator `=`, the `TensorMap` `E` is assumed to exist and the contents will be written to the currently allocated memory.
+Note that for existing tensors, both on the left hand side and right hand side, trying to specify the indices in the domain and the codomain seperately using the above syntax, has no effect, as the bipartition of indices are already fixed by the existing object. +Hence, if `E` has been created by the previous line of code, all of the following lines are now equivalent +```julia +@tensor E[(a, b, c);(d, e)] = A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z] +@tensor E[a, b, c, d, e] = A[v w d; x] * B[(y, z, c); (x, )] * C[v e y; b] * D[a, w, z] +@tensor E[a b; c d e] = A[v; w d x] * B[y, z, c, x] * C[v, e, y, b] * D[a w; z] +``` +and none of those will or can change the partition of the indices of `E` into its codomain and its domain. + +Two final remarks are in order. +Firstly, the order of the tensors appearing on the right hand side is irrelevant, as we can reorder them by using the allowed moves of the Penrose graphical calculus, which yields some crossings and a twist. +As the latter is trivial, it can be omitted, and we just use the same rules to evaluate the newly ordered tensor network. +For the particular case of matrix-matrix multiplication, which also captures more general settings by appropriotely combining spaces into a single line, we indeed find + +```@raw html +tensor contraction reorder +``` + +or thus, the following two lines of code yield the same result +```julia +@tensor C[i, j] := B[i, k] * A[k, j] +@tensor C[i, j] := A[k, j] * B[i, k] +``` +Reordering of tensors can be used internally by the `@tensor` macro to evaluate the contraction in a more efficient manner. +In particular, the NCON-style of specifying the contraction gives the user control over the order, and there are other macros, such as `@tensoropt`, that try to automate this process. 
+There is also an `@ncon` macro and `ncon` function, and we recommend reading the [manual of TensorOperations.jl](https://quantumkithub.github.io/TensorOperations.jl/stable/) to learn more about the possibilities and how they work.
+
+A final remark involves the use of adjoints of tensors.
+The current framework is such that the user should not be too worried about the actual bipartition into codomain and domain of a given `TensorMap` instance.
+Indeed, for tensor contractions the `@tensor` macro figures out the correct manipulations automatically.
+However, when wanting to use the `adjoint` of an instance `t::TensorMap{T, S, N₁, N₂}`, the resulting `adjoint(t)` is an `AbstractTensorMap{T, S, N₂, N₁}` and one needs to know the values of `N₁` and `N₂` to know exactly where the `i`th index of `t` will end up in `adjoint(t)`, and hence the index order of `t'`.
+Within the `@tensor` macro, one can instead use `conj()` on the whole index expression so as to be able to use the original index ordering of `t`.
+For example, for `TensorMap{T, S, 1, 1}` instances, this yields exactly the equivalence one expects, namely one between the following two expressions:
+
+```julia
+@tensor C[i, j] := B'[i, k] * A[k, j]
+@tensor C[i, j] := conj(B[k, i]) * A[k, j]
+```
+
+For e.g. an instance `A::TensorMap{T, S, 3, 2}`, the following two syntaxes have the same effect within an `@tensor` expression: `conj(A[a, b, c, d, e])` and `A'[d, e, a, b, c]`.
+
+## Fermionic tensor contractions
+
+TODO
+
+## Anyonic tensor contractions
+
+TODO
diff --git a/docs/src/man/factorizations.md b/docs/src/man/factorizations.md
new file mode 100644
index 000000000..2fbd8b382
--- /dev/null
+++ b/docs/src/man/factorizations.md
@@ -0,0 +1,46 @@
+# [Tensor factorizations](@id ss_tensor_factorization)
+
+```@setup tensors
+using TensorKit
+using LinearAlgebra
+```
+
+As tensors are linear maps, they support various kinds of factorizations.
+These functions all interpret the provided `AbstractTensorMap` instances as a map from `domain` to `codomain`, which can be thought of as reshaping the tensor into a matrix according to the current bipartition of the indices. + +TensorKit's factorizations are provided by [MatrixAlgebraKit.jl](https://github.com/QuantumKitHub/MatrixAlgebraKit.jl), which is used to supply both the interface, as well as the implementation of the various operations on the blocks of data. +For specific details on the provided functionality, we refer to its [documentation page](https://quantumkithub.github.io/MatrixAlgebraKit.jl/stable/user_interface/decompositions/). + +Finally, note that each of the factorizations takes the current partition of `domain` and `codomain` as the *axis* along which to matricize and perform the factorization. +In order to obtain factorizations according to a different bipartition of the indices, we can use any of the previously mentioned [index manipulations](@ref s_indexmanipulations) before the factorization. 
+ +Some examples to conclude this section +```@repl tensors +V1 = SU₂Space(0 => 2, 1/2 => 1) +V2 = SU₂Space(0 => 1, 1/2 => 1, 1 => 1) + +t = randn(V1 ⊗ V1, V2); +U, S, Vh = svd_compact(t); +t ≈ U * S * Vh +D, V = eigh_full(t' * t); +D ≈ S * S +U' * U ≈ id(domain(U)) +S + +Q, R = left_orth(t; alg = :svd); +Q' * Q ≈ id(domain(Q)) +t ≈ Q * R + +U2, S2, Vh2, ε = svd_trunc(t; trunc = truncspace(V1)); +Vh2 * Vh2' ≈ id(codomain(Vh2)) +S2 +ε ≈ norm(block(S, Irrep[SU₂](1))) * sqrt(dim(Irrep[SU₂](1))) + +L, Q = right_orth(permute(t, ((1,), (2, 3)))); +codomain(L), domain(L), domain(Q) +Q * Q' +P = Q' * Q; +P ≈ P * P +t′ = permute(t, ((1,), (2, 3))); +t′ ≈ t′ * P +``` diff --git a/docs/src/man/indexmanipulations.md b/docs/src/man/indexmanipulations.md index bc5cc331e..8379aac3d 100644 --- a/docs/src/man/indexmanipulations.md +++ b/docs/src/man/indexmanipulations.md @@ -65,7 +65,7 @@ They form a hierarchy from most general to most restricted: - [`braid`](@ref) is the most general: it accepts any permutation and requires a `levels` argument — a tuple of heights, one per index — that determines whether each index crosses over or under the others it has to pass. - [`permute`](@ref) is a simpler interface for sector types with a symmetric braiding (`BraidingStyle(I) isa SymmetricBraiding`), where over- and under-crossings are equivalent and `levels` is therefore not needed. -- [`transpose`](@ref) is restricted to *cyclic* permutations (indices do not cross). Unlike `braid`, it introduces a compensating (inverse) twist to satisfy the categorical definition of transpose, as illustrated below: +- [`transpose`](@ref) is restricted to *cyclic* permutations (indices do not cross). - [`repartition`](@ref) only moves the codomain/domain boundary without reordering the indices at all. 
For plain tensors (`sectortype(t) == Trivial`), `permute` and `braid` act like `permutedims` on the underlying array: diff --git a/docs/src/man/linearalgebra.md b/docs/src/man/linearalgebra.md new file mode 100644 index 000000000..98c9f489a --- /dev/null +++ b/docs/src/man/linearalgebra.md @@ -0,0 +1,85 @@ +# [Basic linear algebra](@id ss_tensor_linalg) + +```@setup tensors +using TensorKit +using LinearAlgebra +``` + +`AbstractTensorMap` instances `t` represent linear maps, i.e. homomorphisms in a `𝕜`-linear category, just like matrices. +To a large extent, they follow the interface of `Matrix` in Julia's `LinearAlgebra` standard library. +Many methods from `LinearAlgebra` are (re)exported by TensorKit.jl, and can then us be used without `using LinearAlgebra` explicitly. +In all of the following methods, the implementation acts directly on the underlying matrix blocks (typically using the same method) and never needs to perform any basis transforms. + +In particular, `AbstractTensorMap` instances can be composed, provided the domain of the first object coincides with the codomain of the second. +Composing tensor maps uses the regular multiplication symbol as in `t = t1 * t2`, which is also used for matrix multiplication. +TensorKit.jl also supports (and exports) the mutating method `mul!(t, t1, t2)`. +We can then also try to invert a tensor map using `inv(t)`, though this can only exist if the domain and codomain are isomorphic, which can e.g. be checked as `fuse(codomain(t)) == fuse(domain(t))`. +If the inverse is composed with another tensor `t2`, we can use the syntax `t1 \ t2` or `t2 / t1`. +However, this syntax also accepts instances `t1` whose domain and codomain are not isomorphic, and then amounts to `pinv(t1)`, the Moore-Penrose pseudoinverse. +This, however, is only really justified as minimizing the least squares problem if `InnerProductStyle(t) <: EuclideanProduct`. + +`AbstractTensorMap` instances behave themselves as vectors (i.e. 
they are `𝕜`-linear) and so they can be multiplied by scalars and, if they live in the same space, i.e. have the same domain and codomain, they can be added to each other.
+There is also a `zero(t)`, the additive identity, which produces a zero tensor with the same domain and codomain as `t`.
+In addition, `TensorMap` supports basic Julia methods such as `fill!` and `copy!`, as well as `copy(t)` to create a copy with independent data.
+Aside from basic `+` and `*` operations, TensorKit.jl reexports a number of efficient in-place methods from `LinearAlgebra`, such as `axpy!` (for `y ← α * x + y`), `axpby!` (for `y ← α * x + β * y`), `lmul!` and `rmul!` (for `y ← α * y` and `y ← y * α`, which is typically the same) and `mul!`, which can also be used for out-of-place scalar multiplication `y ← α * x`.
+
+For `S = spacetype(t)` where `InnerProductStyle(S) <: EuclideanProduct`, we can compute `norm(t)`, and for two such instances, the inner product `dot(t1, t2)`, provided `t1` and `t2` have the same domain and codomain.
+Furthermore, there is `normalize(t)` and `normalize!(t)` to return a scaled version of `t` with unit norm.
+These operations should also exist for `InnerProductStyle(S) <: HasInnerProduct`, but require an interface for defining a custom inner product in these spaces.
+Currently, there is no concrete subtype of `HasInnerProduct` that is not an `EuclideanProduct`.
+In particular, `CartesianSpace`, `ComplexSpace` and `GradedSpace` all have `InnerProductStyle(S) <: EuclideanProduct`.
+
+With tensors that have `InnerProductStyle(t) <: EuclideanProduct` there is associated an adjoint operation, given by `adjoint(t)` or simply `t'`, such that `domain(t') == codomain(t)` and `codomain(t') == domain(t)`.
+Note that for an instance `t::TensorMap{T, S, N₁, N₂}`, `t'` is simply stored in a wrapper called `AdjointTensorMap{T, S, N₂, N₁}`, which is another subtype of `AbstractTensorMap`.
+This should be mostly invisible to the user, as all methods should work for this type as well. +It can be hard to reason about the index order of `t'`, i.e. index `i` of `t` appears in `t'` at index position `j = TensorKit.adjointtensorindex(t, i)`, where the latter method is typically not necessary and hence unexported. +There is also a plural `TensorKit.adjointtensorindices` to convert multiple indices at once. +Note that, because the adjoint interchanges domain and codomain, we have `space(t', j) == space(t, i)'`. + +`AbstractTensorMap` instances can furthermore be tested for exact (`t1 == t2`) or approximate (`t1 ≈ t2`) equality, though the latter requires that `norm` can be computed. + +When tensor map instances are endomorphisms, i.e. they have the same domain and codomain, there is a multiplicative identity which can be obtained as `one(t)` or `one!(t)`, where the latter overwrites the contents of `t`. +The multiplicative identity on a space `V` can also be obtained using `id(A, V)` as discussed [above](@ref ss_tensor_construction), such that for a general homomorphism `t′`, we have `t′ == id(codomain(t′)) * t′ == t′ * id(domain(t′))`. +Returning to the case of endomorphisms `t`, we can compute the trace via `tr(t)` and exponentiate them using `exp(t)`, or if the contents of `t` can be destroyed in the process, `exp!(t)`. +Furthermore, there are a number of tensor factorizations for both endomorphisms and general homomorphisms that we discuss on the [Tensor factorizations](@ref ss_tensor_factorization) page. + +Finally, there are a number of operations that also belong in this paragraph because of their analogy to common matrix operations. +The tensor product of two `TensorMap` instances `t1` and `t2` is obtained as `t1 ⊗ t2` and results in a new `TensorMap` with `codomain(t1 ⊗ t2) = codomain(t1) ⊗ codomain(t2)` and `domain(t1 ⊗ t2) = domain(t1) ⊗ domain(t2)`. 
+If we have two `TensorMap{T, S, N, 1}` instances `t1` and `t2` with the same codomain, we can combine them in a way that is analogous to `hcat`, i.e. we stack them such that the new tensor `catdomain(t1, t2)` has also the same codomain, but has a domain which is `domain(t1) ⊕ domain(t2)`. +Similarly, if `t1` and `t2` are of type `TensorMap{T, S, 1, N}` and have the same domain, the operation `catcodomain(t1, t2)` results in a new tensor with the same domain and a codomain given by `codomain(t1) ⊕ codomain(t2)`, which is the analogy of `vcat`. +Note that direct sum only makes sense between `ElementarySpace` objects, i.e. there is no way to give a tensor product meaning to a direct sum of tensor product spaces. + +Time for some more examples: +```@repl tensors +using TensorKit # hide +V1 = ℂ^2 +t = randn(V1 ← V1 ⊗ V1 ⊗ V1) +t == t + zero(t) == t * id(domain(t)) == id(codomain(t)) * t +t2 = randn(ComplexF64, codomain(t), domain(t)); +dot(t2, t) +tr(t2' * t) +dot(t2, t) ≈ dot(t', t2') +dot(t2, t2) +norm(t2)^2 +t3 = copy!(similar(t, ComplexF64), t); +t3 == t +rmul!(t3, 0.8); +t3 ≈ 0.8 * t +axpby!(0.5, t2, 1.3im, t3); +t3 ≈ 0.5 * t2 + 0.8 * 1.3im * t +t4 = randn(fuse(codomain(t)), codomain(t)); +t5 = TensorMap{Float64}(undef, fuse(codomain(t)), domain(t)); +mul!(t5, t4, t) == t4 * t +inv(t4) * t4 ≈ id(codomain(t)) +t4 * inv(t4) ≈ id(fuse(codomain(t))) +t4 \ (t4 * t) ≈ t +t6 = randn(ComplexF64, V1, codomain(t)); +numout(t4) == numout(t6) == 1 +t7 = catcodomain(t4, t6); +foreach(println, (codomain(t4), codomain(t6), codomain(t7))) +norm(t7) ≈ sqrt(norm(t4)^2 + norm(t6)^2) +t8 = t4 ⊗ t6; +foreach(println, (codomain(t4), codomain(t6), codomain(t8))) +foreach(println, (domain(t4), domain(t6), domain(t8))) +norm(t8) ≈ norm(t4)*norm(t6) +``` diff --git a/docs/src/man/tensormanipulations.md b/docs/src/man/tensormanipulations.md deleted file mode 100644 index 2f238f963..000000000 --- a/docs/src/man/tensormanipulations.md +++ /dev/null @@ -1,238 +0,0 @@ -# [Manipulating 
tensors](@id s_tensormanipulations) - -## [Vector space and linear algebra operations](@id ss_tensor_linalg) - -`AbstractTensorMap` instances `t` represent linear maps, i.e. homomorphisms in a `𝕜`-linear category, just like matrices. -To a large extent, they follow the interface of `Matrix` in Julia's `LinearAlgebra` standard library. -Many methods from `LinearAlgebra` are (re)exported by TensorKit.jl, and can then us be used without `using LinearAlgebra` explicitly. -In all of the following methods, the implementation acts directly on the underlying matrix blocks (typically using the same method) and never needs to perform any basis transforms. - -In particular, `AbstractTensorMap` instances can be composed, provided the domain of the first object coincides with the codomain of the second. -Composing tensor maps uses the regular multiplication symbol as in `t = t1 * t2`, which is also used for matrix multiplication. -TensorKit.jl also supports (and exports) the mutating method `mul!(t, t1, t2)`. -We can then also try to invert a tensor map using `inv(t)`, though this can only exist if the domain and codomain are isomorphic, which can e.g. be checked as `fuse(codomain(t)) == fuse(domain(t))`. -If the inverse is composed with another tensor `t2`, we can use the syntax `t1 \ t2` or `t2 / t1`. -However, this syntax also accepts instances `t1` whose domain and codomain are not isomorphic, and then amounts to `pinv(t1)`, the Moore-Penrose pseudoinverse. -This, however, is only really justified as minimizing the least squares problem if `InnerProductStyle(t) <: EuclideanProduct`. - -`AbstractTensorMap` instances behave themselves as vectors (i.e. they are `𝕜`-linear) and so they can be multiplied by scalars and, if they live in the same space, i.e. have the same domain and codomain, they can be added to each other. -There is also a `zero(t)`, the additive identity, which produces a zero tensor with the same domain and codomain as `t`. 
-In addition, `TensorMap` supports basic Julia methods such as `fill!` and `copy!`, as well as `copy(t)` to create a copy with independent data. -Aside from basic `+` and `*` operations, TensorKit.jl reexports a number of efficient in-place methods from `LinearAlgebra`, such as `axpy!` (for `y ← α * x + y`), `axpby!` (for `y ← α * x + β * y`), `lmul!` and `rmul!` (for `y ← α * y` and `y ← y * α`, which is typically the same) and `mul!`, which can also be used for out-of-place scalar multiplication `y ← α * x`. - -For `S = spacetype(t)` where `InnerProductStyle(S) <: EuclideanProduct`, we can compute `norm(t)`, and for two such instances, the inner product `dot(t1, t2)`, provided `t1` and `t2` have the same domain and codomain. -Furthermore, there is `normalize(t)` and `normalize!(t)` to return a scaled version of `t` with unit norm. -These operations should also exist for `InnerProductStyle(S) <: HasInnerProduct`, but require an interface for defining a custom inner product in these spaces. -Currently, there is no concrete subtype of `HasInnerProduct` that is not an `EuclideanProduct`. -In particular, `CartesianSpace`, `ComplexSpace` and `GradedSpace` all have `InnerProductStyle(S) <: EuclideanProduct`. - -With tensors that have `InnerProductStyle(t) <: EuclideanProduct` there is associated an adjoint operation, given by `adjoint(t)` or simply `t'`, such that `domain(t') == codomain(t)` and `codomain(t') == domain(t)`. -Note that for an instance `t::TensorMap{S, N₁, N₂}`, `t'` is simply stored in a wrapper called `AdjointTensorMap{S, N₂, N₁}`, which is another subtype of `AbstractTensorMap`. -This should be mostly invisible to the user, as all methods should work for this type as well. -It can be hard to reason about the index order of `t'`, i.e. index `i` of `t` appears in `t'` at index position `j = TensorKit.adjointtensorindex(t, i)`, where the latter method is typically not necessary and hence unexported. 
-There is also a plural `TensorKit.adjointtensorindices` to convert multiple indices at once. -Note that, because the adjoint interchanges domain and codomain, we have `space(t', j) == space(t, i)'`. - -`AbstractTensorMap` instances can furthermore be tested for exact (`t1 == t2`) or approximate (`t1 ≈ t2`) equality, though the latter requires that `norm` can be computed. - -When tensor map instances are endomorphisms, i.e. they have the same domain and codomain, there is a multiplicative identity which can be obtained as `one(t)` or `one!(t)`, where the latter overwrites the contents of `t`. -The multiplicative identity on a space `V` can also be obtained using `id(A, V)` as discussed [above](@ref ss_tensor_construction), such that for a general homomorphism `t′`, we have `t′ == id(codomain(t′)) * t′ == t′ * id(domain(t′))`. -Returning to the case of endomorphisms `t`, we can compute the trace via `tr(t)` and exponentiate them using `exp(t)`, or if the contents of `t` can be destroyed in the process, `exp!(t)`. -Furthermore, there are a number of tensor factorizations for both endomorphisms and general homomorphism that we discuss below. - -Finally, there are a number of operations that also belong in this paragraph because of their analogy to common matrix operations. -The tensor product of two `TensorMap` instances `t1` and `t2` is obtained as `t1 ⊗ t2` and results in a new `TensorMap` with `codomain(t1 ⊗ t2) = codomain(t1) ⊗ codomain(t2)` and `domain(t1 ⊗ t2) = domain(t1) ⊗ domain(t2)`. -If we have two `TensorMap{T, S, N, 1}` instances `t1` and `t2` with the same codomain, we can combine them in a way that is analogous to `hcat`, i.e. we stack them such that the new tensor `catdomain(t1, t2)` has also the same codomain, but has a domain which is `domain(t1) ⊕ domain(t2)`. 
-Similarly, if `t1` and `t2` are of type `TensorMap{T, S, 1, N}` and have the same domain, the operation `catcodomain(t1, t2)` results in a new tensor with the same domain and a codomain given by `codomain(t1) ⊕ codomain(t2)`, which is the analogy of `vcat`. -Note that direct sum only makes sense between `ElementarySpace` objects, i.e. there is no way to give a tensor product meaning to a direct sum of tensor product spaces. - -Time for some more examples: -```@repl tensors -using TensorKit # hide -V1 = ℂ^2 -t = randn(V1 ← V1 ⊗ V1 ⊗ V1) -t == t + zero(t) == t * id(domain(t)) == id(codomain(t)) * t -t2 = randn(ComplexF64, codomain(t), domain(t)); -dot(t2, t) -tr(t2' * t) -dot(t2, t) ≈ dot(t', t2') -dot(t2, t2) -norm(t2)^2 -t3 = copy!(similar(t, ComplexF64), t); -t3 == t -rmul!(t3, 0.8); -t3 ≈ 0.8 * t -axpby!(0.5, t2, 1.3im, t3); -t3 ≈ 0.5 * t2 + 0.8 * 1.3im * t -t4 = randn(fuse(codomain(t)), codomain(t)); -t5 = TensorMap{Float64}(undef, fuse(codomain(t)), domain(t)); -mul!(t5, t4, t) == t4 * t -inv(t4) * t4 ≈ id(codomain(t)) -t4 * inv(t4) ≈ id(fuse(codomain(t))) -t4 \ (t4 * t) ≈ t -t6 = randn(ComplexF64, V1, codomain(t)); -numout(t4) == numout(t6) == 1 -t7 = catcodomain(t4, t6); -foreach(println, (codomain(t4), codomain(t6), codomain(t7))) -norm(t7) ≈ sqrt(norm(t4)^2 + norm(t6)^2) -t8 = t4 ⊗ t6; -foreach(println, (codomain(t4), codomain(t6), codomain(t8))) -foreach(println, (domain(t4), domain(t6), domain(t8))) -norm(t8) ≈ norm(t4)*norm(t6) -``` - -## [Index manipulations](@id ss_indexmanipulation) - -Index manipulations are operations that reorganize the bipartition of indices between the codomain and domain, possibly also reordering them or applying braiding isomorphisms. -They are covered in detail on a dedicated page: [Index manipulations](@ref s_indexmanipulations). - -## [Tensor factorizations](@id ss_tensor_factorization) - -As tensors are linear maps, they suport various kinds of factorizations. 
-These functions all interpret the provided `AbstractTensorMap` instances as a map from `domain` to `codomain`, which can be thought of as reshaping the tensor into a matrix according to the current bipartition of the indices. - -TensorKit's factorizations are provided by [MatrixAlgebraKit.jl](https://github.com/QuantumKitHub/MatrixAlgebraKit.jl), which is used to supply both the interface, as well as the implementation of the various operations on the blocks of data. -For specific details on the provided functionality, we refer to its [documentation page](https://quantumkithub.github.io/MatrixAlgebraKit.jl/stable/user_interface/decompositions/). - -Finally, note that each of the factorizations takes the current partition of `domain` and `codomain` as the *axis* along which to matricize and perform the factorization. -In order to obtain factorizations according to a different bipartition of the indices, we can use any of the previously mentioned [index manipulations](@ref ss_indexmanipulation) before the factorization. - -Some examples to conclude this section -```@repl tensors -V1 = SU₂Space(0 => 2, 1/2 => 1) -V2 = SU₂Space(0 => 1, 1/2 => 1, 1 => 1) - -t = randn(V1 ⊗ V1, V2); -U, S, Vh = svd_compact(t); -t ≈ U * S * Vh -D, V = eigh_full(t' * t); -D ≈ S * S -U' * U ≈ id(domain(U)) -S - -Q, R = left_orth(t; alg = :svd); -Q' * Q ≈ id(domain(Q)) -t ≈ Q * R - -U2, S2, Vh2, ε = svd_trunc(t; trunc = truncspace(V1)); -Vh2 * Vh2' ≈ id(codomain(Vh2)) -S2 -ε ≈ norm(block(S, Irrep[SU₂](1))) * sqrt(dim(Irrep[SU₂](1))) - -L, Q = right_orth(permute(t, ((1,), (2, 3)))); -codomain(L), domain(L), domain(Q) -Q * Q' -P = Q' * Q; -P ≈ P * P -t′ = permute(t, ((1,), (2, 3))); -t′ ≈ t′ * P -``` - -## [Bosonic tensor contractions and tensor networks](@id ss_tensor_contraction) - -One of the most important operation with tensor maps is to compose them, more generally known as contracting them. 
-As mentioned in the section on [category theory](@ref s_categories), a typical composition of maps in a ribbon category can graphically be represented as a planar arrangement of the morphisms (i.e. tensor maps, boxes with lines eminating from top and bottom, corresponding to source and target, i.e. domain and codomain), where the lines connecting the source and targets of the different morphisms should be thought of as ribbons, that can braid over or underneath each other, and that can twist. -Technically, we can embed this diagram in ``ℝ × [0,1]`` and attach all the unconnected line endings corresponding objects in the source at some position ``(x,0)`` for ``x∈ℝ``, and all line endings corresponding to objects in the target at some position ``(x,1)``. -The resulting morphism is then invariant under what is known as *framed three-dimensional isotopy*, i.e. three-dimensional rearrangements of the morphism that respect the rules of boxes connected by ribbons whose open endings are kept fixed. -Such a two-dimensional diagram cannot easily be encoded in a single line of code. - -However, things simplify when the braiding is symmetric (such that over- and under- crossings become equivalent, i.e. just crossings), and when twists, i.e. self-crossings in this case, are trivial. -This amounts to `BraidingStyle(I) == Bosonic()` in the language of TensorKit.jl, and is true for any subcategory of ``\mathbf{Vect}``, i.e. ordinary tensors, possibly with some symmetry constraint. -The case of ``\mathbf{SVect}`` and its subcategories, and more general categories, are discussed below. - -In the case of trivial twists, we can deform the diagram such that we first combine every morphism with a number of coevaluations ``η`` so as to represent it as a tensor, i.e. with a trivial domain. -We can then rearrange the morphism to be all ligned up horizontally, where the original morphism compositions are now being performed by evaluations ``ϵ``. 
-This process will generate a number of crossings and twists, where the latter can be omitted because they act trivially. -Similarly, double crossings can also be omitted. -As a consequence, the diagram, or the morphism it represents, is completely specified by the tensors it is composed of, and which indices between the different tensors are connect, via the evaluation ``ϵ``, and which indices make up the source and target of the resulting morphism. -If we also compose the resulting morphisms with coevaluations so that it has a trivial domain, we just have one type of unconnected lines, henceforth called open indices. -We sketch such a rearrangement in the following picture - -```@raw html -tensor unitary -``` - -Hence, we can now specify such a tensor diagram, henceforth called a tensor contraction or also tensor network, using a one-dimensional syntax that mimicks [abstract index notation](https://en.wikipedia.org/wiki/Abstract_index_notation) and specifies which indices are connected by the evaluation map using Einstein's summation conventation. -Indeed, for `BraidingStyle(I) == Bosonic()`, such a tensor contraction can take the same format as if all tensors were just multi-dimensional arrays. -For this, we rely on the interface provided by the package [TensorOperations.jl](https://github.com/QuantumKitHub/TensorOperations.jl). - -The above picture would be encoded as -```julia -@tensor E[a, b, c, d, e] := A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z] -``` -or -```julia -@tensor E[:] := A[1, 2, -4, 3] * B[4, 5, -3, 3] * C[1, -5, 4, -2] * D[-1, 2, 5] -``` -where the latter syntax is known as NCON-style, and labels the unconnected or outgoing indices with negative integers, and the contracted indices with positive integers. - -A number of remarks are in order. 
-TensorOperations.jl accepts both integers and any valid variable name as dummy label for indices, and everything in between `[ ]` is not resolved in the current context but interpreted as a dummy label. -Here, we label the indices of a `TensorMap`, like `A::TensorMap{T, S, N₁, N₂}`, in a linear fashion, where the first position corresponds to the first space in `codomain(A)`, and so forth, up to position `N₁`. -Index `N₁ + 1` then corresponds to the first space in `domain(A)`. -However, because we have applied the coevaluation ``η``, it actually corresponds to the corresponding dual space, in accordance with the interface of [`space(A, i)`](@ref) that we introduced [above](@ref ss_tensor_properties), and as indiated by the dotted box around ``A`` in the above picture. -The same holds for the other tensor maps. -Note that our convention also requires that we braid indices that we brought from the domain to the codomain, and so this is only unambiguous for a symmetric braiding, where there is a unique way to permute the indices. - -With the current syntax, we create a new object `E` because we use the definition operator `:=`. -Furthermore, with the current syntax, it will be a `Tensor`, i.e. it will have a trivial domain, and correspond to the dotted box in the picture above, rather than the actual morphism `E`. -We can also directly define `E` with the correct codomain and domain by rather using -```julia -@tensor E[a b c;d e] := A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z] -``` -or -```julia -@tensor E[(a, b, c);(d, e)] := A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z] -``` -where the latter syntax can also be used when the codomain is empty. -When using the assignment operator `=`, the `TensorMap` `E` is assumed to exist and the contents will be written to the currently allocated memory. 
-Note that for existing tensors, both on the left hand side and right hand side, trying to specify the indices in the domain and the codomain seperately using the above syntax, has no effect, as the bipartition of indices are already fixed by the existing object. -Hence, if `E` has been created by the previous line of code, all of the following lines are now equivalent -```julia -@tensor E[(a, b, c);(d, e)] = A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z] -@tensor E[a, b, c, d, e] = A[v w d; x] * B[(y, z, c); (x, )] * C[v e y; b] * D[a, w, z] -@tensor E[a b; c d e] = A[v; w d x] * B[y, z, c, x] * C[v, e, y, b] * D[a w; z] -``` -and none of those will or can change the partition of the indices of `E` into its codomain and its domain. - -Two final remarks are in order. -Firstly, the order of the tensors appearing on the right hand side is irrelevant, as we can reorder them by using the allowed moves of the Penrose graphical calculus, which yields some crossings and a twist. -As the latter is trivial, it can be omitted, and we just use the same rules to evaluate the newly ordered tensor network. -For the particular case of matrix-matrix multiplication, which also captures more general settings by appropriotely combining spaces into a single line, we indeed find - -```@raw html -tensor contraction reorder -``` - -or thus, the following two lines of code yield the same result -```julia -@tensor C[i, j] := B[i, k] * A[k, j] -@tensor C[i, j] := A[k, j] * B[i, k] -``` -Reordering of tensors can be used internally by the `@tensor` macro to evaluate the contraction in a more efficient manner. -In particular, the NCON-style of specifying the contraction gives the user control over the order, and there are other macros, such as `@tensoropt`, that try to automate this process. 
-There is also an `@ncon` macro and `ncon` function, an we recommend reading the [manual of TensorOperations.jl](https://quantumkithub.github.io/TensorOperations.jl/stable/) to learn more about the possibilities and how they work. - -A final remark involves the use of adjoints of tensors. -The current framework is such that the user should not be too worried about the actual bipartition into codomain and domain of a given `TensorMap` instance. -Indeed, for tensor contractions the `@tensor` macro figures out the correct manipulations automatically. -However, when wanting to use the `adjoint` of an instance `t::TensorMap{T, S, N₁, N₂}`, the resulting `adjoint(t)` is an `AbstractTensorMap{T, S, N₂, N₁}` and one needs to know the values of `N₁` and `N₂` to know exactly where the `i`th index of `t` will end up in `adjoint(t)`, and hence the index order of `t'`. -Within the `@tensor` macro, one can instead use `conj()` on the whole index expression so as to be able to use the original index ordering of `t`. -For example, for `TensorMap{T, S, 1, 1}` instances, this yields exactly the equivalence one expects, namely one between the following two expressions: - -```julia -@tensor C[i, j] := B'[i, k] * A[k, j] -@tensor C[i, j] := conj(B[k, i]) * A[k, j] -``` - -For e.g. an instance `A::TensorMap{T, S, 3, 2}`, the following two syntaxes have the same effect within an `@tensor` expression: `conj(A[a, b, c, d, e])` and `A'[d, e, a, b, c]`. - -Some examples: - -## Fermionic tensor contractions - -TODO - -## Anyonic tensor contractions - -TODO diff --git a/docs/src/man/tensors.md b/docs/src/man/tensors.md index 2921e5f1b..155dbbe4d 100644 --- a/docs/src/man/tensors.md +++ b/docs/src/man/tensors.md @@ -5,7 +5,7 @@ using TensorKit using LinearAlgebra ``` -This last page explains how to create and manipulate tensors in TensorKit.jl. +This page explains how to construct and access tensors in TensorKit.jl. 
As this is probably the most important part of the manual, we will also focus more strongly on the usage and interface, and less so on the underlying implementation. The only aspect of the implementation that we will address is the storage of the tensor data, as this is important to know how to create and initialize a tensor, but will in fact also shed light on how some of the methods work. From 5d9f28118f4c28b5278fe304b89b68fe3394d033 Mon Sep 17 00:00:00 2001 From: lkdvos Date: Sun, 26 Apr 2026 17:32:19 -0400 Subject: [PATCH 22/23] update `adapt_transformer` --- ext/TensorKitAMDGPUExt/roctensormap.jl | 9 +-------- ext/TensorKitCUDAExt/cutensormap.jl | 9 +-------- src/tensors/indexmanipulations.jl | 16 +++++++++++++--- src/tensors/treetransformers.jl | 9 --------- 4 files changed, 15 insertions(+), 28 deletions(-) diff --git a/ext/TensorKitAMDGPUExt/roctensormap.jl b/ext/TensorKitAMDGPUExt/roctensormap.jl index c65f91062..56dea938e 100644 --- a/ext/TensorKitAMDGPUExt/roctensormap.jl +++ b/ext/TensorKitAMDGPUExt/roctensormap.jl @@ -163,11 +163,4 @@ for f in (:sqrt, :log, :asin, :acos, :acosh, :atanh, :acoth) end end -function TensorKit.adapt_transformer( - t::TensorKit.GenericTreeTransformer, data::ROCVector - ) - new_data = map(t.data) do (U, structs_dst, structs_src) - return AMDGPU.Adapt.adapt(ROCArray, U), structs_dst, structs_src - end - return TensorKit.GenericTreeTransformer(new_data) -end +TensorKit.adapt_transformer(U::AbstractMatrix, ::Type{A}) where {A <: ROCVector} = AMDGPU.Adapt.adapt(ROCArray, U) diff --git a/ext/TensorKitCUDAExt/cutensormap.jl b/ext/TensorKitCUDAExt/cutensormap.jl index 0516d7d15..ed2bb2757 100644 --- a/ext/TensorKitCUDAExt/cutensormap.jl +++ b/ext/TensorKitCUDAExt/cutensormap.jl @@ -169,11 +169,4 @@ for f in (:sqrt, :log, :asin, :acos, :acosh, :atanh, :acoth) end end -function TensorKit.adapt_transformer( - t::TensorKit.GenericTreeTransformer, data::CuVector - ) - new_data = map(t.data) do (U, structs_dst, structs_src) - return 
CUDA.Adapt.adapt(CuArray, U), structs_dst, structs_src
-    end
-    return TensorKit.GenericTreeTransformer(new_data)
-end
+TensorKit.adapt_transformer(U::AbstractMatrix, ::Type{A}) where {A <: CuVector} = CUDA.Adapt.adapt(CuArray, U)
diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl
index 84865d5e4..e66ddf195 100644
--- a/src/tensors/indexmanipulations.jl
+++ b/src/tensors/indexmanipulations.jl
@@ -624,7 +624,8 @@ function add_transform_kernel!(
                         ptriv, false, One(), Zero(), backend, allocator
                     )
                 end
-                mul!(buffer_dst, buffer_src, transpose(StridedView(U)))
+                U′ = adapt_transformer(U, storagetype(tdst))
+                mul!(buffer_dst, buffer_src, transpose(StridedView(U′)))
                 @inbounds for (i, (f₃, f₄)) in enumerate(fusiontrees(dst))
                     TO.tensoradd!(
                         tdst[f₃, f₄], sreshape(buffer_dst[:, i], sz_src),
@@ -661,7 +662,6 @@ function add_transform_kernel!(
         data_dst::DenseVector, data_src::DenseVector, p,
         transformer::GenericTreeTransformer, α, β, backend, allocator, scheduler
     )
-    transformer = adapt_transformer(transformer, data_dst)
     # Each entry covers one fusion block:
     #   U — recoupling matrix (rows = dst trees, cols = src trees)
     #   sz_{dst,src} — array shape of each block (same for all trees in the block)
@@ -697,7 +697,8 @@ function add_transform_kernel!(
 
         # 2. Recoupling: buffer_dst = buffer_src * U^T (each output tree is a linear
         #    combination of input trees weighted by the recoupling coefficients).
-        mul!(buffer_dst, buffer_src, transpose(StridedView(U)))
+        U′ = adapt_transformer(U, typeof(data_dst))
+        mul!(buffer_dst, buffer_src, transpose(StridedView(U′)))
 
         # 3. Insert: scatter column i of buffer_dst into the destination, applying the
         #    actual index permutation p in the same tensoradd! call.
@@ -713,3 +714,12 @@ function add_transform_kernel!(
     TO.allocator_reset!(allocator, cp)
     return nothing
 end
+
+"""
+    adapt_transformer(U::AbstractMatrix, ::Type{A})
+
+Return a version of the basis transformation `U` that is compatible with storage type `A`.
+Default is a no-op. +Backends (e.g. CUDA, AMDGPU) should overload this for their vector types to ensure the recoupling matrix `U` is on the correct device. +""" +adapt_transformer(U::AbstractMatrix, ::Type{A}) where {A} = U diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl index 664f8a0d6..aaecbe746 100644 --- a/src/tensors/treetransformers.jl +++ b/src/tensors/treetransformers.jl @@ -203,12 +203,3 @@ end function _transformer_weight((mat, structs_dst, structs_src)::GenericTransformerData) return length(mat) * prod(structs_dst[1]) end - -""" - adapt_transformer(transformer::TreeTransformer, data::AbstractVector) - -Return a version of `transformer` whose internal arrays are compatible with `data`. -Default is a no-op. Backends (e.g. CUDA, AMDGPU) should overload this for their vector types -to ensure the recoupling matrix `U` inside `GenericTreeTransformer` is on the correct device. -""" -adapt_transformer(t::TreeTransformer, ::AbstractVector) = t From 2c44dca9c94f065f37cfc9a6ad3393a366a12d6d Mon Sep 17 00:00:00 2001 From: lkdvos Date: Tue, 28 Apr 2026 16:40:59 -0400 Subject: [PATCH 23/23] multithreading is hard -- race conditions are easy... 
--- src/tensors/indexmanipulations.jl | 118 +++++++++++++++++------------- src/tensors/treetransformers.jl | 6 ++ 2 files changed, 75 insertions(+), 49 deletions(-) diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index e66ddf195..499f01ae0 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -584,59 +584,68 @@ function add_transform_kernel!( tforeach(fusiontrees(tsrc); scheduler) do (f₁, f₂) (f₁′, f₂′), coeff = transformer((f₁, f₂)) @inbounds TO.tensoradd!( - tdst[f₁′, f₂′], tsrc[f₁, f₂], - p, false, α * coeff, β, backend, allocator + tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, backend, allocator ) end - else - cp = TO.allocator_checkpoint!(allocator) - # Non-Abelian fusion: trees sharing the same set of uncoupled (external) sectors - # form a *fusion block* and mix under the transformation via a recoupling matrix U - # (rows = destination trees, columns = source trees). We iterate over blocks. - tforeach(fusionblocks(tsrc); scheduler) do src - dst, U = transformer(src) - if length(src) == 1 - # Degenerate block: single tree, U is a 1×1 scalar — skip the buffer + matmul. - (f₁, f₂) = only(fusiontrees(src)) - (f₁′, f₂′) = only(fusiontrees(dst)) - @inbounds TO.tensoradd!( - tdst[f₁′, f₂′], tsrc[f₁, f₂], - p, false, α * only(U), β, backend, allocator + return nothing + end + cp = TO.allocator_checkpoint!(allocator) + # Non-Abelian fusion: trees sharing the same set of uncoupled (external) sectors + # form a *fusion block* and mix under the transformation via a recoupling matrix U + # (rows = destination trees, columns = source trees). We iterate over blocks. 
+ + # buffers have to be created without race condition: err on the side of caution + buffersz = 2 * buffersize(transformer) + generate_buffer = let lock = Threads.ReentrantLock(), allocator = allocator + () -> @lock lock TO.tensoralloc(typeof(data_dst), buffersz, Val(true), allocator) + end + + OhMyThreads.@tasks for src in fusionblocks(tsrc) + # setup + OhMyThreads.@set scheduler = scheduler + OhMyThreads.@local buffer = generate_buffer() + + dst, U = transformer(src) + + if length(src) == 1 + # Degenerate block: single tree, U is a 1×1 scalar — skip the buffer + matmul. + (f₁, f₂) = only(fusiontrees(src)) + (f₁′, f₂′) = only(fusiontrees(dst)) + @inbounds TO.tensoradd!( + tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * only(U), β, backend, allocator + ) + else + # Multi-tree block: apply recoupling via a three-step pack → matmul → unpack. + # 1. Extract: flatten each source block into a column of buffer_src + # (shape blocksize × cols), using a trivial permutation so that the + # index layout is canonical before the matmul. + # 2. Recoupling: buffer_dst = buffer_src * U^T (blocksize × rows) + # 3. Insert: scatter columns of buffer_dst to destination blocks, + # applying the actual permutation p in the same step. + rows, cols = size(U) + sz_src = size(tsrc[first(fusiontrees(src))...]) + blocksize = prod(sz_src) + ptriv = (ntuple(identity, length(sz_src)), ()) + buffer_dst = StridedView(buffer, (blocksize, rows), (1, blocksize), 0) + buffer_src = StridedView(buffer, (blocksize, cols), (1, blocksize), blocksize * rows) + @inbounds for (i, (f₁, f₂)) in enumerate(fusiontrees(src)) + TO.tensoradd!( + sreshape(buffer_src[:, i], sz_src), tsrc[f₁, f₂], + ptriv, false, One(), Zero(), backend, allocator ) - else - # Multi-tree block: apply recoupling via a three-step pack → matmul → unpack. - # 1. Extract: flatten each source block into a column of buffer_src - # (shape blocksize × cols), using a trivial permutation so that the - # index layout is canonical before the matmul. 
- # 2. Recoupling: buffer_dst = buffer_src * U^T (blocksize × rows) - # 3. Insert: scatter columns of buffer_dst to destination blocks, - # applying the actual permutation p in the same step. - rows, cols = size(U) - sz_src = size(tsrc[first(fusiontrees(src))...]) - blocksize = prod(sz_src) - ptriv = (ntuple(identity, length(sz_src)), ()) - buffer = TO.tensoralloc(storagetype(tdst), blocksize * (rows + cols), Val(true), allocator) - buffer_dst = StridedView(buffer, (blocksize, rows), (1, blocksize), 0) - buffer_src = StridedView(buffer, (blocksize, cols), (1, blocksize), blocksize * rows) - @inbounds for (i, (f₁, f₂)) in enumerate(fusiontrees(src)) - TO.tensoradd!( - sreshape(buffer_src[:, i], sz_src), tsrc[f₁, f₂], - ptriv, false, One(), Zero(), backend, allocator - ) - end - U′ = adapt_transformer(U, storagetype(tdst)) - mul!(buffer_dst, buffer_src, transpose(StridedView(U′))) - @inbounds for (i, (f₃, f₄)) in enumerate(fusiontrees(dst)) - TO.tensoradd!( - tdst[f₃, f₄], sreshape(buffer_dst[:, i], sz_src), - p, false, α, β, backend, allocator - ) - end - TO.tensorfree!(buffer, allocator) end + U′ = adapt_transformer(U, storagetype(tdst)) + mul!(buffer_dst, buffer_src, transpose(StridedView(U′))) + @inbounds for (i, (f₃, f₄)) in enumerate(fusiontrees(dst)) + TO.tensoradd!( + tdst[f₃, f₄], sreshape(buffer_dst[:, i], sz_src), + p, false, α, β, backend, allocator + ) + end + TO.tensorfree!(buffer, allocator) end - TO.allocator_reset!(allocator, cp) end + TO.allocator_reset!(allocator, cp) return nothing end @@ -667,7 +676,19 @@ function add_transform_kernel!( # sz_{dst,src} — array shape of each block (same for all trees in the block) # structs_{dst,src}[i] — (offset, strides) into the flat data vector for tree i cp = TO.allocator_checkpoint!(allocator) - tforeach(transformer.data; scheduler) do (U, (sz_dst, structs_dst), (sz_src, structs_src)) + + # buffers have to be created without race condition: err on the side of caution + buffersz = 2 * buffersize(transformer) + 
generate_buffer = let lock = Threads.ReentrantLock(), allocator = allocator + () -> @lock lock TO.tensoralloc(typeof(data_dst), buffersz, Val(true), allocator) + end + + OhMyThreads.@tasks for subtransformer in transformer.data + # setup + OhMyThreads.@set scheduler = scheduler + OhMyThreads.@local buffer = generate_buffer() + U, (sz_dst, structs_dst), (sz_src, structs_src) = subtransformer + if length(U) == 1 # Degenerate block with a single tree: no matmul needed. coeff = only(U) @@ -682,7 +703,6 @@ function add_transform_kernel!( rows, cols = size(U) blocksize = prod(sz_src) ptriv = (ntuple(identity, length(sz_src)), ()) - buffer = TO.tensoralloc(typeof(data_dst), blocksize * (rows + cols), Val(true), allocator) buffer_dst = StridedView(buffer, (blocksize, rows), (1, blocksize), 0) buffer_src = StridedView(buffer, (blocksize, cols), (1, blocksize), blocksize * rows) diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl index aaecbe746..f68378cc2 100644 --- a/src/tensors/treetransformers.jl +++ b/src/tensors/treetransformers.jl @@ -203,3 +203,9 @@ end function _transformer_weight((mat, structs_dst, structs_src)::GenericTransformerData) return length(mat) * prod(structs_dst[1]) end + +function buffersize(transformer::GenericTreeTransformer) + return maximum(transformer.data; init = 0) do (basistransform, structures_dst, _) + return prod(structures_dst[1]) * size(basistransform, 1) + end +end