diff --git a/Project.toml b/Project.toml index 07f19efca..5dec19b49 100644 --- a/Project.toml +++ b/Project.toml @@ -55,7 +55,7 @@ Random = "1" ScopedValues = "1.3.0" Strided = "2" TensorKitSectors = "0.3.7" -TensorOperations = "5.1" +TensorOperations = "5.5" TupleTools = "1.5" VectorInterface = "0.4.8, 0.5" cuTENSOR = "6" diff --git a/docs/make.jl b/docs/make.jl index 34b025580..d72b851a6 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -9,7 +9,7 @@ end using Documenter using Random using TensorKit -using TensorKit: FusionTreePair, FusionTreeBlock, Index2Tuple +using TensorKit: FusionTreePair, FusionTreeBlock, Index2Tuple, IndexTuple using TensorKit.TensorKitSectors using TensorKit.MatrixAlgebraKit using DocumenterInterLinks @@ -26,8 +26,14 @@ pages = [ "man/intro.md", "man/tutorial.md", "man/spaces.md", "man/symmetries.md", "man/sectors.md", "man/gradedspaces.md", - "man/fusiontrees.md", "man/tensors.md", - "man/tensormanipulations.md", + "man/fusiontrees.md", + "Tensors" => [ + "man/tensors.md", + "man/linearalgebra.md", + "man/indexmanipulations.md", + "man/factorizations.md", + "man/contractions.md", + ], ], "Library" => [ "lib/sectors.md", "lib/fusiontrees.md", diff --git a/docs/src/lib/fusiontrees.md b/docs/src/lib/fusiontrees.md index 8e037af93..57033eca6 100644 --- a/docs/src/lib/fusiontrees.md +++ b/docs/src/lib/fusiontrees.md @@ -2,6 +2,7 @@ ```@meta CurrentModule = TensorKit +CollapsedDocStrings = true ``` # Type hierarchy diff --git a/docs/src/lib/sectors.md b/docs/src/lib/sectors.md index f56980bf0..8a696c675 100644 --- a/docs/src/lib/sectors.md +++ b/docs/src/lib/sectors.md @@ -2,6 +2,7 @@ ```@meta CurrentModule = TensorKit +CollapsedDocStrings = true ``` ## Type hierarchy diff --git a/docs/src/lib/spaces.md b/docs/src/lib/spaces.md index e5705fe3e..a6cce06a1 100644 --- a/docs/src/lib/spaces.md +++ b/docs/src/lib/spaces.md @@ -2,6 +2,7 @@ ```@meta CurrentModule = TensorKit +CollapsedDocStrings = true ``` ## Type hierarchy diff --git 
a/docs/src/lib/tensors.md b/docs/src/lib/tensors.md index b19537e3f..1ef3da0fb 100644 --- a/docs/src/lib/tensors.md +++ b/docs/src/lib/tensors.md @@ -2,6 +2,7 @@ ```@meta CurrentModule = TensorKit +CollapsedDocStrings = true ``` ## Type hierarchy @@ -184,12 +185,6 @@ repartition! twist! ``` -```@docs -TensorKit.add_permute! -TensorKit.add_braid! -TensorKit.add_transpose! -``` - ### Tensor map composition, traces, contractions and tensor products ```@docs diff --git a/docs/src/man/contractions.md b/docs/src/man/contractions.md new file mode 100644 index 000000000..d70b5681d --- /dev/null +++ b/docs/src/man/contractions.md @@ -0,0 +1,106 @@ +# [Tensor contractions and tensor networks](@id ss_tensor_contraction) + +One of the most important operations with tensor maps is to compose them, more generally known as contracting them. +As mentioned in the section on [category theory](@ref s_categories), a typical composition of maps in a ribbon category can graphically be represented as a planar arrangement of the morphisms (i.e. tensor maps, boxes with lines emanating from top and bottom, corresponding to source and target, i.e. domain and codomain), where the lines connecting the source and targets of the different morphisms should be thought of as ribbons, that can braid over or underneath each other, and that can twist. +Technically, we can embed this diagram in ``ℝ × [0,1]`` and attach all the unconnected line endings corresponding to objects in the source at some position ``(x,0)`` for ``x∈ℝ``, and all line endings corresponding to objects in the target at some position ``(x,1)``. +The resulting morphism is then invariant under what is known as *framed three-dimensional isotopy*, i.e. three-dimensional rearrangements of the morphism that respect the rules of boxes connected by ribbons whose open endings are kept fixed. +Such a two-dimensional diagram cannot easily be encoded in a single line of code.
+ +However, things simplify when the braiding is symmetric (such that over- and under- crossings become equivalent, i.e. just crossings), and when twists, i.e. self-crossings in this case, are trivial. +This amounts to `BraidingStyle(I) == Bosonic()` in the language of TensorKit.jl, and is true for any subcategory of ``\mathbf{Vect}``, i.e. ordinary tensors, possibly with some symmetry constraint. +The case of ``\mathbf{SVect}`` and its subcategories, and more general categories, is discussed below. + +In the case of trivial twists, we can deform the diagram such that we first combine every morphism with a number of coevaluations ``η`` so as to represent it as a tensor, i.e. with a trivial domain. +We can then rearrange the morphism to be all lined up horizontally, where the original morphism compositions are now being performed by evaluations ``ϵ``. +This process will generate a number of crossings and twists, where the latter can be omitted because they act trivially. +Similarly, double crossings can also be omitted. +As a consequence, the diagram, or the morphism it represents, is completely specified by the tensors it is composed of, and which indices between the different tensors are connected, via the evaluation ``ϵ``, and which indices make up the source and target of the resulting morphism. +If we also compose the resulting morphisms with coevaluations so that it has a trivial domain, we just have one type of unconnected lines, henceforth called open indices. +We sketch such a rearrangement in the following picture + +```@raw html +tensor unitary +``` + +Hence, we can now specify such a tensor diagram, henceforth called a tensor contraction or also tensor network, using a one-dimensional syntax that mimics [abstract index notation](https://en.wikipedia.org/wiki/Abstract_index_notation) and specifies which indices are connected by the evaluation map using Einstein's summation convention.
+Indeed, for `BraidingStyle(I) == Bosonic()`, such a tensor contraction can take the same format as if all tensors were just multi-dimensional arrays. +For this, we rely on the interface provided by the package [TensorOperations.jl](https://github.com/QuantumKitHub/TensorOperations.jl). + +The above picture would be encoded as +```julia +@tensor E[a, b, c, d, e] := A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z] +``` +or +```julia +@tensor E[:] := A[1, 2, -4, 3] * B[4, 5, -3, 3] * C[1, -5, 4, -2] * D[-1, 2, 5] +``` +where the latter syntax is known as NCON-style, and labels the unconnected or outgoing indices with negative integers, and the contracted indices with positive integers. + +A number of remarks are in order. +TensorOperations.jl accepts both integers and any valid variable name as dummy label for indices, and everything in between `[ ]` is not resolved in the current context but interpreted as a dummy label. +Here, we label the indices of a `TensorMap`, like `A::TensorMap{T, S, N₁, N₂}`, in a linear fashion, where the first position corresponds to the first space in `codomain(A)`, and so forth, up to position `N₁`. +Index `N₁ + 1` then corresponds to the first space in `domain(A)`. +However, because we have applied the coevaluation ``η``, it actually corresponds to the corresponding dual space, in accordance with the interface of [`space(A, i)`](@ref) that we introduced [above](@ref ss_tensor_properties), and as indicated by the dotted box around ``A`` in the above picture. +The same holds for the other tensor maps. +Note that our convention also requires that we braid indices that we brought from the domain to the codomain, and so this is only unambiguous for a symmetric braiding, where there is a unique way to permute the indices. + +With the current syntax, we create a new object `E` because we use the definition operator `:=`. +Furthermore, with the current syntax, it will be a `Tensor`, i.e.
it will have a trivial domain, and correspond to the dotted box in the picture above, rather than the actual morphism `E`. +We can also directly define `E` with the correct codomain and domain by rather using +```julia +@tensor E[a b c;d e] := A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z] +``` +or +```julia +@tensor E[(a, b, c);(d, e)] := A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z] +``` +where the latter syntax can also be used when the codomain is empty. +When using the assignment operator `=`, the `TensorMap` `E` is assumed to exist and the contents will be written to the currently allocated memory. +Note that for existing tensors, both on the left hand side and right hand side, trying to specify the indices in the domain and the codomain separately using the above syntax, has no effect, as the bipartition of indices is already fixed by the existing object. +Hence, if `E` has been created by the previous line of code, all of the following lines are now equivalent +```julia +@tensor E[(a, b, c);(d, e)] = A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z] +@tensor E[a, b, c, d, e] = A[v w d; x] * B[(y, z, c); (x, )] * C[v e y; b] * D[a, w, z] +@tensor E[a b; c d e] = A[v; w d x] * B[y, z, c, x] * C[v, e, y, b] * D[a w; z] +``` +and none of those will or can change the partition of the indices of `E` into its codomain and its domain. + +Two final remarks are in order. +Firstly, the order of the tensors appearing on the right hand side is irrelevant, as we can reorder them by using the allowed moves of the Penrose graphical calculus, which yields some crossings and a twist. +As the latter is trivial, it can be omitted, and we just use the same rules to evaluate the newly ordered tensor network.
+For the particular case of matrix-matrix multiplication, which also captures more general settings by appropriately combining spaces into a single line, we indeed find + +```@raw html +tensor contraction reorder +``` + +or thus, the following two lines of code yield the same result +```julia +@tensor C[i, j] := B[i, k] * A[k, j] +@tensor C[i, j] := A[k, j] * B[i, k] +``` +Reordering of tensors can be used internally by the `@tensor` macro to evaluate the contraction in a more efficient manner. +In particular, the NCON-style of specifying the contraction gives the user control over the order, and there are other macros, such as `@tensoropt`, that try to automate this process. +There is also an `@ncon` macro and `ncon` function, and we recommend reading the [manual of TensorOperations.jl](https://quantumkithub.github.io/TensorOperations.jl/stable/) to learn more about the possibilities and how they work. + +A final remark involves the use of adjoints of tensors. +The current framework is such that the user should not be too worried about the actual bipartition into codomain and domain of a given `TensorMap` instance. +Indeed, for tensor contractions the `@tensor` macro figures out the correct manipulations automatically. +However, when wanting to use the `adjoint` of an instance `t::TensorMap{T, S, N₁, N₂}`, the resulting `adjoint(t)` is an `AbstractTensorMap{T, S, N₂, N₁}` and one needs to know the values of `N₁` and `N₂` to know exactly where the `i`th index of `t` will end up in `adjoint(t)`, and hence the index order of `t'`. +Within the `@tensor` macro, one can instead use `conj()` on the whole index expression so as to be able to use the original index ordering of `t`. +For example, for `TensorMap{T, S, 1, 1}` instances, this yields exactly the equivalence one expects, namely one between the following two expressions: + +```julia +@tensor C[i, j] := B'[i, k] * A[k, j] +@tensor C[i, j] := conj(B[k, i]) * A[k, j] +``` + +For e.g.
an instance `A::TensorMap{T, S, 3, 2}`, the following two syntaxes have the same effect within an `@tensor` expression: `conj(A[a, b, c, d, e])` and `A'[d, e, a, b, c]`. + +## Fermionic tensor contractions + +TODO + +## Anyonic tensor contractions + +TODO diff --git a/docs/src/man/factorizations.md b/docs/src/man/factorizations.md new file mode 100644 index 000000000..2fbd8b382 --- /dev/null +++ b/docs/src/man/factorizations.md @@ -0,0 +1,46 @@ +# [Tensor factorizations](@id ss_tensor_factorization) + +```@setup tensors +using TensorKit +using LinearAlgebra +``` + +As tensors are linear maps, they support various kinds of factorizations. +These functions all interpret the provided `AbstractTensorMap` instances as a map from `domain` to `codomain`, which can be thought of as reshaping the tensor into a matrix according to the current bipartition of the indices. + +TensorKit's factorizations are provided by [MatrixAlgebraKit.jl](https://github.com/QuantumKitHub/MatrixAlgebraKit.jl), which is used to supply both the interface, as well as the implementation of the various operations on the blocks of data. +For specific details on the provided functionality, we refer to its [documentation page](https://quantumkithub.github.io/MatrixAlgebraKit.jl/stable/user_interface/decompositions/). + +Finally, note that each of the factorizations takes the current partition of `domain` and `codomain` as the *axis* along which to matricize and perform the factorization. +In order to obtain factorizations according to a different bipartition of the indices, we can use any of the previously mentioned [index manipulations](@ref s_indexmanipulations) before the factorization.
+ +Some examples to conclude this section +```@repl tensors +V1 = SU₂Space(0 => 2, 1/2 => 1) +V2 = SU₂Space(0 => 1, 1/2 => 1, 1 => 1) + +t = randn(V1 ⊗ V1, V2); +U, S, Vh = svd_compact(t); +t ≈ U * S * Vh +D, V = eigh_full(t' * t); +D ≈ S * S +U' * U ≈ id(domain(U)) +S + +Q, R = left_orth(t; alg = :svd); +Q' * Q ≈ id(domain(Q)) +t ≈ Q * R + +U2, S2, Vh2, ε = svd_trunc(t; trunc = truncspace(V1)); +Vh2 * Vh2' ≈ id(codomain(Vh2)) +S2 +ε ≈ norm(block(S, Irrep[SU₂](1))) * sqrt(dim(Irrep[SU₂](1))) + +L, Q = right_orth(permute(t, ((1,), (2, 3)))); +codomain(L), domain(L), domain(Q) +Q * Q' +P = Q' * Q; +P ≈ P * P +t′ = permute(t, ((1,), (2, 3))); +t′ ≈ t′ * P +``` diff --git a/docs/src/man/indexmanipulations.md b/docs/src/man/indexmanipulations.md new file mode 100644 index 000000000..8379aac3d --- /dev/null +++ b/docs/src/man/indexmanipulations.md @@ -0,0 +1,114 @@ +# [Index manipulations](@id s_indexmanipulations) + +```@meta +CollapsedDocStrings = true +``` + +```@setup indexmanip +using TensorKit +using LinearAlgebra +``` + +A `TensorMap{T, S, N₁, N₂}` is a linear map from a domain (a `ProductSpace{S, N₂}`) to a codomain (a `ProductSpace{S, N₁}`). +In practice, the bipartition of the `N₁ + N₂` indices between domain and codomain is often not fixed: algorithms typically need to reshuffle indices between the two sides, reorder them, or change the arrow direction on individual indices before passing a tensor to a factorization or contraction. + +Index manipulations cover all such operations. +They act on the structure of the tensor data in a way that is fully determined by the categorical data of the `sectortype`, such that TensorKit automatically manipulates the tensor entries accordingly. +The operations fall into three groups, which mirror the structure of the source file: + +* **Reweighting**: [`flip`](@ref) and [`twist`](@ref) apply local isomorphisms to individual indices without changing the index structure. 
+* **Space insertion/removal**: [`insertleftunit`](@ref), [`insertrightunit`](@ref) and [`removeunit`](@ref) add or remove trivial (scalar) index factors. +* **Index rearrangements**: [`permute`](@ref), [`braid`](@ref), [`transpose`](@ref) and [`repartition`](@ref) reorder indices and/or move them between domain and codomain. + +Throughout this page, new index positions are specified using `Index2Tuple{N₁, N₂}`, i.e. a pair `(p₁, p₂)` of index tuples. +The indices listed in `p₁` form the new codomain and those in `p₂` form the new domain. +The following helpers retrieve the current index structure of a tensor: + +```@docs; canonical=false +numout +numin +numind +codomainind +domainind +allind +``` + +## Reweighting + +Reweighting operations modify the entries of a tensor by applying local isomorphisms to individual indices, without changing the number of indices or their partition between domain and codomain. + +[`flip`](@ref) changes the arrow direction on selected indices by applying the corresponding isomorphism between a space and its dual. +[`twist`](@ref) applies the topological spin (monoidal twist) to selected indices; for `BraidingStyle(I) == Bosonic()` this is always trivial. + +```@docs; canonical=false +flip(t::AbstractTensorMap, I) +twist(::AbstractTensorMap, ::Int) +twist! +``` + +## Inserting and removing unit spaces + +These functions add or remove a trivial tensor product factor at a specified index position, without affecting any other indices. +[`insertleftunit`](@ref) inserts before position `i` and [`insertrightunit`](@ref) inserts after position `i`; [`removeunit`](@ref) undoes either insertion. +Passing `Val(i)` instead of an `Int` for the position may improve type stability. 
+ +```@docs; canonical=false +insertleftunit(::AbstractTensorMap, ::Val{i}) where {i} +insertrightunit(::AbstractTensorMap, ::Val{i}) where {i} +removeunit(::AbstractTensorMap, ::Val{i}) where {i} +``` + +## Index rearrangements + +These operations reorder indices and/or move them between domain and codomain by applying the transposing or braiding isomorphisms of the underlying category. +They form a hierarchy from most general to most restricted: + +- [`braid`](@ref) is the most general: it accepts any permutation and requires a `levels` argument — a tuple of heights, one per index — that determines whether each index crosses over or under the others it has to pass. +- [`permute`](@ref) is a simpler interface for sector types with a symmetric braiding (`BraidingStyle(I) isa SymmetricBraiding`), where over- and under-crossings are equivalent and `levels` is therefore not needed. +- [`transpose`](@ref) is restricted to *cyclic* permutations (indices do not cross). +- [`repartition`](@ref) only moves the codomain/domain boundary without reordering the indices at all. + +For plain tensors (`sectortype(t) == Trivial`), `permute` and `braid` act like `permutedims` on the underlying array: + +```@repl indexmanip +V = ℂ^2; +t = randn(V ⊗ V ← V ⊗ V); +ta = convert(Array, t); +t′ = permute(t, ((4, 2, 3), (1,))); +convert(Array, t′) ≈ permutedims(ta, (4, 2, 3, 1)) +``` + +```@docs; canonical=false +braid(::AbstractTensorMap, ::Index2Tuple, ::IndexTuple) +braid! +permute(::AbstractTensorMap, ::Index2Tuple) +permute!(::AbstractTensorMap, ::AbstractTensorMap, ::Index2Tuple) +transpose(::AbstractTensorMap, ::Index2Tuple) +transpose! +repartition(::AbstractTensorMap, ::Int, ::Int) +repartition! +``` + +## Fusing and splitting indices + +There is no dedicated function for fusing or splitting indices. +For a plain tensor (`sectortype(t) == Trivial`), this is equivalent to `reshape` on the underlying array. 
+ +In the general case there is no canonical embedding of `V1 ⊗ V2` into the fused space `V = fuse(V1 ⊗ V2)`: any two such embeddings differ by a basis transform, i.e. there is a gauge freedom. +TensorKit resolves this by requiring the user to construct an explicit isomorphism — the *fuser* — and contract it with the tensor: + +```julia +f = unitary(fuse(space(t, i) ⊗ space(t, j)), space(t, i) ⊗ space(t, j)) +@tensor t_fused[…, a, …] := f[a, i, j] * t[…, i, j, …] +``` + +Splitting is then the adjoint of the same map: + +```julia +@tensor t_split[…, i, j, …] := f'[i, j, a] * t_fused[…, a, …] +``` + +Using `f'` as the splitter guarantees that the round-trip is the identity, i.e. `t_split == t`. +Using a *different* isomorphism to split would give a physically equivalent but numerically different tensor, so it is important to keep `f` and its adjoint consistent throughout a calculation. + +Note that tensor factorizations (SVD, QR, etc.) can be applied directly to any index bipartition without needing to fuse indices first; see [Tensor factorizations](@ref ss_tensor_factorization). diff --git a/docs/src/man/linearalgebra.md b/docs/src/man/linearalgebra.md new file mode 100644 index 000000000..98c9f489a --- /dev/null +++ b/docs/src/man/linearalgebra.md @@ -0,0 +1,85 @@ +# [Basic linear algebra](@id ss_tensor_linalg) + +```@setup tensors +using TensorKit +using LinearAlgebra +``` + +`AbstractTensorMap` instances `t` represent linear maps, i.e. homomorphisms in a `𝕜`-linear category, just like matrices. +To a large extent, they follow the interface of `Matrix` in Julia's `LinearAlgebra` standard library. +Many methods from `LinearAlgebra` are (re)exported by TensorKit.jl, and can thus be used without `using LinearAlgebra` explicitly. +In all of the following methods, the implementation acts directly on the underlying matrix blocks (typically using the same method) and never needs to perform any basis transforms.
+ +In particular, `AbstractTensorMap` instances can be composed, provided the domain of the first object coincides with the codomain of the second. +Composing tensor maps uses the regular multiplication symbol as in `t = t1 * t2`, which is also used for matrix multiplication. +TensorKit.jl also supports (and exports) the mutating method `mul!(t, t1, t2)`. +We can then also try to invert a tensor map using `inv(t)`, though this can only exist if the domain and codomain are isomorphic, which can e.g. be checked as `fuse(codomain(t)) == fuse(domain(t))`. +If the inverse is composed with another tensor `t2`, we can use the syntax `t1 \ t2` or `t2 / t1`. +However, this syntax also accepts instances `t1` whose domain and codomain are not isomorphic, and then amounts to `pinv(t1)`, the Moore-Penrose pseudoinverse. +This, however, is only really justified as minimizing the least squares problem if `InnerProductStyle(t) <: EuclideanProduct`. + +`AbstractTensorMap` instances behave themselves as vectors (i.e. they are `𝕜`-linear) and so they can be multiplied by scalars and, if they live in the same space, i.e. have the same domain and codomain, they can be added to each other. +There is also a `zero(t)`, the additive identity, which produces a zero tensor with the same domain and codomain as `t`. +In addition, `TensorMap` supports basic Julia methods such as `fill!` and `copy!`, as well as `copy(t)` to create a copy with independent data. +Aside from basic `+` and `*` operations, TensorKit.jl reexports a number of efficient in-place methods from `LinearAlgebra`, such as `axpy!` (for `y ← α * x + y`), `axpby!` (for `y ← α * x + β * y`), `lmul!` and `rmul!` (for `y ← α * y` and `y ← y * α`, which is typically the same) and `mul!`, which can also be used for out-of-place scalar multiplication `y ← α * x`. 
+ +For `S = spacetype(t)` where `InnerProductStyle(S) <: EuclideanProduct`, we can compute `norm(t)`, and for two such instances, the inner product `dot(t1, t2)`, provided `t1` and `t2` have the same domain and codomain. +Furthermore, there is `normalize(t)` and `normalize!(t)` to return a scaled version of `t` with unit norm. +These operations should also exist for `InnerProductStyle(S) <: HasInnerProduct`, but require an interface for defining a custom inner product in these spaces. +Currently, there is no concrete subtype of `HasInnerProduct` that is not an `EuclideanProduct`. +In particular, `CartesianSpace`, `ComplexSpace` and `GradedSpace` all have `InnerProductStyle(S) <: EuclideanProduct`. + +With tensors that have `InnerProductStyle(t) <: EuclideanProduct` there is associated an adjoint operation, given by `adjoint(t)` or simply `t'`, such that `domain(t') == codomain(t)` and `codomain(t') == domain(t)`. +Note that for an instance `t::TensorMap{S, N₁, N₂}`, `t'` is simply stored in a wrapper called `AdjointTensorMap{S, N₂, N₁}`, which is another subtype of `AbstractTensorMap`. +This should be mostly invisible to the user, as all methods should work for this type as well. +It can be hard to reason about the index order of `t'`, i.e. index `i` of `t` appears in `t'` at index position `j = TensorKit.adjointtensorindex(t, i)`, where the latter method is typically not necessary and hence unexported. +There is also a plural `TensorKit.adjointtensorindices` to convert multiple indices at once. +Note that, because the adjoint interchanges domain and codomain, we have `space(t', j) == space(t, i)'`. + +`AbstractTensorMap` instances can furthermore be tested for exact (`t1 == t2`) or approximate (`t1 ≈ t2`) equality, though the latter requires that `norm` can be computed. + +When tensor map instances are endomorphisms, i.e. 
they have the same domain and codomain, there is a multiplicative identity which can be obtained as `one(t)` or `one!(t)`, where the latter overwrites the contents of `t`. +The multiplicative identity on a space `V` can also be obtained using `id(A, V)` as discussed [above](@ref ss_tensor_construction), such that for a general homomorphism `t′`, we have `t′ == id(codomain(t′)) * t′ == t′ * id(domain(t′))`. +Returning to the case of endomorphisms `t`, we can compute the trace via `tr(t)` and exponentiate them using `exp(t)`, or if the contents of `t` can be destroyed in the process, `exp!(t)`. +Furthermore, there are a number of tensor factorizations for both endomorphisms and general homomorphisms that we discuss on the [Tensor factorizations](@ref ss_tensor_factorization) page. + +Finally, there are a number of operations that also belong in this paragraph because of their analogy to common matrix operations. +The tensor product of two `TensorMap` instances `t1` and `t2` is obtained as `t1 ⊗ t2` and results in a new `TensorMap` with `codomain(t1 ⊗ t2) = codomain(t1) ⊗ codomain(t2)` and `domain(t1 ⊗ t2) = domain(t1) ⊗ domain(t2)`. +If we have two `TensorMap{T, S, N, 1}` instances `t1` and `t2` with the same codomain, we can combine them in a way that is analogous to `hcat`, i.e. we stack them such that the new tensor `catdomain(t1, t2)` has also the same codomain, but has a domain which is `domain(t1) ⊕ domain(t2)`. +Similarly, if `t1` and `t2` are of type `TensorMap{T, S, 1, N}` and have the same domain, the operation `catcodomain(t1, t2)` results in a new tensor with the same domain and a codomain given by `codomain(t1) ⊕ codomain(t2)`, which is the analogy of `vcat`. +Note that direct sum only makes sense between `ElementarySpace` objects, i.e. there is no way to give a tensor product meaning to a direct sum of tensor product spaces. 
+ +Time for some more examples: +```@repl tensors +using TensorKit # hide +V1 = ℂ^2 +t = randn(V1 ← V1 ⊗ V1 ⊗ V1) +t == t + zero(t) == t * id(domain(t)) == id(codomain(t)) * t +t2 = randn(ComplexF64, codomain(t), domain(t)); +dot(t2, t) +tr(t2' * t) +dot(t2, t) ≈ dot(t', t2') +dot(t2, t2) +norm(t2)^2 +t3 = copy!(similar(t, ComplexF64), t); +t3 == t +rmul!(t3, 0.8); +t3 ≈ 0.8 * t +axpby!(0.5, t2, 1.3im, t3); +t3 ≈ 0.5 * t2 + 0.8 * 1.3im * t +t4 = randn(fuse(codomain(t)), codomain(t)); +t5 = TensorMap{Float64}(undef, fuse(codomain(t)), domain(t)); +mul!(t5, t4, t) == t4 * t +inv(t4) * t4 ≈ id(codomain(t)) +t4 * inv(t4) ≈ id(fuse(codomain(t))) +t4 \ (t4 * t) ≈ t +t6 = randn(ComplexF64, V1, codomain(t)); +numout(t4) == numout(t6) == 1 +t7 = catcodomain(t4, t6); +foreach(println, (codomain(t4), codomain(t6), codomain(t7))) +norm(t7) ≈ sqrt(norm(t4)^2 + norm(t6)^2) +t8 = t4 ⊗ t6; +foreach(println, (codomain(t4), codomain(t6), codomain(t8))) +foreach(println, (domain(t4), domain(t6), domain(t8))) +norm(t8) ≈ norm(t4)*norm(t6) +``` diff --git a/docs/src/man/tensormanipulations.md b/docs/src/man/tensormanipulations.md deleted file mode 100644 index 15285fd78..000000000 --- a/docs/src/man/tensormanipulations.md +++ /dev/null @@ -1,337 +0,0 @@ -# [Manipulating tensors](@id s_tensormanipulations) - -## [Vector space and linear algebra operations](@id ss_tensor_linalg) - -`AbstractTensorMap` instances `t` represent linear maps, i.e. homomorphisms in a `𝕜`-linear category, just like matrices. -To a large extent, they follow the interface of `Matrix` in Julia's `LinearAlgebra` standard library. -Many methods from `LinearAlgebra` are (re)exported by TensorKit.jl, and can then us be used without `using LinearAlgebra` explicitly. -In all of the following methods, the implementation acts directly on the underlying matrix blocks (typically using the same method) and never needs to perform any basis transforms. 
- -In particular, `AbstractTensorMap` instances can be composed, provided the domain of the first object coincides with the codomain of the second. -Composing tensor maps uses the regular multiplication symbol as in `t = t1 * t2`, which is also used for matrix multiplication. -TensorKit.jl also supports (and exports) the mutating method `mul!(t, t1, t2)`. -We can then also try to invert a tensor map using `inv(t)`, though this can only exist if the domain and codomain are isomorphic, which can e.g. be checked as `fuse(codomain(t)) == fuse(domain(t))`. -If the inverse is composed with another tensor `t2`, we can use the syntax `t1 \ t2` or `t2 / t1`. -However, this syntax also accepts instances `t1` whose domain and codomain are not isomorphic, and then amounts to `pinv(t1)`, the Moore-Penrose pseudoinverse. -This, however, is only really justified as minimizing the least squares problem if `InnerProductStyle(t) <: EuclideanProduct`. - -`AbstractTensorMap` instances behave themselves as vectors (i.e. they are `𝕜`-linear) and so they can be multiplied by scalars and, if they live in the same space, i.e. have the same domain and codomain, they can be added to each other. -There is also a `zero(t)`, the additive identity, which produces a zero tensor with the same domain and codomain as `t`. -In addition, `TensorMap` supports basic Julia methods such as `fill!` and `copy!`, as well as `copy(t)` to create a copy with independent data. -Aside from basic `+` and `*` operations, TensorKit.jl reexports a number of efficient in-place methods from `LinearAlgebra`, such as `axpy!` (for `y ← α * x + y`), `axpby!` (for `y ← α * x + β * y`), `lmul!` and `rmul!` (for `y ← α * y` and `y ← y * α`, which is typically the same) and `mul!`, which can also be used for out-of-place scalar multiplication `y ← α * x`. 
- -For `S = spacetype(t)` where `InnerProductStyle(S) <: EuclideanProduct`, we can compute `norm(t)`, and for two such instances, the inner product `dot(t1, t2)`, provided `t1` and `t2` have the same domain and codomain. -Furthermore, there is `normalize(t)` and `normalize!(t)` to return a scaled version of `t` with unit norm. -These operations should also exist for `InnerProductStyle(S) <: HasInnerProduct`, but require an interface for defining a custom inner product in these spaces. -Currently, there is no concrete subtype of `HasInnerProduct` that is not an `EuclideanProduct`. -In particular, `CartesianSpace`, `ComplexSpace` and `GradedSpace` all have `InnerProductStyle(S) <: EuclideanProduct`. - -With tensors that have `InnerProductStyle(t) <: EuclideanProduct` there is associated an adjoint operation, given by `adjoint(t)` or simply `t'`, such that `domain(t') == codomain(t)` and `codomain(t') == domain(t)`. -Note that for an instance `t::TensorMap{S, N₁, N₂}`, `t'` is simply stored in a wrapper called `AdjointTensorMap{S, N₂, N₁}`, which is another subtype of `AbstractTensorMap`. -This should be mostly invisible to the user, as all methods should work for this type as well. -It can be hard to reason about the index order of `t'`, i.e. index `i` of `t` appears in `t'` at index position `j = TensorKit.adjointtensorindex(t, i)`, where the latter method is typically not necessary and hence unexported. -There is also a plural `TensorKit.adjointtensorindices` to convert multiple indices at once. -Note that, because the adjoint interchanges domain and codomain, we have `space(t', j) == space(t, i)'`. - -`AbstractTensorMap` instances can furthermore be tested for exact (`t1 == t2`) or approximate (`t1 ≈ t2`) equality, though the latter requires that `norm` can be computed. - -When tensor map instances are endomorphisms, i.e. 
they have the same domain and codomain, there is a multiplicative identity which can be obtained as `one(t)` or `one!(t)`, where the latter overwrites the contents of `t`. -The multiplicative identity on a space `V` can also be obtained using `id(A, V)` as discussed [above](@ref ss_tensor_construction), such that for a general homomorphism `t′`, we have `t′ == id(codomain(t′)) * t′ == t′ * id(domain(t′))`. -Returning to the case of endomorphisms `t`, we can compute the trace via `tr(t)` and exponentiate them using `exp(t)`, or if the contents of `t` can be destroyed in the process, `exp!(t)`. -Furthermore, there are a number of tensor factorizations for both endomorphisms and general homomorphism that we discuss below. - -Finally, there are a number of operations that also belong in this paragraph because of their analogy to common matrix operations. -The tensor product of two `TensorMap` instances `t1` and `t2` is obtained as `t1 ⊗ t2` and results in a new `TensorMap` with `codomain(t1 ⊗ t2) = codomain(t1) ⊗ codomain(t2)` and `domain(t1 ⊗ t2) = domain(t1) ⊗ domain(t2)`. -If we have two `TensorMap{T, S, N, 1}` instances `t1` and `t2` with the same codomain, we can combine them in a way that is analogous to `hcat`, i.e. we stack them such that the new tensor `catdomain(t1, t2)` has also the same codomain, but has a domain which is `domain(t1) ⊕ domain(t2)`. -Similarly, if `t1` and `t2` are of type `TensorMap{T, S, 1, N}` and have the same domain, the operation `catcodomain(t1, t2)` results in a new tensor with the same domain and a codomain given by `codomain(t1) ⊕ codomain(t2)`, which is the analogy of `vcat`. -Note that direct sum only makes sense between `ElementarySpace` objects, i.e. there is no way to give a tensor product meaning to a direct sum of tensor product spaces. 
- -Time for some more examples: -```@repl tensors -using TensorKit # hide -V1 = ℂ^2 -t = randn(V1 ← V1 ⊗ V1 ⊗ V1) -t == t + zero(t) == t * id(domain(t)) == id(codomain(t)) * t -t2 = randn(ComplexF64, codomain(t), domain(t)); -dot(t2, t) -tr(t2' * t) -dot(t2, t) ≈ dot(t', t2') -dot(t2, t2) -norm(t2)^2 -t3 = copy!(similar(t, ComplexF64), t); -t3 == t -rmul!(t3, 0.8); -t3 ≈ 0.8 * t -axpby!(0.5, t2, 1.3im, t3); -t3 ≈ 0.5 * t2 + 0.8 * 1.3im * t -t4 = randn(fuse(codomain(t)), codomain(t)); -t5 = TensorMap{Float64}(undef, fuse(codomain(t)), domain(t)); -mul!(t5, t4, t) == t4 * t -inv(t4) * t4 ≈ id(codomain(t)) -t4 * inv(t4) ≈ id(fuse(codomain(t))) -t4 \ (t4 * t) ≈ t -t6 = randn(ComplexF64, V1, codomain(t)); -numout(t4) == numout(t6) == 1 -t7 = catcodomain(t4, t6); -foreach(println, (codomain(t4), codomain(t6), codomain(t7))) -norm(t7) ≈ sqrt(norm(t4)^2 + norm(t6)^2) -t8 = t4 ⊗ t6; -foreach(println, (codomain(t4), codomain(t6), codomain(t8))) -foreach(println, (domain(t4), domain(t6), domain(t8))) -norm(t8) ≈ norm(t4)*norm(t6) -``` - -## [Index manipulations](@id ss_indexmanipulation) - -In many cases, the bipartition of tensor indices (i.e. `ElementarySpace` instances) between the codomain and domain is not fixed throughout the different operations that need to be performed on that tensor map, i.e. we want to use the duality to move spaces from domain to codomain and vice versa. -Furthermore, we want to use the braiding to reshuffle the order of the indices. - -For this, we use an interface that is closely related to that for manipulating splitting- fusion tree pairs, namely [`braid`](@ref) and [`permute`](@ref), with the interface - -```julia -braid(t::AbstractTensorMap{T,S,N₁,N₂}, (p1, p2)::Index2Tuple{N₁′,N₂′}, levels::IndexTuple{N₁+N₂,Int}) -``` - -and - -```julia -permute(t::AbstractTensorMap{T,S,N₁,N₂}, (p1, p2)::Index2Tuple{N₁′,N₂′}; copy = false) -``` - -both of which return an instance of `AbstractTensorMap{T, S, N₁′, N₂′}`. 
- -In these methods, `p1` and `p2` specify which of the original tensor indices ranging from `1` to `N₁ + N₂` make up the new codomain (with `N₁′` spaces) and new domain (with `N₂′` spaces). -Hence, `(p1..., p2...)` should be a valid permutation of `1:(N₁ + N₂)`. -Note that, throughout TensorKit.jl, permutations are always specified using tuples of `Int`s, for reasons of type stability. -For `braid`, we also need to specify `levels` or depths for each of the indices of the original tensor, which determine whether indices will braid over or underneath each other (use the braiding or its inverse). -We refer to the section on [manipulating fusion trees](@ref ss_fusiontrees) for more details. - -When `BraidingStyle(sectortype(t)) isa SymmetricBraiding`, we can use the simpler interface of `permute`, which does not require the argument `levels`. -`permute` accepts a keyword argument `copy`. -When `copy == true`, the result will be a tensor with newly allocated data that can independently be modified from that of the input tensor `t`. -When `copy` takes the default value `false`, `permute` can try to return the result in a way that it shares its data with the input tensor `t`, though this is only possible in specific cases (e.g. when `sectortype(S) == Trivial` and `(p1..., p2...) = (1:(N₁+N₂)...)`). - -Both `braid` and `permute` come in a version where the result is stored in an already existing tensor, i.e. [`braid!(tdst, tsrc, (p1, p2), levels)`](@ref) and [`permute!(tdst, tsrc, (p1, p2))`](@ref). - -Another operation that belongs under index manipulations is taking the `transpose` of a tensor, i.e. `LinearAlgebra.transpose(t)` and `LinearAlgebra.transpose!(tdst, tsrc)`, both of which are reexported by TensorKit.jl. -Note that `transpose(t)` is not simply equal to reshuffling domain and codomain with `braid(t, (1:(N₁+N₂)...), reverse(domainind(tsrc)), reverse(codomainind(tsrc))))`. 
-Indeed, the graphical representation (where we draw the codomain and domain as a single object), makes clear that this introduces an additional (inverse) twist, which is then compensated in the `transpose` implementation. - -```@raw html -transpose -``` - -In categorical language, the reason for this extra twist is that we use the left coevaluation ``η``, but the right evaluation ``\tilde{ϵ}``, when repartitioning the indices between domain and codomain. - -There are a number of other index related manipulations. -We can apply a twist (or inverse twist) to one of the tensor map indices via [`twist(t, i; inv = false)`](@ref) or [`twist!(t, i; inv = false)`](@ref). -Note that the latter method does not store the result in a new destination tensor, but just modifies the tensor `t` in place. -Twisting several indices simultaneously can be obtained by using the defining property - -```math -θ_{V⊗W} = τ_{W,V} ∘ (θ_W ⊗ θ_V) ∘ τ_{V,W} = (θ_V ⊗ θ_W) ∘ τ_{W,V} ∘ τ_{V,W}, -``` - -but is currently not implemented explicitly. - -For all sector types `I` with `BraidingStyle(I) == Bosonic()`, all twists are `1` and thus have no effect. -Let us start with some examples, in which we illustrate that, albeit `permute` might act highly non-trivial on the fusion trees and on the corresponding data, after conversion to a regular `Array` (when possible), it just acts like `permutedims` - -```@repl tensors -domain(t) → codomain(t) -ta = convert(Array, t); -t′ = permute(t, (1, 2, 3, 4)); -domain(t′) → codomain(t′) -convert(Array, t′) ≈ ta -t′′ = permute(t, ((4, 2, 3), (1,))); -domain(t′′) → codomain(t′′) -convert(Array, t′′) ≈ permutedims(ta, (4, 2, 3, 1)) -transpose(t) -convert(Array, transpose(t)) ≈ permutedims(ta, (4, 3, 2, 1)) -dot(t2, t) ≈ dot(transpose(t2), transpose(t)) -transpose(transpose(t)) ≈ t -twist(t, 3) ≈ t -``` - -Note that `transpose` acts like one would expect on a `TensorMap{T, S, 1, 1}`. 
-On a `TensorMap{T, S, N₁, N₂}`, because `transpose` replaces the codomain with the dual of the domain, which has its tensor product operation reversed, this in the end amounts in a complete reversal of all tensor indices when representing it as a plain multi-dimensional `Array`. -Also, note that we have not defined the conjugation of `TensorMap` instances. -One definition that one could think of is `conj(t) = adjoint(transpose(t))`. -However note that `codomain(adjoint(tranpose(t))) == domain(transpose(t)) == dual(codomain(t))` and similarly `domain(adjoint(tranpose(t))) == dual(domain(t))`, where `dual` of a `ProductSpace` is composed of the dual of the `ElementarySpace` instances, in reverse order of tensor product. -This might be very confusing, and as such we leave tensor conjugation undefined. -However, note that we have a conjugation syntax within the context of [tensor contractions](@ref ss_tensor_contraction). - -To show the effect of `twist`, we now consider a type of sector `I` for which `BraidingStyle(I) != Bosonic()`. -In particular, we use `FibonacciAnyon`. -We cannot convert the resulting `TensorMap` to an `Array`, so we have to rely on indirect tests to verify our results. - -```@repl tensors -V1 = GradedSpace{FibonacciAnyon}(:I => 3, :τ => 2) -V2 = GradedSpace{FibonacciAnyon}(:I => 2, :τ => 1) -m = randn(Float32, V1, V2) -transpose(m) -twist(braid(m, ((2,), (1,)), (1, 2)), 1) -t1 = randn(V1 * V2', V2 * V1); -t2 = randn(ComplexF64, V1 * V2', V2 * V1); -dot(t1, t2) ≈ dot(transpose(t1), transpose(t2)) -transpose(transpose(t1)) ≈ t1 -``` - -A final operation that one might expect in this section is to fuse or join indices, and its inverse, to split a given index into two or more indices. -For a plain tensor (i.e. with `sectortype(t) == Trivial`) amount to the equivalent of `reshape` on the multidimensional data. 
-However, this represents only one possibility, as there is no canonically unique way to embed the tensor product of two spaces `V1 ⊗ V2` in a new space `V = fuse(V1 ⊗ V2)`. -Such a mapping can always be accompagnied by a basis transform. -However, one particular choice is created by the function `isomorphism`, or for `EuclideanProduct` spaces, `unitary`. -Hence, we can join or fuse two indices of a tensor by first constructing `u = unitary(fuse(space(t, i) ⊗ space(t, j)), space(t, i) ⊗ space(t, j))` and then contracting this map with indices `i` and `j` of `t`, as explained in the section on [contracting tensors](@ref ss_tensor_contraction). -Note, however, that a typical algorithm is not expected to often need to fuse and split indices, as e.g. tensor factorizations can easily be applied without needing to `reshape` or fuse indices first, as explained in the next section. - -## [Tensor factorizations](@id ss_tensor_factorization) - -As tensors are linear maps, they suport various kinds of factorizations. -These functions all interpret the provided `AbstractTensorMap` instances as a map from `domain` to `codomain`, which can be thought of as reshaping the tensor into a matrix according to the current bipartition of the indices. - -TensorKit's factorizations are provided by [MatrixAlgebraKit.jl](https://github.com/QuantumKitHub/MatrixAlgebraKit.jl), which is used to supply both the interface, as well as the implementation of the various operations on the blocks of data. -For specific details on the provided functionality, we refer to its [documentation page](https://quantumkithub.github.io/MatrixAlgebraKit.jl/stable/user_interface/decompositions/). - -Finally, note that each of the factorizations takes the current partition of `domain` and `codomain` as the *axis* along which to matricize and perform the factorization. 
-In order to obtain factorizations according to a different bipartition of the indices, we can use any of the previously mentioned [index manipulations](@ref ss_indexmanipulation) before the factorization. - -Some examples to conclude this section -```@repl tensors -V1 = SU₂Space(0 => 2, 1/2 => 1) -V2 = SU₂Space(0 => 1, 1/2 => 1, 1 => 1) - -t = randn(V1 ⊗ V1, V2); -U, S, Vh = svd_compact(t); -t ≈ U * S * Vh -D, V = eigh_full(t' * t); -D ≈ S * S -U' * U ≈ id(domain(U)) -S - -Q, R = left_orth(t; alg = :svd); -Q' * Q ≈ id(domain(Q)) -t ≈ Q * R - -U2, S2, Vh2, ε = svd_trunc(t; trunc = truncspace(V1)); -Vh2 * Vh2' ≈ id(codomain(Vh2)) -S2 -ε ≈ norm(block(S, Irrep[SU₂](1))) * sqrt(dim(Irrep[SU₂](1))) - -L, Q = right_orth(permute(t, ((1,), (2, 3)))); -codomain(L), domain(L), domain(Q) -Q * Q' -P = Q' * Q; -P ≈ P * P -t′ = permute(t, ((1,), (2, 3))); -t′ ≈ t′ * P -``` - -## [Bosonic tensor contractions and tensor networks](@id ss_tensor_contraction) - -One of the most important operation with tensor maps is to compose them, more generally known as contracting them. -As mentioned in the section on [category theory](@ref s_categories), a typical composition of maps in a ribbon category can graphically be represented as a planar arrangement of the morphisms (i.e. tensor maps, boxes with lines eminating from top and bottom, corresponding to source and target, i.e. domain and codomain), where the lines connecting the source and targets of the different morphisms should be thought of as ribbons, that can braid over or underneath each other, and that can twist. -Technically, we can embed this diagram in ``ℝ × [0,1]`` and attach all the unconnected line endings corresponding objects in the source at some position ``(x,0)`` for ``x∈ℝ``, and all line endings corresponding to objects in the target at some position ``(x,1)``. -The resulting morphism is then invariant under what is known as *framed three-dimensional isotopy*, i.e. 
three-dimensional rearrangements of the morphism that respect the rules of boxes connected by ribbons whose open endings are kept fixed. -Such a two-dimensional diagram cannot easily be encoded in a single line of code. - -However, things simplify when the braiding is symmetric (such that over- and under- crossings become equivalent, i.e. just crossings), and when twists, i.e. self-crossings in this case, are trivial. -This amounts to `BraidingStyle(I) == Bosonic()` in the language of TensorKit.jl, and is true for any subcategory of ``\mathbf{Vect}``, i.e. ordinary tensors, possibly with some symmetry constraint. -The case of ``\mathbf{SVect}`` and its subcategories, and more general categories, are discussed below. - -In the case of trivial twists, we can deform the diagram such that we first combine every morphism with a number of coevaluations ``η`` so as to represent it as a tensor, i.e. with a trivial domain. -We can then rearrange the morphism to be all ligned up horizontally, where the original morphism compositions are now being performed by evaluations ``ϵ``. -This process will generate a number of crossings and twists, where the latter can be omitted because they act trivially. -Similarly, double crossings can also be omitted. -As a consequence, the diagram, or the morphism it represents, is completely specified by the tensors it is composed of, and which indices between the different tensors are connect, via the evaluation ``ϵ``, and which indices make up the source and target of the resulting morphism. -If we also compose the resulting morphisms with coevaluations so that it has a trivial domain, we just have one type of unconnected lines, henceforth called open indices. 
-We sketch such a rearrangement in the following picture - -```@raw html -tensor unitary -``` - -Hence, we can now specify such a tensor diagram, henceforth called a tensor contraction or also tensor network, using a one-dimensional syntax that mimicks [abstract index notation](https://en.wikipedia.org/wiki/Abstract_index_notation) and specifies which indices are connected by the evaluation map using Einstein's summation conventation. -Indeed, for `BraidingStyle(I) == Bosonic()`, such a tensor contraction can take the same format as if all tensors were just multi-dimensional arrays. -For this, we rely on the interface provided by the package [TensorOperations.jl](https://github.com/QuantumKitHub/TensorOperations.jl). - -The above picture would be encoded as -```julia -@tensor E[a, b, c, d, e] := A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z] -``` -or -```julia -@tensor E[:] := A[1, 2, -4, 3] * B[4, 5, -3, 3] * C[1, -5, 4, -2] * D[-1, 2, 5] -``` -where the latter syntax is known as NCON-style, and labels the unconnected or outgoing indices with negative integers, and the contracted indices with positive integers. - -A number of remarks are in order. -TensorOperations.jl accepts both integers and any valid variable name as dummy label for indices, and everything in between `[ ]` is not resolved in the current context but interpreted as a dummy label. -Here, we label the indices of a `TensorMap`, like `A::TensorMap{T, S, N₁, N₂}`, in a linear fashion, where the first position corresponds to the first space in `codomain(A)`, and so forth, up to position `N₁`. -Index `N₁ + 1` then corresponds to the first space in `domain(A)`. -However, because we have applied the coevaluation ``η``, it actually corresponds to the corresponding dual space, in accordance with the interface of [`space(A, i)`](@ref) that we introduced [above](@ref ss_tensor_properties), and as indiated by the dotted box around ``A`` in the above picture. 
-The same holds for the other tensor maps. -Note that our convention also requires that we braid indices that we brought from the domain to the codomain, and so this is only unambiguous for a symmetric braiding, where there is a unique way to permute the indices. - -With the current syntax, we create a new object `E` because we use the definition operator `:=`. -Furthermore, with the current syntax, it will be a `Tensor`, i.e. it will have a trivial domain, and correspond to the dotted box in the picture above, rather than the actual morphism `E`. -We can also directly define `E` with the correct codomain and domain by rather using -```julia -@tensor E[a b c;d e] := A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z] -``` -or -```julia -@tensor E[(a, b, c);(d, e)] := A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z] -``` -where the latter syntax can also be used when the codomain is empty. -When using the assignment operator `=`, the `TensorMap` `E` is assumed to exist and the contents will be written to the currently allocated memory. -Note that for existing tensors, both on the left hand side and right hand side, trying to specify the indices in the domain and the codomain seperately using the above syntax, has no effect, as the bipartition of indices are already fixed by the existing object. -Hence, if `E` has been created by the previous line of code, all of the following lines are now equivalent -```julia -@tensor E[(a, b, c);(d, e)] = A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z] -@tensor E[a, b, c, d, e] = A[v w d; x] * B[(y, z, c); (x, )] * C[v e y; b] * D[a, w, z] -@tensor E[a b; c d e] = A[v; w d x] * B[y, z, c, x] * C[v, e, y, b] * D[a w; z] -``` -and none of those will or can change the partition of the indices of `E` into its codomain and its domain. - -Two final remarks are in order. 
-Firstly, the order of the tensors appearing on the right hand side is irrelevant, as we can reorder them by using the allowed moves of the Penrose graphical calculus, which yields some crossings and a twist. -As the latter is trivial, it can be omitted, and we just use the same rules to evaluate the newly ordered tensor network. -For the particular case of matrix-matrix multiplication, which also captures more general settings by appropriotely combining spaces into a single line, we indeed find - -```@raw html -tensor contraction reorder -``` - -or thus, the following two lines of code yield the same result -```julia -@tensor C[i, j] := B[i, k] * A[k, j] -@tensor C[i, j] := A[k, j] * B[i, k] -``` -Reordering of tensors can be used internally by the `@tensor` macro to evaluate the contraction in a more efficient manner. -In particular, the NCON-style of specifying the contraction gives the user control over the order, and there are other macros, such as `@tensoropt`, that try to automate this process. -There is also an `@ncon` macro and `ncon` function, an we recommend reading the [manual of TensorOperations.jl](https://quantumkithub.github.io/TensorOperations.jl/stable/) to learn more about the possibilities and how they work. - -A final remark involves the use of adjoints of tensors. -The current framework is such that the user should not be too worried about the actual bipartition into codomain and domain of a given `TensorMap` instance. -Indeed, for tensor contractions the `@tensor` macro figures out the correct manipulations automatically. -However, when wanting to use the `adjoint` of an instance `t::TensorMap{T, S, N₁, N₂}`, the resulting `adjoint(t)` is an `AbstractTensorMap{T, S, N₂, N₁}` and one needs to know the values of `N₁` and `N₂` to know exactly where the `i`th index of `t` will end up in `adjoint(t)`, and hence the index order of `t'`. 
-Within the `@tensor` macro, one can instead use `conj()` on the whole index expression so as to be able to use the original index ordering of `t`. -For example, for `TensorMap{T, S, 1, 1}` instances, this yields exactly the equivalence one expects, namely one between the following two expressions: - -```julia -@tensor C[i, j] := B'[i, k] * A[k, j] -@tensor C[i, j] := conj(B[k, i]) * A[k, j] -``` - -For e.g. an instance `A::TensorMap{T, S, 3, 2}`, the following two syntaxes have the same effect within an `@tensor` expression: `conj(A[a, b, c, d, e])` and `A'[d, e, a, b, c]`. - -Some examples: - -## Fermionic tensor contractions - -TODO - -## Anyonic tensor contractions - -TODO diff --git a/docs/src/man/tensors.md b/docs/src/man/tensors.md index 2921e5f1b..155dbbe4d 100644 --- a/docs/src/man/tensors.md +++ b/docs/src/man/tensors.md @@ -5,7 +5,7 @@ using TensorKit using LinearAlgebra ``` -This last page explains how to create and manipulate tensors in TensorKit.jl. +This page explains how to construct and access tensors in TensorKit.jl. As this is probably the most important part of the manual, we will also focus more strongly on the usage and interface, and less so on the underlying implementation. The only aspect of the implementation that we will address is the storage of the tensor data, as this is important to know how to create and initialize a tensor, but will in fact also shed light on how some of the methods work. 
diff --git a/ext/TensorKitAMDGPUExt/roctensormap.jl b/ext/TensorKitAMDGPUExt/roctensormap.jl index f2f094c60..56dea938e 100644 --- a/ext/TensorKitAMDGPUExt/roctensormap.jl +++ b/ext/TensorKitAMDGPUExt/roctensormap.jl @@ -162,3 +162,5 @@ for f in (:sqrt, :log, :asin, :acos, :acosh, :atanh, :acoth) return tf end end + +TensorKit.adapt_transformer(U::AbstractMatrix, ::Type{A}) where {A <: ROCVector} = AMDGPU.Adapt.adapt(ROCArray, U) diff --git a/ext/TensorKitCUDAExt/cutensormap.jl b/ext/TensorKitCUDAExt/cutensormap.jl index 2fefb3a24..ed2bb2757 100644 --- a/ext/TensorKitCUDAExt/cutensormap.jl +++ b/ext/TensorKitCUDAExt/cutensormap.jl @@ -169,6 +169,4 @@ for f in (:sqrt, :log, :asin, :acos, :acosh, :atanh, :acoth) end end -function TensorKit._add_transform_multi!(tdst::CuTensorMap, tsrc, p, (U, structs_dst, structs_src)::Tuple{<:Array, TD, TS}, buffers, alpha, beta, backend...) where {TD, TS} - return TensorKit._add_transform_multi!(tdst, tsrc, p, (CUDA.CUDACore.Adapt.adapt(CuArray, U), structs_dst, structs_src), buffers, alpha, beta, backend...) -end +TensorKit.adapt_transformer(U::AbstractMatrix, ::Type{A}) where {A <: CuVector} = CUDA.CUDACore.Adapt.adapt(CuArray, U) diff --git a/ext/TensorKitMooncakeExt/indexmanipulations.jl b/ext/TensorKitMooncakeExt/indexmanipulations.jl index c3bc3e26c..1fccdd9e6 100644 --- a/ext/TensorKitMooncakeExt/indexmanipulations.jl +++ b/ext/TensorKitMooncakeExt/indexmanipulations.jl @@ -1,11 +1,11 @@ for transform in (:permute, :transpose) - add_transform! = Symbol(:add_, transform, :!) - add_transform_pullback = Symbol(add_transform!, :_pullback) + transform! = Symbol(transform, :!) 
+ transform_pullback = Symbol(transform!, :_pullback) @eval @is_primitive( DefaultCtx, ReverseMode, Tuple{ - typeof(TK.$add_transform!), + typeof(TK.$transform!), AbstractTensorMap, AbstractTensorMap, Index2Tuple, Number, Number, Vararg{Any}, @@ -13,7 +13,7 @@ for transform in (:permute, :transpose) ) @eval function Mooncake.rrule!!( - ::CoDual{typeof(TK.$add_transform!)}, + ::CoDual{typeof(TK.$transform!)}, C_ΔC::CoDual{<:AbstractTensorMap}, A_ΔA::CoDual{<:AbstractTensorMap}, p_Δp::CoDual{<:Index2Tuple}, α_Δα::CoDual{<:Number}, β_Δβ::CoDual{<:Number}, @@ -30,17 +30,17 @@ for transform in (:permute, :transpose) # if we need to compute Δa, it is faster to allocate an intermediate permuted A # and store that instead of repeating the permutation in the pullback each time. - # effectively, we replace `add_permute` by `add ∘ permute`. + # effectively, we replace `permute!/transpose!` by `add ∘ permute/transpose`. Ap = if _needs_tangent(α) Ap = $transform(A, p) add!(C, Ap, α, β) Ap else - TK.$add_transform!(C, A, p, α, β, ba...) + TK.$transform!(C, A, p, α, β, ba...) nothing end - function $add_transform_pullback(::NoRData) + function $transform_pullback(::NoRData) copy!(C, C_cache) # ΔA @@ -50,10 +50,10 @@ for transform in (:permute, :transpose) TC = VectorInterface.promote_scale(ΔC, α) if scalartype(ΔA) <: Real && !(TC <: Real) ΔAc = TO.tensoralloc_add(TC, ΔC, pΔA, false, Val(false)) - TK.$add_transform!(ΔAc, ΔC, pΔA, conj(α), Zero(), ba...) + TK.$transform!(ΔAc, ΔC, pΔA, conj(α), Zero(), ba...) add!(ΔA, real(ΔAc)) else - TK.$add_transform!(ΔA, ΔC, pΔA, conj(α), One(), ba...) + TK.$transform!(ΔA, ΔC, pΔA, conj(α), One(), ba...) end ΔAr = NoRData() @@ -64,7 +64,7 @@ for transform in (:permute, :transpose) return NoRData(), ΔCr, ΔAr, NoRData(), Δαr, Δβr, map(Returns(NoRData()), ba)... 
end - return C_ΔC, $add_transform_pullback + return C_ΔC, $transform_pullback end end @@ -72,7 +72,7 @@ end DefaultCtx, ReverseMode, Tuple{ - typeof(TK.add_braid!), + typeof(TK.braid!), AbstractTensorMap, AbstractTensorMap, Index2Tuple, IndexTuple, Number, Number, Vararg{Any}, @@ -80,7 +80,7 @@ end ) function Mooncake.rrule!!( - ::CoDual{typeof(TK.add_braid!)}, + ::CoDual{typeof(TK.braid!)}, C_ΔC::CoDual{<:AbstractTensorMap}, A_ΔA::CoDual{<:AbstractTensorMap}, p_Δp::CoDual{<:Index2Tuple}, levels_Δlevels::CoDual{<:IndexTuple}, α_Δα::CoDual{<:Number}, β_Δβ::CoDual{<:Number}, @@ -98,17 +98,17 @@ function Mooncake.rrule!!( # if we need to compute Δa, it is faster to allocate an intermediate braided A # and store that instead of repeating the permutation in the pullback each time. - # effectively, we replace `add_permute` by `add ∘ permute`. + # effectively, we replace `braid!` by `add ∘ braid`. Ap = if _needs_tangent(α) Ap = braid(A, p, levels) add!(C, Ap, α, β) Ap else - TK.add_braid!(C, A, p, levels, α, β, ba...) + TK.braid!(C, A, p, levels, α, β, ba...) nothing end - function add_braid!_pullback(::NoRData) + function braid!_pullback(::NoRData) copy!(C, C_cache) # ΔA @@ -118,10 +118,10 @@ function Mooncake.rrule!!( TC = VectorInterface.promote_scale(ΔC, α) if scalartype(ΔA) <: Real && !(TC <: Real) ΔAc = TO.tensoralloc_add(TC, ΔC, pΔA, false, Val(false)) - TK.add_braid!(ΔAc, ΔC, pΔA, ilevels, conj(α), Zero(), ba...) + TK.braid!(ΔAc, ΔC, pΔA, ilevels, conj(α), Zero(), ba...) add!(ΔA, real(ΔAc)) else - TK.add_braid!(ΔA, ΔC, pΔA, ilevels, conj(α), One(), ba...) + TK.braid!(ΔA, ΔC, pΔA, ilevels, conj(α), One(), ba...) end ΔAr = NoRData() @@ -132,7 +132,7 @@ function Mooncake.rrule!!( return NoRData(), ΔCr, ΔAr, NoRData(), NoRData(), Δαr, Δβr, map(Returns(NoRData()), ba)... 
end - return C_ΔC, add_braid!_pullback + return C_ΔC, braid!_pullback end # both are needed for correctly capturing every dispatch diff --git a/ext/TensorKitMooncakeExt/planaroperations.jl b/ext/TensorKitMooncakeExt/planaroperations.jl index 3c75fe2da..abbef5004 100644 --- a/ext/TensorKitMooncakeExt/planaroperations.jl +++ b/ext/TensorKitMooncakeExt/planaroperations.jl @@ -60,7 +60,7 @@ # if length(q[1]) == 0 # ip = invperm(linearize(p)) # pΔA = _repartition(ip, A) -# TK.add_transpose!(ΔA, ΔC, pΔA, conj(α), One(), backend, allocator) +# TK.transpose!(ΔA, ΔC, pΔA, conj(α), One(), backend, allocator) # return NoRData() # end # # if length(q[1]) == 1 diff --git a/src/TensorKit.jl b/src/TensorKit.jl index d8361ac79..db3d9c50f 100644 --- a/src/TensorKit.jl +++ b/src/TensorKit.jl @@ -91,8 +91,7 @@ export left_orth, right_orth, left_null, right_null, isisometric, isunitary, project_isometric, project_isometric!, isposdef, isposdef!, sylvester, rank, cond -export braid, braid!, permute, permute!, transpose, transpose!, twist, twist!, repartition, - repartition! +export braid, braid!, permute, permute!, transpose, transpose!, twist, twist!, repartition, repartition! export catdomain, catcodomain, absorb, absorb! 
# tensor operations diff --git a/src/planar/planaroperations.jl b/src/planar/planaroperations.jl index cde772982..758bb708a 100644 --- a/src/planar/planaroperations.jl +++ b/src/planar/planaroperations.jl @@ -32,7 +32,7 @@ function planaradd!( α::Number, β::Number, backend, allocator ) - return add_transpose!(C, A, p, α, β, backend) + return transpose!(C, A, p, α, β, backend) end # insert default backend @@ -173,7 +173,7 @@ function planarcontract!( A′ = TO.tensoralloc_add( scalartype(A), A, (oindA, cindA), false, Val(true), allocator ) - add_transpose!(A′, A, (oindA, cindA), One(), Zero(), backend) + transpose!(A′, A, (oindA, cindA), One(), Zero(), backend) end if cindB == codB && oindB == domB @@ -182,7 +182,7 @@ function planarcontract!( B′ = TensorOperations.tensoralloc_add( scalartype(B), B, (cindB, oindB), false, Val(true), allocator ) - add_transpose!(B′, B, (cindB, oindB), One(), Zero(), backend) + transpose!(B′, B, (cindB, oindB), One(), Zero(), backend) end mul!(C, A′, B′, α, β) (oindA == codA && cindA == domA) || TO.tensorfree!(A′, allocator) diff --git a/src/tensors/braidingtensor.jl b/src/tensors/braidingtensor.jl index f08dd8181..a02d3b884 100644 --- a/src/tensors/braidingtensor.jl +++ b/src/tensors/braidingtensor.jl @@ -256,7 +256,7 @@ function planarcontract!( end if BraidingStyle(sectortype(B)) isa Bosonic - return add_permute!(C, B, (reverse(cindB), oindB), α, β, backend) + return permute!(C, B, (reverse(cindB), oindB), α, β, backend) end τ_levels = A.adjoint ? (1, 2, 2, 1) : (2, 1, 1, 2) @@ -313,7 +313,7 @@ function planarcontract!( p = (oindA, reverse(cindA)) N = length(oindA) levels = (ntuple(identity, N)..., (B.adjoint ? (N + 1, N + 2) : (N + 2, N + 1))...) 
- return add_braid!(C, A, p, levels, α, β, backend) + return braid!(C, A, p, levels, α, β, backend) end # ambiguity fix: diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index 3108abb17..499f01ae0 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -1,28 +1,10 @@ -# Index manipulations -#--------------------- - -# find the scalartype after applying operations: take into account fusion and/or braiding -# might need to become Float or Complex to capture complex recoupling coefficients but don't alter precision -for (operation, manipulation) in ( - :flip => :sector, :twist => :braiding, - :transpose => :fusion, :permute => :sector, :braid => :sector, - ) - promote_op = Symbol(:promote_, operation) - manipulation_scalartype = Symbol(manipulation, :scalartype) - - @eval begin - $promote_op(t::AbstractTensorMap) = $promote_op(typeof(t)) - $promote_op(::Type{T}) where {T <: AbstractTensorMap} = - $promote_op(scalartype(T), sectortype(T)) - $promote_op(::Type{T}, ::Type{I}) where {T <: Number, I <: Sector} = - sectorscalartype(I) <: Integer ? T : - sectorscalartype(I) <: Real ? float(T) : complex(T) - # TODO: currently the manipulations all use sectorscalartype, change to: - # $manipulation_scalartype(I) <: Integer ? T : - # $manipulation_scalartype(I) <: Real ? float(T) : complex(T) - end -end +# ============= +# Reweighting +# ============= +# ------ +# flip +# ------ """ flip(t::AbstractTensorMap, I) -> t′::AbstractTensorMap @@ -46,33 +28,214 @@ function flip(t::AbstractTensorMap, I; inv::Bool = false) return t′ end +# --------- +# twist(!) 
+# ---------
+function has_shared_twist(t, inds) # true when twisting `inds` acts trivially on `t`
+    I = sectortype(t)
+    if BraidingStyle(I) == NoBraiding()
+        for i in inds
+            cs = sectors(space(t, i))
+            all(isunit, cs) || throw(SectorMismatch(lazy"Cannot twist sectors $cs"))
+        end
+        return true
+    elseif BraidingStyle(I) == Bosonic()
+        return true
+    else
+        for i in inds
+            cs = sectors(space(t, i))
+            all(isone ∘ twist, cs) || return false
+        end
+        return true
+    end
+end
+
+"""
+    twist!(t::AbstractTensorMap, i::Int; inv::Bool = false) -> t
+    twist!(t::AbstractTensorMap, inds; inv::Bool = false) -> t
+
+Apply a twist to the `i`th index of `t`, or all indices in `inds`, storing the result in `t`.
+If `inv=true`, use the inverse twist.
+
+See [`twist`](@ref) for creating a new tensor.
+"""
+function twist!(t::AbstractTensorMap, inds; inv::Bool = false)
+    if !all(in(allind(t)), inds)
+        msg = "Can't twist indices $inds of a tensor with only $(numind(t)) indices."
+        throw(ArgumentError(msg))
+    end
+    (scalartype(t) <: Real && !(sectorscalartype(sectortype(t)) <: Real)) &&
+        throw(ArgumentError("Can't in-place twist a real tensor with complex sector type"))
+    has_shared_twist(t, inds) && return t
+
+    # NOTE(review): a second, unreachable copy of the guard above stood here (identical
+    # condition, message "No in-place `twist!` ..."); removed as dead code.
+
+    N₁ = numout(t)
+    for (f₁, f₂) in fusiontrees(t)
+        θ = prod(i -> i <= N₁ ? twist(f₁.uncoupled[i]) : twist(f₂.uncoupled[i - N₁]), inds)
+        inv && (θ = θ')
+        scale!(t[f₁, f₂], θ)
+    end
+    return t
+end
+
+"""
+    twist(tsrc::AbstractTensorMap, i::Int; inv::Bool = false, copy::Bool = false) -> tdst
+    twist(tsrc::AbstractTensorMap, inds; inv::Bool = false, copy::Bool = false) -> tdst
+
+Apply a twist to the `i`th index of `tsrc` and return the result as a new tensor.
+If `inv = true`, use the inverse twist.
+If `copy = false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made.
+
+See [`twist!`](@ref) for storing the result in place.
+""" +function twist(t::AbstractTensorMap, inds; inv::Bool = false, copy::Bool = false) + if has_shared_twist(t, inds) + copy || return t + return copy!(similar(t), t) + end + tdst = similar(t, promote_twist(t)) + copy!(tdst, t) + return twist!(tdst, inds; inv) +end + +# ========================= +# Space insertion/removal +# ========================= + +# Methods which change the number of indices, implement using `Val(i)` for type inference +""" + insertleftunit( + tsrc::AbstractTensorMap, i = numind(t) + 1; + conj = false, dual = false, copy = false + ) -> tdst + +Insert a trivial vector space, isomorphic to the underlying field, at position `i`, +which can be specified as an `Int` or as `Val(i)` for improved type stability. +More specifically, adds a left monoidal unit or its dual. + +If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made. + +See also [`insertrightunit`](@ref insertrightunit(::AbstractTensorMap, ::Val{i}) where {i}), +[`removeunit`](@ref removeunit(::AbstractTensorMap, ::Val{i}) where {i}). +""" +function insertleftunit( + t::AbstractTensorMap, ::Val{i} = Val(numind(t) + 1); + copy::Bool = false, conj::Bool = false, dual::Bool = false + ) where {i} + W = insertleftunit(space(t), Val(i); conj, dual) + if t isa TensorMap + return TensorMap{scalartype(t)}(copy ? Base.copy(t.data) : t.data, W) + else + tdst = similar(t, W) + for (c, b) in blocks(t) + copy!(block(tdst, c), b) + end + return tdst + end +end + +""" + insertrightunit( + tsrc::AbstractTensorMap, i = numind(t); + conj = false, dual = false, copy = false + ) -> tdst + +Insert a trivial vector space, isomorphic to the underlying field, after position `i`, +which can be specified as an `Int` or as `Val(i)` for improved type stability. +More specifically, adds a right monoidal unit or its dual. + +If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made. 
+ +See also [`insertleftunit`](@ref insertleftunit(::AbstractTensorMap, ::Val{i}) where {i}), +[`removeunit`](@ref removeunit(::AbstractTensorMap, ::Val{i}) where {i}). +""" +function insertrightunit( + t::AbstractTensorMap, ::Val{i} = Val(numind(t)); + copy::Bool = false, conj::Bool = false, dual::Bool = false + ) where {i} + W = insertrightunit(space(t), Val(i); conj, dual) + if t isa TensorMap + return TensorMap{scalartype(t)}(copy ? Base.copy(t.data) : t.data, W) + else + tdst = similar(t, W) + for (c, b) in blocks(t) + copy!(block(tdst, c), b) + end + return tdst + end +end + """ - permute!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple) - -> tdst + removeunit(tsrc::AbstractTensorMap, i; copy = false) -> tdst -Write into `tdst` the result of permuting the indices of `tsrc`. +This removes a trivial tensor product factor at position `1 ≤ i ≤ N`, where `i` +can be specified as an `Int` or as `Val(i)` for improved type stability. +For this to work, that factor has to be isomorphic to the field of scalars. + +If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made. + +This operation undoes the work of [`insertleftunit`](@ref insertleftunit(::AbstractTensorMap, ::Val{i}) where {i}) +and [`insertrightunit`](@ref insertrightunit(::AbstractTensorMap, ::Val{i}) where {i}). +""" +function removeunit(t::AbstractTensorMap, ::Val{i}; copy::Bool = false) where {i} + W = removeunit(space(t), Val(i)) + if t isa TensorMap + return TensorMap{scalartype(t)}(copy ? Base.copy(t.data) : t.data, W) + else + tdst = similar(t, W) + for (c, b) in blocks(t) + copy!(block(tdst, c), b) + end + return tdst + end +end + +# TODO: fusion/splitting of indices + +# ============================ +# Index rearrangements +# ============================ + +# -------------- +# permute(!) 
+# -------------- +""" + permute!(tdst, tsrc, (p₁, p₂)::Index2Tuple, α = 1, β = 0, [backend], [allocator]) -> tdst + +Compute `tdst = β * tdst + α * permute(tsrc, (p₁, p₂))`, writing the result into `tdst`. The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively. - -See [`permute`](@ref) for creating a new tensor and [`add_permute!`](@ref) for a more general version. +Optionally specify a `backend` and `allocator` for the underlying array operation. + +See also [`permute`](@ref) for creating a new tensor. """ @propagate_inbounds function Base.permute!( - tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple + tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, + α::Number = One(), β::Number = Zero(), + backend::AbstractBackend = TO.DefaultBackend(), allocator = TO.DefaultAllocator() ) - return add_permute!(tdst, tsrc, p, One(), Zero()) + @boundscheck spacecheck_transform(permute, tdst, tsrc, p) + levels = ntuple(identity, numind(tsrc)) + return @inbounds braid!(tdst, tsrc, p, levels, α, β, backend, allocator) end """ - permute(tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple; copy::Bool = false) -> tdst::TensorMap + permute(tsrc, (p₁, p₂)::Index2Tuple; copy = false, [backend], [allocator]) -> tdst Return tensor `tdst` obtained by permuting the indices of `tsrc`. The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively. If `copy = false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made. +Optionally specify a `backend` and `allocator` for the underlying array operation. -To permute into an existing destination, see [permute!](@ref) and [`add_permute!`](@ref) +See also [`permute!`](@ref) for writing into an existing destination. 
""" -function permute(t::AbstractTensorMap, p::Index2Tuple; copy::Bool = false) +function permute( + t::AbstractTensorMap, p::Index2Tuple; + copy::Bool = false, backend::AbstractBackend = TO.DefaultBackend(), allocator = TO.DefaultAllocator() + ) # share data if possible if !copy if p == (codomainind(t), domainind(t)) @@ -84,14 +247,15 @@ function permute(t::AbstractTensorMap, p::Index2Tuple; copy::Bool = false) # general case tdst = similar(t, promote_permute(t), permute(space(t), p)) - return @inbounds permute!(tdst, t, p) + levels = ntuple(identity, numind(t)) + return @inbounds braid!(tdst, t, p, levels, One(), Zero(), backend, allocator) end -function permute(t::AdjointTensorMap, (p₁, p₂)::Index2Tuple; copy::Bool = false) +function permute(t::AdjointTensorMap, (p₁, p₂)::Index2Tuple; kwargs...) p₁′ = adjointtensorindices(t, p₂) p₂′ = adjointtensorindices(t, p₁) - return adjoint(permute(adjoint(t), (p₁′, p₂′); copy)) + return adjoint(permute(adjoint(t), (p₁′, p₂′); kwargs...)) end -permute(t::AbstractTensorMap, p::IndexTuple; copy::Bool = false) = permute(t, (p, ()); copy) +permute(t::AbstractTensorMap, p::IndexTuple; kwargs...) = permute(t, (p, ()); kwargs...) function has_shared_permute(t::AbstractTensorMap, (p₁, p₂)::Index2Tuple) return (p₁ === codomainind(t) && p₂ === domainind(t)) @@ -100,8 +264,8 @@ function has_shared_permute(t::TensorMap, (p₁, p₂)::Index2Tuple) if p₁ === codomainind(t) && p₂ === domainind(t) return true elseif sectortype(t) === Trivial - stridet = i -> stride(t[], i) - sizet = i -> size(t[], i) + stridet = Base.Fix1(stride, t[]) + sizet = Base.Fix1(size, t[]) canfuse1, d1, s1 = TO._canfuse(sizet.(p₁), stridet.(p₁)) canfuse2, d2, s2 = TO._canfuse(sizet.(p₂), stridet.(p₂)) return canfuse1 && canfuse2 && s1 == 1 && (d2 == 1 || s2 == d1) @@ -115,29 +279,37 @@ function has_shared_permute(t::AdjointTensorMap, (p₁, p₂)::Index2Tuple) return has_shared_permute(t', (p₁′, p₂′)) end -# Braid +# ------------- +# braid(!) 
+# ------------- """ - braid!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, - (p₁, p₂)::Index2Tuple, levels::Tuple) - -> tdst + braid!(tdst, tsrc, (p₁, p₂)::Index2Tuple, levels::IndexTuple, α = 1, β = 0, [backend], [allocator]) -> tdst -Write into `tdst` the result of braiding the indices of `tsrc`. +Compute `tdst = β * tdst + α * braid(tsrc, (p₁, p₂), levels)`, writing the result into `tdst`. The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively. Here, `levels` is a tuple of length `numind(tsrc)` that assigns a level or height to the indices of `tsrc`, which determines whether they will braid over or under any other index with which they have to change places. +Optionally specify a `backend` and `allocator` for the underlying array operation. -See [`braid`](@ref) for creating a new tensor and [`add_braid!`](@ref) for a more general version. +See also [`braid`](@ref) for creating a new tensor. """ @propagate_inbounds function braid!( - tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple + tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple, + α::Number = One(), β::Number = Zero(), + backend::AbstractBackend = TO.DefaultBackend(), allocator = TO.DefaultAllocator() ) - return add_braid!(tdst, tsrc, p, levels, One(), Zero()) + @boundscheck spacecheck_transform(braid, tdst, tsrc, p, levels) + levels1 = TupleTools.getindices(levels, codomainind(tsrc)) + levels2 = TupleTools.getindices(levels, domainind(tsrc)) + transformer = treebraider(tdst, tsrc, p, (levels1, levels2)) + return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend, allocator) end """ - braid(tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple, levels::IndexTuple; - copy::Bool = false) - -> tdst::TensorMap + braid( + tsrc, (p₁, p₂)::Index2Tuple, levels::IndexTuple; copy = false, + backend = DefaultBackend(), allocator = DefaultAllocator() + ) -> tdst::TensorMap Return tensor 
`tdst` obtained by braiding the indices of `tsrc`. The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively. @@ -145,277 +317,178 @@ Here, `levels` is a tuple of length `numind(tsrc)` that assigns a level or heigh which determines whether they will braid over or under any other index with which they have to change places. If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made. +Optionally specify a `backend` and `allocator` for the underlying array operation. -To braid into an existing destination, see [braid!](@ref) and [`add_braid!`](@ref) +See also [`braid!`](@ref) for writing into an existing destination. """ function braid( - t::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple; copy::Bool = false + t::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple; + copy::Bool = false, backend::AbstractBackend = TO.DefaultBackend(), allocator = TO.DefaultAllocator() ) - length(levels) == numind(t) || throw(ArgumentError("invalid levels")) + length(levels) == numind(t) || throw(ArgumentError(lazy"length of levels should be $(numind(t)), got $(length(levels))")) - BraidingStyle(sectortype(t)) isa SymmetricBraiding && return permute(t, p; copy) (!copy && p == (codomainind(t), domainind(t))) && return t # general case tdst = similar(t, promote_braid(t), permute(space(t), p)) - return @inbounds braid!(tdst, t, p, levels) + return @inbounds braid!(tdst, t, p, levels, One(), Zero(), backend, allocator) +end +function braid( + t::AdjointTensorMap, (p₁, p₂)::Index2Tuple, levels::IndexTuple; + kwargs... + ) + p₁′ = adjointtensorindices(t, p₂) + p₂′ = adjointtensorindices(t, p₁) + perm = adjointtensorindices(adjoint(t), ntuple(identity, numind(t))) + levels′ = TupleTools.getindices(levels, perm) + return adjoint(braid(adjoint(t), (p₁′, p₂′), levels′; kwargs...)) end -# TODO: braid for `AdjointTensorMap`; think about how to map the `levels` argument. 
-# Transpose +# ---------------- +# transpose(!) +# ---------------- _transpose_indices(t::AbstractTensorMap) = (reverse(domainind(t)), reverse(codomainind(t))) """ - transpose!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, - (p₁, p₂)::Index2Tuple) - -> tdst + transpose!(tdst, tsrc, (p₁, p₂)::Index2Tuple, α = 1, β = 0, [backend], [allocator]) -> tdst -Write into `tdst` the result of transposing the indices of `tsrc`. +Compute `tdst = β * tdst + α * transpose(tsrc, (p₁, p₂))`, writing the result into `tdst`. The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively. The new index positions should be attainable without any indices crossing each other, i.e., -the permutation `(p₁..., reverse(p₂)...)` should constitute a cyclic permutation of `(codomainind(tsrc)..., reverse(domainind(tsrc))...)`. +the permutation `(p₁..., reverse(p₂)...)` should constitute a cyclic permutation of +`(codomainind(tsrc)..., reverse(domainind(tsrc))...)`. +Optionally specify a `backend` and `allocator` for the underlying array operation. -See [`transpose`](@ref) for creating a new tensor and [`add_transpose!`](@ref) for a more general version. +See also [`transpose`](@ref) for creating a new tensor. 
""" +function LinearAlgebra.transpose!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap) + return transpose!(tdst, tsrc, _transpose_indices(tsrc)) +end @propagate_inbounds function LinearAlgebra.transpose!( - tdst::AbstractTensorMap, tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple = _transpose_indices(tsrc) + tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, + α::Number = One(), β::Number = Zero(), + backend::AbstractBackend = TO.DefaultBackend(), allocator = TO.DefaultAllocator() ) - return add_transpose!(tdst, tsrc, (p₁, p₂), One(), Zero()) + @boundscheck spacecheck_transform(transpose, tdst, tsrc, p) + transformer = treetransposer(tdst, tsrc, p) + return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend, allocator) end """ - transpose(tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple; - copy::Bool=false) - -> tdst::TensorMap + transpose( + tsrc, (p₁, p₂)::Index2Tuple; copy = false, + backend = DefaultBackend(), allocator = DefaultAllocator() + ) -> tdst::TensorMap Return tensor `tdst` obtained by transposing the indices of `tsrc`. The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively. The new index positions should be attainable without any indices crossing each other, i.e., -the permutation `(p₁..., reverse(p₂)...)` should constitute a cyclic permutation of `(codomainind(tsrc)..., reverse(domainind(tsrc))...)`. +the permutation `(p₁..., reverse(p₂)...)` should constitute a cyclic permutation of +`(codomainind(tsrc)..., reverse(domainind(tsrc))...)`. If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made. +Optionally specify a `backend` and `allocator` for the underlying array operation. -To permute into an existing destination, see [permute!](@ref) and [`add_permute!`](@ref) +See also [`transpose!`](@ref) for writing into an existing destination. 
""" function LinearAlgebra.transpose( t::AbstractTensorMap, p::Index2Tuple = _transpose_indices(t); - copy::Bool = false + copy::Bool = false, backend = TO.DefaultBackend(), allocator = TO.DefaultAllocator() ) - sectortype(t) === Trivial && return permute(t, p; copy) + sectortype(t) === Trivial && return permute(t, p; copy, backend, allocator) (!copy && p == (codomainind(t), domainind(t))) && return t # general case tdst = similar(t, promote_transpose(t), permute(space(t), p)) - return @inbounds transpose!(tdst, t, p) + return @inbounds transpose!(tdst, t, p, One(), Zero(), backend, allocator) end function LinearAlgebra.transpose( t::AdjointTensorMap, (p₁, p₂)::Index2Tuple = _transpose_indices(t); - copy::Bool = false + copy::Bool = false, backend = TO.DefaultBackend(), allocator = TO.DefaultAllocator() ) p₁′ = map(n -> adjointtensorindex(t, n), p₂) p₂′ = map(n -> adjointtensorindex(t, n), p₁) - return adjoint(transpose(adjoint(t), (p₁′, p₂′); copy = copy)) + return adjoint(transpose(adjoint(t), (p₁′, p₂′); copy, backend, allocator)) end +# ------------------- +# repartition(!) +# ------------------- """ - repartition!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap) -> tdst + repartition!(tdst, tsrc, α = 1, β = 0, [backend], [allocator]) -> tdst -Write into `tdst` the result of repartitioning the indices of `tsrc`. This is just a special -case of a transposition that only changes the number of in- and outgoing indices. +Compute `tdst = β * tdst + α * repartition(tsrc)`, writing the result into `tdst`. +This is a special case of `transpose!` that only changes the partition of indices between +codomain and domain, without changing their cyclic order. +Optionally specify a `backend` and `allocator` for the underlying array operation. -See [`repartition`](@ref) for creating a new tensor. +See also [`repartition`](@ref) for creating a new tensor. 
""" -@propagate_inbounds function repartition!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap) +@propagate_inbounds function repartition!( + tdst::AbstractTensorMap, tsrc::AbstractTensorMap, + α::Number = One(), β::Number = Zero(), + backend::AbstractBackend = TO.DefaultBackend(), allocator = TO.DefaultAllocator() + ) check_spacetype(tdst, tsrc) numind(tsrc) == numind(tdst) || throw(ArgumentError("tsrc and tdst should have an equal amount of indices")) all_inds = (codomainind(tsrc)..., reverse(domainind(tsrc))...) p₁ = ntuple(i -> all_inds[i], numout(tdst)) p₂ = reverse(ntuple(i -> all_inds[i + numout(tdst)], numin(tdst))) - return transpose!(tdst, tsrc, (p₁, p₂)) + return transpose!(tdst, tsrc, (p₁, p₂), α, β, backend, allocator) end """ repartition( - tsrc::AbstractTensorMap{T, S}, N₁::Int, N₂::Int; copy::Bool=false - ) where {T, S} -> tdst::AbstractTensorMap{T, S, N₁, N₂} + tsrc, N₁::Int, N₂::Int = numind(tsrc) - N₁; copy = false, + backend = DefaultBackend(), allocator = DefaultAllocator() + ) -> tdst -Return tensor `tdst` obtained by repartitioning the indices of `t`. -The codomain and domain of `tdst` correspond to the first `N₁` and last `N₂` spaces of `t`, respectively. +Return tensor `tdst` obtained by repartitioning the indices of `tsrc`. +The codomain and domain of `tdst` correspond to the first `N₁` and last `N₂` spaces of `tsrc`, +respectively. If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made. +Optionally specify a `backend` and `allocator` for the underlying array operation. -To repartition into an existing destination, see [repartition!](@ref). +See also [`repartition!`](@ref) for writing into an existing destination. 
""" @constprop :aggressive function repartition( - t::AbstractTensorMap, N₁::Int, N₂::Int = numind(t) - N₁; copy::Bool = false + t::AbstractTensorMap, N₁::Int, N₂::Int = numind(t) - N₁; + copy::Bool = false, backend = TO.DefaultBackend(), allocator = TO.DefaultAllocator() ) N₁ + N₂ == numind(t) || throw(ArgumentError("Invalid repartition: $(numind(t)) to ($N₁, $N₂)")) all_inds = (codomainind(t)..., reverse(domainind(t))...) p₁ = ntuple(i -> all_inds[i], N₁) p₂ = reverse(ntuple(i -> all_inds[i + N₁], N₂)) - return transpose(t, (p₁, p₂); copy) -end - -# Twist -function has_shared_twist(t, inds) - I = sectortype(t) - if BraidingStyle(I) == NoBraiding() - for i in inds - cs = sectors(space(t, i)) - all(isunit, cs) || throw(SectorMismatch(lazy"Cannot twist sectors $cs")) - end - return true - elseif BraidingStyle(I) == Bosonic() - return true - else - for i in inds - cs = sectors(space(t, i)) - all(isone ∘ twist, cs) || return false - end - return true - end -end - -""" - twist!(t::AbstractTensorMap, i::Int; inv::Bool=false) -> t - twist!(t::AbstractTensorMap, inds; inv::Bool=false) -> t - -Apply a twist to the `i`th index of `t`, or all indices in `inds`, storing the result in `t`. -If `inv=true`, use the inverse twist. - -See [`twist`](@ref) for creating a new tensor. -""" -function twist!(t::AbstractTensorMap, inds; inv::Bool = false) - if !all(in(allind(t)), inds) - msg = "Can't twist indices $inds of a tensor with only $(numind(t)) indices." - throw(ArgumentError(msg)) - end - (scalartype(t) <: Real && !(sectorscalartype(sectortype(t)) <: Real)) && - throw(ArgumentError("Can't in-place twist a real tensor with complex sector type")) - has_shared_twist(t, inds) && return t - - (scalartype(t) <: Real && !(sectorscalartype(sectortype(t)) <: Real)) && - throw(ArgumentError("No in-place `twist!` for a real tensor with complex sector type")) - - N₁ = numout(t) - for (f₁, f₂) in fusiontrees(t) - θ = prod(i -> i <= N₁ ? 
twist(f₁.uncoupled[i]) : twist(f₂.uncoupled[i - N₁]), inds) - inv && (θ = θ') - scale!(t[f₁, f₂], θ) - end - return t -end - -""" - twist(tsrc::AbstractTensorMap, i::Int; inv::Bool = false, copy::Bool = false) -> tdst - twist(tsrc::AbstractTensorMap, inds; inv::Bool = false, copy::Bool = false) -> tdst - -Apply a twist to the `i`th index of `tsrc` and return the result as a new tensor. -If `inv = true`, use the inverse twist. -If `copy = false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made. - -See [`twist!`](@ref) for storing the result in place. -""" -function twist(t::AbstractTensorMap, inds; inv::Bool = false, copy::Bool = false) - !copy && has_shared_twist(t, inds) && return t - tdst = similar(t, promote_twist(t)) - copy!(tdst, t) - return twist!(tdst, inds; inv) + return transpose(t, (p₁, p₂); copy, backend, allocator) end -# Methods which change the number of indices, implement using `Val(i)` for type inference -""" - insertleftunit(tsrc::AbstractTensorMap, i=numind(t) + 1; - conj=false, dual=false, copy=false) -> tdst - -Insert a trivial vector space, isomorphic to the underlying field, at position `i`, -which can be specified as an `Int` or as `Val(i)` for improved type stability. -More specifically, adds a left monoidal unit or its dual. - -If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made. - -See also [`insertrightunit`](@ref insertrightunit(::AbstractTensorMap, ::Val{i}) where {i}), -[`removeunit`](@ref removeunit(::AbstractTensorMap, ::Val{i}) where {i}). -""" -function insertleftunit( - t::AbstractTensorMap, ::Val{i} = Val(numind(t) + 1); - copy::Bool = false, conj::Bool = false, dual::Bool = false - ) where {i} - W = insertleftunit(space(t), Val(i); conj, dual) - if t isa TensorMap - return TensorMap{scalartype(t)}(copy ? 
Base.copy(t.data) : t.data, W) - else - tdst = similar(t, W) - for (c, b) in blocks(t) - copy!(block(tdst, c), b) - end - return tdst - end -end - -""" - insertrightunit(tsrc::AbstractTensorMap, i=numind(t); - conj=false, dual=false, copy=false) -> tdst - -Insert a trivial vector space, isomorphic to the underlying field, after position `i`, -which can be specified as an `Int` or as `Val(i)` for improved type stability. -More specifically, adds a right monoidal unit or its dual. - -If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made. - -See also [`insertleftunit`](@ref insertleftunit(::AbstractTensorMap, ::Val{i}) where {i}), -[`removeunit`](@ref removeunit(::AbstractTensorMap, ::Val{i}) where {i}). -""" -function insertrightunit( - t::AbstractTensorMap, ::Val{i} = Val(numind(t)); - copy::Bool = false, conj::Bool = false, dual::Bool = false - ) where {i} - W = insertrightunit(space(t), Val(i); conj, dual) - if t isa TensorMap - return TensorMap{scalartype(t)}(copy ? Base.copy(t.data) : t.data, W) - else - tdst = similar(t, W) - for (c, b) in blocks(t) - copy!(block(tdst, c), b) - end - return tdst - end -end - -""" - removeunit(tsrc::AbstractTensorMap, i; copy=false) -> tdst - -This removes a trivial tensor product factor at position `1 ≤ i ≤ N`, where `i` -can be specified as an `Int` or as `Val(i)` for improved type stability. -For this to work, that factor has to be isomorphic to the field of scalars. +#------------------------------------- +# Internal implementations +#------------------------------------- -If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made. 
+# find the scalartype after applying operations: take into account fusion and/or braiding +# might need to become Float or Complex to capture complex recoupling coefficients but don't alter precision +for (operation, manipulation) in ( + :flip => :sector, :twist => :braiding, + :transpose => :fusion, :permute => :sector, :braid => :sector, + ) + promote_op = Symbol(:promote_, operation) + manipulation_scalartype = Symbol(manipulation, :scalartype) -This operation undoes the work of [`insertleftunit`](@ref insertleftunit(::AbstractTensorMap, ::Val{i}) where {i}) -and [`insertrightunit`](@ref insertrightunit(::AbstractTensorMap, ::Val{i}) where {i}). -""" -function removeunit(t::AbstractTensorMap, ::Val{i}; copy::Bool = false) where {i} - W = removeunit(space(t), Val(i)) - if t isa TensorMap - return TensorMap{scalartype(t)}(copy ? Base.copy(t.data) : t.data, W) - else - tdst = similar(t, W) - for (c, b) in blocks(t) - copy!(block(tdst, c), b) - end - return tdst + @eval begin + $promote_op(t::AbstractTensorMap) = $promote_op(typeof(t)) + $promote_op(::Type{T}) where {T <: AbstractTensorMap} = + $promote_op(scalartype(T), sectortype(T)) + $promote_op(::Type{T}, ::Type{I}) where {T <: Number, I <: Sector} = + $manipulation_scalartype(I) <: Integer ? T : + $manipulation_scalartype(I) <: Real ? float(T) : complex(T) end end -# Fusing and splitting -# TODO: add functionality for easy fusing and splitting of tensor indices - -#------------------------------------- -# Full implementations based on `add` -#------------------------------------- spacecheck_transform(f, tdst::AbstractTensorMap, tsrc::AbstractTensorMap, args...) = spacecheck_transform(f, space(tdst), space(tsrc), args...) 
@noinline function spacecheck_transform(f, Vdst::TensorMapSpace, Vsrc::TensorMapSpace, p::Index2Tuple) @@ -447,82 +520,46 @@ end return nothing end - -""" - add_permute!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple, - α::Number, β::Number, backend::AbstractBackend...) - -Return the updated `tdst`, which is the result of adding `α * tsrc` to `tdst` after permuting -the indices of `tsrc` according to `(p₁, p₂)`. - -See also [`permute`](@ref), [`permute!`](@ref), [`add_braid!`](@ref), [`add_transpose!`](@ref). -""" -@propagate_inbounds function add_permute!( - tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, - α::Number, β::Number, backend::AbstractBackend... - ) - @boundscheck spacecheck_transform(permute, tdst, tsrc, p) - transformer = treepermuter(tdst, tsrc, p) - return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend...) -end - -""" - add_braid!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple, - levels::IndexTuple, α::Number, β::Number, backend::AbstractBackend...) - -Return the updated `tdst`, which is the result of adding `α * tsrc` to `tdst` after braiding -the indices of `tsrc` according to `(p₁, p₂)` and `levels`. - -See also [`braid`](@ref), [`braid!`](@ref), [`add_permute!`](@ref), [`add_transpose!`](@ref). -""" -@propagate_inbounds function add_braid!( - tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple, - α::Number, β::Number, backend::AbstractBackend... - ) - @boundscheck spacecheck_transform(braid, tdst, tsrc, p, levels) - levels1 = TupleTools.getindices(levels, codomainind(tsrc)) - levels2 = TupleTools.getindices(levels, domainind(tsrc)) - # TODO: arg order for tensormaps is different than for fusiontrees - transformer = treebraider(tdst, tsrc, p, (levels1, levels2)) - return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend...) 
-end - -""" - add_transpose!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple, - α::Number, β::Number, backend::AbstractBackend...) - -Return the updated `tdst`, which is the result of adding `α * tsrc` to `tdst` after transposing -the indices of `tsrc` according to `(p₁, p₂)`. - -See also [`transpose`](@ref), [`transpose!`](@ref), [`add_permute!`](@ref), [`add_braid!`](@ref). -""" -@propagate_inbounds function add_transpose!( - tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, - α::Number, β::Number, backend::AbstractBackend... - ) - @boundscheck spacecheck_transform(transpose, tdst, tsrc, p) - transformer = treetransposer(tdst, tsrc, p) - return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend...) -end +# Deprecated add_*! wrappers +# -------------------------- +Base.@deprecate( + add_permute!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, α::Number, β::Number, backend::AbstractBackend...), + permute!(tdst, tsrc, p, α, β, backend...) +) +Base.@deprecate( + add_braid!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple, α::Number, β::Number, backend::AbstractBackend...), + braid!(tdst, tsrc, p, levels, α, β, backend...) +) +Base.@deprecate( + add_transpose!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, α::Number, β::Number, backend::AbstractBackend...), + transpose!(tdst, tsrc, p, α, β, backend...) +) @propagate_inbounds function add_transform!( tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, transformer, - α::Number, β::Number, backend::AbstractBackend... + α::Number, β::Number, backend, allocator ) + # `permute` is used as a stand-in for all index rearrangements here: permute, braid, and + # transpose all produce the same destination space for a given permutation tuple `p`. @boundscheck spacecheck_transform(permute, tdst, tsrc, p) + # Three cases, from cheapest to most expensive: + # 1. 
trivial permutation: delegate to `add!` which handles α/β scaling directly + # 2. Trivial sector type: no fusion tree bookkeeping, call tensoradd! on the raw array + # 3. general case: iterate over (blocks of) fusion trees, potentially multi-threaded if p[1] === codomainind(tsrc) && p[2] === domainind(tsrc) add!(tdst, tsrc, α, β) else I = sectortype(tdst) if I === Trivial - add_trivial_kernel!(tdst, tsrc, p, transformer, α, β, backend...) + TO.tensoradd!(tdst[], tsrc[], p, false, α, β, backend, allocator) else - style = FusionStyle(I) - if use_threaded_transform(tdst, transformer) - add_kernel_threaded!(style, tdst, tsrc, p, transformer, α, β, backend...) + ntasks = use_threaded_transform(tdst, transformer) ? get_num_transformer_threads() : 1 + scheduler = ntasks == 1 ? SerialScheduler() : DynamicScheduler(; ntasks, split = :roundrobin) + if tdst isa TensorMap && tsrc isa TensorMap # unpack data fields to avoid specializing + add_transform_kernel!(tdst.data, tsrc.data, p, transformer, α, β, backend, allocator, scheduler) else - add_kernel_nonthreaded!(style, tdst, tsrc, p, transformer, α, β, backend...) + add_transform_kernel!(tdst, tsrc, p, transformer, α, β, backend, allocator, scheduler) end end end @@ -537,267 +574,172 @@ function use_threaded_transform(t::AbstractTensorMap, transformer) return get_num_transformer_threads() > 1 && dim(space(t)) > Strided.MINTHREADLENGTH end -# Trivial implementations -# ----------------------- -function add_trivial_kernel!(tdst, tsrc, p, transformer, α, β, backend...) - TO.tensoradd!(tdst[], tsrc[], p, false, α, β, backend...) - return nothing -end - -# Non-threaded implementations -# ---------------------------- -function add_kernel_nonthreaded!( - ::UniqueFusion, tdst, tsrc, p, transformer, α, β, backend... +function add_transform_kernel!( + tdst, tsrc, p, transformer, α, β, backend, allocator, scheduler ) - for (f₁, f₂) in fusiontrees(tsrc) - _add_transform_single!(tdst, tsrc, p, (f₁, f₂), transformer, α, β, backend...) 
+ I = sectortype(tdst) + if FusionStyle(I) === UniqueFusion() + # Abelian / unique-fusion: each source fusion tree pair (f₁, f₂) maps to exactly + # one destination pair (f₁′, f₂′) with a scalar coefficient. No mixing occurs. + tforeach(fusiontrees(tsrc); scheduler) do (f₁, f₂) + (f₁′, f₂′), coeff = transformer((f₁, f₂)) + @inbounds TO.tensoradd!( + tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, backend, allocator + ) + end + return nothing end - return nothing -end -function add_kernel_nonthreaded!( - ::UniqueFusion, tdst, tsrc, p, transformer::AbelianTreeTransformer, α, β, backend... - ) - for subtransformer in transformer.data - _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend...) + cp = TO.allocator_checkpoint!(allocator) + # Non-Abelian fusion: trees sharing the same set of uncoupled (external) sectors + # form a *fusion block* and mix under the transformation via a recoupling matrix U + # (rows = destination trees, columns = source trees). We iterate over blocks. + + # buffers have to be created without race condition: err on the side of caution + buffersz = 2 * buffersize(transformer) + generate_buffer = let lock = Threads.ReentrantLock(), allocator = allocator + () -> @lock lock TO.tensoralloc(typeof(data_dst), buffersz, Val(true), allocator) end - return nothing -end -function add_kernel_nonthreaded!(::FusionStyle, tdst, tsrc, p, transformer, α, β, backend...) - # preallocate buffers - buffers = allocate_buffers(tdst, tsrc, transformer) - for src in fusionblocks(tsrc) + OhMyThreads.@tasks for src in fusionblocks(tsrc) + # setup + OhMyThreads.@set scheduler = scheduler + OhMyThreads.@local buffer = generate_buffer() + + dst, U = transformer(src) + if length(src) == 1 - _add_transform_single!(tdst, tsrc, p, src, transformer, α, β, backend...) + # Degenerate block: single tree, U is a 1×1 scalar — skip the buffer + matmul. 
+ (f₁, f₂) = only(fusiontrees(src)) + (f₁′, f₂′) = only(fusiontrees(dst)) + @inbounds TO.tensoradd!( + tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * only(U), β, backend, allocator + ) else - _add_transform_multi!(tdst, tsrc, p, src, transformer, buffers, α, β, backend...) + # Multi-tree block: apply recoupling via a three-step pack → matmul → unpack. + # 1. Extract: flatten each source block into a column of buffer_src + # (shape blocksize × cols), using a trivial permutation so that the + # index layout is canonical before the matmul. + # 2. Recoupling: buffer_dst = buffer_src * U^T (blocksize × rows) + # 3. Insert: scatter columns of buffer_dst to destination blocks, + # applying the actual permutation p in the same step. + rows, cols = size(U) + sz_src = size(tsrc[first(fusiontrees(src))...]) + blocksize = prod(sz_src) + ptriv = (ntuple(identity, length(sz_src)), ()) + buffer_dst = StridedView(buffer, (blocksize, rows), (1, blocksize), 0) + buffer_src = StridedView(buffer, (blocksize, cols), (1, blocksize), blocksize * rows) + @inbounds for (i, (f₁, f₂)) in enumerate(fusiontrees(src)) + TO.tensoradd!( + sreshape(buffer_src[:, i], sz_src), tsrc[f₁, f₂], + ptriv, false, One(), Zero(), backend, allocator + ) + end + U′ = adapt_transformer(U, storagetype(tdst)) + mul!(buffer_dst, buffer_src, transpose(StridedView(U′))) + @inbounds for (i, (f₃, f₄)) in enumerate(fusiontrees(dst)) + TO.tensoradd!( + tdst[f₃, f₄], sreshape(buffer_dst[:, i], sz_src), + p, false, α, β, backend, allocator + ) + end + TO.tensorfree!(buffer, allocator) end end + TO.allocator_reset!(allocator, cp) return nothing end -# specialization in the case of TensorMap -function add_kernel_nonthreaded!( - ::FusionStyle, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend... 
- ) - # preallocate buffers - buffers = allocate_buffers(tdst, tsrc, transformer) - for subtransformer in transformer.data - # Special case without intermediate buffers whenever there is only a single block - if length(subtransformer[1]) == 1 - _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend...) - else - _add_transform_multi!(tdst, tsrc, p, subtransformer, buffers, α, β, backend...) - end - end - return nothing -end -# ambiguity resolution -function add_kernel_nonthreaded!( - ::UniqueFusion, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend... +# TensorMap specializations: operate directly on the flat data vector to avoid +# repeated dictionary lookups into t.data. The transformer has precomputed all +# StridedView descriptors (size, offset, strides) for each fusion tree block. +# No symmetry types left -- no repeated specialization needed +function add_transform_kernel!( + data_dst::DenseVector, data_src::DenseVector, p, transformer::AbelianTreeTransformer, + α, β, backend, allocator, scheduler ) - throw(ArgumentError("Cannot combine `GenericTreeTransformer` with `UniqueFusion`")) -end -# Threaded implementations -# ------------------------ -function add_kernel_threaded!( - ::UniqueFusion, tdst, tsrc, p, transformer, α, β, backend...; - ntasks::Int = get_num_transformer_threads() - ) - trees = fusiontrees(tsrc) - nblocks = length(trees) - counter = Threads.Atomic{Int}(1) - Threads.@sync for _ in 1:min(ntasks, nblocks) - Threads.@spawn begin - while true - local_counter = Threads.atomic_add!(counter, 1) - local_counter > nblocks && break - @inbounds (f₁, f₂) = trees[local_counter] - _add_transform_single!(tdst, tsrc, p, (f₁, f₂), transformer, α, β, backend...) - end - end + # Each entry is (coeff, struct_dst, struct_src) where struct_{dst,src} = (size, offset, strides) + # locating the block for one fusion tree pair inside the flat data vector. 
+ tforeach(transformer.data; scheduler) do (coeff, struct_dst, struct_src) + TO.tensoradd!( + StridedView(data_dst, struct_dst...), StridedView(data_src, struct_src...), + p, false, α * coeff, β, backend, allocator + ) end return nothing end -function add_kernel_threaded!( - ::UniqueFusion, tdst, tsrc, p, transformer::AbelianTreeTransformer, α, β, backend...; - ntasks::Int = get_num_transformer_threads() +function add_transform_kernel!( + data_dst::DenseVector, data_src::DenseVector, p, transformer::GenericTreeTransformer, + α, β, backend, allocator, scheduler ) - nblocks = length(transformer.data) - counter = Threads.Atomic{Int}(1) - Threads.@sync for _ in 1:min(ntasks, nblocks) - Threads.@spawn begin - while true - local_counter = Threads.atomic_add!(counter, 1) - local_counter > nblocks && break - @inbounds subtransformer = transformer.data[local_counter] - _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend...) - end - end + # Each entry covers one fusion block: + # U — recoupling matrix (rows = dst trees, cols = src trees) + # sz_{dst,src} — array shape of each block (same for all trees in the block) + # structs_{dst,src}[i] — (offset, strides) into the flat data vector for tree i + cp = TO.allocator_checkpoint!(allocator) + + # buffers have to be created without race condition: err on the side of caution + buffersz = 2 * buffersize(transformer) + generate_buffer = let lock = Threads.ReentrantLock(), allocator = allocator + () -> @lock lock TO.tensoralloc(typeof(data_dst), buffersz, Val(true), allocator) end - return nothing -end -function add_kernel_threaded!( - ::FusionStyle, tdst, tsrc, p, transformer, α, β, backend...; - ntasks::Int = get_num_transformer_threads() - ) - allblocks = fusionblocks(tsrc) - nblocks = length(allblocks) - - counter = Threads.Atomic{Int}(1) - Threads.@sync for _ in 1:min(ntasks, nblocks) - Threads.@spawn begin - # preallocate buffers for each task - buffers = allocate_buffers(tdst, tsrc, transformer) - - while true 
-                local_counter = Threads.atomic_add!(counter, 1)
-                local_counter > nblocks && break
-                @inbounds src = allblocks[local_counter]
-                if length(src) == 1
-                    _add_transform_single!(tdst, tsrc, p, src, transformer, α, β, backend...)
-                else
-                    _add_transform_multi!(tdst, tsrc, p, src, transformer, buffers, α, β, backend...)
-                end
+    OhMyThreads.@tasks for subtransformer in transformer.data
+        # setup
+        OhMyThreads.@set scheduler = scheduler
+        OhMyThreads.@local buffer = generate_buffer()
+        U, (sz_dst, structs_dst), (sz_src, structs_src) = subtransformer
+
+        if length(U) == 1
+            # Degenerate block with a single tree: no matmul needed.
+            coeff = only(U)
+            TO.tensoradd!(
+                StridedView(data_dst, sz_dst, only(structs_dst)...),
+                StridedView(data_src, sz_src, only(structs_src)...),
+                p, false, α * coeff, β, backend, allocator
+            )
+        else
+            # Multi-tree block: pack → recoupling matmul → unpack.
+            # buffer_src = source staging area, buffer_dst = destination staging area.
+            rows, cols = size(U)
+            blocksize = prod(sz_src)
+            ptriv = (ntuple(identity, length(sz_src)), ())
+            buffer_dst = StridedView(buffer, (blocksize, rows), (1, blocksize), 0)
+            buffer_src = StridedView(buffer, (blocksize, cols), (1, blocksize), blocksize * rows)
+
+            # 1. Extract: copy each source block into column i of buffer_src as a flat vector,
+            # using a trivial permutation so the layout is canonical before the matmul.
+ @inbounds for (i, struct_src_i) in enumerate(structs_src) + TO.tensoradd!( + sreshape(buffer_src[:, i], sz_src), StridedView(data_src, sz_src, struct_src_i...), + ptriv, false, One(), Zero(), backend, allocator + ) end - end - end - return nothing -end -# specialization in the case of TensorMap -function add_kernel_threaded!( - ::FusionStyle, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend...; - ntasks::Int = get_num_transformer_threads() - ) - nblocks = length(transformer.data) - - counter = Threads.Atomic{Int}(1) - Threads.@sync for _ in 1:min(ntasks, nblocks) - Threads.@spawn begin - # preallocate buffers for each task - buffers = allocate_buffers(tdst, tsrc, transformer) - - while true - local_counter = Threads.atomic_add!(counter, 1) - local_counter > nblocks && break - @inbounds subtransformer = transformer.data[local_counter] - if length(subtransformer[1]) == 1 - _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend...) - else - _add_transform_multi!(tdst, tsrc, p, subtransformer, buffers, α, β, backend...) - end + # 2. Recoupling: buffer_dst = buffer_src * U^T (each output tree is a linear + # combination of input trees weighted by the recoupling coefficients). + U′ = adapt_transformer(U, typeof(data_dst)) + mul!(buffer_dst, buffer_src, transpose(StridedView(U′))) + + # 3. Insert: scatter column i of buffer_dst into the destination, applying the + # actual index permutation p in the same tensoradd! call. 
+ @inbounds for (i, struct_dst_i) in enumerate(structs_dst) + TO.tensoradd!( + StridedView(data_dst, sz_dst, struct_dst_i...), sreshape(buffer_dst[:, i], sz_src), + p, false, α, β, backend, allocator + ) end + TO.tensorfree!(buffer, allocator) end end - - return nothing -end -# ambiguity resolution -function add_kernel_threaded!( - ::UniqueFusion, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend...; - ntasks::Int = get_num_transformer_threads() - ) - throw(ArgumentError("Cannot combine `GenericTreeTransformer` with `UniqueFusion`")) -end - - -# Auxiliary methods -# ----------------- -function _add_transform_single!(tdst, tsrc, p, (f₁, f₂)::FusionTreePair, transformer, α, β, backend...) - (f₁′, f₂′), coeff = transformer((f₁, f₂)) - @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, backend...) - return nothing -end -function _add_transform_single!(tdst, tsrc, p, src::FusionTreeBlock, transformer, α, β, backend...) - dst, U = transformer(src) - f₁, f₂ = only(fusiontrees(src)) - f₁′, f₂′ = only(fusiontrees(dst)) - coeff = only(U) - @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, backend...) - return nothing -end -function _add_transform_single!( - tdst, tsrc, p, (coeff, struct_dst, struct_src)::AbelianTransformerData, - α, β, backend... - ) - subblock_dst = StridedView(tdst.data, struct_dst...) - subblock_src = StridedView(tsrc.data, struct_src...) - TO.tensoradd!(subblock_dst, subblock_src, p, false, α * coeff, β, backend...) - return nothing -end -function _add_transform_single!( - tdst, tsrc, p, (basistransform, structs_dst, structs_src)::GenericTransformerData, - α, β, backend... - ) - struct_dst = (structs_dst[1], only(structs_dst[2])...) - struct_src = (structs_src[1], only(structs_src[2])...) - coeff = only(basistransform) - _add_transform_single!(tdst, tsrc, p, (coeff, struct_dst, struct_src), α, β, backend...) 
- return nothing -end - -function _add_transform_multi!(tdst, tsrc, p, src::FusionTreeBlock, transformer, (buffer1, buffer2), α, β, backend...) - dst, U = transformer(src) - rows, cols = size(U) - sz_src = size(tsrc[first(fusiontrees(src))...]) - blocksize = prod(sz_src) - matsize = ( - prod(TupleTools.getindices(sz_src, codomainind(tsrc))), - prod(TupleTools.getindices(sz_src, domainind(tsrc))), - ) - - # Filling up a buffer with contiguous data - buffer_src = StridedView(buffer2, (blocksize, cols), (1, blocksize), 0) - for (i, (f₁, f₂)) in enumerate(fusiontrees(src)) - subblock_src = sreshape(tsrc[f₁, f₂], matsize) - bufblock_src = sreshape(buffer_src[:, i], matsize) - copy!(bufblock_src, subblock_src) - end - - # Resummation into a second buffer using BLAS - buffer_dst = StridedView(buffer1, (blocksize, rows), (1, blocksize), 0) - mul!(buffer_dst, buffer_src, transpose(StridedView(U)), α, Zero()) - - # Filling up the output - for (i, (f₃, f₄)) in enumerate(fusiontrees(dst)) - subblock_dst = tdst[f₃, f₄] - bufblock_dst = sreshape(buffer_dst[:, i], sz_src) - TO.tensoradd!(subblock_dst, bufblock_dst, p, false, One(), β, backend...) - end - + TO.allocator_reset!(allocator, cp) return nothing end -function _add_transform_multi!( - tdst, tsrc, p, (U, (sz_dst, structs_dst), (sz_src, structs_src)), - (buffer1, buffer2), α, β, backend... 
- ) - rows, cols = size(U) - blocksize = prod(sz_src) - matsize = ( - prod(TupleTools.getindices(sz_src, codomainind(tsrc))), - prod(TupleTools.getindices(sz_src, domainind(tsrc))), - ) - - # Filling up a buffer with contiguous data - buffer_src = StridedView(buffer2, (blocksize, cols), (1, blocksize), 0) - for (i, struct_src) in enumerate(structs_src) - subblock_src = sreshape(StridedView(tsrc.data, sz_src, struct_src...), matsize) - bufblock_src = sreshape(buffer_src[:, i], matsize) - copy!(bufblock_src, subblock_src) - end - # Resummation into a second buffer using BLAS - buffer_dst = StridedView(buffer1, (blocksize, rows), (1, blocksize), 0) - mul!(buffer_dst, buffer_src, transpose(StridedView(U)), α, Zero()) - - # Filling up the output - for (i, struct_dst) in enumerate(structs_dst) - subblock_dst = StridedView(tdst.data, sz_dst, struct_dst...) - bufblock_dst = sreshape(buffer_dst[:, i], sz_src) - TO.tensoradd!(subblock_dst, bufblock_dst, p, false, One(), β, backend...) - end +""" + adapt_transformer(U::AbstractMatrix, ::Type{A}) - return nothing -end +Return a version of the basis transformation `U` that is compatible for storage type `A`. +Default is a no-op. +Backends (e.g. CUDA, AMDGPU) should overload this for their vector types to ensure the recoupling matrix `U` is on the correct device. 
+""" +adapt_transformer(U::AbstractMatrix, ::Type{A}) where {A} = U diff --git a/src/tensors/tensoroperations.jl b/src/tensors/tensoroperations.jl index 3fc79cf0c..375b63768 100644 --- a/src/tensors/tensoroperations.jl +++ b/src/tensors/tensoroperations.jl @@ -43,9 +43,9 @@ function TO.tensoradd!( if conjA A′ = adjoint(A) pA′ = adjointtensorindices(A, _canonicalize(pA, C)) - add_permute!(C, A′, pA′, α, β, backend) + permute!(C, A′, pA′, α, β, backend) else - add_permute!(C, A, _canonicalize(pA, C), α, β, backend) + permute!(C, A, _canonicalize(pA, C), α, β, backend) end return C end diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl index 30ec1de0f..f68378cc2 100644 --- a/src/tensors/treetransformers.jl +++ b/src/tensors/treetransformers.jl @@ -128,25 +128,6 @@ function repack_transformer_structure(structures::Dictionary, trees) return sz, strides_offsets end -function buffersize(transformer::GenericTreeTransformer) - return maximum(transformer.data; init = 0) do (basistransform, structures_dst, _) - return prod(structures_dst[1]) * size(basistransform, 1) - end -end - -function allocate_buffers( - tdst::TensorMap, tsrc::TensorMap, transformer::GenericTreeTransformer - ) - sz = buffersize(transformer) - return similar(tdst.data, sz), similar(tsrc.data, sz) -end -function allocate_buffers( - tdst::AbstractTensorMap, tsrc::AbstractTensorMap, transformer - ) - # be pessimistic and assume the worst for now - sz = dim(space(tsrc)) - return similar(storagetype(tdst), sz), similar(storagetype(tsrc), sz) -end function treetransformertype(Vdst, Vsrc) I = sectortype(Vdst) @@ -185,22 +166,17 @@ end return TreeTransformer(fusiontreebraider, p, Vdst, Vsrc) end -for (transform, treetransformer) in - ((:permute, :treepermuter), (:transpose, :treetransposer)) - @eval begin - function $treetransformer(::AbstractTensorMap, ::AbstractTensorMap, p::Index2Tuple) - return fusiontreetransform(f) = $transform(f, p) - end - function 
$treetransformer(tdst::TensorMap, tsrc::TensorMap, p::Index2Tuple) - return $treetransformer(space(tdst), space(tsrc), p) - end - @cached function $treetransformer( - Vdst::TensorMapSpace, Vsrc::TensorMapSpace, p::Index2Tuple - )::treetransformertype(Vdst, Vsrc) - fusiontreetransform(f) = $transform(f, p) - return TreeTransformer(fusiontreetransform, p, Vdst, Vsrc) - end - end +function treetransposer(::AbstractTensorMap, ::AbstractTensorMap, p::Index2Tuple) + return fusiontreetransform(f) = transpose(f, p) +end +function treetransposer(tdst::TensorMap, tsrc::TensorMap, p::Index2Tuple) + return treetransposer(space(tdst), space(tsrc), p) +end +@cached function treetransposer( + Vdst::TensorMapSpace, Vsrc::TensorMapSpace, p::Index2Tuple + )::treetransformertype(Vdst, Vsrc) + fusiontreetransform(f) = transpose(f, p) + return TreeTransformer(fusiontreetransform, p, Vdst, Vsrc) end # default cachestyle is GlobalLRUCache @@ -227,3 +203,9 @@ end function _transformer_weight((mat, structs_dst, structs_src)::GenericTransformerData) return length(mat) * prod(structs_dst[1]) end + +function buffersize(transformer::GenericTreeTransformer) + return maximum(transformer.data; init = 0) do (basistransform, structures_dst, _) + return prod(structures_dst[1]) * size(basistransform, 1) + end +end diff --git a/test/factorizations/svd.jl b/test/factorizations/svd.jl index cec5e1dda..bf61d9e5a 100644 --- a/test/factorizations/svd.jl +++ b/test/factorizations/svd.jl @@ -49,9 +49,9 @@ for V in spacelist end for T in eltypes, t in (randn(T, W, W), randn(T, W, W)') project_hermitian!(t) - vals = @constinferred LinearAlgebra.eigvals(t) - λmax = maximum(s -> maximum(abs, s), values(vals)) - λmin = minimum(s -> minimum(abs, s), values(vals)) + vals = @constinferred eigh_vals(t) + λmax = maximum(abs, vals) + λmin = minimum(abs, vals) @test cond(t) ≈ λmax / λmin end end diff --git a/test/mooncake/indexmanipulations.jl b/test/mooncake/indexmanipulations.jl index 4dd6413cf..390721d71 100644 --- 
a/test/mooncake/indexmanipulations.jl +++ b/test/mooncake/indexmanipulations.jl @@ -18,7 +18,7 @@ eltypes = (Float64, ComplexF64) hasbraiding = BraidingStyle(sectortype(eltype(V))) isa HasBraiding symmetricbraiding = BraidingStyle(sectortype(eltype(V))) isa SymmetricBraiding - symmetricbraiding && @timedtestset "add_permute!" begin + symmetricbraiding && @timedtestset "permute!" begin A = randn(T, V[1] ⊗ V[2] ← (V[3] ⊗ V[4] ⊗ V[5])') α = randn(T) β = randn(T) @@ -27,12 +27,12 @@ eltypes = (Float64, ComplexF64) for _ in 1:5 p = randindextuple(numind(A)) C = randn!(permute(A, p)) - Mooncake.TestUtils.test_rule(rng, TensorKit.add_permute!, C, A, p, α, β; atol, rtol, mode) + Mooncake.TestUtils.test_rule(rng, TensorKit.permute!, C, A, p, α, β; atol, rtol, mode) A = C end end - @timedtestset "add_transpose!" begin + @timedtestset "transpose!" begin A = randn(T, V[1] ⊗ V[2] ← (V[3] ⊗ V[4] ⊗ V[5])') α = randn(T) β = randn(T) @@ -41,18 +41,18 @@ eltypes = (Float64, ComplexF64) for _ in 1:2 p = randcircshift(numout(A), numin(A)) C = randn!(transpose(A, p)) - Mooncake.TestUtils.test_rule(rng, TensorKit.add_transpose!, C, A, p, One(), Zero(); atol, rtol, mode) - Mooncake.TestUtils.test_rule(rng, TensorKit.add_transpose!, C, A, p, α, β; atol, rtol, mode) + Mooncake.TestUtils.test_rule(rng, TensorKit.transpose!, C, A, p, One(), Zero(); atol, rtol, mode) + Mooncake.TestUtils.test_rule(rng, TensorKit.transpose!, C, A, p, α, β; atol, rtol, mode) if !(T <: Real) - Mooncake.TestUtils.test_rule(rng, TensorKit.add_transpose!, C, real(A), p, α, β; atol, rtol, mode) - Mooncake.TestUtils.test_rule(rng, TensorKit.add_transpose!, C, A, p, real(α), β; atol, rtol, mode) - Mooncake.TestUtils.test_rule(rng, TensorKit.add_transpose!, C, real(A), p, real(α), β; atol, rtol, mode) + Mooncake.TestUtils.test_rule(rng, TensorKit.transpose!, C, real(A), p, α, β; atol, rtol, mode) + Mooncake.TestUtils.test_rule(rng, TensorKit.transpose!, C, A, p, real(α), β; atol, rtol, mode) + 
Mooncake.TestUtils.test_rule(rng, TensorKit.transpose!, C, real(A), p, real(α), β; atol, rtol, mode) end A = C end end - hasbraiding && @timedtestset "add_braid!" begin + hasbraiding && @timedtestset "braid!" begin A = randn(T, V[1] ⊗ V[2] ← (V[3] ⊗ V[4] ⊗ V[5])') α = randn(T) β = randn(T) @@ -62,11 +62,11 @@ eltypes = (Float64, ComplexF64) p = randcircshift(numout(A), numin(A)) levels = Tuple(randperm(numind(A))) C = randn!(transpose(A, p)) - Mooncake.TestUtils.test_rule(rng, TensorKit.add_braid!, C, A, p, levels, α, β; atol, rtol, mode) + Mooncake.TestUtils.test_rule(rng, TensorKit.braid!, C, A, p, levels, α, β; atol, rtol, mode) if !(T <: Real) - Mooncake.TestUtils.test_rule(rng, TensorKit.add_braid!, C, real(A), p, levels, α, β; atol, rtol, mode) - Mooncake.TestUtils.test_rule(rng, TensorKit.add_braid!, C, A, p, levels, real(α), β; atol, rtol, mode) - Mooncake.TestUtils.test_rule(rng, TensorKit.add_braid!, C, A, p, levels, real(α), real(β); atol, rtol, mode) + Mooncake.TestUtils.test_rule(rng, TensorKit.braid!, C, real(A), p, levels, α, β; atol, rtol, mode) + Mooncake.TestUtils.test_rule(rng, TensorKit.braid!, C, A, p, levels, real(α), β; atol, rtol, mode) + Mooncake.TestUtils.test_rule(rng, TensorKit.braid!, C, A, p, levels, real(α), real(β); atol, rtol, mode) end A = C end diff --git a/test/tensors/indexmanipulations.jl b/test/tensors/indexmanipulations.jl index c38b182a2..836418b3f 100644 --- a/test/tensors/indexmanipulations.jl +++ b/test/tensors/indexmanipulations.jl @@ -129,6 +129,14 @@ for V in spacelist @tensor tb[a, b] := flip(t1, (1, 3))[x, y, a, z] * flip(t2, (2, 4))[y, b, z, x] @test flip(ta, (1, 2)) ≈ tb end + hasbraiding && !symmetricbraiding && @timedtestset "Braid AdjointTensorMap: adjoint identity" begin + t = rand(ComplexF64, V1 ⊗ V2 ← V3) + p = ((2,), (1, 3)) + levels = (1, 3, 2) + t1 = copy(braid(t', p, levels)) + t2 = braid(copy(t'), p, levels) + @test t1 ≈ t2 + end end TensorKit.empty_globalcaches!() end