From e882b8e50cad4a22f64a4d0f016ee37c72ecbe8b Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <kslimes@gmail.com>
Date: Fri, 17 Apr 2026 11:36:03 +0200
Subject: [PATCH 1/8] Bump minimum version of CUDA and cuTENSOR

---
 Project.toml                | 6 +++---
 test/cuda/factorizations.jl | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Project.toml b/Project.toml
index 28088e3de..83ee37582 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "TensorKit"
 uuid = "07d1fe3e-3e46-537d-9eac-e9e13d0d4cec"
-version = "0.16.3"
+version = "0.17.0"
 authors = ["Jutho Haegeman, Lukas Devos"]
 
 [deps]
@@ -41,7 +41,7 @@ projects = ["test", "docs"]
 [compat]
 Adapt = "4"
 AMDGPU = "2"
-CUDA = "5.9"
+CUDA = "6"
 ChainRulesCore = "1"
 Dictionaries = "0.4"
 FiniteDifferences = "0.12"
@@ -58,5 +58,5 @@ TensorKitSectors = "0.3.7"
 TensorOperations = "5.1"
 TupleTools = "1.5"
 VectorInterface = "0.4.8, 0.5"
-cuTENSOR = "2"
+cuTENSOR = "6"
 julia = "1.10"
diff --git a/test/cuda/factorizations.jl b/test/cuda/factorizations.jl
index 62e23c9df..d18672459 100644
--- a/test/cuda/factorizations.jl
+++ b/test/cuda/factorizations.jl
@@ -373,7 +373,7 @@ for V in spacelist
 
                 d1, d2 = dim(codomain(t)), dim(domain(t))
                 r = rank(t)
-                @test r == min(d1, d2)
+                @test r ≈ min(d1, d2)
                 @test typeof(r) == typeof(d1)
                 M = left_null(t)
                 @test @constinferred(rank(M)) + r ≈ d1

From 20fbbdc69acbd23b05850187958f5323c0d8a4a6 Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Tue, 21 Apr 2026 09:26:14 -0400
Subject: [PATCH 2/8] Use cuBLAS/cuSOLVER/cuRAND submodule names in CUDA ext

---
 ext/TensorKitCUDAExt/TensorKitCUDAExt.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl b/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl
index f5efb98bb..f7e87d16e 100644
--- a/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl
+++ b/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl
@@ -1,9 +1,9 @@
 module TensorKitCUDAExt
 
-using CUDA, CUDA.CUBLAS, CUDA.CUSOLVER, LinearAlgebra
+using CUDA, CUDA.cuBLAS, CUDA.cuSOLVER, CUDA.cuRAND, LinearAlgebra
 using CUDA: @allowscalar
 using cuTENSOR: cuTENSOR
-import CUDA: rand as curand, rand! as curand!, randn as curandn, randn! as curandn!
+import CUDA.cuRAND: rand as curand, rand! as curand!, randn as curandn, randn! as curandn!
 
 using TensorKit
 using TensorKit.Factorizations

From 66c1bc6e9dd40c993c7afb5e2277ee7694c42b91 Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Tue, 21 Apr 2026 09:27:41 -0400
Subject: [PATCH 3/8] Pin MatrixAlgebraKit and Mooncake to ksh/cuda6 via [sources]

---
 Project.toml      | 7 +++++++
 test/Project.toml | 2 ++
 2 files changed, 9 insertions(+)

diff --git a/Project.toml b/Project.toml
index 83ee37582..5230c5755 100644
--- a/Project.toml
+++ b/Project.toml
@@ -60,3 +60,10 @@ TupleTools = "1.5"
 VectorInterface = "0.4.8, 0.5"
 cuTENSOR = "6"
 julia = "1.10"
+
+[extras]
+Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6"
+
+[sources]
+MatrixAlgebraKit = {url = "https://github.com/QuantumKitHub/MatrixAlgebraKit.jl", rev = "ksh/cuda6"}
+Mooncake = {url = "https://github.com/chalk-lab/Mooncake.jl", rev = "ksh/cuda6"}
diff --git a/test/Project.toml b/test/Project.toml
index 18af8af80..9190343e9 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -29,6 +29,8 @@ cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1"
 
 [sources]
 TensorKit = {path = ".."}
+MatrixAlgebraKit = {url = "https://github.com/QuantumKitHub/MatrixAlgebraKit.jl", rev = "ksh/cuda6"}
+Mooncake = {url = "https://github.com/chalk-lab/Mooncake.jl", rev = "ksh/cuda6"}
 
 [compat]
 Aqua = "0.6, 0.7, 0.8"

From b7825aa167a4c35898d1a053b072e639c4625d80 Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Wed, 22 Apr 2026 01:45:48 -0400
Subject: [PATCH 4/8] Use cuRAND.rand/randn instead of CUDA.rand/randn in tests

---
 test/cuda/factorizations.jl | 100 ++++++++++++++++++------------------
 test/cuda/tensors.jl        |  88 +++++++++++++++----------------
 2 files changed, 94 insertions(+), 94 deletions(-)

diff --git a/test/cuda/factorizations.jl b/test/cuda/factorizations.jl
index d18672459..d7c8e0520 100644
--- a/test/cuda/factorizations.jl
+++ b/test/cuda/factorizations.jl
@@ -1,4 +1,4 @@
-using Adapt, CUDA, cuTENSOR
+using Adapt, CUDA, CUDA.cuRAND, cuTENSOR
 using Test, TestExtras
 using TensorKit
 using LinearAlgebra: LinearAlgebra
@@ -23,10 +23,10 @@ for V in spacelist
         @testset "QR decomposition" begin
             for T in eltypes,
                     t in (
-                        CUDA.rand(T, W, W), CUDA.rand(T, W, W)',
-                        CUDA.rand(T, (V1 ⊗ V2 ⊗ V3), (V4 ⊗ V5)'), CUDA.rand(T, (V1 ⊗ V2 ⊗ V3), (V4 ⊗ V5)')',
-                        CUDA.rand(T, (V1 ⊗ V2)', (V3 ⊗ V4 ⊗ V5)), CUDA.rand(T, (V1 ⊗ V2)', (V3 ⊗ V4 ⊗ V5))',
-                        DiagonalTensorMap(CUDA.rand(T, reduceddim(V1)), V1),
+                        cuRAND.rand(T, W, W), cuRAND.rand(T, W, W)',
+                        cuRAND.rand(T, (V1 ⊗ V2 ⊗ V3), (V4 ⊗ V5)'), cuRAND.rand(T, (V1 ⊗ V2 ⊗ V3), (V4 ⊗ V5)')',
+                        cuRAND.rand(T, (V1 ⊗ V2)', (V3 ⊗ V4 ⊗ V5)), cuRAND.rand(T, (V1 ⊗ V2)', (V3 ⊗ V4 ⊗ V5))',
+                        DiagonalTensorMap(cuRAND.rand(T, reduceddim(V1)), V1),
                     )
 
                 Q, R = @constinferred qr_full(t)
@@ -52,7 +52,7 @@ for V in spacelist
 
             # empty tensor
             for T in eltypes
-                t = CUDA.rand(T, V1 ⊗ V2, zerospace(V1))
+                t = cuRAND.rand(T, V1 ⊗ V2, zerospace(V1))
 
                 Q, R = @constinferred qr_full(t)
                 @test Q * R ≈ t
@@ -78,10 +78,10 @@ for V in spacelist
         @testset "LQ decomposition" begin
             for T in eltypes,
                     t in (
-                        CUDA.rand(T, W, W), CUDA.rand(T, W, W)',
-                        CUDA.rand(T, (V1 ⊗ V2), (V3 ⊗ V4 ⊗ V5)'), CUDA.rand(T, (V1 ⊗ V2), (V3 ⊗ V4 ⊗ V5)')',
-                        CUDA.rand(T, (V1 ⊗ V2 ⊗ V3)', (V4 ⊗ V5)), CUDA.rand(T, (V1 ⊗ V2 ⊗ V3)', (V4 ⊗ V5))',
-                        DiagonalTensorMap(CUDA.rand(T, reduceddim(V1)), V1),
+                        cuRAND.rand(T, W, W), cuRAND.rand(T, W, W)',
+                        cuRAND.rand(T, (V1 ⊗ V2), (V3 ⊗ V4 ⊗ V5)'), cuRAND.rand(T, (V1 ⊗ V2), (V3 ⊗ V4 ⊗ V5)')',
+                        cuRAND.rand(T, (V1 ⊗ V2 ⊗ V3)', (V4 ⊗ V5)), cuRAND.rand(T, (V1 ⊗ V2 ⊗ V3)', (V4 ⊗ V5))',
+                        DiagonalTensorMap(cuRAND.rand(T, reduceddim(V1)), V1),
                     )
 
                 L, Q = @constinferred lq_full(t)
@@ -103,7 +103,7 @@ for V in spacelist
 
             for T in eltypes
                 # empty tensor
-                t = CUDA.rand(T, zerospace(V1), V1 ⊗ V2)
+                t = cuRAND.rand(T, zerospace(V1), V1 ⊗ V2)
 
                 L, Q = @constinferred lq_full(t)
                 @test L * Q ≈ t
@@ -129,10 +129,10 @@ for V in spacelist
         @testset "Polar decomposition" begin
             @testset for T in eltypes,
                     t in (
-                        CUDA.rand(T, W, W),
-                        CUDA.rand(T, (V1 ⊗ V2 ⊗ V3), (V4 ⊗ V5)'),
-                        CUDA.rand(T, (V1 ⊗ V2)', (V3 ⊗ V4 ⊗ V5))',
-                        DiagonalTensorMap(CUDA.rand(T, reduceddim(V1)), V1),
+                        cuRAND.rand(T, W, W),
+                        cuRAND.rand(T, (V1 ⊗ V2 ⊗ V3), (V4 ⊗ V5)'),
+                        cuRAND.rand(T, (V1 ⊗ V2)', (V3 ⊗ V4 ⊗ V5))',
+                        DiagonalTensorMap(cuRAND.rand(T, reduceddim(V1)), V1),
                     )
 
                 @assert domain(t) ≾ codomain(t)
@@ -148,10 +148,10 @@ for V in spacelist
 
             @testset for T in eltypes,
                     t in (
-                        CUDA.rand(T, W, W),
-                        CUDA.rand(T, (V1 ⊗ V2), (V3 ⊗ V4 ⊗ V5)'),
-                        CUDA.rand(T, (V1 ⊗ V2 ⊗ V3)', (V4 ⊗ V5))',
-                        DiagonalTensorMap(CUDA.rand(T, reduceddim(V1)), V1),
+                        cuRAND.rand(T, W, W),
+                        cuRAND.rand(T, (V1 ⊗ V2), (V3 ⊗ V4 ⊗ V5)'),
+                        cuRAND.rand(T, (V1 ⊗ V2 ⊗ V3)', (V4 ⊗ V5))',
+                        DiagonalTensorMap(cuRAND.rand(T, reduceddim(V1)), V1),
                     )
 
                 @assert codomain(t) ≾ domain(t)
@@ -169,10 +169,10 @@ for V in spacelist
         @testset "SVD" begin
             for T in eltypes,
                     t in (
-                        CUDA.rand(T, W, W), CUDA.rand(T, W, W)',
-                        CUDA.rand(T, (V1 ⊗ V2 ⊗ V3), (V4 ⊗ V5)'), CUDA.rand(T, (V1 ⊗ V2)', (V3 ⊗ V4 ⊗ V5))',
-                        CUDA.rand(T, (V1 ⊗ V2), (V3 ⊗ V4 ⊗ V5)'), CUDA.rand(T, (V1 ⊗ V2 ⊗ V3)', (V4 ⊗ V5))',
-                        DiagonalTensorMap(CUDA.rand(T, reduceddim(V1)), V1),
+                        cuRAND.rand(T, W, W), cuRAND.rand(T, W, W)',
+                        cuRAND.rand(T, (V1 ⊗ V2 ⊗ V3), (V4 ⊗ V5)'), cuRAND.rand(T, (V1 ⊗ V2)', (V3 ⊗ V4 ⊗ V5))',
+                        cuRAND.rand(T, (V1 ⊗ V2), (V3 ⊗ V4 ⊗ V5)'), cuRAND.rand(T, (V1 ⊗ V2 ⊗ V3)', (V4 ⊗ V5))',
+                        DiagonalTensorMap(cuRAND.rand(T, reduceddim(V1)), V1),
                     )
 
                 u, s, vᴴ = @constinferred svd_full(t)
@@ -220,8 +220,8 @@ for V in spacelist
 
             # empty tensor
             for T in eltypes, t in (
-                        CUDA.rand(T, W, zerospace(V1)),
-                        CUDA.rand(T, zerospace(V1), W),
+                        cuRAND.rand(T, W, zerospace(V1)),
+                        cuRAND.rand(T, zerospace(V1), W),
                     )
                 U, S, Vᴴ = @constinferred svd_full(t)
                 @test U * S * Vᴴ ≈ t
@@ -237,10 +237,10 @@ for V in spacelist
         @testset "truncated SVD" begin
             for T in eltypes,
                     t in (
-                        CUDA.randn(T, W, W), CUDA.randn(T, W, W)',
-                        CUDA.randn(T, (V1 ⊗ V2 ⊗ V3), (V4 ⊗ V5)'), CUDA.randn(T, (V1 ⊗ V2)', (V3 ⊗ V4 ⊗ V5))',
-                        CUDA.randn(T, (V1 ⊗ V2), (V3 ⊗ V4 ⊗ V5)'), CUDA.randn(T, (V1 ⊗ V2 ⊗ V3)', (V4 ⊗ V5))',
-                        DiagonalTensorMap(CUDA.randn(T, reduceddim(V1)), V1),
+                        cuRAND.randn(T, W, W), cuRAND.randn(T, W, W)',
+                        cuRAND.randn(T, (V1 ⊗ V2 ⊗ V3), (V4 ⊗ V5)'), cuRAND.randn(T, (V1 ⊗ V2)', (V3 ⊗ V4 ⊗ V5))',
+                        cuRAND.randn(T, (V1 ⊗ V2), (V3 ⊗ V4 ⊗ V5)'), cuRAND.randn(T, (V1 ⊗ V2 ⊗ V3)', (V4 ⊗ V5))',
+                        DiagonalTensorMap(cuRAND.randn(T, reduceddim(V1)), V1),
                     )
 
                 @constinferred normalize!(t)
@@ -305,10 +305,10 @@ for V in spacelist
         @testset "Eigenvalue decomposition" begin
             for T in eltypes,
                     t in (
-                        CUDA.rand(T, V1, V1),
-                        CUDA.rand(T, W, W),
-                        CUDA.rand(T, W, W)',
-                        # DiagonalTensorMap(CUDA.rand(T, reduceddim(V1)), V1),
+                        cuRAND.rand(T, V1, V1),
+                        cuRAND.rand(T, W, W),
+                        cuRAND.rand(T, W, W)',
+                        # DiagonalTensorMap(cuRAND.rand(T, reduceddim(V1)), V1),
                     )
 
                 d, v = @constinferred eig_full(t)
@@ -365,10 +365,10 @@ for V in spacelist
         @testset "Condition number and rank" begin
             for T in eltypes,
                     t in (
-                        CUDA.rand(T, W, W), CUDA.rand(T, W, W)',
-                        CUDA.rand(T, (V1 ⊗ V2 ⊗ V3), (V4 ⊗ V5)'), CUDA.rand(T, (V1 ⊗ V2)', (V3 ⊗ V4 ⊗ V5))',
-                        CUDA.rand(T, (V1 ⊗ V2), (V3 ⊗ V4 ⊗ V5)'), CUDA.rand(T, (V1 ⊗ V2 ⊗ V3)', (V4 ⊗ V5))',
-                        DiagonalTensorMap(CUDA.rand(T, reduceddim(V1)), V1),
+                        cuRAND.rand(T, W, W), cuRAND.rand(T, W, W)',
+                        cuRAND.rand(T, (V1 ⊗ V2 ⊗ V3), (V4 ⊗ V5)'), cuRAND.rand(T, (V1 ⊗ V2)', (V3 ⊗ V4 ⊗ V5))',
+                        cuRAND.rand(T, (V1 ⊗ V2), (V3 ⊗ V4 ⊗ V5)'), cuRAND.rand(T, (V1 ⊗ V2 ⊗ V3)', (V4 ⊗ V5))',
+                        DiagonalTensorMap(cuRAND.rand(T, reduceddim(V1)), V1),
                     )
 
                 d1, d2 = dim(codomain(t)), dim(domain(t))
@@ -385,15 +385,15 @@ for V in spacelist
                 @test @constinferred(cond(u)) ≈ one(real(T))
                 @test @constinferred(rank(u)) == dim(V1 ⊗ V2)
 
-                t = CUDA.rand(T, zerospace(V1), W)
+                t = cuRAND.rand(T, zerospace(V1), W)
                 @test rank(t) == 0
-                t2 = CUDA.rand(T, zerospace(V1) * zerospace(V2), zerospace(V1) * zerospace(V2))
+                t2 = cuRAND.rand(T, zerospace(V1) * zerospace(V2), zerospace(V1) * zerospace(V2))
                 @test rank(t2) == 0
                 @test cond(t2) == 0.0
             end
             for T in eltypes, t in (
-                        CUDA.rand(T, W, W),
-                        CUDA.rand(T, W, W)',
+                        cuRAND.rand(T, W, W),
+                        cuRAND.rand(T, W, W)',
                     )
                 project_hermitian!(t)
                 vals = @constinferred LinearAlgebra.eigvals(t)
@@ -406,10 +406,10 @@ for V in spacelist
         @testset "Hermitian projections" begin
             for T in eltypes,
                     t in (
-                        CUDA.rand(T, V1, V1),
-                        CUDA.rand(T, W, W),
-                        CUDA.rand(T, W, W)',
-                        DiagonalTensorMap(CUDA.rand(T, reduceddim(V1)), V1),
+                        cuRAND.rand(T, V1, V1),
+                        cuRAND.rand(T, W, W),
+                        cuRAND.rand(T, W, W)',
+                        DiagonalTensorMap(cuRAND.rand(T, reduceddim(V1)), V1),
                     )
                 normalize!(t)
                 noisefactor = eps(real(T))^(3 / 4)
@@ -439,10 +439,10 @@ for V in spacelist
         @testset "Isometric projections" begin
             for T in eltypes,
                     t in (
-                        CUDA.randn(T, W, W),
-                        CUDA.randn(T, W, W)',
-                        CUDA.randn(T, (V1 ⊗ V2 ⊗ V3), (V4 ⊗ V5)'),
-                        CUDA.randn(T, (V1 ⊗ V2)', (V3 ⊗ V4 ⊗ V5))',
+                        cuRAND.randn(T, W, W),
+                        cuRAND.randn(T, W, W)',
+                        cuRAND.randn(T, (V1 ⊗ V2 ⊗ V3), (V4 ⊗ V5)'),
+                        cuRAND.randn(T, (V1 ⊗ V2)', (V3 ⊗ V4 ⊗ V5))',
                     )
                 t2 = project_isometric(t)
                 @test isisometric(t2)
@@ -457,7 +457,7 @@ for V in spacelist
 
                 # test that t2 is closer to A then any other isometry
                 for k in 1:10
-                    δt = CUDA.randn!(similar(t))
+                    δt = cuRAND.randn!(similar(t))
                     t3 = project_isometric(t + δt / 100)
                     @test norm(t - t3) > norm(t - t2)
                 end
diff --git a/test/cuda/tensors.jl b/test/cuda/tensors.jl
index c88e98b45..f33c40c44 100644
--- a/test/cuda/tensors.jl
+++ b/test/cuda/tensors.jl
@@ -1,4 +1,4 @@
-using Adapt, CUDA, cuTENSOR
+using Adapt, CUDA, CUDA.cuRAND, cuTENSOR
 using Test, TestExtras
 using TensorKit, Combinatorics
 ad = adapt(Array)
@@ -20,7 +20,7 @@ for V in spacelist
         @timedtestset "Basic tensor properties" begin
             W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5
             # test default pass-throughs
-            for f in (CUDA.zeros, CUDA.ones, CUDA.rand, CUDA.randn)
+            for f in (CUDA.zeros, CUDA.ones, cuRAND.rand, cuRAND.randn)
                 t = @constinferred f(W)
                 @test scalartype(t) == Float64
                 @test codomain(t) == W
@@ -44,7 +44,7 @@ for V in spacelist
                 @test domain(t) == one(W)
                 @test typeof(t) == TensorMap{Float64, spacetype(t), 5, 0, CuVector{Float64, CUDA.DeviceMemory}}
             end
-            for f! in (CUDA.rand!, CUDA.randn!)
+            for f! in (cuRAND.rand!, cuRAND.randn!)
                 t = @constinferred CUDA.zeros(W)
                 f!(t)
                 @test scalartype(t) == Float64
@@ -113,7 +113,7 @@ for V in spacelist
         @timedtestset "Tensor Dict conversion" begin
             W = V1 ⊗ V2 ← (V3 ⊗ V4 ⊗ V5)'
             for T in (Int, Float32, ComplexF64)
-                t = @constinferred CUDA.rand(T, W)
+                t = @constinferred cuRAND.rand(T, W)
                 d = convert(Dict, t)
                 @test convert(Dict, TensorKit.to_cpu(t)) == d
             end
@@ -121,7 +121,7 @@ for V in spacelist
         symmetricbraiding && @timedtestset "Basic linear algebra" begin
             W = V1 ⊗ V2 ← (V3 ⊗ V4 ⊗ V5)'
             for T in (Float32, ComplexF64)
-                t = @constinferred CUDA.rand(T, W)
+                t = @constinferred cuRAND.rand(T, W)
                 @test scalartype(t) == T
                 @test space(t) == W
                 @test space(t') == W'
@@ -171,7 +171,7 @@ for V in spacelist
         @timedtestset "Trivial space insertion and removal" begin
             W = V1 ⊗ V2 ← (V3 ⊗ V4 ⊗ V5)'
             for T in (Float32, ComplexF64)
-                t = @constinferred CUDA.rand(T, W)
+                t = @constinferred cuRAND.rand(T, W)
                 t2 = @constinferred insertleftunit(t)
                 @test t2 == @constinferred insertrightunit(t)
                 @test numind(t2) == numind(t) + 1
@@ -204,8 +204,8 @@ for V in spacelist
             @timedtestset "Basic linear algebra: test via CPU" begin
                 W = V1 ⊗ V2 ⊗ V3 ← (V4 ⊗ V5)'
                 for T in (Float32, ComplexF64)
-                    t = CUDA.rand(T, W)
-                    t2 = @constinferred CUDA.rand!(similar(t))
+                    t = cuRAND.rand(T, W)
+                    t2 = @constinferred cuRAND.rand!(similar(t))
                     α = rand(T)
                     @test norm(t, 2) ≈ norm(TensorKit.to_cpu(t), 2)
                     @test dot(t2, t) ≈ dot(TensorKit.to_cpu(t2), TensorKit.to_cpu(t))
@@ -216,7 +216,7 @@ for V in spacelist
             @timedtestset "Real and imaginary parts" begin
                 W = V1 ⊗ V2
                 for T in (Float64, ComplexF64, ComplexF32)
-                    t = @constinferred CUDA.randn(T, W, W)
+                    t = @constinferred cuRAND.randn(T, W, W)
 
                     tr = @constinferred real(t)
                     @test scalartype(tr) <: Real
@@ -241,7 +241,7 @@ for V in spacelist
         end
         @timedtestset "Tensor conversion" begin
             W = V1 ⊗ V2
-            t = @constinferred CUDA.randn(W ← W)
+            t = @constinferred cuRAND.randn(W ← W)
             @test typeof(convert(typeof(t), t')) == typeof(t)
             @test typeof(TensorKit.to_cpu(t')) == typeof(TensorKit.to_cpu(t)')
             tc = complex(t)
@@ -253,7 +253,7 @@ for V in spacelist
         end
         #=@timedtestset "diag/diagm" begin
             W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
-            t = CUDA.randn(ComplexF64, W)
+            t = cuRAND.randn(ComplexF64, W)
             d = LinearAlgebra.diag(t)
             # TODO find a way to use CUDA here
             D = LinearAlgebra.diagm(codomain(t), domain(t), d)
@@ -262,8 +262,8 @@ for V in spacelist
         end=#
         symmetricbraiding && @timedtestset "Permutations: test via inner product invariance" begin
             W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5
-            t = CUDA.rand(ComplexF64, W)
-            t′ = CUDA.randn!(similar(t))
+            t = cuRAND.rand(ComplexF64, W)
+            t′ = cuRAND.randn!(similar(t))
             for k in 0:5
                 for p in permutations(1:5)
                     p1 = ntuple(n -> p[n], k)
@@ -287,7 +287,7 @@ for V in spacelist
         end
         symmetricbraiding && @timedtestset "Permutations: test via CPU" begin
             W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5
-            t = CUDA.rand(ComplexF64, W)
+            t = cuRAND.rand(ComplexF64, W)
             for k in 0:5
                 for p in permutations(1:5)
                     p1 = ntuple(n -> p[n], k)
@@ -303,7 +303,7 @@ for V in spacelist
             end
         end
         symmetricbraiding && @timedtestset "Full trace: test self-consistency" begin
-            t = CUDA.rand(ComplexF64, V1 ⊗ V2' ⊗ V2 ⊗ V1')
+            t = cuRAND.rand(ComplexF64, V1 ⊗ V2' ⊗ V2 ⊗ V1')
             CUDA.@allowscalar begin
                 t2 = permute(t, ((1, 2), (4, 3)))
                 s = @constinferred tr(t2)
@@ -323,14 +323,14 @@ for V in spacelist
             @test ss ≈ s3
         end
         symmetricbraiding && @timedtestset "Partial trace: test self-consistency" begin
-            t = CUDA.rand(ComplexF64, V1 ⊗ V2' ⊗ V3 ⊗ V2 ⊗ V1' ⊗ V3')
+            t = cuRAND.rand(ComplexF64, V1 ⊗ V2' ⊗ V3 ⊗ V2 ⊗ V1' ⊗ V3')
             @tensor t2[a, b] := t[c, d, b, d, c, a]
             @tensor t4[a, b, c, d] := t[d, e, b, e, c, a]
             @tensor t5[a, b] := t4[a, b, c, c]
             @test t2 ≈ t5
         end
         symmetricbraiding && @timedtestset "Trace: test via conversion" begin
-            t = CUDA.rand(ComplexF64, V1 ⊗ V2' ⊗ V3 ⊗ V2 ⊗ V1' ⊗ V3')
+            t = cuRAND.rand(ComplexF64, V1 ⊗ V2' ⊗ V3 ⊗ V2 ⊗ V1' ⊗ V3')
             CUDA.@allowscalar begin
                 @tensor t2[a, b] := t[c, d, b, d, c, a]
                 @tensor t3[a, b] := ad(t)[c, d, b, d, c, a]
@@ -338,8 +338,8 @@ for V in spacelist
             @test t3 ≈ ad(t2)
         end
         symmetricbraiding && @timedtestset "Trace and contraction" begin
-            t1 = CUDA.rand(ComplexF64, V1 ⊗ V2 ⊗ V3)
-            t2 = CUDA.rand(ComplexF64, V2' ⊗ V4 ⊗ V1')
+            t1 = cuRAND.rand(ComplexF64, V1 ⊗ V2 ⊗ V3)
+            t2 = cuRAND.rand(ComplexF64, V2' ⊗ V4 ⊗ V1')
             CUDA.@allowscalar begin
                 t3 = t1 ⊗ t2
                 @tensor ta[a, b] := t1[x, y, a] * t2[y, b, x]
@@ -349,11 +349,11 @@ for V in spacelist
         end
         #=if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
             @timedtestset "Tensor contraction: test via CPU" begin
-                dA1 = CUDA.randn(ComplexF64, V1' * V2', V3')
-                dA2 = CUDA.randn(ComplexF64, V3 * V4, V5)
-                drhoL = CUDA.randn(ComplexF64, V1, V1)
-                drhoR = CUDA.randn(ComplexF64, V5, V5)' # test adjoint tensor
-                dH = CUDA.randn(ComplexF64, V2 * V4, V2 * V4)
+                dA1 = cuRAND.randn(ComplexF64, V1' * V2', V3')
+                dA2 = cuRAND.randn(ComplexF64, V3 * V4, V5)
+                drhoL = cuRAND.randn(ComplexF64, V1, V1)
+                drhoR = cuRAND.randn(ComplexF64, V5, V5)' # test adjoint tensor
+                dH = cuRAND.randn(ComplexF64, V2 * V4, V2 * V4)
                 @tensor dHrA12[a, s1, s2, c] := drhoL[a, a'] * conj(dA1[a', t1, b]) *
                     dA2[b, t2, c'] * drhoR[c', c] *
                     dH[s1, s2, t1, t2]
@@ -364,7 +364,7 @@ for V in spacelist
             end
         end=# # doesn't yet work because of AdjointTensor
         BraidingStyle(I) isa HasBraiding && @timedtestset "Index flipping: test flipping inverse" begin
-            t = CUDA.rand(ComplexF64, V1 ⊗ V2 ⊗ V3 ← (V4 ⊗ V5)')
+            t = cuRAND.rand(ComplexF64, V1 ⊗ V2 ⊗ V3 ← (V4 ⊗ V5)')
             for i in 1:5
                 CUDA.@allowscalar begin
                     @test t ≈ flip(flip(t, i), i; inv = true)
@@ -373,7 +373,7 @@ for V in spacelist
             end
         end
         #=@timedtestset "Index flipping: test via explicit flip" begin
-            t = CUDA.rand(ComplexF64, V1 ⊗ V1' ← V1' ⊗ V1)
+            t = cuRAND.rand(ComplexF64, V1 ⊗ V1' ← V1' ⊗ V1)
             F1 = unitary(flip(V1), V1)
 
             CUDA.@allowscalar begin
@@ -388,8 +388,8 @@ for V in spacelist
             end
         end
         @timedtestset "Index flipping: test via contraction" begin
-            t1 = CUDA.rand(ComplexF64, V1 ⊗ V2 ⊗ V3 ← V4)
-            t2 = CUDA.rand(ComplexF64, V2' ⊗ V5 ← V4' ⊗ V1)
+            t1 = cuRAND.rand(ComplexF64, V1 ⊗ V2 ⊗ V3 ← V4)
+            t2 = cuRAND.rand(ComplexF64, V2' ⊗ V5 ← V4' ⊗ V1)
             CUDA.@allowscalar begin
                 @tensor ta[a, b] := t1[x, y, a, z] * t2[y, b, z, x]
                 @tensor tb[a, b] := flip(t1, 1)[x, y, a, z] * flip(t2, 4)[y, b, z, x]
@@ -417,9 +417,9 @@ for V in spacelist
             W1 = V1 ⊗ V2 ⊗ V3
             W2 = (V4 ⊗ V5)'
             for T in (Float64, ComplexF64)
-                t1 = CUDA.rand(T, W1, W1)
-                t2 = CUDA.rand(T, W2, W2)
-                t = CUDA.rand(T, W1, W2)
+                t1 = cuRAND.rand(T, W1, W1)
+                t2 = cuRAND.rand(T, W2, W2)
+                t = cuRAND.rand(T, W1, W2)
                 @test t1 * (t1 \ t) ≈ t
                 @test (t / t2) * t2 ≈ t
                 @test t1 \ one(t1) ≈ inv(t1)
@@ -435,9 +435,9 @@ for V in spacelist
             W1 = V1 ⊗ V2 ⊗ V3
             W2 = (V4 ⊗ V5)'
             for T in (Float32, Float64, ComplexF32, ComplexF64)
-                t1 = CUDA.rand(T, W1, W1)
-                t2 = CUDA.rand(T, W2, W2)
-                t = CUDA.rand(T, W1, W2)
+                t1 = cuRAND.rand(T, W1, W1)
+                t2 = cuRAND.rand(T, W2, W2)
+                t = cuRAND.rand(T, W1, W2)
                 ht1 = TensorKit.to_cpu(t1)
                 ht2 = TensorKit.to_cpu(t2)
                 ht = TensorKit.to_cpu(t)
@@ -467,7 +467,7 @@ for V in spacelist
         symmetricbraiding && @timedtestset "Tensor functions" begin
             W = V1 ⊗ V2
             for T in (Float64, ComplexF64)
-                t = project_hermitian!(CUDA.randn(T, W, W))
+                t = project_hermitian!(cuRAND.randn(T, W, W))
                 s = dim(W)
                 #@test (@constinferred sqrt(t))^2 ≈ t
                 #@test TensorKit.to_cpu(sqrt(t)) ≈ sqrt(TensorKit.to_cpu(t))
@@ -510,11 +510,11 @@ for V in spacelist
         # Sylvester not defined for CUDA
         # @timedtestset "Sylvester equation" begin
         #     for T in (Float32, ComplexF64)
-        #         tA = CUDA.rand(T, V1 ⊗ V3, V1 ⊗ V3)
-        #         tB = CUDA.rand(T, V2 ⊗ V4, V2 ⊗ V4)
+        #         tA = cuRAND.rand(T, V1 ⊗ V3, V1 ⊗ V3)
+        #         tB = cuRAND.rand(T, V2 ⊗ V4, V2 ⊗ V4)
         #         tA = 3 // 2 * leftorth(tA; alg=Polar())[1]
         #         tB = 1 // 5 * leftorth(tB; alg=Polar())[1]
-        #         tC = CUDA.rand(T, V1 ⊗ V3, V2 ⊗ V4)
+        #         tC = cuRAND.rand(T, V1 ⊗ V3, V2 ⊗ V4)
         #         t = @constinferred sylvester(tA, tB, tC)
         #         @test codomain(t) == V1 ⊗ V3
         #         @test domain(t) == V2 ⊗ V4
@@ -530,16 +530,16 @@ for V in spacelist
         # TODO
         @timedtestset "Tensor product: test via norm preservation" begin
             for T in (ComplexF64,) # Float32 case broken because of cuTENSOR
-                t1 = CUDA.rand(T, V1, V5')
-                t2 = CUDA.rand(T, V2 ⊗ V3, V4')
+                t1 = cuRAND.rand(T, V1, V5')
+                t2 = cuRAND.rand(T, V2 ⊗ V3, V4')
                 t = @constinferred (t1 ⊗ t2)
                 @test norm(t) ≈ norm(t1) * norm(t2)
             end
         end
         symmetricbraiding && @timedtestset "Tensor product: test via conversion" begin
             for T in (Float32, ComplexF64)
-                t1 = CUDA.rand(T, V1, V5')
-                t2 = CUDA.rand(T, V2 ⊗ V3, V4')
+                t1 = cuRAND.rand(T, V1, V5')
+                t2 = cuRAND.rand(T, V2 ⊗ V3, V4')
                 d1 = dim(codomain(t1))
                 d2 = dim(codomain(t2))
                 d3 = dim(domain(t1))
@@ -551,8 +551,8 @@ for V in spacelist
         end
         symmetricbraiding && @timedtestset "Tensor product: test via tensor contraction" begin
             for T in (Float32, ComplexF64)
-                t1 = CUDA.rand(T, V1, V5')
-                t2 = CUDA.rand(T, V2 ⊗ V3, V4')
+                t1 = cuRAND.rand(T, V1, V5')
+                t2 = cuRAND.rand(T, V2 ⊗ V3, V4')
                 t = @constinferred (t1 ⊗ t2)
                 CUDA.@allowscalar begin
                     @tensor t′[1 2 3; 4 5] := t1[1; 4] * t2[2 3; 5]

From 2a34113cd38d7ae828952c923bd01c69ef11b950 Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Fri, 24 Apr 2026 08:15:47 -0400
Subject: [PATCH 5/8] Remove ksh/cuda6 pins and LD_LIBRARY_PATH unset; bump Mooncake compat

---
 .buildkite/pipeline.yml | 2 --
 Project.toml            | 9 +--------
 test/Project.toml       | 2 --
 3 files changed, 1 insertion(+), 12 deletions(-)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 347502ebb..f02740b00 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -14,8 +14,6 @@ steps:
     agents:
       queue: "juliagpu"
       cuda: "*"
-    commands: |
-      unset LD_LIBRARY_PATH
     if: build.message !~ /\[skip tests\]/
     timeout_in_minutes: 90
     matrix:
diff --git a/Project.toml b/Project.toml
index 5230c5755..5555e7b96 100644
--- a/Project.toml
+++ b/Project.toml
@@ -48,7 +48,7 @@ FiniteDifferences = "0.12"
 LRUCache = "1.0.2"
 LinearAlgebra = "1"
 MatrixAlgebraKit = "0.6.5"
-Mooncake = "0.5"
+Mooncake = "0.5.27"
 OhMyThreads = "0.8.0"
 Printf = "1"
 Random = "1"
@@ -60,10 +60,3 @@ TupleTools = "1.5"
 VectorInterface = "0.4.8, 0.5"
 cuTENSOR = "6"
 julia = "1.10"
-
-[extras]
-Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6"
-
-[sources]
-MatrixAlgebraKit = {url = "https://github.com/QuantumKitHub/MatrixAlgebraKit.jl", rev = "ksh/cuda6"}
-Mooncake = {url = "https://github.com/chalk-lab/Mooncake.jl", rev = "ksh/cuda6"}
diff --git a/test/Project.toml b/test/Project.toml
index 9190343e9..18af8af80 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -29,8 +29,6 @@ cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1"
 
 [sources]
 TensorKit = {path = ".."}
-MatrixAlgebraKit = {url = "https://github.com/QuantumKitHub/MatrixAlgebraKit.jl", rev = "ksh/cuda6"}
-Mooncake = {url = "https://github.com/chalk-lab/Mooncake.jl", rev = "ksh/cuda6"}
 
 [compat]
 Aqua = "0.6, 0.7, 0.8"

From b5142201eafd6077cd02b791bdb43cd3030324e4 Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <kshyatt@users.noreply.github.com>
Date: Fri, 24 Apr 2026 16:33:31 +0200
Subject: [PATCH 6/8] Bump MatrixAlgebraKit compat to 0.6.6

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 5555e7b96..07f19efca 100644
--- a/Project.toml
+++ b/Project.toml
@@ -47,7 +47,7 @@ Dictionaries = "0.4"
 FiniteDifferences = "0.12"
 LRUCache = "1.0.2"
 LinearAlgebra = "1"
-MatrixAlgebraKit = "0.6.5"
+MatrixAlgebraKit = "0.6.6"
 Mooncake = "0.5.27"
 OhMyThreads = "0.8.0"
 Printf = "1"

From 62632fb39cc8c789078e88ebe4e1a0fb3a5a2a7e Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <katharine.s.hyatt@gmail.com>
Date: Sat, 25 Apr 2026 08:17:41 +0200
Subject: [PATCH 7/8] Import KernelAbstractions via CUDA.CUDACore

---
 ext/TensorKitCUDAExt/TensorKitCUDAExt.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl b/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl
index 9303bb305..1a5c28f7c 100644
--- a/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl
+++ b/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl
@@ -5,7 +5,7 @@ using CUDA: @allowscalar
 using cuTENSOR: cuTENSOR
 import CUDA.cuRAND: rand as curand, rand! as curand!, randn as curandn, randn! as curandn!
 using Strided: StridedViews
-using CUDA.KernelAbstractions: @kernel, @index, get_backend
+using CUDA.CUDACore.KernelAbstractions: @kernel, @index, get_backend
 
 using TensorKit
 using TensorKit.Factorizations

From 566adbd69fd1cc9afcaceb464404e32724ec6fb8 Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <katharine.s.hyatt@gmail.com>
Date: Sat, 25 Apr 2026 08:42:07 +0200
Subject: [PATCH 8/8] Access Adapt via CUDA.CUDACore in cutensormap.jl

---
 ext/TensorKitCUDAExt/cutensormap.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ext/TensorKitCUDAExt/cutensormap.jl b/ext/TensorKitCUDAExt/cutensormap.jl
index 8894164a9..2fefb3a24 100644
--- a/ext/TensorKitCUDAExt/cutensormap.jl
+++ b/ext/TensorKitCUDAExt/cutensormap.jl
@@ -170,5 +170,5 @@ for f in (:sqrt, :log, :asin, :acos, :acosh, :atanh, :acoth)
 end
 
 function TensorKit._add_transform_multi!(tdst::CuTensorMap, tsrc, p, (U, structs_dst, structs_src)::Tuple{<:Array, TD, TS}, buffers, alpha, beta, backend...) where {TD, TS}
-    return TensorKit._add_transform_multi!(tdst, tsrc, p, (CUDA.Adapt.adapt(CuArray, U), structs_dst, structs_src), buffers, alpha, beta, backend...)
+    return TensorKit._add_transform_multi!(tdst, tsrc, p, (CUDA.CUDACore.Adapt.adapt(CuArray, U), structs_dst, structs_src), buffers, alpha, beta, backend...)
 end