diff --git a/Project.toml b/Project.toml
index 3a8ca2f4..187bcbaf 100644
--- a/Project.toml
+++ b/Project.toml
@@ -19,25 +19,32 @@ VectorInterface = "409d34a3-91d5-4945-b6ec-7529ddf182d8"
 [weakdeps]
 Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
-Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6"
+CUDACore = "bd0ed864-bdfe-4181-a5ed-ce625a5fdea2"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
+JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb"
+Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6"
 cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1"
 
 [extensions]
 TensorOperationsBumperExt = "Bumper"
 TensorOperationsChainRulesCoreExt = "ChainRulesCore"
 TensorOperationsMooncakeExt = "Mooncake"
+TensorOperationsCUDACoreExt = "CUDACore"
 TensorOperationsEnzymeExt = "Enzyme"
 TensorOperationscuTENSORExt = "cuTENSOR"
+TensorOperationsJLArraysExt = "JLArrays"
 
 [compat]
 Aqua = "0.6, 0.7, 0.8"
+Adapt = "4"
 Bumper = "0.6, 0.7"
+CUDACore = "6"
 ChainRulesCore = "1"
 ChainRulesTestUtils = "1"
 DynamicPolynomials = "0.5, 0.6"
 Enzyme = "0.13.115"
 EnzymeTestUtils = "0.2"
+JLArrays = "0.3"
 LRUCache = "1"
 LinearAlgebra = "1.6"
 Logging = "1.6"
@@ -47,7 +54,7 @@ PrecompileTools = "1.1"
 Preferences = "1.4"
 PtrArrays = "1.2"
 Random = "1"
-Strided = "2.4"
+Strided = "2.5"
 StridedViews = "0.5"
 Test = "1"
 TupleTools = "1.6"
@@ -57,8 +64,11 @@ cuTENSOR = "6"
 julia = "1.10"
 
 [extras]
+Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e"
+CUDACore = "bd0ed864-bdfe-4181-a5ed-ce625a5fdea2"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a"
 cuRAND = "20fd9a0b-12d5-4c2f-a8af-7c34e9e60431"
+JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb"
@@ -72,4 +82,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1"
 
 [targets]
-test = ["Test", "Random", "DynamicPolynomials", "ChainRulesTestUtils", "ChainRulesCore", "cuRAND", "cuTENSOR", "Aqua", "Logging", "Bumper", "Mooncake", "Enzyme", "EnzymeTestUtils"]
+test = ["Test", "Random", "DynamicPolynomials", "ChainRulesTestUtils", "ChainRulesCore", "cuRAND", "CUDACore", "cuTENSOR", "Aqua", "Logging", "Bumper", "Mooncake", "Enzyme", "EnzymeTestUtils", "Adapt", "JLArrays"]
diff --git a/ext/TensorOperationsCUDACoreExt.jl b/ext/TensorOperationsCUDACoreExt.jl
new file mode 100644
index 00000000..6584a04a
--- /dev/null
+++ b/ext/TensorOperationsCUDACoreExt.jl
@@ -0,0 +1,57 @@
+module TensorOperationsCUDACoreExt
+
+using CUDACore
+using TensorOperations
+using TensorOperations: TensorOperations as TO
+
+#-------------------------------------------------------------------------------------------
+# Allocator
+#-------------------------------------------------------------------------------------------
+
+TO.tensoradd_type(TC, A::CuArray, pA::Index2Tuple, conjA::Bool) =
+    CuArray{TC, TO.numind(pA)}
+
+function TO.CUDAAllocator()
+    Mout = CUDACore.UnifiedMemory
+    Min = CUDACore.default_memory
+    Mtemp = CUDACore.default_memory
+    return TO.CUDAAllocator{Mout, Min, Mtemp}()
+end
+
+function TO.tensoralloc_add(
+        TC, A::AbstractArray, pA::Index2Tuple, conjA::Bool,
+        istemp::Val, allocator::TO.CUDAAllocator
+    )
+    ttype = CuArray{TC, TO.numind(pA)}
+    structure = TO.tensoradd_structure(A, pA, conjA)
+    return TO.tensoralloc(ttype, structure, istemp, allocator)::ttype
+end
+
+function TO.tensoralloc_contract(
+        TC,
+        A::AbstractArray, pA::Index2Tuple, conjA::Bool,
+        B::AbstractArray, pB::Index2Tuple, conjB::Bool,
+        pAB::Index2Tuple,
+        istemp::Val, allocator::TO.CUDAAllocator
+    )
+    ttype = CuArray{TC, TO.numind(pAB)}
+    structure = TO.tensorcontract_structure(A, pA, conjA, B, pB, conjB, pAB)
+    return TO.tensoralloc(ttype, structure, istemp, allocator)::ttype
+end
+
+# NOTE: the general implementation in the `DefaultAllocator` case works just fine, without
+# selecting an explicit memory model
+function TO.tensoralloc(
+        ::Type{CuArray{T, N}}, structure,
+        ::Val{istemp}, allocator::TO.CUDAAllocator{Mout, Min, Mtemp}
+    ) where {T, N, istemp, Mout, Min, Mtemp}
+    M = istemp ? Mtemp : Mout
+    return CuArray{T, N, M}(undef, structure)
+end
+
+function TO.tensorfree!(C::CuArray, ::TO.CUDAAllocator)
+    CUDACore.unsafe_free!(C)
+    return nothing
+end
+
+end
diff --git a/ext/TensorOperationsJLArraysExt.jl b/ext/TensorOperationsJLArraysExt.jl
new file mode 100644
index 00000000..1e8fee10
--- /dev/null
+++ b/ext/TensorOperationsJLArraysExt.jl
@@ -0,0 +1,9 @@
+module TensorOperationsJLArraysExt
+
+using JLArrays
+using TensorOperations
+
+TensorOperations.tensoradd_type(TC, A::JLArray, pA::Index2Tuple, conjA::Bool) =
+    JLArray{TC, sum(length.(pA))}
+
+end
diff --git a/ext/TensorOperationscuTENSORExt.jl b/ext/TensorOperationscuTENSORExt.jl
index 8653715d..7e88ba35 100644
--- a/ext/TensorOperationscuTENSORExt.jl
+++ b/ext/TensorOperationscuTENSORExt.jl
@@ -144,57 +144,6 @@ function _custrided(
     end
 end
 
-#-------------------------------------------------------------------------------------------
-# Allocator
-#-------------------------------------------------------------------------------------------
-function TO.CUDAAllocator()
-    Mout = CUDACore.UnifiedMemory
-    Min = CUDACore.default_memory
-    Mtemp = CUDACore.default_memory
-    return CUDAAllocator{Mout, Min, Mtemp}()
-end
-
-function TO.tensoralloc_add(
-        TC, A::AbstractArray, pA::Index2Tuple, conjA::Bool,
-        istemp::Val, allocator::CUDAAllocator
-    )
-    ttype = CuArray{TC, TO.numind(pA)}
-    structure = TO.tensoradd_structure(A, pA, conjA)
-    return TO.tensoralloc(ttype, structure, istemp, allocator)::ttype
-end
-
-function TO.tensoralloc_contract(
-        TC,
-        A::AbstractArray, pA::Index2Tuple, conjA::Bool,
-        B::AbstractArray, pB::Index2Tuple, conjB::Bool,
-        pAB::Index2Tuple,
-        istemp::Val, allocator::CUDAAllocator
-    )
-    ttype = CuArray{TC, TO.numind(pAB)}
-    structure = TO.tensorcontract_structure(A, pA, conjA, B, pB, conjB, pAB)
-    return tensoralloc(ttype, structure, istemp, allocator)::ttype
-end
-
-# Overwrite tensoradd_type
-function TO.tensoradd_type(TC, A::CuArray, pA::Index2Tuple, conjA::Bool)
-    return CuArray{TC, sum(length.(pA))}
-end
-
-# NOTE: the general implementation in the `DefaultAllocator` case works just fine, without
-# selecting an explicit memory model
-function TO.tensoralloc(
-        ::Type{CuArray{T, N}}, structure,
-        ::Val{istemp}, allocator::CUDAAllocator{Mout, Min, Mtemp}
-    ) where {T, N, istemp, Mout, Min, Mtemp}
-    M = istemp ? Mtemp : Mout
-    return CuArray{T, N, M}(undef, structure)
-end
-
-function TO.tensorfree!(C::CuArray, ::CUDAAllocator)
-    CUDACore.unsafe_free!(C)
-    return nothing
-end
-
 #-------------------------------------------------------------------------------------------
 # Implementation
 #-------------------------------------------------------------------------------------------
diff --git a/src/implementation/allocator.jl b/src/implementation/allocator.jl
index cacd7b35..7ac5ea8b 100644
--- a/src/implementation/allocator.jl
+++ b/src/implementation/allocator.jl
@@ -166,6 +166,9 @@ end
 function tensoradd_type(TC, A::Base.PermutedDimsArray, pA::Index2Tuple, conjA::Bool)
     return tensoradd_type(TC, A.parent, pA, conjA)
 end
+function tensoradd_type(TC, A::StridedView, pA::Index2Tuple, conjA::Bool)
+    return tensoradd_type(TC, parent(A), pA, conjA)
+end
 
 function tensoradd_structure(A::AbstractArray, pA::Index2Tuple, conjA::Bool)
     return size.(Ref(A), linearize(pA))
diff --git a/src/implementation/strided.jl b/src/implementation/strided.jl
index a6c8e9b8..c9669fd1 100644
--- a/src/implementation/strided.jl
+++ b/src/implementation/strided.jl
@@ -1,42 +1,12 @@
 const StridedViewOrDiagonal = Union{StridedView, Diagonal}
 
-_ishostarray(x::StridedView) = (pointer(x) isa Ptr)
-_ishostarray(x::Diagonal) = (pointer(x.diag) isa Ptr)
+select_backend(::typeof(tensoradd!), C::StridedView, A::StridedView) = StridedNative()
+select_backend(::typeof(tensortrace!), C::StridedView, A::StridedView) = StridedNative()
 
-function select_backend(::typeof(tensoradd!), C::StridedView, A::StridedView)
-    if _ishostarray(C) && _ishostarray(A)
-        return StridedNative()
-    else
-        return NoBackend()
-    end
-end
-function select_backend(::typeof(tensortrace!), C::StridedView, A::StridedView)
-    if _ishostarray(C) && _ishostarray(A)
-        return StridedNative()
-    else
-        return NoBackend()
-    end
-end
-
-function select_backend(
-        ::typeof(tensorcontract!), C::StridedView, A::StridedView, B::StridedView
-    )
-    if _ishostarray(C) && _ishostarray(A) && _ishostarray(B)
-        return eltype(C) <: LinearAlgebra.BlasFloat ? StridedBLAS() : StridedNative()
-    else
-        return NoBackend()
-    end
-end
-function select_backend(
-        ::typeof(tensorcontract!), C::StridedViewOrDiagonal,
-        A::StridedViewOrDiagonal, B::StridedViewOrDiagonal
-    )
-    if _ishostarray(C) && _ishostarray(A) && _ishostarray(B)
-        return StridedNative()
-    else
-        return NoBackend()
-    end
-end
+select_backend(::typeof(tensorcontract!), C::StridedView, A::StridedView, B::StridedView) =
+    eltype(C) <: LinearAlgebra.BlasFloat ? StridedBLAS() : StridedNative()
+select_backend(::typeof(tensorcontract!), C::StridedViewOrDiagonal, A::StridedViewOrDiagonal, B::StridedViewOrDiagonal) =
+    StridedNative()
 
 #-------------------------------------------------------------------------------------------
 # Force strided implementation on AbstractArray instances with Strided backend
diff --git a/test/gpu.jl b/test/gpu.jl
new file mode 100644
index 00000000..65fb3839
--- /dev/null
+++ b/test/gpu.jl
@@ -0,0 +1,124 @@
+using TensorOperations
+using TensorOperations: StridedBLAS, StridedNative, linearize, numout
+using Test
+using Adapt
+using TupleTools
+using JLArrays
+using VectorInterface
+using CUDACore
+
+test_result(a::AbstractArray, b::AbstractArray; kwargs...) =
+    isapprox(collect(a), collect(b); kwargs...)
+
+function compare(f, AT::Type, xs...; kwargs...)
+    cpu_in = map(deepcopy, xs)  # copy on CPU
+    gpu_in = map(adapt(AT), xs) # adapt on GPU
+
+    cpu_out = f(cpu_in...)
+    gpu_out = f(gpu_in...)
+
+    return test_result(cpu_out, gpu_out; kwargs...)
+end
+
+# types to test for
+ATs = []
+!is_buildkite && push!(ATs, JLArray)
+CUDACore.functional() && push!(ATs, CuArray)
+
+backends = [StridedBLAS(), StridedNative()]
+
+@testset "tensoradd! ($AT)" for AT in ATs
+    sz = (3, 5, 4, 6)
+    p = (3, 1, 4, 2)
+    for backend in backends, T in (Float32, ComplexF32)
+        A = randn(T, sz)
+        C = randn(T, TupleTools.getindices(sz, p))
+
+        @test compare(AT, C, A) do c, a
+            tensoradd!(c, a, (p, ()), false, One(), Zero(), backend)
+        end
+
+        α = rand(T)
+        @test compare(AT, C, A) do c, a
+            tensoradd!(c, a, (p, ()), false, α, Zero(), backend)
+        end
+
+        β = rand(T)
+        @test compare(AT, C, A) do c, a
+            tensoradd!(c, a, (p, ()), false, α, β, backend)
+        end
+
+        T <: Real || @test compare(AT, C, A) do c, a
+            tensoradd!(c, a, (p, ()), true, α, β, backend)
+        end
+    end
+end
+
+@testset "tensortrace! ($AT)" for AT in ATs
+    sz = (2, 4, 3, 2)
+    p = (2, 3)
+    q = ((1,), (4,))
+
+    for backend in backends, T in (Float32, ComplexF32)
+        A = randn(T, sz)
+        C = randn(T, TupleTools.getindices(sz, p))
+
+        @test compare(AT, C, A) do c, a
+            tensortrace!(c, a, (p, ()), q, false, One(), Zero(), backend)
+        end
+
+        α = rand(T)
+        @test compare(AT, C, A) do c, a
+            tensortrace!(c, a, (p, ()), q, false, α, Zero(), backend)
+        end
+
+        β = rand(T)
+        @test compare(AT, C, A) do c, a
+            tensortrace!(c, a, (p, ()), q, false, α, β, backend)
+        end
+
+        T <: Real || @test compare(AT, C, A) do c, a
+            tensortrace!(c, a, (p, ()), q, true, α, β, backend)
+        end
+    end
+end
+
+@testset "tensorcontract! ($AT)" for AT in ATs
+    sz = (2, 4, 3, 4, 2, 5)
+    pA = ((4, 1), (2, 3))
+    pB = ((3, 1), (2,))
+    pAB = ((1, 2, 3), ())
+
+    for backend in backends, T in (Float32, ComplexF32)
+        A = randn(T, (2, 4, 3, 2))
+        B = randn(T, (3, 3, 4))
+        C = randn(T, (2, 2, 3))
+
+        @test compare(AT, C, A, B) do c, a, b
+            tensorcontract!(c, a, pA, false, b, pB, false, pAB, One(), Zero(), backend)
+        end
+
+        α = rand(T)
+        @test compare(AT, C, A, B) do c, a, b
+            tensorcontract!(c, a, pA, false, b, pB, false, pAB, α, Zero(), backend)
+        end
+
+        β = rand(T)
+        @test compare(AT, C, A, B) do c, a, b
+            tensorcontract!(c, a, pA, false, b, pB, false, pAB, α, β, backend)
+        end
+
+        if !(T <: Real)
+            @test compare(AT, C, A, B) do c, a, b
+                tensorcontract!(c, a, pA, true, b, pB, false, pAB, α, β, backend)
+            end
+            @test compare(AT, C, A, B) do c, a, b
+                tensorcontract!(c, a, pA, false, b, pB, true, pAB, α, β, backend)
+            end
+            @test compare(AT, C, A, B) do c, a, b
+                tensorcontract!(c, a, pA, true, b, pB, true, pAB, α, β, backend)
+            end
+        end
+    end
+
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index ba452226..f1328b91 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -50,12 +50,13 @@ if !is_buildkite
     end
 end
 
-if is_buildkite
-    # note: cuTENSOR should not be loaded before this point
-    # as there is a test which requires it to be loaded after
-    @testset "cuTENSOR extension" verbose = true begin
-        include("cutensor.jl")
-    end
+# note: cuTENSOR should not be loaded before this point
+# as there is a test which requires it to be loaded after
+@testset "cuTENSOR extension" verbose = true begin
+    include("cutensor.jl")
+end
+@testset "GPUArrays" verbose = true begin
+    include("gpu.jl")
 end
 
 if !is_buildkite
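Usage sketch (not part of the diff): a minimal end-to-end exercise of the new CUDACore extension, assuming a functional CUDA device. The array size, the permutation `p`, and the variable names are illustrative; the calls themselves (`CUDAAllocator`, `tensoralloc_add`, `tensoradd!`) use exactly the signatures touched above.

    using TensorOperations
    using TensorOperations: CUDAAllocator, tensoralloc_add
    using VectorInterface: One, Zero
    using CUDACore

    A = CuArray(randn(Float32, 3, 5, 4, 6))
    p = ((3, 1, 4, 2), ())  # permutation as an Index2Tuple

    # The zero-argument constructor added in TensorOperationsCUDACoreExt places
    # outputs in unified memory and temporaries in default device memory.
    alloc = CUDAAllocator()

    # Allocate the output through the allocator (Val(false) marks it as
    # non-temporary, hence unified memory), then permute-add into it.
    C = tensoralloc_add(Float32, A, p, false, Val(false), alloc)
    tensoradd!(C, A, p, false, One(), Zero())

With the host-array checks removed from `select_backend`, the same call path should now also dispatch to the Strided-based kernels for GPU-backed arrays rather than falling back to `NoBackend()`.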