Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 36 additions & 19 deletions create_lmodsitepackage.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,6 @@
else
cudaDriverDir = eessi_eprefix .. "/lib/nvidia"
end
local cudaVersionFile = cudaDriverDir .. "/cuda_version.txt"
local cudaDriverFile = cudaDriverDir .. "/libcuda.so"
local cudaDriverExists = isFile(cudaDriverFile)
local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so")
Expand All @@ -189,28 +188,46 @@
else
-- CUDA driver exists, now we check its version to see if an update is needed
if cudaDriverExists then
local cudaVersion = read_file(cudaVersionFile)
if not cudaVersion then
LmodError("No CUDA version file\\n" .. cudaVersionFile .. "\\nfound. " .. refer_to_docs)
local cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION")
if not cudaVersion or cudaVersion == "" then
local eessi_prefix = os.getenv("EESSI_PREFIX")
local script = pathJoin(eessi_prefix, 'scripts', 'gpu_support', 'nvidia', 'get_cuda_driver_version.sh')
source_sh("bash", script)
end
cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION")
local cudaVersion_req = os.getenv("EESSICUDAVERSION")
-- driver CUDA versions don't give a patch version for CUDA
local major, minor = string.match(cudaVersion, "(%d+)%.(%d+)")
local major_req, minor_req, patch_req = string.match(cudaVersion_req, "(%d+)%.(%d+)%.(%d+)")
local driver_libs_need_update = false
if major < major_req then
driver_libs_need_update = true
elseif major == major_req then
if minor < minor_req then
-- Account for the fact that the script sourced above was designed to never return a non-zero exit
-- code, even if it fails to set EESSI_CUDA_DRIVER_VERSION.
-- We handle that case here by emitting a warning (not an error), which the user can suppress.
if not cudaVersion or cudaVersion == "" then
-- The driver version could not be determined: build a warning telling the user that the
-- driver/toolkit compatibility check is skipped, and how to silence this warning.
local suppress_var = "EESSI_CUDA_DRIVER_VERSION_SUPPRESS_WARNING"
local warn = "Environment variable EESSI_CUDA_DRIVER_VERSION not found. "
warn = warn .. "Cannot ensure that driver version is new enough for CUDA toolkit version: '"
warn = warn .. cudaVersion_req .. "'. This module will still be loaded, but may not function "
warn = warn .. "as expected. Export " .. suppress_var .. "=1 to suppress this warning."
-- os.getenv returns a string (or nil when unset), so compare against the string "1";
-- comparing against the number 1 can never be true in Lua.
-- Warn unless the user explicitly opted out by exporting the variable with value 1.
if os.getenv(suppress_var) ~= "1" then
    LmodWarning(warn)
end
else
-- driver CUDA versions don't give a patch version for CUDA
local major, minor = string.match(cudaVersion, "(%d+)%.(%d+)")
local major_req, minor_req, patch_req = string.match(cudaVersion_req, "(%d+)%.(%d+)%.(%d+)")
local driver_libs_need_update = false
if tonumber(major) < tonumber(major_req) then
driver_libs_need_update = true
elseif tonumber(major) == tonumber(major_req) then
if tonumber(minor) < tonumber(minor_req) then
driver_libs_need_update = true
end
end
if driver_libs_need_update == true then
local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". "
advice = advice .. "Please update your CUDA driver libraries and then "
advice = advice .. "let EESSI know about the update.\\n"
advice = advice .. refer_to_docs
LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice)
end
end
if driver_libs_need_update == true then
local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". "
advice = advice .. "Please update your CUDA driver libraries and then "
advice = advice .. "let EESSI know about the update.\\n"
advice = advice .. refer_to_docs
LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice)
end
end
end
Expand Down
1 change: 1 addition & 0 deletions install_scripts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ nvidia_files=(
install_cuda_and_libraries.sh
install_cuda_host_injections.sh
link_nvidia_host_libraries.sh
get_cuda_driver_version.sh
)
copy_files_by_list ${TOPDIR}/scripts/gpu_support/nvidia ${INSTALL_PREFIX}/scripts/gpu_support/nvidia "${nvidia_files[@]}"

Expand Down
6 changes: 6 additions & 0 deletions scripts/gpu_support/nvidia/get_cuda_driver_version.sh
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This file will have to be added to the following list to ensure that it gets deployed to cvmfs:
https://github.com/EESSI/software-layer-scripts/blob/main/install_scripts.sh#L210

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Query the CUDA version reported by nvidia-smi and export it as EESSI_CUDA_DRIVER_VERSION.
# This can be leveraged by the source_sh() feature of Lmod; the file is meant to be sourced
# (it uses `return`), not executed.
# Because we want to source this without immediately raising an LmodError upon failure, this script
# is designed to ALWAYS return a 0 exit code
# Extract the value that follows "CUDA Version :" in the `nvidia-smi --query` output; \K drops the
# label from the match so only the digits-and-dots version string is captured. If the pipeline
# reports failure (e.g. grep finds no match), stop here with status 0.
EESSI_CUDA_DRIVER_VERSION=$(nvidia-smi --query | grep -oP 'CUDA Version\s*:\s*\K[0-9.]+') || return 0
# The || return 0 shouldn't be needed, but just to be overly sure that this script always returns 0
export EESSI_CUDA_DRIVER_VERSION || return 0
Loading