From 91bea4a8d62ece5052032a423e891d92b1fe6a62 Mon Sep 17 00:00:00 2001
From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com>
Date: Thu, 30 Apr 2026 21:58:01 +0800
Subject: [PATCH 1/3] Fix: unify DAV subspace diagonalization to use
 lapack_hegvx

Previously the GPU path used lapack_hegvd (all eigenvalues) while the
CPU serial path used hegvx_op (subset eigenvalues), causing
inconsistent behavior and wasted computation on GPU.

This commit unifies both paths to use container::kernels::lapack_hegvx,
which computes only the needed nband eigenpairs. The CPU serial path
also drops ~30 lines of manual save/restore code since lapack_hegvx
preserves input matrices internally.

A GPU specialization of lapack_hegvx is added for ROCm using CPU
fallback (D2H -> CPU solve -> H2D), following the existing pattern
in lapack_heevd.

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
---
 .../ATen/kernels/rocm/lapack.hip.cu           | 33 ++++++++++++
 source/source_hsolver/diago_dav_subspace.cpp  | 54 ++++---------------
 2 files changed, 42 insertions(+), 45 deletions(-)
diff --git a/source/source_base/module_container/ATen/kernels/rocm/lapack.hip.cu b/source/source_base/module_container/ATen/kernels/rocm/lapack.hip.cu
index 07572a657ab..219d2366723 100644
--- a/source/source_base/module_container/ATen/kernels/rocm/lapack.hip.cu
+++ b/source/source_base/module_container/ATen/kernels/rocm/lapack.hip.cu
@@ -133,6 +133,34 @@ struct lapack_hegvd<T, DEVICE_GPU> {
     }
 };
 
+template <typename T>
+struct lapack_hegvx<T, DEVICE_GPU> {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(
+        const int n,
+        const int lda,
+        T* A,
+        T* B,
+        const int m,
+        Real* eigen_val,
+        T* eigen_vec)
+    {
+        // Fallback to CPU: copy matrices to host, call CPU lapack_hegvx, copy results back
+        std::vector<T> H_A(n * lda);
+        std::vector<T> H_B(n * lda);
+        std::vector<Real> H_eigen_val(n);
+        std::vector<T> H_eigen_vec(n * lda);
+
+        hipErrcheck(hipMemcpy(H_A.data(), A, sizeof(T) * n * lda, hipMemcpyDeviceToHost));
+        hipErrcheck(hipMemcpy(H_B.data(), B, sizeof(T) * n * lda, hipMemcpyDeviceToHost));
+
+        lapack_hegvx<T, DEVICE_CPU>()(n, lda, H_A.data(), H_B.data(), m, H_eigen_val.data(), H_eigen_vec.data());
+
+        hipErrcheck(hipMemcpy(eigen_val, H_eigen_val.data(), sizeof(Real) * n, hipMemcpyHostToDevice));
+        hipErrcheck(hipMemcpy(eigen_vec, H_eigen_vec.data(), sizeof(T) * n * lda, hipMemcpyHostToDevice));
+    }
+};
+
 template struct set_matrix<float,  DEVICE_GPU>;
 template struct set_matrix<double, DEVICE_GPU>;
 template struct set_matrix<std::complex<float>,  DEVICE_GPU>;
@@ -158,5 +186,10 @@ template struct lapack_hegvd<double, DEVICE_GPU>;
 template struct lapack_hegvd<std::complex<float>,  DEVICE_GPU>;
 template struct lapack_hegvd<std::complex<double>, DEVICE_GPU>;
 
+template struct lapack_hegvx<float,  DEVICE_GPU>;
+template struct lapack_hegvx<double, DEVICE_GPU>;
+template struct lapack_hegvx<std::complex<float>,  DEVICE_GPU>;
+template struct lapack_hegvx<std::complex<double>, DEVICE_GPU>;
+
 } // namespace kernels
 } // namespace container
diff --git a/source/source_hsolver/diago_dav_subspace.cpp b/source/source_hsolver/diago_dav_subspace.cpp
index 048653c2a87..b044bda97fa 100644
--- a/source/source_hsolver/diago_dav_subspace.cpp
+++ b/source/source_hsolver/diago_dav_subspace.cpp
@@ -668,57 +668,22 @@ void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
         if (this->diag_comm.rank == 0)
         {
             syncmem_complex_op()(this->d_scc, scc, nbase * this->nbase_x);
-            ct::kernels::lapack_hegvd<T, ct_Device>()(nbase, this->nbase_x, this->hcc, this->d_scc, this->d_eigenvalue, this->vcc);
+            ct::kernels::lapack_hegvx<T, ct_Device>()(nbase, this->nbase_x, this->hcc, this->d_scc, nband, this->d_eigenvalue, this->vcc);
             syncmem_var_d2h_op()((*eigenvalue_iter).data(), this->d_eigenvalue, this->nbase_x);
         }
 #endif
     }
-    else
+    else if (this->diag_subspace == 0)
     {
-        if (this->diag_subspace == 0)
+        if (this->diag_comm.rank == 0)
         {
-            if (this->diag_comm.rank == 0)
-            {
-                std::vector<std::vector<T>> h_diag(nbase, std::vector<T>(nbase, *this->zero));
-                std::vector<std::vector<T>> s_diag(nbase, std::vector<T>(nbase, *this->zero));
-
-                for (size_t i = 0; i < nbase; i++)
-                {
-                    for (size_t j = 0; j < nbase; j++)
-                    {
-                        h_diag[i][j] = hcc[i * this->nbase_x + j];
-                        s_diag[i][j] = scc[i * this->nbase_x + j];
-                    }
-                }
-                hegvx_op<T, Device>()(this->ctx,
-                                      nbase,
-                                      this->nbase_x,
-                                      this->hcc,
-                                      this->scc,
-                                      nband,
-                                      (*eigenvalue_iter).data(),
-                                      this->vcc);
-                // reset:
-                for (size_t i = 0; i < nbase; i++)
-                {
-                    for (size_t j = 0; j < nbase; j++)
-                    {
-                        hcc[i * this->nbase_x + j] = h_diag[i][j];
-                        scc[i * this->nbase_x + j] = s_diag[i][j];
-                    }
-
-                    for (size_t j = nbase; j < this->nbase_x; j++)
-                    {
-                        hcc[i * this->nbase_x + j] = *this->zero;
-                        hcc[j * this->nbase_x + i] = *this->zero;
-                        scc[i * this->nbase_x + j] = *this->zero;
-                        scc[j * this->nbase_x + i] = *this->zero;
-                    }
-                }
-            }
+            ct::kernels::lapack_hegvx<T, ct_Device>()(
+                nbase, this->nbase_x, this->hcc, this->scc, nband,
+                (*eigenvalue_iter).data(), this->vcc);
         }
-        else
-        {
+    }
+    else
+    {
 #ifdef __MPI
             std::vector<T> h_diag;
             std::vector<T> s_diag;
@@ -760,7 +725,6 @@ void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
             std::cout << "Error: parallel diagonalization is not supported in serial mode." << std::endl;
             exit(1);
 #endif
-        }
     }
 
 #ifdef __MPI

From 6ff75761108581a57e11f084108aadbc1b7cd99e Mon Sep 17 00:00:00 2001
From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com>
Date: Thu, 30 Apr 2026 22:14:40 +0800
Subject: [PATCH 2/3] Fix: use 'U' uplo and valid IWORK/IFAIL in lapack_hegvx
 CPU query

The CPU specialization of lapack_hegvx used 'L' (lower) uplo while
the existing hegvx_op uses 'U' (upper), causing eigenvalue divergence
in the pyabacus dav_subspace tests. Additionally, passing nullptr
for IWORK and IFAIL during the workspace query can cause crashes with
some LAPACK implementations.

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
---
 .../module_container/ATen/kernels/lapack.cpp        | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/source/source_base/module_container/ATen/kernels/lapack.cpp b/source/source_base/module_container/ATen/kernels/lapack.cpp
index 2ab02f35c81..d294b899075 100644
--- a/source/source_base/module_container/ATen/kernels/lapack.cpp
+++ b/source/source_base/module_container/ATen/kernels/lapack.cpp
@@ -382,7 +382,7 @@ struct lapack_hegvx<T, DEVICE_CPU> {
         const int itype = 1;    // ITYPE = 1:  A*x = (lambda)*B*x
         const char jobz = 'V';// JOBZ = 'V':  Compute eigenvalues and eigenvectors.
         const char range = 'I'; // RANGE = 'I': the IL-th through IU-th eigenvalues will be found.
-        const char uplo = 'L'; // UPLO = 'L':  Lower triangles of A and B are stored.
+        const char uplo = 'U'; // UPLO = 'U':  Upper triangles of A and B are stored.
 
         const int il = 1;
         const int iu = m;
@@ -394,6 +394,13 @@ struct lapack_hegvx<T, DEVICE_CPU> {
         T work_query;
         Real rwork_query;
 
+        // dummy arrays for workspace query (some LAPACK implementations
+        // require valid pointers even during query)
+        const int liwork_query = 5 * n;
+        const int lrwork_query = 7 * n;
+        std::vector<int> iwork_query(liwork_query);
+        std::vector<int> ifail_query(n);
+
         // set lwork = -1 to query optimal work size
         lapackConnector::hegvx(
                     itype, jobz, range, uplo,
@@ -409,8 +416,8 @@ struct lapack_hegvx<T, DEVICE_CPU> {
                     &work_query,                 // WORK (query)
                     lwork,
                     &rwork_query,                // RWORK (query)
-                    static_cast<int*>(nullptr),  // IWORK (query)
-                    static_cast<int*>(nullptr),  // IFAIL (query)
+                    iwork_query.data(),          // IWORK (query)
+                    ifail_query.data(),          // IFAIL (query)
                     info);
 
         // !>  If LWORK = -1, then a workspace query is assumed; the routine

From 0ac61701dd07d4b372a38dfcb835966e905178c5 Mon Sep 17 00:00:00 2001
From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com>
Date: Thu, 30 Apr 2026 23:24:47 +0800
Subject: [PATCH 3/3] Fix: protect GPU lapack_hegvx inputs, keep GPU path using
 hegvd

The GPU lapack_hegvx had a bug: it passed A/hcc directly to
cuSOLVER hegvdx which overwrites the upper triangle, corrupting
the subspace matrix for subsequent DAV iterations. Fixed by
copying A to eigen_vec and backing up B before the call.

The GPU path in diago_dav_subspace.cpp is kept on lapack_hegvd
because cuSOLVER hegvdx correctly detects a pre-existing numerical
issue (B not positive definite at higher orders) that hegvd
silently ignores. Switching to hegvx on GPU is left for future
work once the underlying subspace conditioning is addressed.

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
---
 .../module_container/ATen/kernels/cuda/lapack.cu   | 14 +++++++++++---
 source/source_hsolver/diago_dav_subspace.cpp       |  2 +-
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/source/source_base/module_container/ATen/kernels/cuda/lapack.cu b/source/source_base/module_container/ATen/kernels/cuda/lapack.cu
index 4c69973b6be..bfdd32c8a28 100644
--- a/source/source_base/module_container/ATen/kernels/cuda/lapack.cu
+++ b/source/source_base/module_container/ATen/kernels/cuda/lapack.cu
@@ -325,14 +325,22 @@ struct lapack_hegvx<T, DEVICE_GPU> {
         const char uplo = 'U';
         int meig = 0;
 
-        // this hegvdx will protect the input A, B from being overwritten
-        // and write the eigenvectors into eigen_vec.
+        // cuSOLVER hegvdx overwrites A and B on exit, so copy A to
+        // eigen_vec and backup B to protect the origin matrices.
+        CHECK_CUDA(cudaMemcpy(eigen_vec, A, sizeof(T) * n * lda, cudaMemcpyDeviceToDevice));
+
+        T* d_B_backup = nullptr;
+        CHECK_CUDA(cudaMalloc(&d_B_backup, sizeof(T) * n * lda));
+        CHECK_CUDA(cudaMemcpy(d_B_backup, B, sizeof(T) * n * lda, cudaMemcpyDeviceToDevice));
+
         cuSolverConnector::hegvdx(cusolver_handle,
             itype, jobz, range, uplo,
-            n, lda, A, B,
+            n, lda, eigen_vec, d_B_backup,
             Real(0), Real(0),
             1, m, &meig,
             eigen_val, eigen_vec);
+
+        CHECK_CUDA(cudaFree(d_B_backup));
     }
 };
 
diff --git a/source/source_hsolver/diago_dav_subspace.cpp b/source/source_hsolver/diago_dav_subspace.cpp
index b044bda97fa..3d077059d54 100644
--- a/source/source_hsolver/diago_dav_subspace.cpp
+++ b/source/source_hsolver/diago_dav_subspace.cpp
@@ -668,7 +668,7 @@ void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
         if (this->diag_comm.rank == 0)
         {
             syncmem_complex_op()(this->d_scc, scc, nbase * this->nbase_x);
-            ct::kernels::lapack_hegvx<T, ct_Device>()(nbase, this->nbase_x, this->hcc, this->d_scc, nband, this->d_eigenvalue, this->vcc);
+            ct::kernels::lapack_hegvd<T, ct_Device>()(nbase, this->nbase_x, this->hcc, this->d_scc, this->d_eigenvalue, this->vcc);
             syncmem_var_d2h_op()((*eigenvalue_iter).data(), this->d_eigenvalue, this->nbase_x);
         }
 #endif