From e9511b18d464ce99a74b492d47a80f5b33e6580b Mon Sep 17 00:00:00 2001
From: zaotian <zaotian@mail.ustc.edu.cn>
Date: Tue, 16 Jun 2026 17:20:47 +0800
Subject: [PATCH 1/2] hybrid gauge tddft force

---
 source/source_esolver/esolver_ks_lcao.cpp     |   2 +-
 .../source_esolver/esolver_ks_lcao_tddft.cpp  |   3 +-
 .../source_estate/module_pot/H_TDDFT_pw.cpp   |   5 +-
 .../source_io/module_hs/cal_r_overlap_R.cpp   |  40 +++
 source/source_io/module_hs/cal_r_overlap_R.h  |  14 +
 source/source_lcao/FORCE_STRESS.cpp           |  30 +-
 source/source_lcao/FORCE_STRESS.h             |   3 +-
 .../module_operator_lcao/td_pot_hybrid.cpp    |   9 +-
 .../module_operator_lcao/td_pot_hybrid.h      |  14 +
 .../td_pot_hybrid_force.hpp                   | 175 +++++++++++
 source/source_lcao/module_rt/CMakeLists.txt   |   1 +
 .../module_rt/force_rt_overlap.cpp            | 283 ++++++++++++++++++
 .../source_lcao/module_rt/force_rt_overlap.h  |  17 ++
 source/source_lcao/module_rt/td_folding.cpp   |  36 +++
 source/source_lcao/module_rt/td_folding.h     |   9 +
 source/source_lcao/module_rt/td_info.cpp      | 151 ++++++++++
 source/source_lcao/module_rt/td_info.h        |  13 +
 17 files changed, 795 insertions(+), 10 deletions(-)
 create mode 100644 source/source_lcao/module_operator_lcao/td_pot_hybrid_force.hpp
 create mode 100644 source/source_lcao/module_rt/force_rt_overlap.cpp
 create mode 100644 source/source_lcao/module_rt/force_rt_overlap.h
diff --git a/source/source_esolver/esolver_ks_lcao.cpp b/source/source_esolver/esolver_ks_lcao.cpp
index ee04d9f1e25..4f1a0b9fe05 100644
--- a/source/source_esolver/esolver_ks_lcao.cpp
+++ b/source/source_esolver/esolver_ks_lcao.cpp
@@ -245,7 +245,7 @@ void ESolver_KS_LCAO<TK, TR>::cal_force(UnitCell& ucell, ModuleBase::matrix& for
                        two_center_bundle_, orb_, force, this->scs,
                        this->locpp, this->sf, this->kv,
                        this->pw_rho, this->solvent, this->dftu, this->deepks,
-                       this->exx_nao, &ucell.symm);
+                       this->exx_nao, &ucell.symm, static_cast<hamilt::Hamilt<TK>*>(this->p_hamilt));
 
     // delete RA after cal_force
     this->RA.delete_grid();
diff --git a/source/source_esolver/esolver_ks_lcao_tddft.cpp b/source/source_esolver/esolver_ks_lcao_tddft.cpp
index fd47b15fc53..c17dc10e83b 100644
--- a/source/source_esolver/esolver_ks_lcao_tddft.cpp
+++ b/source/source_esolver/esolver_ks_lcao_tddft.cpp
@@ -102,6 +102,7 @@ void ESolver_KS_LCAO_TDDFT<TR, Device>::runner(UnitCell& ucell, const int istep)
     //----------------------------------------------------------------
     this->before_scf(ucell, istep); // From ESolver_KS_LCAO
     td_p->initialize_phase_hybrid(ucell, dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, TR>*>(this->p_hamilt)->getHR());
+    td_p->calculate_grad_overlap(this->pv, ucell, this->gd, this->orb_.cutoffs(), this->two_center_bundle_.overlap_orb.get());
     // Initialize the moving spatial gauge
     if (use_td_moving_gauge && this->td_mg_ == nullptr)
     {
@@ -388,7 +389,7 @@ void ESolver_KS_LCAO_TDDFT<TR, Device>::iter_finish(UnitCell& ucell,
 
     // Calculate energy-density matrix for RT-TDDFT
     if (conv_esolver && estep == estep_max - 1 && istep >= (PARAM.inp.init_wfc == "file" ? 0 : 1)
-        && PARAM.inp.td_edm == 0)
+        && PARAM.inp.td_edm == 0 && PARAM.inp.td_stype != 2)
     {
         if (use_tensor && use_lapack)
         {
diff --git a/source/source_estate/module_pot/H_TDDFT_pw.cpp b/source/source_estate/module_pot/H_TDDFT_pw.cpp
index babece4d784..69d8b7f9a31 100644
--- a/source/source_estate/module_pot/H_TDDFT_pw.cpp
+++ b/source/source_estate/module_pot/H_TDDFT_pw.cpp
@@ -109,6 +109,7 @@ void H_TDDFT_pw::cal_fixed_v(double* vl_pseudo)
     // time evolve
     H_TDDFT_pw::istep++;
     H_TDDFT_pw::istep_int = istep;
+    global_vext_time = {0.0, 0.0, 0.0};
 
     // judgement to skip vext
     if (!PARAM.inp.td_vext || istep > tend || istep < tstart)
@@ -124,7 +125,7 @@ void H_TDDFT_pw::cal_fixed_v(double* vl_pseudo)
     trigo_count = 0;
     heavi_count = 0;
 
-    global_vext_time = {0.0, 0.0, 0.0};
+    
 
     for (auto direc: PARAM.inp.td_vext_dire)
     {
@@ -272,6 +273,7 @@ void H_TDDFT_pw::update_At()
     At = At + At_laststep / 2.0;
     At_laststep.set(0.0, 0.0, 0.0);
     Et.set(0.0, 0.0, 0.0);
+    global_vext_time = { 0.0, 0.0, 0.0 };
 
     // judgement to skip vext
     if (!PARAM.inp.td_vext || istep > tend || istep < tstart)
@@ -343,6 +345,7 @@ void H_TDDFT_pw::update_At()
         count++;
     }
     At = At + At_laststep / 2.0;
+    if(stype==2)global_vext_time = { Et[0],Et[1],Et[2] };
 
     ModuleBase::timer::end("H_TDDFT_pw", "update_At");
     return;
diff --git a/source/source_io/module_hs/cal_r_overlap_R.cpp b/source/source_io/module_hs/cal_r_overlap_R.cpp
index f75570a4116..3ce8311e130 100644
--- a/source/source_io/module_hs/cal_r_overlap_R.cpp
+++ b/source/source_io/module_hs/cal_r_overlap_R.cpp
@@ -516,7 +516,47 @@ ModuleBase::Vector3<double> cal_r_overlap_R::get_psi_r_psi(const ModuleBase::Vec
 
     return temp_prp;
 }
+ModuleBase::Vector3<double> cal_r_overlap_R::get_psi_r_gradpsi(const ModuleBase::Vector3<double>& R1,
+                                                           const int& T1,
+                                                           const int& L1,
+                                                           const int& m1,
+                                                           const int& N1,
+                                                           const ModuleBase::Vector3<double>& R2,
+                                                           const int& T2,
+                                                           const int& L2,
+                                                           const int& m2,
+                                                           const int& N2,
+                                                           const ModuleBase::Vector3<double>& Efield,
+                                                           const ModuleBase::Vector3<double>& dR)
+{
+    ModuleBase::Vector3<double> origin_point(0.0, 0.0, 0.0);
+    double factor = sqrt(ModuleBase::FOUR_PI / 3.0);
+    const ModuleBase::Vector3<double>& distance = R2 - R1;
+
+    ModuleBase::Vector3<double> grad_o = center2_orb11[T1][T2][L1][N1][L2].at(N2).cal_grad_overlap(origin_point, distance, m1, m2);
+
+    ModuleBase::Vector3<double> grad_rx = -1 * factor * center2_orb21_r[T1][T2][L1][N1][L2].at(N2).cal_grad_overlap(origin_point,
+                                                                                                        distance,
+                                                                                                        m1,
+                                                                                                        1,
+                                                                                                        m2); // m =  1
+
+    ModuleBase::Vector3<double> grad_ry = -1 * factor * center2_orb21_r[T1][T2][L1][N1][L2].at(N2).cal_grad_overlap(origin_point,
+                                                                                                        distance,
+                                                                                                        m1,
+                                                                                                        2,
+                                                                                                        m2); // m = -1
 
+    ModuleBase::Vector3<double> grad_rz = factor * center2_orb21_r[T1][T2][L1][N1][L2].at(N2).cal_grad_overlap(origin_point,
+                                                                                                        distance,
+                                                                                                        m1,
+                                                                                                        0,
+                                                                                                        m2); // m =  0
+
+    ModuleBase::Vector3<double> temp_prp = Efield[0] * grad_rx + Efield[1] * grad_ry + Efield[2] * grad_rz + (Efield*(R1-dR)) * grad_o;
+
+    return temp_prp;
+}
 void cal_r_overlap_R::get_psi_r_beta(const UnitCell& ucell,
                                      std::vector<std::vector<double>>& nlm,
                                      const ModuleBase::Vector3<double>& R1,
diff --git a/source/source_io/module_hs/cal_r_overlap_R.h b/source/source_io/module_hs/cal_r_overlap_R.h
index 2329cd4ea85..0f7d51e1491 100644
--- a/source/source_io/module_hs/cal_r_overlap_R.h
+++ b/source/source_io/module_hs/cal_r_overlap_R.h
@@ -45,6 +45,20 @@ class cal_r_overlap_R
       const int& m2,
       const int& N2
     );
+    ModuleBase::Vector3<double> get_psi_r_gradpsi(
+      const ModuleBase::Vector3<double>& R1,
+      const int& T1,
+      const int& L1,
+      const int& m1,
+      const int& N1,
+      const ModuleBase::Vector3<double>& R2,
+      const int& T2,
+      const int& L2,
+      const int& m2,
+      const int& N2,
+      const ModuleBase::Vector3<double>& Efield,
+      const ModuleBase::Vector3<double>& dR
+    );
     void get_psi_r_beta(
       const UnitCell& ucell,
       std::vector<std::vector<double>>& nlm,
diff --git a/source/source_lcao/FORCE_STRESS.cpp b/source/source_lcao/FORCE_STRESS.cpp
index b2e810f2838..cf1e7f7012b 100644
--- a/source/source_lcao/FORCE_STRESS.cpp
+++ b/source/source_lcao/FORCE_STRESS.cpp
@@ -24,7 +24,9 @@
 #include "source_lcao/module_operator_lcao/nonlocal.h"
 #include "source_lcao/module_operator_lcao/ekinetic.h"
 #include "source_lcao/module_operator_lcao/overlap.h"
+#include "source_lcao/module_operator_lcao/td_pot_hybrid.h"
 #include "source_lcao/pulay_fs.h"
+#include "source_lcao/module_rt/force_rt_overlap.h"
 
 
 // mohan add 2025-11-04
@@ -85,7 +87,8 @@ void Force_Stress_LCAO<T>::getForceStress(UnitCell& ucell,
                                           Plus_U &dftu, // mohan add 2025-11-07
                                           Setup_DeePKS<T>& deepks,
                                           Exx_NAO<T> &exx_nao,
-                                          ModuleSymmetry::Symmetry* symm)
+                                          ModuleSymmetry::Symmetry* symm,
+                                          hamilt::Hamilt<T>* p_hamilt)
 {
     ModuleBase::TITLE("Force_Stress_LCAO", "getForceStress");
     ModuleBase::timer::start("Force_Stress_LCAO", "getForceStress");
@@ -113,6 +116,7 @@ void Force_Stress_LCAO<T>::getForceStress(UnitCell& ucell,
     ModuleBase::matrix fcc;
     ModuleBase::matrix fscc;
     ModuleBase::matrix fvnl_dalpha; // deepks
+    ModuleBase::matrix fpothybrid;
 
     fvl_dphi.create(nat, 3); // must do it now, update it later, noted by zhengdy
 
@@ -127,6 +131,7 @@ void Force_Stress_LCAO<T>::getForceStress(UnitCell& ucell,
         fcc.create(nat, 3); // force due to core correction
         fscc.create(nat, 3); // force due to self-consistent field
         fvnl_dalpha.create(nat, 3); // deepks
+        fpothybrid.create(nat, 3); // pulay force for hybrid gauge rt-tddft
 
         // calculate basic terms in Force, same method with PW base
         this->calForcePwPart(ucell, fvl_dvl, fewalds, fcc, fscc, pelec->f_en.etxc,
@@ -199,7 +204,10 @@ void Force_Stress_LCAO<T>::getForceStress(UnitCell& ucell,
         hamilt::Overlap<hamilt::OperatorLCAO<T, double>> tmp_overlap(
             nullptr, kv.kvec_d, nullptr, nullptr, &ucell, orb.cutoffs(), &gd,
             two_center_bundle.overlap_orb.get());
-        tmp_overlap.cal_force_stress(isforce, isstress, edmR, foverlap, soverlap);
+        if(PARAM.inp.td_stype != 2)
+        {
+            tmp_overlap.cal_force_stress(isforce, isstress, edmR, foverlap, soverlap);
+        }
 
         // Calculate nonlocal force/stress (uses DM)
         hamilt::Nonlocal<hamilt::OperatorLCAO<T, double>> tmp_nonlocal(
@@ -207,6 +215,15 @@ void Force_Stress_LCAO<T>::getForceStress(UnitCell& ucell,
             two_center_bundle.overlap_orb_beta.get());
         tmp_nonlocal.cal_force_stress(isforce, isstress, dmR, fvnl_dbeta, svnl_dbeta);
         
+        if(PARAM.inp.td_stype == 2)
+        {
+            hamilt::TD_pot_hybrid<hamilt::OperatorLCAO<T, double>> tmp_hybrid(
+                nullptr, &kv, nullptr, nullptr, orb, &ucell, orb.cutoffs(), &gd, nullptr);
+            tmp_hybrid.cal_force_stress(isforce, dmR, fpothybrid);
+
+            cal_foverlap_rt(foverlap, dmat, p_hamilt, kv, pv, ucell);
+        }
+
         // Switch back to spin channel 0
         if (PARAM.inp.nspin == 2)
         {
@@ -501,6 +518,7 @@ void Force_Stress_LCAO<T>::getForceStress(UnitCell& ucell,
         //---------------------------------
         // sum all parts of force!
         //---------------------------------
+        ModuleBase::Vector3<double> net_force = {0.0, 0.0, 0.0};
         for (int i = 0; i < 3; i++)
         {
             double sum = 0.0;
@@ -511,7 +529,8 @@ void Force_Stress_LCAO<T>::getForceStress(UnitCell& ucell,
                                + fvl_dvl(iat, i) // derivative of local potential force (pw)
                                + fewalds(iat, i) // ewald force (pw)
                                + fcc(iat, i)     // nonlinear core correction force (pw)
-                               + fscc(iat, i);   // self consistent corretion force (pw)
+                               + fscc(iat, i)    // self consistent corretion force (pw)
+                               + fpothybrid(iat, i); // pulay force for hybrid gauge rt-tddft
 
                 // Force contribution from DFT+U, Quxin add on 20201029
                 if (PARAM.inp.dft_plus_u)
@@ -564,7 +583,7 @@ void Force_Stress_LCAO<T>::getForceStress(UnitCell& ucell,
                 // sum total force for correction
                 sum += fcs(iat, i);
             }
-
+            net_force[i]=sum;
             if (!(PARAM.inp.gate_flag || PARAM.inp.efield_flag))
             {
                 for (int iat = 0; iat < nat; ++iat)
@@ -678,6 +697,9 @@ void Force_Stress_LCAO<T>::getForceStress(UnitCell& ucell,
 
         // this->printforce_total(ry, istestf, fcs);
         ModuleIO::print_force(GlobalV::ofs_running, ucell, "TOTAL-FORCE (eV/Angstrom)", fcs, false);
+        net_force*= ModuleBase::Ry_to_eV / ModuleBase::BOHR_TO_A;
+        ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running, "Net force vector (eV/Ang)", net_force.x, net_force.y, net_force.z);
+        ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running, "Total drift (ev/Ang)", net_force.norm());
         if (istestf)
         {
             GlobalV::ofs_running << "\n FORCE INVALID TABLE." << std::endl;
diff --git a/source/source_lcao/FORCE_STRESS.h b/source/source_lcao/FORCE_STRESS.h
index f23a03f786a..2856a105afa 100644
--- a/source/source_lcao/FORCE_STRESS.h
+++ b/source/source_lcao/FORCE_STRESS.h
@@ -53,7 +53,8 @@ class Force_Stress_LCAO
 						Plus_U &dftu, // mohan add 2025-11-07
                         Setup_DeePKS<T> &deepks,
                         Exx_NAO<T> &exx_nao,
-                        ModuleSymmetry::Symmetry* symm);
+                        ModuleSymmetry::Symmetry* symm,
+                        hamilt::Hamilt<T>* p_hamilt = nullptr);
 
   private:
     int nat;
diff --git a/source/source_lcao/module_operator_lcao/td_pot_hybrid.cpp b/source/source_lcao/module_operator_lcao/td_pot_hybrid.cpp
index 015af41fec1..429f8be5b37 100644
--- a/source/source_lcao/module_operator_lcao/td_pot_hybrid.cpp
+++ b/source/source_lcao/module_operator_lcao/td_pot_hybrid.cpp
@@ -23,13 +23,17 @@ hamilt::TD_pot_hybrid<hamilt::OperatorLCAO<TK, TR>>::TD_pot_hybrid(
 {
     this->cal_type = calculation_type::lcao_tddft_periodic;
     this->ucell = ucell_in;
+    this->gridD = GridD_in;
 #ifdef __DEBUG
     assert(this->ucell != nullptr);
     assert(this->hsk != nullptr);
 #endif
     this->init_td();
     // initialize HR to allocate sparse Ekinetic matrix memory
-    this->initialize_HR(GridD_in);
+    if(hR_in != nullptr)
+    {
+        this->initialize_HR(GridD_in);
+    }
 }
 
 // destructor
@@ -289,6 +293,7 @@ template <typename TK, typename TR>
 void hamilt::TD_pot_hybrid<hamilt::OperatorLCAO<TK, TR>>::contributeHk(int ik) {
     return;
 }
-
+#include "td_pot_hybrid_force.hpp"
+template class hamilt::TD_pot_hybrid<hamilt::OperatorLCAO<double, double>>;
 template class hamilt::TD_pot_hybrid<hamilt::OperatorLCAO<std::complex<double>, double>>;
 template class hamilt::TD_pot_hybrid<hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>>;
diff --git a/source/source_lcao/module_operator_lcao/td_pot_hybrid.h b/source/source_lcao/module_operator_lcao/td_pot_hybrid.h
index 7b86218e099..25f78c27237 100644
--- a/source/source_lcao/module_operator_lcao/td_pot_hybrid.h
+++ b/source/source_lcao/module_operator_lcao/td_pot_hybrid.h
@@ -66,6 +66,9 @@ class TD_pot_hybrid<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
 
     virtual void set_HR_fixed(void*) override;
 
+    void cal_force_stress(const bool cal_force,
+                          const HContainer<TR>* dmR,
+                          ModuleBase::matrix& force);
 
   private:
     const UnitCell* ucell = nullptr;
@@ -120,6 +123,17 @@ class TD_pot_hybrid<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
 
     /// @brief exact the nearest neighbor atoms from all adjacent atoms
     std::vector<AdjacentAtomInfo> adjs_all;
+
+    void cal_force_IJR(const int& iat1,
+                    const int& iat2,
+                    const Parallel_Orbitals* paraV,
+                    const ModuleBase::Vector3<double>& dtau,     
+                    const ModuleBase::Vector3<double>& dR,                  
+                    TR* dmR_pointer,
+                    double* force1,
+                    double* force2);
+
+    const Grid_Driver* gridD = nullptr;
 };
 
 } // namespace hamilt
diff --git a/source/source_lcao/module_operator_lcao/td_pot_hybrid_force.hpp b/source/source_lcao/module_operator_lcao/td_pot_hybrid_force.hpp
new file mode 100644
index 00000000000..90971546f07
--- /dev/null
+++ b/source/source_lcao/module_operator_lcao/td_pot_hybrid_force.hpp
@@ -0,0 +1,175 @@
+#pragma once
+#include "td_pot_hybrid.h"
+#include "source_base/parallel_reduce.h"
+#include "source_base/timer.h"
+#include "source_base/libm/libm.h"
+
+namespace hamilt
+{
+template <typename TK, typename TR>
+void TD_pot_hybrid<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
+                                                            const HContainer<TR>* dmR,
+                                                            ModuleBase::matrix& force)
+{
+    Et = elecstate::H_TDDFT_pw::Et;
+    const Parallel_Orbitals* paraV = dmR->get_paraV();
+    #pragma omp parallel
+    {
+    ModuleBase::matrix force_local(force.nr, force.nc);
+    #pragma omp for schedule(dynamic)
+    for (int iat1 = 0; iat1 < ucell->nat; iat1++)
+    {
+        auto tau1 = ucell->get_tau(iat1);
+        int T1, I1;
+        ucell->iat2iait(iat1, &I1, &T1);
+        AdjacentAtomInfo adjs;
+        this->gridD->Find_atom(*ucell, tau1, T1, I1, &adjs);
+        std::vector<bool> is_adj(adjs.adj_num + 1, false);
+        for (int ad1 = 0; ad1 < adjs.adj_num + 1; ++ad1)
+        {
+            const int T2 = adjs.ntype[ad1];
+            const int I2 = adjs.natom[ad1];
+            const int iat2 = ucell->itia2iat(T2, I2);
+            if (paraV->get_row_size(iat1) <= 0 || paraV->get_col_size(iat2) <= 0)
+            {
+                continue;
+            }
+            const ModuleBase::Vector3<int>& R_index2 = adjs.box[ad1];
+            // choose the real adjacent atoms
+            // Note: the distance of atoms should less than the cutoff radius,
+            // When equal, the theoretical value of matrix element is zero,
+            // but the calculated value is not zero due to the numerical error, which would lead to result changes.
+            if (this->ucell->cal_dtau(iat1, iat2, R_index2).norm() * this->ucell->lat0
+                < orb_cutoff_[T1] + orb_cutoff_[T2])
+            {
+                is_adj[ad1] = true;
+            }
+        }
+        filter_adjs(is_adj, adjs);
+
+        for (int ad = 0; ad < adjs.adj_num + 1; ++ad)
+        {
+            const int T2 = adjs.ntype[ad];
+            const int I2 = adjs.natom[ad];
+            const int iat2 = ucell->itia2iat(T2, I2);
+            double* force_tmp1 = (cal_force) ? &force_local(iat2, 0) : nullptr;
+            // force_tmp1[0]=0;
+            // force_tmp1[1]=0;
+            // force_tmp1[2]=0;
+            double* force_tmp2 = (cal_force) ? &force_local(iat1, 0) : nullptr;
+            const ModuleBase::Vector3<int>& R_index2 = adjs.box[ad];
+            ModuleBase::Vector3<double> dtau = this->ucell->cal_dtau(iat1, iat2, R_index2);
+            ModuleBase::Vector3<double> dR = this->ucell->cal_dtau(iat1, iat1, R_index2);
+
+            const BaseMatrix<TR>* tmp = dmR->find_matrix(iat1, iat2, R_index2);
+            if (tmp != nullptr)
+            {
+                this->cal_force_IJR(iat1, iat2, paraV, dtau, dR, tmp->get_pointer(), force_tmp1, force_tmp2);
+            }
+            else
+            {
+                ModuleBase::WARNING_QUIT("TD_pot_hybrid::calculate_HR", "R_index not found in HR");
+            }
+            // std::cout<<"iat1: "<<iat1<< " ad: " <<ad<<std::endl;
+            // std::cout<<"T1: "<<T1<< " I1: " << I1 <<" T2: "<<T2<<" I2: "<<I2<<" R_ind_Z: "<<R_index2.z<<std::endl;
+            // std::cout<<"force_z: "<<force_tmp1[2]<<std::endl;
+            // std::cout<<"dtau: "<<dtau[0]<<" "<<dtau[1]<<" "<<dtau[2]<<std::endl;
+        }
+    }
+    #pragma omp critical
+    {
+        if(cal_force)
+        {
+            force += force_local;
+        }
+    }
+    }
+
+    if (cal_force)
+    {
+#ifdef __MPI
+        // sum up the occupation matrix
+        Parallel_Reduce::reduce_all(force.c, force.nr * force.nc);
+#endif
+        // for (int i = 0; i < force.nr * force.nc; i++)
+        // {
+        //     force.c[i] *= 2.0;
+        // }
+    }
+}
+template <>
+void TD_pot_hybrid<OperatorLCAO<std::complex<double>, std::complex<double>>>::cal_force_IJR(const int& iat1,
+                                                                            const int& iat2,
+                                                                            const Parallel_Orbitals* paraV,
+                                                                            const ModuleBase::Vector3<double>& dtau,
+                                                                            const ModuleBase::Vector3<double>& dR,
+                                                                            std::complex<double>* dmR_pointer,
+                                                                            double* force1,
+                                                                            double* force2)
+{
+    return;
+}
+template <typename TK, typename TR>
+void TD_pot_hybrid<OperatorLCAO<TK, TR>>::cal_force_IJR(const int& iat1,
+                                                        const int& iat2,
+                                                        const Parallel_Orbitals* paraV,
+                                                        const ModuleBase::Vector3<double>& dtau,
+                                                        const ModuleBase::Vector3<double>& dR,
+                                                        TR* dmR_pointer,
+                                                        double* force1,
+                                                        double* force2)
+{
+    // ---------------------------------------------
+    // get info of orbitals of atom1 and atom2 from ucell
+    // ---------------------------------------------
+    int T1, I1;
+    this->ucell->iat2iait(iat1, &I1, &T1);
+    int T2, I2;
+    this->ucell->iat2iait(iat2, &I2, &T2);
+    Atom& atom1 = this->ucell->atoms[T1];
+    Atom& atom2 = this->ucell->atoms[T2];
+
+    const int* iw2l1 = atom1.iw2l.data();
+    const int* iw2n1 = atom1.iw2n.data();
+    const int* iw2m1 = atom1.iw2m.data();
+    const int* iw2l2 = atom2.iw2l.data();
+    const int* iw2n2 = atom2.iw2n.data();
+    const int* iw2m2 = atom2.iw2m.data();
+
+    // ---------------------------------------------
+    // calculate the Ekinetic matrix for each pair of orbitals
+    // ---------------------------------------------
+    double olm[3] = {0, 0, 0};
+    auto row_indexes = paraV->get_indexes_row(iat1);
+    auto col_indexes = paraV->get_indexes_col(iat2);
+    const int step_trace = col_indexes.size() + 1;
+    ModuleBase::Vector3<double> shift = {0.0, 0.0, 1.0};
+    const ModuleBase::Vector3<double>& tau1 = this->ucell->get_tau(iat1);
+    const ModuleBase::Vector3<double> tau2 = tau1 + dtau;
+    for (int iw1l = 0; iw1l < row_indexes.size(); iw1l++)
+    {
+        const int iw1 = row_indexes[iw1l];
+        const int L1 = iw2l1[iw1];
+        const int N1 = iw2n1[iw1];
+        const int m1 = iw2m1[iw1];
+
+        for (int iw2l = 0; iw2l < col_indexes.size(); iw2l++)
+        {
+            const int iw2 = col_indexes[iw2l];
+            const int L2 = iw2l2[iw2];
+            const int N2 = iw2n2[iw2];
+            const int m2 = iw2m2[iw2];
+            
+            ModuleBase::Vector3<double> tmp_grad = r_calculator->get_psi_r_gradpsi(tau1 * this->ucell->lat0, T1, L1, m1, N1, tau2 * this->ucell->lat0, T2, L2, m2, N2, Et, dR * this->ucell->lat0);
+            ModuleBase::Vector3<double> tmp_grad1 = r_calculator->get_psi_r_gradpsi(tau2 * this->ucell->lat0, T2, L2, m2, N2, tau1 * this->ucell->lat0, T1, L1, m1, N1, Et, dR * this->ucell->lat0);
+
+            for(int i = 0; i < 3; i++)
+            {
+                force1[i] -= dmR_pointer[0] * tmp_grad[i];
+                force2[i] -= dmR_pointer[0] * tmp_grad1[i];
+            }
+            dmR_pointer++;
+        }
+    }
+}
+}// namespace hamilt
\ No newline at end of file
diff --git a/source/source_lcao/module_rt/CMakeLists.txt b/source/source_lcao/module_rt/CMakeLists.txt
index 046632f3138..22af0621701 100644
--- a/source/source_lcao/module_rt/CMakeLists.txt
+++ b/source/source_lcao/module_rt/CMakeLists.txt
@@ -17,6 +17,7 @@ if(ENABLE_LCAO)
         solve_propagation.cpp
         boundary_fix.cpp
         td_moving_gauge.cpp
+        force_rt_overlap.cpp
     )
 
     if(USE_CUDA)
diff --git a/source/source_lcao/module_rt/force_rt_overlap.cpp b/source/source_lcao/module_rt/force_rt_overlap.cpp
new file mode 100644
index 00000000000..f90a8e9e22b
--- /dev/null
+++ b/source/source_lcao/module_rt/force_rt_overlap.cpp
@@ -0,0 +1,283 @@
+#include "force_rt_overlap.h"
+#include "td_info.h"
+#include "td_folding.h"
+#include "source_base/module_external/lapack_connector.h"
+#include "source_base/module_external/scalapack_connector.h"
+#include "source_estate/module_pot/H_TDDFT_pw.h"
+#include "source_base/parallel_reduce.h"
+template <>
+void cal_foverlap_rt(ModuleBase::matrix& foverlap,
+                     const LCAO_domain::Setup_DM<std::complex<double>>& dmat,
+                     hamilt::Hamilt<std::complex<double>>* p_hamilt,
+                     const K_Vectors& kv,
+                     Parallel_Orbitals& pv,
+                     UnitCell& ucell)
+{
+    const int nlocal = PARAM.globalv.nlocal;
+    assert(nlocal >= 0);
+
+    TD_info* td_info = TD_info::td_vel_op;
+    //get dS/dR_{x,y,z}
+    std::vector<hamilt::HContainer<double>*> dsxr = td_info->get_grad_overlap();
+    // allocate matrix
+    const long nloc = pv.nloc;
+    const int nrow = pv.nrow;
+    std::complex<double>* Htmp = new std::complex<double>[nloc];
+    std::complex<double>* Sinv = new std::complex<double>[nloc];
+    std::complex<double>* dsxk = new std::complex<double>[nloc];
+    std::complex<double>* pdsxk = new std::complex<double>[nloc];
+    std::complex<double>* tmp1 = new std::complex<double>[nloc];
+    std::complex<double>* tmp2 = new std::complex<double>[nloc];
+    std::complex<double>* tmp3 = new std::complex<double>[nloc];
+    std::complex<double>* Hybridtmp = new std::complex<double>[nloc];
+    std::vector<std::complex<double>*> tmp_out = {nullptr, nullptr, nullptr};
+    for(int dir = 0; dir<3; dir++)
+    {
+        tmp_out[dir] = new std::complex<double>[nloc];
+        ModuleBase::GlobalFunc::ZEROS(tmp_out[dir], nloc);
+    }
+    for (int ik = 0; ik < kv.get_nks(); ++ik)
+    {
+        p_hamilt->updateHk(ik);
+        // get dmk
+        std::complex<double>* tmp_dmk = dmat.dm->get_DMK_pointer(ik);
+
+        ModuleBase::GlobalFunc::ZEROS(Htmp, nloc);
+        ModuleBase::GlobalFunc::ZEROS(Sinv, nloc);
+        ModuleBase::GlobalFunc::ZEROS(tmp1, nloc);
+        ModuleBase::GlobalFunc::ZEROS(tmp2, nloc);
+        ModuleBase::GlobalFunc::ZEROS(Hybridtmp, nloc);
+        
+        const int inc = 1;
+
+        hamilt::MatrixBlock<std::complex<double>> h_mat;
+        hamilt::MatrixBlock<std::complex<double>> s_mat;
+        //get Hk Sk
+        p_hamilt->matrix(h_mat, s_mat);
+        BlasConnector::copy(nloc, h_mat.p, inc, Htmp, inc);
+        BlasConnector::copy(nloc, s_mat.p, inc, Sinv, inc);
+
+        vector<int> ipiv(nloc, 0);
+        int info = 0;
+        const int one_int = 1;
+        const std::complex<double> one_complex = {1.0, 0.0};
+        const std::complex<double> two_complex = {2.0, 0.0};
+        const std::complex<double> mone_complex = {-1.0, 0.0};
+        const std::complex<double> zero_complex = {0.0, 0.0};
+
+        ScalapackConnector::getrf(nlocal, nlocal, Sinv, one_int, one_int, pv.desc, ipiv.data(), &info);
+
+        int lwork = -1;
+        int liwork = -1;
+
+        // if lwork == -1, then the size of work is (at least) of length 1.
+        std::vector<std::complex<double>> work(1, 0);
+
+        // if liwork = -1, then the size of iwork is (at least) of length 1.
+        std::vector<int> iwork(1, 0);
+
+        ScalapackConnector::getri(nlocal,
+                                  Sinv,
+                                  one_int,
+                                  one_int,
+                                  pv.desc,
+                                  ipiv.data(),
+                                  work.data(),
+                                  &lwork,
+                                  iwork.data(),
+                                  &liwork,
+                                  &info);
+
+        lwork = work[0].real();
+        work.resize(lwork, 0);
+        liwork = iwork[0];
+        iwork.resize(liwork, 0);
+
+        ScalapackConnector::getri(nlocal,
+                                  Sinv,
+                                  one_int,
+                                  one_int,
+                                  pv.desc,
+                                  ipiv.data(),
+                                  work.data(),
+                                  &lwork,
+                                  iwork.data(),
+                                  &liwork,
+                                  &info);
+
+        const char N_char = 'N';
+        const char T_char = 'T';
+        const char C_char = 'C';
+        
+
+        ScalapackConnector::gemm(T_char,
+                                 C_char,
+                                 nlocal,
+                                 nlocal,
+                                 nlocal,
+                                 one_complex,
+                                 tmp_dmk,
+                                 one_int,
+                                 one_int,
+                                 pv.desc,
+                                 Htmp,
+                                 one_int,
+                                 one_int,
+                                 pv.desc,
+                                 zero_complex,
+                                 tmp1,
+                                 one_int,
+                                 one_int,
+                                 pv.desc);
+
+        ScalapackConnector::gemm(N_char,
+                                 N_char,
+                                 nlocal,
+                                 nlocal,
+                                 nlocal,
+                                 one_complex,
+                                 tmp1,
+                                 one_int,
+                                 one_int,
+                                 pv.desc,
+                                 Sinv,
+                                 one_int,
+                                 one_int,
+                                 pv.desc,
+                                 zero_complex,
+                                 tmp2,
+                                 one_int,
+                                 one_int,
+                                 pv.desc);
+        for(int dir = 0; dir<3; dir++)
+        {
+            ModuleBase::GlobalFunc::ZEROS(dsxk, nloc);
+            ModuleBase::GlobalFunc::ZEROS(pdsxk, nloc);
+            ModuleBase::GlobalFunc::ZEROS(tmp3, nloc);
+            module_rt::folding_HR_td(*dsxr[dir], dsxk, kv.kvec_d[ik], TD_info::cart_At, TD_info::td_vel_op->get_phase_hybrid(), nrow, 1);
+            module_rt::folding_partial_dot(*dsxr[dir], pdsxk, kv.kvec_d[ik], nrow, 1, &ucell, TD_info::td_vel_op->get_phase_hybrid(), TD_info::cart_At, elecstate::H_TDDFT_pw::Et);
+            ScalapackConnector::gemm(N_char,
+                                     N_char,
+                                     nlocal,
+                                     nlocal,
+                                     nlocal,
+                                     two_complex,
+                                     tmp2,
+                                     one_int,
+                                     one_int,
+                                     pv.desc,
+                                     dsxk,
+                                     one_int,
+                                     one_int,
+                                     pv.desc,
+                                     one_complex,
+                                     tmp_out[dir],
+                                     one_int,
+                                     one_int,
+                                     pv.desc);
+
+            ScalapackConnector::gemm(T_char,
+                                     N_char,
+                                     nlocal,
+                                     nlocal,
+                                     nlocal,
+                                     mone_complex,
+                                     tmp_dmk,
+                                     one_int,
+                                     one_int,
+                                     pv.desc,
+                                     pdsxk,
+                                     one_int,
+                                     one_int,
+                                     pv.desc,
+                                     one_complex,
+                                     tmp3,
+                                     one_int,
+                                     one_int,
+                                     pv.desc);
+            
+            ScalapackConnector::geadd(N_char,
+                                      nlocal,
+                                      nlocal,
+                                      one_complex,
+                                      tmp3,
+                                      one_int,
+                                      one_int,
+                                      pv.desc,
+                                      one_complex,
+                                      tmp_out[dir],
+                                      one_int,
+                                      one_int,
+                                      pv.desc);
+        }
+    }
+    delete[] Htmp;
+    delete[] Sinv;
+    delete[] tmp1;
+    delete[] tmp2;
+    delete[] dsxk;
+    delete[] Hybridtmp;
+    // std::string filename = "process_debug_" + std::to_string(GlobalV::MY_RANK) + ".txt";
+    // std::ofstream debug_file(filename);
+    // debug_file << "=== Process " << GlobalV::MY_RANK << " ===" << std::endl;
+    // debug_file << "=== Matrix Partition Information ===" << std::endl;
+    // auto row_indexes = pv.get_indexes_row();
+    // auto col_indexes = pv.get_indexes_col();
+    // for(int iat = 0; iat < ucell.nat; iat++)
+    // {
+    //     int row0 = pv.atom_begin_row[iat];
+    //     int col0 = pv.atom_begin_col[iat];
+    //     for(int mu = 0; mu < pv.get_row_size(iat); ++mu)
+    //     {
+    //         for(int nu = 0; nu < pv.get_col_size(iat); ++nu)
+    //         {
+    //             debug_file<<"mu: "<<mu<<" nu: "<<nu<<std::endl;
+    //             debug_file<<"globalmu: "<<row_indexes[row0+mu]<<" globalnu: "<<col_indexes[col0+nu]<<std::endl;
+    //         }
+    //     }
+    // }
+    auto row_indexes = pv.get_indexes_row();
+    auto col_indexes = pv.get_indexes_col();
+    #pragma omp parallel for
+    for(int iat = 0; iat < ucell.nat; iat++)
+    {
+        
+        double* force_tmp1 = &foverlap(iat, 0);
+        int row0 = pv.atom_begin_row[iat];
+        int col0 = pv.atom_begin_col[iat];
+        const int row_size = pv.get_row_size();
+        std::vector<std::complex<double>*> p_diag = {tmp_out[0], tmp_out[1], tmp_out[2]};
+        for(int mu = 0; mu < pv.get_row_size(iat); ++mu)
+        {
+            for(int nu = 0; nu < pv.get_col_size(iat); ++nu)
+            {
+                if(row_indexes[row0+mu]==col_indexes[col0+nu])
+                {
+                    const long index = (row0 + mu) + (col0 + nu)*row_size;
+                    for(int dir = 0; dir<3; dir++)
+                    {
+                        // p_diag[dir] = tmp_out[dir] + (row0 + mu) + (col0 + nu) * row_size;
+                        force_tmp1[dir] += (tmp_out[dir][index]).real();
+                    }
+                }
+            }
+            
+        }
+    }
+    Parallel_Reduce::reduce_all(foverlap.c, foverlap.nr * foverlap.nc);
+    for(int dir = 0; dir<3; dir++)
+    {
+        delete[] tmp_out[dir];
+    }
+    return;
+}
+template <>
+void cal_foverlap_rt(ModuleBase::matrix& foverlap,
+                     const LCAO_domain::Setup_DM<double>& dmat,
+                     hamilt::Hamilt<double>* p_hamilt,
+                     const K_Vectors& kv,
+                     Parallel_Orbitals& pv,
+                     UnitCell& ucell)
+{
+    return;
+}
\ No newline at end of file
diff --git a/source/source_lcao/module_rt/force_rt_overlap.h b/source/source_lcao/module_rt/force_rt_overlap.h
new file mode 100644
index 00000000000..cfffc32bb95
--- /dev/null
+++ b/source/source_lcao/module_rt/force_rt_overlap.h
@@ -0,0 +1,17 @@
+#ifndef FORCE_RT_OVERLAP_H
+#define FORCE_RT_OVERLAP_H
+
+#include "source_basis/module_ao/parallel_orbitals.h"
+#include "source_cell/klist.h"
+#include "source_lcao/setup_dm.h"
+#include "source_hamilt/hamilt.h"
+
+template <typename T>
+void cal_foverlap_rt(ModuleBase::matrix& foverlap,
+                     const LCAO_domain::Setup_DM<T>& dmat,
+                     hamilt::Hamilt<T>* p_hamilt,
+                     const K_Vectors& kv,
+                     Parallel_Orbitals& pv,
+                     UnitCell& ucell);
+
+#endif
\ No newline at end of file
diff --git a/source/source_lcao/module_rt/td_folding.cpp b/source/source_lcao/module_rt/td_folding.cpp
index bd3b4cd24d0..2df49042ca2 100644
--- a/source/source_lcao/module_rt/td_folding.cpp
+++ b/source/source_lcao/module_rt/td_folding.cpp
@@ -102,6 +102,42 @@ void folding_partial_HR_td(const UnitCell& ucell,
         }
     }
 }
+void folding_partial_dot(const hamilt::HContainer<double>& dR,
+            std::complex<double>* dk,
+            const ModuleBase::Vector3<double>& kvec_d_in,
+            const int ncol,
+            const int hk_type,
+            const UnitCell* ucell,
+            const std::map<ModuleBase::Vector3<int>, std::complex<double>>& phase_hybrid,
+            const ModuleBase::Vector3<double>& At,
+            const ModuleBase::Vector3<double>& Et)
+{
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+    for (int i = 0; i < dR.size_atom_pairs(); ++i)
+    {
+        hamilt::AtomPair<double>& tmp = dR.get_atom_pair(i);
+        for(int ir = 0;ir < tmp.get_R_size(); ++ir )
+        {
+            const ModuleBase::Vector3<int> r_index = tmp.get_R_index(ir);
+            const int iat1 = tmp.get_atom_i();
+            const int iat2 = tmp.get_atom_j();
+            // cal k_phase
+            // if TK==std::complex<double>, kphase is e^{ikR}
+            const ModuleBase::Vector3<double> dR(r_index.x, r_index.y, r_index.z);            
+            const double arg = (kvec_d_in * dR) * ModuleBase::TWO_PI;
+            double sinp, cosp;
+            ModuleBase::libm::sincos(arg, &sinp, &cosp);
+            std::complex<double> kphase = std::complex<double>(cosp, sinp);
+            kphase *= phase_hybrid.at(r_index);
+            ModuleBase::Vector3<double> dtau = ucell->cal_dtau(iat1, iat1, r_index);
+            kphase *= Et * dtau * ucell->lat0;
+            tmp.find_R(r_index);
+            tmp.add_to_matrix(dk, ncol, kphase, hk_type);
+        }
+    }
+}
 template
 void folding_HR_td<double>(const hamilt::HContainer<double>& hR,
                         std::complex<double>* hk,
diff --git a/source/source_lcao/module_rt/td_folding.h b/source/source_lcao/module_rt/td_folding.h
index d61d72488e2..73b954c625e 100644
--- a/source/source_lcao/module_rt/td_folding.h
+++ b/source/source_lcao/module_rt/td_folding.h
@@ -30,6 +30,15 @@ void folding_partial_HR_td(const UnitCell& ucell,
             const int ix,
             const int ncol,
             const int hk_type);
+void folding_partial_dot(const hamilt::HContainer<double>& dR,
+            std::complex<double>* dk,
+            const ModuleBase::Vector3<double>& kvec_d_in,
+            const int ncol,
+            const int hk_type,
+            const UnitCell* ucell,
+            const std::map<ModuleBase::Vector3<int>, std::complex<double>>& phase_hybrid,
+            const ModuleBase::Vector3<double>& At,
+            const ModuleBase::Vector3<double>& Et);
 }// namespace module_rt
 
 #endif
\ No newline at end of file
diff --git a/source/source_lcao/module_rt/td_info.cpp b/source/source_lcao/module_rt/td_info.cpp
index f4251cc3731..e39bc96cae8 100644
--- a/source/source_lcao/module_rt/td_info.cpp
+++ b/source/source_lcao/module_rt/td_info.cpp
@@ -58,6 +58,13 @@ TD_info::~TD_info()
             delete this->current_term[dir];
         }
     }
+    for (int dir = 0; dir < 3; dir++)
+    {
+        if (this->grad_overlap[dir] != nullptr)
+        {
+            delete this->grad_overlap[dir];
+        }
+    }
 }
 
 void TD_info::output_cart_At(const std::string& out_dir)
@@ -267,7 +274,151 @@ void TD_info::destroy_HS_R_td_sparse(void)
     HR_sparse_td_vel[1].swap(empty_HR_sparse_td_vel_down);
 }
 
+void TD_info::calculate_grad_overlap(const Parallel_Orbitals& paraV,
+                                     const UnitCell& ucell,
+                                     const Grid_Driver& GridD,
+                                     const std::vector<double>& orb_cutoff,
+                                     const TwoCenterIntegrator* intor)
+{
+    ModuleBase::TITLE("TD_info", "calculate_grad_overlap");
+    ModuleBase::timer::start("TD_info", "calculate_grad_overlap");
+    for (int dir=0;dir<3;dir++)
+    {
+        if (this->grad_overlap[dir] != nullptr)
+        {
+            delete this->grad_overlap[dir];
+        }
+        this->grad_overlap[dir] = new hamilt::HContainer<double>(&paraV);
+    }
+    for (int iat1 = 0; iat1 < ucell.nat; iat1++)
+    {
+        auto tau1 = ucell.get_tau(iat1);
+        int T1=0;
+        int I1=0;
+        ucell.iat2iait(iat1, &I1, &T1);
+        AdjacentAtomInfo adjs;
+        GridD.Find_atom(ucell, tau1, T1, I1, &adjs);
+        for (int ad = 0; ad < adjs.adj_num + 1; ++ad)
+        {
+            const int T2 = adjs.ntype[ad];
+            const int I2 = adjs.natom[ad];
+            int iat2 = ucell.itia2iat(T2, I2);
+            if (paraV.get_row_size(iat1) <= 0 || paraV.get_col_size(iat2) <= 0)
+            {
+                continue;
+            }
+            const ModuleBase::Vector3<int>& R_index = adjs.box[ad];
+            // choose the real adjacent atoms
+            // Note: the distance of atoms should less than the cutoff radius,
+            // When equal, the theoretical value of matrix element is zero,
+            // but the calculated value is not zero due to the numerical error, which would lead to result changes.
+            if (ucell.cal_dtau(iat1, iat2, R_index).norm() * ucell.lat0
+                >= orb_cutoff[T1] + orb_cutoff[T2])
+            {
+                continue;
+            }
+            hamilt::AtomPair<double> tmp(iat1, iat2, R_index, &paraV);
+            for (int dir=0;dir<3;dir++)
+            {
+                this->grad_overlap[dir]->insert_pair(tmp);
+            }
+        }
+    }
+    // allocate the memory of BaseMatrix in grad_overlap, and set the new values to zero
+    for (int dir=0;dir<3;dir++)
+    {
+        this->grad_overlap[dir]->allocate(nullptr, true);
+    }
+
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+    for (int iap = 0; iap < this->grad_overlap[0]->size_atom_pairs(); ++iap)
+    {
+        hamilt::AtomPair<double>& tmp = this->grad_overlap[0]->get_atom_pair(iap);
+        const int iat1 = tmp.get_atom_i();
+        const int iat2 = tmp.get_atom_j();
+
+        for (int iR = 0; iR < tmp.get_R_size(); ++iR)
+        {
+            const ModuleBase::Vector3<int> R_index = tmp.get_R_index(iR);
+            auto dtau = ucell.cal_dtau(iat1, iat2, R_index);
+
+            double* p_data[3] = {nullptr, nullptr, nullptr};
+            for (int i = 0; i < 3; i++)
+            {
+                p_data[i] = this->grad_overlap[i]->get_atom_pair(iap).get_pointer(iR);
+            }
+            // ---------------------------------------------
+            // get info of orbitals of atom1 and atom2 from ucell
+            // ---------------------------------------------
+            int T1=0;
+            int I1=0;
+            ucell.iat2iait(iat1, &I1, &T1);
+            int T2=0;
+            int I2=0;
+            ucell.iat2iait(iat2, &I2, &T2);
+            Atom& atom1 = ucell.atoms[T1];
+            Atom& atom2 = ucell.atoms[T2];
+
+            // npol is the number of polarizations,
+            // 1 for non-magnetic (one Hamiltonian matrix only has spin-up or spin-down),
+            // 2 for magnetic (one Hamiltonian matrix has both spin-up and spin-down)
+            const int npol = ucell.get_npol();
 
+            const int* iw2l1 = atom1.iw2l.data();
+            const int* iw2n1 = atom1.iw2n.data();
+            const int* iw2m1 = atom1.iw2m.data();
+            const int* iw2l2 = atom2.iw2l.data();
+            const int* iw2n2 = atom2.iw2n.data();
+            const int* iw2m2 = atom2.iw2m.data();
+
+            // ---------------------------------------------
+            // calculate the overlap matrix for each pair of orbitals
+            // ---------------------------------------------
+            double olm[3] = {0, 0, 0};
+            auto row_indexes = paraV.get_indexes_row(iat1);
+            auto col_indexes = paraV.get_indexes_col(iat2);
+            const int step_trace = col_indexes.size() + 1;
+            for (int iw1l = 0; iw1l < row_indexes.size(); iw1l += npol)
+            {
+                const int iw1 = row_indexes[iw1l] / npol;
+                const int L1 = iw2l1[iw1];
+                const int N1 = iw2n1[iw1];
+                const int m1 = iw2m1[iw1];
+
+                // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
+                int M1 = (m1 % 2 == 0) ? -m1 / 2 : (m1 + 1) / 2;
+
+                for (int iw2l = 0; iw2l < col_indexes.size(); iw2l += npol)
+                {
+                    const int iw2 = col_indexes[iw2l] / npol;
+                    const int L2 = iw2l2[iw2];
+                    const int N2 = iw2n2[iw2];
+                    const int m2 = iw2m2[iw2];
+
+                    // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
+                    int M2 = (m2 % 2 == 0) ? -m2 / 2 : (m2 + 1) / 2;
+                    intor->calculate(T1, L1, N1, M1, T2, L2, N2, M2, dtau * ucell.lat0, nullptr, olm);
+                    for (int dir = 0; dir < 3; dir++)
+                    {
+                        for (int ipol = 0; ipol < npol; ipol++)
+                        {
+                            p_data[dir][ipol * step_trace] += olm[dir];
+                        }
+                        p_data[dir] += npol;
+                    }
+                }
+                for (int dir = 0; dir < 3; dir++)
+                {
+                    p_data[dir] += (npol - 1) * col_indexes.size();
+                }
+                
+            }
+        }
+    }
+    ModuleBase::timer::end("TD_info", "calculate_grad_overlap");
+}
 template
 void TD_info::initialize_phase_hybrid<std::complex<double>>(const UnitCell& ucell, const hamilt::HContainer<std::complex<double>>* hR);
 template
diff --git a/source/source_lcao/module_rt/td_info.h b/source/source_lcao/module_rt/td_info.h
index bccb1f27c49..d14c67daf70 100644
--- a/source/source_lcao/module_rt/td_info.h
+++ b/source/source_lcao/module_rt/td_info.h
@@ -4,6 +4,7 @@
 #include "source_base/timer.h"
 #include "source_lcao/module_hcontainer/hcontainer.h"
 #include "source_io/module_hs/cal_r_overlap_R.h"
+#include "source_basis/module_nao/two_center_integrator.h"
 
 #include <map>
 // Class to store TDDFT infos, mainly for periodic system.
@@ -64,6 +65,15 @@ class TD_info
         return this->phase_hybrid;
     }
 
+    void calculate_grad_overlap(const Parallel_Orbitals& paraV,
+                                const UnitCell& ucell,
+                                const Grid_Driver& GridD,
+                                const std::vector<double>& orb_cutoff,
+                                const TwoCenterIntegrator* intor);
+    std::vector<hamilt::HContainer<double>*> get_grad_overlap() const
+    {
+      return this->grad_overlap;
+    }
     // set velocity HR.
     void set_velocity_HR(hamilt::HContainer<std::complex<double>>* HR)
     {
@@ -107,6 +117,9 @@ class TD_info
     /// @brief store the read in At_data
     static std::vector<ModuleBase::Vector3<double>> At_from_file;
 
+    /// @brief store the dS/dD matrix
+    std::vector<hamilt::HContainer<double>*> grad_overlap = {nullptr, nullptr, nullptr};
+
     /// @brief destory HSR data stored
     void destroy_HS_R_td_sparse();
 

From 73326e5d111e915058b513c390939cc01b1c1ab2 Mon Sep 17 00:00:00 2001
From: zaotian <zaotian@mail.ustc.edu.cn>
Date: Tue, 16 Jun 2026 18:33:02 +0800
Subject: [PATCH 2/2] fix no MPI compile

---
 source/source_lcao/module_rt/force_rt_overlap.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/source/source_lcao/module_rt/force_rt_overlap.cpp b/source/source_lcao/module_rt/force_rt_overlap.cpp
index f90a8e9e22b..a95ccf93ef5 100644
--- a/source/source_lcao/module_rt/force_rt_overlap.cpp
+++ b/source/source_lcao/module_rt/force_rt_overlap.cpp
@@ -13,6 +13,7 @@ void cal_foverlap_rt(ModuleBase::matrix& foverlap,
                      Parallel_Orbitals& pv,
                      UnitCell& ucell)
 {
+#ifdef __MPI
     const int nlocal = PARAM.globalv.nlocal;
     assert(nlocal >= 0);
 
@@ -270,6 +271,7 @@ void cal_foverlap_rt(ModuleBase::matrix& foverlap,
         delete[] tmp_out[dir];
     }
     return;
+#endif
 }
 template <>
 void cal_foverlap_rt(ModuleBase::matrix& foverlap,