Perf: Better OpenMP Parallelization for cal_force_cc and cal_force_scc (#1840)

Alcanderian · web-flow · commit dc258a5d3e54 · 2023-02-06T16:31:52.000+08:00
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
@@ -727,7 +727,6 @@ void Forces<FPTYPE, Device>::cal_force_cc(ModuleBase::matrix& forcecc, ModulePW:
 {
     ModuleBase::TITLE("Forces", "cal_force_cc");
     // recalculate the exchange-correlation potential.
-    ModuleBase::TITLE("Forces", "cal_force_cc");
     ModuleBase::timer::tick("Forces", "cal_force_cc");
 
     int total_works = 0;
@@ -800,99 +799,79 @@ void Forces<FPTYPE, Device>::cal_force_cc(ModuleBase::matrix& forcecc, ModulePW:
     // to G space
     rho_basis->real2recip(psiv, psiv);
 
-#ifdef _OPENMP
-#pragma omp parallel
-{
-    int num_threads = omp_get_num_threads();
-    int thread_id = omp_get_thread_num();
-#else
-    int num_threads = 1;
-    int thread_id = 0;
-#endif
-
     // psiv contains now Vxc(G)
     double* rhocg = new double[rho_basis->ngg];
     ModuleBase::GlobalFunc::ZEROS(rhocg, rho_basis->ngg);
 
-    /* Here is task distribution for multi-thread,
-        0. Consider for load balancing, we distribute `total_works` instead of `nat`.
-            So simply use `#pragma omp parallel for` is not suitable
-        1. Calculate the work range [work_beg, work_end) by each thread
-            a. when it is single thread stage, [work_beg, work_end) will be [0, total_works)
-        2. each thread iterate atoms form `work_beg` to `work_end-1`
-    */
-    int work, work_end;
-    ModuleBase::TASK_DIST_1D(num_threads, thread_id, total_works, work, work_end);
-    work_end = work + work_end;
-
-    int it = 0;
-    int ia = 0;
-    // We have to map the work beginning position to `it` and `ia` beginning position of this thread
-    for (int work_off = 0; it < GlobalC::ucell.ntype; it++)
+    for (int it = 0; it < GlobalC::ucell.ntype; ++it)
     {
         if (GlobalC::ucell.atoms[it].ncpp.nlcc)
         {
-            if (work_off + GlobalC::ucell.atoms[it].na > work)
-            {
-                ia = work - work_off;
-                break;
-            }
-            work_off += GlobalC::ucell.atoms[it].na;
-        }
-    }
-
-    int last_it = -1;
-    while (work < work_end)
-    {
-        if (it != last_it)
-        {
-            // call drhoc when `it` is changed
             chr->non_linear_core_correction(GlobalC::ppcell.numeric,
                                                     GlobalC::ucell.atoms[it].ncpp.msh,
                                                     GlobalC::ucell.atoms[it].ncpp.r,
                                                     GlobalC::ucell.atoms[it].ncpp.rab,
                                                     GlobalC::ucell.atoms[it].ncpp.rho_atc,
                                                     rhocg,
                                                     rho_basis);
-            last_it = it;
-        }
-
-        // get iat form table
-        int iat = GlobalC::ucell.itia2iat(it, ia);
-        for (int ig = 0; ig < rho_basis->npw; ig++)
-        {
-            const ModuleBase::Vector3<double> gv = rho_basis->gcar[ig];
-            const ModuleBase::Vector3<double> pos = GlobalC::ucell.atoms[it].tau[ia];
-            const double rhocgigg = rhocg[rho_basis->ig2igg[ig]];
-            const std::complex<double> psiv_conj = conj(psiv[ig]);
-
-            const double arg = ModuleBase::TWO_PI * (gv.x * pos.x + gv.y * pos.y + gv.z * pos.z);
-            double sinp, cosp;
-            ModuleBase::libm::sincos(arg, &sinp, &cosp);
-            const std::complex<double> expiarg = std::complex<double>(sinp, cosp);
+#ifdef _OPENMP
+#pragma omp parallel
+{
+#endif
+            for (int ia = 0; ia < GlobalC::ucell.atoms[it].na; ++ia)
+            {
+                // get iat form table
+                int iat = GlobalC::ucell.itia2iat(it, ia);
+                double force[3] = {0, 0, 0};
+#ifdef _OPENMP
+#pragma omp for nowait
+#endif
+                for (int ig = 0; ig < rho_basis->npw; ig++)
+                {
+                    const ModuleBase::Vector3<double> gv = rho_basis->gcar[ig];
+                    const ModuleBase::Vector3<double> pos = GlobalC::ucell.atoms[it].tau[ia];
+                    const double rhocgigg = rhocg[rho_basis->ig2igg[ig]];
+                    const std::complex<double> psiv_conj = conj(psiv[ig]);
 
-            auto ipol0 = GlobalC::ucell.tpiba * GlobalC::ucell.omega * rhocgigg * gv.x * psiv_conj * expiarg;
-            forcecc(iat, 0) += ipol0.real();
+                    const double arg = ModuleBase::TWO_PI * (gv.x * pos.x + gv.y * pos.y + gv.z * pos.z);
+                    double sinp, cosp;
+                    ModuleBase::libm::sincos(arg, &sinp, &cosp);
+                    const std::complex<double> expiarg = std::complex<double>(sinp, cosp);
 
-            auto ipol1 = GlobalC::ucell.tpiba * GlobalC::ucell.omega * rhocgigg * gv.y * psiv_conj * expiarg;
-            forcecc(iat, 1) += ipol1.real();
+                    auto ipol0 = GlobalC::ucell.tpiba * GlobalC::ucell.omega * rhocgigg * gv.x * psiv_conj * expiarg;
+                    force[0] += ipol0.real();
 
-            auto ipol2 = GlobalC::ucell.tpiba * GlobalC::ucell.omega * rhocgigg * gv.z * psiv_conj * expiarg;
-            forcecc(iat, 2) += ipol2.real();
-        }
+                    auto ipol1 = GlobalC::ucell.tpiba * GlobalC::ucell.omega * rhocgigg * gv.y * psiv_conj * expiarg;
+                    force[1] += ipol1.real();
 
-        ++work;
-        if (GlobalC::ucell.step_ia(it, &ia))
-        {
-            // search for next effective `it`
-            while (!GlobalC::ucell.step_it(&it) && !GlobalC::ucell.atoms[it].ncpp.nlcc);
+                    auto ipol2 = GlobalC::ucell.tpiba * GlobalC::ucell.omega * rhocgigg * gv.z * psiv_conj * expiarg;
+                    force[2] += ipol2.real();
+                }
+#ifdef _OPENMP
+                if (omp_get_num_threads() > 1)
+                {
+                    #pragma omp atomic
+                    forcecc(iat, 0) += force[0];
+                    #pragma omp atomic
+                    forcecc(iat, 1) += force[1];
+                    #pragma omp atomic
+                    forcecc(iat, 2) += force[2];
+                }
+                else
+#endif
+                {
+                    forcecc(iat, 0) += force[0];
+                    forcecc(iat, 1) += force[1];
+                    forcecc(iat, 2) += force[2];
+                }
+            }
+#ifdef _OPENMP
+} // omp parallel
+#endif
         }
     }
 
     delete[] rhocg;
-#ifdef _OPENMP
-} // omp parallel
-#endif
     
     delete[] psiv; // mohan fix bug 2012-03-22
     Parallel_Reduce::reduce_double_pool(forcecc.c, forcecc.nr * forcecc.nc); // qianrui fix a bug for kpar > 1
@@ -1161,24 +1140,30 @@ void Forces<FPTYPE, Device>::cal_force_scc(ModuleBase::matrix& forcescc, ModuleP
     if (rho_basis->gg_uniq[0] < 1.0e-8)
         igg0 = 1;
 
+    double* rhocgnt = new double[rho_basis->ngg];
+
 #ifdef _OPENMP
 #pragma omp parallel
 {
+    int num_threads = omp_get_num_threads();
+    int thread_id = omp_get_thread_num();
+#else
+    int num_threads = 1;
+    int thread_id = 0;
 #endif
 
     // thread local work space
     double *aux = new double[ndm];
     ModuleBase::GlobalFunc::ZEROS(aux, ndm);
-    double* rhocgnt = new double[rho_basis->ngg];
-    ModuleBase::GlobalFunc::ZEROS(rhocgnt, rho_basis->ngg);
+
+    int ig_beg, ig_length, ig_end;
+    ModuleBase::TASK_DIST_1D(num_threads, thread_id, rho_basis->ngg, ig_beg, ig_length);
+    ModuleBase::GlobalFunc::ZEROS(rhocgnt + ig_beg, ig_length);
+    ig_end = ig_beg + ig_length;
 
     double fact = 2.0;
     int last_it = -1;
 
-#ifdef _OPENMP
-// use no wait to avoid syncing
-#pragma omp for nowait
-#endif
     for (int iat = 0; iat < GlobalC::ucell.nat; ++iat)
     {
         int it = GlobalC::ucell.iat2it[iat];
@@ -1190,7 +1175,7 @@ void Forces<FPTYPE, Device>::cal_force_scc(ModuleBase::matrix& forcescc, ModuleP
             //		Here we compute the G.ne.0 term
             const int mesh = GlobalC::ucell.atoms[it].ncpp.msh;
 
-            for (int ig = igg0; ig < rho_basis->ngg; ++ig)
+            for (int ig = std::max(ig_beg, igg0); ig < ig_end; ++ig)
             {
                 const double gx = sqrt(rho_basis->gg_uniq[ig]) * GlobalC::ucell.tpiba;
                 for (int ir = 0; ir < mesh; ir++)
@@ -1210,13 +1195,20 @@ void Forces<FPTYPE, Device>::cal_force_scc(ModuleBase::matrix& forcescc, ModuleP
 
             // record it
             last_it = it;
+
+            // wait for rhocgnt update
+#ifdef _OPENMP
+            #pragma omp barrier
+#endif
         }
 
         const ModuleBase::Vector3<double> pos = GlobalC::ucell.atoms[it].tau[ia];
 
-        const auto ig_loop = [&](int ig_beg, int ig_end)
+        double force[3] = {0, 0, 0};
+
+        const auto ig_loop = [&](int start, int stop)
         {
-            for (int ig = ig_beg; ig < ig_end; ++ig)
+            for (int ig = start; ig < stop; ++ig)
             {
                 const ModuleBase::Vector3<double> gv = rho_basis->gcar[ig];
                 const double rhocgntigg = rhocgnt[GlobalC::rhopw->ig2igg[ig]];
@@ -1225,26 +1217,45 @@ void Forces<FPTYPE, Device>::cal_force_scc(ModuleBase::matrix& forcescc, ModuleP
                 ModuleBase::libm::sincos(arg, &sinp, &cosp);
                 const std::complex<double> cpm = std::complex<double>(sinp, cosp) * conj(psic[ig]);
 
-                forcescc(iat, 0) += fact * rhocgntigg * GlobalC::ucell.tpiba * gv.x * cpm.real();
-                forcescc(iat, 1) += fact * rhocgntigg * GlobalC::ucell.tpiba * gv.y * cpm.real();
-                forcescc(iat, 2) += fact * rhocgntigg * GlobalC::ucell.tpiba * gv.z * cpm.real();
+                force[0] += fact * rhocgntigg * GlobalC::ucell.tpiba * gv.x * cpm.real();
+                force[1] += fact * rhocgntigg * GlobalC::ucell.tpiba * gv.y * cpm.real();
+                force[2] += fact * rhocgntigg * GlobalC::ucell.tpiba * gv.z * cpm.real();
             }
         };
 
-        ig_loop(0, ig_gap);
-        ig_loop(ig_gap + 1, rho_basis->npw);
+        ig_loop(ig_beg, std::min(ig_gap, ig_end));
+        ig_loop(ig_gap + 1, ig_end);
+
+#ifdef _OPENMP
+        if (num_threads > 1)
+        {
+            #pragma omp atomic
+            forcescc(iat, 0) += force[0];
+            #pragma omp atomic
+            forcescc(iat, 1) += force[1];
+            #pragma omp atomic
+            forcescc(iat, 2) += force[2];
+        }
+        else
+#endif
+        {
+            forcescc(iat, 0) += force[0];
+            forcescc(iat, 1) += force[1];
+            forcescc(iat, 2) += force[2];
+        }
 
         // std::cout << " forcescc = " << forcescc(iat,0) << " " << forcescc(iat,1) << " " <<
         // forcescc(iat,2) << std::endl;
     }
 
     delete[] aux;
-    delete[] rhocgnt;
 
 #ifdef _OPENMP
 }
 #endif
 
+    delete[] rhocgnt;
+
     Parallel_Reduce::reduce_double_pool(forcescc.c, forcescc.nr * forcescc.nc);
 
     delete[] psic; // mohan fix bug 2012-03-22