diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md
index d91640bdb4..a524290c37 100644
--- a/docs/advanced/input_files/input-main.md
+++ b/docs/advanced/input_files/input-main.md
@@ -599,7 +599,7 @@ These variables are used to control general system parameters.
   - nao: from numerical atomic orbitals. If they are not enough, other wave functions are initialized with random numbers.
   - nao+random: add small random numbers on numerical atomic orbitals
   
-  > Only the `file` option is useful for the lcao basis set, which is mostly used when [calculation](#calculation) is set to `set_wf` and `get_pchg`. See more details in [out_wfc_lcao](#out_wfc_lcao).
+  > Only the `file` option is useful for the lcao basis set, which is mostly used when [calculation](#calculation) is set to `get_wf` and `get_pchg`. See more details in [out_wfc_lcao](#out_wfc_lcao).
 - **Default**: atomic
 
 ### init_chg
@@ -1921,7 +1921,7 @@ The band (KS orbital) energy for each (k-point, spin, band) will be printed in t
 ### if_separate_k
 
 - **Type**: Boolean
-- **Availability**: Only for LCAO, used only when `calculation = get_pchg` and `gamma_only` is turned off.
+- **Availability**: For both PW and LCAO. When `basis_type = pw`, used if `out_pchg` is set. When `basis_type = lcao`, used only when `calculation = get_pchg` and `gamma_only` is turned off.
 - **Description**: Specifies whether to write the partial charge densities for all k-points to individual files or merge them. **Warning**: Enabling symmetry may produce incorrect results due to incorrect k-point weights. Therefore, when calculating partial charge densities, it is strongly recommended to set `symmetry = -1`.
 - **Default**: false
 
diff --git a/source/module_elecstate/module_charge/symmetry_rho.cpp b/source/module_elecstate/module_charge/symmetry_rho.cpp
index 9bf0c6b3d5..ae5d942976 100644
--- a/source/module_elecstate/module_charge/symmetry_rho.cpp
+++ b/source/module_elecstate/module_charge/symmetry_rho.cpp
@@ -97,7 +97,7 @@ void Symmetry_rho::psymm(double* rho_part,
         rhotot.resize(rho_basis->nxyz);
         ModuleBase::GlobalFunc::ZEROS(rhotot.data(), rho_basis->nxyz);
     }
-    Pgrid.reduce(rhotot.data(), rho_part);
+    Pgrid.reduce(rhotot.data(), rho_part, false);
 
     // (2)
     if (GlobalV::MY_RANK == 0)
diff --git a/source/module_esolver/esolver_ks_pw.cpp b/source/module_esolver/esolver_ks_pw.cpp
index ea76287bba..5e5177d1a9 100644
--- a/source/module_esolver/esolver_ks_pw.cpp
+++ b/source/module_esolver/esolver_ks_pw.cpp
@@ -645,14 +645,24 @@ void ESolver_KS_PW<T, Device>::iter_finish(UnitCell& ucell, const int istep, int
     //----------------------------------------------------------
     // 3) Print out electronic wavefunctions in pw basis
     //----------------------------------------------------------
-	if (iter % PARAM.inp.out_freq_elec == 0 || iter == PARAM.inp.scf_nmax || conv_esolver)
-	{
-		ModuleIO::write_wfc_pw(GlobalV::KPAR, GlobalV::MY_POOL, GlobalV::MY_RANK, 
-				PARAM.inp.nbands, PARAM.inp.nspin, PARAM.globalv.npol,
-				GlobalV::RANK_IN_POOL, GlobalV::NPROC_IN_POOL, 
-				PARAM.inp.out_wfc_pw, PARAM.inp.ecutwfc, PARAM.globalv.global_out_dir,
-				this->psi[0], this->kv, this->pw_wfc, GlobalV::ofs_running);
-	}
+    if (iter % PARAM.inp.out_freq_elec == 0 || iter == PARAM.inp.scf_nmax || conv_esolver)
+    {
+        ModuleIO::write_wfc_pw(GlobalV::KPAR,
+                               GlobalV::MY_POOL,
+                               GlobalV::MY_RANK,
+                               PARAM.inp.nbands,
+                               PARAM.inp.nspin,
+                               PARAM.globalv.npol,
+                               GlobalV::RANK_IN_POOL,
+                               GlobalV::NPROC_IN_POOL,
+                               PARAM.inp.out_wfc_pw,
+                               PARAM.inp.ecutwfc,
+                               PARAM.globalv.global_out_dir,
+                               this->psi[0],
+                               this->kv,
+                               this->pw_wfc,
+                               GlobalV::ofs_running);
+    }
 
     //----------------------------------------------------------
     // 4) check if oscillate for delta_spin method
@@ -718,11 +728,6 @@ void ESolver_KS_PW<T, Device>::after_scf(UnitCell& ucell, const int istep, const
                               this->pw_rhod->ny,
                               this->pw_rhod->nz,
                               this->pw_rhod->nxyz,
-                              this->kv.get_nks(),
-                              this->kv.isk,
-                              this->kv.wk,
-                              this->pw_big->bz,
-                              this->pw_big->nbz,
                               this->chr.ngmc,
                               &ucell,
                               this->psi,
@@ -731,16 +736,29 @@ void ESolver_KS_PW<T, Device>::after_scf(UnitCell& ucell, const int istep, const
                               this->ctx,
                               this->Pgrid,
                               PARAM.globalv.global_out_dir,
-                              PARAM.inp.if_separate_k);
+                              PARAM.inp.if_separate_k,
+                              this->kv,
+                              GlobalV::KPAR,
+                              GlobalV::MY_POOL,
+                              &this->chr);
     }
 
-
-	// tmp 2025-05-17, mohan note
-	ModuleIO::write_wfc_pw(GlobalV::KPAR, GlobalV::MY_POOL, GlobalV::MY_RANK, 
-			PARAM.inp.nbands, PARAM.inp.nspin, PARAM.globalv.npol,
-			GlobalV::RANK_IN_POOL, GlobalV::NPROC_IN_POOL, 
-			PARAM.inp.out_wfc_pw, PARAM.inp.ecutwfc, PARAM.globalv.global_out_dir,
-			this->psi[0], this->kv, this->pw_wfc, GlobalV::ofs_running);
+    // tmp 2025-05-17, mohan note
+    ModuleIO::write_wfc_pw(GlobalV::KPAR,
+                           GlobalV::MY_POOL,
+                           GlobalV::MY_RANK,
+                           PARAM.inp.nbands,
+                           PARAM.inp.nspin,
+                           PARAM.globalv.npol,
+                           GlobalV::RANK_IN_POOL,
+                           GlobalV::NPROC_IN_POOL,
+                           PARAM.inp.out_wfc_pw,
+                           PARAM.inp.ecutwfc,
+                           PARAM.globalv.global_out_dir,
+                           this->psi[0],
+                           this->kv,
+                           this->pw_wfc,
+                           GlobalV::ofs_running);
 
     //------------------------------------------------------------------
     //! 5) calculate Wannier functions in pw basis
@@ -931,7 +949,7 @@ void ESolver_KS_PW<T, Device>::after_all_runners(UnitCell& ucell)
     //----------------------------------------------------------
 
     //----------------------------------------------------------
-    //! The write_psi_r_1 interface will be removed in the very 
+    //! The write_psi_r_1 interface will be removed in the very
     //! near future. Don't use it!
     //----------------------------------------------------------
     // if (PARAM.inp.out_wfc_r == 1) // Peize Lin add 2021.11.21
@@ -951,19 +969,15 @@ void ESolver_KS_PW<T, Device>::after_all_runners(UnitCell& ucell)
                             this->pw_rhod->ny,
                             this->pw_rhod->nz,
                             this->pw_rhod->nxyz,
-                            this->kv.get_nks(),
-                            this->kv.isk,
-                            this->kv.wk,
-                            this->pw_big->bz,
-                            this->pw_big->nbz,
-                            this->chr.ngmc,
                             &ucell,
                             this->psi,
-                            this->pw_rhod,
                             this->pw_wfc,
                             this->ctx,
                             this->Pgrid,
-                            PARAM.globalv.global_out_dir);
+                            PARAM.globalv.global_out_dir,
+                            this->kv,
+                            GlobalV::KPAR,
+                            GlobalV::MY_POOL);
     }
 
     //----------------------------------------------------------
diff --git a/source/module_hamilt_pw/hamilt_pwdft/parallel_grid.cpp b/source/module_hamilt_pw/hamilt_pwdft/parallel_grid.cpp
index 9f65265978..6d0eac2c59 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/parallel_grid.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/parallel_grid.cpp
@@ -1,61 +1,62 @@
 #include "parallel_grid.h"
+
 #include "module_base/parallel_global.h"
 #include "module_parameter/parameter.h"
 Parallel_Grid::Parallel_Grid()
 {
     this->allocate = false;
-    this->allocate_final_scf = false; //LiuXh add 20180619
+    this->allocate_final_scf = false; // LiuXh add 20180619
 }
 
 Parallel_Grid::~Parallel_Grid()
 {
-	if(this->allocate || this->allocate_final_scf) //LiuXh add 20180619
-	{
-		for(int ip=0; ip<GlobalV::KPAR; ip++)
-		{
-			delete[] numz[ip];
-			delete[] startz[ip];
-			delete[] whichpro[ip];
-		}
-		delete[] numz;
-		delete[] startz;
-		delete[] whichpro;
+    if (this->allocate || this->allocate_final_scf) // LiuXh add 20180619
+    {
+        for (int ip = 0; ip < GlobalV::KPAR; ip++)
+        {
+            delete[] numz[ip];
+            delete[] startz[ip];
+            delete[] whichpro[ip];
+            delete[] whichpro_loc[ip];
+        }
+        delete[] numz;
+        delete[] startz;
+        delete[] whichpro;
+        delete[] whichpro_loc;
         delete[] nproc_in_pool;
-	}
+    }
 }
 
-
-void Parallel_Grid::init(
-	const int &ncx_in, 
-	const int &ncy_in, 
-	const int &ncz_in, 
-	const int &nczp_in, 
-	const int &nrxx_in, 
-	const int &nbz_in, 
-	const int &bz_in)
+void Parallel_Grid::init(const int& ncx_in,
+                         const int& ncy_in,
+                         const int& ncz_in,
+                         const int& nczp_in,
+                         const int& nrxx_in,
+                         const int& nbz_in,
+                         const int& bz_in)
 {
 
-	ModuleBase::TITLE("Parallel_Grid","init");
-	
-	this->ncx = ncx_in;
-	this->ncy = ncy_in;
-	this->ncz = ncz_in;
-	this->nczp = nczp_in;
-	this->nrxx = nrxx_in;
-	this->nbz = nbz_in;
+    ModuleBase::TITLE("Parallel_Grid", "init");
+
+    this->ncx = ncx_in;
+    this->ncy = ncy_in;
+    this->ncz = ncz_in;
+    this->nczp = nczp_in;
+    this->nrxx = nrxx_in;
+    this->nbz = nbz_in;
     this->bz = bz_in;
 
-	if(nczp<0)
-	{
-		GlobalV::ofs_warning << " nczp = " << nczp << std::endl;
-		ModuleBase::WARNING_QUIT("Parallel_Grid::init","nczp<0");
-	}
+    if (nczp < 0)
+    {
+        GlobalV::ofs_warning << " nczp = " << nczp << std::endl;
+        ModuleBase::WARNING_QUIT("Parallel_Grid::init", "nczp<0");
+    }
 
-	assert(ncx > 0);
-	assert(ncy > 0);
-	assert(ncz > 0);
+    assert(ncx > 0);
+    assert(ncy > 0);
+    assert(ncz > 0);
 
-	this->ncxy = ncx * ncy;
+    this->ncxy = ncx * ncy;
     this->ncxyz = ncxy * ncz;
 
 #ifndef __MPI
@@ -63,129 +64,144 @@ void Parallel_Grid::init(
 #endif
 
     // enable to call this function again liuyu 2023-03-10
-    if(this->allocate)
+    if (this->allocate)
     {
-        for(int ip=0; ip<GlobalV::KPAR; ip++)
+        for (int ip = 0; ip < GlobalV::KPAR; ip++)
         {
             delete[] numz[ip];
             delete[] startz[ip];
             delete[] whichpro[ip];
+            delete[] whichpro_loc[ip];
         }
         delete[] numz;
         delete[] startz;
         delete[] whichpro;
+        delete[] whichpro_loc;
         delete[] nproc_in_pool;
         this->allocate = false;
     }
 
-	// (2)
-	assert(allocate==false);
-	assert(GlobalV::KPAR > 0);
+    // (2)
+    assert(allocate == false);
+    assert(GlobalV::KPAR > 0);
 
-	this->nproc_in_pool = new int[GlobalV::KPAR];
-	int nprocgroup;
-	if(PARAM.inp.esolver_type == "sdft") {		nprocgroup = GlobalV::NPROC_IN_BNDGROUP;
-	} else {											nprocgroup = GlobalV::NPROC;
-}
+    this->nproc_in_pool = new int[GlobalV::KPAR];
+    int nprocgroup;
+    if (PARAM.inp.esolver_type == "sdft")
+    {
+        nprocgroup = GlobalV::NPROC_IN_BNDGROUP;
+    }
+    else
+    {
+        nprocgroup = GlobalV::NPROC;
+    }
 
-	const int remain_pro = nprocgroup%GlobalV::KPAR;
-	for(int i=0; i<GlobalV::KPAR; i++)
-	{
-		nproc_in_pool[i] = nprocgroup/GlobalV::KPAR;
-		if(i<remain_pro) { this->nproc_in_pool[i]++;
-}
-	}
-
-	this->numz = new int*[GlobalV::KPAR];
-	this->startz = new int*[GlobalV::KPAR];
-	this->whichpro = new int*[GlobalV::KPAR];
-
-	for(int ip=0; ip<GlobalV::KPAR; ip++)
-	{
-		const int nproc = nproc_in_pool[ip];
-		this->numz[ip] = new int[nproc];
-		this->startz[ip] = new int[nproc];
-		this->whichpro[ip] = new int[this->ncz];
-		ModuleBase::GlobalFunc::ZEROS(this->numz[ip], nproc);
-		ModuleBase::GlobalFunc::ZEROS(this->startz[ip], nproc);
-		ModuleBase::GlobalFunc::ZEROS(this->whichpro[ip], this->ncz);
-	}
-
-	this->allocate = true;
-	this->z_distribution();
-	
-	return;
+    const int remain_pro = nprocgroup % GlobalV::KPAR;
+    for (int i = 0; i < GlobalV::KPAR; i++)
+    {
+        nproc_in_pool[i] = nprocgroup / GlobalV::KPAR;
+        if (i < remain_pro)
+        {
+            this->nproc_in_pool[i]++;
+        }
+    }
+
+    this->numz = new int*[GlobalV::KPAR];
+    this->startz = new int*[GlobalV::KPAR];
+    this->whichpro = new int*[GlobalV::KPAR];
+    this->whichpro_loc = new int*[GlobalV::KPAR];
+
+    for (int ip = 0; ip < GlobalV::KPAR; ip++)
+    {
+        const int nproc = nproc_in_pool[ip];
+        this->numz[ip] = new int[nproc];
+        this->startz[ip] = new int[nproc];
+        this->whichpro[ip] = new int[this->ncz];
+        this->whichpro_loc[ip] = new int[this->ncz];
+        ModuleBase::GlobalFunc::ZEROS(this->numz[ip], nproc);
+        ModuleBase::GlobalFunc::ZEROS(this->startz[ip], nproc);
+        ModuleBase::GlobalFunc::ZEROS(this->whichpro[ip], this->ncz);
+        ModuleBase::GlobalFunc::ZEROS(this->whichpro_loc[ip], this->ncz);
+    }
+
+    this->allocate = true;
+    this->z_distribution();
+
+    return;
 }
 
 void Parallel_Grid::z_distribution()
 {
-	assert(allocate);	
-
-	int* startp = new int[GlobalV::KPAR];
-	startp[0] = 0;
-	for(int ip=0; ip<GlobalV::KPAR; ip++)
-	{
-//		GlobalV::ofs_running << "\n now POOL=" << ip;
-		const int nproc = nproc_in_pool[ip];
-		
-		if(ip>0) { startp[ip] = startp[ip-1] + nproc_in_pool[ip-1];
-}
-		
-		// (1) how many z on each 'proc' in each 'pool'
-		for(int iz=0; iz<nbz; iz++)
-		{
-			const int proc = iz % nproc;
-			numz[ip][proc]+=bz;
-		}
-
-//		for(int proc=0; proc<nproc; proc++)
-//		{
-//			GlobalV::ofs_running << "\n proc=" << proc << " numz=" << numz[ip][proc];
-//		}
-
-		// (2) start position of z in each 'proc' in each 'pool'
-		startz[ip][0] = 0;
-		for (int proc=1; proc<nproc; proc++)
-		{
-			startz[ip][proc] = startz[ip][proc-1] + numz[ip][proc-1];
-		}
-
-//		for(int proc=0; proc<nproc; proc++)
-//		{
-//			GlobalV::ofs_running << "\n proc=" << proc << " startz=" << startz[ip][proc];
-//		}
-
-		// (3) each z belongs to which 'proc' ( global index )
-		for(int iz=0; iz<ncz; iz++)
-		{
-			for(int proc=0; proc<nproc; proc++)
-			{
-				if(iz>=startz[ip][nproc-1])
-				{
-					whichpro[ip][iz] = startp[ip] + nproc-1;
-					break;
-				}
-				else if(iz>=startz[ip][proc] && iz<startz[ip][proc+1])
-				{
-					whichpro[ip][iz] = startp[ip] + proc;
-					break;
-				}
-			}
-		}
-
-//		for(int iz=0; iz<ncz; iz++)
-//		{
-//			GlobalV::ofs_running << "\n iz=" << iz << " whichpro=" << whichpro[ip][iz];
-//		}
-	}
-
-	delete[] startp;
-	return;
-}
+    assert(allocate);
+
+    int* startp = new int[GlobalV::KPAR];
+    startp[0] = 0;
+    for (int ip = 0; ip < GlobalV::KPAR; ip++)
+    {
+        //		GlobalV::ofs_running << "\n now POOL=" << ip;
+        const int nproc = nproc_in_pool[ip];
+
+        if (ip > 0)
+        {
+            startp[ip] = startp[ip - 1] + nproc_in_pool[ip - 1];
+        }
+
+        // (1) how many z on each 'proc' in each 'pool'
+        for (int iz = 0; iz < nbz; iz++)
+        {
+            const int proc = iz % nproc;
+            numz[ip][proc] += bz;
+        }
+
+        //		for(int proc=0; proc<nproc; proc++)
+        //		{
+        //			GlobalV::ofs_running << "\n proc=" << proc << " numz=" << numz[ip][proc];
+        //		}
+
+        // (2) start position of z in each 'proc' in each 'pool'
+        startz[ip][0] = 0;
+        for (int proc = 1; proc < nproc; proc++)
+        {
+            startz[ip][proc] = startz[ip][proc - 1] + numz[ip][proc - 1];
+        }
+
+        //		for(int proc=0; proc<nproc; proc++)
+        //		{
+        //			GlobalV::ofs_running << "\n proc=" << proc << " startz=" << startz[ip][proc];
+        //		}
+
+        // (3) each z belongs to which 'proc' ( global index )
+        for (int iz = 0; iz < ncz; iz++)
+        {
+            for (int proc = 0; proc < nproc; proc++)
+            {
+                if (iz >= startz[ip][nproc - 1])
+                {
+                    whichpro[ip][iz] = startp[ip] + nproc - 1;
+                    whichpro_loc[ip][iz] = nproc - 1;
+                    break;
+                }
+                else if (iz >= startz[ip][proc] && iz < startz[ip][proc + 1])
+                {
+                    whichpro[ip][iz] = startp[ip] + proc;
+                    whichpro_loc[ip][iz] = proc;
+                    break;
+                }
+            }
+        }
+
+        //		for(int iz=0; iz<ncz; iz++)
+        //		{
+        //			GlobalV::ofs_running << "\n iz=" << iz << " whichpro=" << whichpro[ip][iz];
+        //		}
+    }
 
+    delete[] startp;
+    return;
+}
 
 #ifdef __MPI
-void Parallel_Grid::bcast(const double* const data_global, double* data_local, const int& rank)const
+void Parallel_Grid::bcast(const double* const data_global, double* data_local, const int& rank) const
 {
     std::vector<double> zpiece(ncxy);
     for (int iz = 0; iz < this->ncz; ++iz)
@@ -208,269 +224,270 @@ void Parallel_Grid::bcast(const double* const data_global, double* data_local, c
 
 void Parallel_Grid::zpiece_to_all(double* zpiece, const int& iz, double* rho) const
 {
-	if(PARAM.inp.esolver_type == "sdft")
-	{
-		this->zpiece_to_stogroup(zpiece,iz,rho);
-		return;
-	}
-	assert(allocate);	
-	//ModuleBase::TITLE("Parallel_Grid","zpiece_to_all");
-	MPI_Status ierror;
-
-	const int znow = iz - this->startz[GlobalV::MY_POOL][GlobalV::RANK_IN_POOL];
-	const int proc = this->whichpro[GlobalV::MY_POOL][iz];
-	
-	if(GlobalV::MY_POOL==0)
-	{
-		// case 1: the first part of rho in processor 0.
-		// and send zpeice to to other pools.
-		if(proc == 0 && GlobalV::MY_RANK ==0)
-		{
-			for(int ir=0; ir<ncxy; ir++)
-			{
-				rho[ir*nczp+znow] = zpiece[ir];
-			}
-			for(int ipool=1; ipool < GlobalV::KPAR; ipool++)
-			{
-				MPI_Send(zpiece, ncxy, MPI_DOUBLE, this->whichpro[ipool][iz], iz, MPI_COMM_WORLD);
-			}
-		}
-
-		// case 2: processor n (n!=0) receive rho from processor 0.
-		// and the receive tag is iz.
-		else if(proc == GlobalV::RANK_IN_POOL )
-		{
-			MPI_Recv(zpiece, ncxy, MPI_DOUBLE, 0, iz, MPI_COMM_WORLD,&ierror);
-			for(int ir=0; ir<ncxy; ir++)
-			{
-				rho[ir*nczp + znow] = zpiece[ir];
-			}
-		}
-				
-		// case 2: > first part rho: processor 0 send the rho
-		// to all pools. The tag is iz, because processor may
-		// send more than once, and the only tag to distinguish
-		// them is iz.
-		else if(GlobalV::RANK_IN_POOL==0)
-		{
-			for(int ipool=0; ipool < GlobalV::KPAR; ipool++)
-			{
-				MPI_Send(zpiece, ncxy, MPI_DOUBLE, this->whichpro[ipool][iz], iz, MPI_COMM_WORLD);
-			}
-		}
-	}// GlobalV::MY_POOL == 0
-	else
-	{
-		//GlobalV::ofs_running << "\n Receive charge density iz=" << iz << std::endl;
-		// the processors in other pools always receive rho from
-		// processor 0. the tag is 'iz'
-		if(proc == GlobalV::MY_RANK )
-		{
-			MPI_Recv(zpiece, ncxy, MPI_DOUBLE, 0, iz, MPI_COMM_WORLD,&ierror);
-			for(int ir=0; ir<ncxy; ir++)
-			{
-				rho[ir*nczp+znow] = zpiece[ir];
-			}
-		}
-	}
-
-	//GlobalV::ofs_running << "\n iz = " << iz << " Done.";
-	return;	
+    if (PARAM.inp.esolver_type == "sdft")
+    {
+        this->zpiece_to_stogroup(zpiece, iz, rho);
+        return;
+    }
+    assert(allocate);
+    // ModuleBase::TITLE("Parallel_Grid","zpiece_to_all");
+    MPI_Status ierror;
+
+    const int znow = iz - this->startz[GlobalV::MY_POOL][GlobalV::RANK_IN_POOL];
+    const int proc = this->whichpro[GlobalV::MY_POOL][iz];
+
+    if (GlobalV::MY_POOL == 0)
+    {
+        // case 1: the first part of rho in processor 0.
+        // and send zpeice to to other pools.
+        if (proc == 0 && GlobalV::MY_RANK == 0)
+        {
+            for (int ir = 0; ir < ncxy; ir++)
+            {
+                rho[ir * nczp + znow] = zpiece[ir];
+            }
+            for (int ipool = 1; ipool < GlobalV::KPAR; ipool++)
+            {
+                MPI_Send(zpiece, ncxy, MPI_DOUBLE, this->whichpro[ipool][iz], iz, MPI_COMM_WORLD);
+            }
+        }
+
+        // case 2: processor n (n!=0) receive rho from processor 0.
+        // and the receive tag is iz.
+        else if (proc == GlobalV::RANK_IN_POOL)
+        {
+            MPI_Recv(zpiece, ncxy, MPI_DOUBLE, 0, iz, MPI_COMM_WORLD, &ierror);
+            for (int ir = 0; ir < ncxy; ir++)
+            {
+                rho[ir * nczp + znow] = zpiece[ir];
+            }
+        }
+
+        // case 2: > first part rho: processor 0 send the rho
+        // to all pools. The tag is iz, because processor may
+        // send more than once, and the only tag to distinguish
+        // them is iz.
+        else if (GlobalV::RANK_IN_POOL == 0)
+        {
+            for (int ipool = 0; ipool < GlobalV::KPAR; ipool++)
+            {
+                MPI_Send(zpiece, ncxy, MPI_DOUBLE, this->whichpro[ipool][iz], iz, MPI_COMM_WORLD);
+            }
+        }
+    } // GlobalV::MY_POOL == 0
+    else
+    {
+        // GlobalV::ofs_running << "\n Receive charge density iz=" << iz << std::endl;
+        //  the processors in other pools always receive rho from
+        //  processor 0. the tag is 'iz'
+        if (proc == GlobalV::MY_RANK)
+        {
+            MPI_Recv(zpiece, ncxy, MPI_DOUBLE, 0, iz, MPI_COMM_WORLD, &ierror);
+            for (int ir = 0; ir < ncxy; ir++)
+            {
+                rho[ir * nczp + znow] = zpiece[ir];
+            }
+        }
+    }
+
+    // GlobalV::ofs_running << "\n iz = " << iz << " Done.";
+    return;
 }
 #endif
 
 #ifdef __MPI
-void Parallel_Grid::zpiece_to_stogroup(double *zpiece, const int &iz, double *rho) const
+void Parallel_Grid::zpiece_to_stogroup(double* zpiece, const int& iz, double* rho) const
 {
-	assert(allocate);	
-	//TITLE("Parallel_Grid","zpiece_to_all");
-	MPI_Status ierror;
-
-	const int znow = iz - this->startz[GlobalV::MY_POOL][GlobalV::RANK_IN_POOL];
-	const int proc = this->whichpro[GlobalV::MY_POOL][iz];
-	
-	if(GlobalV::MY_POOL==0)
-	{
-		// case 1: the first part of rho in processor 0.
-		// and send zpeice to to other pools.
-		if(proc == 0 && GlobalV::RANK_IN_BPGROUP ==0)
-		{
-			for(int ir=0; ir<ncxy; ir++)
-			{
-				rho[ir*nczp+znow] = zpiece[ir];
-			}
-			for(int ipool=1; ipool < GlobalV::KPAR; ipool++)
-			{
-				MPI_Send(zpiece, ncxy, MPI_DOUBLE, this->whichpro[ipool][iz], iz, INT_BGROUP);
-			}
-		}
-
-		// case 2: processor n (n!=0) receive rho from processor 0.
-		// and the receive tag is iz.
-		else if(proc == GlobalV::RANK_IN_POOL )
-		{
-			MPI_Recv(zpiece, ncxy, MPI_DOUBLE, 0, iz, INT_BGROUP,&ierror);
-			for(int ir=0; ir<ncxy; ir++)
-			{
-				rho[ir*nczp + znow] = zpiece[ir];
-			}
-		}
-				
-		// case 2: > first part rho: processor 0 send the rho
-		// to all pools. The tag is iz, because processor may
-		// send more than once, and the only tag to distinguish
-		// them is iz.
-		else if(GlobalV::RANK_IN_POOL==0)
-		{
-			for(int ipool=0; ipool < GlobalV::KPAR; ipool++)
-			{
-				MPI_Send(zpiece, ncxy, MPI_DOUBLE, this->whichpro[ipool][iz], iz, INT_BGROUP);
-			}
-		}
-	}// MY_POOL == 0
-	else
-	{
-		//ofs_running << "\n Receive charge density iz=" << iz << endl;
-		// the processors in other pools always receive rho from
-		// processor 0. the tag is 'iz'
-		if(proc == GlobalV::RANK_IN_BPGROUP )
-		{
-			MPI_Recv(zpiece, ncxy, MPI_DOUBLE, 0, iz, INT_BGROUP,&ierror);
-			for(int ir=0; ir<ncxy; ir++)
-			{
-				rho[ir*nczp+znow] = zpiece[ir];
-			}
-		}
-	}
-
-	//ofs_running << "\n iz = " << iz << " Done.";
-	return;	
+    assert(allocate);
+    // TITLE("Parallel_Grid","zpiece_to_all");
+    MPI_Status ierror;
+
+    const int znow = iz - this->startz[GlobalV::MY_POOL][GlobalV::RANK_IN_POOL];
+    const int proc = this->whichpro[GlobalV::MY_POOL][iz];
+
+    if (GlobalV::MY_POOL == 0)
+    {
+        // case 1: the first part of rho in processor 0.
+        // and send zpeice to to other pools.
+        if (proc == 0 && GlobalV::RANK_IN_BPGROUP == 0)
+        {
+            for (int ir = 0; ir < ncxy; ir++)
+            {
+                rho[ir * nczp + znow] = zpiece[ir];
+            }
+            for (int ipool = 1; ipool < GlobalV::KPAR; ipool++)
+            {
+                MPI_Send(zpiece, ncxy, MPI_DOUBLE, this->whichpro[ipool][iz], iz, INT_BGROUP);
+            }
+        }
+
+        // case 2: processor n (n!=0) receive rho from processor 0.
+        // and the receive tag is iz.
+        else if (proc == GlobalV::RANK_IN_POOL)
+        {
+            MPI_Recv(zpiece, ncxy, MPI_DOUBLE, 0, iz, INT_BGROUP, &ierror);
+            for (int ir = 0; ir < ncxy; ir++)
+            {
+                rho[ir * nczp + znow] = zpiece[ir];
+            }
+        }
+
+        // case 2: > first part rho: processor 0 send the rho
+        // to all pools. The tag is iz, because processor may
+        // send more than once, and the only tag to distinguish
+        // them is iz.
+        else if (GlobalV::RANK_IN_POOL == 0)
+        {
+            for (int ipool = 0; ipool < GlobalV::KPAR; ipool++)
+            {
+                MPI_Send(zpiece, ncxy, MPI_DOUBLE, this->whichpro[ipool][iz], iz, INT_BGROUP);
+            }
+        }
+    } // MY_POOL == 0
+    else
+    {
+        // ofs_running << "\n Receive charge density iz=" << iz << endl;
+        //  the processors in other pools always receive rho from
+        //  processor 0. the tag is 'iz'
+        if (proc == GlobalV::RANK_IN_BPGROUP)
+        {
+            MPI_Recv(zpiece, ncxy, MPI_DOUBLE, 0, iz, INT_BGROUP, &ierror);
+            for (int ir = 0; ir < ncxy; ir++)
+            {
+                rho[ir * nczp + znow] = zpiece[ir];
+            }
+        }
+    }
 
+    // ofs_running << "\n iz = " << iz << " Done.";
+    return;
 }
-void Parallel_Grid::reduce(double* rhotot, const double* const rhoin)const
+void Parallel_Grid::reduce(double* rhotot, const double* const rhoin, const bool reduce_all_pool) const
 {
-    //ModuleBase::TITLE("Parallel_Grid","reduce");
-
-	// if not the first pool, wait here until processpr 0
-	// send the Barrier command.
-	if(GlobalV::MY_POOL!=0) 
-	{
-		return;
-	}
-
-	double* zpiece = new double[this->ncxy];
-	
-	for(int iz=0; iz<this->ncz; iz++)
-	{
-		const int znow = iz - this->startz[GlobalV::MY_POOL][GlobalV::RANK_IN_POOL];
-		const int proc = this->whichpro[GlobalV::MY_POOL][iz];
-		ModuleBase::GlobalFunc::ZEROS(zpiece, this->ncxy);
-		int tag = iz;
-		MPI_Status ierror;
-
-		// case 1: the first part of rho in processor 0.
-		if(proc == 0 && GlobalV::RANK_IN_POOL ==0)
-		{
-			for(int ir=0; ir<ncxy; ir++)
-			{
-				zpiece[ir] = rhoin[ir*this->nczp + znow];
-			}
-		}
-
-		// case 2: > first part rho: send the rho to
-		// processor 0.
-		else if(proc == GlobalV::RANK_IN_POOL )
-		{
-			for(int ir=0; ir<ncxy; ir++)
-			{
-				zpiece[ir] = rhoin[ir*this->nczp + znow];
-			}
-			MPI_Send(zpiece, ncxy, MPI_DOUBLE, 0, tag, POOL_WORLD);
-		}
-
-		 // case 2: > first part rho: processor 0 receive the rho
-		 // from other processors
-		else if(GlobalV::RANK_IN_POOL==0)
-		{
-			MPI_Recv(zpiece, ncxy, MPI_DOUBLE, proc, tag, POOL_WORLD, &ierror);
-		}
-
-		if(GlobalV::MY_RANK==0)
-		{
-            for (int ixy = 0; ixy < this->ncxy;++ixy)
+    // ModuleBase::TITLE("Parallel_Grid","reduce");
+
+    // if not the first pool, wait here until processpr 0
+    // send the Barrier command.
+    if (!reduce_all_pool && GlobalV::MY_POOL != 0)
+    {
+        return;
+    }
+
+    double* zpiece = new double[this->ncxy];
+
+    for (int iz = 0; iz < this->ncz; iz++)
+    {
+        const int znow = iz - this->startz[GlobalV::MY_POOL][GlobalV::RANK_IN_POOL];
+        const int proc = this->whichpro[GlobalV::MY_POOL][iz];
+        const int proc_loc = this->whichpro_loc[GlobalV::MY_POOL][iz]; // Obtain the local processor index in the pool
+        ModuleBase::GlobalFunc::ZEROS(zpiece, this->ncxy);
+        int tag = iz;
+        MPI_Status ierror;
+
+        // Local processor 0 collects data from all other processors in the pool
+        // proc = proc_loc if GlobalV::MY_POOL == 0
+        if (proc_loc == GlobalV::RANK_IN_POOL)
+        {
+            for (int ir = 0; ir < ncxy; ir++)
+            {
+                zpiece[ir] = rhoin[ir * this->nczp + znow];
+            }
+            // Send data to the root of the pool
+            if (GlobalV::RANK_IN_POOL != 0)
+            {
+                MPI_Send(zpiece, ncxy, MPI_DOUBLE, 0, tag, POOL_WORLD);
+            }
+        }
+
+        // The root of the pool receives data from other processors
+        if (GlobalV::RANK_IN_POOL == 0 && proc_loc != GlobalV::RANK_IN_POOL)
+        {
+            MPI_Recv(zpiece, ncxy, MPI_DOUBLE, proc_loc, tag, POOL_WORLD, &ierror);
+        }
+
+        if (GlobalV::RANK_IN_POOL == 0)
+        {
+            for (int ixy = 0; ixy < this->ncxy; ++ixy)
             {
                 rhotot[ixy * ncz + iz] = zpiece[ixy];
-			}	
-		}
-	}
+            }
+        }
+    }
 
-	delete[] zpiece;	
+    delete[] zpiece;
 
-	return;
+    return;
 }
 #endif
 
-void Parallel_Grid::init_final_scf(const int &ncx_in, const int &ncy_in, const int &ncz_in, const int &nczp_in, 
-const int &nrxx_in, const int &nbz_in, const int &bz_in)
+void Parallel_Grid::init_final_scf(const int& ncx_in,
+                                   const int& ncy_in,
+                                   const int& ncz_in,
+                                   const int& nczp_in,
+                                   const int& nrxx_in,
+                                   const int& nbz_in,
+                                   const int& bz_in)
 {
 
-	ModuleBase::TITLE("Parallel_Grid","init");
-	
-	this->ncx = ncx_in;
-	this->ncy = ncy_in;
-	this->ncz = ncz_in;
-	this->nczp = nczp_in;
-	this->nrxx = nrxx_in;
-	this->nbz = nbz_in;
+    ModuleBase::TITLE("Parallel_Grid", "init");
+
+    this->ncx = ncx_in;
+    this->ncy = ncy_in;
+    this->ncz = ncz_in;
+    this->nczp = nczp_in;
+    this->nrxx = nrxx_in;
+    this->nbz = nbz_in;
     this->bz = bz_in;
 
-	if(nczp<0)
-	{
-		GlobalV::ofs_warning << " nczp = " << nczp << std::endl;
-		ModuleBase::WARNING_QUIT("Parallel_Grid::init","nczp<0");
-	}
+    if (nczp < 0)
+    {
+        GlobalV::ofs_warning << " nczp = " << nczp << std::endl;
+        ModuleBase::WARNING_QUIT("Parallel_Grid::init", "nczp<0");
+    }
 
-	assert(ncx > 0);
-	assert(ncy > 0);
-	assert(ncz > 0);
+    assert(ncx > 0);
+    assert(ncy > 0);
+    assert(ncz > 0);
 
-	this->ncxy = ncx * ncy;
+    this->ncxy = ncx * ncy;
     this->ncxyz = ncxy * ncz;
 
 #ifndef __MPI
     return;
 #endif
 
-	// (2)
-	assert(allocate_final_scf==false);
-	assert(GlobalV::KPAR > 0);
+    // (2)
+    assert(allocate_final_scf == false);
+    assert(GlobalV::KPAR > 0);
 
-	this->nproc_in_pool = new int[GlobalV::KPAR];
-	const int remain_pro = GlobalV::NPROC%GlobalV::KPAR;
-	for(int i=0; i<GlobalV::KPAR; i++)
-	{
-		nproc_in_pool[i] = GlobalV::NPROC/GlobalV::KPAR;
-		if(i<remain_pro) { this->nproc_in_pool[i]++;
-}
-	}	
-
-	this->numz = new int*[GlobalV::KPAR];
-	this->startz = new int*[GlobalV::KPAR];
-	this->whichpro = new int*[GlobalV::KPAR];
-
-	for(int ip=0; ip<GlobalV::KPAR; ip++)
-	{
-		const int nproc = nproc_in_pool[ip];
-		this->numz[ip] = new int[nproc];
-		this->startz[ip] = new int[nproc];
-		this->whichpro[ip] = new int[this->ncz];
-		ModuleBase::GlobalFunc::ZEROS(this->numz[ip], nproc);
-		ModuleBase::GlobalFunc::ZEROS(this->startz[ip], nproc);
-		ModuleBase::GlobalFunc::ZEROS(this->whichpro[ip], this->ncz);
-	}
-
-	this->allocate_final_scf = true;
-	this->z_distribution();
-	
-	return;
+    this->nproc_in_pool = new int[GlobalV::KPAR];
+    const int remain_pro = GlobalV::NPROC % GlobalV::KPAR;
+    for (int i = 0; i < GlobalV::KPAR; i++)
+    {
+        nproc_in_pool[i] = GlobalV::NPROC / GlobalV::KPAR;
+        if (i < remain_pro)
+        {
+            this->nproc_in_pool[i]++;
+        }
+    }
+
+    this->numz = new int*[GlobalV::KPAR];
+    this->startz = new int*[GlobalV::KPAR];
+    this->whichpro = new int*[GlobalV::KPAR];
+
+    for (int ip = 0; ip < GlobalV::KPAR; ip++)
+    {
+        const int nproc = nproc_in_pool[ip];
+        this->numz[ip] = new int[nproc];
+        this->startz[ip] = new int[nproc];
+        this->whichpro[ip] = new int[this->ncz];
+        ModuleBase::GlobalFunc::ZEROS(this->numz[ip], nproc);
+        ModuleBase::GlobalFunc::ZEROS(this->startz[ip], nproc);
+        ModuleBase::GlobalFunc::ZEROS(this->whichpro[ip], this->ncz);
+    }
+
+    this->allocate_final_scf = true;
+    this->z_distribution();
+
+    return;
 }
diff --git a/source/module_hamilt_pw/hamilt_pwdft/parallel_grid.h b/source/module_hamilt_pw/hamilt_pwdft/parallel_grid.h
index 1da46fa394..748a612cd0 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/parallel_grid.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/parallel_grid.h
@@ -31,9 +31,9 @@ class Parallel_Grid
     void zpiece_to_stogroup(double* zpiece, const int& iz, double* rho) const; //qainrui add for sto-dft 2021-7-21
 
     /// @brief  Broadcast data from root to all processors. The index order is [x][y][z].
-    void bcast(const double* const data_global, double* data_local, const int& rank)const;
+    void bcast(const double* const data_global, double* data_local, const int& rank) const;
     /// @brief  Reduce data from all processors to root. The index order is [x][y][z].
-    void reduce(double* rhotot, const double* constrhoin)const;
+    void reduce(double* rhotot, const double* constrhoin, const bool reduce_all_pool) const;
 #endif
 
     const int& nx = this->ncx;
@@ -48,6 +48,7 @@ class Parallel_Grid
     int **numz = nullptr;
     int **startz = nullptr;
     int **whichpro = nullptr;
+    int **whichpro_loc = nullptr;
 
 	int ncx=0;
 	int ncy=0;
diff --git a/source/module_io/cal_ldos.cpp b/source/module_io/cal_ldos.cpp
index 2b6d112d02..49b58f120a 100644
--- a/source/module_io/cal_ldos.cpp
+++ b/source/module_io/cal_ldos.cpp
@@ -323,7 +323,7 @@ void trilinear_interpolate(const std::vector<std::vector<int>>& points,
 #ifdef __MPI
     if (GlobalV::MY_POOL == 0 && GlobalV::MY_BNDGROUP == 0)
     {
-        pgrid.reduce(data_full.data(), data.data());
+        pgrid.reduce(data_full.data(), data.data(), false);
     }
     MPI_Barrier(MPI_COMM_WORLD);
 #else
diff --git a/source/module_io/cube_io.h b/source/module_io/cube_io.h
index d7e6b706a6..a5725438f9 100644
--- a/source/module_io/cube_io.h
+++ b/source/module_io/cube_io.h
@@ -1,105 +1,105 @@
 #ifndef CUBE_IO_H
 #define CUBE_IO_H
-#include <string>
 #include "module_cell/unitcell.h"
+
+#include <string>
 class Parallel_Grid;
 
 namespace ModuleIO
 {
-    /// read volumetric data from .cube file into the parallel distributed grid.
-    bool read_vdata_palgrid(
-    const Parallel_Grid& pgrid,
-    const int my_rank,
-    std::ofstream& ofs_running,
-    const std::string& fn,
-    double* const data,
-    const int nat);
+/// read volumetric data from .cube file into the parallel distributed grid.
+bool read_vdata_palgrid(const Parallel_Grid& pgrid,
+                        const int my_rank,
+                        std::ofstream& ofs_running,
+                        const std::string& fn,
+                        double* const data,
+                        const int nat);
 
-    /// write volumetric data on the parallized grid into a .cube file
-    void write_vdata_palgrid(
-    const Parallel_Grid& pgrid,
-    const double* const data,
-    const int is,
-    const int nspin,
-    const int iter,
-    const std::string& fn,
-    const double ef,
-    const UnitCell*const ucell,
-    const int precision = 11,
-    const int out_fermi = 1); // mohan add 2007-10-17
+/// write volumetric data on the parallized grid into a .cube file
+void write_vdata_palgrid(const Parallel_Grid& pgrid,
+                         const double* const data,
+                         const int is,
+                         const int nspin,
+                         const int iter,
+                         const std::string& fn,
+                         const double ef,
+                         const UnitCell* const ucell,
+                         const int precision = 11,
+                         const int out_fermi = 1,
+                         const bool reduce_all_pool = false); // only reduce in the main pool as default
 
-    /// read the full data from a cube file 
-    bool read_cube(const std::string& file,
-        std::vector<std::string>& comment,
-        int& natom,
-        std::vector<double>& origin,
-        int& nx,
-        int& ny,
-        int& nz,
-        std::vector<double>& dx,
-        std::vector<double>& dy,
-        std::vector<double>& dz,
-        std::vector<int>& atom_type,
-        std::vector<double>& atom_charge,
-        std::vector<std::vector<double>>& atom_pos,
-        std::vector<double>& data);
+/// read the full data from a cube file
+bool read_cube(const std::string& file,
+               std::vector<std::string>& comment,
+               int& natom,
+               std::vector<double>& origin,
+               int& nx,
+               int& ny,
+               int& nz,
+               std::vector<double>& dx,
+               std::vector<double>& dy,
+               std::vector<double>& dz,
+               std::vector<int>& atom_type,
+               std::vector<double>& atom_charge,
+               std::vector<std::vector<double>>& atom_pos,
+               std::vector<double>& data);
 
-    /// write a cube file
-    void write_cube(const std::string& file,
-        const std::vector<std::string>& comment,
-        const int& natom,
-        const std::vector<double>& origin,
-        const int& nx,
-        const int& ny,
-        const int& nz,
-        const std::vector<double>& dx,
-        const std::vector<double>& dy,
-        const std::vector<double>& dz,
-        const std::vector<int>& atom_type,
-        const std::vector<double>& atom_charge,
-        const std::vector<std::vector<double>>& atom_pos,
-        const std::vector<double>& data,
-        const int precision,
-        const int ndata_line = 6);
+/// write a cube file
+void write_cube(const std::string& file,
+                const std::vector<std::string>& comment,
+                const int& natom,
+                const std::vector<double>& origin,
+                const int& nx,
+                const int& ny,
+                const int& nz,
+                const std::vector<double>& dx,
+                const std::vector<double>& dy,
+                const std::vector<double>& dz,
+                const std::vector<int>& atom_type,
+                const std::vector<double>& atom_charge,
+                const std::vector<std::vector<double>>& atom_pos,
+                const std::vector<double>& data,
+                const int precision,
+                const int ndata_line = 6);
 
-    /**
-     * @brief The trilinear interpolation method
-     *
-     * Trilinear interpolation is a method for interpolating grid data in 3D space.
-     * It estimates the value at a given position by interpolating the data along the three adjacent points.
-     *
-     * Specifically, for 3D grid data, trilinear interpolation requires determining the eight data points that are
-     * closest to the point where the estimation is required. These data points form a cube, with vertices at
-     * (x0,y0,z0), (x0+1,y0,z0), (x0,y0+1,z0), (x0+1,y0+1,z0), (x0,y0,z0+1), (x0+1,y0,z0+1), (x0,y0+1,z0+1) and
-     * (x0+1,y0+1,z0+1). Here, (x0,y0,z0) is the data point closest to the estimation point and has coordinate
-     * values less than those of the estimation point.
-     *
-     * For the estimation location (x,y,z), its estimated value in the grid can be calculated using the following
-     * formula: f(x,y,z) = f000(1-dx)(1-dy)(1-dz) + f100dx(1-dy)(1-dz) + f010(1-dx)dy(1-dz) +
-     *             f001(1-dx)(1-dy)dz + f101dx(1-dy)dz + f011(1-dx)dydz +
-     *             f110dxdy(1-dz) + f111dxdydz
-     * where fijk represents the data value at vertex i,j,k of the cube, and dx = x - x0, dy = y - y0, dz = z - z0
-     * represent the distance between the estimation point and the closest data point in each of the three
-     * directions, divided by the grid spacing. Here, it is assumed that the grid spacing is equal and can be
-     * omitted during computation.
-     *
-     * @param data_in the input data of size nxyz_read
-     * @param nx_read nx read from file
-     * @param ny_read ny read from file
-     * @param nz_read nz read from file
-     * @param nx the dimension of grids along x
-     * @param ny the dimension of grids along y
-     * @param nz the dimension of grids along z
-     * @param data_out the interpolated results of size nxyz
-     */
+/**
+ * @brief The trilinear interpolation method
+ *
+ * Trilinear interpolation is a method for interpolating grid data in 3D space.
+ * It estimates the value at a given position by interpolating the data along the three adjacent points.
+ *
+ * Specifically, for 3D grid data, trilinear interpolation requires determining the eight data points that are
+ * closest to the point where the estimation is required. These data points form a cube, with vertices at
+ * (x0,y0,z0), (x0+1,y0,z0), (x0,y0+1,z0), (x0+1,y0+1,z0), (x0,y0,z0+1), (x0+1,y0,z0+1), (x0,y0+1,z0+1) and
+ * (x0+1,y0+1,z0+1). Here, (x0,y0,z0) is the data point closest to the estimation point and has coordinate
+ * values less than those of the estimation point.
+ *
+ * For the estimation location (x,y,z), its estimated value in the grid can be calculated using the following
+ * formula: f(x,y,z) = f000(1-dx)(1-dy)(1-dz) + f100dx(1-dy)(1-dz) + f010(1-dx)dy(1-dz) +
+ *             f001(1-dx)(1-dy)dz + f101dx(1-dy)dz + f011(1-dx)dydz +
+ *             f110dxdy(1-dz) + f111dxdydz
+ * where fijk represents the data value at vertex i,j,k of the cube, and dx = x - x0, dy = y - y0, dz = z - z0
+ * represent the distance between the estimation point and the closest data point in each of the three
+ * directions, divided by the grid spacing. Here, it is assumed that the grid spacing is equal and can be
+ * omitted during computation.
+ *
+ * @param data_in the input data of size nxyz_read
+ * @param nx_read nx read from file
+ * @param ny_read ny read from file
+ * @param nz_read nz read from file
+ * @param nx the dimension of grids along x
+ * @param ny the dimension of grids along y
+ * @param nz the dimension of grids along z
+ * @param data_out the interpolated results of size nxyz
+ */
 void trilinear_interpolate(const double* const data_in,
-    const int& nx_read,
-    const int& ny_read,
-    const int& nz_read,
-    const int& nx,
-    const int& ny,
-    const int& nz,
-    double* data_out);
-}
+                           const int& nx_read,
+                           const int& ny_read,
+                           const int& nz_read,
+                           const int& nx,
+                           const int& ny,
+                           const int& nz,
+                           double* data_out);
+} // namespace ModuleIO
 
 #endif
diff --git a/source/module_io/get_pchg_pw.h b/source/module_io/get_pchg_pw.h
index 98dfb72e9c..4e601b0d6e 100644
--- a/source/module_io/get_pchg_pw.h
+++ b/source/module_io/get_pchg_pw.h
@@ -8,6 +8,7 @@
 #include "module_basis/module_pw/pw_basis_k.h"
 #include "module_cell/unitcell.h"
 #include "module_elecstate/elecstate.h"
+#include "module_elecstate/module_charge/charge.h"
 #include "module_elecstate/module_charge/symmetry_rho.h"
 #include "module_hamilt_pw/hamilt_pwdft/parallel_grid.h"
 #include "module_psi/psi.h"
@@ -25,11 +26,6 @@ void get_pchg_pw(const std::vector<int>& out_pchg,
                  const int ny,
                  const int nz,
                  const int nxyz,
-                 const int nks,
-                 const std::vector<int>& isk,
-                 const std::vector<double>& wk,
-                 const int pw_big_bz,
-                 const int pw_big_nbz,
                  const int ngmc,
                  UnitCell* ucell,
                  const psi::Psi<std::complex<double>>* psi,
@@ -38,168 +34,188 @@ void get_pchg_pw(const std::vector<int>& out_pchg,
                  const Device* ctx,
                  const Parallel_Grid& pgrid,
                  const std::string& global_out_dir,
-                 const bool if_separate_k)
+                 const bool if_separate_k,
+                 const K_Vectors& kv,
+                 const int kpar,
+                 const int my_pool,
+                 const Charge* chg) // Charge class is needed for the charge density reduce
 {
-    // bands_picked is a vector of 0s and 1s, where 1 means the band is picked to output
-    std::vector<int> bands_picked(nbands, 0);
+    // Get necessary parameters from kv
+    const int nks = kv.get_nks();       // current process pool k-point count
+    const int nkstot = kv.get_nkstot(); // total k-point count
 
-    // Check if length of out_pchg is valid
-    if (static_cast<int>(out_pchg.size()) > nbands)
+    // Loop over k-parallelism
+    for (int ip = 0; ip < kpar; ++ip)
     {
-        ModuleBase::WARNING_QUIT("ModuleIO::get_pchg_pw",
-                                 "The number of bands specified by `out_pchg` in the "
-                                 "INPUT file exceeds `nbands`!");
-    }
-
-    // Check if all elements in bands_picked are 0 or 1
-    for (int value: out_pchg)
-    {
-        if (value != 0 && value != 1)
+        if (my_pool != ip)
         {
-            ModuleBase::WARNING_QUIT("ModuleIO::get_pchg_pw",
-                                     "The elements of `out_pchg` must be either 0 or 1. "
-                                     "Invalid values found!");
+            continue;
         }
-    }
-
-    // Fill bands_picked with values from out_pchg
-    // Remaining bands are already set to 0
-    int length = std::min(static_cast<int>(out_pchg.size()), nbands);
-    for (int i = 0; i < length; ++i)
-    {
-        // out_pchg rely on function parse_expression
-        bands_picked[i] = static_cast<int>(out_pchg[i]);
-    }
 
-    std::vector<std::complex<double>> wfcr(nxyz);
-    std::vector<std::vector<double>> rho_band(nspin, std::vector<double>(nxyz));
+        // bands_picked is a vector of 0s and 1s, where 1 means the band is picked to output
+        std::vector<int> bands_picked(nbands, 0);
 
-    for (int ib = 0; ib < nbands; ++ib)
-    {
-        // Skip the loop iteration if bands_picked[ib] is 0
-        if (!bands_picked[ib])
+        // Check if length of out_pchg is valid
+        if (static_cast<int>(out_pchg.size()) > nbands)
         {
-            continue;
+            ModuleBase::WARNING_QUIT("ModuleIO::get_pchg_pw",
+                                     "The number of bands specified by `out_pchg` in the "
+                                     "INPUT file exceeds `nbands`!");
         }
 
-        for (int is = 0; is < nspin; ++is)
+        // Check if all elements in bands_picked are 0 or 1
+        for (int value: out_pchg)
         {
-            std::fill(rho_band[is].begin(), rho_band[is].end(), 0.0);
+            if (value != 0 && value != 1)
+            {
+                ModuleBase::WARNING_QUIT("ModuleIO::get_pchg_pw",
+                                         "The elements of `out_pchg` must be either 0 or 1. "
+                                         "Invalid values found!");
+            }
         }
 
-        if (if_separate_k)
+        // Fill bands_picked with values from out_pchg
+        // Remaining bands are already set to 0
+        int length = std::min(static_cast<int>(out_pchg.size()), nbands);
+        for (int i = 0; i < length; ++i)
         {
-            for (int ik = 0; ik < nks; ++ik)
-            {
-                const int spin_index = isk[ik];
-                std::cout << " Calculating band-decomposed charge density for band " << ib + 1 << ", k-point "
-                          << ik % (nks / nspin) + 1 << ", spin " << spin_index + 1 << std::endl;
+            // out_pchg rely on function parse_expression
+            bands_picked[i] = static_cast<int>(out_pchg[i]);
+        }
 
-                psi->fix_k(ik);
-                pw_wfc->recip_to_real(ctx, &psi[0](ib, 0), wfcr.data(), ik);
+        std::vector<std::complex<double>> wfcr(nxyz);
+        std::vector<std::vector<double>> rho_band(nspin, std::vector<double>(nxyz));
 
-                // To ensure the normalization of charge density in multi-k calculation (if if_separate_k is true)
-                double wg_sum_k = 0;
-                for (int ik_tmp = 0; ik_tmp < nks / nspin; ++ik_tmp)
-                {
-                    wg_sum_k += wk[ik_tmp];
-                }
+        for (int ib = 0; ib < nbands; ++ib)
+        {
+            // Skip the loop iteration if bands_picked[ib] is 0
+            if (!bands_picked[ib])
+            {
+                continue;
+            }
 
-                double w1 = static_cast<double>(wg_sum_k / ucell->omega);
+            for (int is = 0; is < nspin; ++is)
+            {
+                std::fill(rho_band[is].begin(), rho_band[is].end(), 0.0);
+            }
 
-                for (int i = 0; i < nxyz; ++i)
+            if (if_separate_k)
+            {
+                for (int ik = 0; ik < nks; ++ik)
                 {
-                    rho_band[spin_index][i] = std::norm(wfcr[i]) * w1;
+                    const int ikstot = kv.ik2iktot[ik];                 // global k-point index
+                    const int spin_index = kv.isk[ik];                  // spin index
+                    const int k_number = ikstot % (nkstot / nspin) + 1; // k-point number, starting from 1
+
+                    psi->fix_k(ik);
+                    pw_wfc->recip_to_real(ctx, &psi[0](ib, 0), wfcr.data(), ik);
+
+                    // To ensure the normalization of charge density in multi-k calculation (if if_separate_k is true)
+                    double wg_sum_k = 0.0;
+                    if (nspin == 1)
+                    {
+                        wg_sum_k = 2.0;
+                    }
+                    else if (nspin == 2)
+                    {
+                        wg_sum_k = 1.0;
+                    }
+                    else
+                    {
+                        ModuleBase::WARNING_QUIT("ModuleIO::get_pchg_pw",
+                                                 "Real space partial charge output currently do not support "
+                                                 "noncollinear polarized calculation (nspin = 4)!");
+                    }
+
+                    double w1 = static_cast<double>(wg_sum_k / ucell->omega);
+
+                    for (int i = 0; i < nxyz; ++i)
+                    {
+                        rho_band[spin_index][i] = std::norm(wfcr[i]) * w1;
+                    }
+
+                    std::stringstream ssc;
+                    ssc << global_out_dir << "BAND" << ib + 1 << "_K" << k_number << "_SPIN" << spin_index + 1
+                        << "_CHG.cube";
+
+                    ModuleIO::write_vdata_palgrid(pgrid,
+                                                  rho_band[spin_index].data(),
+                                                  spin_index,
+                                                  nspin,
+                                                  0,
+                                                  ssc.str(),
+                                                  0.0,
+                                                  ucell,
+                                                  11,
+                                                  1,
+                                                  true); // reduce_all_pool is true
                 }
-
-                std::cout << " Writing cube files...";
-
-                std::stringstream ssc;
-                ssc << global_out_dir << "BAND" << ib + 1 << "_K" << ik % (nks / nspin) + 1 << "_SPIN" << spin_index + 1
-                    << "_CHG.cube";
-
-                ModuleIO::write_vdata_palgrid(pgrid,
-                    rho_band[spin_index].data(),
-                    spin_index,
-                    nspin,
-                    0,
-                    ssc.str(),
-                    0.0,
-                    ucell);
-
-                std::cout << " Complete!" << std::endl;
             }
-        }
-        else
-        {
-            for (int ik = 0; ik < nks; ++ik)
+            else
             {
-                const int spin_index = isk[ik];
-                std::cout << " Calculating band-decomposed charge density for band " << ib + 1 << ", k-point "
-                          << ik % (nks / nspin) + 1 << ", spin " << spin_index + 1 << std::endl;
+                for (int ik = 0; ik < nks; ++ik)
+                {
+                    const int ikstot = kv.ik2iktot[ik];                 // global k-point index
+                    const int spin_index = kv.isk[ik];                  // spin index
+                    const int k_number = ikstot % (nkstot / nspin) + 1; // k-point number, starting from 1
 
-                psi->fix_k(ik);
-                pw_wfc->recip_to_real(ctx, &psi[0](ib, 0), wfcr.data(), ik);
+                    psi->fix_k(ik);
+                    pw_wfc->recip_to_real(ctx, &psi[0](ib, 0), wfcr.data(), ik);
 
-                double w1 = static_cast<double>(wk[ik] / ucell->omega);
+                    double w1 = static_cast<double>(kv.wk[ik] / ucell->omega);
 
-                for (int i = 0; i < nxyz; ++i)
-                {
-                    rho_band[spin_index][i] += std::norm(wfcr[i]) * w1;
+                    for (int i = 0; i < nxyz; ++i)
+                    {
+                        rho_band[spin_index][i] += std::norm(wfcr[i]) * w1;
+                    }
                 }
-            }
 
-            // Symmetrize the charge density, otherwise the results are incorrect if the symmetry is on
-            std::cout << " Symmetrizing band-decomposed charge density..." << std::endl;
-            Symmetry_rho srho;
-            for (int is = 0; is < nspin; ++is)
-            {
-                // Use vector instead of raw pointers
-                std::vector<double*> rho_save_pointers(nspin);
-                for (int s = 0; s < nspin; ++s)
+#ifdef __MPI
+                // Reduce the charge density across all pools if kpar > 1
+                if (kpar > 1 && chg != nullptr)
                 {
-                    rho_save_pointers[s] = rho_band[s].data();
+                    for (int is = 0; is < nspin; ++is)
+                    {
+                        chg->reduce_diff_pools(rho_band[is].data());
+                    }
                 }
+#endif
 
-                std::vector<std::vector<std::complex<double>>> rhog(nspin, std::vector<std::complex<double>>(ngmc));
-
-                // Convert vector of vectors to vector of pointers
-                std::vector<std::complex<double>*> rhog_pointers(nspin);
-                for (int s = 0; s < nspin; ++s)
+                // Symmetrize the charge density, otherwise the results are incorrect if the symmetry is on
+                std::cout << " Symmetrizing band-decomposed charge density..." << std::endl;
+                Symmetry_rho srho;
+                for (int is = 0; is < nspin; ++is)
                 {
-                    rhog_pointers[s] = rhog[s].data();
+                    // Use vector instead of raw pointers
+                    std::vector<double*> rho_save_pointers(nspin);
+                    for (int s = 0; s < nspin; ++s)
+                    {
+                        rho_save_pointers[s] = rho_band[s].data();
+                    }
+
+                    std::vector<std::vector<std::complex<double>>> rhog(nspin, std::vector<std::complex<double>>(ngmc));
+
+                    // Convert vector of vectors to vector of pointers
+                    std::vector<std::complex<double>*> rhog_pointers(nspin);
+                    for (int s = 0; s < nspin; ++s)
+                    {
+                        rhog_pointers[s] = rhog[s].data();
+                    }
+
+                    srho.begin(is, rho_save_pointers.data(), rhog_pointers.data(), ngmc, nullptr, pw_rhod, ucell->symm);
                 }
 
-                srho.begin(is,
-                           rho_save_pointers.data(),
-                           rhog_pointers.data(),
-                           ngmc,
-                           nullptr,
-                           pw_rhod,
-                           ucell->symm);
-            }
-
-            std::cout << " Writing cube files...";
-
-            for (int is = 0; is < nspin; ++is)
-            {
-                std::stringstream ssc;
-                ssc << global_out_dir << "BAND" << ib + 1 << "_SPIN" << is + 1 << "_CHG.cube";
-
-                ModuleIO::write_vdata_palgrid(pgrid,
-                    rho_band[is].data(),
-                    is,
-                    nspin,
-                    0,
-                    ssc.str(),
-                    0.0,
-                    ucell);
-            }
+                for (int is = 0; is < nspin; ++is)
+                {
+                    std::stringstream ssc;
+                    ssc << global_out_dir << "BAND" << ib + 1 << "_SPIN" << is + 1 << "_CHG.cube";
 
-            std::cout << " Complete!" << std::endl;
-        }
-    }
-}
+                    ModuleIO::write_vdata_palgrid(pgrid, rho_band[is].data(), is, nspin, 0, ssc.str(), 0.0, ucell);
+                }
+            } // else if_separate_k is false
+        } // end of ib loop over nbands
+    } // end of ip loop over kpar
+} // get_pchg_pw
 } // namespace ModuleIO
 
 #endif // GET_PCHG_PW_H
diff --git a/source/module_io/get_wf_pw.h b/source/module_io/get_wf_pw.h
index 29b831f786..edeb78fccf 100644
--- a/source/module_io/get_wf_pw.h
+++ b/source/module_io/get_wf_pw.h
@@ -26,193 +26,220 @@ void get_wf_pw(const std::vector<int>& out_wfc_norm,
                const int ny,
                const int nz,
                const int nxyz,
-               const int nks,
-               const std::vector<int>& isk,
-               const std::vector<double>& wk,
-               const int pw_big_bz,
-               const int pw_big_nbz,
-               const int ngmc,
                UnitCell* ucell,
                const psi::Psi<std::complex<double>>* psi,
-               const ModulePW::PW_Basis* pw_rhod,
                const ModulePW::PW_Basis_K* pw_wfc,
                const Device* ctx,
                const Parallel_Grid& pgrid,
-               const std::string& global_out_dir)
+               const std::string& global_out_dir,
+               const K_Vectors& kv,
+               const int kpar,
+               const int my_pool)
 {
-    // bands_picked is a vector of 0s and 1s, where 1 means the band is picked to output
-    std::vector<int> bands_picked_norm(nbands, 0);
-    std::vector<int> bands_picked_re_im(nbands, 0);
+    // Get necessary parameters from kv
+    const int nks = kv.get_nks();       // current process pool k-point count
+    const int nkstot = kv.get_nkstot(); // total k-point count
 
-    // Check if length of out_wfc_norm and out_wfc_re_im is valid
-    if (static_cast<int>(out_wfc_norm.size()) > nbands || static_cast<int>(out_wfc_re_im.size()) > nbands)
+    // Loop over k-parallelism
+    for (int ip = 0; ip < kpar; ++ip)
     {
-        ModuleBase::WARNING_QUIT("ModuleIO::get_wf_pw",
-                                 "The number of bands specified by `out_wfc_norm` or `out_wfc_re_im` in the "
-                                 "INPUT file exceeds `nbands`!");
-    }
-
-    // Check if all elements in bands_picked are 0 or 1
-    for (int value: out_wfc_norm)
-    {
-        if (value != 0 && value != 1)
+        if (my_pool != ip)
         {
-            ModuleBase::WARNING_QUIT("ModuleIO::get_wf_pw",
-                                     "The elements of `out_wfc_norm` must be either 0 or 1. "
-                                     "Invalid values found!");
+            continue;
         }
-    }
-    for (int value: out_wfc_re_im)
-    {
-        if (value != 0 && value != 1)
+
+        // bands_picked is a vector of 0s and 1s, where 1 means the band is picked to output
+        std::vector<int> bands_picked_norm(nbands, 0);
+        std::vector<int> bands_picked_re_im(nbands, 0);
+
+        // Check if length of out_wfc_norm and out_wfc_re_im is valid
+        if (static_cast<int>(out_wfc_norm.size()) > nbands || static_cast<int>(out_wfc_re_im.size()) > nbands)
         {
             ModuleBase::WARNING_QUIT("ModuleIO::get_wf_pw",
-                                     "The elements of `out_wfc_re_im` must be either 0 or 1. "
-                                     "Invalid values found!");
+                                     "The number of bands specified by `out_wfc_norm` or `out_wfc_re_im` in the "
+                                     "INPUT file exceeds `nbands`!");
         }
-    }
-
-    // Fill bands_picked with values from out_wfc_norm
-    // Remaining bands are already set to 0
-    int length = std::min(static_cast<int>(out_wfc_norm.size()), nbands);
-    for (int i = 0; i < length; ++i)
-    {
-        // out_wfc_norm rely on function parse_expression
-        bands_picked_norm[i] = static_cast<int>(out_wfc_norm[i]);
-    }
-    length = std::min(static_cast<int>(out_wfc_re_im.size()), nbands);
-    for (int i = 0; i < length; ++i)
-    {
-        bands_picked_re_im[i] = static_cast<int>(out_wfc_re_im[i]);
-    }
 
-    std::vector<std::complex<double>> wfcr_norm(nxyz);
-    std::vector<std::vector<double>> rho_band_norm(nspin, std::vector<double>(nxyz));
-
-    for (int ib = 0; ib < nbands; ++ib)
-    {
-        // Skip the loop iteration if bands_picked[ib] is 0
-        if (!bands_picked_norm[ib])
+        // Check if all elements in bands_picked are 0 or 1
+        for (int value: out_wfc_norm)
         {
-            continue;
+            if (value != 0 && value != 1)
+            {
+                ModuleBase::WARNING_QUIT("ModuleIO::get_wf_pw",
+                                         "The elements of `out_wfc_norm` must be either 0 or 1. "
+                                         "Invalid values found!");
+            }
+        }
+        for (int value: out_wfc_re_im)
+        {
+            if (value != 0 && value != 1)
+            {
+                ModuleBase::WARNING_QUIT("ModuleIO::get_wf_pw",
+                                         "The elements of `out_wfc_re_im` must be either 0 or 1. "
+                                         "Invalid values found!");
+            }
         }
 
-        for (int is = 0; is < nspin; ++is)
+        // Fill bands_picked with values from out_wfc_norm
+        // Remaining bands are already set to 0
+        int length = std::min(static_cast<int>(out_wfc_norm.size()), nbands);
+        for (int i = 0; i < length; ++i)
         {
-            std::fill(rho_band_norm[is].begin(), rho_band_norm[is].end(), 0.0);
+            // out_wfc_norm rely on function parse_expression
+            bands_picked_norm[i] = static_cast<int>(out_wfc_norm[i]);
         }
-        for (int ik = 0; ik < nks; ++ik)
+        length = std::min(static_cast<int>(out_wfc_re_im.size()), nbands);
+        for (int i = 0; i < length; ++i)
         {
-            const int spin_index = isk[ik];
-            std::cout << " Calculating wave function norm for band " << ib + 1 << ", k-point " << ik % (nks / nspin) + 1
-                      << ", spin " << spin_index + 1 << std::endl;
+            bands_picked_re_im[i] = static_cast<int>(out_wfc_re_im[i]);
+        }
 
-            psi->fix_k(ik);
-            pw_wfc->recip_to_real(ctx, &psi[0](ib, 0), wfcr_norm.data(), ik);
+        std::vector<std::complex<double>> wfcr_norm(nxyz);
+        std::vector<std::vector<double>> rho_band_norm(nspin, std::vector<double>(nxyz));
 
-            // To ensure the normalization of charge density in multi-k calculation
-            double wg_sum_k = 0;
-            for (int ik_tmp = 0; ik_tmp < nks / nspin; ++ik_tmp)
+        for (int ib = 0; ib < nbands; ++ib)
+        {
+            // Skip the loop iteration if bands_picked[ib] is 0
+            if (!bands_picked_norm[ib])
             {
-                wg_sum_k += wk[ik_tmp];
+                continue;
             }
 
-            double w1 = static_cast<double>(wg_sum_k / ucell->omega);
-
-            for (int i = 0; i < nxyz; ++i)
+            for (int is = 0; is < nspin; ++is)
             {
-                rho_band_norm[spin_index][i] = std::abs(wfcr_norm[i]) * std::sqrt(w1);
+                std::fill(rho_band_norm[is].begin(), rho_band_norm[is].end(), 0.0);
+            }
+            for (int ik = 0; ik < nks; ++ik)
+            {
+                const int ikstot = kv.ik2iktot[ik];                 // global k-point index
+                const int spin_index = kv.isk[ik];                  // spin index
+                const int k_number = ikstot % (nkstot / nspin) + 1; // k-point number, starting from 1
+
+                psi->fix_k(ik);
+                pw_wfc->recip_to_real(ctx, &psi[0](ib, 0), wfcr_norm.data(), ik);
+
+                // To ensure the normalization of charge density in multi-k calculation
+                double wg_sum_k = 0.0;
+                if (nspin == 1)
+                {
+                    wg_sum_k = 2.0;
+                }
+                else if (nspin == 2)
+                {
+                    wg_sum_k = 1.0;
+                }
+                else
+                {
+                    ModuleBase::WARNING_QUIT("ModuleIO::get_wf_pw",
+                                             "Real space wavefunction output currently do not support noncollinear "
+                                             "polarized calculation (nspin = 4)!");
+                }
+
+                double w1 = static_cast<double>(wg_sum_k / ucell->omega);
+
+                for (int i = 0; i < nxyz; ++i)
+                {
+                    rho_band_norm[spin_index][i] = std::abs(wfcr_norm[i]) * std::sqrt(w1);
+                }
+
+                std::stringstream ss_file;
+                ss_file << global_out_dir << "wf" << ib + 1 << "s" << spin_index + 1 << "k" << k_number << ".cube";
+
+                ModuleIO::write_vdata_palgrid(pgrid,
+                                              rho_band_norm[spin_index].data(),
+                                              spin_index,
+                                              nspin,
+                                              0,
+                                              ss_file.str(),
+                                              0.0,
+                                              ucell,
+                                              11,
+                                              1,
+                                              true); // reduce_all_pool is true
             }
-
-            std::cout << " Writing cube files...";
-
-            std::stringstream ss_file;
-            ss_file << global_out_dir << "wf" << ib + 1 << "s" << spin_index + 1 << "k" << ik % (nks / nspin) + 1
-                    << ".cube";
-
-            ModuleIO::write_vdata_palgrid(pgrid,
-                                          rho_band_norm[spin_index].data(),
-                                          spin_index,
-                                          nspin,
-                                          0,
-                                          ss_file.str(),
-                                          0.0,
-                                          ucell);
-
-            std::cout << " Complete!" << std::endl;
         }
-    }
 
-    std::vector<std::complex<double>> wfc_re_im(nxyz);
-    std::vector<std::vector<double>> rho_band_re(nspin, std::vector<double>(nxyz));
-    std::vector<std::vector<double>> rho_band_im(nspin, std::vector<double>(nxyz));
+        std::vector<std::complex<double>> wfc_re_im(nxyz);
+        std::vector<std::vector<double>> rho_band_re(nspin, std::vector<double>(nxyz));
+        std::vector<std::vector<double>> rho_band_im(nspin, std::vector<double>(nxyz));
 
-    for (int ib = 0; ib < nbands; ++ib)
-    {
-        // Skip the loop iteration if bands_picked[ib] is 0
-        if (!bands_picked_re_im[ib])
+        for (int ib = 0; ib < nbands; ++ib)
         {
-            continue;
-        }
-
-        for (int is = 0; is < nspin; ++is)
-        {
-            std::fill(rho_band_re[is].begin(), rho_band_re[is].end(), 0.0);
-            std::fill(rho_band_im[is].begin(), rho_band_im[is].end(), 0.0);
-        }
-        for (int ik = 0; ik < nks; ++ik)
-        {
-            const int spin_index = isk[ik];
-            std::cout << " Calculating wave function real and imaginary part for band " << ib + 1 << ", k-point "
-                      << ik % (nks / nspin) + 1 << ", spin " << spin_index + 1 << std::endl;
-
-            psi->fix_k(ik);
-            pw_wfc->recip_to_real(ctx, &psi[0](ib, 0), wfc_re_im.data(), ik);
-
-            // To ensure the normalization of charge density in multi-k calculation
-            double wg_sum_k = 0;
-            for (int ik_tmp = 0; ik_tmp < nks / nspin; ++ik_tmp)
+            // Skip the loop iteration if bands_picked[ib] is 0
+            if (!bands_picked_re_im[ib])
             {
-                wg_sum_k += wk[ik_tmp];
+                continue;
             }
 
-            double w1 = static_cast<double>(wg_sum_k / ucell->omega);
-
-            for (int i = 0; i < nxyz; ++i)
+            for (int is = 0; is < nspin; ++is)
             {
-                rho_band_re[spin_index][i] = std::real(wfc_re_im[i]) * std::sqrt(w1);
-                rho_band_im[spin_index][i] = std::imag(wfc_re_im[i]) * std::sqrt(w1);
+                std::fill(rho_band_re[is].begin(), rho_band_re[is].end(), 0.0);
+                std::fill(rho_band_im[is].begin(), rho_band_im[is].end(), 0.0);
+            }
+            for (int ik = 0; ik < nks; ++ik)
+            {
+                const int ikstot = kv.ik2iktot[ik];                 // global k-point index
+                const int spin_index = kv.isk[ik];                  // spin index
+                const int k_number = ikstot % (nkstot / nspin) + 1; // k-point number, starting from 1
+
+                psi->fix_k(ik);
+                pw_wfc->recip_to_real(ctx, &psi[0](ib, 0), wfc_re_im.data(), ik);
+
+                // To ensure the normalization of charge density in multi-k calculation
+                double wg_sum_k = 0.0;
+                if (nspin == 1)
+                {
+                    wg_sum_k = 2.0;
+                }
+                else if (nspin == 2)
+                {
+                    wg_sum_k = 1.0;
+                }
+                else
+                {
+                    ModuleBase::WARNING_QUIT("ModuleIO::get_wf_pw",
+                                             "Real space wavefunction output currently do not support noncollinear "
+                                             "polarized calculation (nspin = 4)!");
+                }
+
+                double w1 = static_cast<double>(wg_sum_k / ucell->omega);
+
+                for (int i = 0; i < nxyz; ++i)
+                {
+                    rho_band_re[spin_index][i] = std::real(wfc_re_im[i]) * std::sqrt(w1);
+                    rho_band_im[spin_index][i] = std::imag(wfc_re_im[i]) * std::sqrt(w1);
+                }
+
+                std::stringstream ss_real;
+                ss_real << global_out_dir << "wf" << ib + 1 << "s" << spin_index + 1 << "k" << k_number << "real.cube";
+
+                ModuleIO::write_vdata_palgrid(pgrid,
+                                              rho_band_re[spin_index].data(),
+                                              spin_index,
+                                              nspin,
+                                              0,
+                                              ss_real.str(),
+                                              0.0,
+                                              ucell,
+                                              11,
+                                              1,
+                                              true); // reduce_all_pool is true
+
+                std::stringstream ss_imag;
+                ss_imag << global_out_dir << "wf" << ib + 1 << "s" << spin_index + 1 << "k" << k_number << "imag.cube";
+
+                ModuleIO::write_vdata_palgrid(pgrid,
+                                              rho_band_im[spin_index].data(),
+                                              spin_index,
+                                              nspin,
+                                              0,
+                                              ss_imag.str(),
+                                              0.0,
+                                              ucell,
+                                              11,
+                                              1,
+                                              true); // reduce_all_pool is true
             }
-
-            std::cout << " Writing cube files...";
-
-            std::stringstream ss_real;
-            ss_real << global_out_dir << "wf" << ib + 1 << "s" << spin_index + 1 << "k" << ik % (nks / nspin) + 1
-                    << "real.cube";
-
-            ModuleIO::write_vdata_palgrid(pgrid,
-                                          rho_band_re[spin_index].data(),
-                                          spin_index,
-                                          nspin,
-                                          0,
-                                          ss_real.str(),
-                                          0.0,
-                                          ucell);
-
-            std::stringstream ss_imag;
-            ss_imag << global_out_dir << "wf" << ib + 1 << "s" << spin_index + 1 << "k" << ik % (nks / nspin) + 1
-                    << "imag.cube";
-
-            ModuleIO::write_vdata_palgrid(pgrid,
-                                          rho_band_im[spin_index].data(),
-                                          spin_index,
-                                          nspin,
-                                          0,
-                                          ss_imag.str(),
-                                          0.0,
-                                          ucell);
-
-            std::cout << " Complete!" << std::endl;
         }
     }
 }
diff --git a/source/module_io/write_cube.cpp b/source/module_io/write_cube.cpp
index b6c0fb8f9a..acff92eac1 100644
--- a/source/module_io/write_cube.cpp
+++ b/source/module_io/write_cube.cpp
@@ -1,31 +1,34 @@
 #include "module_base/element_name.h"
+#include "module_base/parallel_comm.h"
+#include "module_hamilt_pw/hamilt_pwdft/parallel_grid.h"
 #include "module_io/cube_io.h"
 #include "module_parameter/parameter.h"
-#include<vector>
-#include "module_hamilt_pw/hamilt_pwdft/parallel_grid.h"
+
+#include <vector>
 
 #ifdef __MPI
 #include <mpi.h>
 #endif
 
-#include <cstring> 
-
-void ModuleIO::write_vdata_palgrid(
-    const Parallel_Grid& pgrid,
-    const double* const data,
-    const int is,
-    const int nspin,
-    const int iter,
-    const std::string& fn,
-    const double ef,
-    const UnitCell*const ucell,
-    const int precision,
-    const int out_fermi)
+#include <cstring>
+
+void ModuleIO::write_vdata_palgrid(const Parallel_Grid& pgrid,
+                                   const double* const data,
+                                   const int is,
+                                   const int nspin,
+                                   const int iter,
+                                   const std::string& fn,
+                                   const double ef,
+                                   const UnitCell* const ucell,
+                                   const int precision,
+                                   const int out_fermi,
+                                   const bool reduce_all_pool)
 {
     ModuleBase::TITLE("ModuleIO", "write_vdata_palgrid");
 
     const int my_rank = GlobalV::MY_RANK;
     const int my_pool = GlobalV::MY_POOL;
+    const int rank_in_pool = GlobalV::RANK_IN_POOL;
 
     time_t start;
     time_t end;
@@ -39,19 +42,26 @@ void ModuleIO::write_vdata_palgrid(
     start = time(nullptr);
 
     // reduce
-    std::vector<double> data_xyz_full(nxyz);    // data to be written
-#ifdef __MPI    // reduce to rank 0
-    if (my_pool == 0 && GlobalV::MY_BNDGROUP == 0)
+    std::vector<double> data_xyz_full(nxyz); // data to be written
+#ifdef __MPI                                 // reduce to rank 0
+    if (GlobalV::MY_BNDGROUP == 0)
+    {
+        pgrid.reduce(data_xyz_full.data(), data, reduce_all_pool);
+    }
+    if (!reduce_all_pool)
     {
-        pgrid.reduce(data_xyz_full.data(), data);
+        MPI_Barrier(MPI_COMM_WORLD);
+    }
+    else
+    {
+        MPI_Barrier(POOL_WORLD);
     }
-    MPI_Barrier(MPI_COMM_WORLD);
 #else
     std::memcpy(data_xyz_full.data(), data, nxyz * sizeof(double));
 #endif
 
     // build the info structure
-    if (my_rank == 0)
+    if ((!reduce_all_pool && my_rank == 0) || (reduce_all_pool && rank_in_pool == 0))
     {
         /// output header for cube file
         ss << "STEP: " << iter << "  Cubefile created from ABACUS. Inner loop is z, followed by y and x" << std::endl;
@@ -83,23 +93,23 @@ void ModuleIO::write_vdata_palgrid(
         }
 
         std::vector<std::string> comment(2);
-		for (int i = 0;i < 2;++i) 
-		{ 
-			std::getline(ss, comment[i]); 
-		}
+        for (int i = 0; i < 2; ++i)
+        {
+            std::getline(ss, comment[i]);
+        }
 
-		double fac = ucell->lat0;
-		std::vector<double> dx = { fac * ucell->latvec.e11 / double(nx), 
-			fac * ucell->latvec.e12 / double(nx), 
-			fac * ucell->latvec.e13 / double(nx) };
+        double fac = ucell->lat0;
+        std::vector<double> dx = {fac * ucell->latvec.e11 / double(nx),
+                                  fac * ucell->latvec.e12 / double(nx),
+                                  fac * ucell->latvec.e13 / double(nx)};
 
-		std::vector<double> dy = { fac * ucell->latvec.e21 / double(ny), 
-			fac * ucell->latvec.e22 / double(ny), 
-			fac * ucell->latvec.e23 / double(ny) };
+        std::vector<double> dy = {fac * ucell->latvec.e21 / double(ny),
+                                  fac * ucell->latvec.e22 / double(ny),
+                                  fac * ucell->latvec.e23 / double(ny)};
 
-		std::vector<double> dz = { fac * ucell->latvec.e31 / double(nz), 
-			fac * ucell->latvec.e32 / double(nz), 
-			fac * ucell->latvec.e33 / double(nz) };
+        std::vector<double> dz = {fac * ucell->latvec.e31 / double(nz),
+                                  fac * ucell->latvec.e32 / double(nz),
+                                  fac * ucell->latvec.e33 / double(nz)};
 
         std::string element = "";
         std::vector<int> atom_type;
@@ -135,17 +145,27 @@ void ModuleIO::write_vdata_palgrid(
                     }
                 }
                 atom_type.push_back(z);
-				atom_charge.push_back(ucell->atoms[it].ncpp.zv);
-				atom_pos.push_back({ fac * ucell->atoms[it].tau[ia].x, 
-						fac * ucell->atoms[it].tau[ia].y, 
-						fac * ucell->atoms[it].tau[ia].z });
+                atom_charge.push_back(ucell->atoms[it].ncpp.zv);
+                atom_pos.push_back({fac * ucell->atoms[it].tau[ia].x,
+                                    fac * ucell->atoms[it].tau[ia].y,
+                                    fac * ucell->atoms[it].tau[ia].z});
             }
         }
-		write_cube(fn, comment, ucell->nat, {0.0, 0.0, 0.0}, 
-				nx, ny, nz, 
-				dx, dy, dz, 
-				atom_type, atom_charge, atom_pos, 
-				data_xyz_full, precision);
+        write_cube(fn,
+                   comment,
+                   ucell->nat,
+                   {0.0, 0.0, 0.0},
+                   nx,
+                   ny,
+                   nz,
+                   dx,
+                   dy,
+                   dz,
+                   atom_type,
+                   atom_charge,
+                   atom_pos,
+                   data_xyz_full,
+                   precision);
 
         end = time(nullptr);
         ModuleBase::GlobalFunc::OUT_TIME("write_vdata_palgrid", start, end);
@@ -155,27 +175,27 @@ void ModuleIO::write_vdata_palgrid(
 }
 
 void ModuleIO::write_cube(const std::string& file,
-    const std::vector<std::string>& comment,
-    const int& natom,
-    const std::vector<double>& origin,
-    const int& nx,
-    const int& ny,
-    const int& nz,
-    const std::vector<double>& dx,
-    const std::vector<double>& dy,
-    const std::vector<double>& dz,
-    const std::vector<int>& atom_type,
-    const std::vector<double>& atom_charge,
-    const std::vector<std::vector<double>>& atom_pos,
-    const std::vector<double>& data,
-    const int precision,
-    const int ndata_line)
+                          const std::vector<std::string>& comment,
+                          const int& natom,
+                          const std::vector<double>& origin,
+                          const int& nx,
+                          const int& ny,
+                          const int& nz,
+                          const std::vector<double>& dx,
+                          const std::vector<double>& dy,
+                          const std::vector<double>& dz,
+                          const std::vector<int>& atom_type,
+                          const std::vector<double>& atom_charge,
+                          const std::vector<std::vector<double>>& atom_pos,
+                          const std::vector<double>& data,
+                          const int precision,
+                          const int ndata_line)
 {
     assert(comment.size() >= 2);
-	for (int i = 0;i < 2;++i) 
-	{ 
-		assert(comment[i].find("\n") == std::string::npos); 
-	}
+    for (int i = 0; i < 2; ++i)
+    {
+        assert(comment[i].find("\n") == std::string::npos);
+    }
 
     assert(origin.size() >= 3);
     assert(dx.size() >= 3);
@@ -185,35 +205,34 @@ void ModuleIO::write_cube(const std::string& file,
     assert(atom_charge.size() >= natom);
     assert(atom_pos.size() >= natom);
 
-	for (int i = 0;i < natom;++i) 
-	{ 
-		assert(atom_pos[i].size() >= 3); 
-	}
+    for (int i = 0; i < natom; ++i)
+    {
+        assert(atom_pos[i].size() >= 3);
+    }
 
     assert(data.size() >= nx * ny * nz);
 
     std::ofstream ofs(file);
 
-	for (int i = 0;i < 2;++i) 
-	{ 
-		ofs << comment[i] << "\n"; 
-	}
+    for (int i = 0; i < 2; ++i)
+    {
+        ofs << comment[i] << "\n";
+    }
 
     ofs << std::fixed;
-    ofs << std::setprecision(1);    // as before
+    ofs << std::setprecision(1); // as before
 
     ofs << natom << " " << origin[0] << " " << origin[1] << " " << origin[2] << " \n";
 
-    ofs << std::setprecision(6);    //as before
+    ofs << std::setprecision(6); // as before
     ofs << nx << " " << dx[0] << " " << dx[1] << " " << dx[2] << "\n";
     ofs << ny << " " << dy[0] << " " << dy[1] << " " << dy[2] << "\n";
     ofs << nz << " " << dz[0] << " " << dz[1] << " " << dz[2] << "\n";
 
-    for (int i = 0;i < natom;++i)
+    for (int i = 0; i < natom; ++i)
     {
-        ofs << " " << atom_type[i] << " " << atom_charge[i] 
-            << " " << atom_pos[i][0] << " " << atom_pos[i][1] 
-            << " " << atom_pos[i][2] << "\n";
+        ofs << " " << atom_type[i] << " " << atom_charge[i] << " " << atom_pos[i][0] << " " << atom_pos[i][1] << " "
+            << atom_pos[i][2] << "\n";
     }
 
     ofs.unsetf(std::ofstream::fixed);
@@ -222,10 +241,13 @@ void ModuleIO::write_cube(const std::string& file,
     const int nxy = nx * ny;
     for (int ixy = 0; ixy < nxy; ++ixy)
     {
-        for (int iz = 0;iz < nz;++iz)
+        for (int iz = 0; iz < nz; ++iz)
         {
             ofs << " " << data[ixy * nz + iz];
-            if ((iz + 1) % ndata_line == 0 && iz != nz - 1) { ofs << "\n"; }
+            if ((iz + 1) % ndata_line == 0 && iz != nz - 1)
+            {
+                ofs << "\n";
+            }
         }
         ofs << "\n";
     }
diff --git a/tests/01_PW/113_PW_get_pchg_kpar/INPUT b/tests/01_PW/113_PW_get_pchg_kpar/INPUT
new file mode 100644
index 0000000000..1a7c8d2f35
--- /dev/null
+++ b/tests/01_PW/113_PW_get_pchg_kpar/INPUT
@@ -0,0 +1,22 @@
+INPUT_PARAMETERS
+#Parameters (1.General)
+suffix             autotest
+calculation        scf
+
+nbands             6
+symmetry           1
+pseudo_dir         ../../PP_ORB
+
+#Parameters (2.Iteration)
+ecutwfc            20
+scf_thr            1e-9
+scf_nmax           100
+
+#Parameters (3.Basis)
+basis_type         pw
+
+out_pchg           4*1
+
+kpar               2
+
+pw_seed            1
diff --git a/tests/01_PW/113_PW_get_pchg_kpar/KPT b/tests/01_PW/113_PW_get_pchg_kpar/KPT
new file mode 100644
index 0000000000..f5f7f4ec34
--- /dev/null
+++ b/tests/01_PW/113_PW_get_pchg_kpar/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+2 2 2 0 0 0
diff --git a/tests/01_PW/113_PW_get_pchg_kpar/README b/tests/01_PW/113_PW_get_pchg_kpar/README
new file mode 100644
index 0000000000..81b2c5e4ca
--- /dev/null
+++ b/tests/01_PW/113_PW_get_pchg_kpar/README
@@ -0,0 +1 @@
+test the output of out_pchg when kpar > 1
diff --git a/tests/01_PW/113_PW_get_pchg_kpar/STRU b/tests/01_PW/113_PW_get_pchg_kpar/STRU
new file mode 100644
index 0000000000..0041740d12
--- /dev/null
+++ b/tests/01_PW/113_PW_get_pchg_kpar/STRU
@@ -0,0 +1,19 @@
+ATOMIC_SPECIES
+Si 14 Si_ONCV_PBE-1.0.upf upf201
+
+LATTICE_CONSTANT
+10.2  // add lattice constant
+
+LATTICE_VECTORS
+0.0 0.5 0.5
+0.5 0.0 0.5
+0.5 0.5 0.0
+
+ATOMIC_POSITIONS
+Direct 
+
+Si	// Element type	
+0.0	// magnetism
+2	
+0.00 0.00 0.00 1 1 1
+0.25 0.25 0.25 1 1 1
diff --git a/tests/01_PW/113_PW_get_pchg_kpar/result.ref b/tests/01_PW/113_PW_get_pchg_kpar/result.ref
new file mode 100644
index 0000000000..66703fb395
--- /dev/null
+++ b/tests/01_PW/113_PW_get_pchg_kpar/result.ref
@@ -0,0 +1,10 @@
+etotref -211.8032873347874
+etotperatomref -105.9016436674
+BAND1_SPIN1_CHG.cube 2
+BAND2_SPIN1_CHG.cube 2
+BAND3_SPIN1_CHG.cube 2
+BAND4_SPIN1_CHG.cube 2
+pointgroupref T_d
+spacegroupref O_h
+nksibzref 3
+totaltimeref 1.35
\ No newline at end of file
diff --git a/tests/01_PW/114_PW_get_pchg_sepk/INPUT b/tests/01_PW/114_PW_get_pchg_sepk/INPUT
new file mode 100644
index 0000000000..8bb4019fb5
--- /dev/null
+++ b/tests/01_PW/114_PW_get_pchg_sepk/INPUT
@@ -0,0 +1,24 @@
+INPUT_PARAMETERS
+#Parameters (1.General)
+suffix             autotest
+calculation        scf
+
+nbands             6
+symmetry           1
+pseudo_dir         ../../PP_ORB
+
+#Parameters (2.Iteration)
+ecutwfc            20
+scf_thr            1e-9
+scf_nmax           100
+
+#Parameters (3.Basis)
+basis_type         pw
+
+out_pchg           4*1
+
+kpar               2
+
+if_separate_k      1
+
+pw_seed            1
diff --git a/tests/01_PW/114_PW_get_pchg_sepk/KPT b/tests/01_PW/114_PW_get_pchg_sepk/KPT
new file mode 100644
index 0000000000..f5f7f4ec34
--- /dev/null
+++ b/tests/01_PW/114_PW_get_pchg_sepk/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+2 2 2 0 0 0
diff --git a/tests/01_PW/114_PW_get_pchg_sepk/README b/tests/01_PW/114_PW_get_pchg_sepk/README
new file mode 100644
index 0000000000..1d9ca8bf74
--- /dev/null
+++ b/tests/01_PW/114_PW_get_pchg_sepk/README
@@ -0,0 +1 @@
+test the output of out_pchg when if_separate_k is true
diff --git a/tests/01_PW/114_PW_get_pchg_sepk/STRU b/tests/01_PW/114_PW_get_pchg_sepk/STRU
new file mode 100644
index 0000000000..0041740d12
--- /dev/null
+++ b/tests/01_PW/114_PW_get_pchg_sepk/STRU
@@ -0,0 +1,19 @@
+ATOMIC_SPECIES
+Si 14 Si_ONCV_PBE-1.0.upf upf201
+
+LATTICE_CONSTANT
+10.2  // add lattice constant
+
+LATTICE_VECTORS
+0.0 0.5 0.5
+0.5 0.0 0.5
+0.5 0.5 0.0
+
+ATOMIC_POSITIONS
+Direct 
+
+Si	// Element type	
+0.0	// magnetism
+2	
+0.00 0.00 0.00 1 1 1
+0.25 0.25 0.25 1 1 1
diff --git a/tests/01_PW/114_PW_get_pchg_sepk/result.ref b/tests/01_PW/114_PW_get_pchg_sepk/result.ref
new file mode 100644
index 0000000000..03ea1ebc47
--- /dev/null
+++ b/tests/01_PW/114_PW_get_pchg_sepk/result.ref
@@ -0,0 +1,18 @@
+etotref -211.8032873348292
+etotperatomref -105.9016436674
+BAND1_K1_SPIN1_CHG.cube 2
+BAND1_K2_SPIN1_CHG.cube 2
+BAND1_K3_SPIN1_CHG.cube 2
+BAND2_K1_SPIN1_CHG.cube 2
+BAND2_K2_SPIN1_CHG.cube 2
+BAND2_K3_SPIN1_CHG.cube 2
+BAND3_K1_SPIN1_CHG.cube 2
+BAND3_K2_SPIN1_CHG.cube 2
+BAND3_K3_SPIN1_CHG.cube 2
+BAND4_K1_SPIN1_CHG.cube 2
+BAND4_K2_SPIN1_CHG.cube 2
+BAND4_K3_SPIN1_CHG.cube 2
+pointgroupref T_d
+spacegroupref O_h
+nksibzref 3
+totaltimeref 1.23
diff --git a/tests/01_PW/115_PW_get_wf_kpar/INPUT b/tests/01_PW/115_PW_get_wf_kpar/INPUT
new file mode 100644
index 0000000000..ee62db77cd
--- /dev/null
+++ b/tests/01_PW/115_PW_get_wf_kpar/INPUT
@@ -0,0 +1,22 @@
+INPUT_PARAMETERS
+#Parameters (1.General)
+suffix             autotest
+calculation        scf
+
+nbands             6
+symmetry           1
+pseudo_dir         ../../PP_ORB
+
+#Parameters (2.Iteration)
+ecutwfc            100
+scf_thr            1e-9
+scf_nmax           100
+
+#Parameters (3.Basis)
+basis_type         pw
+
+out_wfc_norm       1 3*0
+
+kpar               3
+
+pw_seed            1
diff --git a/tests/01_PW/115_PW_get_wf_kpar/KPT b/tests/01_PW/115_PW_get_wf_kpar/KPT
new file mode 100644
index 0000000000..f5f7f4ec34
--- /dev/null
+++ b/tests/01_PW/115_PW_get_wf_kpar/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+2 2 2 0 0 0
diff --git a/tests/01_PW/115_PW_get_wf_kpar/README b/tests/01_PW/115_PW_get_wf_kpar/README
new file mode 100644
index 0000000000..66fbd6257e
--- /dev/null
+++ b/tests/01_PW/115_PW_get_wf_kpar/README
@@ -0,0 +1 @@
+test the output of out_wfc_norm and out_wfc_re_im when kpar > 1
diff --git a/tests/01_PW/115_PW_get_wf_kpar/STRU b/tests/01_PW/115_PW_get_wf_kpar/STRU
new file mode 100644
index 0000000000..0041740d12
--- /dev/null
+++ b/tests/01_PW/115_PW_get_wf_kpar/STRU
@@ -0,0 +1,19 @@
+ATOMIC_SPECIES
+Si 14 Si_ONCV_PBE-1.0.upf upf201
+
+LATTICE_CONSTANT
+10.2  // add lattice constant
+
+LATTICE_VECTORS
+0.0 0.5 0.5
+0.5 0.0 0.5
+0.5 0.5 0.0
+
+ATOMIC_POSITIONS
+Direct 
+
+Si	// Element type	
+0.0	// magnetism
+2	
+0.00 0.00 0.00 1 1 1
+0.25 0.25 0.25 1 1 1
diff --git a/tests/01_PW/115_PW_get_wf_kpar/result.ref b/tests/01_PW/115_PW_get_wf_kpar/result.ref
new file mode 100644
index 0000000000..2085b97b8e
--- /dev/null
+++ b/tests/01_PW/115_PW_get_wf_kpar/result.ref
@@ -0,0 +1,9 @@
+etotref -211.8789253813293
+etotperatomref -105.9394626907
+wf1s1k1.cube 21.94720511
+wf1s1k2.cube 19.65846447
+wf1s1k3.cube 19.34110139
+pointgroupref T_d
+spacegroupref O_h
+nksibzref 3
+totaltimeref 1.95
\ No newline at end of file
diff --git a/tests/01_PW/CASES_CPU.txt b/tests/01_PW/CASES_CPU.txt
index f9eaa40556..66bb461e4e 100644
--- a/tests/01_PW/CASES_CPU.txt
+++ b/tests/01_PW/CASES_CPU.txt
@@ -99,6 +99,9 @@
 109_PW_PBE0
 111_PW_get_pchg
 112_PW_get_wf
+113_PW_get_pchg_kpar
+114_PW_get_pchg_sepk
+115_PW_get_wf_kpar
 201_PW_UPF201_Ce_f
 202_PW_ONCV_Libxc
 203_PW_OK