diff --git a/source/Makefile.Objects b/source/Makefile.Objects index 9d2dc62b48..8431ec05d7 100644 --- a/source/Makefile.Objects +++ b/source/Makefile.Objects @@ -69,7 +69,7 @@ VPATH=./src_global:\ ./module_ri:\ ./module_parameter:\ ./module_lr:\ -./module_lr/AX:\ +./module_lr/ao_to_mo_transformer:\ ./module_lr/dm_trans:\ ./module_lr/operator_casida:\ ./module_lr/potentials:\ @@ -723,8 +723,8 @@ OBJS_TENSOR=tensor.o\ OBJS_LR=lr_util.o\ lr_util_hcontainer.o\ - AX_parallel.o\ - AX_serial.o\ + ao_to_mo_parallel.o\ + ao_to_mo_serial.o\ dm_trans_parallel.o\ dm_trans_serial.o\ dmr_complex.o\ diff --git a/source/module_lr/AX/AX.h b/source/module_lr/AX/AX.h deleted file mode 100644 index c1cdac3532..0000000000 --- a/source/module_lr/AX/AX.h +++ /dev/null @@ -1,65 +0,0 @@ -#pragma once -#include -#include "module_psi/psi.h" -#include -#ifdef __MPI -#include "module_base/parallel_2d.h" -#endif -namespace LR -{ - // double - void cal_AX_forloop_serial( - const std::vector& V_istate, - const psi::Psi& c, - const int& nocc, - const int& nvirt, - double* const AX_istate); - void cal_AX_blas( - const std::vector& V_istate, - const psi::Psi& c, - const int& nocc, - const int& nvirt, - double* const AX_istate, - const bool add_on = true); -#ifdef __MPI - void cal_AX_pblas( - const std::vector& V_istate, - const Parallel_2D& pmat, - const psi::Psi& c, - const Parallel_2D& pc, - const int& naos, - const int& nocc, - const int& nvirt, - const Parallel_2D& pX, - double* const AX_istate, - const bool add_on=true); -#endif - // complex - void cal_AX_forloop_serial( - const std::vector& V_istate, - const psi::Psi>& c, - const int& nocc, - const int& nvirt, - std::complex* const AX_istate); - void cal_AX_blas( - const std::vector& V_istate, - const psi::Psi>& c, - const int& nocc, - const int& nvirt, - std::complex* const AX_istate, - const bool add_on = true); - -#ifdef __MPI - void cal_AX_pblas( - const std::vector& V_istate, - const Parallel_2D& pmat, - const psi::Psi>& c, - const Parallel_2D& pc, - const int& naos, - const int& nocc, - const int& nvirt, - const Parallel_2D& pX, - std::complex* const AX_istate, - const bool add_on = true); -#endif -} \ No newline at end of file diff --git a/source/module_lr/AX/AX_parallel.cpp b/source/module_lr/AX/AX_parallel.cpp deleted file mode 100644 index c5b956bac6..0000000000 --- a/source/module_lr/AX/AX_parallel.cpp +++ /dev/null @@ -1,117 +0,0 @@ -#ifdef __MPI -#include "AX.h" -#include "module_base/scalapack_connector.h" -#include "module_base/tool_title.h" -#include "module_lr/utils/lr_util.h" -#include "module_lr/utils/lr_util_print.h" -namespace LR -{ - //output: col first, consistent with blas - // c: nao*nbands in para2d, nbands*nao in psi (row-para and constructed: nao) - // X: nvirt*nocc in para2d, nocc*nvirt in psi (row-para and constructed: nvirt) - void cal_AX_pblas( - const std::vector& V_istate, - const Parallel_2D& pmat, - const psi::Psi& c, - const Parallel_2D& pc, - const int& naos, - const int& nocc, - const int& nvirt, - const Parallel_2D& pX, - double* AX_istate, - const bool add_on) - { - ModuleBase::TITLE("hamilt_lrtd", "cal_AX_pblas"); - assert(pmat.comm() == pc.comm() && pmat.comm() == pX.comm()); - assert(pmat.blacs_ctxt == pc.blacs_ctxt && pmat.blacs_ctxt == pX.blacs_ctxt); - assert(pX.get_local_size() > 0); - - const int nks = V_istate.size(); - - Parallel_2D pVc; // for intermediate Vc - LR_Util::setup_2d_division(pVc, pmat.get_block_size(), naos, nocc, pmat.blacs_ctxt); - for (int isk = 0;isk < nks;++isk) - { - const int ax_start = isk * pX.get_local_size(); - c.fix_k(isk); - - //Vc - container::Tensor Vc(DAT::DT_DOUBLE, DEV::CpuDevice, { pVc.get_col_size(), pVc.get_row_size() });//row is "inside"(memory contiguity) for pblas - Vc.zero(); - - int i1 = 1; - int ivirt = nocc + 1; - - char transa = 'N'; - char transb = 'N'; - const double alpha = 1.0; - const double beta = add_on ? 1.0 : 0.0; - pdgemm_(&transa, &transb, &naos, &nocc, &naos, - &alpha, V_istate[isk].data(), &i1, &i1, pmat.desc, - c.get_pointer(), &i1, &i1, pc.desc, - &beta, Vc.data(), &i1, &i1, pVc.desc); - - transa = 'T'; - // AX_istate = c ^ TVc - // descC puts M(nvirt) to row - pdgemm_(&transa, &transb, &nvirt, &nocc, &naos, - &alpha, c.get_pointer(), &i1, &ivirt, pc.desc, - Vc.data(), &i1, &i1, pVc.desc, - &beta, AX_istate + ax_start, &i1, &i1, pX.desc); - - } - } - - void cal_AX_pblas( - const std::vector& V_istate, - const Parallel_2D& pmat, - const psi::Psi>& c, - const Parallel_2D& pc, - const int& naos, - const int& nocc, - const int& nvirt, - const Parallel_2D& pX, - std::complex* const AX_istate, - const bool add_on) - { - ModuleBase::TITLE("hamilt_lrtd", "cal_AX_plas"); - assert(pmat.comm() == pc.comm() && pmat.comm() == pX.comm()); - assert(pmat.blacs_ctxt == pc.blacs_ctxt && pmat.blacs_ctxt == pX.blacs_ctxt); - assert(pX.get_local_size() > 0); - - const int nks = V_istate.size(); - - Parallel_2D pVc; // for intermediate Vc - LR_Util::setup_2d_division(pVc, pmat.get_block_size(), naos, nocc, pmat.blacs_ctxt); - for (int isk = 0;isk < nks;++isk) - { - const int ax_start = isk * pX.get_local_size(); - c.fix_k(isk); - - //Vc - container::Tensor Vc(DAT::DT_COMPLEX_DOUBLE, DEV::CpuDevice, { pVc.get_col_size(), pVc.get_row_size() }); - Vc.zero(); - - int i1 = 1; - int ivirt = nocc + 1; - - char transa = 'N'; - char transb = 'N'; - const std::complex alpha(1.0, 0.0); - const std::complex beta = add_on ? std::complex(1.0, 0.0) : std::complex(0.0, 0.0); - pzgemm_(&transa, &transb, &naos, &nocc, &naos, - &alpha, V_istate[isk].data>(), &i1, &i1, pmat.desc, - c.get_pointer(), &i1, &i1, pc.desc, - &beta, Vc.data>(), &i1, &i1, pVc.desc); - - transa = 'C'; - // AX_istate = c ^ TVc - // descC puts M(nvirt) to row - pzgemm_(&transa, &transb, &nvirt, &nocc, &naos, - &alpha, c.get_pointer(), &i1, &ivirt, pc.desc, - Vc.data>(), &i1, &i1, pVc.desc, - &beta, AX_istate + ax_start, &i1, &i1, pX.desc); - } - } -} -#endif diff --git a/source/module_lr/AX/AX_serial.cpp b/source/module_lr/AX/AX_serial.cpp deleted file mode 100644 index 234d4412dd..0000000000 --- a/source/module_lr/AX/AX_serial.cpp +++ /dev/null @@ -1,140 +0,0 @@ -#include "AX.h" -#include "module_base/blas_connector.h" -#include "module_base/tool_title.h" -#include "module_lr/utils/lr_util.h" -namespace LR -{ - void cal_AX_forloop_serial( - const std::vector& V_istate, - const psi::Psi& c, - const int& nocc, - const int& nvirt, - double* AX_istate) - { - ModuleBase::TITLE("hamilt_lrtd", "cal_AX_forloop"); - const int nks = V_istate.size(); - int naos = c.get_nbasis(); - ModuleBase::GlobalFunc::ZEROS(AX_istate, nks * nocc * nvirt); - - for (int isk = 0;isk < nks;++isk) - { - c.fix_k(isk); - const int ax_start = isk * nocc * nvirt; - for (int i = 0;i < nocc;++i) - { - for (int a = 0;a < nvirt;++a) - { - for (int nu = 0;nu < naos;++nu) - { - for (int mu = 0;mu < naos;++mu) - { - AX_istate[ax_start + i * nvirt + a] += c(nocc + a, mu) * V_istate[isk].data()[nu * naos + mu] * c(i, nu); - } - } - } - } - } - } - void cal_AX_forloop_serial( - const std::vector& V_istate, - const psi::Psi>& c, - const int& nocc, - const int& nvirt, - std::complex* const AX_istate) - { - ModuleBase::TITLE("hamilt_lrtd", "cal_AX_forloop"); - const int nks = V_istate.size(); - int naos = c.get_nbasis(); - ModuleBase::GlobalFunc::ZEROS(AX_istate, nks * nocc * nvirt); - - for (int isk = 0;isk < nks;++isk) - { - c.fix_k(isk); - const int ax_start = isk * nocc * nvirt; - for (int i = 0;i < nocc;++i) - { - for (int a = 0;a < nvirt;++a) - { - for (int nu = 0;nu < naos;++nu) - { - for (int mu = 0;mu < naos;++mu) - { - AX_istate[ax_start + i * nvirt + a] += std::conj(c(nocc + a, mu)) * V_istate[isk].data>()[nu * naos + mu] * c(i, nu); - } - } - } - } - } - } - - void cal_AX_blas( - const std::vector& V_istate, - const psi::Psi& c, - const int& nocc, - const int& nvirt, - double* AX_istate, - const bool add_on) - { - ModuleBase::TITLE("hamilt_lrtd", "cal_AX_blas"); - const int nks = V_istate.size(); - int naos = c.get_nbasis(); - - for (int isk = 0;isk < nks;++isk) - { - c.fix_k(isk); - const int ax_start = isk * nocc * nvirt; - - // Vc[naos*nocc] - container::Tensor Vc(DAT::DT_DOUBLE, DEV::CpuDevice, { nocc, naos });// (Vc)^T - Vc.zero(); - char transa = 'N'; - char transb = 'N'; //c is col major - const double alpha = 1.0; - const double beta = add_on ? 1.0 : 0.0; - dgemm_(&transa, &transb, &naos, &nocc, &naos, &alpha, - V_istate[isk].data(), &naos, c.get_pointer(), &naos, &beta, - Vc.data(), &naos); - - transa = 'T'; - //AX_istate=c^TVc (nvirt major) - dgemm_(&transa, &transb, &nvirt, &nocc, &naos, &alpha, - c.get_pointer(nocc), &naos, Vc.data(), &naos, &beta, - AX_istate + ax_start, &nvirt); - } - } - void cal_AX_blas( - const std::vector& V_istate, - const psi::Psi>& c, - const int& nocc, - const int& nvirt, - std::complex* const AX_istate, - const bool add_on) - { - ModuleBase::TITLE("hamilt_lrtd", "cal_AX_blas"); - const int nks = V_istate.size(); - int naos = c.get_nbasis(); - - for (int isk = 0;isk < nks;++isk) - { - c.fix_k(isk); - const int ax_start = isk * nocc * nvirt; - - // Vc[naos*nocc] (V is hermitian) - container::Tensor Vc(DAT::DT_COMPLEX_DOUBLE, DEV::CpuDevice, { nocc, naos });// (Vc)^T - Vc.zero(); - char transa = 'N'; - char transb = 'N'; //c is col major - const std::complex alpha(1.0, 0.0); - const std::complex beta = add_on ? std::complex(1.0, 0.0) : std::complex(0.0, 0.0); - zgemm_(&transa, &transb, &naos, &nocc, &naos, &alpha, - V_istate[isk].data>(), &naos, c.get_pointer(), &naos, &beta, - Vc.data>(), &naos); - - transa = 'C'; - //AX_istate=c^\dagger Vc (nvirt major) - zgemm_(&transa, &transb, &nvirt, &nocc, &naos, &alpha, - c.get_pointer(nocc), &naos, Vc.data>(), &naos, &beta, - AX_istate + ax_start, &nvirt); - } - } -} \ No newline at end of file diff --git a/source/module_lr/AX/test/AX_test.cpp b/source/module_lr/AX/test/AX_test.cpp deleted file mode 100644 index 697c71bc7e..0000000000 --- a/source/module_lr/AX/test/AX_test.cpp +++ /dev/null @@ -1,231 +0,0 @@ -#include -#ifdef __MPI -#include "mpi.h" -#endif -#include "../AX.h" - -#include "module_lr/utils/lr_util.h" - -#define rand01 (static_cast(rand()) / static_cast(RAND_MAX) - 0.5 ) -struct matsize -{ - int nks = 1; - int naos = 2; - int nocc = 1; - int nvirt = 1; - int nb = 1; - matsize(int nks, int naos, int nocc, int nvirt, int nb = 1) - :nks(nks), naos(naos), nocc(nocc), nvirt(nvirt), nb(nb) { - assert(nocc + nvirt <= naos); - }; -}; - -class AXTest : public testing::Test -{ -public: - std::vector sizes{ - // {2, 3, 2, 1}, - {2, 13, 7, 4}, - // {2, 14, 8, 5} - }; - int nstate = 2; - std::ofstream ofs_running; - int my_rank = 0; -#ifdef __MPI - void SetUp() override - { - MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); - this->ofs_running.open("log" + std::to_string(my_rank) + ".txt"); - ofs_running << "my_rank = " << my_rank << std::endl; - } - void TearDown() override - { - ofs_running.close(); - } -#endif - - void set_ones(double* data, int size) { for (int i = 0;i < size;++i) { data[i] = 1.0; } }; - void set_int(double* data, int size) { for (int i = 0;i < size;++i) { data[i] = static_cast(i + 1); } }; - void set_int(std::complex* data, int size) { for (int i = 0;i < size;++i) { data[i] = std::complex(i + 1, -i - 1); } }; - void set_rand(double* data, int size) { for (int i = 0;i < size;++i) { data[i] = rand01 * 10; } }; - void set_rand(std::complex* data, int size) { for (int i = 0;i < size;++i) { data[i] = std::complex(rand01 * 10, rand01 * 10); } }; - void check_eq(double* data1, double* data2, int size) { for (int i = 0;i < size;++i) { EXPECT_NEAR(data1[i], data2[i], 1e-10); } }; - void check_eq(std::complex* data1, std::complex* data2, int size) - { - for (int i = 0;i < size;++i) - { - EXPECT_NEAR(data1[i].real(), data2[i].real(), 1e-10); - EXPECT_NEAR(data1[i].imag(), data2[i].imag(), 1e-10); - } - }; -}; - -TEST_F(AXTest, DoubleSerial) -{ - for (auto s : this->sizes) - { - psi::Psi AX_for(s.nks, nstate, s.nocc * s.nvirt, nullptr, false); - psi::Psi AX_blas(s.nks, nstate, s.nocc * s.nvirt, nullptr, false); - int size_c = s.nks * (s.nocc + s.nvirt) * s.naos; - int size_v = s.naos * s.naos; - for (int istate = 0;istate < nstate;++istate) - { - std::vector temp(s.nks, s.naos); - psi::Psi c(s.nks, s.nocc + s.nvirt, s.naos, temp.data(), true); - std::vector V(s.nks, container::Tensor(DAT::DT_DOUBLE, DEV::CpuDevice, { s.naos, s.naos })); - set_rand(&c(0, 0, 0), size_c); - for (auto& v : V) { set_rand(v.data(), size_v); } - LR::cal_AX_forloop_serial(V, c, s.nocc, s.nvirt, &AX_for(istate, 0, 0)); - LR::cal_AX_blas(V, c, s.nocc, s.nvirt, &AX_blas(istate, 0, 0), false); - } - check_eq(&AX_for(0, 0, 0), &AX_blas(0, 0, 0), nstate * s.nks * s.nocc * s.nvirt); - } -} - -TEST_F(AXTest, ComplexSerial) -{ - for (auto s : this->sizes) - { - psi::Psi> AX_for(s.nks, nstate, s.nocc * s.nvirt, nullptr, false); - psi::Psi> AX_blas(s.nks, nstate, s.nocc * s.nvirt, nullptr, false); - int size_c = s.nks * (s.nocc + s.nvirt) * s.naos; - int size_v = s.naos * s.naos; - for (int istate = 0;istate < nstate;++istate) - { - std::vector temp(s.nks, s.naos); - psi::Psi> c(s.nks, s.nocc + s.nvirt, s.naos, temp.data(), true); - std::vector V(s.nks, container::Tensor(DAT::DT_COMPLEX_DOUBLE, DEV::CpuDevice, { s.naos, s.naos })); - set_rand(&c(0, 0, 0), size_c); - for (auto& v : V) { set_rand(v.data>(), size_v); } - LR::cal_AX_forloop_serial(V, c, s.nocc, s.nvirt, &AX_for(istate, 0, 0)); - LR::cal_AX_blas(V, c, s.nocc, s.nvirt, &AX_blas(istate, 0, 0), false); - } - check_eq(&AX_for(0, 0, 0), &AX_blas(0, 0, 0), nstate * s.nks * s.nocc * s.nvirt); - } -} -#ifdef __MPI -TEST_F(AXTest, DoubleParallel) -{ - for (auto s : this->sizes) - { - // c: nao*nbands in para2d, nbands*nao in psi (row-para and constructed: nao) - // X: nvirt*nocc in para2d, nocc*nvirt in psi (row-para and constructed: nvirt) - Parallel_2D pV; - LR_Util::setup_2d_division(pV, s.nb, s.naos, s.naos); - std::vector V(s.nks, container::Tensor(DAT::DT_DOUBLE, DEV::CpuDevice, { pV.get_col_size(), pV.get_row_size() })); - Parallel_2D pc; - LR_Util::setup_2d_division(pc, s.nb, s.naos, s.nocc + s.nvirt, pV.blacs_ctxt); - - std::vector ngk_temp(s.nks, pc.get_row_size()); - psi::Psi c(s.nks, pc.get_col_size(), pc.get_row_size(), ngk_temp.data(), true); - Parallel_2D px; - LR_Util::setup_2d_division(px, s.nb, s.nvirt, s.nocc, pV.blacs_ctxt); - - EXPECT_EQ(pV.dim0, pc.dim0); - EXPECT_EQ(pV.dim1, pc.dim1); - EXPECT_GE(s.nvirt, px.dim0); - EXPECT_GE(s.nocc, px.dim1); - EXPECT_GE(s.naos, pc.dim0); - psi::Psi AX_pblas_loc(s.nks, nstate, px.get_local_size(), nullptr, false); - psi::Psi AX_gather(s.nks, nstate, s.nocc * s.nvirt, nullptr, false); - for (int istate = 0;istate < nstate;++istate) - { - for (int isk = 0;isk < s.nks;++isk) - { - set_rand(V.at(isk).data(), pV.get_local_size()); - set_rand(&c(isk, 0, 0), pc.get_local_size()); - } - LR::cal_AX_pblas(V, pV, c, pc, s.naos, s.nocc, s.nvirt, px, &AX_pblas_loc(istate, 0, 0), false); - // gather AX and output - for (int isk = 0;isk < s.nks;++isk) - { - LR_Util::gather_2d_to_full(px, &AX_pblas_loc(istate, isk, 0), &AX_gather(istate, isk, 0), false/*pblas: row first*/, s.nvirt, s.nocc); - } - // compare to global AX - std::vector V_full(s.nks, container::Tensor(DAT::DT_DOUBLE, DEV::CpuDevice, { s.naos, s.naos })); - - std::vector ngk_temp_1(s.nks, s.naos); - psi::Psi c_full(s.nks, s.nocc + s.nvirt, s.naos, ngk_temp_1.data(), true); - for (int isk = 0;isk < s.nks;++isk) - { - LR_Util::gather_2d_to_full(pV, V.at(isk).data(), V_full.at(isk).data(), false, s.naos, s.naos); - LR_Util::gather_2d_to_full(pc, &c(isk, 0, 0), &c_full(isk, 0, 0), false, s.naos, s.nocc + s.nvirt); - } - if (my_rank == 0) - { - psi::Psi AX_full_istate(s.nks, 1, s.nocc * s.nvirt, nullptr, false); - LR::cal_AX_blas(V_full, c_full, s.nocc, s.nvirt, &AX_full_istate(0, 0, 0), false); - check_eq(&AX_full_istate(0, 0, 0), &AX_gather(istate, 0, 0), s.nks * s.nocc * s.nvirt); - } - } - } -} -TEST_F(AXTest, ComplexParallel) -{ - for (auto s : this->sizes) - { - // c: nao*nbands in para2d, nbands*nao in psi (row-para and constructed: nao) - // X: nvirt*nocc in para2d, nocc*nvirt in psi (row-para and constructed: nvirt) - Parallel_2D pV; - LR_Util::setup_2d_division(pV, s.nb, s.naos, s.naos); - std::vector V(s.nks, container::Tensor(DAT::DT_COMPLEX_DOUBLE, DEV::CpuDevice, { pV.get_col_size(), pV.get_row_size() })); - Parallel_2D pc; - LR_Util::setup_2d_division(pc, s.nb, s.naos, s.nocc + s.nvirt, pV.blacs_ctxt); - - std::vector ngk_temp_1(s.nks, pc.get_row_size()); - psi::Psi> c(s.nks, pc.get_col_size(), pc.get_row_size(), ngk_temp_1.data(), true); - Parallel_2D px; - LR_Util::setup_2d_division(px, s.nb, s.nvirt, s.nocc, pV.blacs_ctxt); - - psi::Psi> AX_pblas_loc(s.nks, nstate, px.get_local_size(), nullptr, false); - psi::Psi> AX_gather(s.nks, nstate, s.nocc * s.nvirt, nullptr, false); - for (int istate = 0;istate < nstate;++istate) - { - for (int isk = 0;isk < s.nks;++isk) - { - set_rand(V.at(isk).data>(), pV.get_local_size()); - set_rand(&c(isk, 0, 0), pc.get_local_size()); - } - LR::cal_AX_pblas(V, pV, c, pc, s.naos, s.nocc, s.nvirt, px, &AX_pblas_loc(istate, 0, 0), false); - - // gather AX and output - for (int isk = 0;isk < s.nks;++isk) - { - LR_Util::gather_2d_to_full(px, &AX_pblas_loc(istate, isk, 0), &AX_gather(istate, isk, 0), false/*pblas: row first*/, s.nvirt, s.nocc); - } - // compare to global AX - std::vector V_full(s.nks, container::Tensor(DAT::DT_COMPLEX_DOUBLE, DEV::CpuDevice, { s.naos, s.naos })); - - - std::vector ngk_temp_2(s.nks, s.naos); - psi::Psi> c_full(s.nks, s.nocc + s.nvirt, s.naos, ngk_temp_2.data(), true); - for (int isk = 0;isk < s.nks;++isk) - { - LR_Util::gather_2d_to_full(pV, V.at(isk).data>(), V_full.at(isk).data>(), false, s.naos, s.naos); - LR_Util::gather_2d_to_full(pc, &c(isk, 0, 0), &c_full(isk, 0, 0), false, s.naos, s.nocc + s.nvirt); - } - if (my_rank == 0) - { - psi::Psi> AX_full_istate(s.nks, 1, s.nocc * s.nvirt, nullptr, false); - LR::cal_AX_blas(V_full, c_full, s.nocc, s.nvirt, &AX_full_istate(0, 0, 0), false); - check_eq(&AX_full_istate(0, 0, 0), &AX_gather(istate, 0, 0), s.nks * s.nocc * s.nvirt); - } - } - } -} -#endif - - -int main(int argc, char** argv) -{ - srand(time(nullptr)); // for random number generator -#ifdef __MPI - MPI_Init(&argc, &argv); -#endif - testing::InitGoogleTest(&argc, argv); - int result = RUN_ALL_TESTS(); -#ifdef __MPI - MPI_Finalize(); -#endif - return result; -} diff --git a/source/module_lr/AX/test/CMakeLists.txt b/source/module_lr/AX/test/CMakeLists.txt deleted file mode 100644 index 8cd8c1caf9..0000000000 --- a/source/module_lr/AX/test/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -remove_definitions(-DUSE_LIBXC) -AddTest( - TARGET AX_test - LIBS parameter base ${math_libs} container device psi - SOURCES AX_test.cpp ../../utils/lr_util.cpp ../AX_parallel.cpp ../AX_serial.cpp -) \ No newline at end of file diff --git a/source/module_lr/CMakeLists.txt b/source/module_lr/CMakeLists.txt index 3459ca8d3a..3278252c84 100644 --- a/source/module_lr/CMakeLists.txt +++ b/source/module_lr/CMakeLists.txt @@ -1,14 +1,14 @@ if(ENABLE_LCAO) add_subdirectory(utils) - add_subdirectory(AX) + add_subdirectory(ao_to_mo_transformer) add_subdirectory(dm_trans) add_subdirectory(ri_benchmark) list(APPEND objects utils/lr_util.cpp utils/lr_util_hcontainer.cpp - AX/AX_parallel.cpp - AX/AX_serial.cpp + ao_to_mo_transformer/ao_to_mo_parallel.cpp + ao_to_mo_transformer/ao_to_mo_serial.cpp dm_trans/dm_trans_parallel.cpp dm_trans/dm_trans_serial.cpp dm_trans/dmr_complex.cpp diff --git a/source/module_lr/AX/CMakeLists.txt b/source/module_lr/ao_to_mo_transformer/CMakeLists.txt similarity index 100% rename from source/module_lr/AX/CMakeLists.txt rename to source/module_lr/ao_to_mo_transformer/CMakeLists.txt diff --git a/source/module_lr/ao_to_mo_transformer/ao_to_mo.h b/source/module_lr/ao_to_mo_transformer/ao_to_mo.h new file mode 100644 index 0000000000..79c6ee99b2 --- /dev/null +++ b/source/module_lr/ao_to_mo_transformer/ao_to_mo.h @@ -0,0 +1,43 @@ +#pragma once +#include +#include "module_psi/psi.h" +#include +#ifdef __MPI +#include "module_base/parallel_2d.h" +#endif +namespace LR +{ + enum MO_TYPE { OO, VO, VV }; + template + void ao_to_mo_forloop_serial( + const std::vector& mat_ao, + const psi::Psi& coeff, + const int& nocc, + const int& nvirt, + T* const mat_mo, + MO_TYPE type = VO); + template + void ao_to_mo_blas( + const std::vector& mat_ao, + const psi::Psi& coeff, + const int& nocc, + const int& nvirt, + T* const mat_mo, + const bool add_on = true, + MO_TYPE type = VO); +#ifdef __MPI + template + void ao_to_mo_pblas( + const std::vector& mat_ao, + const Parallel_2D& pmat_ao, + const psi::Psi& coeff, + const Parallel_2D& pcoeff, + const int& naos, + const int& nocc, + const int& nvirt, + const Parallel_2D& pmat_mo, + T* const mat_mo, + const bool add_on = true, + MO_TYPE type = VO); +#endif +} \ No newline at end of file diff --git a/source/module_lr/ao_to_mo_transformer/ao_to_mo_parallel.cpp b/source/module_lr/ao_to_mo_transformer/ao_to_mo_parallel.cpp new file mode 100644 index 0000000000..dbd0525e74 --- /dev/null +++ b/source/module_lr/ao_to_mo_transformer/ao_to_mo_parallel.cpp @@ -0,0 +1,127 @@ +#ifdef __MPI +#include "ao_to_mo.h" +#include "module_base/scalapack_connector.h" +#include "module_base/tool_title.h" +#include "module_lr/utils/lr_util.h" +#include "module_lr/utils/lr_util_print.h" +namespace LR +{ + //output: col first, consistent with blas + // coeff: nao*nbands in para2d, nbands*nao in psi (row-para and constructed: nao) + // X: nvirt*nocc in para2d, nocc*nvirt in psi (row-para and constructed: nvirt) + template<> + void ao_to_mo_pblas( + const std::vector& mat_ao, + const Parallel_2D& pmat_ao, + const psi::Psi& coeff, + const Parallel_2D& pcoeff, + const int& naos, + const int& nocc, + const int& nvirt, + const Parallel_2D& pmat_mo, + double* mat_mo, + const bool add_on, + MO_TYPE type) + { + ModuleBase::TITLE("hamilt_lrtd", "ao_to_mo_pblas"); + assert(pmat_ao.comm() == pcoeff.comm() && pmat_ao.comm() == pmat_mo.comm()); + assert(pmat_ao.blacs_ctxt == pcoeff.blacs_ctxt && pmat_ao.blacs_ctxt == pmat_mo.blacs_ctxt); + assert(pmat_mo.get_local_size() > 0); + + const int nks = mat_ao.size(); + const int i1 = 1; + const int ivirt = nocc + 1; + const int nmo1 = type == MO_TYPE::VV ? nvirt : nocc; + const int nmo2 = type == MO_TYPE::OO ? nocc : nvirt; + const int imo1 = type == MO_TYPE::VV ? ivirt : i1; + const int imo2 = type == MO_TYPE::OO ? i1 : ivirt; + + Parallel_2D pVc; // for intermediate Vc + LR_Util::setup_2d_division(pVc, pmat_ao.get_block_size(), naos, nmo1, pmat_ao.blacs_ctxt); + for (int isk = 0;isk < nks;++isk) + { + const int start = isk * pmat_mo.get_local_size(); + coeff.fix_k(isk); + + //Vc + container::Tensor Vc(DAT::DT_DOUBLE, DEV::CpuDevice, { pVc.get_col_size(), pVc.get_row_size() });//row is "inside"(memory contiguity) for pblas + Vc.zero(); + + char transa = 'N'; + char transb = 'N'; + const double alpha = 1.0; + const double beta = add_on ? 1.0 : 0.0; + pdgemm_(&transa, &transb, &naos, &nmo1, &naos, + &alpha, mat_ao[isk].data(), &i1, &i1, pmat_ao.desc, + coeff.get_pointer(), &i1, &imo1, pcoeff.desc, + &beta, Vc.data(), &i1, &i1, pVc.desc); + + transa = 'T'; + // mat_mo = c ^ TVc + // descC puts M(nvirt) to row + pdgemm_(&transa, &transb, &nmo2, &nmo1, &naos, + &alpha, coeff.get_pointer(), &i1, &imo2, pcoeff.desc, + Vc.data(), &i1, &i1, pVc.desc, + &beta, mat_mo + start, &i1, &i1, pmat_mo.desc); + + } + } + + template<> + void ao_to_mo_pblas( + const std::vector& mat_ao, + const Parallel_2D& pmat_ao, + const psi::Psi>& coeff, + const Parallel_2D& pcoeff, + const int& naos, + const int& nocc, + const int& nvirt, + const Parallel_2D& pmat_mo, + std::complex* const mat_mo, + const bool add_on, + MO_TYPE type) + { + ModuleBase::TITLE("hamilt_lrtd", "cal_AX_plas"); + assert(pmat_ao.comm() == pcoeff.comm() && pmat_ao.comm() == pmat_mo.comm()); + assert(pmat_ao.blacs_ctxt == pcoeff.blacs_ctxt && pmat_ao.blacs_ctxt == pmat_mo.blacs_ctxt); + assert(pmat_mo.get_local_size() > 0); + + const int nks = mat_ao.size(); + const int i1 = 1; + const int ivirt = nocc + 1; + const int nmo1 = type == MO_TYPE::VV ? nvirt : nocc; + const int nmo2 = type == MO_TYPE::OO ? nocc : nvirt; + const int imo1 = type == MO_TYPE::VV ? ivirt : i1; + const int imo2 = type == MO_TYPE::OO ? i1 : ivirt; + + Parallel_2D pVc; // for intermediate Vc + LR_Util::setup_2d_division(pVc, pmat_ao.get_block_size(), naos, nmo1, pmat_ao.blacs_ctxt); + for (int isk = 0;isk < nks;++isk) + { + const int start = isk * pmat_mo.get_local_size(); + coeff.fix_k(isk); + + //Vc + container::Tensor Vc(DAT::DT_COMPLEX_DOUBLE, DEV::CpuDevice, { pVc.get_col_size(), pVc.get_row_size() }); + Vc.zero(); + + char transa = 'N'; + char transb = 'N'; + const std::complex alpha(1.0, 0.0); + const std::complex beta = add_on ? std::complex(1.0, 0.0) : std::complex(0.0, 0.0); + pzgemm_(&transa, &transb, &naos, &nmo1, &naos, + &alpha, mat_ao[isk].data>(), &i1, &i1, pmat_ao.desc, + coeff.get_pointer(), &i1, &imo1, pcoeff.desc, + &beta, Vc.data>(), &i1, &i1, pVc.desc); + + transa = 'C'; + // mat_mo = c ^ TVc + // descC puts M(nvirt) to row + pzgemm_(&transa, &transb, &nmo2, &nmo1, &naos, + &alpha, coeff.get_pointer(), &i1, &imo2, pcoeff.desc, + Vc.data>(), &i1, &i1, pVc.desc, + &beta, mat_mo + start, &i1, &i1, pmat_mo.desc); + } + } +} +#endif diff --git a/source/module_lr/ao_to_mo_transformer/ao_to_mo_serial.cpp b/source/module_lr/ao_to_mo_transformer/ao_to_mo_serial.cpp new file mode 100644 index 0000000000..82e5475f6c --- /dev/null +++ b/source/module_lr/ao_to_mo_transformer/ao_to_mo_serial.cpp @@ -0,0 +1,165 @@ +#include "ao_to_mo.h" +#include "module_base/blas_connector.h" +#include "module_base/tool_title.h" +#include "module_lr/utils/lr_util.h" +namespace LR +{ + template<> + void ao_to_mo_forloop_serial( + const std::vector& mat_ao, + const psi::Psi& coeff, + const int& nocc, + const int& nvirt, + double* mat_mo, + MO_TYPE type) + { + ModuleBase::TITLE("hamilt_lrtd", "ao_to_mo_forloop_serial"); + const int nks = mat_ao.size(); + const int naos = coeff.get_nbasis(); + const int nmo1 = type == MO_TYPE::VV ? nvirt : nocc; + const int nmo2 = type == MO_TYPE::OO ? nocc : nvirt; + const int imo1 = type == MO_TYPE::VV ? nocc : 0; + const int imo2 = type == MO_TYPE::OO ? 0 : nocc; + + ModuleBase::GlobalFunc::ZEROS(mat_mo, nks * nmo1 * nmo2); + + for (int isk = 0;isk < nks;++isk) + { + coeff.fix_k(isk); + const int start = isk * nmo1 * nmo2; + for (int p = 0;p < nmo1;++p) + { + for (int q = 0;q < nmo2;++q) + { + for (int nu = 0;nu < naos;++nu) + { + for (int mu = 0;mu < naos;++mu) + { + mat_mo[start + p * nmo2 + q] += coeff(imo2 + q, mu) * mat_ao[isk].data()[nu * naos + mu] * coeff(imo1 + p, nu); + } + } + } + } + } + } + template<> + void ao_to_mo_forloop_serial( + const std::vector& mat_ao, + const psi::Psi>& coeff, + const int& nocc, + const int& nvirt, + std::complex* const mat_mo, + MO_TYPE type) + { + ModuleBase::TITLE("hamilt_lrtd", "ao_to_mo_forloop_serial"); + const int nks = mat_ao.size(); + const int naos = coeff.get_nbasis(); + const int nmo1 = type == MO_TYPE::VV ? nvirt : nocc; + const int nmo2 = type == MO_TYPE::OO ? nocc : nvirt; + const int imo1 = type == MO_TYPE::VV ? nocc : 0; + const int imo2 = type == MO_TYPE::OO ? 0 : nocc; + + ModuleBase::GlobalFunc::ZEROS(mat_mo, nks * nmo1 * nmo2); + + for (int isk = 0;isk < nks;++isk) + { + coeff.fix_k(isk); + const int start = isk * nmo1 * nmo2; + for (int p = 0;p < nmo1;++p) + { + for (int q = 0;q < nmo2;++q) + { + for (int nu = 0;nu < naos;++nu) + { + for (int mu = 0;mu < naos;++mu) + { + mat_mo[start + p * nmo2 + q] += std::conj(coeff(imo2 + q, mu)) * mat_ao[isk].data>()[nu * naos + mu] * coeff(imo1 + p, nu); + } + } + } + } + } + } + template<> + void ao_to_mo_blas( + const std::vector& mat_ao, + const psi::Psi& coeff, + const int& nocc, + const int& nvirt, + double* mat_mo, + const bool add_on, + MO_TYPE type) + { + ModuleBase::TITLE("hamilt_lrtd", "ao_to_mo_blas"); + const int nks = mat_ao.size(); + const int naos = coeff.get_nbasis(); + const int nmo1 = type == MO_TYPE::VV ? nvirt : nocc; + const int nmo2 = type == MO_TYPE::OO ? nocc : nvirt; + const int imo1 = type == MO_TYPE::VV ? nocc : 0; + const int imo2 = type == MO_TYPE::OO ? 0 : nocc; + + for (int isk = 0;isk < nks;++isk) + { + coeff.fix_k(isk); + const int start = isk * nmo1 * nmo2; + + // Vc[naos*nocc] + container::Tensor Vc(DAT::DT_DOUBLE, DEV::CpuDevice, { nmo1, naos });// (Vc)^T + Vc.zero(); + char transa = 'N'; + char transb = 'N'; //coeff is col major + const double alpha = 1.0; + const double beta = add_on ? 1.0 : 0.0; + dgemm_(&transa, &transb, &naos, &nmo1, &naos, &alpha, + mat_ao[isk].data(), &naos, coeff.get_pointer(imo1), &naos, &beta, + Vc.data(), &naos); + + transa = 'T'; + //mat_mo=coeff^TVc (nvirt major) + dgemm_(&transa, &transb, &nmo2, &nmo1, &naos, &alpha, + coeff.get_pointer(imo2), &naos, Vc.data(), &naos, &beta, + mat_mo + start, &nmo2); + } + } + template<> + void ao_to_mo_blas( + const std::vector& mat_ao, + const psi::Psi>& coeff, + const int& nocc, + const int& nvirt, + std::complex* const mat_mo, + const bool add_on, + MO_TYPE type) + { + ModuleBase::TITLE("hamilt_lrtd", "ao_to_mo_blas"); + const int nks = mat_ao.size(); + const int naos = coeff.get_nbasis(); + const int nmo1 = type == MO_TYPE::VV ? nvirt : nocc; + const int nmo2 = type == MO_TYPE::OO ? nocc : nvirt; + const int imo1 = type == MO_TYPE::VV ? nocc : 0; + const int imo2 = type == MO_TYPE::OO ? 0 : nocc; + + for (int isk = 0;isk < nks;++isk) + { + coeff.fix_k(isk); + const int start = isk * nmo1 * nmo2; + + // Vc[naos*nocc] (V is hermitian) + container::Tensor Vc(DAT::DT_COMPLEX_DOUBLE, DEV::CpuDevice, { nmo1, naos });// (Vc)^T + Vc.zero(); + char transa = 'N'; + char transb = 'N'; //coeff is col major + const std::complex alpha(1.0, 0.0); + const std::complex beta = add_on ? std::complex(1.0, 0.0) : std::complex(0.0, 0.0); + zgemm_(&transa, &transb, &naos, &nmo1, &naos, &alpha, + mat_ao[isk].data>(), &naos, coeff.get_pointer(imo1), &naos, &beta, + Vc.data>(), &naos); + + transa = 'C'; + //mat_mo=coeff^\dagger Vc (nvirt major) + zgemm_(&transa, &transb, &nmo2, &nmo1, &naos, &alpha, + coeff.get_pointer(imo2), &naos, Vc.data>(), &naos, &beta, + mat_mo + start, &nmo2); + } + } +} \ No newline at end of file diff --git a/source/module_lr/ao_to_mo_transformer/test/CMakeLists.txt b/source/module_lr/ao_to_mo_transformer/test/CMakeLists.txt new file mode 100644 index 0000000000..e8e6c4b2f4 --- /dev/null +++ b/source/module_lr/ao_to_mo_transformer/test/CMakeLists.txt @@ -0,0 +1,6 @@ +remove_definitions(-DUSE_LIBXC) +AddTest( + TARGET ao_to_mo_test + LIBS parameter base ${math_libs} container device psi + SOURCES ao_to_mo_test.cpp ../../utils/lr_util.cpp ../ao_to_mo_parallel.cpp ../ao_to_mo_serial.cpp +) \ No newline at end of file diff --git a/source/module_lr/ao_to_mo_transformer/test/ao_to_mo_test.cpp b/source/module_lr/ao_to_mo_transformer/test/ao_to_mo_test.cpp new file mode 100644 index 0000000000..5601ad451d --- /dev/null +++ b/source/module_lr/ao_to_mo_transformer/test/ao_to_mo_test.cpp @@ -0,0 +1,280 @@ +#include +#ifdef __MPI +#include "mpi.h" +#endif +#include "../ao_to_mo.h" + +#include "module_lr/utils/lr_util.h" + +#define rand01 (static_cast(rand()) / static_cast(RAND_MAX) - 0.5 ) +struct matsize +{ + int nks = 1; + int naos = 2; + int nocc = 1; + int nvirt = 1; + int nb = 1; + matsize(int nks, int naos, int nocc, int nvirt, int nb = 1) + :nks(nks), naos(naos), nocc(nocc), nvirt(nvirt), nb(nb) { + assert(nocc + nvirt <= naos); + }; +}; + +class AO2MOTest : public testing::Test +{ +public: + std::vector sizes{ + // {2, 3, 2, 1}, + {2, 13, 7, 4}, + // {2, 14, 8, 5} + }; + int nstate = 2; + std::ofstream ofs_running; + int my_rank = 0; +#ifdef __MPI + void SetUp() override + { + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + this->ofs_running.open("log" + std::to_string(my_rank) + ".txt"); + ofs_running << "my_rank = " << my_rank << std::endl; + } + void TearDown() override + { + ofs_running.close(); + } +#endif + + void set_ones(double* data, int size) { for (int i = 0;i < size;++i) { data[i] = 1.0; } }; + void set_int(double* data, int size) { for (int i = 0;i < size;++i) { data[i] = static_cast(i + 1); } }; + void set_int(std::complex* data, int size) { for (int i = 0;i < size;++i) { data[i] = std::complex(i + 1, -i - 1); } }; + void set_rand(double* data, int size) { for (int i = 0;i < size;++i) { data[i] = rand01 * 10; } }; + void set_rand(std::complex* data, int size) { for (int i = 0;i < size;++i) { data[i] = std::complex(rand01 * 10, rand01 * 10); } }; + void check_eq(double* data1, double* data2, int size) { for (int i = 0;i < size;++i) { EXPECT_NEAR(data1[i], data2[i], 1e-10); } }; + void check_eq(std::complex* data1, std::complex* data2, int size) + { + for (int i = 0;i < size;++i) + { + EXPECT_NEAR(data1[i].real(), data2[i].real(), 1e-10); + EXPECT_NEAR(data1[i].imag(), data2[i].imag(), 1e-10); + } + }; +}; + +TEST_F(AO2MOTest, DoubleSerial) +{ + for (auto s : this->sizes) + { + psi::Psi vo_for(s.nks, nstate, s.nocc * s.nvirt, nullptr, false); + psi::Psi vo_blas(s.nks, nstate, s.nocc * s.nvirt, nullptr, false); + psi::Psi oo_for(s.nks, nstate, s.nocc * s.nocc, nullptr, false); + psi::Psi oo_blas(s.nks, nstate, s.nocc * s.nocc, nullptr, false); + psi::Psi vv_for(s.nks, nstate, s.nvirt * s.nvirt, nullptr, false); + psi::Psi vv_blas(s.nks, nstate, s.nvirt * s.nvirt, nullptr, false); + int size_c = s.nks * (s.nocc + s.nvirt) * s.naos; + int size_v = s.naos * s.naos; + for (int istate = 0;istate < nstate;++istate) + { + std::vector temp(s.nks, s.naos); + psi::Psi c(s.nks, s.nocc + s.nvirt, s.naos, temp.data(), true); + std::vector V(s.nks, container::Tensor(DAT::DT_DOUBLE, DEV::CpuDevice, { s.naos, s.naos })); + set_rand(&c(0, 0, 0), size_c); + for (auto& v : V) { set_rand(v.data(), size_v); } + LR::ao_to_mo_forloop_serial(V, c, s.nocc, s.nvirt, &vo_for(istate, 0, 0)); + LR::ao_to_mo_blas(V, c, s.nocc, s.nvirt, &vo_blas(istate, 0, 0), false); + LR::ao_to_mo_forloop_serial(V, c, s.nocc, s.nvirt, &oo_for(istate, 0, 0), LR::MO_TYPE::OO); + LR::ao_to_mo_blas(V, c, s.nocc, s.nvirt, &oo_blas(istate, 0, 0), false, LR::MO_TYPE::OO); + LR::ao_to_mo_forloop_serial(V, c, s.nocc, s.nvirt, &vv_for(istate, 0, 0), LR::MO_TYPE::VV); + LR::ao_to_mo_blas(V, c, s.nocc, s.nvirt, &vv_blas(istate, 0, 0), false, LR::MO_TYPE::VV); + } + check_eq(&vo_for(0, 0, 0), &vo_blas(0, 0, 0), nstate * s.nks * s.nocc * s.nvirt); + check_eq(&oo_for(0, 0, 0), &oo_blas(0, 0, 0), nstate * s.nks * s.nocc * s.nocc); + check_eq(&vv_for(0, 0, 0), &vv_blas(0, 0, 0), nstate * s.nks * s.nvirt * s.nvirt); + } +} + +TEST_F(AO2MOTest, ComplexSerial) +{ + for (auto s : this->sizes) + { + psi::Psi> vo_for(s.nks, nstate, s.nocc * s.nvirt, nullptr, false); + psi::Psi> vo_blas(s.nks, nstate, s.nocc * s.nvirt, nullptr, false); + psi::Psi> oo_for(s.nks, nstate, s.nocc * s.nocc, nullptr, false); + psi::Psi> oo_blas(s.nks, nstate, s.nocc * s.nocc, nullptr, false); + psi::Psi> vv_for(s.nks, nstate, s.nvirt * s.nvirt, nullptr, false); + psi::Psi> vv_blas(s.nks, nstate, s.nvirt * s.nvirt, nullptr, false); + int size_c = s.nks * (s.nocc + s.nvirt) * s.naos; + int size_v = s.naos * s.naos; + for (int istate = 0;istate < nstate;++istate) + { + std::vector temp(s.nks, s.naos); + psi::Psi> c(s.nks, s.nocc + s.nvirt, s.naos, temp.data(), true); + std::vector V(s.nks, container::Tensor(DAT::DT_COMPLEX_DOUBLE, DEV::CpuDevice, { s.naos, s.naos })); + set_rand(&c(0, 0, 0), size_c); + for (auto& v : V) { set_rand(v.data>(), size_v); } + LR::ao_to_mo_forloop_serial(V, c, s.nocc, s.nvirt, &vo_for(istate, 0, 0)); + LR::ao_to_mo_blas(V, c, s.nocc, s.nvirt, &vo_blas(istate, 0, 0), false); + LR::ao_to_mo_forloop_serial(V, c, s.nocc, s.nvirt, &oo_for(istate, 0, 0), LR::MO_TYPE::OO); + LR::ao_to_mo_blas(V, c, s.nocc, s.nvirt, &oo_blas(istate, 0, 0), false, LR::MO_TYPE::OO); + LR::ao_to_mo_forloop_serial(V, c, s.nocc, s.nvirt, &vv_for(istate, 0, 0), LR::MO_TYPE::VV); + LR::ao_to_mo_blas(V, c, s.nocc, s.nvirt, &vv_blas(istate, 0, 0), false, LR::MO_TYPE::VV); + } + check_eq(&vo_for(0, 0, 0), &vo_blas(0, 0, 0), nstate * s.nks * s.nocc * s.nvirt); + check_eq(&oo_for(0, 0, 0), &oo_blas(0, 0, 0), nstate * s.nks * s.nocc * s.nocc); + check_eq(&vv_for(0, 0, 0), &vv_blas(0, 0, 0), nstate * s.nks * s.nvirt * s.nvirt); + } +} +#ifdef __MPI +TEST_F(AO2MOTest, DoubleParallel) +{ + for (auto s : this->sizes) + { + // c: nao*nbands in para2d, nbands*nao in psi (row-para and constructed: nao) + // X: nvirt*nocc in para2d, nocc*nvirt in psi (row-para and constructed: nvirt) + Parallel_2D pV; + LR_Util::setup_2d_division(pV, s.nb, s.naos, s.naos); + std::vector V(s.nks, container::Tensor(DAT::DT_DOUBLE, DEV::CpuDevice, { pV.get_col_size(), pV.get_row_size() })); + Parallel_2D pc; + LR_Util::setup_2d_division(pc, s.nb, s.naos, s.nocc + s.nvirt, pV.blacs_ctxt); + + std::vector ngk_temp(s.nks, pc.get_row_size()); + psi::Psi c(s.nks, pc.get_col_size(), pc.get_row_size(), ngk_temp.data(), true); + Parallel_2D pvo, poo, pvv; + LR_Util::setup_2d_division(pvo, s.nb, s.nvirt, s.nocc, pV.blacs_ctxt); + LR_Util::setup_2d_division(poo, s.nb, s.nocc, s.nocc, pV.blacs_ctxt); + LR_Util::setup_2d_division(pvv, s.nb, s.nvirt, s.nvirt, pV.blacs_ctxt); + + EXPECT_EQ(pV.dim0, pc.dim0); + EXPECT_EQ(pV.dim1, pc.dim1); + EXPECT_GE(s.nvirt, pvo.dim0); + EXPECT_GE(s.nocc, pvo.dim1); + EXPECT_GE(s.naos, pc.dim0); + psi::Psi vo_pblas_loc(s.nks, nstate, pvo.get_local_size(), nullptr, false); + psi::Psi vo_gather(s.nks, nstate, s.nocc * s.nvirt, nullptr, false); + psi::Psi oo_pblas_loc(s.nks, nstate, poo.get_local_size(), nullptr, false); + psi::Psi oo_gather(s.nks, nstate, s.nocc * s.nocc, nullptr, false); + psi::Psi vv_pblas_loc(s.nks, nstate, pvv.get_local_size(), nullptr, false); + psi::Psi vv_gather(s.nks, nstate, s.nvirt * s.nvirt, nullptr, false); + for (int istate = 0;istate < nstate;++istate) + { + for (int isk = 0;isk < s.nks;++isk) + { + set_rand(V.at(isk).data(), pV.get_local_size()); + set_rand(&c(isk, 0, 0), pc.get_local_size()); + } + LR::ao_to_mo_pblas(V, pV, c, pc, s.naos, s.nocc, s.nvirt, pvo, &vo_pblas_loc(istate, 0, 0), false); + LR::ao_to_mo_pblas(V, pV, c, pc, s.naos, s.nocc, s.nvirt, poo, &oo_pblas_loc(istate, 0, 0), false, LR::MO_TYPE::OO); + LR::ao_to_mo_pblas(V, pV, c, pc, s.naos, s.nocc, s.nvirt, pvv, &vv_pblas_loc(istate, 0, 0), false, LR::MO_TYPE::VV); + // gather AX and output + for (int isk = 0;isk < s.nks;++isk) + { + LR_Util::gather_2d_to_full(pvo, &vo_pblas_loc(istate, isk, 0), &vo_gather(istate, isk, 0), false/*pblas: row first*/, s.nvirt, s.nocc); + LR_Util::gather_2d_to_full(poo, &oo_pblas_loc(istate, isk, 0), &oo_gather(istate, isk, 0), false/*pblas: row first*/, s.nocc, s.nocc); + LR_Util::gather_2d_to_full(pvv, &vv_pblas_loc(istate, isk, 0), &vv_gather(istate, isk, 0), false/*pblas: row first*/, s.nvirt, s.nvirt); + } + // compare to global AX + std::vector V_full(s.nks, container::Tensor(DAT::DT_DOUBLE, DEV::CpuDevice, { s.naos, s.naos })); + std::vector ngk_temp_1(s.nks, s.naos); + psi::Psi c_full(s.nks, s.nocc + s.nvirt, s.naos, ngk_temp_1.data(), true); + for (int isk = 0;isk < s.nks;++isk) + { + LR_Util::gather_2d_to_full(pV, V.at(isk).data(), V_full.at(isk).data(), false, s.naos, s.naos); + LR_Util::gather_2d_to_full(pc, &c(isk, 0, 0), &c_full(isk, 0, 0), false, s.naos, s.nocc + s.nvirt); + } + if (my_rank == 0) + { + psi::Psi vo_full_istate(s.nks, 1, s.nocc * s.nvirt, nullptr, false); + LR::ao_to_mo_blas(V_full, c_full, s.nocc, s.nvirt, &vo_full_istate(0, 0, 0), false); + check_eq(&vo_full_istate(0, 0, 0), &vo_gather(istate, 0, 0), s.nks * s.nocc * s.nvirt); + psi::Psi oo_full_istate(s.nks, 1, s.nocc * s.nocc, nullptr, false); + LR::ao_to_mo_blas(V_full, c_full, s.nocc, s.nvirt, &oo_full_istate(0, 0, 0), false, LR::MO_TYPE::OO); + check_eq(&oo_full_istate(0, 0, 0), &oo_gather(istate, 0, 0), s.nks * s.nocc * s.nocc); + psi::Psi vv_full_istate(s.nks, 1, s.nvirt * s.nvirt, nullptr, false); + LR::ao_to_mo_blas(V_full, c_full, s.nocc, s.nvirt, &vv_full_istate(0, 0, 0), false, LR::MO_TYPE::VV); + check_eq(&vv_full_istate(0, 0, 0), &vv_gather(istate, 0, 0), s.nks * s.nvirt * s.nvirt); + } + } + } +} +TEST_F(AO2MOTest, ComplexParallel) +{ + for (auto s : this->sizes) + { + // c: nao*nbands in para2d, nbands*nao in psi (row-para and constructed: nao) + // X: nvirt*nocc in para2d, nocc*nvirt in psi (row-para and constructed: nvirt) + Parallel_2D pV; + LR_Util::setup_2d_division(pV, s.nb, s.naos, s.naos); + std::vector V(s.nks, container::Tensor(DAT::DT_COMPLEX_DOUBLE, DEV::CpuDevice, { pV.get_col_size(), pV.get_row_size() })); + Parallel_2D pc; + LR_Util::setup_2d_division(pc, s.nb, s.naos, s.nocc + s.nvirt, pV.blacs_ctxt); + + std::vector ngk_temp_1(s.nks, pc.get_row_size()); + psi::Psi> c(s.nks, pc.get_col_size(), pc.get_row_size(), ngk_temp_1.data(), true); + Parallel_2D pvo, poo, pvv; + LR_Util::setup_2d_division(pvo, s.nb, s.nvirt, s.nocc, pV.blacs_ctxt); + LR_Util::setup_2d_division(poo, s.nb, s.nocc, s.nocc, pV.blacs_ctxt); + LR_Util::setup_2d_division(pvv, s.nb, s.nvirt, s.nvirt, pV.blacs_ctxt); + + psi::Psi> vo_pblas_loc(s.nks, nstate, pvo.get_local_size(), nullptr, false); + psi::Psi> vo_gather(s.nks, nstate, s.nocc * s.nvirt, nullptr, false); + psi::Psi> oo_pblas_loc(s.nks, nstate, poo.get_local_size(), nullptr, false); + psi::Psi> oo_gather(s.nks, nstate, s.nocc * s.nocc, nullptr, false); + psi::Psi> vv_pblas_loc(s.nks, nstate, pvv.get_local_size(), nullptr, false); + psi::Psi> vv_gather(s.nks, nstate, s.nvirt * s.nvirt, nullptr, false); + for (int istate = 0;istate < nstate;++istate) + { + for (int isk = 0;isk < s.nks;++isk) + { + set_rand(V.at(isk).data>(), pV.get_local_size()); + set_rand(&c(isk, 0, 0), pc.get_local_size()); + } + LR::ao_to_mo_pblas(V, pV, c, pc, s.naos, s.nocc, s.nvirt, pvo, &vo_pblas_loc(istate, 0, 0), false); + LR::ao_to_mo_pblas(V, pV, c, pc, s.naos, s.nocc, s.nvirt, poo, &oo_pblas_loc(istate, 0, 0), false, LR::MO_TYPE::OO); + LR::ao_to_mo_pblas(V, pV, c, pc, s.naos, s.nocc, s.nvirt, pvv, &vv_pblas_loc(istate, 0, 0), false, LR::MO_TYPE::VV); + + // gather AX and output + for (int isk = 0;isk < s.nks;++isk) + { + LR_Util::gather_2d_to_full(pvo, &vo_pblas_loc(istate, isk, 0), &vo_gather(istate, isk, 0), false/*pblas: row first*/, s.nvirt, s.nocc); + LR_Util::gather_2d_to_full(poo, &oo_pblas_loc(istate, isk, 0), &oo_gather(istate, isk, 0), false/*pblas: row first*/, s.nocc, s.nocc); + LR_Util::gather_2d_to_full(pvv, &vv_pblas_loc(istate, isk, 0), &vv_gather(istate, isk, 0), false/*pblas: row first*/, s.nvirt, s.nvirt); + } + // compare to global AX + std::vector V_full(s.nks, container::Tensor(DAT::DT_COMPLEX_DOUBLE, DEV::CpuDevice, { s.naos, s.naos })); + std::vector ngk_temp_2(s.nks, s.naos); + psi::Psi> c_full(s.nks, s.nocc + s.nvirt, s.naos, ngk_temp_2.data(), true); + for (int isk = 0;isk < s.nks;++isk) + { + LR_Util::gather_2d_to_full(pV, V.at(isk).data>(), V_full.at(isk).data>(), false, s.naos, s.naos); + LR_Util::gather_2d_to_full(pc, &c(isk, 0, 0), &c_full(isk, 0, 0), false, s.naos, s.nocc + s.nvirt); + } + if (my_rank == 0) + { + psi::Psi> vo_full_istate(s.nks, 1, s.nocc * s.nvirt, nullptr, false); + LR::ao_to_mo_blas(V_full, c_full, s.nocc, s.nvirt, &vo_full_istate(0, 0, 0), false); + check_eq(&vo_full_istate(0, 0, 0), &vo_gather(istate, 0, 0), s.nks * s.nocc * s.nvirt); + psi::Psi> oo_full_istate(s.nks, 1, s.nocc * s.nocc, nullptr, false); + LR::ao_to_mo_blas(V_full, c_full, s.nocc, s.nocc, &oo_full_istate(0, 0, 0), false, LR::MO_TYPE::OO); + check_eq(&oo_full_istate(0, 0, 0), &oo_gather(istate, 0, 0), s.nks * s.nocc * s.nocc); + psi::Psi> vv_full_istate(s.nks, 1, s.nvirt * s.nvirt, nullptr, false); + LR::ao_to_mo_blas(V_full, c_full, s.nocc, s.nvirt, &vv_full_istate(0, 0, 0), false, LR::MO_TYPE::VV); + check_eq(&vv_full_istate(0, 0, 0), &vv_gather(istate, 0, 0), s.nks * s.nvirt * s.nvirt); + } + } + } +} +#endif + + +int main(int argc, char** argv) +{ + srand(time(nullptr)); // for random number generator +#ifdef __MPI + MPI_Init(&argc, &argv); +#endif + testing::InitGoogleTest(&argc, argv); + int result = RUN_ALL_TESTS(); +#ifdef __MPI + MPI_Finalize(); +#endif + return result; +} diff --git a/source/module_lr/operator_casida/operator_lr_hxc.cpp b/source/module_lr/operator_casida/operator_lr_hxc.cpp index 22af349ca1..ed55c7c0c7 100644 --- a/source/module_lr/operator_casida/operator_lr_hxc.cpp +++ b/source/module_lr/operator_casida/operator_lr_hxc.cpp @@ -8,7 +8,7 @@ #include "module_lr/utils/lr_util_print.h" // #include "module_hamilt_lcao/hamilt_lcaodft/DM_gamma_2d_to_grid.h" #include "module_hamilt_lcao/module_hcontainer/hcontainer_funcs.h" -#include "module_lr/AX/AX.h" +#include "module_lr/ao_to_mo_transformer/ao_to_mo.h" #include "module_hamilt_pw/hamilt_pwdft/global.h" inline double conj(double a) { return a; } @@ -43,9 +43,9 @@ namespace LR // 5. [AX]^{Hxc}_{ai}=\sum_{\mu,\nu}c^*_{a,\mu,}V^{Hxc}_{\mu,\nu}c_{\nu,i} #ifdef __MPI - cal_AX_pblas(v_hxc_2d, this->pmat, psil_ks, this->pc, naos, nocc[sl], nvirt[sl], this->pX[sl], hpsi); + ao_to_mo_pblas(v_hxc_2d, this->pmat, psil_ks, this->pc, naos, nocc[sl], nvirt[sl], this->pX[sl], hpsi); #else - cal_AX_blas(v_hxc_2d, psil_ks, nocc[sl], nvirt[sl], hpsi); + ao_to_mo_blas(v_hxc_2d, psil_ks, nocc[sl], nvirt[sl], hpsi); #endif }