pyscf
diff --git a/‎examples/00-h2o.py‎
Lines changed: 2 additions & 2 deletions b/‎examples/00-h2o.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎gpu4pyscf/__config__.py‎
Lines changed: 8 additions & 0 deletions b/‎gpu4pyscf/__config__.py‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎gpu4pyscf/df/df.py‎
Lines changed: 13 additions & 17 deletions b/‎gpu4pyscf/df/df.py‎
Lines changed: 13 additions & 17 deletions
diff --git a/‎gpu4pyscf/df/df_jk.py‎
Lines changed: 8 additions & 9 deletions b/‎gpu4pyscf/df/df_jk.py‎
Lines changed: 8 additions & 9 deletions
diff --git a/‎gpu4pyscf/df/grad/jk.py‎
Lines changed: 4 additions & 3 deletions b/‎gpu4pyscf/df/grad/jk.py‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎gpu4pyscf/df/hessian/rhf.py‎
Lines changed: 30 additions & 24 deletions b/‎gpu4pyscf/df/hessian/rhf.py‎
Lines changed: 30 additions & 24 deletions
diff --git a/‎gpu4pyscf/df/hessian/uhf.py‎
Lines changed: 12 additions & 12 deletions b/‎gpu4pyscf/df/hessian/uhf.py‎
Lines changed: 12 additions & 12 deletions
@@ -36,12 +36,12 @@
     atom=atom,                         # water molecule
     basis='def2-tzvpp',                # basis set
     output='./pyscf.log',              # save log file
-    verbose=6                          # control the level of print info
+    verbose=6                         # control the level of print info
     )
 
 mf_GPU = rks.RKS(                      # restricted Kohn-Sham DFT
     mol,                               # pyscf.gto.object
-    xc='b3lyp'                         # xc funtionals, such as pbe0, wb97m-v, tpss,
+    xc='b3lyp'                        # xc functionals, such as pbe0, wb97m-v, tpss,
     ).density_fit()                    # density fitting
 
 mf_GPU.grids.atom_grid = (99,590)      # (99,590) lebedev grids, (75,302) is often enough
 
@@ -24,3 +24,11 @@
 mem_fraction = 0.9
 cupy.get_default_memory_pool().set_limit(fraction=mem_fraction)
 
+# Check whether P2P data transfer is available between every pair of devices
+_p2p_access = True  # stays True when there is at most one device
+if _num_devices > 1:
+    for src in range(_num_devices):
+        for dst in range(_num_devices):
+            if src != dst:  # every ordered pair of distinct devices is queried
+                can_access_peer = cupy.cuda.runtime.deviceCanAccessPeer(src, dst)
+                _p2p_access &= can_access_peer  # NOTE(review): result is presumably an int flag, so this may end up 0/1 instead of a bool - confirm
@@ -21,7 +21,7 @@
 from cupyx.scipy.linalg import solve_triangular
 from pyscf import lib
 from pyscf.df import df, addons, incore
-from gpu4pyscf.lib.cupy_helper import cholesky, tag_array, get_avail_mem, cart2sph
+from gpu4pyscf.lib.cupy_helper import cholesky, tag_array, get_avail_mem, cart2sph, p2p_transfer
 from gpu4pyscf.df import int3c2e, df_jk
 from gpu4pyscf.lib import logger
 from gpu4pyscf import __config__
@@ -177,10 +177,10 @@ def loop(self, blksize=None, unpack=True):
             yield buf2, buf.T
             if isinstance(cderi_sparse, np.ndarray):
                 cupy.cuda.Device().synchronize()
-
+            
             if buf_prefetch is not None:
                 buf = buf_prefetch
-
+            
     def reset(self, mol=None):
         '''Reset mol and clean up relevant attributes for scanner mode'''
         if mol is not None:
@@ -208,13 +208,14 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
     npairs = len(intopt.cderi_row)
     log = logger.new_logger(mol, mol.verbose)
 
-    # if the matrix exceeds the limit, store CDERI in CPU memory
-    # TODO: better estimate of memory consumption for each device
+    # Available memory on Device 0.
     avail_mem = get_avail_mem()
 
     if use_gpu_memory:
-        # If GPU memory is not enough
-        use_gpu_memory = naux * npairs * 8 < 0.4 * avail_mem
+        # CDERI will be equally distributed to the devices
+        # Other devices usually have more memory available than Device 0
+        # CDERI will use up to 40% of the available memory
+        use_gpu_memory = naux * npairs * 8 < 0.4 * avail_mem * _num_devices
 
     if use_gpu_memory:
         log.debug("Saving CDERI on GPU")
@@ -244,9 +245,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
     cd_low_f = cupy.array(cd_low, order='F', copy=False)
     cd_low_f = tag_array(cd_low_f, tag=cd_low.tag)
 
-    for gpu_id in range(_num_devices):
-        cupy.cuda.Device(gpu_id).synchronize()
-
+    cupy.cuda.get_current_stream().synchronize()
     futures = []
     with ThreadPoolExecutor(max_workers=_num_devices) as executor:
         for device_id in range(_num_devices):
@@ -258,9 +257,6 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
     for future in futures:
         future.result()
 
-    for device_id in range(_num_devices):
-        cupy.cuda.Device(device_id).synchronize()
-
     if not use_gpu_memory:
         cupy.cuda.Device().synchronize()
 
@@ -344,14 +340,14 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, omega=None, sr_only=False, de
             # if CDERI is saved on CPU
             ij0 = pairs_loc[cp_ij_id]
             ij1 = pairs_loc[cp_ij_id+1]
-            if isinstance(_cderi, np.ndarray):
+            if isinstance(_cderi[0], np.ndarray):
                 for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)):
                     for i in range(p0,p1):
-                        cderi_block[i].get(out=_cderi[slice_id][i,ij0:ij1])
+                        cderi_block[i].get(out=_cderi[slice_id][i-p0,ij0:ij1])
             else:
                 # Copy data to other Devices
                 for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, blksize)):
-                    _cderi[slice_id][:,ij0:ij1] = cderi_block[p0:p1]
-            
+                    #_cderi[slice_id][:,ij0:ij1] = cderi_block[p0:p1]
+                    p2p_transfer(_cderi[slice_id][:,ij0:ij1], cderi_block[p0:p1])
             t1 = log.timer_debug1(f'transfer data for {cp_ij_id} / {nq} on Device {device_id}', *t1)
     return
@@ -298,8 +298,7 @@ def _jk_task_with_mo(dfobj, dms, mo_coeff, mo_occ,
                         rhok = rhok.reshape([-1,nao])
                         vk[i] += cupy.dot(rhok.T, rhok)
                     rhok = None
-                cupy.cuda.get_current_stream().synchronize()
-                
+
             if with_j:
                 vj = cupy.zeros(dms_shape)
                 vj[:,rows,cols] = vj_packed
@@ -390,13 +389,12 @@ def _jk_task_with_dm(dfobj, dms, with_j=True, with_k=True, hermi=0, device_id=0)
             else:
                 dm_sparse *= 2
             dm_sparse[:, intopt.cderi_diag] *= .5
-        
+            vj_sparse = cupy.zeros_like(dm_sparse)
+
         if with_k:
             vk = cupy.zeros_like(dms)
 
         nset = dms.shape[0]
-        if with_j:
-            vj_sparse = cupy.zeros_like(dm_sparse)
         blksize = dfobj.get_blksize()
         for cderi, cderi_sparse in dfobj.loop(blksize=blksize, unpack=with_k):
             if with_j:
@@ -406,7 +404,7 @@ def _jk_task_with_dm(dfobj, dms, with_j=True, with_k=True, hermi=0, device_id=0)
                 for k in range(nset):
                     rhok = contract('Lij,jk->Lki', cderi, dms[k]).reshape([-1,nao])
                     #vk[k] += contract('Lki,Lkj->ij', rhok, cderi)
-                    vk[k] += cupy.dot(rhok.T, cderi.reshape([-1,nao]))
+                    vk[k] += cupy.dot(rhok.T, cderi.reshape([-1,nao]))            
         if with_j:
             vj = cupy.zeros(dms_shape)
             vj[:,rows,cols] = vj_sparse
@@ -445,6 +443,7 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-
     intopt = dfobj.intopt
     dms = intopt.sort_orbitals(dms, axis=[1,2])
 
+    cupy.cuda.get_current_stream().synchronize()
     if getattr(dms_tag, 'mo_coeff', None) is not None:
         mo_occ = dms_tag.mo_occ
         mo_coeff = dms_tag.mo_coeff
@@ -498,13 +497,13 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-
     vj = vk = None
     if with_j:
         vj = [future.result()[0] for future in futures]
-        vj = reduce_to_device(vj)
+        vj = reduce_to_device(vj, inplace=True)
         vj = intopt.unsort_orbitals(vj, axis=[1,2])
         vj = vj.reshape(out_shape)
-    
+
     if with_k:
         vk = [future.result()[1] for future in futures]
-        vk = reduce_to_device(vk)
+        vk = reduce_to_device(vk, inplace=True)
         vk = intopt.unsort_orbitals(vk, axis=[1,2])
         vk = vk.reshape(out_shape)
 
 
@@ -15,7 +15,7 @@
 
 from concurrent.futures import ThreadPoolExecutor
 import cupy
-from gpu4pyscf.lib.cupy_helper import contract
+from gpu4pyscf.lib.cupy_helper import contract, concatenate
 from gpu4pyscf.lib import logger
 from gpu4pyscf.__config__ import _streams, _num_devices
 
@@ -58,6 +58,7 @@ def get_rhoj_rhok(with_df, dm, orbo, with_j=True, with_k=True):
     ''' Calculate rhoj and rhok on Multi-GPU system
     '''
     futures = []
+    cupy.cuda.get_current_stream().synchronize()
     with ThreadPoolExecutor(max_workers=_num_devices) as executor:
         for device_id in range(_num_devices):
             future = executor.submit(
@@ -74,8 +75,8 @@ def get_rhoj_rhok(with_df, dm, orbo, with_j=True, with_k=True):
 
     rhoj = rhok = None
     if with_j:
-        rhoj = cupy.concatenate(rhoj_total)
+        rhoj = concatenate(rhoj_total)
     if with_k:
-        rhok = cupy.concatenate(rhok_total)
+        rhok = concatenate(rhok_total)
 
     return rhoj, rhok
@@ -54,6 +54,29 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
                                    atmlst, max_memory, verbose, True)
     return e1 + ej - ek
 
+def _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2):
+    ''' hk contributions due to (10|0)(0|10) + (10|0)(0|01), summed block by block over
+    the leading axis of rhok1_Pko; returns a new cupy array of shape [nao,nao,3,3] '''
+    nnz = rhok1_Pko.shape[0]  # length of the axis that is processed in blocks
+    nao = dm0.shape[0]
+    mem_avail = get_avail_mem()
+    blksize = max(int(mem_avail*0.4/(nao*nao*3*8)/ALIGNED), 1)*ALIGNED  # at least ALIGNED: a zero step would break lib.prange
+    hk_ao_ao = cupy.zeros([nao,nao,3,3])
+    for k0, k1 in lib.prange(0,nnz,blksize):
+        rhok1_Pko_kslice = cupy.asarray(rhok1_Pko[k0:k1])  # one block of rows as a cupy array
+        # NOTE(review): blksize budgets nao*nao*3 8-byte values per row against 40% of mem_avail - confirm vs. real peak usage
+        # (10|0)(0|10) without response of RI basis
+        vk2_ip1_ip1 = contract('piox,pkoy->ikxy', rhok1_Pko_kslice, rhok1_Pko_kslice)
+        hk_ao_ao += contract('ikxy,ik->ikxy', vk2_ip1_ip1, dm0)  # weight element-wise by dm0[i,k]
+        vk2_ip1_ip1 = None  # drop the reference before building the next temporary
+        # the second term first contracts the o index of the block with mocc_2
+        # (10|0)(0|01) without response of RI basis
+        rhok1_Pkl_kslice = contract('piox,ko->pikx', rhok1_Pko_kslice, mocc_2)
+        hk_ao_ao += contract('pikx,pkiy->ikxy', rhok1_Pkl_kslice, rhok1_Pkl_kslice)
+        rhok1_Pkl_kslice = None
+    return hk_ao_ao
+
+
 def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
                       atmlst=None, max_memory=4000, verbose=None, with_k=True, omega=None):
     '''Partial derivative
@@ -94,7 +117,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     # ================================ sorted AO begin ===============================================
     intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
     intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE)
-    naux = auxmol.nao #len(aux_ao_idx)
+    naux = auxmol.nao
     mocc_2 = intopt.sort_orbitals(mocc_2, axis=[0])
     dm0 = intopt.sort_orbitals(dm0, axis=[0,1])
     dm0_tag = tag_array(dm0, occ_coeff=mocc_2)
@@ -118,7 +141,6 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     rhoj0_P = solve_j2c(wj)
     rhok0_P__ = solve_j2c(wk_P__)
     wj = wk_P__ = None
-    t1 = log.timer_debug1('intermediate variables with int3c2e', *t1)
 
     # int3c_ip2 contributions
     wj_ip2, wk_ip2_P__ = int3c2e.get_int3c2e_ip2_wjk(intopt, dm0_tag, omega=omega)
@@ -188,36 +210,20 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
                 rhok1_Pko[:,i0:i1] = contract('qp,qiox->piox', cd_low, wk1_tmp).get()
             wk1_tmp = None
         cd_low = None
-
-        mem_avail = get_avail_mem()
-        blksize = int((mem_avail*0.4/(nao*nao*3*8)/ALIGNED))*ALIGNED
-        log.debug(f'GPU Memory {mem_avail/GB:.1f} GB available, {blksize} aux AOs per block')
-        for k0, k1 in lib.prange(0,nnz,blksize):
-            rhok1_Pko_kslice = cupy.asarray(rhok1_Pko[k0:k1])
-
-            # (10|0)(0|10) without response of RI basis
-            vk2_ip1_ip1 = contract('piox,pkoy->ikxy', rhok1_Pko_kslice, rhok1_Pko_kslice)
-            hk_ao_ao += contract('ikxy,ik->ikxy', vk2_ip1_ip1, dm0)
-            vk2_ip1_ip1 = None
-
-            # (10|0)(0|01) without response of RI basis
-            rhok1_Pkl_kslice = contract('piox,ko->pikx', rhok1_Pko_kslice, mocc_2)
-            hk_ao_ao += contract('pikx,pkiy->ikxy', rhok1_Pkl_kslice, rhok1_Pkl_kslice)
-            rhok1_Pkl_kslice = None
-        rhok1_Pko_kslice = None
-
+        
+        hk_ao_ao += _hk_ip1_ip1(rhok1_Pko, dm0, mocc_2)
     wk1_Pko = rhok1_Pko = None
     t1 = log.timer_debug1('intermediate variables with int3c2e_ip1', *t1)
 
     cupy.get_default_memory_pool().free_all_blocks()
     #  int3c_ipip1 contributions
-    hj_ao_diag, hk_ao_diag = int3c2e.get_int3c2e_ipip1_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag, omega=omega, with_k=with_k)
+    hj_ao_diag, hk_ao_diag = int3c2e.get_int3c2e_hjk(intopt, 'ipip1', rhoj0_P, rhok0_P__, dm0_tag, omega=omega, with_k=with_k)
     hj_ao_diag *= 2.0
     t1 = log.timer_debug1('intermediate variables with int3c2e_ipip1', *t1)
 
     #  int3c_ipvip1 contributions
     # (11|0), (0|00) without response of RI basis
-    hj, hk = int3c2e.get_int3c2e_ipvip1_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag, omega=omega, with_k=with_k)
+    hj, hk = int3c2e.get_int3c2e_hjk(intopt, 'ipvip1', rhoj0_P, rhok0_P__, dm0_tag, omega=omega, with_k=with_k)
     hj_ao_ao += 2.0*hj
     if with_k:
         hk_ao_ao += hk
@@ -227,7 +233,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     #  int3c_ip1ip2 contributions
     # (10|1), (0|0)(0|00)
     if hessobj.auxbasis_response:
-        hj, hk = int3c2e.get_int3c2e_ip1ip2_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag, omega=omega, with_k=with_k)
+        hj, hk = int3c2e.get_int3c2e_hjk(intopt, 'ip1ip2', rhoj0_P, rhok0_P__, dm0_tag, omega=omega, with_k=with_k)
         hj_ao_aux += hj
         if with_k:
             hk_ao_aux += hk
@@ -237,7 +243,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     #  int3c_ipip2 contributions
     if hessobj.auxbasis_response > 1:
         # (00|2), (0|0)(0|00)
-        hj, hk = int3c2e.get_int3c2e_ipip2_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag, omega=omega, with_k=with_k)
+        hj, hk = int3c2e.get_int3c2e_hjk(intopt, 'ipip2', rhoj0_P, rhok0_P__, dm0_tag, omega=omega, with_k=with_k)
         hj_aux_diag = hj
         if with_k:
             hk_aux_diag = .5*hk
 
@@ -221,19 +221,19 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 
     cupy.get_default_memory_pool().free_all_blocks()
     #  int3c_ipip1 contributions
-    fn = int3c2e.get_int3c2e_ipip1_hjk
-    hja_ao_diag, hka_ao_diag = fn(intopt, rhoj0_P, rhok0a_P__, dm0a_tag, omega=omega, with_k=with_k)
-    hjb_ao_diag, hkb_ao_diag = fn(intopt, rhoj0_P, rhok0b_P__, dm0b_tag, omega=omega, with_k=with_k)
+    fn = int3c2e.get_int3c2e_hjk
+    hja_ao_diag, hka_ao_diag = fn(intopt, 'ipip1', rhoj0_P, rhok0a_P__, dm0a_tag, omega=omega, with_k=with_k)
+    hjb_ao_diag, hkb_ao_diag = fn(intopt, 'ipip1', rhoj0_P, rhok0b_P__, dm0b_tag, omega=omega, with_k=with_k)
     hj_ao_diag = 2.0 * (hja_ao_diag + hjb_ao_diag)
     if with_k:
         hk_ao_diag = 2.0 * (hka_ao_diag + hkb_ao_diag)
     t1 = log.timer_debug1('intermediate variables with int3c2e_ipip1', *t1)
 
     #  int3c_ipvip1 contributions
     # (11|0), (0|00) without response of RI basis
-    fn = int3c2e.get_int3c2e_ipvip1_hjk
-    hja, hka = fn(intopt, rhoj0_P, rhok0a_P__, dm0a_tag, omega=omega, with_k=with_k)
-    hjb, hkb = fn(intopt, rhoj0_P, rhok0b_P__, dm0b_tag, omega=omega, with_k=with_k)
+    fn = int3c2e.get_int3c2e_hjk
+    hja, hka = fn(intopt, 'ipvip1', rhoj0_P, rhok0a_P__, dm0a_tag, omega=omega, with_k=with_k)
+    hjb, hkb = fn(intopt, 'ipvip1', rhoj0_P, rhok0b_P__, dm0b_tag, omega=omega, with_k=with_k)
     hj_ao_ao += 2.0*(hja + hjb)
     if with_k:
         hk_ao_ao += (hka + hkb)
@@ -243,9 +243,9 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     #  int3c_ip1ip2 contributions
     # (10|1), (0|0)(0|00)
     if hessobj.auxbasis_response:
-        fn = int3c2e.get_int3c2e_ip1ip2_hjk
-        hja, hka = fn(intopt, rhoj0_P, rhok0a_P__, dm0a_tag, omega=omega, with_k=with_k)
-        hjb, hkb = fn(intopt, rhoj0_P, rhok0b_P__, dm0b_tag, omega=omega, with_k=with_k)
+        fn = int3c2e.get_int3c2e_hjk
+        hja, hka = fn(intopt, 'ip1ip2', rhoj0_P, rhok0a_P__, dm0a_tag, omega=omega, with_k=with_k)
+        hjb, hkb = fn(intopt, 'ip1ip2', rhoj0_P, rhok0b_P__, dm0b_tag, omega=omega, with_k=with_k)
         hj_ao_aux += hja + hjb
         if with_k:
             hk_ao_aux += hka + hkb
@@ -255,9 +255,9 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     #  int3c_ipip2 contributions
     if hessobj.auxbasis_response > 1:
         # (00|2), (0|0)(0|00)
-        fn = int3c2e.get_int3c2e_ipip2_hjk
-        hja, hka = fn(intopt, rhoj0_P, rhok0a_P__, dm0a_tag, omega=omega, with_k=with_k)
-        hjb, hkb = fn(intopt, rhoj0_P, rhok0b_P__, dm0b_tag, omega=omega, with_k=with_k)
+        fn = int3c2e.get_int3c2e_hjk
+        hja, hka = fn(intopt, 'ipip2', rhoj0_P, rhok0a_P__, dm0a_tag, omega=omega, with_k=with_k)
+        hjb, hkb = fn(intopt, 'ipip2', rhoj0_P, rhok0b_P__, dm0b_tag, omega=omega, with_k=with_k)
         hj_aux_diag = hja + hjb
         if with_k:
             hk_aux_diag = (hka + hkb)