deepmodeling
diff --git a/‎.gitignore
Lines changed: 1 addition & 0 deletions b/‎.gitignore
Lines changed: 1 addition & 0 deletions
diff --git a/‎README.md
Lines changed: 3 additions & 2 deletions b/‎README.md
Lines changed: 3 additions & 2 deletions
diff --git a/‎dpdata/cp2k/__init__.py b/‎dpdata/cp2k/__init__.py
diff --git a/‎dpdata/cp2k/cell.py
Lines changed: 61 additions & 0 deletions b/‎dpdata/cp2k/cell.py
Lines changed: 61 additions & 0 deletions
diff --git a/‎dpdata/cp2k/output.py
Lines changed: 211 additions & 2 deletions b/‎dpdata/cp2k/output.py
Lines changed: 211 additions & 2 deletions
diff --git a/‎dpdata/system.py
Lines changed: 12 additions & 0 deletions b/‎dpdata/system.py
Lines changed: 12 additions & 0 deletions
@@ -19,3 +19,4 @@ build
 dist
 dpdata.egg-info
 _version.py
+!tests/cp2k/aimd/cp2k.log
@@ -99,9 +99,10 @@ xyz_multi_systems.to_deepmd_raw('./my_deepmd_data/')
 | deepmd  | npy    | True         | True    | LabeledSystem | 'deepmd/npy'  |
 | gaussian| log    | False        | True    | LabeledSystem | 'gaussian/log'|
 | gaussian| log    | True         | True    | LabeledSystem | 'gaussian/md' |
-| siesta| output    | False        | True    | LabeledSystem | 'siesta/output'|
-| siesta| aimd_output    | True         | True    | LabeledSystem | 'siesta/aimd_output' |
+| siesta  | output | False        | True    | LabeledSystem | 'siesta/output'|
+| siesta  | aimd_output  | True         | True    | LabeledSystem | 'siesta/aimd_output' |
 | cp2k    | output | False        | True    | LabeledSystem | 'cp2k/output' |
+| cp2k    | aimd_output  | True         | True    | LabeledSystem | 'cp2k/aimd_output' |
 | QE      | log    | False        | True    | LabeledSystem | 'qe/pw/scf'   |
 | QE      | log    | True         | False   | System        | 'qe/cp/traj'  |
 | QE      | log    | True         | True    | LabeledSystem | 'qe/cp/traj'  |
 
@@ -0,0 +1,61 @@
+
+#%%
+import numpy as np
+from collections import OrderedDict
+import re
+
+def cell_to_low_triangle(A,B,C,alpha,beta,gamma):
+    """
+        Convert cell to low triangle matrix.
+
+        Parameters
+        ----------
+        A : float
+            cell length A
+        B : float
+            cell length B
+        C : float
+            cell length C
+        alpha : float
+            radian. The angle between vector B and  vector C.
+        beta : float
+            radian. The angle between vector A and  vector C.
+        gamma : float
+            radian. The angle between vector B and  vector C.
+        
+        Returns
+        -------
+        cell : list
+            The cell matrix used by dpdata in low triangle form.
+    """
+    if not np.pi*5/180<alpha< np.pi*175/180:
+        raise RuntimeError("alpha=={}: must be a radian, and \
+            must be in np.pi*5/180 < alpha < np.pi*175/180".format(alpha))
+    if not np.pi*5/180<beta< np.pi*175/180:
+        raise RuntimeError("beta=={}: must be a radian, and \
+            must be in np.pi*5/180 < beta < np.pi*175/180".format(beta))
+    if not np.pi*5/180<gamma< np.pi*175/180:
+            raise RuntimeError("gamma=={}: must be a radian, and \
+                must be in np.pi*5/180 < gamma < np.pi*175/180".format(gamma))
+    if not A > 0.2:
+        raise RuntimeError("A=={}, must be greater than 0.2".format(A))
+    if not B > 0.2:
+        raise RuntimeError("B=={}, must be greater than 0.2".format(B))
+    if not C > 0.2:
+        raise RuntimeError("C=={}, must be greater than 0.2".format(C))
+
+    lx = A
+    xy = B * np.cos(gamma)
+    xz = C * np.cos(beta)
+    ly = B* np.sin(gamma)
+    if not ly > 0.1:
+        raise RuntimeError("ly:=B* np.sin(gamma)=={}, must be greater than 0.1",format(ly))
+    yz = (B*C*np.cos(alpha)-xy*xz)/ly
+    if not C**2-xz**2-yz**2 > 0.01:
+        raise RuntimeError("lz^2:=C**2-xz**2-yz**2=={}, must be greater than 0.01",format(C**2-xz**2-yz**2))
+    lz = np.sqrt(C**2-xz**2-yz**2)
+    cell = np.asarray([[lx, 0 , 0],
+            [xy, ly, 0 ],
+            [xz, yz, lz]]).astype('float32')
+    return cell
+
@@ -1,11 +1,217 @@
+#%%
 import numpy as np
+import re
+from collections import OrderedDict
+from .cell import cell_to_low_triangle
 
+#%%
+AU_TO_ANG = 5.29177208590000E-01
+AU_TO_EV = 2.72113838565563E+01
+AU_TO_EV_EVERY_ANG = AU_TO_EV/AU_TO_ANG
+delimiter_patterns=[]
+delimiter_p1 = re.compile(r'^ \* GO CP2K GO! \*+')
+delimiter_p2 = re.compile(r'^ \*+')
+delimiter_patterns.append(delimiter_p1)
+delimiter_patterns.append(delimiter_p2)
+avail_patterns = []
+
+avail_patterns.append(re.compile(r'^ INITIAL POTENTIAL ENERGY'))
+avail_patterns.append(re.compile(r'^ ENSEMBLE TYPE'))
+
+class Cp2kSystems(object):
+    """
+    deal with cp2k outputfile 
+    """
+    def __init__(self, log_file_name, xyz_file_name):
+        self.log_file_object = open(log_file_name, 'r')
+        self.xyz_file_object = open(xyz_file_name, 'r')
+        self.log_block_generator = self.get_log_block_generator()
+        self.xyz_block_generator = self.get_xyz_block_generator()
+        self.cell=None
+    def __del__(self):
+        self.log_file_object.close()
+        self.xyz_file_object.close()
+    def __iter__(self):
+        return self
+    def __next__(self):
+        info_dict = {}
+        log_info_dict = self.handle_single_log_frame(next(self.log_block_generator))
+        xyz_info_dict = self.handle_single_xyz_frame(next(self.xyz_block_generator))
+        eq1 = [v1==v2 for v1,v2 in zip(log_info_dict['atom_numbs'], xyz_info_dict['atom_numbs'])]
+        eq2 = [v1==v2 for v1,v2 in zip(log_info_dict['atom_names'], xyz_info_dict['atom_names'])]
+        eq3 = [v1==v2 for v1,v2 in zip(log_info_dict['atom_types'], xyz_info_dict['atom_types'])]
+        assert all(eq1), (log_info_dict,xyz_info_dict,'There may be errors in the file')
+        assert all(eq2), (log_info_dict,xyz_info_dict,'There may be errors in the file')
+        assert all(eq3), (log_info_dict,xyz_info_dict,'There may be errors in the file')
+        assert log_info_dict['energies']==xyz_info_dict['energies'], (log_info_dict['energies'],xyz_info_dict['energies'],'There may be errors in the file')
+        info_dict.update(log_info_dict)
+        info_dict.update(xyz_info_dict)
+        return info_dict
+
+    def get_log_block_generator(self):
+        lines = []
+        delimiter_flag = False
+        while True:
+            line = self.log_file_object.readline()
+            if line:
+                lines.append(line)
+                if any(p.match(line) for p in delimiter_patterns):
+                    if delimiter_flag is True:
+                        yield lines
+                        lines = []
+                        delimiter_flag = False
+                    else:
+                        line = self.log_file_object.readline()
+                        lines.append(line)
+                        if any(p.match(line) for p in avail_patterns):
+                            delimiter_flag = True
+            else:
+                break
+        if delimiter_flag is True:
+            raise RuntimeError('This file lacks some content, please check')
+    
+    def get_xyz_block_generator(self):
+        p3 = re.compile(r'^\s*(\d+)\s*')
+        while True:
+            line = self.xyz_file_object.readline()
+            if not line:
+                break
+            if p3.match(line):
+                atom_num = int(p3.match(line).group(1))
+                lines = []
+                lines.append(line)
+                for ii in range(atom_num+1):
+                    lines.append(self.xyz_file_object.readline())
+                if not lines[-1]:
+                    raise RuntimeError("this xyz file may lack of lines, should be {};lines:{}".format(atom_num+2, lines))
+                yield lines
+
+    def handle_single_log_frame(self, lines):
+        info_dict={}
+        energy_pattern_1 = re.compile(r' INITIAL POTENTIAL ENERGY\[hartree\]\s+=\s+(?P<number>\S+)')
+        #  CONSERVED QUANTITY [hartree] =                              -0.279168013085E+04
+        energy_pattern_2 = re.compile(r' POTENTIAL ENERGY\[hartree\]\s+=\s+(?P<number>\S+)')
+        energy=None
+        cell_length_pattern = re.compile(r' INITIAL CELL LNTHS\[bohr\]\s+=\s+(?P<A>\S+)\s+(?P<B>\S+)\s+(?P<C>\S+)')
+        cell_angle_pattern = re.compile(r' INITIAL CELL ANGLS\[deg\]\s+=\s+(?P<alpha>\S+)\s+(?P<beta>\S+)\s+(?P<gamma>\S+)')
+        cell_A, cell_B, cell_C = (0,0,0,)
+        cell_alpha, cell_beta, cell_gamma=(0,0,0,)
+        force_start_pattern = re.compile(r' ATOMIC FORCES in')
+        force_flag=False
+        force_end_pattern = re.compile(r' SUM OF ATOMIC FORCES')
+        force_lines= []
+        cell_flag=0
+        for line in lines:
+            if force_start_pattern.match(line):
+                force_flag=True
+            if force_end_pattern.match(line):
+                assert force_flag is True, (force_flag,'there may be errors in this file ')
+                force_flag=False
+            if force_flag is True:
+                    force_lines.append(line)
+            if energy_pattern_1.match(line):
+                energy = float(energy_pattern_1.match(line).groupdict()['number']) * AU_TO_EV
+            if energy_pattern_2.match(line):
+                energy = float(energy_pattern_2.match(line).groupdict()['number']) * AU_TO_EV
+            if cell_length_pattern.match(line):
+                cell_A = float(cell_length_pattern.match(line).groupdict()['A']) * AU_TO_ANG
+                cell_B = float(cell_length_pattern.match(line).groupdict()['B']) * AU_TO_ANG
+                cell_C = float(cell_length_pattern.match(line).groupdict()['C']) * AU_TO_ANG 
+                cell_flag+=1
+            if cell_angle_pattern.match(line):
+                cell_alpha = np.deg2rad(float(cell_angle_pattern.match(line).groupdict()['alpha']))
+                cell_beta = np.deg2rad(float(cell_angle_pattern.match(line).groupdict()['beta']))
+                cell_gamma = np.deg2rad(float(cell_angle_pattern.match(line).groupdict()['gamma']))
+                cell_flag+=1
+        if cell_flag == 2:
+            self.cell = cell_to_low_triangle(cell_A,cell_B,cell_C,
+                cell_alpha,cell_beta,cell_gamma)
+            # lx = cell_A
+            # xy = cell_B * np.cos(cell_gamma)
+            # xz = cell_C * np.cos(cell_beta)
+            # ly = cell_B* np.sin(cell_gamma)
+            # yz = (cell_B*cell_C*np.cos(cell_alpha)-xy*xz)/ly
+            # lz = np.sqrt(cell_C**2-xz**2-yz**2)
+            # self.cell = [[lx, 0 , 0],
+            #         [xy, ly, 0 ],
+            #         [xz, yz, lz]]
+
+        element_index = -1
+        element_dict = OrderedDict()
+        atom_types_list = []
+        forces_list = []
+        for line in force_lines[3:]:
+            line_list = line.split()
+            if element_dict.get(line_list[2]):
+                element_dict[line_list[2]][1]+=1
+            else:
+                element_index +=1
+                element_dict[line_list[2]]=[element_index,1]
+            atom_types_list.append(element_dict[line_list[2]][0])
+            forces_list.append([float(line_list[3])*AU_TO_EV_EVERY_ANG, 
+                float(line_list[4])*AU_TO_EV_EVERY_ANG, 
+                float(line_list[5])*AU_TO_EV_EVERY_ANG])
+        
+        atom_names=list(element_dict.keys())
+        atom_numbs=[]
+        for ii in atom_names:
+            atom_numbs.append(element_dict[ii][1])
+        info_dict['atom_names'] = atom_names
+        info_dict['atom_numbs'] = atom_numbs
+        info_dict['atom_types'] = np.asarray(atom_types_list)
+        info_dict['cells'] = np.asarray([self.cell]).astype('float32')
+        info_dict['energies'] = np.asarray([energy]).astype('float32')
+        info_dict['forces'] = np.asarray([forces_list]).astype('float32')
+        return info_dict
+
+    def handle_single_xyz_frame(self, lines):
+        info_dict = {}
+        atom_num = int(lines[0].strip('\n').strip())
+        if len(lines) != atom_num + 2:
+            raise RuntimeError("format error, atom_num=={}, {}!=atom_num+2".format(atom_num, len(lines)))
+        data_format_line = lines[1].strip('\n').strip()+str(' ')
+        prop_pattern = re.compile(r'(?P<prop>\w+)\s*=\s*(?P<number>.*?)[, ]')
+        prop_dict = dict(prop_pattern.findall(data_format_line))
+
+        energy=0
+        if prop_dict.get('E'):
+            energy = float(prop_dict.get('E')) * AU_TO_EV
+            # info_dict['energies'] = np.array([prop_dict['E']]).astype('float32')
+
+        element_index = -1
+        element_dict = OrderedDict()
+        atom_types_list = []
+        coords_list = []
+        for line in lines[2:]:
+            line_list = line.split()
+            if element_dict.get(line_list[0]):
+                element_dict[line_list[0]][1]+=1
+            else:
+                element_index +=1
+                element_dict[line_list[0]]=[element_index,1]
+            atom_types_list.append(element_dict[line_list[0]][0])
+            coords_list.append([float(line_list[1])*AU_TO_ANG, 
+                float(line_list[2])*AU_TO_ANG, 
+                float(line_list[3])*AU_TO_ANG])
+        atom_names=list(element_dict.keys())
+        atom_numbs=[]
+        for ii in atom_names:
+            atom_numbs.append(element_dict[ii][1])
+        info_dict['atom_names'] = atom_names
+        info_dict['atom_numbs'] = atom_numbs
+        info_dict['atom_types'] = np.asarray(atom_types_list)
+        info_dict['coords'] = np.asarray([coords_list]).astype('float32')
+        info_dict['energies'] = np.array([energy]).astype('float32')
+        info_dict['orig']=[0,0,0]
+        return info_dict        
+
+#%%
 
 def get_frames (fname) :
     coord_flag = False
     force_flag = False
-    eV = 2.72113838565563E+01
-    angstrom = 5.29177208590000E-01
+    eV = 2.72113838565563E+01 # hatree to eV
+    angstrom = 5.29177208590000E-01 # Bohrto Angstrom 
     fp = open(fname)
     atom_symbol_list = []
     cell = []
@@ -74,3 +280,6 @@ def get_frames (fname) :
     return list(atom_names), atom_numbs, atom_types, cell, coord, energy, force
 
 
+
+
+# %%
@@ -16,6 +16,7 @@
 import dpdata.md.pbc
 import dpdata.gaussian.log
 import dpdata.cp2k.output
+from dpdata.cp2k.output import Cp2kSystems
 from copy import deepcopy
 from monty.json import MSONable
 from monty.serialization import loadfn,dumpfn
@@ -816,6 +817,7 @@ def __init__ (self,
                 - ``gaussian/log``: gaussian logs
                 - ``gaussian/md``: gaussian ab initio molecular dynamics
                 - ``cp2k/output``: cp2k output file
+                - ``cp2k/aimd_output``: cp2k aimd output  dir(contains *pos*.xyz and *.log file)
 
         type_map : list of str
             Needed by formats deepmd/raw and deepmd/npy. Maps atom type to name. The atom with type `ii` is mapped to `type_map[ii]`.
@@ -858,6 +860,8 @@ def __init__ (self,
             self.from_gaussian_log(file_name, md=True)
         elif fmt == 'cp2k/output':
             self.from_cp2k_output(file_name)
+        elif fmt == 'cp2k/aimd_output':
+            self.from_cp2k_aimd_output(file_dir=file_name)
         else :
             raise RuntimeError('unknow data format ' + fmt)
 
@@ -901,6 +905,14 @@ def has_virial(self) :
         # return ('virials' in self.data) and (len(self.data['virials']) > 0)
         return ('virials' in self.data)
 
+
+    def from_cp2k_aimd_output(self, file_dir):
+        xyz_file=glob.glob("{}/*pos*.xyz".format(file_dir))[0]
+        log_file=glob.glob("{}/*.log".format(file_dir))[0]
+        for info_dict in Cp2kSystems(log_file, xyz_file):
+            l = LabeledSystem(data=info_dict)
+            self.append(l)
+
     def from_vasp_xml(self, file_name, begin = 0, step = 1) :
         self.data['atom_names'], \
             self.data['atom_types'], \