deepmodeling
diff --git a/‎.gitignore
Lines changed: 1 addition & 0 deletions b/‎.gitignore
Lines changed: 1 addition & 0 deletions
diff --git a/‎README.md
Lines changed: 20 additions & 2 deletions b/‎README.md
Lines changed: 20 additions & 2 deletions
diff --git a/‎dpdata/cp2k/__init__.py b/‎dpdata/cp2k/__init__.py
diff --git a/‎dpdata/cp2k/cell.py
Lines changed: 61 additions & 0 deletions b/‎dpdata/cp2k/cell.py
Lines changed: 61 additions & 0 deletions
diff --git a/‎dpdata/cp2k/output.py
Lines changed: 211 additions & 2 deletions b/‎dpdata/cp2k/output.py
Lines changed: 211 additions & 2 deletions
diff --git a/‎dpdata/deepmd/__init__.py b/‎dpdata/deepmd/__init__.py
diff --git a/‎dpdata/deepmd/comp.py
Lines changed: 8 additions & 1 deletion b/‎dpdata/deepmd/comp.py
Lines changed: 8 additions & 1 deletion
diff --git a/‎dpdata/deepmd/raw.py
Lines changed: 8 additions & 1 deletion b/‎dpdata/deepmd/raw.py
Lines changed: 8 additions & 1 deletion
diff --git a/‎dpdata/gaussian/__init__.py b/‎dpdata/gaussian/__init__.py
diff --git a/‎dpdata/gaussian/log.py
Lines changed: 1 addition & 0 deletions b/‎dpdata/gaussian/log.py
Lines changed: 1 addition & 0 deletions
@@ -19,3 +19,4 @@ build
 dist
 dpdata.egg-info
 _version.py
+!tests/cp2k/aimd/cp2k.log
@@ -99,9 +99,10 @@ xyz_multi_systems.to_deepmd_raw('./my_deepmd_data/')
 | deepmd  | npy    | True         | True    | LabeledSystem | 'deepmd/npy'  |
 | gaussian| log    | False        | True    | LabeledSystem | 'gaussian/log'|
 | gaussian| log    | True         | True    | LabeledSystem | 'gaussian/md' |
-| siesta| output    | False        | True    | LabeledSystem | 'siesta/output'|
-| siesta| aimd_output    | True         | True    | LabeledSystem | 'siesta/aimd_output' |
+| siesta  | output | False        | True    | LabeledSystem | 'siesta/output'|
+| siesta  | aimd_output  | True         | True    | LabeledSystem | 'siesta/aimd_output' |
 | cp2k    | output | False        | True    | LabeledSystem | 'cp2k/output' |
+| cp2k    | aimd_output  | True         | True    | LabeledSystem | 'cp2k/aimd_output' |
 | QE      | log    | False        | True    | LabeledSystem | 'qe/pw/scf'   |
 | QE      | log    | True         | False   | System        | 'qe/cp/traj'  |
 | QE      | log    | True         | True    | LabeledSystem | 'qe/cp/traj'  |
@@ -152,3 +153,20 @@ Frame selection can be implemented by
 dpdata.LabeledSystem('OUTCAR').sub_system([0,-1]).to_deepmd_raw('dpmd_raw')
 ```
 by which only the first and last frames are dumped to `dpmd_raw`.
+
+## replicate 
+dpdata will create a super cell of the current atom configuration.
+```python
+dpdata.System('./POSCAR').replicate((1,2,3,) )
+```
+tuple(1,2,3) means don't copy atom configuration in x direction, make 2 copys in y direction, make 3 copys in z direction.
+
+## perturb
+By the following example, each frame of the original system (`dpdata.System('./POSCAR')`) is perturbed to generate three new frames. For each frame, the cell is perturbed by 5% and the atom positions are perturbed by 0.6 Angstrom. `atom_pert_style` indicates that the perturbation to the atom positions is subject to normal distribution. Other available options to `atom_pert_style` are`uniform` (uniform in a ball), and `const` (uniform on a sphere).
+```python
+perturbed_system = dpdata.System('./POSCAR').perturb(pert_num=3, 
+    cell_pert_fraction=0.05, 
+    atom_pert_distance=0.6, 
+    atom_pert_style='normal')
+print(perturbed_system.data)
+```
@@ -0,0 +1,61 @@
+
+#%%
+import numpy as np
+from collections import OrderedDict
+import re
+
+def cell_to_low_triangle(A,B,C,alpha,beta,gamma):
+    """
+        Convert cell to low triangle matrix.
+
+        Parameters
+        ----------
+        A : float
+            cell length A
+        B : float
+            cell length B
+        C : float
+            cell length C
+        alpha : float
+            radian. The angle between vector B and  vector C.
+        beta : float
+            radian. The angle between vector A and  vector C.
+        gamma : float
+            radian. The angle between vector B and  vector C.
+        
+        Returns
+        -------
+        cell : list
+            The cell matrix used by dpdata in low triangle form.
+    """
+    if not np.pi*5/180<alpha< np.pi*175/180:
+        raise RuntimeError("alpha=={}: must be a radian, and \
+            must be in np.pi*5/180 < alpha < np.pi*175/180".format(alpha))
+    if not np.pi*5/180<beta< np.pi*175/180:
+        raise RuntimeError("beta=={}: must be a radian, and \
+            must be in np.pi*5/180 < beta < np.pi*175/180".format(beta))
+    if not np.pi*5/180<gamma< np.pi*175/180:
+            raise RuntimeError("gamma=={}: must be a radian, and \
+                must be in np.pi*5/180 < gamma < np.pi*175/180".format(gamma))
+    if not A > 0.2:
+        raise RuntimeError("A=={}, must be greater than 0.2".format(A))
+    if not B > 0.2:
+        raise RuntimeError("B=={}, must be greater than 0.2".format(B))
+    if not C > 0.2:
+        raise RuntimeError("C=={}, must be greater than 0.2".format(C))
+
+    lx = A
+    xy = B * np.cos(gamma)
+    xz = C * np.cos(beta)
+    ly = B* np.sin(gamma)
+    if not ly > 0.1:
+        raise RuntimeError("ly:=B* np.sin(gamma)=={}, must be greater than 0.1",format(ly))
+    yz = (B*C*np.cos(alpha)-xy*xz)/ly
+    if not C**2-xz**2-yz**2 > 0.01:
+        raise RuntimeError("lz^2:=C**2-xz**2-yz**2=={}, must be greater than 0.01",format(C**2-xz**2-yz**2))
+    lz = np.sqrt(C**2-xz**2-yz**2)
+    cell = np.asarray([[lx, 0 , 0],
+            [xy, ly, 0 ],
+            [xz, yz, lz]]).astype('float32')
+    return cell
+
@@ -1,11 +1,217 @@
+#%%
 import numpy as np
+import re
+from collections import OrderedDict
+from .cell import cell_to_low_triangle
 
+#%%
+AU_TO_ANG = 5.29177208590000E-01
+AU_TO_EV = 2.72113838565563E+01
+AU_TO_EV_EVERY_ANG = AU_TO_EV/AU_TO_ANG
+delimiter_patterns=[]
+delimiter_p1 = re.compile(r'^ \* GO CP2K GO! \*+')
+delimiter_p2 = re.compile(r'^ \*+')
+delimiter_patterns.append(delimiter_p1)
+delimiter_patterns.append(delimiter_p2)
+avail_patterns = []
+
+avail_patterns.append(re.compile(r'^ INITIAL POTENTIAL ENERGY'))
+avail_patterns.append(re.compile(r'^ ENSEMBLE TYPE'))
+
+class Cp2kSystems(object):
+    """
+    deal with cp2k outputfile 
+    """
+    def __init__(self, log_file_name, xyz_file_name):
+        self.log_file_object = open(log_file_name, 'r')
+        self.xyz_file_object = open(xyz_file_name, 'r')
+        self.log_block_generator = self.get_log_block_generator()
+        self.xyz_block_generator = self.get_xyz_block_generator()
+        self.cell=None
+    def __del__(self):
+        self.log_file_object.close()
+        self.xyz_file_object.close()
+    def __iter__(self):
+        return self
+    def __next__(self):
+        info_dict = {}
+        log_info_dict = self.handle_single_log_frame(next(self.log_block_generator))
+        xyz_info_dict = self.handle_single_xyz_frame(next(self.xyz_block_generator))
+        eq1 = [v1==v2 for v1,v2 in zip(log_info_dict['atom_numbs'], xyz_info_dict['atom_numbs'])]
+        eq2 = [v1==v2 for v1,v2 in zip(log_info_dict['atom_names'], xyz_info_dict['atom_names'])]
+        eq3 = [v1==v2 for v1,v2 in zip(log_info_dict['atom_types'], xyz_info_dict['atom_types'])]
+        assert all(eq1), (log_info_dict,xyz_info_dict,'There may be errors in the file')
+        assert all(eq2), (log_info_dict,xyz_info_dict,'There may be errors in the file')
+        assert all(eq3), (log_info_dict,xyz_info_dict,'There may be errors in the file')
+        assert log_info_dict['energies']==xyz_info_dict['energies'], (log_info_dict['energies'],xyz_info_dict['energies'],'There may be errors in the file')
+        info_dict.update(log_info_dict)
+        info_dict.update(xyz_info_dict)
+        return info_dict
+
+    def get_log_block_generator(self):
+        lines = []
+        delimiter_flag = False
+        while True:
+            line = self.log_file_object.readline()
+            if line:
+                lines.append(line)
+                if any(p.match(line) for p in delimiter_patterns):
+                    if delimiter_flag is True:
+                        yield lines
+                        lines = []
+                        delimiter_flag = False
+                    else:
+                        line = self.log_file_object.readline()
+                        lines.append(line)
+                        if any(p.match(line) for p in avail_patterns):
+                            delimiter_flag = True
+            else:
+                break
+        if delimiter_flag is True:
+            raise RuntimeError('This file lacks some content, please check')
+    
+    def get_xyz_block_generator(self):
+        p3 = re.compile(r'^\s*(\d+)\s*')
+        while True:
+            line = self.xyz_file_object.readline()
+            if not line:
+                break
+            if p3.match(line):
+                atom_num = int(p3.match(line).group(1))
+                lines = []
+                lines.append(line)
+                for ii in range(atom_num+1):
+                    lines.append(self.xyz_file_object.readline())
+                if not lines[-1]:
+                    raise RuntimeError("this xyz file may lack of lines, should be {};lines:{}".format(atom_num+2, lines))
+                yield lines
+
+    def handle_single_log_frame(self, lines):
+        info_dict={}
+        energy_pattern_1 = re.compile(r' INITIAL POTENTIAL ENERGY\[hartree\]\s+=\s+(?P<number>\S+)')
+        #  CONSERVED QUANTITY [hartree] =                              -0.279168013085E+04
+        energy_pattern_2 = re.compile(r' POTENTIAL ENERGY\[hartree\]\s+=\s+(?P<number>\S+)')
+        energy=None
+        cell_length_pattern = re.compile(r' INITIAL CELL LNTHS\[bohr\]\s+=\s+(?P<A>\S+)\s+(?P<B>\S+)\s+(?P<C>\S+)')
+        cell_angle_pattern = re.compile(r' INITIAL CELL ANGLS\[deg\]\s+=\s+(?P<alpha>\S+)\s+(?P<beta>\S+)\s+(?P<gamma>\S+)')
+        cell_A, cell_B, cell_C = (0,0,0,)
+        cell_alpha, cell_beta, cell_gamma=(0,0,0,)
+        force_start_pattern = re.compile(r' ATOMIC FORCES in')
+        force_flag=False
+        force_end_pattern = re.compile(r' SUM OF ATOMIC FORCES')
+        force_lines= []
+        cell_flag=0
+        for line in lines:
+            if force_start_pattern.match(line):
+                force_flag=True
+            if force_end_pattern.match(line):
+                assert force_flag is True, (force_flag,'there may be errors in this file ')
+                force_flag=False
+            if force_flag is True:
+                    force_lines.append(line)
+            if energy_pattern_1.match(line):
+                energy = float(energy_pattern_1.match(line).groupdict()['number']) * AU_TO_EV
+            if energy_pattern_2.match(line):
+                energy = float(energy_pattern_2.match(line).groupdict()['number']) * AU_TO_EV
+            if cell_length_pattern.match(line):
+                cell_A = float(cell_length_pattern.match(line).groupdict()['A']) * AU_TO_ANG
+                cell_B = float(cell_length_pattern.match(line).groupdict()['B']) * AU_TO_ANG
+                cell_C = float(cell_length_pattern.match(line).groupdict()['C']) * AU_TO_ANG 
+                cell_flag+=1
+            if cell_angle_pattern.match(line):
+                cell_alpha = np.deg2rad(float(cell_angle_pattern.match(line).groupdict()['alpha']))
+                cell_beta = np.deg2rad(float(cell_angle_pattern.match(line).groupdict()['beta']))
+                cell_gamma = np.deg2rad(float(cell_angle_pattern.match(line).groupdict()['gamma']))
+                cell_flag+=1
+        if cell_flag == 2:
+            self.cell = cell_to_low_triangle(cell_A,cell_B,cell_C,
+                cell_alpha,cell_beta,cell_gamma)
+            # lx = cell_A
+            # xy = cell_B * np.cos(cell_gamma)
+            # xz = cell_C * np.cos(cell_beta)
+            # ly = cell_B* np.sin(cell_gamma)
+            # yz = (cell_B*cell_C*np.cos(cell_alpha)-xy*xz)/ly
+            # lz = np.sqrt(cell_C**2-xz**2-yz**2)
+            # self.cell = [[lx, 0 , 0],
+            #         [xy, ly, 0 ],
+            #         [xz, yz, lz]]
+
+        element_index = -1
+        element_dict = OrderedDict()
+        atom_types_list = []
+        forces_list = []
+        for line in force_lines[3:]:
+            line_list = line.split()
+            if element_dict.get(line_list[2]):
+                element_dict[line_list[2]][1]+=1
+            else:
+                element_index +=1
+                element_dict[line_list[2]]=[element_index,1]
+            atom_types_list.append(element_dict[line_list[2]][0])
+            forces_list.append([float(line_list[3])*AU_TO_EV_EVERY_ANG, 
+                float(line_list[4])*AU_TO_EV_EVERY_ANG, 
+                float(line_list[5])*AU_TO_EV_EVERY_ANG])
+        
+        atom_names=list(element_dict.keys())
+        atom_numbs=[]
+        for ii in atom_names:
+            atom_numbs.append(element_dict[ii][1])
+        info_dict['atom_names'] = atom_names
+        info_dict['atom_numbs'] = atom_numbs
+        info_dict['atom_types'] = np.asarray(atom_types_list)
+        info_dict['cells'] = np.asarray([self.cell]).astype('float32')
+        info_dict['energies'] = np.asarray([energy]).astype('float32')
+        info_dict['forces'] = np.asarray([forces_list]).astype('float32')
+        return info_dict
+
+    def handle_single_xyz_frame(self, lines):
+        info_dict = {}
+        atom_num = int(lines[0].strip('\n').strip())
+        if len(lines) != atom_num + 2:
+            raise RuntimeError("format error, atom_num=={}, {}!=atom_num+2".format(atom_num, len(lines)))
+        data_format_line = lines[1].strip('\n').strip()+str(' ')
+        prop_pattern = re.compile(r'(?P<prop>\w+)\s*=\s*(?P<number>.*?)[, ]')
+        prop_dict = dict(prop_pattern.findall(data_format_line))
+
+        energy=0
+        if prop_dict.get('E'):
+            energy = float(prop_dict.get('E')) * AU_TO_EV
+            # info_dict['energies'] = np.array([prop_dict['E']]).astype('float32')
+
+        element_index = -1
+        element_dict = OrderedDict()
+        atom_types_list = []
+        coords_list = []
+        for line in lines[2:]:
+            line_list = line.split()
+            if element_dict.get(line_list[0]):
+                element_dict[line_list[0]][1]+=1
+            else:
+                element_index +=1
+                element_dict[line_list[0]]=[element_index,1]
+            atom_types_list.append(element_dict[line_list[0]][0])
+            coords_list.append([float(line_list[1])*AU_TO_ANG, 
+                float(line_list[2])*AU_TO_ANG, 
+                float(line_list[3])*AU_TO_ANG])
+        atom_names=list(element_dict.keys())
+        atom_numbs=[]
+        for ii in atom_names:
+            atom_numbs.append(element_dict[ii][1])
+        info_dict['atom_names'] = atom_names
+        info_dict['atom_numbs'] = atom_numbs
+        info_dict['atom_types'] = np.asarray(atom_types_list)
+        info_dict['coords'] = np.asarray([coords_list]).astype('float32')
+        info_dict['energies'] = np.array([energy]).astype('float32')
+        info_dict['orig']=[0,0,0]
+        return info_dict        
+
+#%%
 
 def get_frames (fname) :
     coord_flag = False
     force_flag = False
-    eV = 2.72113838565563E+01
-    angstrom = 5.29177208590000E-01
+    eV = 2.72113838565563E+01 # hatree to eV
+    angstrom = 5.29177208590000E-01 # Bohrto Angstrom 
     fp = open(fname)
     atom_symbol_list = []
     cell = []
@@ -74,3 +280,6 @@ def get_frames (fname) :
     return list(atom_names), atom_numbs, atom_types, cell, coord, energy, force
 
 
+
+
+# %%
@@ -48,6 +48,8 @@ def to_system_data(folder,
         data['forces'] = np.concatenate(all_forces, axis = 0)
     if len(all_virs) > 0:
         data['virials'] = np.concatenate(all_virs, axis = 0)
+    if os.path.isfile(os.path.join(folder, "nopbc")):
+        data['nopbc'] = True
     return data
 
 
@@ -101,5 +103,10 @@ def dump(folder,
             np.save(os.path.join(set_folder, 'virial'), virials[set_stt:set_end])
         if 'atom_pref' in data:
             np.save(os.path.join(set_folder, "atom_pref"), atom_pref[set_stt:set_end])
-        
+    try:
+        os.remove(os.path.join(folder, "nopbc"))
+    except OSError:
+        pass
+    if data.get("nopbc", False):
+        os.mknod(os.path.join(folder, "nopbc"))
 
@@ -49,6 +49,8 @@ def to_system_data(folder, type_map = None, labels = True) :
             if os.path.exists(os.path.join(folder, 'virial.raw')) :
                 data['virials'] = np.loadtxt(os.path.join(folder, 'virial.raw'))
                 data['virials'] = np.reshape(data['virials'], [nframes, 3, 3])
+        if os.path.isfile(os.path.join(folder, "nopbc")):
+            data['nopbc'] = True
         return data
     else :        
         raise RuntimeError('not dir ' + folder)
@@ -67,5 +69,10 @@ def dump (folder, data) :
         np.savetxt(os.path.join(folder, 'force.raw'),   np.reshape(data['forces'],   [nframes, -1]))
     if 'virials' in data :
         np.savetxt(os.path.join(folder, 'virial.raw'), np.reshape(data['virials'], [nframes, 9]))
-    
+    try:
+        os.remove(os.path.join(folder, "nopbc"))
+    except OSError:
+        pass
+    if data.get("nopbc", False):
+        os.mknod(os.path.join(folder, "nopbc"))
 
@@ -69,4 +69,5 @@ def to_system_data(file_name, md=False):
     data['coords'] = np.array(coords_t)
     data['orig'] = np.array([0, 0, 0])
     data['cells'] = np.array([[[100., 0., 0.], [0., 100., 0.], [0., 0., 100.]] for _ in energy_t])
+    data['nopbc'] = True
     return data