deepmodeling
diff --git a/‎README.md
Lines changed: 30 additions & 28 deletions b/‎README.md
Lines changed: 30 additions & 28 deletions
diff --git a/‎dpdata/amber/md.py
Lines changed: 76 additions & 0 deletions b/‎dpdata/amber/md.py
Lines changed: 76 additions & 0 deletions
diff --git a/‎dpdata/fhi_aims/__init__.py b/‎dpdata/fhi_aims/__init__.py
diff --git a/‎dpdata/fhi_aims/output.py
Lines changed: 172 additions & 0 deletions b/‎dpdata/fhi_aims/output.py
Lines changed: 172 additions & 0 deletions
diff --git a/‎dpdata/gromacs/__init__.py b/‎dpdata/gromacs/__init__.py
@@ -51,6 +51,34 @@ The labels provided in the `OUTCAR`, i.e. energies, forces and virials (if any),
 
 The `System` or `LabeledSystem` can be constructed from the following file formats with the `format key` in the table passed to argument `fmt`:
 
+| Software| format | multi frames | labeled | class	    | format key    |
+| ------- | :---   | :---:        | :---:   | :---          | :---          |
+| vasp	  | poscar | False        | False   | System	    | 'vasp/poscar' | 
+| vasp    | outcar | True         | True    | LabeledSystem | 'vasp/outcar' |	
+| vasp    | xml    | True         | True    | LabeledSystem | 'vasp/xml'    |	
+| lammps  | lmp    | False        | False   | System        | 'lammps/lmp'  |
+| lammps  | dump   | True         | False   | System        | 'lammps/dump' |
+| deepmd  | raw    | True         | False   | System	    | 'deepmd/raw'  |
+| deepmd  | npy    | True         | False   | System        | 'deepmd/npy'  |
+| deepmd  | raw    | True         | True    | LabeledSystem | 'deepmd/raw'  |
+| deepmd  | npy    | True         | True    | LabeledSystem | 'deepmd/npy'  |
+| gaussian| log    | False        | True    | LabeledSystem | 'gaussian/log'|
+| gaussian| log    | True         | True    | LabeledSystem | 'gaussian/md' |
+| siesta  | output | False        | True    | LabeledSystem | 'siesta/output'|
+| siesta  | aimd_output  | True         | True    | LabeledSystem | 'siesta/aimd_output' |
+| cp2k    | output | False        | True    | LabeledSystem | 'cp2k/output' |
+| cp2k    | aimd_output  | True         | True    | LabeledSystem | 'cp2k/aimd_output' |
+| QE      | log    | False        | True    | LabeledSystem | 'qe/pw/scf'   |
+| QE      | log    | True         | False   | System        | 'qe/cp/traj'  |
+| QE      | log    | True         | True    | LabeledSystem | 'qe/cp/traj'  |
+|quip/gap|xyz|True|True|MultiSystems|'quip/gap/xyz'|
+| PWmat   | atom.config | False        | False   | System        | 'pwmat/atom.config'  |
+| PWmat   | movement    | True         | True    | LabeledSystem | 'pwmat/movement'     |
+| PWmat   | OUT.MLMD    | True         | True    | LabeledSystem | 'pwmat/out.mlmd'     |
+| Amber   | multi       | True         | True    | LabeledSystem | 'amber/md'           |
+| Gromacs | gro         | False        | False   | System        | 'gromacs/gro'        |
+
+
 The class `dpdata.MultiSystems` can read data from a directory that may contain many files of different systems, or from a single xyz file that contains different systems.
 
 Use `dpdata.MultiSystems.from_dir` to read from a  directory, `dpdata.MultiSystems` will walk in the directory 
@@ -82,34 +110,8 @@ xyz_multi_systems.systems['B1C9'].to_deepmd_raw('./my_work_dir/B1C9_raw')
 
 # dump all systems
 xyz_multi_systems.to_deepmd_raw('./my_deepmd_data/')
-
-
 ```
 
-| Software| format | multi frames | labeled | class	    | format key    |
-| ------- | :---   | :---:        | :---:   | :---          | :---          |
-| vasp	  | poscar | False        | False   | System	    | 'vasp/poscar' | 
-| vasp    | outcar | True         | True    | LabeledSystem | 'vasp/outcar' |	
-| vasp    | xml    | True         | True    | LabeledSystem | 'vasp/xml'    |	
-| lammps  | lmp    | False        | False   | System        | 'lammps/lmp'  |
-| lammps  | dump   | True         | False   | System        | 'lammps/dump' |
-| deepmd  | raw    | True         | False   | System	    | 'deepmd/raw'  |
-| deepmd  | npy    | True         | False   | System        | 'deepmd/npy'  |
-| deepmd  | raw    | True         | True    | LabeledSystem | 'deepmd/raw'  |
-| deepmd  | npy    | True         | True    | LabeledSystem | 'deepmd/npy'  |
-| gaussian| log    | False        | True    | LabeledSystem | 'gaussian/log'|
-| gaussian| log    | True         | True    | LabeledSystem | 'gaussian/md' |
-| siesta  | output | False        | True    | LabeledSystem | 'siesta/output'|
-| siesta  | aimd_output  | True         | True    | LabeledSystem | 'siesta/aimd_output' |
-| cp2k    | output | False        | True    | LabeledSystem | 'cp2k/output' |
-| cp2k    | aimd_output  | True         | True    | LabeledSystem | 'cp2k/aimd_output' |
-| QE      | log    | False        | True    | LabeledSystem | 'qe/pw/scf'   |
-| QE      | log    | True         | False   | System        | 'qe/cp/traj'  |
-| QE      | log    | True         | True    | LabeledSystem | 'qe/cp/traj'  |
-|quip/gap|xyz|True|True|MultiSystems|'quip/gap/xyz'|
-| PWmat   | atom.config | False        | False   | System        | 'pwmat/atom.config'  |
-| PWmat   | movement    | True         | True    | LabeledSystem | 'pwmat/movement'     |
-| PWmat   | OUT.MLMD    | True         | True    | LabeledSystem | 'pwmat/out.mlmd'     |
 ## Access data
 These properties stored in `System` and `LabeledSystem` can be accessed by operator `[]` with the key of the property supplied, for example
 ```python
@@ -129,7 +131,6 @@ Available properties are (nframe: number of frames in the system, natoms: total
 | 'virials'	| np.ndarray	| nframes x 3 x 3	| True		| The virial tensor of each frame
 
 
-
 ## Dump data
 The data stored in `System` or `LabeledSystem` can be dumped in 'lammps/lmp' or 'vasp/poscar' format, for example:
 ```python
@@ -141,7 +142,6 @@ d_outcar.to('vasp/poscar', 'POSCAR', frame_idx=-1)
 ```
 The last frame of `d_outcar` will be dumped to 'POSCAR'.
 
-
 The data stored in `LabeledSystem` can be dumped to deepmd-kit raw format, for example
 ```python
 d_outcar.to('deepmd/raw', 'dpmd_raw')
@@ -156,13 +156,15 @@ dpdata.LabeledSystem('OUTCAR').sub_system([0,-1]).to('deepmd/raw', 'dpmd_raw')
 ```
 by which only the first and last frames are dumped to `dpmd_raw`.
 
+
 ## replicate 
 dpdata will create a super cell of the current atom configuration.
 ```python
 dpdata.System('./POSCAR').replicate((1,2,3,) )
 ```
 The tuple `(1,2,3)` means: do not copy the atom configuration in the x direction, make 2 copies in the y direction, and make 3 copies in the z direction.
 
+
 ## perturb
 By the following example, each frame of the original system (`dpdata.System('./POSCAR')`) is perturbed to generate three new frames. For each frame, the cell is perturbed by 5% and the atom positions are perturbed by 0.6 Angstrom. `atom_pert_style` indicates that the perturbation to the atom positions is subject to a normal distribution. Other available options to `atom_pert_style` are `uniform` (uniform in a ball), and `const` (uniform on a sphere).
 ```python
 
@@ -0,0 +1,76 @@
+import re
+from scipy.io import netcdf
+import numpy as np
+
+# Conversion factor from kcal/mol to eV.
+kcalmol2eV= 0.04336410390059322
+
+# Factors applied to the energies and forces read from the Amber files.
+# NOTE(review): forces reuse the energy factor, which presumably assumes the
+# length unit needs no conversion -- confirm.
+energy_convert = kcalmol2eV
+force_convert = energy_convert
+
+
+def read_amber_traj(parm7_file, nc_file, mdfrc_file, mden_file):
+    """The amber trajectory includes:
+    * nc, NetCDF format, stores coordinates
+    * mdfrc, NetCDF format, stores forces
+    * mden, text format, stores energies
+    * parm7, text format, stores types
+    """
+
+    flag=False
+    amber_types = []
+    with open(parm7_file) as f:
+        for line in f:
+            if line.startswith("%FLAG"):
+                flag = line.startswith("%FLAG AMBER_ATOM_TYPE")
+            elif flag:
+                if line.startswith("%FORMAT"):
+                    fmt = re.findall(r'\d+', line)
+                    fmt0 = int(fmt[0])
+                    fmt1 = int(fmt[1])
+                else:
+                    for ii in range(fmt0):
+                        start_index = ii * fmt1
+                        end_index = (ii + 1) * fmt1
+                        if end_index >= len(line):
+                            continue
+                        amber_types.append(line[start_index:end_index].strip())
+
+    with netcdf.netcdf_file(nc_file, 'r') as f:
+        coords = np.array(f.variables["coordinates"][:])
+        cell_lengths = np.array(f.variables["cell_lengths"][:])
+        cell_angles = np.array(f.variables["cell_angles"][:])
+        if np.all(cell_angles > 89.99 ) and np.all(cell_angles < 90.01):
+            # only support 90
+            # TODO: support other angles
+            shape = cell_lengths.shape
+            cells = np.zeros((shape[0], 3, 3))
+            for ii in range(3):
+                cells[:, ii, ii] = cell_lengths[:, ii]
+        else:
+            raise RuntimeError("Unsupported cells")
+
+    with netcdf.netcdf_file(mdfrc_file, 'r') as f:
+        forces = np.array(f.variables["forces"][:])
+
+    # energy
+    energies = []
+    with open(mden_file) as f:
+        for line in f:
+            if line.startswith("L6"):
+                s = line.split()
+                if s[2] != "E_pot":
+                    energies.append(float(s[2]))
+
+    atom_names, atom_types, atom_numbs = np.unique(amber_types, return_inverse=True, return_counts=True)
+
+    data = {}
+    data['atom_names'] = list(atom_names)
+    data['atom_numbs'] = list(atom_numbs)
+    data['atom_types'] = atom_types
+    data['forces'] = forces * force_convert
+    data['energies'] = np.array(energies) * energy_convert
+    data['coords'] = coords
+    data['cells'] = cells
+    data['orig'] = np.array([0, 0, 0])
+    return data
+
@@ -0,0 +1,172 @@
+import numpy as np
+import re
+
+latt_patt="\|\s+([0-9]{1,}[.][0-9]*)\s+([0-9]{1,}[.][0-9]*)\s+([0-9]{1,}[.][0-9]*)"
+pos_patt_first="\|\s+[0-9]{1,}[:]\s\w+\s(\w+)(\s.*[-]?[0-9]{1,}[.][0-9]*)(\s+[-]?[0-9]{1,}[.][0-9]*)(\s+[-]?[0-9]{1,}[.][0-9]*)"
+pos_patt_other="\s+[a][t][o][m]\s+([-]?[0-9]{1,}[.][0-9]*)\s+([-]?[0-9]{1,}[.][0-9]*)\s+([-]?[0-9]{1,}[.][0-9]*)\s+(\w{1,2})"
+force_patt="\|\s+[0-9]{1,}\s+([-]?[0-9]{1,}[.][0-9]*[E][+-][0-9]{1,})\s+([-]?[0-9]{1,}[.][0-9]*[E][+-][0-9]{1,})\s+([-]?[0-9]{1,}[.][0-9]*[E][+-][0-9]{1,})"
+eng_patt="Total energy uncorrected.*([-]?[0-9]{1,}[.][0-9]*[E][+-][0-9]{1,})\s+eV"
+#atom_numb_patt="Number of atoms.*([0-9]{1,})"
+
+def get_info (lines, type_idx_zero = False) :
+
+    atom_types = []
+    atom_names = []
+    cell = []
+    atom_numbs = None
+    _atom_names = []
+
+    contents="\n".join(lines)
+    #cell
+    #_tmp=re.findall(latt_patt,contents)
+    #for ii in _tmp:
+    #    vect=[float(kk) for kk in ii]
+    #    cell.append(vect)
+    #------------------
+    for ln,l in enumerate(lines):
+        if l.startswith('  | Unit cell'):
+            break
+    _tmp=lines[ln+1:ln+4]
+    for ii in _tmp:
+        v_str=ii.split('|')[1].split()
+        vect=[float(kk) for kk in v_str]
+        cell.append(vect)
+   # print(cell)
+    #atom name
+    _tmp=re.findall(pos_patt_first,contents)
+    for ii in _tmp:
+        _atom_names.append(ii[0])
+    atom_names=[]
+    for ii in _atom_names:
+        if not ii in atom_names:
+           atom_names.append(ii)
+    #atom number
+    #_atom_numb_patt=re.compile(atom_numb_patt)
+    atom_numbs =[_atom_names.count(ii) for ii in atom_names] 
+    assert(atom_numbs is not None), "cannot find ion type info in aims output"
+    
+    for idx,ii in enumerate(atom_numbs) :
+        for jj in range(ii) :
+            if type_idx_zero :
+                atom_types.append(idx)
+            else :
+                atom_types.append(idx+1)
+
+    return [cell, atom_numbs, atom_names, atom_types ]
+
+
+def get_fhi_aims_block(fp) :
+    blk = []
+    for ii in fp :
+        if not ii :
+            return blk
+        blk.append(ii.rstrip('\n'))
+        if 'Begin self-consistency loop: Re-initialization' in ii:
+            return blk
+    return blk
+
+def get_frames (fname, md=True, begin = 0, step = 1) :
+    fp = open(fname)
+    blk = get_fhi_aims_block(fp)
+    ret = get_info(blk, type_idx_zero = True)
+
+    cell, atom_numbs, atom_names, atom_types =ret[0],ret[1],ret[2],ret[3]
+    ntot = sum(atom_numbs)
+
+    all_coords = []
+    all_cells = []
+    all_energies = []
+    all_forces = []
+    all_virials = []    
+
+    cc = 0
+    while len(blk) > 0 :
+     #   with open(str(cc),'w') as f:
+     #        f.write('\n'.join(blk))
+        if cc >= begin and (cc - begin) % step == 0 :
+            if cc==0:
+                coord, _cell, energy, force, virial, is_converge = analyze_block(blk, first_blk=True, md=md)
+            else:
+                coord, _cell, energy, force, virial, is_converge = analyze_block(blk, first_blk=False)
+            if is_converge : 
+                if len(coord) == 0:
+                    break
+                all_coords.append(coord)
+
+                if _cell:
+                   all_cells.append(_cell)
+                else:
+                   all_cells.append(cell)
+
+                all_energies.append(energy)
+                all_forces.append(force)
+                if virial is not None :
+                    all_virials.append(virial)
+        blk = get_fhi_aims_block(fp)
+        cc += 1
+        
+    if len(all_virials) == 0 :
+        all_virials = None
+    else :
+        all_virials = np.array(all_virials)
+    fp.close()
+    return atom_names, atom_numbs, np.array(atom_types), np.array(all_cells), np.array(all_coords), np.array(all_energies), np.array(all_forces), all_virials
+
+
+def analyze_block(lines, first_blk=False, md=True) :
+    coord = []
+    cell = []
+    energy = None
+    force = []
+    virial = None
+    atom_names=[]
+    _atom_names=[]
+
+    contents="\n".join(lines)
+    try:
+       natom=int(re.findall("Number of atoms.*([0-9]{1,})",lines)[0])
+    except:
+       natom=0
+
+    if first_blk:
+
+       if md:
+          _tmp=re.findall(pos_patt_other,contents)[:]
+          for ii in _tmp[slice(int(len(_tmp)/2),len(_tmp))]:
+              coord.append([float(kk) for kk in ii[:-1]])
+       else:
+          _tmp=re.findall(pos_patt_first,contents)
+          for ii in _tmp:
+              coord.append([float(kk) for kk in ii[1:]])
+    else:
+       _tmp=re.findall(pos_patt_other,contents)
+       for ii in _tmp:
+           coord.append([float(kk) for kk in ii[:-1]])
+
+    _tmp=re.findall(force_patt,contents)
+    for ii in _tmp:
+        force.append([float(kk) for kk in ii])
+
+    if "Self-consistency cycle converged" in contents:
+       is_converge=True
+    else:
+       is_converge=False
+
+    try:
+      _eng_patt=re.compile(eng_patt)
+      energy=float(_eng_patt.search(contents).group().split()[-2])
+    except:
+     energy=None
+    
+    if not energy:
+       is_converge = False
+
+    if energy:
+       assert((force is not None) and len(coord) > 0 )
+
+    return coord, cell, energy, force, virial, is_converge
+
+if __name__=='__main__':
+  import sys
+  ret=get_frames (sys.argv[1], begin = 0, step = 1)
+  print(ret)