deepmodeling
diff --git a/‎.github/workflows/test.yml
Lines changed: 11 additions & 5 deletions b/‎.github/workflows/test.yml
Lines changed: 11 additions & 5 deletions
diff --git a/‎README.md
Lines changed: 45 additions & 0 deletions b/‎README.md
Lines changed: 45 additions & 0 deletions
diff --git a/‎dpdata/__init__.py
Lines changed: 12 additions & 0 deletions b/‎dpdata/__init__.py
Lines changed: 12 additions & 0 deletions
diff --git a/‎dpdata/amber/__init__.py
Lines changed: 1 addition & 0 deletions b/‎dpdata/amber/__init__.py
Lines changed: 1 addition & 0 deletions
diff --git a/‎dpdata/amber/sqm.py
Lines changed: 54 additions & 0 deletions b/‎dpdata/amber/sqm.py
Lines changed: 54 additions & 0 deletions
diff --git a/‎dpdata/bond_order_system.py
Lines changed: 199 additions & 0 deletions b/‎dpdata/bond_order_system.py
Lines changed: 199 additions & 0 deletions
diff --git a/‎dpdata/deepmd/comp.py
Lines changed: 5 additions & 0 deletions b/‎dpdata/deepmd/comp.py
Lines changed: 5 additions & 0 deletions
diff --git a/‎dpdata/deepmd/raw.py
Lines changed: 6 additions & 0 deletions b/‎dpdata/deepmd/raw.py
Lines changed: 6 additions & 0 deletions
diff --git a/‎dpdata/pwmat/__init__.py
Lines changed: 1 addition & 0 deletions b/‎dpdata/pwmat/__init__.py
Lines changed: 1 addition & 0 deletions
diff --git a/‎dpdata/rdkit/__init__.py b/‎dpdata/rdkit/__init__.py
@@ -13,12 +13,18 @@ jobs:
 
     steps:
     - uses: actions/checkout@v2
+    # set up conda
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
+      uses: conda-incubator/setup-miniconda@v2
       with:
-        python-version: ${{ matrix.python-version }}
+        auto-activate-base: true
+        activate-environment: ""
+    # install rdkit and openbabel
+    - name: Install rdkit
+      run: conda create -c conda-forge -n my-rdkit-env python=${{ matrix.python-version }} rdkit openbabel;
     - name: Install dependencies
-      run: pip install .[amber] coverage codecov
+      run: source $CONDA/bin/activate my-rdkit-env && pip install .[amber] coverage codecov
     - name: Test
-      run: cd tests && coverage run --source=../dpdata -m unittest && cd .. && coverage combine tests/.coverage && coverage report
-    - run: codecov
+      run: source $CONDA/bin/activate my-rdkit-env && cd tests && coverage run --source=../dpdata -m unittest && cd .. && coverage combine tests/.coverage && coverage report
+    - name: Run codecov
+      run: source $CONDA/bin/activate my-rdkit-env && codecov
@@ -78,6 +78,7 @@ The `System` or `LabeledSystem` can be constructed from the following file forma
 | PWmat   | movement    | True         | True    | LabeledSystem | 'pwmat/movement'     |
 | PWmat   | OUT.MLMD    | True         | True    | LabeledSystem | 'pwmat/out.mlmd'     |
 | Amber   | multi       | True         | True    | LabeledSystem | 'amber/md'           |
+| Amber/sqm | sqm.out   | False        | False   | System        | 'sqm/out'            |
 | Gromacs | gro         | True         | False   | System        | 'gromacs/gro'        |
 
 
@@ -206,4 +207,48 @@ s.replace('Hf', 'Zr', 8)
 s.to_vasp_poscar('POSCAR.P42nmc.replace')
 ```
 
+# BondOrderSystem
+A new class `BondOrderSystem`, which inherits from class `System`, is introduced in dpdata. This new class contains information about chemical bonds and formal charges (stored in `BondOrderSystem.data['bonds']` and `BondOrderSystem.data['formal_charges']`). Currently, `BondOrderSystem` can only read .mol/.sdf files, because it depends on rdkit (which means rdkit must be installed if you want to use this class). Other formats, such as pdb, must first be converted to .mol/.sdf format (for example with software like Open Babel).
+```python
+import dpdata
+system_1 = dpdata.BondOrderSystem("tests/bond_order/CH3OH.mol", fmt="mol") # read from .mol file
+system_2 = dpdata.BondOrderSystem("tests/bond_order/methane.sdf", fmt="sdf") # read from .sdf file
+```
+In an sdf file, all molecules must have the same topology (i.e. conformers of the same molecular configuration).
+`BondOrderSystem` can also be initialized directly from a `rdkit.Chem.rdchem.Mol` object.
+```python
+from rdkit import Chem
+from rdkit.Chem import AllChem
+import dpdata
+
+mol = Chem.MolFromSmiles("CC")
+mol = Chem.AddHs(mol)
+AllChem.EmbedMultipleConfs(mol, 10)
+system = dpdata.BondOrderSystem(rdkit_mol=mol)
+```
+
+## Bond Order Assignment
+The `BondOrderSystem` implements a more robust sanitization procedure for rdkit Mol objects, as defined in `dpdata.rdkit.sanitize.Sanitizer`. This class defines three levels of sanitization: low, medium and high (the default is medium).
++ low: use `rdkit.Chem.SanitizeMol()` function to sanitize molecule.
++ medium: before using rdkit, the program will first assign the formal charge of each atom to avoid inappropriate valence exceptions. However, this mode requires the bond order information in the given molecule to be correct.
++ high: the program will try to fix inappropriate bond orders in aromatic heterocycles, phosphate, sulfate, carboxyl, nitro, nitrine, guanidine groups. If this procedure fails to sanitize the given molecule, the program will then try to call `obabel` to pre-process the mol and repeat the sanitization procedure. **That is to say, if you want to use this level of sanitization, please ensure `obabel` is installed in the environment.**
+According to our test, our sanitization procedure can successfully read 4852 small molecules in the PDBBind-refined-set. It is necessary to point out that in the molecule file (mol/sdf), the number of explicit hydrogens has to be correct. Thus, we recommend using
+ `obabel xxx -O xxx -h` to pre-process the file. The reason why we do not implement this hydrogen-adding procedure in dpdata is that we can not ensure its correctness.
+
+```python
+import glob
+import dpdata
+
+for sdf_file in glob.glob("bond_order/refined-set-ligands/obabel/*sdf"):
+    syst = dpdata.BondOrderSystem(sdf_file, sanitize_level='high', verbose=False)
+```
+## Formal Charge Assignment
+`BondOrderSystem` implements a method to assign a formal charge to each atom based on the 8-electron rule (see below). Note that it only supports the common elements in bio-systems: B, C, N, O, P, S, As.
+```python
+import dpdata
+
+syst = dpdata.BondOrderSystem("tests/bond_order/CH3NH3+.mol", fmt='mol')
+print(syst.get_formal_charges()) # return the formal charge on each atom
+print(syst.get_charge()) # return the total charge of the system
+```
 
+If a valence of 3 is detected on carbon, its formal charge will be set to -1, because in most cases (alkynyl anion, isonitrile, cyclopentadienyl anion) the formal charge on a 3-valence carbon is -1, which is also consistent with the 8-electron rule.
@@ -9,3 +9,15 @@
     from ._version import version as __version__
 except ImportError:
     from .__about__ import __version__
+
+# BondOrder System has dependency on rdkit
+try:
+    import rdkit
+    USE_RDKIT = True
+except ModuleNotFoundError:
+    USE_RDKIT = False
+
+if USE_RDKIT:
+    from .bond_order_system import BondOrderSystem
+
+
@@ -0,0 +1 @@
+
@@ -0,0 +1,54 @@
+import numpy as np
+
+
+# States of the line-by-line parser in parse_sqm_out.
+START = 0  # initial state, before any known section header has been seen
+READ_CHARGE = 1  # inside the Mulliken charge table
+READ_CHARGE_SUCCESS = 2  # the " Total Mulliken Charge" line has been seen
+READ_COORDS_START = 3  # the " Final Structure" line has been seen
+# While the state is in [READ_COORDS_START, READ_COORDS) each line only advances
+# the state by one, so three lines are skipped before coordinates are read.
+READ_COORDS = 6
+
+def parse_sqm_out(fname):
+    '''
+        Read atom symbols, charges and coordinates from ambertools sqm.out file
+    '''
+    atom_symbols = []
+    coords = []
+    charges = []
+    with open(fname) as f:
+        flag = 0
+        for line in f:
+            if line.startswith("  Atom    Element       Mulliken Charge"):
+                flag = READ_CHARGE
+            elif line.startswith(" Total Mulliken Charge"):
+                flag = READ_CHARGE_SUCCESS
+            elif line.startswith(" Final Structure"):
+                flag = READ_COORDS_START
+            elif flag == READ_CHARGE:
+                ls = line.strip().split()
+                atom_symbols.append(ls[-2])
+                charges.append(float(ls[-1]))
+            elif READ_COORDS_START <= flag < READ_COORDS:
+                flag += 1
+            elif flag == READ_COORDS:
+                ls = line.strip()
+                if not ls:
+                    break
+                else:
+                    symbol = line.strip().split()[-4]
+                    coord = list(map(float, line.strip().split()[-3:]))
+                    coords.append(coord)
+    return atom_symbols, charges, np.array(coords)
+
+
+def to_system_data(fname):
+    data = {}
+    atom_symbols, charges, coords = parse_sqm_out(fname)
+    atom_names, data['atom_types'], atom_numbs = np.unique(atom_symbols, return_inverse=True, return_counts=True)
+    data['atom_names'] = list(atom_names)
+    data['atom_numbs'] = list(atom_numbs)
+    data['charges'] = np.array([charges])
+    data['coords'] = np.array([coords])
+    data['orig'] = np.array([0, 0, 0])
+    data['cells'] = np.array([[[100., 0., 0.], [0., 100., 0.], [0., 0., 100.]]])
+    data['nopbc'] = True
+    return data
@@ -0,0 +1,199 @@
+#%%
+# Bond Order System
+from dpdata.system import Register, System, LabeledSystem, check_System
+import rdkit.Chem
+import dpdata.rdkit.utils
+from dpdata.rdkit.sanitize import Sanitizer, SanitizeError
+from copy import deepcopy
+# import dpdata.rdkit.mol2
+
+def check_BondOrderSystem(data):
+    check_System(data)
+    assert ('bonds' in data.keys())
+    
+class BondOrderSystem(System):
+    '''
+    The system with chemical bond and formal charges information
+
+    For example, a labeled methane system named `d_example` has one molecule (5 atoms, 4 bonds) and `n_frames` frames. The bond order and formal charge information can be accessed by
+        - `d_example['bonds']` : a numpy array of size 4 x 3, and
+                                    the first column represents the index of begin atom,
+                                    the second column represents the index of end atom, 
+                                    the third columen represents the bond order:
+                                        1 - single bond, 2 - double bond, 3 - triple bond, 1.5 - aromatic bond
+        - `d_example['formal_charges']` : a numpy array of size 5 x 1
+    '''
+    def __init__(self,
+                 file_name = None,
+                 fmt = 'auto',
+                 type_map = None,
+                 begin = 0,
+                 step = 1,
+                 data = None,
+                 rdkit_mol = None,
+                 sanitize_level = "medium",
+                 raise_errors = True,
+                 verbose = False,
+                 **kwargs):
+        """
+        Constructor
+
+        Parameters
+        ----------
+        file_name : str
+            The file to load the system
+        fmt : str
+            Format of the file, supported formats are
+                - ``auto`` : inferred from `file_name`'s extention
+                - ``mol`` : .mol file
+                - ``sdf`` : .sdf file
+        type_map : list of str
+            Needed by formats deepmd/raw and deepmd/npy. Maps atom type to name. The atom with type `ii` is mapped to `type_map[ii]`.
+            If not provided the atom names are assigned to `'Type_1'`, `'Type_2'`, `'Type_3'`...
+        begin : int
+            The beginning frame when loading MD trajectory.
+        step : int
+            The number of skipped frames when loading MD trajectory.
+        data : dict
+            System data dict.
+        rdkit_mol : rdkit.Chem.rdchem.Mol
+            If `file_name` is None, you must init with a rdkit Mol type.
+        sanitize_level : str
+            The level of sanitizer, 'low', 'medium' or 'high'.
+        raise_errors : bool
+            whether to raise an Exception if sanitization procedure fails.
+        verbose : bool
+            whether to print information in the sanitization procedure.
+        """
+
+        System.__init__(self)
+        self.sanitizer = Sanitizer(sanitize_level, raise_errors, verbose)
+
+        if data:
+            mol = dpdata.rdkit.utils.system_data_to_mol(data)
+            self.from_rdkit_mol(mol)
+        if file_name:
+            self.from_fmt(file_name, 
+                          fmt,
+                          type_map=type_map,
+                          begin=begin, 
+                          step=step,
+                          **kwargs)
+        elif rdkit_mol:
+            self.from_rdkit_mol(rdkit_mol)
+        else:
+            raise ValueError("Please specify a mol/sdf file or a rdkit Mol object")
+
+        if type_map:
+            self.apply_type_map(type_map)
+
+    register_from_funcs = Register()
+    register_to_funcs = System.register_to_funcs + Register()
+
+    def __repr__(self):
+        return self.__str__()
+
+    def __str__(self):
+        '''
+            A brief summary of the system
+        '''
+        ret = "Data Summary"
+        ret += "\nBondOrder System"
+        ret += "\n-------------------"
+        ret += f"\nFrame Numbers      : {self.get_nframes()}"
+        ret += f"\nAtom Numbers       : {self.get_natoms()}"
+        ret += f"\nBond Numbers       : {self.get_nbonds()}"
+        ret += "\nElement List       :"
+        ret += "\n-------------------"
+        ret += "\n"+"  ".join(map(str,self.get_atom_names()))
+        ret += "\n"+"  ".join(map(str,self.get_atom_numbs()))
+        return ret
+
+    def get_nbonds(self):
+        '''
+            Return the number of bonds
+        '''
+        return len(self.data['bonds'])
+    
+    def get_charge(self):
+        '''
+            Return the total formal charge of the moleclue
+        '''
+        return sum(self.data['formal_charges'])
+    
+    def get_mol(self):
+        '''
+            Return the rdkit.Mol object
+        '''
+        return self.rdkit_mol
+    
+    def get_bond_order(self, begin_atom_idx, end_atom_idx):
+        '''
+            Return the bond order between given atoms
+        '''
+        return self.data['bond_dict'][f'{int(begin_atom_idx)}-{int(end_atom_idx)}']
+    
+    def get_formal_charges(self):
+        '''
+            Return the formal charges on each atom
+        '''
+        return self.data['formal_charges']
+    
+    def copy(self):
+        new_mol = deepcopy(self.rdkit_mol)
+        self.__class__(data=deepcopy(self.data),
+                       rdkit_mol=new_mol)
+    
+    # def __add__(self, other):
+    #     '''
+    #         magic method "+" operation
+    #     '''
+    #     if isinstance(other, BondOrderSystem):
+    #         if dpdata.rdkit.utils.check_same_molecule(self.rdkit_mol, other.rdkit_mol):
+    #             self.__class__(self, data=other.data)
+    #         else:
+    #             raise RuntimeError("The two systems are not of the same topology.")
+    #     else:
+    #         raise RuntimeError(f"Unsupported data structure: {type(other)}")
+
+    def from_rdkit_mol(self, rdkit_mol):
+        '''
+            Initialize from a rdkit.Chem.rdchem.Mol object
+        '''
+        rdkit_mol = self.sanitizer.sanitize(rdkit_mol)
+        self.data = dpdata.rdkit.utils.mol_to_system_data(rdkit_mol)
+        self.data['bond_dict'] = dict([(f'{int(bond[0])}-{int(bond[1])}', bond[2]) for bond in self.data['bonds']])
+        self.rdkit_mol = rdkit_mol
+
+    @register_from_funcs.register_funcs('mol')
+    def from_mol_file(self, file_name):
+        mol = rdkit.Chem.MolFromMolFile(file_name, sanitize=False, removeHs=False)
+        self.from_rdkit_mol(mol)
+
+    @register_to_funcs.register_funcs("mol")
+    def to_mol_file(self, file_name, frame_idx=0):
+        assert (frame_idx < self.get_nframes())
+        rdkit.Chem.MolToMolFile(self.rdkit_mol, file_name, confId=frame_idx)
+    
+    @register_from_funcs.register_funcs("sdf")
+    def from_sdf_file(self, file_name):
+        '''
+        Note that it requires all molecules in .sdf file must be of the same topology
+        '''
+        mols = [m for m in rdkit.Chem.SDMolSupplier(file_name, sanitize=False, removeHs=False)]
+        if len(mols) > 1:
+            mol = dpdata.rdkit.utils.combine_molecules(mols)
+        else:
+            mol = mols[0]
+        self.from_rdkit_mol(mol)
+    
+    @register_to_funcs.register_funcs("sdf")
+    def to_sdf_file(self, file_name, frame_idx=-1):
+        sdf_writer = rdkit.Chem.SDWriter(file_name)
+        if frame_idx == -1:
+            for ii in self.get_nframes():
+                sdf_writer.write(self.rdkit_mol, confId=ii)
+        else:
+            assert (frame_idx < self.get_nframes())
+            sdf_writer.write(self.rdkit_mol, confId=frame_idx)
+        sdf_writer.close()
@@ -71,6 +71,11 @@ def dump(folder,
     # dump raw 
     np.savetxt(os.path.join(folder, 'type.raw'), data['atom_types'], fmt = '%d')    
     np.savetxt(os.path.join(folder, 'type_map.raw'),    data['atom_names'], fmt = '%s')
+    # BondOrder System
+    if "bonds" in data:
+        np.savetxt(os.path.join(folder, "bonds.raw"), data['bonds'], header="begin_atom, end_atom, bond_order")
+    if "formal_charges" in data:
+        np.savetxt(os.path.join(folder, "formal_charges.raw"), data['formal_charges'])
     # reshape frame properties and convert prec
     nframes = data['cells'].shape[0]
     cells  = np.reshape(data['cells'],    [nframes,  9]).astype(comp_prec)
 
@@ -63,6 +63,12 @@ def dump (folder, data) :
     np.savetxt(os.path.join(folder, 'type_map.raw'),    data['atom_names'], fmt = '%s')
     np.savetxt(os.path.join(folder, 'box.raw'),     np.reshape(data['cells'],    [nframes,  9]))
     np.savetxt(os.path.join(folder, 'coord.raw'),   np.reshape(data['coords'],   [nframes, -1]))
+    # BondOrder System
+    if "bonds" in data:
+        np.savetxt(os.path.join(folder, "bonds.raw"), data['bonds'], header="begin_atom, end_atom, bond_order")
+    if "formal_charges" in data:
+        np.savetxt(os.path.join(folder, "formal_charges.raw"), data['formal_charges'])
+    # Labeled System
     if 'energies' in data :
         np.savetxt(os.path.join(folder, 'energy.raw'),  np.reshape(data['energies'], [nframes,  1]))
     if 'forces' in data :
 
@@ -0,0 +1 @@
+