-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdescriptors.py
More file actions
executable file
·285 lines (248 loc) · 9.55 KB
/
descriptors.py
File metadata and controls
executable file
·285 lines (248 loc) · 9.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
from copy import deepcopy
import pandas as pd
from matminer.featurizers import structure
from mordred import Calculator, descriptors
from pymatgen.analysis.graphs import ConnectedSite
from pymatgen.analysis.local_env import JmolNN, CrystalNN, CutOffDictNN
from pymatgen.core.structure import Structure, Site, Molecule, PeriodicSite
from rdkit import RDLogger
from rdkit.Chem import MolFromSmiles, Mol, Fragments
from AnalysisModule.calculator.cxcalc import JChemCalculator, Jia2019FeatureDict
from AnalysisModule.routines.pbc import PBCparser
from AnalysisModule.routines.util import get_cutoffdict
"""
classification of descriptors
- structures vs properties
- local vs global
- bulk vs organic #?
iteractions:
- short contacts btw templates and inogranics
- dipole orientation?
total structure:
- density
template structure:
- smiles
- # of N
- charge
- packing?
- conformational?
inorganic structure:
- dimensionality
- metal coordination
- space fill
- conformational?
- MOF: LCD PLD
"""
class DescriptorError(Exception): pass
def is_smiles(s: str):
try:
RDLogger.DisableLog('rdApp.*')
m = MolFromSmiles(s)
RDLogger.EnableLog('rdApp.*')
if isinstance(m, Mol):
return True
else:
return False
except:
return False
class DescriptorCalculator:
def __init__(self, entity):
self.entity = entity
def get_descriptors(self, updatedict=None, force_recal=False):
if isinstance(updatedict, dict):
ds = updatedict
else:
ds = dict()
if force_recal:
ds = dict()
for cal in dir(self):
if callable(getattr(self, cal)) and cal.startswith("cal_"):
dname = cal[4:]
if dname in ds.keys():
continue
ds[dname] = getattr(self, cal)(self.entity)
return ds
class MolecularDC(DescriptorCalculator):
RdkitFrags = """_feat_fr_NH2 Number of primary amines
_feat_fr_NH1 Number of secondary amines
_feat_fr_NH0 Number of tertiary amines
_feat_fr_quatN Number of quaternary amines
_feat_fr_ArN Number of N functional groups attached to aromatics
_feat_fr_Ar_NH Number of aromatic amines
_feat_fr_Imine Number of imines
_feat_fr_amidine Number of amidine groups
_feat_fr_dihydropyridine Number of dihydropyridines
_feat_fr_guanido Number of guanidine groups
_feat_fr_piperdine Number of piperidine rings
_feat_fr_piperzine Number of piperzine rings
_feat_fr_pyridine Number of pyridine rings"""
RdkitFragsFuncNames = [line.split()[0].replace("_feat_", "") for line in RdkitFrags.split("\n")]
@staticmethod
def cal_Mordred2D(smiles: str or [str]):
mcalc = Calculator(descriptors, ignore_3D=True)
if isinstance(smiles, str):
mordred_des = mcalc(MolFromSmiles(smiles)).drop_missing().asdict()
else:
rdmols = [MolFromSmiles(smi) for smi in smiles]
mordred_des = mcalc.pandas(rdmols)
return mordred_des
@staticmethod
def cal_Jchem2D(smiles: str or [str]):
jc = JChemCalculator()
if isinstance(smiles, str):
df = jc.cal_feature(list(Jia2019FeatureDict.values()), [smiles])
del df['id']
return df.to_dict("records")[0]
else:
df = jc.cal_feature(list(Jia2019FeatureDict.values()), smiles)
del df['id']
return df
@staticmethod
def cal_RdkitFrag(smiles: str or [str]):
if isinstance(smiles, str):
mol = MolFromSmiles(smiles)
results = {}
for funcname in MolecularDC.RdkitFragsFuncNames:
if funcname in dir(Fragments):
func = getattr(Fragments, funcname)
if callable(func):
results[funcname] = func(mol)
return results
else:
rdmols = [MolFromSmiles(smi) for smi in smiles]
results = []
for mol in rdmols:
result = {}
for funcname in MolecularDC.RdkitFragsFuncNames:
if funcname in dir(Fragments):
func = getattr(Fragments, funcname)
if callable(func):
result[funcname] = func(mol)
results.append(result)
return pd.DataFrame.from_records(results)
class ConformerDC(DescriptorCalculator): pass
class StructureDC(DescriptorCalculator):
@staticmethod
def s2c(anystructure: Structure, already_unwrap=True):
if already_unwrap:
unwrap_structure = anystructure
else:
mols, unwrap_structure, unwrap_pblock_list = PBCparser.unwrap(anystructure)
if len(mols) > 1:
raise DescriptorError("there are more than 1 molecule in the input of s2c!")
s: PeriodicSite
return Molecule.from_sites(
Site(s.species, s.coords, properties=deepcopy(s.properties)) for s in unwrap_structure)
@staticmethod
def cal_LatticeVolume(anystructure: Structure):
return anystructure.volume
@staticmethod
def cal_FormalCharge(anystructure: Structure):
fc = 0
for s in anystructure:
try:
fc += float(s.properties["formal_charge"])
except:
raise DescriptorError("formal charge is strange for site: {}".format(s.properties))
return fc
class OxideStructureDC(StructureDC):
@staticmethod
def helper_DimensionPmg(oxide_structure: Structure):
cutoff_dict = get_cutoffdict(oxide_structure, 1.3)
dims = {}
for nn in (
# JmolNN(),
# CovalentBondNN(),
CrystalNN(),
# VoronoiNN(),
CutOffDictNN(cutoff_dict),
):
dim = structure.Dimensionality(nn_method=nn)
# if isinstance(nn, CrystalNN):
# bs = dim.nn_method.get_bonded_structure(oxide_structure)
f = dim.featurize(oxide_structure)
dims[nn.__class__.__name__] = int(f[0])
return dims
@staticmethod
def helper_DimensionPmgWithoutHydrogens(oxide_structure: Structure):
oxide_structure_woh = Structure.from_sites(
[s for s in oxide_structure.sites if s.species_string.upper() != "H"])
return OxideStructureDC.helper_DimensionPmg(oxide_structure_woh)
@staticmethod
def cal_Dimension(oxide_structure: Structure):
dimension_dict = OxideStructureDC.helper_DimensionPmgWithoutHydrogens(oxide_structure)
if len(set(dimension_dict.values())) != 1:
raise DescriptorError("inconsistent dim calculated: {}".format(dimension_dict))
else:
# TODO add bias
dim = list(dimension_dict.values())[0]
return dim
# @staticmethod
# def cal_CoordinationDict(oxide_structure):
# possible_coordinations = [0, 1, 2, 3, 4, 5, 6, 7, 8]
# coords = OrderedDict(zip(possible_coordinations, [[] for i in possible_coordinations]))
# for s in oxide_structure:
# coord = len(s.properties["nn"])
# if coord in possible_coordinations:
# coords[coord].append(s.properties["label"])
# return coords
#
# @staticmethod
# def cal_CoordinationCounts(oxide_structure):
# table = OxideStructureDC.cal_CoordinationDict(oxide_structure)
# return [len(v) for v in table.values()]
@staticmethod
def cal_CompositionFormula(oxide_structure: Structure):
return oxide_structure.composition.formula
@staticmethod
def helper_pbucenters(site: PeriodicSite):
e = site.species.elements[0]
if e.symbol == "H":
return False
if e.symbol == "O":
return False
if e.symbol == "F":
return False
if e.is_noble_gas:
return False
# if e.is_alkaline:
# return False
if e.is_alkali:
return False
return True
# if e.is_chalcogen or e.is_metal or e.is_metalloid:
# return True
@staticmethod
def helper_connectedsite2molsite(cs: ConnectedSite):
ps: PeriodicSite = cs.site
s = Site(ps.species, ps.coords, properties=ps.properties)
return s
#
# @staticmethod
# def cal_PrimaryBuildUnits(oxide_structure: Structure):
#
# # TODO atomic radii bad, try cation/anion radii with CutoffNN
# # for nn in (
# # JmolNN(),
# # CrystalNN(),
# # CutOffDictNN(cutoff_dict),
# # ):
# oxide_structure_woh = Structure.from_sites(
# [s for s in oxide_structure.sites if s.species_string.upper() != "H"])
# # cutoff_dict = OxideStructureDC.helpler_cutoffdict(oxide_structure_woh)
# # nn = CutOffDictNN(cutoff_dict)
# nn = JmolNN()
# dim = structure.Dimensionality(nn_method=nn)
# bs = dim.nn_method.get_bonded_structure(oxide_structure_woh)
# pbus = []
# for i in range(len(oxide_structure_woh.sites)):
# the_site: PeriodicSite = oxide_structure_woh.sites[i]
# if not OxideStructureDC.helper_pbucenters(the_site):
# continue
# # its_nbrs = getattr(bs.get_connected_sites(i), "ConnectedSite")
# pbu = the_site.species
# for cs in bs.get_connected_sites(i):
# pbu += Composition(cs.site.species_string)
# # its_nbrs = [cs.site for cs in bs.get_connected_sites(i)] # TODO use helper_connectedsite2molsite to get a Molecule
# pbus.append(pbu.formula)
# return dict(Counter(pbus))