Skip to content
This repository was archived by the owner on Jul 20, 2025. It is now read-only.

Commit 534e1cf

Browse files
committed
More careful truncation of the suspicious training MPDS values
1 parent 355a58e commit 534e1cf

File tree

4 files changed

+71
-70
lines changed

4 files changed

+71
-70
lines changed

mpds_ml_labs/prediction.py

Lines changed: 38 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,42 +6,62 @@
66
import numpy as np
77

88

9-
human_names = {
9+
# Registry of the physical properties handled by the ML models, keyed by a
# one-letter property id. The id is the third character of the model and data
# file names ("ml<id>_..." / "df<id>_..."), which is how files are matched to
# an entry here.
#
# Fields of every entry:
#   name     -- property name; also sent verbatim as the MPDS "props" query
#   units    -- units string; fetched rows with other units are filtered out
#   symbol   -- short label containing HTML markup (entities, <sub> tags)
#   rounding -- number of digits passed to round() for predictions and MAE
#   interval -- [lower, upper] bounds, both exclusive, used to truncate
#               suspicious values in the training data
prop_semantics = {
    # Disabled entry, kept for reference:
    #'w': {
    #    'name': 'band gap for direct transition',
    #    'units': 'eV',
    #    'symbol': 'e<sub>dir.</sub>',
    #    'rounding': 1,
    #    'interval': [0.01, 20]
    #},
    'z': {
        'name': 'isothermal bulk modulus',
        'units': 'GPa',
        'symbol': 'B',
        'rounding': 0,
        'interval': [0.5, 2000]
    },
    'y': {
        'name': 'enthalpy of formation',
        'units': 'kJ g-at.-1',
        'symbol': '&Delta;H',
        'rounding': 0,
        'interval': [-900, 200]
    },
    'x': {
        'name': 'heat capacity at constant pressure',
        'units': 'J K-1 g-at.-1',
        'symbol': 'C<sub>p</sub>',
        'rounding': 0,
        'interval': [0, 500]
    },
    'k': {
        'name': 'Seebeck coefficient',
        'units': 'muV K-1',
        'symbol': 'S',
        'rounding': 1,
        'interval': [-1000, 1000]
    },
    'm': {
        'name': 'temperature for congruent melting',
        'units': 'K',
        'symbol': 'T<sub>melt</sub>',
        'rounding': 0,
        'interval': [10, 5000]
    },
    'd': {
        'name': 'Debye temperature',
        'units': 'K',
        'symbol': '&Theta;<sub>D</sub>',
        'rounding': 0,
        'interval': [10, 2000]
    },
    't': {
        'name': 'linear thermal expansion coefficient',
        'units': 'K-1',
        # Was '&Theta;<sub>D</sub>', a copy-paste of the Debye temperature
        # symbol above; alpha is the conventional symbol for this property.
        'symbol': '&alpha;',
        'rounding': 6,
        'interval': [-0.001, 0.001]
    }
}
4767

@@ -112,9 +132,9 @@ def load_ml_model(prop_model_files):
112132
continue
113133

114134
basename = file_name.split(os.sep)[-1]
115-
if basename.startswith('ml') and basename[3:4] == '_' and basename[2:3] in human_names:
135+
if basename.startswith('ml') and basename[3:4] == '_' and basename[2:3] in prop_semantics:
116136
prop_id = basename[2:3]
117-
print("Detected property %s in file %s" % (human_names[prop_id]['name'], basename))
137+
print("Detected property %s in file %s" % (prop_semantics[prop_id]['name'], basename))
118138
else:
119139
prop_id = str(n)
120140
print("No property name detected in file %s" % basename)
@@ -132,7 +152,7 @@ def load_ml_model(prop_model_files):
132152
def get_legend(pred_dict):
133153
legend = {}
134154
for key in pred_dict.keys():
135-
legend[key] = human_names.get(key, {
155+
legend[key] = prop_semantics.get(key, {
136156
'name': 'Unspecified property ' + str(key),
137157
'units': 'arb.u.',
138158
'symbol': 'P' + str(key),
@@ -147,7 +167,7 @@ def ase_to_ml_model(ase_obj, ml_model):
147167
d_dim = len(descriptor)
148168

149169
if not ml_model: # testing
150-
return {prop_id: {'value': 42, 'mae': 0, 'r2': 0} for prop_id in human_names.keys()}, None
170+
return {prop_id: {'value': 42, 'mae': 0, 'r2': 0} for prop_id in prop_semantics.keys()}, None
151171

152172
for prop_id, regr in ml_model.items(): # production
153173

@@ -164,8 +184,8 @@ def ase_to_ml_model(ase_obj, ml_model):
164184
return None, str(e)
165185

166186
result[prop_id] = {
167-
'value': round(prediction, human_names[prop_id]['rounding']),
168-
'mae': round(regr.metadata['mae'], human_names[prop_id]['rounding']),
187+
'value': round(prediction, prop_semantics[prop_id]['rounding']),
188+
'mae': round(regr.metadata['mae'], prop_semantics[prop_id]['rounding']),
169189
'r2': regr.metadata['r2']
170190
}
171191

mpds_ml_labs/test_app.py

Lines changed: 13 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -8,30 +8,14 @@
88

99
from mpds_client import MPDSDataRetrieval, APIError
1010

11-
from prediction import human_names
12-
from struct_utils import detect_format, poscar_to_ase, symmetrize, get_formula
11+
from prediction import prop_semantics
12+
from struct_utils import detect_format, poscar_to_ase, symmetrize, get_formula, sgn_to_crsystem
1313
from cif_utils import cif_to_ase
1414

1515

1616
req = httplib2.Http()
1717
client = MPDSDataRetrieval()
1818

19-
def sgn_to_crsystem(number):
20-
if 195 <= number <= 230:
21-
return 'cubic'
22-
elif 168 <= number <= 194:
23-
return 'hexagonal'
24-
elif 143 <= number <= 167:
25-
return 'trigonal'
26-
elif 75 <= number <= 142:
27-
return 'tetragonal'
28-
elif 16 <= number <= 74:
29-
return 'orthorhombic'
30-
elif 3 <= number <= 15:
31-
return 'monoclinic'
32-
else:
33-
return 'triclinic'
34-
3519
def make_request(address, data={}, httpverb='POST', headers={}):
3620

3721
address += '?' + urlencode(data)
@@ -47,6 +31,9 @@ def make_request(address, data={}, httpverb='POST', headers={}):
4731

4832
if __name__ == '__main__':
4933

34+
try: sys.argv[1]
35+
except IndexError: sys.exit("Structure file must be given!")
36+
5037
structure = open(sys.argv[1]).read()
5138
fmt = detect_format(structure)
5239

@@ -72,29 +59,29 @@ def make_request(address, data={}, httpverb='POST', headers={}):
7259
raise RuntimeError(answer['error'])
7360

7461
formulae_categ, lattices_categ = get_formula(ase_obj), sgn_to_crsystem(ase_obj.info['spacegroup'].no)
75-
for prop_id, pdata in human_names.items():
62+
for prop_id, pdata in prop_semantics.items():
7663
try:
7764
resp = client.get_dataframe({
7865
'formulae': formulae_categ,
7966
'lattices': lattices_categ,
8067
'props': pdata['name']
8168
})
8269
except APIError as e:
83-
human_names[prop_id]['factual'] = None
70+
prop_semantics[prop_id]['factual'] = None
8471
if e.code == 1:
8572
continue
8673
else:
8774
raise
8875

8976
resp['Value'] = resp['Value'].astype('float64') # to treat values out of bounds given as str
9077
resp = resp[resp['Units'] == pdata['units']]
91-
human_names[prop_id]['factual'] = np.median(resp['Value'])
78+
prop_semantics[prop_id]['factual'] = np.median(resp['Value'])
9279

9380
for prop_id, pdata in answer['prediction'].items():
94-
print("{0:40} = {1:6}, factual {2:6} (MAE = {3:4}), {4}".format(
95-
human_names[prop_id]['name'],
81+
print("{0:40} = {1:6}, factual {2:8} (MAE = {3:4}), {4}".format(
82+
prop_semantics[prop_id]['name'],
9683
pdata['value'],
97-
human_names[prop_id]['factual'] or 'absent',
98-
abs(pdata['value'] - human_names[prop_id]['factual']) if human_names[prop_id]['factual'] else 'unknown',
99-
human_names[prop_id]['units']
84+
prop_semantics[prop_id]['factual'] or 'absent',
85+
pdata['mae'],
86+
prop_semantics[prop_id]['units']
10087
))

mpds_ml_labs/test_ml.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
from struct_utils import detect_format, poscar_to_ase, symmetrize
55
from cif_utils import cif_to_ase
6-
from prediction import ase_to_ml_model, load_ml_model, human_names
6+
from prediction import ase_to_ml_model, load_ml_model, prop_semantics
77
from common import ML_MODELS, DATA_PATH
88

99

@@ -57,8 +57,8 @@
5757

5858
for prop_id, pdata in prediction.items():
5959
print("{0:40} = {1:6} (MAE = {2:4}), {3}".format(
60-
human_names[prop_id]['name'],
60+
prop_semantics[prop_id]['name'],
6161
pdata['value'],
6262
pdata['mae'],
63-
human_names[prop_id]['units']
63+
prop_semantics[prop_id]['units']
6464
))

train_model.py

Lines changed: 17 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
from mpds_client import MPDSDataRetrieval, MPDSExport
1616

17-
from prediction import get_descriptor, human_names
17+
from mpds_ml_labs.prediction import get_descriptor, prop_semantics
1818

1919

2020
def get_regr(a=None, b=None):
@@ -57,13 +57,13 @@ def mpds_get_data(prop_id, descriptor_kappa):
5757
Fetch, massage, and save dataframe from the MPDS
5858
NB currently pressure is not taken into account!
5959
"""
60-
print("Getting %s with descriptor kappa = %s" % (human_names[prop_id]['name'], descriptor_kappa))
60+
print("Getting %s with descriptor kappa = %s" % (prop_semantics[prop_id]['name'], descriptor_kappa))
6161
starttime = time.time()
6262

6363
client = MPDSDataRetrieval()
6464

6565
props = client.get_dataframe(
66-
{"props": human_names[prop_id]['name']},
66+
{"props": prop_semantics[prop_id]['name']},
6767
fields={'P': [
6868
'sample.material.chemical_formula',
6969
'sample.material.phase_id',
@@ -77,23 +77,17 @@ def mpds_get_data(prop_id, descriptor_kappa):
7777
)
7878
props['Value'] = props['Value'].astype('float64') # to treat values out of bounds given as str
7979
props = props[np.isfinite(props['Phase'])]
80-
props = props[props['Units'] == human_names[prop_id]['units']]
81-
82-
# filtering some abnormal values
83-
# these should be corrected by LPF editors soon
84-
if prop_id == 'z':
85-
props = props[props['Value'] < 2000]
86-
#elif prop_id == 'w': # NB this requires additional treatment for zero band gaps
87-
# props = props[(props['Value'] > 0) & (props['Value'] < 20)]
88-
elif prop_id == 'u':
89-
props = props[props['Value'] > 0]
90-
91-
to_drop = props[
92-
(props['Cname'] == 'Temperature') & (props['Cunits'] == 'K') & ((props['Cvalue'] < 200) | (props['Cvalue'] > 400))
80+
props = props[props['Units'] == prop_semantics[prop_id]['units']]
81+
props = props[
82+
(props['Value'] > prop_semantics[prop_id]['interval'][0]) & \
83+
(props['Value'] < prop_semantics[prop_id]['interval'][1])
9384
]
94-
95-
print("Rows to drop by criteria: %s" % len(to_drop))
96-
props.drop(to_drop.index, inplace=True)
85+
if prop_id not in ['m', 'd']:
86+
to_drop = props[
87+
(props['Cname'] == 'Temperature') & (props['Cunits'] == 'K') & ((props['Cvalue'] < 200) | (props['Cvalue'] > 400))
88+
]
89+
print("Rows to neglect by temperature: %s" % len(to_drop))
90+
props.drop(to_drop.index, inplace=True)
9791

9892
phases_compounds = dict(zip(props['Phase'], props['Compound'])) # keep the mapping for future
9993
avgprops = props.groupby('Phase')['Value'].mean().to_frame().reset_index().rename(columns={'Value': 'Avgvalue'})
@@ -163,9 +157,9 @@ def tune_model(data_file):
163157
Load saved data and perform simple regressor parameter tuning
164158
"""
165159
basename = data_file.split(os.sep)[-1]
166-
if basename.startswith('df') and basename[3:4] == '_' and basename[2:3] in human_names:
160+
if basename.startswith('df') and basename[3:4] == '_' and basename[2:3] in prop_semantics:
167161
tag = basename[2:3]
168-
print("Detected property %s" % human_names[tag]['name'])
162+
print("Detected property %s" % prop_semantics[tag]['name'])
169163
else:
170164
tag = None
171165
print("No property name detected")
@@ -213,14 +207,14 @@ def tune_model(data_file):
213207
sys.exit(
214208
"What to do?\n"
215209
"Please, provide either a *prop_id* letter (%s) for a property data to be downloaded and fitted,\n"
216-
"or a data *filename* for tuning the model." % ", ".join(human_names.keys())
210+
"or a data *filename* for tuning the model." % ", ".join(prop_semantics.keys())
217211
)
218212
try:
219213
descriptor_kappa = int(sys.argv[2])
220214
except:
221215
descriptor_kappa = None
222216

223-
if arg in human_names.keys():
217+
if arg in prop_semantics.keys():
224218

225219
struct_props = mpds_get_data(arg, descriptor_kappa)
226220

0 commit comments

Comments
 (0)