This repository was archived by the owner on Jul 20, 2025. It is now read-only.

Commit e571e42

Replace joblib with cPickle, change min_descriptor_len and the train_test_split ratio, refactor the training script and folder layout, request only non-disordered structures
1 parent: 5977ed9
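The central persistence change swaps sklearn's bundled joblib for the standard-library cPickle when saving and loading the fitted regressor, whose MAE/R2 quality estimates travel along as a `metadata` attribute. A minimal round-trip sketch under Python 2 follows; the RandomForestRegressor choice, the toy data, and the file name are illustrative only, with the 'ml' + property-letter prefix simply mirroring what load_ml_model() checks for.

# Minimal sketch (Python 2): persist and reload a model with cPickle instead of joblib.
# The regressor, toy data, and file name are illustrative; only the 'ml<prop_id>_' name
# pattern and the 'metadata' attribute mirror what load_ml_model() expects.
import cPickle

from sklearn.ensemble import RandomForestRegressor

regr = RandomForestRegressor(n_estimators=10)
regr.fit([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0]], [0.5, 1.5, 2.5])   # toy training data
regr.metadata = {'mae': 0.1, 'r2': 0.9}                           # quality estimates ride along

with open('mlz_example.pkl', 'wb') as f:                          # 'mlz_' => property 'z'
    cPickle.dump(regr, f, protocol=cPickle.HIGHEST_PROTOCOL)

with open('mlz_example.pkl', 'rb') as f:                          # same pattern as load_ml_model()
    model = cPickle.load(f)
assert hasattr(model, 'predict') and hasattr(model, 'metadata')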

File tree

5 files changed: +103, -77 lines
mpds_ml_labs/prediction.py

Lines changed: 23 additions & 12 deletions
@@ -1,11 +1,10 @@
 
 from __future__ import division
 import os
+import cPickle
 
 import numpy as np
 
-from sklearn.externals import joblib
-
 
 human_names = {
     'z': {
@@ -17,7 +16,7 @@
     'y': {
         'name': 'enthalpy of formation',
         'units': 'kJ g-at.-1',
-        'symbol': 'H',
+        'symbol': 'ΔH',
         'rounding': 0
     },
     'x': {
@@ -26,6 +25,12 @@
         'symbol': 'C<sub>p</sub>',
         'rounding': 0
     },
+    #'w': {
+    #    'name': 'band gap for direct transition',
+    #    'units': 'eV',
+    #    'symbol': 'e<sub>dir.</sub>',
+    #    'rounding': 1
+    #},
     'k': {
         'name': 'Seebeck coefficient',
         'units': 'muV K-1',
@@ -62,21 +67,24 @@
 periodic_numbers_normed = [(i - pmin)/(pmax - pmin) for i in periodic_numbers]
 
 
-def get_descriptor(ase_obj, kappa=18, overreach=False):
+def get_descriptor(ase_obj, kappa=None, overreach=False):
     """
     From ASE object obtain
     a vectorized atomic structure
     populated to a certain fixed (relatively big) volume
     defined by kappa
     """
+    if not kappa: kappa = 18
     if overreach: kappa *= 2
 
     norms = np.array([ np.linalg.norm(vec) for vec in ase_obj.get_cell() ])
     multiple = np.ceil(kappa / norms).astype(int)
     ase_obj = ase_obj.repeat(multiple)
     com = ase_obj.get_center_of_mass() # NB use recent ase version here, because of the new element symbols
     ase_obj.translate(-com)
-    del ase_obj[[atom.index for atom in ase_obj if np.sqrt(np.dot(atom.position, atom.position)) > kappa]]
+    del ase_obj[
+        [atom.index for atom in ase_obj if np.sqrt(np.dot(atom.position, atom.position)) > kappa]
+    ]
 
     ase_obj.center()
     ase_obj.set_pbc((False, False, False))
@@ -99,6 +107,10 @@ def get_descriptor(ase_obj, kappa=18, overreach=False):
 def load_ml_model(prop_model_files):
     ml_model = {}
     for n, file_name in enumerate(prop_model_files, start=1):
+        if not os.path.exists(file_name):
+            print("No file %s" % file_name)
+            continue
+
         basename = file_name.split(os.sep)[-1]
         if basename.startswith('ml') and basename[3:4] == '_' and basename[2:3] in human_names:
             prop_id = basename[2:3]
@@ -107,10 +119,11 @@ def load_ml_model(prop_model_files):
             prop_id = str(n)
             print("No property name detected in file %s" % basename)
 
-        model = joblib.load(file_name)
-        if hasattr(model, 'predict') and hasattr(model, 'metadata'):
-            ml_model[prop_id] = model
-            print("Model metadata: %s" % model.metadata)
+        with open(file_name, 'rb') as f:
+            model = cPickle.load(f)
+            if hasattr(model, 'predict') and hasattr(model, 'metadata'):
+                ml_model[prop_id] = model
+                print("Model metadata: %s" % model.metadata)
 
     print("Loaded property models: %s" % len(ml_model))
     return ml_model
@@ -134,9 +147,7 @@ def ase_to_ml_model(ase_obj, ml_model):
     d_dim = len(descriptor)
 
     if not ml_model: # testing
-
-        test_prop = round(np.sum(descriptor))
-        return {prop_id: {'value': test_prop, 'mae': 0, 'r2': 0} for prop_id in human_names.keys()}, None
+        return {prop_id: {'value': 42, 'mae': 0, 'r2': 0} for prop_id in human_names.keys()}, None
 
     for prop_id, regr in ml_model.items(): # production
 
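Two behavioural notes on the prediction.py changes above: get_descriptor() now defaults kappa to None and falls back to 18 internally, so a user-supplied kappa can be passed straight through from the training script, and load_ml_model() now skips missing model files with a warning instead of failing. A rough usage sketch, assuming a recent ASE and an importable mpds_ml_labs package; the NaCl cell and the kappa value are toy inputs, not taken from this commit.

# Rough usage sketch; the NaCl structure and kappa=25 are toy inputs, not from this commit.
from ase.build import bulk

from mpds_ml_labs.prediction import get_descriptor, load_ml_model

crystal = bulk('NaCl', 'rocksalt', a=5.64)       # small test structure (recent ASE assumed)

d_default = get_descriptor(crystal)              # kappa=None falls back to 18 inside
d_wide = get_descriptor(crystal, kappa=25)       # a larger cutoff sphere keeps more atoms
print("Descriptor lengths: %s vs. %s" % (len(d_default), len(d_wide)))

ml_model = load_ml_model(['no_such_file.pkl'])   # now prints "No file no_such_file.pkl" and skips it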
mpds_ml_labs/ml_mpds.py renamed to train_model.py

Lines changed: 80 additions & 65 deletions
@@ -33,7 +33,7 @@ def get_regr(a=None, b=None):
     )
 
 
-def estimate_quality(algo, args, values, attempts=30, nsamples=0.4):
+def estimate_quality(algo, args, values, attempts=30, nsamples=0.33):
     results = []
     for _ in range(attempts):
         X_train, X_test, y_train, y_test = train_test_split(args, values, test_size=nsamples)
@@ -52,13 +52,12 @@ def estimate_quality(algo, args, values, attempts=30, nsamples=0.4):
     return avg_mae, avg_r2
 
 
-def mpds_get_data(prop_id):
+def mpds_get_data(prop_id, descriptor_kappa):
     """
-    NB
-    currently pressure is not taken into account,
-    however must be
+    Fetch, massage, and save dataframe from the MPDS
+    NB currently pressure is not taken into account!
     """
-    print("Getting %s" % human_names[prop_id]['name'])
+    print("Getting %s with descriptor kappa = %s" % (human_names[prop_id]['name'], descriptor_kappa))
     starttime = time.time()
 
     client = MPDSDataRetrieval()
@@ -84,12 +83,14 @@ def mpds_get_data(prop_id):
     # these should be corrected by LPF editors soon
     if prop_id == 'z':
         props = props[props['Value'] < 2000]
-    elif prop_id == 'w':
-        props = props[(props['Value'] > 0) & (props['Value'] < 20)]
+    #elif prop_id == 'w': # NB this requires additional treatment for zero band gaps
+    #    props = props[(props['Value'] > 0) & (props['Value'] < 20)]
     elif prop_id == 'u':
         props = props[props['Value'] > 0]
 
-    to_drop = props[(props['Cname'] == 'Temperature') & (props['Cunits'] == 'K') & ((props['Cvalue'] < 200) | (props['Cvalue'] > 400))]
+    to_drop = props[
+        (props['Cname'] == 'Temperature') & (props['Cunits'] == 'K') & ((props['Cvalue'] < 200) | (props['Cvalue'] > 400))
+    ]
 
     print("Rows to drop by criteria: %s" % len(to_drop))
     props.drop(to_drop.index, inplace=True)
@@ -100,22 +101,26 @@ def mpds_get_data(prop_id):
 
     print("Got %s distinct crystalline phases" % len(phases))
 
-    min_descriptor_len = 220
+    min_descriptor_len = 200
     max_descriptor_len = min_descriptor_len*10
     data_by_phases = {}
 
+    print("Computing descriptors...")
     pbar = ProgressBar()
     for item in pbar(client.get_data(
-        {"props": "atomic structure"},
+        {
+            "props": "atomic structure",
+            "classes": "non-disordered"
+        },
         fields={'S':['phase_id', 'entry', 'chemical_formula', 'cell_abc', 'sg_n', 'setting', 'basis_noneq', 'els_noneq']},
         phases=phases
     )):
         crystal = MPDSDataRetrieval.compile_crystal(item, 'ase')
         if not crystal: continue
-        descriptor = get_descriptor(crystal)
+        descriptor = get_descriptor(crystal, kappa=descriptor_kappa)
 
         if len(descriptor) < min_descriptor_len:
-            descriptor = get_descriptor(crystal, overreach=True)
+            descriptor = get_descriptor(crystal, kappa=descriptor_kappa, overreach=True)
             if len(descriptor) < min_descriptor_len:
                 continue
 
@@ -147,79 +152,89 @@ def mpds_get_data(prop_id):
 
     print("Done %s rows in %1.2f sc" % (len(struct_props), time.time() - starttime))
 
-    export_file = MPDSExport.save_df(struct_props, prop_id)
-    print("Saving %s" % export_file)
+    struct_props.export_file = MPDSExport.save_df(struct_props, prop_id)
+    print("Saving %s" % struct_props.export_file)
 
     return struct_props
 
 
+def tune_model(data_file):
+    """
+    Load saved data and perform simple regressor parameter tuning
+    """
+    basename = data_file.split(os.sep)[-1]
+    if basename.startswith('df') and basename[3:4] == '_' and basename[2:3] in human_names:
+        tag = basename[2:3]
+        print("Detected property %s" % human_names[tag]['name'])
+    else:
+        tag = None
+        print("No property name detected")
+
+    df = pd.read_pickle(data_file)
+
+    X = np.array(df['Descriptor'].tolist())
+    y = df['Avgvalue'].tolist()
+
+    results = []
+    for parameter_a in range(20, 501, 20):
+        avg_mae, avg_r2 = estimate_quality(get_regr(a=parameter_a), X, y)
+        results.append([parameter_a, avg_mae, avg_r2])
+        print("%s\t\t\t%s\t\t\t%s" % (parameter_a, avg_mae, avg_r2))
+    results.sort(key=lambda x: (-x[1], x[2]))
+
+    print("Best result:", results[-1])
+    parameter_a = results[-1][0]
+
+    results = []
+    for parameter_b in range(1, 13):
+        avg_mae, avg_r2 = estimate_quality(get_regr(a=parameter_a, b=parameter_b), X, y)
+        results.append([parameter_b, avg_mae, avg_r2])
+        print("%s\t\t\t%s\t\t\t%s" % (parameter_b, avg_mae, avg_r2))
+    results.sort(key=lambda x: (-x[1], x[2]))
+
+    print("Best result:", results[-1])
+    parameter_b = results[-1][0]
+
+    print("a = %s b = %s" % (parameter_a, parameter_b))
+
+    regr = get_regr(a=parameter_a, b=parameter_b)
+    regr.fit(X, y)
+    regr.metadata = {'mae': avg_mae, 'r2': round(avg_r2, 2)}
+
+    if tag:
+        export_file = MPDSExport.save_model(regr, tag)
+        print("Saving %s" % export_file)
+
+
 if __name__ == "__main__":
     try:
         arg = sys.argv[1]
     except IndexError:
         sys.exit(
-            "What to do?\n"
-            "Please, provide either a *prop_id* letter (%s) for a property data to be downloaded,\n"
-            "or a data *filename* generated after a property data download." % ", ".join(human_names.keys())
+            "What to do?\n"
+            "Please, provide either a *prop_id* letter (%s) for a property data to be downloaded and fitted,\n"
+            "or a data *filename* for tuning the model." % ", ".join(human_names.keys())
         )
+    try:
+        descriptor_kappa = int(sys.argv[2])
+    except:
+        descriptor_kappa = None
 
     if arg in human_names.keys():
 
-        # getting the data from scratch by prop_id
-        struct_props = mpds_get_data(arg)
+        struct_props = mpds_get_data(arg, descriptor_kappa)
 
         X = np.array(struct_props['Descriptor'].tolist())
         y = struct_props['Avgvalue'].tolist()
 
         avg_mae, avg_r2 = estimate_quality(get_regr(), X, y)
+
         print("Avg. MAE: %.2f" % avg_mae)
         print("Avg. R2 score: %.2f" % avg_r2)
 
-    elif os.path.exists(arg):
-
-        # loading saved data
-        basename = arg.split(os.sep)[-1]
-        if basename.startswith('df') and basename[3:4] == '_' and basename[2:3] in human_names:
-            tag = basename[2:3]
-            print("Detected property %s" % human_names[tag]['name'])
-        else:
-            tag = None
-            print("No property name detected")
-
-        df = pd.read_pickle(arg)
-
-        X = np.array(df['Descriptor'].tolist())
-        y = df['Avgvalue'].tolist()
+        tune_model(struct_props.export_file)
 
-        # simple regressor parameter tuning
-        results = []
-        for a in range(60, 501, 20):
-            avg_mae, avg_r2 = estimate_quality(get_regr(a=a), X, y)
-            results.append([a, avg_mae, avg_r2])
-            print("%s\t\t\t%s\t\t\t%s" % (a, avg_mae, avg_r2))
-        results.sort(key=lambda x: (-x[1], x[2]))
-
-        print("Best result:", results[-1])
-        a = results[-1][0]
-
-        results = []
-        for b in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]:
-            avg_mae, avg_r2 = estimate_quality(get_regr(a=a, b=b), X, y)
-            results.append([b, avg_mae, avg_r2])
-            print("%s\t\t\t%s\t\t\t%s" % (b, avg_mae, avg_r2))
-        results.sort(key=lambda x: (-x[1], x[2]))
-
-        print("Best result:", results[-1])
-        b = results[-1][0]
-
-        print("a = %s b = %s" % (a, b))
-
-        regr = get_regr(a=a, b=b)
-        regr.fit(X, y)
-        regr.metadata = {'mae': avg_mae, 'r2': round(avg_r2, 2)}
-
-        if tag:
-            export_file = MPDSExport.save_model(regr, tag)
-            print("Saving %s" % export_file)
+    elif os.path.exists(arg):
+        tune_model(arg)
 
     else: raise RuntimeError("Unrecognized argument: %s" % arg)
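After this refactoring, train_model.py is driven either by a property letter (optionally followed by an integer kappa) to download MPDS data and fit a model in one pass, or by a previously saved dataframe file to rerun only the tuning stage. A rough programmatic equivalent of the first mode is sketched below; the property letter 'z' and kappa 20 are example inputs, and valid MPDS API credentials for MPDSDataRetrieval are assumed.

# Rough equivalent of "python train_model.py z 20"; example inputs only.
import numpy as np

from train_model import mpds_get_data, tune_model, estimate_quality, get_regr

struct_props = mpds_get_data('z', 20)      # fetch, filter, and vectorize MPDS data; saves a dataframe

X = np.array(struct_props['Descriptor'].tolist())
y = struct_props['Avgvalue'].tolist()
avg_mae, avg_r2 = estimate_quality(get_regr(), X, y)        # untuned baseline quality
print("Avg. MAE: %.2f, avg. R2: %.2f" % (avg_mae, avg_r2))

tune_model(struct_props.export_file)       # grid-search parameters a and b, refit, save the model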
File renamed without changes.
File renamed without changes.
File renamed without changes.
