@@ -33,7 +33,7 @@ def get_regr(a=None, b=None):
3333 )
3434
3535
36- def estimate_quality (algo , args , values , attempts = 30 , nsamples = 0.4 ):
36+ def estimate_quality (algo , args , values , attempts = 30 , nsamples = 0.33 ):
3737 results = []
3838 for _ in range (attempts ):
3939 X_train , X_test , y_train , y_test = train_test_split (args , values , test_size = nsamples )
@@ -52,13 +52,12 @@ def estimate_quality(algo, args, values, attempts=30, nsamples=0.4):
5252 return avg_mae , avg_r2
5353
5454
55- def mpds_get_data (prop_id ):
55+ def mpds_get_data (prop_id , descriptor_kappa ):
5656 """
57- NB
58- currently pressure is not taken into account,
59- however must be
57+ Fetch, massage, and save dataframe from the MPDS
58+ NB currently pressure is not taken into account!
6059 """
61- print ("Getting %s" % human_names [prop_id ]['name' ])
60+ print ("Getting %s with descriptor kappa = %s " % ( human_names [prop_id ]['name' ], descriptor_kappa ) )
6261 starttime = time .time ()
6362
6463 client = MPDSDataRetrieval ()
@@ -84,12 +83,14 @@ def mpds_get_data(prop_id):
8483 # these should be corrected by LPF editors soon
8584 if prop_id == 'z' :
8685 props = props [props ['Value' ] < 2000 ]
87- elif prop_id == 'w' :
88- props = props [(props ['Value' ] > 0 ) & (props ['Value' ] < 20 )]
86+ # elif prop_id == 'w': # NB this requires additional treatment for zero band gaps
87+ # props = props[(props['Value'] > 0) & (props['Value'] < 20)]
8988 elif prop_id == 'u' :
9089 props = props [props ['Value' ] > 0 ]
9190
92- to_drop = props [(props ['Cname' ] == 'Temperature' ) & (props ['Cunits' ] == 'K' ) & ((props ['Cvalue' ] < 200 ) | (props ['Cvalue' ] > 400 ))]
91+ to_drop = props [
92+ (props ['Cname' ] == 'Temperature' ) & (props ['Cunits' ] == 'K' ) & ((props ['Cvalue' ] < 200 ) | (props ['Cvalue' ] > 400 ))
93+ ]
9394
9495 print ("Rows to drop by criteria: %s" % len (to_drop ))
9596 props .drop (to_drop .index , inplace = True )
@@ -100,22 +101,26 @@ def mpds_get_data(prop_id):
100101
101102 print ("Got %s distinct crystalline phases" % len (phases ))
102103
103- min_descriptor_len = 220
104+ min_descriptor_len = 200
104105 max_descriptor_len = min_descriptor_len * 10
105106 data_by_phases = {}
106107
108+ print ("Computing descriptors..." )
107109 pbar = ProgressBar ()
108110 for item in pbar (client .get_data (
109- {"props" : "atomic structure" },
111+ {
112+ "props" : "atomic structure" ,
113+ "classes" : "non-disordered"
114+ },
110115 fields = {'S' :['phase_id' , 'entry' , 'chemical_formula' , 'cell_abc' , 'sg_n' , 'setting' , 'basis_noneq' , 'els_noneq' ]},
111116 phases = phases
112117 )):
113118 crystal = MPDSDataRetrieval .compile_crystal (item , 'ase' )
114119 if not crystal : continue
115- descriptor = get_descriptor (crystal )
120+ descriptor = get_descriptor (crystal , kappa = descriptor_kappa )
116121
117122 if len (descriptor ) < min_descriptor_len :
118- descriptor = get_descriptor (crystal , overreach = True )
123+ descriptor = get_descriptor (crystal , kappa = descriptor_kappa , overreach = True )
119124 if len (descriptor ) < min_descriptor_len :
120125 continue
121126
@@ -147,79 +152,89 @@ def mpds_get_data(prop_id):
147152
148153 print ("Done %s rows in %1.2f sc" % (len (struct_props ), time .time () - starttime ))
149154
150- export_file = MPDSExport .save_df (struct_props , prop_id )
151- print ("Saving %s" % export_file )
155+ struct_props . export_file = MPDSExport .save_df (struct_props , prop_id )
156+ print ("Saving %s" % struct_props . export_file )
152157
153158 return struct_props
154159
155160
161+ def tune_model (data_file ):
162+ """
163+ Load saved data and perform simple regressor parameter tuning
164+ """
165+ basename = data_file .split (os .sep )[- 1 ]
166+ if basename .startswith ('df' ) and basename [3 :4 ] == '_' and basename [2 :3 ] in human_names :
167+ tag = basename [2 :3 ]
168+ print ("Detected property %s" % human_names [tag ]['name' ])
169+ else :
170+ tag = None
171+ print ("No property name detected" )
172+
173+ df = pd .read_pickle (data_file )
174+
175+ X = np .array (df ['Descriptor' ].tolist ())
176+ y = df ['Avgvalue' ].tolist ()
177+
178+ results = []
179+ for parameter_a in range (20 , 501 , 20 ):
180+ avg_mae , avg_r2 = estimate_quality (get_regr (a = parameter_a ), X , y )
181+ results .append ([parameter_a , avg_mae , avg_r2 ])
182+ print ("%s\t \t \t %s\t \t \t %s" % (parameter_a , avg_mae , avg_r2 ))
183+ results .sort (key = lambda x : (- x [1 ], x [2 ]))
184+
185+ print ("Best result:" , results [- 1 ])
186+ parameter_a = results [- 1 ][0 ]
187+
188+ results = []
189+ for parameter_b in range (1 , 13 ):
190+ avg_mae , avg_r2 = estimate_quality (get_regr (a = parameter_a , b = parameter_b ), X , y )
191+ results .append ([parameter_b , avg_mae , avg_r2 ])
192+ print ("%s\t \t \t %s\t \t \t %s" % (parameter_b , avg_mae , avg_r2 ))
193+ results .sort (key = lambda x : (- x [1 ], x [2 ]))
194+
195+ print ("Best result:" , results [- 1 ])
196+ parameter_b = results [- 1 ][0 ]
197+
198+ print ("a = %s b = %s" % (parameter_a , parameter_b ))
199+
200+ regr = get_regr (a = parameter_a , b = parameter_b )
201+ regr .fit (X , y )
202+ regr .metadata = {'mae' : avg_mae , 'r2' : round (avg_r2 , 2 )}
203+
204+ if tag :
205+ export_file = MPDSExport .save_model (regr , tag )
206+ print ("Saving %s" % export_file )
207+
208+
156209if __name__ == "__main__" :
157210 try :
158211 arg = sys .argv [1 ]
159212 except IndexError :
160213 sys .exit (
161- "What to do?\n "
162- "Please, provide either a *prop_id* letter (%s) for a property data to be downloaded,\n "
163- "or a data *filename* generated after a property data download ." % ", " .join (human_names .keys ())
214+ "What to do?\n "
215+ "Please, provide either a *prop_id* letter (%s) for a property data to be downloaded and fitted ,\n "
216+ "or a data *filename* for tuning the model ." % ", " .join (human_names .keys ())
164217 )
218+ try :
219+ descriptor_kappa = int (sys .argv [2 ])
220+ except :
221+ descriptor_kappa = None
165222
166223 if arg in human_names .keys ():
167224
168- # getting the data from scratch by prop_id
169- struct_props = mpds_get_data (arg )
225+ struct_props = mpds_get_data (arg , descriptor_kappa )
170226
171227 X = np .array (struct_props ['Descriptor' ].tolist ())
172228 y = struct_props ['Avgvalue' ].tolist ()
173229
174230 avg_mae , avg_r2 = estimate_quality (get_regr (), X , y )
231+
175232 print ("Avg. MAE: %.2f" % avg_mae )
176233 print ("Avg. R2 score: %.2f" % avg_r2 )
177234
178- elif os .path .exists (arg ):
179-
180- # loading saved data
181- basename = arg .split (os .sep )[- 1 ]
182- if basename .startswith ('df' ) and basename [3 :4 ] == '_' and basename [2 :3 ] in human_names :
183- tag = basename [2 :3 ]
184- print ("Detected property %s" % human_names [tag ]['name' ])
185- else :
186- tag = None
187- print ("No property name detected" )
188-
189- df = pd .read_pickle (arg )
190-
191- X = np .array (df ['Descriptor' ].tolist ())
192- y = df ['Avgvalue' ].tolist ()
235+ tune_model (struct_props .export_file )
193236
194- # simple regressor parameter tuning
195- results = []
196- for a in range (60 , 501 , 20 ):
197- avg_mae , avg_r2 = estimate_quality (get_regr (a = a ), X , y )
198- results .append ([a , avg_mae , avg_r2 ])
199- print ("%s\t \t \t %s\t \t \t %s" % (a , avg_mae , avg_r2 ))
200- results .sort (key = lambda x : (- x [1 ], x [2 ]))
201-
202- print ("Best result:" , results [- 1 ])
203- a = results [- 1 ][0 ]
204-
205- results = []
206- for b in [1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11 , 12 ]:
207- avg_mae , avg_r2 = estimate_quality (get_regr (a = a , b = b ), X , y )
208- results .append ([b , avg_mae , avg_r2 ])
209- print ("%s\t \t \t %s\t \t \t %s" % (b , avg_mae , avg_r2 ))
210- results .sort (key = lambda x : (- x [1 ], x [2 ]))
211-
212- print ("Best result:" , results [- 1 ])
213- b = results [- 1 ][0 ]
214-
215- print ("a = %s b = %s" % (a , b ))
216-
217- regr = get_regr (a = a , b = b )
218- regr .fit (X , y )
219- regr .metadata = {'mae' : avg_mae , 'r2' : round (avg_r2 , 2 )}
220-
221- if tag :
222- export_file = MPDSExport .save_model (regr , tag )
223- print ("Saving %s" % export_file )
237+ elif os .path .exists (arg ):
238+ tune_model (arg )
224239
225240 else : raise RuntimeError ("Unrecognized argument: %s" % arg )
0 commit comments