More careful truncation of the suspicious training MPDS values

blokhin · blokhin · commit 534e1cf59562 · 2018-03-16T00:20:04.000+01:00
diff --git a/mpds_ml_labs/prediction.py b/mpds_ml_labs/prediction.py
@@ -6,42 +6,62 @@
 import numpy as np
 
 
-human_names = {
+prop_semantics = {
+    #'w': {
+    #    'name': 'band gap for direct transition',
+    #    'units': 'eV',
+    #    'symbol': 'e<sub>dir.</sub>',
+    #    'rounding': 1,
+    #    'interval': [0.01, 20]
+    #},
     'z': {
         'name': 'isothermal bulk modulus',
         'units': 'GPa',
         'symbol': 'B',
-        'rounding': 0
+        'rounding': 0,
+        'interval': [0.5, 2000]
     },
     'y': {
         'name': 'enthalpy of formation',
         'units': 'kJ g-at.-1',
         'symbol': '&Delta;H',
-        'rounding': 0
+        'rounding': 0,
+        'interval': [-900, 200]
     },
     'x': {
         'name': 'heat capacity at constant pressure',
         'units': 'J K-1 g-at.-1',
         'symbol': 'C<sub>p</sub>',
-        'rounding': 0
+        'rounding': 0,
+        'interval': [0, 500]
     },
-    #'w': {
-    #    'name': 'band gap for direct transition',
-    #    'units': 'eV',
-    #    'symbol': 'e<sub>dir.</sub>',
-    #    'rounding': 1
-    #},
     'k': {
         'name': 'Seebeck coefficient',
         'units': 'muV K-1',
         'symbol': 'S',
-        'rounding': 1
+        'rounding': 1,
+        'interval': [-1000, 1000]
     },
     'm': {
         'name': 'temperature for congruent melting',
         'units': 'K',
         'symbol': 'T<sub>melt</sub>',
-        'rounding': 0
+        'rounding': 0,
+        'interval': [10, 5000]
+    },
+    'd': {
+        'name': 'Debye temperature',
+        'units': 'K',
+        'symbol': '&Theta;<sub>D</sub>',
+        'rounding': 0,
+        'interval': [10, 2000]
+    },
+    't': {
+        'name': 'linear thermal expansion coefficient',
+        'units': 'K-1',
+        'symbol': '&alpha;',
+        'rounding': 6,
+        'interval': [-0.001, 0.001]
     }
 }
 
@@ -112,9 +132,9 @@ def load_ml_model(prop_model_files):
             continue
 
         basename = file_name.split(os.sep)[-1]
-        if basename.startswith('ml') and basename[3:4] == '_' and basename[2:3] in human_names:
+        if basename.startswith('ml') and basename[3:4] == '_' and basename[2:3] in prop_semantics:
             prop_id = basename[2:3]
-            print("Detected property %s in file %s" % (human_names[prop_id]['name'], basename))
+            print("Detected property %s in file %s" % (prop_semantics[prop_id]['name'], basename))
         else:
             prop_id = str(n)
             print("No property name detected in file %s" % basename)
@@ -132,7 +152,7 @@ def load_ml_model(prop_model_files):
 def get_legend(pred_dict):
     legend = {}
     for key in pred_dict.keys():
-        legend[key] = human_names.get(key, {
+        legend[key] = prop_semantics.get(key, {
             'name': 'Unspecified property ' + str(key),
             'units': 'arb.u.',
             'symbol': 'P' + str(key),
@@ -147,7 +167,7 @@ def ase_to_ml_model(ase_obj, ml_model):
     d_dim = len(descriptor)
 
     if not ml_model: # testing
-        return {prop_id: {'value': 42, 'mae': 0, 'r2': 0} for prop_id in human_names.keys()}, None
+        return {prop_id: {'value': 42, 'mae': 0, 'r2': 0} for prop_id in prop_semantics.keys()}, None
 
     for prop_id, regr in ml_model.items(): # production
 
@@ -164,8 +184,8 @@ def ase_to_ml_model(ase_obj, ml_model):
             return None, str(e)
 
         result[prop_id] = {
-            'value': round(prediction, human_names[prop_id]['rounding']),
-            'mae': round(regr.metadata['mae'], human_names[prop_id]['rounding']),
+            'value': round(prediction, prop_semantics[prop_id]['rounding']),
+            'mae': round(regr.metadata['mae'], prop_semantics[prop_id]['rounding']),
             'r2': regr.metadata['r2']
         }
 
diff --git a/mpds_ml_labs/test_app.py b/mpds_ml_labs/test_app.py
@@ -8,30 +8,14 @@
 
 from mpds_client import MPDSDataRetrieval, APIError
 
-from prediction import human_names
-from struct_utils import detect_format, poscar_to_ase, symmetrize, get_formula
+from prediction import prop_semantics
+from struct_utils import detect_format, poscar_to_ase, symmetrize, get_formula, sgn_to_crsystem
 from cif_utils import cif_to_ase
 
 
 req = httplib2.Http()
 client = MPDSDataRetrieval()
 
-def sgn_to_crsystem(number):
-    if   195 <= number <= 230:
-        return 'cubic'
-    elif 168 <= number <= 194:
-        return 'hexagonal'
-    elif 143 <= number <= 167:
-        return 'trigonal'
-    elif 75  <= number <= 142:
-        return 'tetragonal'
-    elif 16  <= number <= 74:
-        return 'orthorhombic'
-    elif 3   <= number <= 15:
-        return 'monoclinic'
-    else:
-        return 'triclinic'
-
 def make_request(address, data={}, httpverb='POST', headers={}):
 
     address += '?' + urlencode(data)
@@ -47,6 +31,9 @@ def make_request(address, data={}, httpverb='POST', headers={}):
 
 if __name__ == '__main__':
 
+    try: sys.argv[1]
+    except IndexError: sys.exit("Structure file must be given!")
+
     structure = open(sys.argv[1]).read()
     fmt = detect_format(structure)
 
@@ -72,29 +59,29 @@ def make_request(address, data={}, httpverb='POST', headers={}):
         raise RuntimeError(answer['error'])
 
     formulae_categ, lattices_categ = get_formula(ase_obj), sgn_to_crsystem(ase_obj.info['spacegroup'].no)
-    for prop_id, pdata in human_names.items():
+    for prop_id, pdata in prop_semantics.items():
         try:
             resp = client.get_dataframe({
                 'formulae': formulae_categ,
                 'lattices': lattices_categ,
                 'props': pdata['name']
             })
         except APIError as e:
-            human_names[prop_id]['factual'] = None
+            prop_semantics[prop_id]['factual'] = None
             if e.code == 1:
                 continue
             else:
                 raise
 
         resp['Value'] = resp['Value'].astype('float64') # to treat values out of bounds given as str
         resp = resp[resp['Units'] == pdata['units']]
-        human_names[prop_id]['factual'] = np.median(resp['Value'])
+        prop_semantics[prop_id]['factual'] = np.median(resp['Value'])
 
     for prop_id, pdata in answer['prediction'].items():
-        print("{0:40} = {1:6}, factual {2:6} (MAE = {3:4}), {4}".format(
-            human_names[prop_id]['name'],
+        print("{0:40} = {1:6}, factual {2:8} (MAE = {3:4}), {4}".format(
+            prop_semantics[prop_id]['name'],
             pdata['value'],
-            human_names[prop_id]['factual'] or 'absent',
-            abs(pdata['value'] - human_names[prop_id]['factual']) if human_names[prop_id]['factual'] else 'unknown',
-            human_names[prop_id]['units']
+            prop_semantics[prop_id]['factual'] or 'absent',
+            pdata['mae'],
+            prop_semantics[prop_id]['units']
         ))
diff --git a/mpds_ml_labs/test_ml.py b/mpds_ml_labs/test_ml.py
@@ -3,7 +3,7 @@
 
 from struct_utils import detect_format, poscar_to_ase, symmetrize
 from cif_utils import cif_to_ase
-from prediction import ase_to_ml_model, load_ml_model, human_names
+from prediction import ase_to_ml_model, load_ml_model, prop_semantics
 from common import ML_MODELS, DATA_PATH
 
 
@@ -57,8 +57,8 @@
 
     for prop_id, pdata in prediction.items():
         print("{0:40} = {1:6} (MAE = {2:4}), {3}".format(
-            human_names[prop_id]['name'],
+            prop_semantics[prop_id]['name'],
             pdata['value'],
             pdata['mae'],
-            human_names[prop_id]['units']
+            prop_semantics[prop_id]['units']
         ))
diff --git a/train_model.py b/train_model.py
@@ -14,7 +14,7 @@
 
 from mpds_client import MPDSDataRetrieval, MPDSExport
 
-from prediction import get_descriptor, human_names
+from mpds_ml_labs.prediction import get_descriptor, prop_semantics
 
 
 def get_regr(a=None, b=None):
@@ -57,13 +57,13 @@ def mpds_get_data(prop_id, descriptor_kappa):
     Fetch, massage, and save dataframe from the MPDS
     NB currently pressure is not taken into account!
     """
-    print("Getting %s with descriptor kappa = %s" % (human_names[prop_id]['name'], descriptor_kappa))
+    print("Getting %s with descriptor kappa = %s" % (prop_semantics[prop_id]['name'], descriptor_kappa))
     starttime = time.time()
 
     client = MPDSDataRetrieval()
 
     props = client.get_dataframe(
-        {"props": human_names[prop_id]['name']},
+        {"props": prop_semantics[prop_id]['name']},
         fields={'P': [
             'sample.material.chemical_formula',
             'sample.material.phase_id',
@@ -77,23 +77,17 @@ def mpds_get_data(prop_id, descriptor_kappa):
     )
     props['Value'] = props['Value'].astype('float64') # to treat values out of bounds given as str
     props = props[np.isfinite(props['Phase'])]
-    props = props[props['Units'] == human_names[prop_id]['units']]
-
-    # filtering some abnormal values
-    # these should be corrected by LPF editors soon
-    if prop_id == 'z':
-        props = props[props['Value'] < 2000]
-    #elif prop_id == 'w': # NB this requires additional treatment for zero band gaps
-    #    props = props[(props['Value'] > 0) & (props['Value'] < 20)]
-    elif prop_id == 'u':
-        props = props[props['Value'] > 0]
-
-    to_drop = props[
-        (props['Cname'] == 'Temperature') & (props['Cunits'] == 'K') & ((props['Cvalue'] < 200) | (props['Cvalue'] > 400))
+    props = props[props['Units'] == prop_semantics[prop_id]['units']]
+    props = props[
+        (props['Value'] > prop_semantics[prop_id]['interval'][0]) & \
+        (props['Value'] < prop_semantics[prop_id]['interval'][1])
     ]
-
-    print("Rows to drop by criteria: %s" % len(to_drop))
-    props.drop(to_drop.index, inplace=True)
+    if prop_id not in ['m', 'd']:
+        to_drop = props[
+            (props['Cname'] == 'Temperature') & (props['Cunits'] == 'K') & ((props['Cvalue'] < 200) | (props['Cvalue'] > 400))
+        ]
+        print("Rows to neglect by temperature: %s" % len(to_drop))
+        props.drop(to_drop.index, inplace=True)
 
     phases_compounds = dict(zip(props['Phase'], props['Compound'])) # keep the mapping for future
     avgprops = props.groupby('Phase')['Value'].mean().to_frame().reset_index().rename(columns={'Value': 'Avgvalue'})
@@ -163,9 +157,9 @@ def tune_model(data_file):
     Load saved data and perform simple regressor parameter tuning
     """
     basename = data_file.split(os.sep)[-1]
-    if basename.startswith('df') and basename[3:4] == '_' and basename[2:3] in human_names:
+    if basename.startswith('df') and basename[3:4] == '_' and basename[2:3] in prop_semantics:
         tag = basename[2:3]
-        print("Detected property %s" % human_names[tag]['name'])
+        print("Detected property %s" % prop_semantics[tag]['name'])
     else:
         tag = None
         print("No property name detected")
@@ -213,14 +207,14 @@ def tune_model(data_file):
         sys.exit(
     "What to do?\n"
     "Please, provide either a *prop_id* letter (%s) for a property data to be downloaded and fitted,\n"
-    "or a data *filename* for tuning the model." % ", ".join(human_names.keys())
+    "or a data *filename* for tuning the model." % ", ".join(prop_semantics.keys())
         )
     try:
         descriptor_kappa = int(sys.argv[2])
     except:
         descriptor_kappa = None
 
-    if arg in human_names.keys():
+    if arg in prop_semantics.keys():
 
         struct_props = mpds_get_data(arg, descriptor_kappa)