Merge pull request #42 from jameskochubasas/master

jlwalke2 · web-flow · commit b3c34e67f1fc · 2019-10-23T12:38:05.000-04:00
Fixing the MM performance capabilities for all models
diff --git a/src/sasctl/_services/model_management.py b/src/sasctl/_services/model_management.py
@@ -174,11 +174,16 @@ def create_performance_definition(cls,
 
         # Performance data cannot be captured unless certain project properties
         # have been configured.
-        for required in ['targetVariable', 'targetLevel',
-                         'predictionVariable']:
+        for required in ['targetVariable', 'targetLevel']:
             if getattr(project, required, None) is None:
                 raise ValueError("Project %s must have the '%s' property set."
                                  % (project.name, required))
+        if project['function'] == 'classification' and project['eventProbabilityVariable'] == None:
+            raise ValueError("Project %s must have the 'eventProbabilityVariable' property set."
+                                 % (project.name))
+        if project['function'] == 'prediction' and project['predictionVariable'] == None:
+            raise ValueError("Project %s must have the 'predictionVariable' property set."
+                                 % (project.name))
 
         request = {'projectId': project.id,
                    'name': name or model.name + ' Performance',
diff --git a/src/sasctl/tasks.py b/src/sasctl/tasks.py
@@ -49,8 +49,8 @@ def _sklearn_to_dict(model):
                 'RandomForestClassifier': 'Forest',
                 'DecisionTreeClassifier': 'Decision tree',
                 'DecisionTreeRegressor': 'Decision tree',
-                'classifier': 'Classification',
-                'regressor': 'Prediction'}
+                'classifier': 'classification',
+                'regressor': 'prediction'}
 
     if hasattr(model, '_final_estimator'):
         estimator = type(model._final_estimator)
@@ -207,10 +207,26 @@ def get_version(x):
     # If model is a CASTable then assume it holds an ASTORE model.
     # Import these via a ZIP file.
     if 'swat.cas.table.CASTable' in str(type(model)):
-        zipfile = utils.create_package(model)
+        zipfile = utils.create_package(model, input=input)
 
         if create_project:
-            project = mr.create_project(project, repo_obj)
+            outvar=[]
+            invar=[]
+            import zipfile as zp
+            import copy
+            zipfilecopy = copy.deepcopy(zipfile)
+            tmpzip=zp.ZipFile(zipfilecopy)
+            if "outputVar.json" in tmpzip.namelist():
+                outvar=json.loads(tmpzip.read("outputVar.json").decode('utf=8')) #added decode for 3.5 and older
+                for tmp in outvar:
+                    tmp.update({'role':'output'})
+            if "inputVar.json" in tmpzip.namelist():
+                invar=json.loads(tmpzip.read("inputVar.json").decode('utf-8')) #added decode for 3.5 and older
+                for tmp in invar:
+                    if tmp['role'] != 'input':
+                       tmp['role']='input'
+            vars=invar + outvar
+            project = mr.create_project(project, repo_obj, variables=vars)
 
         model = mr.import_model_from_zip(name, project, zipfile,
                                          version=version)
@@ -302,17 +318,27 @@ def get_version(x):
         else:
             prediction_variable = None
 
-        project = mr.create_project(project, repo_obj,
+        # As of Viya 3.4 the 'predictionVariable' parameter is not set during
+        # project creation.  Update the project if necessary.
+        if function == 'prediction':   #Predictions require predictionVariable
+            project = mr.create_project(project, repo_obj,
                                     variables=vars,
                                     function=model.get('function'),
                                     targetLevel=target_level,
                                     predictionVariable=prediction_variable)
 
-        # As of Viya 3.4 the 'predictionVariable' parameter is not set during
-        # project creation.  Update the project if necessary.
-        if project.get('predictionVariable') != prediction_variable:
-            project['predictionVariable'] = prediction_variable
-            mr.update_project(project)
+            if project.get('predictionVariable') != prediction_variable:
+                project['predictionVariable'] = prediction_variable
+                mr.update_project(project)
+        else:  #Classifications require eventProbabilityVariable 
+            project = mr.create_project(project, repo_obj,
+                                    variables=vars,
+                                    function=model.get('function'),
+                                    targetLevel=target_level,
+                                    eventProbabilityVariable=prediction_variable)
+            if project.get('eventProbabilityVariable') != prediction_variable:
+                project['eventProbabilityVariable'] = prediction_variable
+                mr.update_project(project)
 
     model = mr.create_model(model, project)
 
@@ -506,9 +532,12 @@ def update_model_performance(data, model, label, refresh=True):
                          "regression and binary classification projects.  "
                          "Received project with '%s' target level.  Should be "
                          "'Interval' or 'Binary'.", project.get('targetLevel'))
-    elif project.get('predictionVariable', '') == '':
+    elif project.get('predictionVariable', '') == '' and project.get('function', '').lower() == 'prediction':
         raise ValueError("Project '%s' does not have a prediction variable "
                          "specified." % project)
+    elif project.get('eventProbabilityVariable', '') == '' and project.get('function', '').lower() == 'classification':
+        raise ValueError("Project '%s' does not have an Event Probability variable "
+                         "specified." % project)
 
     # Find the performance definition for the model
     # As of Viya 3.4, no way to search by model or project
diff --git a/src/sasctl/utils/astore.py b/src/sasctl/utils/astore.py
@@ -20,13 +20,21 @@
     swat = None
 
 
-def create_package(table):
+def create_package(table, input=None):
     """Create an importable model package from a CAS table.
 
     Parameters
     ----------
     table : swat.CASTable
         The CAS table containing an ASTORE or score code.
+    input : DataFrame, type, list of type, or dict of str: type, optional
+        The expected type for each input value of the target function.
+        Can be omitted if target function includes type hints.  If a DataFrame
+        is provided, the columns will be inspected to determine type information.
+        If a single type is provided, all columns will be assumed to be that type,
+        otherwise a list of column types or a dictionary of column_name: type
+        may be provided.
+
 
     Returns
     -------
@@ -45,18 +53,26 @@ def create_package(table):
     assert isinstance(table, swat.CASTable)
 
     if 'DataStepSrc' in table.columns:
-        return create_package_from_datastep(table)
+        #Input only passed to datastep
+        return create_package_from_datastep(table, input=input)
     else:
         return create_package_from_astore(table)
 
 
-def create_package_from_datastep(table):
+def create_package_from_datastep(table, input=None):
     """Create an importable model package from a score code table.
 
     Parameters
     ----------
     table : swat.CASTable
         The CAS table containing the score code.
+    input : DataFrame, type, list of type, or dict of str: type, optional
+        The expected type for each input value of the target function.
+        Can be omitted if target function includes type hints.  If a DataFrame
+        is provided, the columns will be inspected to determine type information.
+        If a single type is provided, all columns will be assumed to be that type,
+        otherwise a list of column types or a dictionary of column_name: type
+        may be provided.
 
     Returns
     -------
@@ -73,11 +89,59 @@ def create_package_from_datastep(table):
 
     dscode = table.to_frame().loc[0, 'DataStepSrc']
 
+    # Extract inputs if provided
+    input_vars = []
+    # Workaround because a SASDataFrame does not like being checked for existence
+    if str(input) != "None":
+        from .pymas.python import ds2_variables
+        vars=None
+        if hasattr(input, 'columns'):
+            # Assuming input is a DataFrame representing model inputs.  Use to
+            # get input variables
+            vars = ds2_variables(input)
+        elif isinstance(input, type):
+            params = OrderedDict([(k, input)
+                              for k in target_func.__code__.co_varnames])
+            vars = ds2_variables(params)
+        elif isinstance(input, dict):
+            vars = ds2_variables(input)
+        if vars:
+            input_vars = [var.as_model_metadata() for var in vars if not var.out]
+
+    #Find outputs from ds code
+    output_vars=[]
+    for sasline in dscode.split('\n'):
+        if sasline.strip().startswith('label'):
+            output_var=dict()
+            for tmp in sasline.split('='):
+                if 'label' in tmp:
+                    ovarname=tmp.split('label')[1].strip()
+                    output_var.update({"name":ovarname})
+                    #Determine whether the variable type is decimal or string
+                    if "length " + ovarname in dscode:
+                        sastype=dscode.split("length " + ovarname)[1].split(';')[0].strip()
+                        if "$" in sastype:
+                            output_var.update({"type":"string"})
+                            output_var.update({"length":sastype.split("$")[1]})
+                        else:
+                            output_var.update({"type":"decimal"})
+                            output_var.update({"length":sastype})
+                    else:
+                        #If no length is declared for the variable, default is decimal, 8
+                        output_var.update({"type":"decimal"})
+                        output_var.update({"length":8})
+                else:
+                    output_var.update({"description":tmp.split(';')[0].strip().strip("'")})
+            output_vars.append(output_var) 
+
     file_metadata = [{'role': 'score', 'name': 'dmcas_scorecode.sas'}]
 
     zip_file = _build_zip_from_files({
         'fileMetadata.json': file_metadata,
-        'dmcas_scorecode.sas': dscode
+        'dmcas_scorecode.sas': dscode,
+        'ModelProperties.json': {"scoreCodeType":"dataStep"},
+        'outputVar.json': output_vars,
+        'inputVar.json': input_vars
     })
 
     return zip_file
diff --git a/src/sasctl/utils/pymas/ds2.py b/src/sasctl/utils/pymas/ds2.py
@@ -339,7 +339,7 @@ def _map_type(cls, mapping, t):
 
     def as_model_metadata(self):
         viya_type = self._map_type(self.DS2_TYPE_TO_VIYA, self.type)
-        role = 'Output' if self.out else 'Input'
+        role = 'Output' if self.out else 'input'
 
         return OrderedDict(
             [('name', self.name), ('role', role), ('type', viya_type)])
diff --git a/tests/unit/test_model_management.py b/tests/unit/test_model_management.py
@@ -50,13 +50,20 @@ def test_create_performance_definition():
                 with pytest.raises(ValueError):
                     # Project missing some required properties
                     get_project.return_value = copy.deepcopy(PROJECT)
-                    get_project.return_value['predictionVariable'] = 'predicted'
+                    get_project.return_value['function'] = 'classification'
+                    _ = mm.create_performance_definition('model', 'TestLibrary', 'TestData')
+
+                with pytest.raises(ValueError):
+                    # Project missing some required properties
+                    get_project.return_value = copy.deepcopy(PROJECT)
+                    get_project.return_value['function'] = 'prediction'
                     _ = mm.create_performance_definition('model', 'TestLibrary', 'TestData')
 
                 get_project.return_value = copy.deepcopy(PROJECT)
                 get_project.return_value['targetVariable'] = 'target'
                 get_project.return_value['targetLevel'] = 'interval'
                 get_project.return_value['predictionVariable'] = 'predicted'
+                get_project.return_value['function'] = 'prediction'
                 _ = mm.create_performance_definition('model', 'TestLibrary',
                                                      'TestData',
                                                      max_bins=3,
diff --git a/tests/unit/test_tasks.py b/tests/unit/test_tasks.py
@@ -22,27 +22,27 @@ def test_sklearn_metadata():
 
     info = _sklearn_to_dict(LinearRegression())
     assert info['algorithm'] == 'Linear regression'
-    assert info['function'] == 'Prediction'
+    assert info['function'] == 'prediction'
 
     info = _sklearn_to_dict(LogisticRegression())
     assert info['algorithm'] == 'Logistic regression'
-    assert info['function'] == 'Classification'
+    assert info['function'] == 'classification'
 
     info = _sklearn_to_dict(SVC())
     assert info['algorithm'] == 'Support vector machine'
-    assert info['function'] == 'Classification'
+    assert info['function'] == 'classification'
 
     info = _sklearn_to_dict(GradientBoostingClassifier())
     assert info['algorithm'] == 'Gradient boosting'
-    assert info['function'] == 'Classification'
+    assert info['function'] == 'classification'
 
     info = _sklearn_to_dict(DecisionTreeClassifier())
     assert info['algorithm'] == 'Decision tree'
-    assert info['function'] == 'Classification'
+    assert info['function'] == 'classification'
 
     info = _sklearn_to_dict(RandomForestClassifier())
     assert info['algorithm'] == 'Forest'
-    assert info['function'] == 'Classification'
+    assert info['function'] == 'classification'
 
 
 def test_parse_module_url():
@@ -96,6 +96,13 @@ def test_save_performance_project_types():
                 project.return_value = {'function': 'Prediction',
                                         'targetLevel': 'Binary'}
                 update_model_performance(None, None, None)
+            
+            # Classification variable required
+            with pytest.raises(ValueError):
+                project.return_value = {'function': 'classification',
+                                        'targetLevel': 'Binary'}
+                update_model_performance(None, None, None)
+
 
     # Check projects w/ invalid properties