|
| 1 | +#!/usr/bin/env python |
| 2 | +# encoding: utf-8 |
| 3 | +# |
| 4 | +# Copyright © 2023, SAS Institute Inc., Cary, NC, USA. All Rights Reserved. |
| 5 | +# SPDX-License-Identifier: Apache-2.0 |
| 6 | + |
| 7 | +from abc import ABC, abstractmethod |
| 8 | +from typing import Any, Callable, Dict, List, Union |
| 9 | + |
| 10 | +import pandas as pd |
| 11 | + |
| 12 | + |
def get_model_info(model, X, y):
    """Extracts metadata about the model and associated data sets.

    Parameters
    ----------
    model : object
        A trained model
    X : array-like
        Sample of the data used to train the model.
    y : array-like
        Sample of the output produced by the model.

    Returns
    -------
    ModelInfo

    Raises
    ------
    ValueError
        If `model` is not a recognized type.

    """
    # Dispatch on the module that defines the model's class; currently only
    # scikit-learn estimators are recognized.
    module_name = type(model).__module__

    if module_name.startswith("sklearn."):
        return SklearnModelInfo(model, X, y)

    raise ValueError(f"Unrecognized model type {model} received.")
| 39 | + |
| 40 | + |
class ModelInfo(ABC):
    """Base class for storing model metadata.

    Attributes
    ----------
    algorithm : str
        Name of the algorithm used by the model.
    analytic_function : str
        The analytic function performed: "classification" or "prediction".
    description : str
        Human-readable description of the model.
    is_binary_classifier : bool
    is_classifier
    is_regressor
    is_clusterer
    model : object
        The model instance that the information was extracted from.
    model_params : {str: any}
        Dictionary of parameter names and values.
    output_column_names : list of str
        Variable names associated with the outputs of `model`.
    predict_function : callable
        The method on `model` that is called to produce predicted values.
    target_values : list of str
        Class labels returned by a classification model. For binary classification models
        this is just the label of the targeted event level.
    threshold : float or None
        The cutoff value used in a binary classification model to determine which class an
        observation belongs to. Returns None if not a binary classification model.

    """

    @property
    @abstractmethod
    def algorithm(self) -> str:
        """Name of the algorithm used by the model."""
        return

    @property
    def analytic_function(self) -> str:
        """Analytic function performed by the model.

        Returns "classification" for classifiers and "prediction" for
        regressors; returns None for any other model type (e.g. clustering).
        """
        if self.is_classifier:
            return "classification"
        if self.is_regressor:
            return "prediction"

    @property
    def description(self) -> str:
        """Human-readable description of the model."""
        return str(self.model)

    @property
    @abstractmethod
    def is_binary_classifier(self) -> bool:
        """Whether the model is a classifier over exactly two classes."""
        return

    @property
    @abstractmethod
    def is_classifier(self) -> bool:
        """Whether the model performs classification."""
        return

    @property
    @abstractmethod
    def is_clusterer(self) -> bool:
        """Whether the model performs clustering."""
        return

    @property
    @abstractmethod
    def is_regressor(self) -> bool:
        """Whether the model performs regression."""
        return

    @property
    @abstractmethod
    def model(self) -> object:
        """The model instance that the information was extracted from."""
        return

    @property
    @abstractmethod
    def model_params(self) -> Dict[str, Any]:
        """Dictionary of parameter names and values."""
        return

    @property
    @abstractmethod
    def output_column_names(self) -> List[str]:
        """Variable names associated with the outputs of `model`."""
        return

    @property
    @abstractmethod
    def target_values(self):
        """Class labels returned by a classification model.

        For binary classification this is the "target event" — the value
        that indicates the target event has occurred.
        """
        return

    @property
    @abstractmethod
    def predict_function(self) -> Callable:
        """The method on `model` that is called to produce predicted values."""
        return

    @property
    @abstractmethod
    def threshold(self) -> Union[float, None]:
        """Cutoff for assigning the event class in a binary classifier.

        None if the model is not a binary classifier.  NOTE: annotation
        corrected from ``Union[str, None]`` — the documented type is float
        and implementations return a numeric cutoff (e.g. 0.5).
        """
        return
| 136 | + |
| 137 | + |
class SklearnModelInfo(ModelInfo):
    """Stores model information for a scikit-learn model instance.

    Parameters
    ----------
    model : object
        A trained scikit-learn estimator or Pipeline.
    X : array-like
        Sample of the data used to train the model.
    y : array-like
        Sample of the output produced by the model.

    Raises
    ------
    ValueError
        If `model` is not recognizable as a classifier, regressor, or
        clustering algorithm, or if the sample output format is unrecognized.

    """

    # Map class names from sklearn to algorithm names used by SAS
    _algorithm_mappings = {
        "LogisticRegression": "Logistic regression",
        "LinearRegression": "Linear regression",
        "SVC": "Support vector machine",
        "SVR": "Support vector machine",
        "GradientBoostingClassifier": "Gradient boosting",
        "GradientBoostingRegressor": "Gradient boosting",
        "RandomForestClassifier": "Forest",
        "RandomForestRegressor": "Forest",
        "DecisionTreeClassifier": "Decision tree",
        "DecisionTreeRegressor": "Decision tree",
    }

    def __init__(self, model, X, y):
        # Ensure input/output is a DataFrame for consistency
        X_df = pd.DataFrame(X)
        y_df = pd.DataFrame(y)

        # Fitted sklearn classifiers expose `classes_`; clustering
        # algorithms expose `cluster_centers_`.
        is_classifier = hasattr(model, "classes_")
        is_binary_classifier = is_classifier and len(model.classes_) == 2
        is_clusterer = hasattr(model, "cluster_centers_")

        # If not a classifier or a clustering algorithm and output is a single
        # column, then assume it's a regression algorithm.
        is_regressor = not is_classifier and not is_clusterer and y_df.shape[1] == 1

        if not is_classifier and not is_regressor and not is_clusterer:
            raise ValueError(f"Unexpected model type {model} received.")

        self._is_classifier = is_classifier
        self._is_binary_classifier = is_binary_classifier
        self._is_regressor = is_regressor
        self._is_clusterer = is_clusterer
        self._model = model

        if not hasattr(y, "name") and not hasattr(y, "columns"):
            # If example output doesn't contain column names then our DataFrame
            # equivalent also lacks good column names.  Assign reasonable names
            # for use downstream.
            if y_df.shape[1] == 1:
                y_df.columns = ["I_Target"]
            elif self.is_classifier:
                # Output is probability of each label.  Name columns according
                # to the class they correspond to.
                y_df.columns = [f"P_{class_}" for class_ in model.classes_]
            else:
                # This *shouldn't* happen unless a cluster algorithm somehow
                # produces wide output.  (Fixed: was an f-string with no
                # placeholders.)
                raise ValueError("Unrecognized model output format.")

        # Store the data sets for reference later.
        self._X = X_df
        self._y = y_df

    @property
    def algorithm(self):
        """Algorithm name, mapped to SAS terminology when possible."""
        # Get the model or, for a Pipeline, the last step in the Pipeline.
        estimator = getattr(self.model, "_final_estimator", self.model)
        class_name = type(estimator).__name__

        # Convert the class name to an algorithm, or return the class name if no match.
        return self._algorithm_mappings.get(class_name, class_name)

    @property
    def is_binary_classifier(self):
        """Whether the model is a classifier over exactly two classes."""
        return self._is_binary_classifier

    @property
    def is_classifier(self):
        """Whether the model performs classification."""
        return self._is_classifier

    @property
    def is_clusterer(self):
        """Whether the model performs clustering."""
        return self._is_clusterer

    @property
    def is_regressor(self):
        """Whether the model performs regression."""
        return self._is_regressor

    @property
    def model(self):
        """The model instance that the information was extracted from."""
        return self._model

    @property
    def model_params(self) -> Dict[str, Any]:
        """Hyperparameter names and values as reported by `get_params()`."""
        return self.model.get_params()

    @property
    def output_column_names(self):
        """Column names of the (possibly renamed) sample output."""
        return list(self._y.columns)

    @property
    def predict_function(self):
        """Method on `model` used to produce the sample output.

        If the sample output has multiple columns then we can assume they are
        the per-class probability values from `predict_proba`; otherwise the
        single value comes from `predict`.
        """
        if self._y.shape[1] > 1 and hasattr(self.model, "predict_proba"):
            return self.model.predict_proba

        return self.model.predict

    @property
    def target_values(self):
        """Class labels for classifiers; just the event level when binary."""
        if self.is_binary_classifier:
            # Last class label is treated as the target event level.
            return [self.model.classes_[-1]]
        if self.is_classifier:
            return list(self.model.classes_)

    @property
    def threshold(self):
        """Probability cutoff for binary classification; None otherwise."""
        # sklearn seems to always use 0.5 as a cutoff for .predict()
        if self.is_binary_classifier:
            return 0.5
0 commit comments