rebasing BZ changes

henrydingliu · henrydingliu · commit 4da4ad396fc3 · 2025-12-30T22:41:51.000Z
diff --git a/chainladder/development/barnzehn.py b/chainladder/development/barnzehn.py
@@ -24,16 +24,35 @@ class BarnettZehnwirth(TweedieGLM):
     ----------
     formula: formula-like
         A patsy formula describing the independent variables, X of the GLM
+    feat_eng: dict
+        A dictionary with feature names as keys and a dictionary of function (with a key of 'func') and keyword arguments (with a key of 'kwargs') 
+        (e.g. {
+            'feature_1':{
+                'func': function_name for feature 1,
+                'kwargs': keyword arguments for the function
+                },
+            'feature_2':{
+                'func': function_name for feature 2,
+                'kwargs': keyword arguments for the function
+                }
+            }
+        );  
+        functions should be written with an input DataFrame named df; this is the DataFrame containing origin, development, and valuation that will be passed into the function at run time
+        (e.g. this function adds 1 to every origin 
+        def test_func(df):
+            return df['origin'] + 1
+        )
     response:  str
         Column name for the reponse variable of the GLM.  If ommitted, then the
         first column of the Triangle will be used.
 
 
     """
 
-    def __init__(self, formula='C(origin) + development', response=None):
+    def __init__(self, formula='C(origin) + development', feat_eng=None, response=None):
         self.formula = formula
         self.response = response
+        self.feat_eng = feat_eng
 
     def fit(self, X, y=None, sample_weight=None):
         if max(X.shape[:2]) > 1:
@@ -50,7 +69,7 @@ def fit(self, X, y=None, sample_weight=None):
         self.model_ = DevelopmentML(Pipeline(steps=[
             ('design_matrix', PatsyFormula(self.formula)),
             ('model', LinearRegression(fit_intercept=False))]),
-                    y_ml=response, fit_incrementals=False).fit(tri)
+                    y_ml=response, fit_incrementals=False, feat_eng = self.feat_eng).fit(tri)
         resid = tri - self.model_.triangle_ml_[
             self.model_.triangle_ml_.valuation <= tri.valuation_date]
         self.mse_resid_ = (resid**2).sum(0).sum(1).sum(2).sum() / (
diff --git a/chainladder/development/learning.py b/chainladder/development/learning.py
@@ -33,6 +33,24 @@ class DevelopmentML(DevelopmentBase):
         Time Series aspects of the model. Predictions from one development period
         get used as featues in the next development period. Lags should be negative
         integers.
+    feat_eng: dict
+        A dictionary with feature names as keys and a dictionary of function (with a key of 'func') and keyword arguments (with a key of 'kwargs') 
+        (e.g. {
+            'feature_1':{
+                'func': function_name for feature 1,
+                'kwargs': keyword arguments for the function
+                },
+            'feature_2':{
+                'func': function_name for feature 2,
+                'kwargs': keyword arguments for the function
+                }
+            }
+        );  
+        functions should be written with an input DataFrame named df; this is the DataFrame containing origin, development, and valuation that will be passed into the function at run time
+        (e.g. this function adds 1 to every origin 
+        def test_func(df):
+            return df['origin'] + 1
+        )
     fit_incrementals:
         Whether the response variable should be converted to an incremental basis
         for fitting.
@@ -48,12 +66,13 @@ class DevelopmentML(DevelopmentBase):
     """
 
     def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False,
-                 weight_ml=None, fit_incrementals=True):
+                 weight_ml=None, fit_incrementals=True, feat_eng=None):
         self.estimator_ml=estimator_ml
         self.y_ml=y_ml
         self.weight_ml = weight_ml
         self.autoregressive=autoregressive
         self.fit_incrementals = fit_incrementals
+        self.feat_eng = feat_eng
 
     def _get_y_names(self):
         """ private function to get the response column name"""
@@ -112,6 +131,9 @@ def _get_triangle_ml(self, df, preds=None):
             if len(out) == 0:
                 continue
             X_r.append(out.copy())
+            if self.feat_eng is not None:            
+                for key, item in self.feat_eng.items():
+                    out[key] = item['func'](df=out,**item['kwargs'])
             preds = self.estimator_ml.predict(out)
             y_r.append(preds.copy())
         X_r = pd.concat(X_r, axis=0).reset_index(drop=True)
@@ -145,6 +167,9 @@ def _prep_X_ml(self, X):
             on=list(df_base.columns)).fillna(0)
         df['origin'] = df['origin'].map(self.origin_encoder_)
         df['valuation'] = df['valuation'].map(self.valuation_encoder_)
+        if self.feat_eng is not None:            
+            for key, item in self.feat_eng.items():
+                df[key] = item['func'](df=df,**item['kwargs'])
         return df
 
     def fit(self, X, y=None, sample_weight=None):
diff --git a/chainladder/development/tests/test_barnzehn.py b/chainladder/development/tests/test_barnzehn.py
@@ -1,15 +1,239 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
 import numpy as np
-import chainladder as cl
-import pytest
-
-def test_basic_bz():
-    abc = cl.load_sample('abc')
-    assert np.all(
-        np.around(cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(abc).coef_.T.values,3).flatten()
-        == np.array([11.837,0.179,0.345,0.378,0.405,0.427,0.431,0.66,0.963,1.157,1.278,0.251,-0.056,-0.449,-0.829,-1.169,-1.508,-1.798,-2.023,-2.238,-2.428])
-    )
-
-def test_multiple_triangle_exception():
-    d = cl.load_sample("usauto")
-    with pytest.raises(ValueError):
-        cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(d)
+import pandas as pd
+
+from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
+from sklearn.compose import ColumnTransformer
+from chainladder.development.base import DevelopmentBase
+from chainladder import options
+
+
+class DevelopmentML(DevelopmentBase):
+    """ An Estimator that interfaces with machine learning (ML) tools that implement
+    the scikit-learn API.
+
+    The `DevelopmentML` estimator is used to generate ``ldf_`` patterns from
+    the data.
+
+    .. versionadded:: 0.8.1
+
+
+    Parameters
+    ----------
+    estimator_ml: sklearn Estimator
+        Any sklearn compatible regression estimator, including Pipelines.
+    y_ml: list or str or sklearn_transformer
+        The response column(s) for the machine learning algorithm. It must be
+        present within the Triangle.
+    autoregressive: tuple, (autoregressive_col_name, lag, source_col_name)
+        The subset of response column(s) to use as lagged features for the
+        Time Series aspects of the model. Predictions from one development period
+        get used as features in the next development period. Lags should be negative
+        integers.
+    feat_eng: dict
+        A dictionary with feature names as keys and a dictionary of function (with a key of 'func') and keyword arguments (with a key of 'kwargs') 
+        (e.g. {
+            'feature_1':{
+                'func': function_name for feature 1,
+                'kwargs': keyword arguments for the function
+                },
+            'feature_2':{
+                'func': function_name for feature 2,
+                'kwargs': keyword arguments for the function
+                }
+            }
+        );  
+        functions should be written with an input DataFrame named df; this is the DataFrame containing origin, development, and valuation that will be passed into the function at run time
+        (e.g. this function adds 1 to every origin 
+        def test_func(df):
+            return df['origin'] + 1
+        )
+    fit_incrementals:
+        Whether the response variable should be converted to an incremental basis
+        for fitting.
+
+    Attributes
+    ----------
+    estimator_ml: Estimator
+        An sklearn-style estimator to predict development patterns
+    ldf_: Triangle
+        The estimated loss development patterns.
+    cdf_: Triangle
+        The estimated cumulative development patterns.
+    """
+
+    def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False,
+                 weight_ml=None, fit_incrementals=True, feat_eng=None):
+        self.estimator_ml=estimator_ml
+        self.y_ml=y_ml
+        self.weight_ml = weight_ml
+        self.autoregressive=autoregressive
+        self.fit_incrementals = fit_incrementals
+        self.feat_eng = feat_eng
+
+    def _get_y_names(self):
+        """ private function to get the response column name"""
+        if not self.y_ml:
+            y_names = self._columns
+        if hasattr(self.y_ml, '_columns'):
+            y_names = self.y_ml._columns
+        elif isinstance(self.y_ml, ColumnTransformer):
+            y_names = self.y_ml.transformers[0][-1]
+        if type(self.y_ml) is list:
+            y_names = self.y_ml
+        elif type(self.y_ml) is str:
+            y_names = [self.y_ml]
+        return y_names
+
+
+    @property
+    def y_ml_(self):
+        defaults = self._get_y_names()
+        transformer = self.y_ml
+        if not transformer:
+            return ColumnTransformer(
+                transformers=[('passthrough', 'passthrough', defaults)])
+        elif type(transformer) is list:
+            return ColumnTransformer(
+                transformers=[('passthrough', 'passthrough', transformer)])
+        elif type(transformer) is str:
+            return ColumnTransformer(
+                transformers=[('passthrough', 'passthrough', [transformer])])
+        else:
+            return transformer
+
+    def _get_triangle_ml(self, df, preds=None):
+        """ Create fitted Triangle """
+        from chainladder.core import Triangle
+        if preds is None:
+            preds = self.estimator_ml.predict(df)
+        X_r = [df]
+        y_r = [preds]
+        dgrain = {'Y':12, 'Q':3, 'M': 1, 'S': 6}[self.development_grain_]
+        ograin = {'Y':1, 'Q':4, 'M': 12, 'S': 6}[self.origin_grain_]
+        latest_filter = (df['origin']+1)*ograin+(df['development']-dgrain)/dgrain
+        latest_filter = latest_filter == latest_filter.max()
+        preds=pd.DataFrame(preds.copy())[latest_filter].values
+        out = df.loc[latest_filter].copy()
+        dev_lags = df['development'].drop_duplicates().sort_values()
+        for d in dev_lags[1:]:
+            out['development'] = out['development'] + dgrain
+            out['valuation'] = out['valuation'] + dgrain / 12
+            if len(preds.shape) == 1:
+                preds = preds[:, None]
+            if self.autoregressive:
+                for num, col in enumerate(self.autoregressive):
+                    out[col[0]]=preds[:, num]
+            out = out[out['development']<=dev_lags.max()]
+            if len(out) == 0:
+                continue
+            X_r.append(out.copy())
+            if self.feat_eng is not None:            
+                for key, item in self.feat_eng.items():
+                    out[key] = item['func'](df=out,**item['kwargs'])
+            preds = self.estimator_ml.predict(out)
+            y_r.append(preds.copy())
+        X_r = pd.concat(X_r, axis=0).reset_index(drop=True)
+        if True:
+            X_r = X_r.drop(self._get_y_names(), axis=1)
+        out = pd.concat((X_r,
+                         pd.DataFrame(np.concatenate(y_r, 0), columns=self._get_y_names())), axis=1)
+        out['origin'] = out['origin'].map({v: k for k, v in self.origin_encoder_.items()})
+        out['valuation'] = out['valuation'].map({v: k for k, v in self.valuation_encoder_.items()})
+        return Triangle(
+            out, origin='origin', development='valuation',
+            index=self._key_labels, columns=self._get_y_names(),
+            cumulative=not self.fit_incrementals).dropna()
+
+    def _prep_X_ml(self, X):
+        """ Preps Triangle data ahead of the pipeline """
+        if self.fit_incrementals:
+            X_ = X.cum_to_incr()
+        else:
+            X_ = X.copy()
+        if self.autoregressive:
+            for i in self.autoregressive:
+                lag = X[i[2]].shift(i[1])
+                X_[i[0]] = lag[lag.valuation<=X.valuation_date]
+        df_base = X.incr_to_cum().to_frame(
+            keepdims=True, implicit_axis=True, origin_as_datetime=True
+            ).reset_index().iloc[:, :-1]
+        df = df_base.merge(X.cum_to_incr().to_frame(
+                keepdims=True, implicit_axis=True, origin_as_datetime=True
+            ).reset_index(), how='left',
+            on=list(df_base.columns)).fillna(0)
+        df['origin'] = df['origin'].map(self.origin_encoder_)
+        df['valuation'] = df['valuation'].map(self.valuation_encoder_)
+        if self.feat_eng is not None:            
+            for key, item in self.feat_eng.items():
+                df[key] = item['func'](df=df,**item['kwargs'])
+        return df
+
+    def fit(self, X, y=None, sample_weight=None):
+        """Fit the model with X.
+
+        Parameters
+        ----------
+        X : Triangle-like
+            Set of LDFs to which the estimator will be applied.
+        y : None
+            Ignored, use y_ml to set a response variable for the ML algorithm
+        sample_weight : None
+            Ignored
+
+        Returns
+        -------
+        self : object
+            Returns the instance itself.
+        """
+
+        self._columns = list(X.columns)
+        self._key_labels = X.key_labels
+        self.origin_grain_ = X.origin_grain
+        self.development_grain_ = X.development_grain
+        self.origin_encoder_ = dict(zip(
+            X.origin.to_timestamp(how='s'),
+            (pd.Series(X.origin).rank()-1)/{'Y':1, 'Q':4, 'M': 12, 'S': 6}[X.origin_grain]))
+        val = X.valuation.sort_values().unique()
+        self.valuation_encoder_ = dict(zip(
+            val,
+            (pd.Series(val).rank()-1)/{'Y':1, 'Q':4, 'M': 12, 'S': 6}[X.development_grain]))
+        df = self._prep_X_ml(X)
+        self.df_ = df
+        # Fit model
+        self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze())
+        #return selffit_incrementals 
+        self.triangle_ml_ = self._get_triangle_ml(df)
+        return self
+
+    @property
+    def ldf_(self):
+        ldf = self.triangle_ml_.incr_to_cum().link_ratio
+        ldf.valuation_date = pd.to_datetime(options.ULT_VAL)
+        return ldf
+
+    def transform(self, X):
+        """ If X and self are of different shapes, align self to X, else
+        return self.
+
+        Parameters
+        ----------
+        X : Triangle
+            The triangle to be transformed
+
+        Returns
+        -------
+            X_new : New triangle with transformed attributes.
+        """
+        X_new = X.copy()
+        X_ml = self._prep_X_ml(X)
+        y_ml=self.estimator_ml.predict(X_ml)
+        triangle_ml = self._get_triangle_ml(X_ml, y_ml)
+        backend = "cupy" if X.array_backend == "cupy" else "numpy"
+        X_new.ldf_ = triangle_ml.incr_to_cum().link_ratio.set_backend(backend)
+        X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL)
+        X_new._set_slicers()
+        return X_new