Merge pull request #635 from henrydingliu/master

henrydingliu · web-flow · commit ae858c26483b · 2025-12-30T18:07:36.000-08:00
Fully resolving #623
diff --git a/chainladder/development/barnzehn.py b/chainladder/development/barnzehn.py
@@ -22,18 +22,43 @@ class BarnettZehnwirth(TweedieGLM):
 
     Parameters
     ----------
+    drop: tuple or list of tuples
+        Drops specific origin/development combination(s)
+    drop_valuation: str or list of str (default = None)
+        Drops specific valuation periods. str must be date convertible.
     formula: formula-like
         A patsy formula describing the independent variables, X of the GLM
+    feat_eng: dict
+        A dictionary mapping each feature name to a dictionary that holds the function (under the key 'func') and its keyword arguments (under the key 'kwargs')
+        (e.g. {
+            'feature_1':{
+                'func': function_name for feature 1,
+                'kwargs': keyword arguments for the function
+                },
+            'feature_2':{
+                'func': function_name for feature 2,
+                'kwargs': keyword arguments for the function
+                }
+            }
+        );  
+        Functions should be written with an input DataFrame named df; this is the DataFrame containing origin, development, and valuation that will be passed into the function at run time
+        (e.g. this function adds 1 to every origin
+        def test_func(df):
+            return df['origin'] + 1
+        )
     response:  str
         Column name for the reponse variable of the GLM.  If ommitted, then the
         first column of the Triangle will be used.
 
 
     """
 
-    def __init__(self, formula='C(origin) + development', response=None):
+    def __init__(self, drop=None,drop_valuation=None,formula='C(origin) + development', feat_eng=None, response=None):
+        self.drop = drop
+        self.drop_valuation = drop_valuation
         self.formula = formula
         self.response = response
+        self.feat_eng = feat_eng
 
     def fit(self, X, y=None, sample_weight=None):
         if max(X.shape[:2]) > 1:
@@ -50,7 +75,7 @@ def fit(self, X, y=None, sample_weight=None):
         self.model_ = DevelopmentML(Pipeline(steps=[
             ('design_matrix', PatsyFormula(self.formula)),
             ('model', LinearRegression(fit_intercept=False))]),
-                    y_ml=response, fit_incrementals=False).fit(tri)
+                    y_ml=response, fit_incrementals=False, feat_eng = self.feat_eng, drop=self.drop, drop_valuation = self.drop_valuation, weighted_step = 'model').fit(tri)
         resid = tri - self.model_.triangle_ml_[
             self.model_.triangle_ml_.valuation <= tri.valuation_date]
         self.mse_resid_ = (resid**2).sum(0).sum(1).sum(2).sum() / (
@@ -75,12 +100,13 @@ def transform(self, X):
             X_new : New triangle with transformed attributes.
         """
         X_new = X.copy()
-        X_ml = self.model_._prep_X_ml(X.cum_to_incr().log())
+        X_ml, weight_ml = self.model_._prep_X_ml(X.cum_to_incr().log())
         y_ml = self.model_.estimator_ml.predict(X_ml)
-        triangle_ml = self.model_._get_triangle_ml(X_ml, y_ml)
+        triangle_ml, predicted_data = self.model_._get_triangle_ml(X_ml, y_ml)
         backend = "cupy" if X.array_backend == "cupy" else "numpy"
         triangle_ml.is_cumulative = False
         X_new.ldf_ = triangle_ml.exp().incr_to_cum().link_ratio.set_backend(backend)
         X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL)
         X_new._set_slicers()
+        X_new.predicted_data_ = predicted_data
         return X_new
diff --git a/chainladder/development/learning.py b/chainladder/development/learning.py
@@ -33,6 +33,28 @@ class DevelopmentML(DevelopmentBase):
         Time Series aspects of the model. Predictions from one development period
         get used as featues in the next development period. Lags should be negative
         integers.
+    drop: tuple or list of tuples
+        Drops specific origin/development combination(s)
+    drop_valuation: str or list of str (default = None)
+        Drops specific valuation periods. str must be date convertible.
+    feat_eng: dict
+        A dictionary mapping each feature name to a dictionary that holds the function (under the key 'func') and its keyword arguments (under the key 'kwargs')
+        (e.g. {
+            'feature_1':{
+                'func': function_name for feature 1,
+                'kwargs': keyword arguments for the function
+                },
+            'feature_2':{
+                'func': function_name for feature 2,
+                'kwargs': keyword arguments for the function
+                }
+            }
+        );  
+        Functions should be written with an input DataFrame named df; this is the DataFrame containing origin, development, and valuation that will be passed into the function at run time
+        (e.g. this function adds 1 to every origin
+        def test_func(df):
+            return df['origin'] + 1
+        )
     fit_incrementals:
         Whether the response variable should be converted to an incremental basis
         for fitting.
@@ -48,12 +70,16 @@ class DevelopmentML(DevelopmentBase):
     """
 
     def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False,
-                 weight_ml=None, fit_incrementals=True):
+                 weight_ml=None, weighted_step=None,drop=None,drop_valuation=None,fit_incrementals=True, feat_eng=None):
         self.estimator_ml=estimator_ml
         self.y_ml=y_ml
         self.weight_ml = weight_ml
-        self.autoregressive=autoregressive
+        self.weighted_step = weighted_step
+        self.autoregressive = autoregressive
+        self.drop = drop
+        self.drop_valuation = drop_valuation
         self.fit_incrementals = fit_incrementals
+        self.feat_eng = feat_eng
 
     def _get_y_names(self):
         """ private function to get the response column name"""
@@ -112,6 +138,9 @@ def _get_triangle_ml(self, df, preds=None):
             if len(out) == 0:
                 continue
             X_r.append(out.copy())
+            if self.feat_eng is not None:            
+                for key, item in self.feat_eng.items():
+                    out[key] = item['func'](df=out,**item['kwargs'])
             preds = self.estimator_ml.predict(out)
             y_r.append(preds.copy())
         X_r = pd.concat(X_r, axis=0).reset_index(drop=True)
@@ -124,7 +153,7 @@ def _get_triangle_ml(self, df, preds=None):
         return Triangle(
             out, origin='origin', development='valuation',
             index=self._key_labels, columns=self._get_y_names(),
-            cumulative=not self.fit_incrementals).dropna()
+            cumulative=not self.fit_incrementals).dropna(), out
 
     def _prep_X_ml(self, X):
         """ Preps Triangle data ahead of the pipeline """
@@ -145,7 +174,16 @@ def _prep_X_ml(self, X):
             on=list(df_base.columns)).fillna(0)
         df['origin'] = df['origin'].map(self.origin_encoder_)
         df['valuation'] = df['valuation'].map(self.valuation_encoder_)
-        return df
+        if self.feat_eng is not None:            
+            for key, item in self.feat_eng.items():
+                df[key] = item['func'](df=df,**item['kwargs'])
+        weight_base = (~np.isnan(X.values)).astype(float)
+        weight = weight_base.copy()
+        if self.drop is not None:
+            weight = weight * self._drop_func(X)
+        if self.drop_valuation is not None:
+            weight = weight * self._drop_valuation_func(X)        
+        return df, weight.flatten()[weight_base.flatten()>0]
 
     def fit(self, X, y=None, sample_weight=None):
         """Fit the model with X.
@@ -176,12 +214,19 @@ def fit(self, X, y=None, sample_weight=None):
         self.valuation_encoder_ = dict(zip(
             val,
             (pd.Series(val).rank()-1)/{'Y':1, 'S': 2, 'Q':4, 'M': 12}[X.development_grain]))
-        df = self._prep_X_ml(X)
+        df, weight = self._prep_X_ml(X)
         self.df_ = df
+        self.weight_ = weight
+        if self.weighted_step == None:
+            sample_weights = {}
+        elif isinstance(self.weighted_step, list):
+            sample_weights = {x + '__sample_weight':weight for x in self.weighted_step}
+        else:
+            sample_weights = {self.weighted_step + '__sample_weight':weight}
         # Fit model
-        self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze())
+        self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze(),**sample_weights)
         #return selffit_incrementals 
-        self.triangle_ml_ = self._get_triangle_ml(df)
+        self.triangle_ml_, self.predicted_data_ = self._get_triangle_ml(df)
         return self
 
     @property
@@ -204,11 +249,12 @@ def transform(self, X):
             X_new : New triangle with transformed attributes.
         """
         X_new = X.copy()
-        X_ml = self._prep_X_ml(X)
+        X_ml, weight_ml = self._prep_X_ml(X)
         y_ml=self.estimator_ml.predict(X_ml)
-        triangle_ml = self._get_triangle_ml(X_ml, y_ml)
+        triangle_ml, predicted_data = self._get_triangle_ml(X_ml, y_ml)
         backend = "cupy" if X.array_backend == "cupy" else "numpy"
         X_new.ldf_ = triangle_ml.incr_to_cum().link_ratio.set_backend(backend)
         X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL)
         X_new._set_slicers()
-        return X_new
+        X_new.predicted_data_ = predicted_data
+        return X_new
diff --git a/chainladder/development/tests/test_barnzehn.py b/chainladder/development/tests/test_barnzehn.py
@@ -12,4 +12,65 @@ def test_basic_bz():
 def test_multiple_triangle_exception():
     d = cl.load_sample("usauto")
     with pytest.raises(ValueError):
-        cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(d)
+        cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(d)
+
+def test_feat_eng_1():
+    '''
+    This function tests passing in a basic engineered feature. Since test_func just returns development, the formulas using development and testfeat should yield identical results
+    '''
+    def test_func(df):
+        return df["development"]
+
+    abc = cl.load_sample('abc')
+    test_dict = {'testfeat':{'func':test_func,'kwargs':{}}}
+
+    assert np.all(
+        np.around(cl.BarnettZehnwirth(formula='C(origin)+development+valuation').fit(abc).coef_.T.values,3)
+        == np.around(cl.BarnettZehnwirth(formula='C(origin)+testfeat+valuation',feat_eng = test_dict).fit(abc).coef_.T.values,3)
+    )
+
+def test_feat_eng_2():
+    '''
+    this function tests more complex feature engineering. Since origin_onehot just replicates the one-hot encoding that's performed inside sklearn LinearRegression, the two BZ models should yield identical results
+
+    this function also tests the BZ transformer
+    '''
+    def origin_onehot(df,ori):
+        return [1 if x == ori else 0 for x in df["origin"]]
+
+    abc = cl.load_sample('abc')
+    feat_dict = {f'origin_{x}':{'func':origin_onehot,'kwargs':{'ori':float(x+1)}} for x in range(10)}
+    assert np.all(
+        np.around(cl.BarnettZehnwirth(formula='+'.join([f'C({x})' for x in feat_dict.keys()]),feat_eng = feat_dict).fit(abc).ldf_.values,3)
+        == np.around(cl.BarnettZehnwirth(formula='C(origin)').fit_transform(abc).ldf_.values,3)
+    )
+
+def test_bz_2008():
+    '''
+    this function tests the drop parameter by recreating the example in the 2008 BZ paper, section 4.1
+    '''
+    abc = cl.load_sample('abc')
+    exposure=np.array([[2.2], [2.4], [2.2], [2.0], [1.9], [1.6], [1.6], [1.8], [2.2], [2.5], [2.6]])
+    abc_adj = abc/exposure
+
+    def predictor_bins(df,pbin,axis):
+        return [int(x >= min(pbin)) for x in df[axis]]
+        
+    origin_groups = {f'origin_{ori}'.replace('[','').replace(']','').replace(', ',''):{'func':predictor_bins,'kwargs':{'pbin':ori,'axis':'origin'}} for ori in [[2],[3,4],[5,6,7,8,9,10]]}
+
+    def trend_piece(df,piece,axis):
+        pmax = float(max(piece))
+        increment=min(df[axis][df[axis]>0])
+        pfirst = piece[0]-increment
+        return [(x-pfirst)/increment if x in piece else (0 if x<pmax else (pmax-pfirst)/increment) for x in df[axis]]
+        
+    development_groups = {f'development_{dev}'.replace('[','').replace(']','').replace(', ',''):{'func':trend_piece,'kwargs':{'piece':dev,'axis':'development'}} for dev in [[24],[36],[48,60,72],[84,96],[108,120,132]]}
+
+    valuation_groups = {f'valuation_{val}'.replace('[','').replace(']','').replace(', ',''):{'func':trend_piece,'kwargs':{'piece':val,'axis':'valuation'}} for val in [[1,2,3,4,5,6,7],[8],[9,10]]}
+
+    abc_dict = {**origin_groups,**development_groups,**valuation_groups}
+    model=cl.BarnettZehnwirth(formula='+'.join([z for z in abc_dict.keys()]),feat_eng=abc_dict, drop=('1982',72)).fit(abc_adj)
+    assert np.all(
+        np.around(model.coef_.values,4).flatten()
+        == np.array([11.1579,0.1989,0.0703,0.0919,0.1871,-0.3771,-0.4465,-0.3727,-0.3154,0.0432,0.0858,0.1464])
+    )