Add sample weights to BarnettZehnwirth and DevelopmentML

henrydingliu · henrydingliu · commit a219c24fa0af · 2025-12-31T01:50:37.000Z
Sample weights enable dropping specific points from the fit, which is essential for recreating the Barnett-Zehnwirth (BZ) results.
diff --git a/chainladder/development/barnzehn.py b/chainladder/development/barnzehn.py
@@ -22,6 +22,10 @@ class BarnettZehnwirth(TweedieGLM):
 
     Parameters
     ----------
+    drop: tuple or list of tuples (default = None)
+        Drops specific origin/development combination(s).
+    drop_valuation: str or list of str (default = None)
+        Drops specific valuation periods. Each str must be convertible to a date.
     formula: formula-like
         A patsy formula describing the independent variables, X of the GLM
     feat_eng: dict
@@ -49,7 +53,9 @@ def test_func(df)
 
     """
 
-    def __init__(self, formula='C(origin) + development', feat_eng=None, response=None):
+    def __init__(self, drop=None,drop_valuation=None,formula='C(origin) + development', feat_eng=None, response=None):
+        self.drop = drop
+        self.drop_valuation = drop_valuation
         self.formula = formula
         self.response = response
         self.feat_eng = feat_eng
@@ -69,7 +75,7 @@ def fit(self, X, y=None, sample_weight=None):
         self.model_ = DevelopmentML(Pipeline(steps=[
             ('design_matrix', PatsyFormula(self.formula)),
             ('model', LinearRegression(fit_intercept=False))]),
-                    y_ml=response, fit_incrementals=False, feat_eng = self.feat_eng).fit(tri)
+                    y_ml=response, fit_incrementals=False, feat_eng = self.feat_eng, drop=self.drop, drop_valuation = self.drop_valuation, weighted_step = 'model').fit(tri)
         resid = tri - self.model_.triangle_ml_[
             self.model_.triangle_ml_.valuation <= tri.valuation_date]
         self.mse_resid_ = (resid**2).sum(0).sum(1).sum(2).sum() / (
@@ -94,12 +100,13 @@ def transform(self, X):
             X_new : New triangle with transformed attributes.
         """
         X_new = X.copy()
-        X_ml = self.model_._prep_X_ml(X.cum_to_incr().log())
+        X_ml, weight_ml = self.model_._prep_X_ml(X.cum_to_incr().log())
         y_ml = self.model_.estimator_ml.predict(X_ml)
-        triangle_ml = self.model_._get_triangle_ml(X_ml, y_ml)
+        triangle_ml, predicted_data = self.model_._get_triangle_ml(X_ml, y_ml)
         backend = "cupy" if X.array_backend == "cupy" else "numpy"
         triangle_ml.is_cumulative = False
         X_new.ldf_ = triangle_ml.exp().incr_to_cum().link_ratio.set_backend(backend)
         X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL)
         X_new._set_slicers()
+        X_new.predicted_data_ = predicted_data
         return X_new
diff --git a/chainladder/development/learning.py b/chainladder/development/learning.py
@@ -33,6 +33,10 @@ class DevelopmentML(DevelopmentBase):
         Time Series aspects of the model. Predictions from one development period
         get used as featues in the next development period. Lags should be negative
         integers.
+    drop: tuple or list of tuples (default = None)
+        Drops specific origin/development combination(s).
+    drop_valuation: str or list of str (default = None)
+        Drops specific valuation periods. Each str must be convertible to a date.
     feat_eng: dict
         A dictionary with feature names as keys and a dictionary of function (with a key of 'func') and keyword arguments (with a key of 'kwargs') 
         (e.g. {
@@ -66,11 +70,14 @@ def test_func(df)
     """
 
     def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False,
-                 weight_ml=None, fit_incrementals=True, feat_eng=None):
+                 weight_ml=None, weighted_step=None,drop=None,drop_valuation=None,fit_incrementals=True, feat_eng=None):
         self.estimator_ml=estimator_ml
         self.y_ml=y_ml
         self.weight_ml = weight_ml
-        self.autoregressive=autoregressive
+        self.weighted_step = weighted_step
+        self.autoregressive = autoregressive
+        self.drop = drop
+        self.drop_valuation = drop_valuation
         self.fit_incrementals = fit_incrementals
         self.feat_eng = feat_eng
 
@@ -146,7 +153,7 @@ def _get_triangle_ml(self, df, preds=None):
         return Triangle(
             out, origin='origin', development='valuation',
             index=self._key_labels, columns=self._get_y_names(),
-            cumulative=not self.fit_incrementals).dropna()
+            cumulative=not self.fit_incrementals).dropna(), out
 
     def _prep_X_ml(self, X):
         """ Preps Triangle data ahead of the pipeline """
@@ -170,7 +177,13 @@ def _prep_X_ml(self, X):
         if self.feat_eng is not None:            
             for key, item in self.feat_eng.items():
                 df[key] = item['func'](df=df,**item['kwargs'])
-        return df
+        weight_base = (~np.isnan(X.values)).astype(float)
+        weight = weight_base.copy()
+        if self.drop is not None:
+            weight = weight * self._drop_func(X)
+        if self.drop_valuation is not None:
+            weight = weight * self._drop_valuation_func(X)        
+        return df, weight.flatten()[weight_base.flatten()>0]
 
     def fit(self, X, y=None, sample_weight=None):
         """Fit the model with X.
@@ -201,12 +214,19 @@ def fit(self, X, y=None, sample_weight=None):
         self.valuation_encoder_ = dict(zip(
             val,
             (pd.Series(val).rank()-1)/{'Y':1, 'S': 2, 'Q':4, 'M': 12}[X.development_grain]))
-        df = self._prep_X_ml(X)
+        df, weight = self._prep_X_ml(X)
         self.df_ = df
+        self.weight_ = weight
+        if self.weighted_step == None:
+            sample_weights = {}
+        elif isinstance(self.weighted_step, list):
+            sample_weights = {x + '__sample_weight':weight for x in self.weighted_step}
+        else:
+            sample_weights = {self.weighted_step + '__sample_weight':weight}
         # Fit model
-        self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze())
+        self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze(),**sample_weights)
         #return selffit_incrementals 
-        self.triangle_ml_ = self._get_triangle_ml(df)
+        self.triangle_ml_, self.predicted_data_ = self._get_triangle_ml(df)
         return self
 
     @property
@@ -229,11 +249,12 @@ def transform(self, X):
             X_new : New triangle with transformed attributes.
         """
         X_new = X.copy()
-        X_ml = self._prep_X_ml(X)
+        X_ml, weight_ml = self._prep_X_ml(X)
         y_ml=self.estimator_ml.predict(X_ml)
-        triangle_ml = self._get_triangle_ml(X_ml, y_ml)
+        triangle_ml, predicted_data = self._get_triangle_ml(X_ml, y_ml)
         backend = "cupy" if X.array_backend == "cupy" else "numpy"
         X_new.ldf_ = triangle_ml.incr_to_cum().link_ratio.set_backend(backend)
         X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL)
         X_new._set_slicers()
-        return X_new
+        X_new.predicted_data_ = predicted_data
+        return X_new
diff --git a/chainladder/development/tests/test_barnzehn.py b/chainladder/development/tests/test_barnzehn.py
@@ -44,7 +44,33 @@ def origin_onehot(df,ori):
         np.around(cl.BarnettZehnwirth(formula='+'.join([f'C({x})' for x in feat_dict.keys()]),feat_eng = feat_dict).fit(abc).ldf_.values,3)
         == np.around(cl.BarnettZehnwirth(formula='C(origin)').fit_transform(abc).ldf_.values,3)
     )
+
+def test_bz_2008():
+    '''
+    Tests the drop parameter by recreating the example in section 4.1 of the 2008 BZ paper.
+    '''
+    abc = cl.load_sample('abc')
+    exposure=np.array([[2.2], [2.4], [2.2], [2.0], [1.9], [1.6], [1.6], [1.8], [2.2], [2.5], [2.6]])
+    abc_adj = abc/exposure
+
+    def predictor_bins(df,pbin,axis):
+        return [int(x >= min(pbin)) for x in df[axis]]
+        
+    origin_groups = {f'origin_{ori}'.replace('[','').replace(']','').replace(', ',''):{'func':predictor_bins,'kwargs':{'pbin':ori,'axis':'origin'}} for ori in [[2],[3,4],[5,6,7,8,9,10]]}
+
+    def trend_piece(df,piece,axis):
+        pmax = float(max(piece))
+        increment=min(df[axis][df[axis]>0])
+        pfirst = piece[0]-increment
+        return [(x-pfirst)/increment if x in piece else (0 if x<pmax else (pmax-pfirst)/increment) for x in df[axis]]
+        
+    development_groups = {f'development_{dev}'.replace('[','').replace(']','').replace(', ',''):{'func':trend_piece,'kwargs':{'piece':dev,'axis':'development'}} for dev in [[24],[36],[48,60,72],[84,96],[108,120,132]]}
+
+    valuation_groups = {f'valuation_{val}'.replace('[','').replace(']','').replace(', ',''):{'func':trend_piece,'kwargs':{'piece':val,'axis':'valuation'}} for val in [[1,2,3,4,5,6,7],[8],[9,10]]}
+
+    abc_dict = {**origin_groups,**development_groups,**valuation_groups}
+    model=cl.BarnettZehnwirth(formula='+'.join([z for z in abc_dict.keys()]),feat_eng=abc_dict, drop=('1982',72)).fit(abc_adj)
     assert np.all(
-        np.around(cl.BarnettZehnwirth(formula='+'.join([f'C({x})' for x in feat_dict.keys()]),feat_eng = feat_dict).fit(abc).ldf_.values,3)
-        == np.around(cl.BarnettZehnwirth(formula='C(origin)').fit_transform(abc).ldf_.values,3)
+        np.around(model.coef_.values,4).flatten()
+        == np.array([11.1579,0.1989,0.0703,0.0919,0.1871,-0.3771,-0.4465,-0.3727,-0.3154,0.0432,0.0858,0.1464])
     )