Commit aa949dc

Merge pull request #643 from casact/BZ_w_drop

Enhancing BZ method to recreate BZ08

2 parents 749eb51 + 79231bf

File tree: 7 files changed, +157 −34 lines changed

chainladder/development/barnzehn.py

Lines changed: 10 additions & 10 deletions
@@ -22,6 +22,10 @@ class BarnettZehnwirth(TweedieGLM):
 
     Parameters
     ----------
+    drop: tuple or list of tuples
+        Drops specific origin/development combination(s)
+    drop_valuation: str or list of str (default = None)
+        Drops specific valuation periods. str must be date convertible.
     formula: formula-like
         A patsy formula describing the independent variables, X of the GLM
     response: str
@@ -31,7 +35,9 @@ class BarnettZehnwirth(TweedieGLM):
 
     """
 
-    def __init__(self, formula='C(origin) + development', response=None):
+    def __init__(self, drop=None,drop_valuation=None,formula='C(origin) + development', response=None):
+        self.drop = drop
+        self.drop_valuation = drop_valuation
         self.formula = formula
         self.response = response
 
@@ -40,17 +46,10 @@ def fit(self, X, y=None, sample_weight=None):
             raise ValueError("Only single index/column triangles are supported")
         tri = X.cum_to_incr().log()
         response = X.columns[0] if not self.response else self.response
-        # Check for more than one linear predictor
-        linearpredictors = 0
-        for term in ModelDesc.from_formula(self.formula).rhs_termlist[1:]:
-            if 'C(' not in term.factors[0].code:
-                linearpredictors += 1
-        if linearpredictors > 1:
-            warnings.warn("Using more than one linear predictor with BarnettZehnwirth may lead to issues with multicollinearity.")
         self.model_ = DevelopmentML(Pipeline(steps=[
             ('design_matrix', PatsyFormula(self.formula)),
             ('model', LinearRegression(fit_intercept=False))]),
-            y_ml=response, fit_incrementals=False).fit(tri)
+            y_ml=response, fit_incrementals=True, drop=self.drop, drop_valuation = self.drop_valuation, weighted_step = 'model').fit(X = tri, sample_weight = sample_weight)
         resid = tri - self.model_.triangle_ml_[
             self.model_.triangle_ml_.valuation <= tri.valuation_date]
         self.mse_resid_ = (resid**2).sum(0).sum(1).sum(2).sum() / (
@@ -77,10 +76,11 @@ def transform(self, X):
         X_new = X.copy()
         X_ml = self.model_._prep_X_ml(X.cum_to_incr().log())
         y_ml = self.model_.estimator_ml.predict(X_ml)
-        triangle_ml = self.model_._get_triangle_ml(X_ml, y_ml)
+        triangle_ml, predicted_data = self.model_._get_triangle_ml(X_ml, y_ml)
         backend = "cupy" if X.array_backend == "cupy" else "numpy"
         triangle_ml.is_cumulative = False
         X_new.ldf_ = triangle_ml.exp().incr_to_cum().link_ratio.set_backend(backend)
         X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL)
         X_new._set_slicers()
+        X_new.predicted_data_ = predicted_data
         return X_new
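
Taken together, these changes let BarnettZehnwirth accept the same drop/drop_valuation arguments as other development estimators and thread them (plus any sample_weight) into the weighted regression inside DevelopmentML. A minimal usage sketch, assuming only the API shown in this diff (the abc sample triangle is used purely for illustration):

import chainladder as cl

abc = cl.load_sample('abc')

# Zero-weight one (origin, development) cell plus the whole 1979 calendar
# diagonal; the model is otherwise fit as before on the log-incremental
# triangle.
model = cl.BarnettZehnwirth(
    formula='C(origin) + development',
    drop=[('1977', 36)],
    drop_valuation='1979',
).fit(abc)

print(model.coef_)  # fitted regression coefficients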

chainladder/development/glm.py

Lines changed: 16 additions & 9 deletions
@@ -22,15 +22,16 @@ class TweedieGLM(DevelopmentBase):
 
     Parameters
     ----------
+    drop: tuple or list of tuples
+        Drops specific origin/development combination(s)
+    drop_valuation: str or list of str (default = None)
+        Drops specific valuation periods. str must be date convertible.
     design_matrix: formula-like
         A patsy formula describing the independent variables, X of the GLM
     response: str
         Column name for the reponse variable of the GLM. If ommitted, then the
         first column of the Triangle will be used.
-    weight: str
-        Column name of any weight to use in the GLM. If none specified, then an
-        unweighted regression will be performed.
-    power: float, default=0
+    power: float, default=1
         The power determines the underlying target distribution according
         to the following table:
         +-------+------------------------+
@@ -52,7 +53,7 @@ class TweedieGLM(DevelopmentBase):
         regularization strength. ``alpha = 0`` is equivalent to unpenalized
         GLMs. In this case, the design matrix `X` must have full column rank
         (no collinearities).
-    link: {'auto', 'identity', 'log'}, default='auto'
+    link: {'auto', 'identity', 'log'}, default='log'
         The link function of the GLM, i.e. mapping from linear predictor
         `X @ coeff + intercept` to prediction `y_pred`. Option 'auto' sets
         the link depending on the chosen family as follows:
@@ -78,10 +79,11 @@ class TweedieGLM(DevelopmentBase):
     """
 
     def __init__(self, design_matrix='C(development) + C(origin)',
-                 response=None, weight=None, power=1.0, alpha=1.0, link='log',
-                 max_iter=100, tol=0.0001, warm_start=False, verbose=0):
+                 response=None, power=1.0, alpha=1.0, link='log',
+                 max_iter=100, tol=0.0001, warm_start=False, verbose=0, drop=None,drop_valuation=None):
+        self.drop = drop
+        self.drop_valuation = drop_valuation
         self.response=response
-        self.weight=weight
         self.design_matrix = design_matrix
         self.power=power
         self.alpha=alpha
@@ -93,13 +95,18 @@ def __init__(self, design_matrix='C(development) + C(origin)',
 
     def fit(self, X, y=None, sample_weight=None):
        response = X.columns[0] if not self.response else self.response
+        if sample_weight is None:
+            weight = None
+        else:
+            weight = 'model'
        self.model_ = DevelopmentML(Pipeline(steps=[
            ('design_matrix', PatsyFormula(self.design_matrix)),
            ('model', TweedieRegressor(
                link=self.link, power=self.power, max_iter=self.max_iter,
                tol=self.tol, warm_start=self.warm_start,
                verbose=self.verbose, fit_intercept=False))]),
-            y_ml=response, weight_ml=self.weight).fit(X)
+            y_ml=response, weighted_step = weight,
+            drop=self.drop, drop_valuation=self.drop_valuation).fit(X = X, sample_weight = sample_weight)
        return self
 
    @property
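
With the old weight: str parameter removed, weights are now supplied at fit time as a Triangle and routed into the TweedieRegressor step. A sketch of the new calling convention, mirroring test_sample_weight further down (genins/genins simply assigns unit weight to every observed cell, so the weighted fit should reproduce the unweighted one):

import chainladder as cl

genins = cl.load_sample('genins')

unit_weights = genins / genins  # Triangle-like weights, all ones
dev = cl.TweedieGLM().fit_transform(genins, sample_weight=unit_weights)
ultimate = cl.Chainladder().fit(dev).ultimate_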

chainladder/development/learning.py

Lines changed: 40 additions & 13 deletions
@@ -33,9 +33,14 @@ class DevelopmentML(DevelopmentBase):
         Time Series aspects of the model. Predictions from one development period
         get used as featues in the next development period. Lags should be negative
         integers.
+    weight_step: str
+        Step name within estimator_ml that is weighted
+    drop: tuple or list of tuples
+        Drops specific origin/development combination(s)
+    drop_valuation: str or list of str (default = None)
+        Drops specific valuation periods. str must be date convertible.
     fit_incrementals:
-        Whether the response variable should be converted to an incremental basis
-        for fitting.
+        Whether the response variable should be converted to an incremental basis for fitting.
 
     Attributes
     ----------
@@ -48,11 +53,13 @@
     """
 
     def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False,
-                 weight_ml=None, fit_incrementals=True):
+                 weighted_step=None,drop=None,drop_valuation=None,fit_incrementals=True):
         self.estimator_ml=estimator_ml
         self.y_ml=y_ml
-        self.weight_ml = weight_ml
-        self.autoregressive=autoregressive
+        self.weighted_step = weighted_step
+        self.autoregressive = autoregressive
+        self.drop = drop
+        self.drop_valuation = drop_valuation
         self.fit_incrementals = fit_incrementals
 
     def _get_y_names(self):
@@ -124,7 +131,7 @@ def _get_triangle_ml(self, df, preds=None):
         return Triangle(
             out, origin='origin', development='valuation',
             index=self._key_labels, columns=self._get_y_names(),
-            cumulative=not self.fit_incrementals).dropna()
+            cumulative=not self.fit_incrementals).dropna(), out
 
     def _prep_X_ml(self, X):
         """ Preps Triangle data ahead of the pipeline """
@@ -139,14 +146,25 @@ def _prep_X_ml(self, X):
         df_base = X.incr_to_cum().to_frame(
             keepdims=True, implicit_axis=True, origin_as_datetime=True
         ).reset_index().iloc[:, :-1]
-        df = df_base.merge(X.cum_to_incr().to_frame(
+        df = df_base.merge(X_.to_frame(
             keepdims=True, implicit_axis=True, origin_as_datetime=True
         ).reset_index(), how='left',
             on=list(df_base.columns)).fillna(0)
         df['origin'] = df['origin'].map(self.origin_encoder_)
         df['valuation'] = df['valuation'].map(self.valuation_encoder_)
         return df
 
+    def _prep_w_ml(self,X,sample_weight=None):
+        weight_base = (~np.isnan(X.values)).astype(float)
+        weight = weight_base.copy()
+        if self.drop is not None:
+            weight = weight * self._drop_func(X)
+        if self.drop_valuation is not None:
+            weight = weight * self._drop_valuation_func(X)
+        if sample_weight is not None:
+            weight = weight * sample_weight.values
+        return weight.flatten()[weight_base.flatten()>0]
+
     def fit(self, X, y=None, sample_weight=None):
         """Fit the model with X.
@@ -156,8 +174,8 @@ def fit(self, X, y=None, sample_weight=None):
             Set of LDFs to which the estimator will be applied.
         y : None
             Ignored, use y_ml to set a reponse variable for the ML algorithm
-        sample_weight : None
-            Ignored
+        sample_weight : Triangle-like
+            Weights to use in the regression
 
         Returns
         -------
@@ -178,10 +196,18 @@
             (pd.Series(val).rank()-1)/{'Y':1, 'S': 2, 'Q':4, 'M': 12}[X.development_grain]))
         df = self._prep_X_ml(X)
         self.df_ = df
+        weight = self._prep_w_ml(X,sample_weight)
+        self.weight_ = weight
+        if self.weighted_step == None:
+            sample_weights = {}
+        elif isinstance(self.weighted_step, list):
+            sample_weights = {x + '__sample_weight':weight for x in self.weighted_step}
+        else:
+            sample_weights = {self.weighted_step + '__sample_weight':weight}
         # Fit model
-        self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze())
+        self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze(),**sample_weights)
         #return selffit_incrementals
-        self.triangle_ml_ = self._get_triangle_ml(df)
+        self.triangle_ml_, self.predicted_data_ = self._get_triangle_ml(df)
         return self
 
     @property
@@ -206,9 +232,10 @@ def transform(self, X):
         X_new = X.copy()
         X_ml = self._prep_X_ml(X)
         y_ml=self.estimator_ml.predict(X_ml)
-        triangle_ml = self._get_triangle_ml(X_ml, y_ml)
+        triangle_ml, predicted_data = self._get_triangle_ml(X_ml, y_ml)
         backend = "cupy" if X.array_backend == "cupy" else "numpy"
         X_new.ldf_ = triangle_ml.incr_to_cum().link_ratio.set_backend(backend)
         X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL)
         X_new._set_slicers()
-        return X_new
+        X_new.predicted_data_ = predicted_data
+        return X_new
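
The sample_weights dict assembled in fit leans on scikit-learn's standard fit-parameter routing: keyword arguments of the form '<step name>__<param>' passed to Pipeline.fit are forwarded to the named step. A standalone sketch of that convention, independent of chainladder:

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

pipe = Pipeline(steps=[('model', LinearRegression())])
X = np.arange(10.0).reshape(-1, 1)
y = 2 * X.ravel()
w = np.ones(10)

# Equivalent to what DevelopmentML builds from weighted_step='model':
pipe.fit(X, y, **{'model' + '__sample_weight': w})
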
Lines changed: 40 additions & 2 deletions
@@ -1,9 +1,10 @@
 import numpy as np
 import chainladder as cl
 import pytest
+from chainladder.utils.utility_functions import PTF_formula
+abc = cl.load_sample('abc')
 
 def test_basic_bz():
-    abc = cl.load_sample('abc')
     assert np.all(
         np.around(cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(abc).coef_.T.values,3).flatten()
         == np.array([11.837,0.179,0.345,0.378,0.405,0.427,0.431,0.66,0.963,1.157,1.278,0.251,-0.056,-0.449,-0.829,-1.169,-1.508,-1.798,-2.023,-2.238,-2.428])
@@ -12,4 +13,41 @@ def test_basic_bz():
 def test_multiple_triangle_exception():
     d = cl.load_sample("usauto")
     with pytest.raises(ValueError):
-        cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(d)
+        cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(d)
+
+def test_drops():
+    '''
+    this function tests the passing in of a basic drop_valuation
+    '''
+    assert np.all(
+        np.around(cl.BarnettZehnwirth(formula='C(development)',drop_valuation='1979').fit_transform(abc).ldf_.values,3)
+        == np.around(cl.BarnettZehnwirth(formula='C(development)',drop = [('1977',36),('1978',24),('1979',12)]).fit_transform(abc).ldf_.values,3)
+    )
+    assert np.all(
+        np.around(cl.BarnettZehnwirth(formula='C(development)',drop_valuation='1979').fit(abc).triangle_ml_.values,3)
+        == np.around(cl.BarnettZehnwirth(formula='C(development)',drop = [('1977',36),('1978',24),('1979',12)]).fit(abc).triangle_ml_.values,3)
+    )
+    assert np.all(
+        np.around(cl.BarnettZehnwirth(formula='C(development)',drop_valuation='1979').fit(abc).ldf_.values,3)
+        == np.around(cl.BarnettZehnwirth(formula='C(development)',drop = [('1977',36),('1978',24),('1979',12)]).fit(abc).ldf_.values,3)
+    )
+
+
+def test_bz_2008():
+    '''
+    this function tests the drop parameter by recreating the example in the 2008 BZ paper, section 4.1
+    '''
+    exposure=np.array([[2.2], [2.4], [2.2], [2.0], [1.9], [1.6], [1.6], [1.8], [2.2], [2.5], [2.6]])
+    abc_adj = abc/exposure
+
+    origin_buckets = [(0,1),(2,2),(3,4),(5,10)]
+    dev_buckets = [(24,36),(36,48),(48,84),(84,108),(108,144)]
+    val_buckets = [(1,8),(8,9),(9,12)]
+
+    abc_formula = PTF_formula(abc_adj,alpha=origin_buckets,gamma=dev_buckets,iota=val_buckets)
+
+    model=cl.BarnettZehnwirth(formula=abc_formula, drop=('1982',72)).fit(abc_adj)
+    assert np.all(
+        np.around(model.coef_.values,4).flatten()
+        == np.array([11.1579,0.1989,0.0703,0.0919,0.1871,-0.3771,-0.4465,-0.3727,-0.3154,0.0432,0.0858,0.1464])
+    )
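
The equivalence asserted in test_drops follows from the diagonal identity of an annual triangle: valuation year = origin year + development months / 12 - 1, so dropping valuation '1979' zero-weights exactly the cells (1977, 36), (1978, 24), and (1979, 12). A quick check of that arithmetic:

# abc is an annual triangle whose origins start in 1977
for origin, dev in [(1977, 36), (1978, 24), (1979, 12)]:
    assert origin + dev // 12 - 1 == 1979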

chainladder/development/tests/test_glm.py

Lines changed: 6 additions & 0 deletions
@@ -5,3 +5,9 @@ def test_basic_odp_cl(genins):
         (cl.Chainladder().fit(genins).ultimate_ -
          cl.Chainladder().fit(cl.TweedieGLM().fit_transform(genins)).ultimate_) /
         genins.latest_diagonal).max()< 1e-2
+
+def test_sample_weight(genins):
+    assert abs(
+        (cl.Chainladder().fit(genins).ultimate_ -
+         cl.Chainladder().fit(cl.TweedieGLM().fit_transform(genins,sample_weight=genins/genins)).ultimate_) /
+        genins.latest_diagonal).max()< 1e-2
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+import chainladder as cl
+from sklearn.linear_model import LinearRegression
+from sklearn.pipeline import Pipeline
+from chainladder.utils.utility_functions import PatsyFormula
+
+def test_incremental(genins):
+    response = [genins.columns[0]]
+    model = cl.DevelopmentML(Pipeline(steps=[
+        ('design_matrix', PatsyFormula('C(development)')),
+        ('model', LinearRegression(fit_intercept=False))]),
+        y_ml=response,fit_incrementals=False).fit(genins)
+    assert abs(model.triangle_ml_.loc[:,:,'2010',:] - genins.mean()).max() < 1e2
+
+def test_misc(genins):
+    model = cl.DevelopmentML(Pipeline(steps=[
+        ('design_matrix', PatsyFormula('C(development)')),
+        ('model', LinearRegression(fit_intercept=False))]),
+        weighted_step = ['model'], fit_incrementals=False).fit(genins, sample_weight=genins/genins)
+    assert abs(model.triangle_ml_.loc[:,:,'2010',:] - genins.mean()).max() < 1e2

chainladder/utils/utility_functions.py

Lines changed: 26 additions & 0 deletions
@@ -767,3 +767,29 @@ def model_diagnostics(model, name=None, groupby=None):
         out.index = idx
         triangles.append(out)
     return concat(triangles, 0)
+
+
+def PTF_formula(tri: Triangle, alpha: ArrayLike = None, gamma: ArrayLike = None, iota: ArrayLike = None):
+    """ Helper function that builds a patsy formula string for the BarnettZehnwirth
+    estimator. Each axis's parameters can be grouped together. Groups of origin
+    parameters (alpha) are set equal, and are specified by ranges (inclusive).
+    Groups of development (gamma) and valuation (iota) parameters are fit to
+    separate linear trends, specified as tuples denoting ranges with shared endpoints.
+    In other words, development and valuation trends are fit to a piecewise linear model.
+    A triangle must be supplied to provide some critical information.
+    """
+    formula_parts=[]
+    if(alpha):
+        # The intercept term takes the place of the first alpha
+        for ind,a in enumerate(alpha):
+            if(a[0]==0):
+                alpha=alpha[:ind]+alpha[(ind+1):]
+        formula_parts += ['+'.join([f'I({x[0]} <= origin)' for x in alpha])]
+    if(gamma):
+        dgrain = min(tri.development)
+        formula_parts += ['+'.join([f'I((np.minimum({x[1]-dgrain},development) - np.minimum({x[0]-dgrain},development))/{dgrain})' for x in gamma])]
+    if(iota):
+        formula_parts += ['+'.join([f'I(np.minimum({x[1]-1},valuation) - np.minimum({x[0]-1},valuation))' for x in iota])]
+    if(formula_parts):
+        return '+'.join(formula_parts)
+    return ''
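
Tracing PTF_formula on a smaller bucket specification shows the kind of string it emits; abc is an annual triangle, so dgrain resolves to 12. The bucket choices below are illustrative, not taken from the commit:

import chainladder as cl
from chainladder.utils.utility_functions import PTF_formula

abc = cl.load_sample('abc')

formula = PTF_formula(
    abc,
    alpha=[(0, 1), (2, 10)],  # origins 0-1 fold into the intercept; 2-10 share a level shift
    gamma=[(24, 132)],        # one linear development trend from 24 to 132 months
    iota=[(1, 11)],           # one calendar-period trend
)
# formula ==
#   'I(2 <= origin)'
#   '+I((np.minimum(120,development) - np.minimum(12,development))/12)'
#   '+I(np.minimum(10,valuation) - np.minimum(0,valuation))'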
