Commit aa949dc

Merge pull request #643 from casact/BZ_w_drop

Enhancing BZ method to recreate BZ08

2 parents 749eb51 + 79231bf

File tree: 7 files changed, +157 −34 lines changed

chainladder/development/barnzehn.py

Lines changed: 10 additions & 10 deletions
@@ -22,6 +22,10 @@ class BarnettZehnwirth(TweedieGLM):
 
     Parameters
     ----------
+    drop: tuple or list of tuples
+        Drops specific origin/development combination(s)
+    drop_valuation: str or list of str (default = None)
+        Drops specific valuation periods. str must be date convertible.
     formula: formula-like
         A patsy formula describing the independent variables, X of the GLM
     response: str
@@ -31,7 +35,9 @@ class BarnettZehnwirth(TweedieGLM):
 
     """
 
-    def __init__(self, formula='C(origin) + development', response=None):
+    def __init__(self, drop=None,drop_valuation=None,formula='C(origin) + development', response=None):
+        self.drop = drop
+        self.drop_valuation = drop_valuation
         self.formula = formula
         self.response = response
 
@@ -40,17 +46,10 @@ def fit(self, X, y=None, sample_weight=None):
             raise ValueError("Only single index/column triangles are supported")
         tri = X.cum_to_incr().log()
         response = X.columns[0] if not self.response else self.response
-        # Check for more than one linear predictor
-        linearpredictors = 0
-        for term in ModelDesc.from_formula(self.formula).rhs_termlist[1:]:
-            if 'C(' not in term.factors[0].code:
-                linearpredictors += 1
-        if linearpredictors > 1:
-            warnings.warn("Using more than one linear predictor with BarnettZehnwirth may lead to issues with multicollinearity.")
         self.model_ = DevelopmentML(Pipeline(steps=[
             ('design_matrix', PatsyFormula(self.formula)),
             ('model', LinearRegression(fit_intercept=False))]),
-            y_ml=response, fit_incrementals=False).fit(tri)
+            y_ml=response, fit_incrementals=True, drop=self.drop, drop_valuation = self.drop_valuation, weighted_step = 'model').fit(X = tri, sample_weight = sample_weight)
         resid = tri - self.model_.triangle_ml_[
             self.model_.triangle_ml_.valuation <= tri.valuation_date]
         self.mse_resid_ = (resid**2).sum(0).sum(1).sum(2).sum() / (
@@ -77,10 +76,11 @@ def transform(self, X):
         X_new = X.copy()
         X_ml = self.model_._prep_X_ml(X.cum_to_incr().log())
         y_ml = self.model_.estimator_ml.predict(X_ml)
-        triangle_ml = self.model_._get_triangle_ml(X_ml, y_ml)
+        triangle_ml, predicted_data = self.model_._get_triangle_ml(X_ml, y_ml)
         backend = "cupy" if X.array_backend == "cupy" else "numpy"
         triangle_ml.is_cumulative = False
         X_new.ldf_ = triangle_ml.exp().incr_to_cum().link_ratio.set_backend(backend)
         X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL)
         X_new._set_slicers()
+        X_new.predicted_data_ = predicted_data
         return X_new
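
Taken together, these changes let BarnettZehnwirth accept the same drop/drop_valuation arguments as other development estimators and thread them (plus any sample_weight) into the weighted regression inside DevelopmentML. A minimal usage sketch, assuming only the API shown in this diff (the abc sample triangle is used purely for illustration):

import chainladder as cl

abc = cl.load_sample('abc')

# Zero-weight one (origin, development) cell plus the whole 1979 calendar
# diagonal; the model is otherwise fit as before on the log-incremental
# triangle.
model = cl.BarnettZehnwirth(
    formula='C(origin) + development',
    drop=[('1977', 36)],
    drop_valuation='1979',
).fit(abc)

print(model.coef_)  # fitted regression coefficients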

chainladder/development/glm.py

Lines changed: 16 additions & 9 deletions
@@ -22,15 +22,16 @@ class TweedieGLM(DevelopmentBase):
 
     Parameters
     ----------
+    drop: tuple or list of tuples
+        Drops specific origin/development combination(s)
+    drop_valuation: str or list of str (default = None)
+        Drops specific valuation periods. str must be date convertible.
     design_matrix: formula-like
         A patsy formula describing the independent variables, X of the GLM
     response: str
         Column name for the reponse variable of the GLM. If ommitted, then the
         first column of the Triangle will be used.
-    weight: str
-        Column name of any weight to use in the GLM. If none specified, then an
-        unweighted regression will be performed.
-    power: float, default=0
+    power: float, default=1
         The power determines the underlying target distribution according
         to the following table:
         +-------+------------------------+
@@ -52,7 +53,7 @@ class TweedieGLM(DevelopmentBase):
         regularization strength. ``alpha = 0`` is equivalent to unpenalized
         GLMs. In this case, the design matrix `X` must have full column rank
         (no collinearities).
-    link: {'auto', 'identity', 'log'}, default='auto'
+    link: {'auto', 'identity', 'log'}, default='log'
         The link function of the GLM, i.e. mapping from linear predictor
         `X @ coeff + intercept` to prediction `y_pred`. Option 'auto' sets
         the link depending on the chosen family as follows:
@@ -78,10 +79,11 @@ class TweedieGLM(DevelopmentBase):
     """
 
     def __init__(self, design_matrix='C(development) + C(origin)',
-                 response=None, weight=None, power=1.0, alpha=1.0, link='log',
-                 max_iter=100, tol=0.0001, warm_start=False, verbose=0):
+                 response=None, power=1.0, alpha=1.0, link='log',
+                 max_iter=100, tol=0.0001, warm_start=False, verbose=0, drop=None,drop_valuation=None):
+        self.drop = drop
+        self.drop_valuation = drop_valuation
         self.response=response
-        self.weight=weight
         self.design_matrix = design_matrix
         self.power=power
         self.alpha=alpha
@@ -93,13 +95,18 @@ def __init__(self, design_matrix='C(development) + C(origin)',
 
     def fit(self, X, y=None, sample_weight=None):
        response = X.columns[0] if not self.response else self.response
+        if sample_weight is None:
+            weight = None
+        else:
+            weight = 'model'
        self.model_ = DevelopmentML(Pipeline(steps=[
            ('design_matrix', PatsyFormula(self.design_matrix)),
            ('model', TweedieRegressor(
                link=self.link, power=self.power, max_iter=self.max_iter,
                tol=self.tol, warm_start=self.warm_start,
                verbose=self.verbose, fit_intercept=False))]),
-            y_ml=response, weight_ml=self.weight).fit(X)
+            y_ml=response, weighted_step = weight,
+            drop=self.drop, drop_valuation=self.drop_valuation).fit(X = X, sample_weight = sample_weight)
        return self
 
    @property
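
With the old weight: str parameter removed, weights are now supplied at fit time as a Triangle and routed into the TweedieRegressor step. A sketch of the new calling convention, mirroring test_sample_weight further down (genins/genins simply assigns unit weight to every observed cell, so the weighted fit should reproduce the unweighted one):

import chainladder as cl

genins = cl.load_sample('genins')

unit_weights = genins / genins  # Triangle-like weights, all ones
dev = cl.TweedieGLM().fit_transform(genins, sample_weight=unit_weights)
ultimate = cl.Chainladder().fit(dev).ultimate_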

chainladder/development/learning.py

Lines changed: 40 additions & 13 deletions
@@ -33,9 +33,14 @@ class DevelopmentML(DevelopmentBase):
         Time Series aspects of the model. Predictions from one development period
         get used as featues in the next development period. Lags should be negative
         integers.
+    weight_step: str
+        Step name within estimator_ml that is weighted
+    drop: tuple or list of tuples
+        Drops specific origin/development combination(s)
+    drop_valuation: str or list of str (default = None)
+        Drops specific valuation periods. str must be date convertible.
     fit_incrementals:
-        Whether the response variable should be converted to an incremental basis
-        for fitting.
+        Whether the response variable should be converted to an incremental basis for fitting.
 
     Attributes
     ----------
@@ -48,11 +53,13 @@
     """
 
     def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False,
-                 weight_ml=None, fit_incrementals=True):
+                 weighted_step=None,drop=None,drop_valuation=None,fit_incrementals=True):
         self.estimator_ml=estimator_ml
         self.y_ml=y_ml
-        self.weight_ml = weight_ml
-        self.autoregressive=autoregressive
+        self.weighted_step = weighted_step
+        self.autoregressive = autoregressive
+        self.drop = drop
+        self.drop_valuation = drop_valuation
         self.fit_incrementals = fit_incrementals
 
     def _get_y_names(self):
@@ -124,7 +131,7 @@ def _get_triangle_ml(self, df, preds=None):
         return Triangle(
             out, origin='origin', development='valuation',
             index=self._key_labels, columns=self._get_y_names(),
-            cumulative=not self.fit_incrementals).dropna()
+            cumulative=not self.fit_incrementals).dropna(), out
 
     def _prep_X_ml(self, X):
         """ Preps Triangle data ahead of the pipeline """
@@ -139,14 +146,25 @@ def _prep_X_ml(self, X):
         df_base = X.incr_to_cum().to_frame(
             keepdims=True, implicit_axis=True, origin_as_datetime=True
         ).reset_index().iloc[:, :-1]
-        df = df_base.merge(X.cum_to_incr().to_frame(
+        df = df_base.merge(X_.to_frame(
             keepdims=True, implicit_axis=True, origin_as_datetime=True
         ).reset_index(), how='left',
             on=list(df_base.columns)).fillna(0)
         df['origin'] = df['origin'].map(self.origin_encoder_)
         df['valuation'] = df['valuation'].map(self.valuation_encoder_)
         return df
 
+    def _prep_w_ml(self,X,sample_weight=None):
+        weight_base = (~np.isnan(X.values)).astype(float)
+        weight = weight_base.copy()
+        if self.drop is not None:
+            weight = weight * self._drop_func(X)
+        if self.drop_valuation is not None:
+            weight = weight * self._drop_valuation_func(X)
+        if sample_weight is not None:
+            weight = weight * sample_weight.values
+        return weight.flatten()[weight_base.flatten()>0]
+
     def fit(self, X, y=None, sample_weight=None):
         """Fit the model with X.
@@ -156,8 +174,8 @@ def fit(self, X, y=None, sample_weight=None):
             Set of LDFs to which the estimator will be applied.
         y : None
             Ignored, use y_ml to set a reponse variable for the ML algorithm
-        sample_weight : None
-            Ignored
+        sample_weight : Triangle-like
+            Weights to use in the regression
 
         Returns
         -------
@@ -178,10 +196,18 @@
             (pd.Series(val).rank()-1)/{'Y':1, 'S': 2, 'Q':4, 'M': 12}[X.development_grain]))
         df = self._prep_X_ml(X)
         self.df_ = df
+        weight = self._prep_w_ml(X,sample_weight)
+        self.weight_ = weight
+        if self.weighted_step == None:
+            sample_weights = {}
+        elif isinstance(self.weighted_step, list):
+            sample_weights = {x + '__sample_weight':weight for x in self.weighted_step}
+        else:
+            sample_weights = {self.weighted_step + '__sample_weight':weight}
         # Fit model
-        self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze())
+        self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze(),**sample_weights)
         #return selffit_incrementals
-        self.triangle_ml_ = self._get_triangle_ml(df)
+        self.triangle_ml_, self.predicted_data_ = self._get_triangle_ml(df)
         return self
 
     @property
@@ -206,9 +232,10 @@ def transform(self, X):
         X_new = X.copy()
         X_ml = self._prep_X_ml(X)
         y_ml=self.estimator_ml.predict(X_ml)
-        triangle_ml = self._get_triangle_ml(X_ml, y_ml)
+        triangle_ml, predicted_data = self._get_triangle_ml(X_ml, y_ml)
         backend = "cupy" if X.array_backend == "cupy" else "numpy"
         X_new.ldf_ = triangle_ml.incr_to_cum().link_ratio.set_backend(backend)
         X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL)
         X_new._set_slicers()
-        return X_new
+        X_new.predicted_data_ = predicted_data
+        return X_new
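
The sample_weights dict assembled in fit leans on scikit-learn's standard fit-parameter routing: keyword arguments of the form '<step name>__<param>' passed to Pipeline.fit are forwarded to the named step. A standalone sketch of that convention, independent of chainladder:

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

pipe = Pipeline(steps=[('model', LinearRegression())])
X = np.arange(10.0).reshape(-1, 1)
y = 2 * X.ravel()
w = np.ones(10)

# Equivalent to what DevelopmentML builds from weighted_step='model':
pipe.fit(X, y, **{'model' + '__sample_weight': w})
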
Lines changed: 40 additions & 2 deletions
@@ -1,9 +1,10 @@
 import numpy as np
 import chainladder as cl
 import pytest
+from chainladder.utils.utility_functions import PTF_formula
+abc = cl.load_sample('abc')
 
 def test_basic_bz():
-    abc = cl.load_sample('abc')
     assert np.all(
         np.around(cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(abc).coef_.T.values,3).flatten()
         == np.array([11.837,0.179,0.345,0.378,0.405,0.427,0.431,0.66,0.963,1.157,1.278,0.251,-0.056,-0.449,-0.829,-1.169,-1.508,-1.798,-2.023,-2.238,-2.428])
@@ -12,4 +13,41 @@ def test_basic_bz():
 def test_multiple_triangle_exception():
     d = cl.load_sample("usauto")
     with pytest.raises(ValueError):
-        cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(d)
+        cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(d)
+
+def test_drops():
+    '''
+    this function tests the passing in of a basic drop_valuation
+    '''
+    assert np.all(
+        np.around(cl.BarnettZehnwirth(formula='C(development)',drop_valuation='1979').fit_transform(abc).ldf_.values,3)
+        == np.around(cl.BarnettZehnwirth(formula='C(development)',drop = [('1977',36),('1978',24),('1979',12)]).fit_transform(abc).ldf_.values,3)
+    )
+    assert np.all(
+        np.around(cl.BarnettZehnwirth(formula='C(development)',drop_valuation='1979').fit(abc).triangle_ml_.values,3)
+        == np.around(cl.BarnettZehnwirth(formula='C(development)',drop = [('1977',36),('1978',24),('1979',12)]).fit(abc).triangle_ml_.values,3)
+    )
+    assert np.all(
+        np.around(cl.BarnettZehnwirth(formula='C(development)',drop_valuation='1979').fit(abc).ldf_.values,3)
+        == np.around(cl.BarnettZehnwirth(formula='C(development)',drop = [('1977',36),('1978',24),('1979',12)]).fit(abc).ldf_.values,3)
+    )
+
+
+def test_bz_2008():
+    '''
+    this function tests the drop parameter by recreating the example in the 2008 BZ paper, section 4.1
+    '''
+    exposure=np.array([[2.2], [2.4], [2.2], [2.0], [1.9], [1.6], [1.6], [1.8], [2.2], [2.5], [2.6]])
+    abc_adj = abc/exposure
+
+    origin_buckets = [(0,1),(2,2),(3,4),(5,10)]
+    dev_buckets = [(24,36),(36,48),(48,84),(84,108),(108,144)]
+    val_buckets = [(1,8),(8,9),(9,12)]
+
+    abc_formula = PTF_formula(abc_adj,alpha=origin_buckets,gamma=dev_buckets,iota=val_buckets)
+
+    model=cl.BarnettZehnwirth(formula=abc_formula, drop=('1982',72)).fit(abc_adj)
+    assert np.all(
+        np.around(model.coef_.values,4).flatten()
+        == np.array([11.1579,0.1989,0.0703,0.0919,0.1871,-0.3771,-0.4465,-0.3727,-0.3154,0.0432,0.0858,0.1464])
+    )
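
The equivalence asserted in test_drops follows from the diagonal identity of an annual triangle: valuation year = origin year + development months / 12 - 1, so dropping valuation '1979' zero-weights exactly the cells (1977, 36), (1978, 24), and (1979, 12). A quick check of that arithmetic:

# abc is an annual triangle whose origins start in 1977
for origin, dev in [(1977, 36), (1978, 24), (1979, 12)]:
    assert origin + dev // 12 - 1 == 1979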

chainladder/development/tests/test_glm.py

Lines changed: 6 additions & 0 deletions
@@ -5,3 +5,9 @@ def test_basic_odp_cl(genins):
         (cl.Chainladder().fit(genins).ultimate_ -
          cl.Chainladder().fit(cl.TweedieGLM().fit_transform(genins)).ultimate_) /
         genins.latest_diagonal).max()< 1e-2
+
+def test_sample_weight(genins):
+    assert abs(
+        (cl.Chainladder().fit(genins).ultimate_ -
+         cl.Chainladder().fit(cl.TweedieGLM().fit_transform(genins,sample_weight=genins/genins)).ultimate_) /
+        genins.latest_diagonal).max()< 1e-2
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+import chainladder as cl
+from sklearn.linear_model import LinearRegression
+from sklearn.pipeline import Pipeline
+from chainladder.utils.utility_functions import PatsyFormula
+
+def test_incremental(genins):
+    response = [genins.columns[0]]
+    model = cl.DevelopmentML(Pipeline(steps=[
+        ('design_matrix', PatsyFormula('C(development)')),
+        ('model', LinearRegression(fit_intercept=False))]),
+        y_ml=response,fit_incrementals=False).fit(genins)
+    assert abs(model.triangle_ml_.loc[:,:,'2010',:] - genins.mean()).max() < 1e2
+
+def test_misc(genins):
+    model = cl.DevelopmentML(Pipeline(steps=[
+        ('design_matrix', PatsyFormula('C(development)')),
+        ('model', LinearRegression(fit_intercept=False))]),
+        weighted_step = ['model'], fit_incrementals=False).fit(genins, sample_weight=genins/genins)
+    assert abs(model.triangle_ml_.loc[:,:,'2010',:] - genins.mean()).max() < 1e2

chainladder/utils/utility_functions.py

Lines changed: 26 additions & 0 deletions
@@ -767,3 +767,29 @@ def model_diagnostics(model, name=None, groupby=None):
         out.index = idx
         triangles.append(out)
     return concat(triangles, 0)
+
+
+def PTF_formula(tri: Triangle, alpha: ArrayLike = None, gamma: ArrayLike = None, iota: ArrayLike = None):
+    """ Helper function that builds a patsy formula string for the BarnettZehnwirth
+    estimator. Each axis's parameters can be grouped together. Groups of origin
+    parameters (alpha) are set equal, and are specified by ranges (inclusive).
+    Groups of development (gamma) and valuation (iota) parameters are fit to
+    separate linear trends, specified as tuples denoting ranges with shared endpoints.
+    In other words, development and valuation trends are fit to a piecewise linear model.
+    A triangle must be supplied to provide some critical information.
+    """
+    formula_parts=[]
+    if(alpha):
+        # The intercept term takes the place of the first alpha
+        for ind,a in enumerate(alpha):
+            if(a[0]==0):
+                alpha=alpha[:ind]+alpha[(ind+1):]
+        formula_parts += ['+'.join([f'I({x[0]} <= origin)' for x in alpha])]
+    if(gamma):
+        dgrain = min(tri.development)
+        formula_parts += ['+'.join([f'I((np.minimum({x[1]-dgrain},development) - np.minimum({x[0]-dgrain},development))/{dgrain})' for x in gamma])]
+    if(iota):
+        formula_parts += ['+'.join([f'I(np.minimum({x[1]-1},valuation) - np.minimum({x[0]-1},valuation))' for x in iota])]
+    if(formula_parts):
+        return '+'.join(formula_parts)
+    return ''
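
Tracing PTF_formula on a smaller bucket specification shows the kind of string it emits; abc is an annual triangle, so dgrain resolves to 12. The bucket choices below are illustrative, not taken from the commit:

import chainladder as cl
from chainladder.utils.utility_functions import PTF_formula

abc = cl.load_sample('abc')

formula = PTF_formula(
    abc,
    alpha=[(0, 1), (2, 10)],  # origins 0-1 fold into the intercept; 2-10 share a level shift
    gamma=[(24, 132)],        # one linear development trend from 24 to 132 months
    iota=[(1, 11)],           # one calendar-period trend
)
# formula ==
#   'I(2 <= origin)'
#   '+I((np.minimum(120,development) - np.minimum(12,development))/12)'
#   '+I(np.minimum(10,valuation) - np.minimum(0,valuation))'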
