Skip to content

Commit a219c24

Browse files
committed
Adding sample weight to bz and parent
Sample weights enable dropping specific points from fitting, which is essential for recreating BZ results.
1 parent 15039de commit a219c24

File tree

3 files changed

+70
-16
lines changed

3 files changed

+70
-16
lines changed

chainladder/development/barnzehn.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@ class BarnettZehnwirth(TweedieGLM):
2222
2323
Parameters
2424
----------
25+
drop: tuple or list of tuples
26+
Drops specific origin/development combination(s)
27+
drop_valuation: str or list of str (default = None)
28+
Drops specific valuation periods. str must be date convertible.
2529
formula: formula-like
2630
A patsy formula describing the independent variables, X of the GLM
2731
feat_eng: dict
@@ -49,7 +53,9 @@ def test_func(df)
4953
5054
"""
5155

52-
def __init__(self, formula='C(origin) + development', feat_eng=None, response=None):
56+
def __init__(self, drop=None,drop_valuation=None,formula='C(origin) + development', feat_eng=None, response=None):
57+
self.drop = drop
58+
self.drop_valuation = drop_valuation
5359
self.formula = formula
5460
self.response = response
5561
self.feat_eng = feat_eng
@@ -69,7 +75,7 @@ def fit(self, X, y=None, sample_weight=None):
6975
self.model_ = DevelopmentML(Pipeline(steps=[
7076
('design_matrix', PatsyFormula(self.formula)),
7177
('model', LinearRegression(fit_intercept=False))]),
72-
y_ml=response, fit_incrementals=False, feat_eng = self.feat_eng).fit(tri)
78+
y_ml=response, fit_incrementals=False, feat_eng = self.feat_eng, drop=self.drop, drop_valuation = self.drop_valuation, weighted_step = 'model').fit(tri)
7379
resid = tri - self.model_.triangle_ml_[
7480
self.model_.triangle_ml_.valuation <= tri.valuation_date]
7581
self.mse_resid_ = (resid**2).sum(0).sum(1).sum(2).sum() / (
@@ -94,12 +100,13 @@ def transform(self, X):
94100
X_new : New triangle with transformed attributes.
95101
"""
96102
X_new = X.copy()
97-
X_ml = self.model_._prep_X_ml(X.cum_to_incr().log())
103+
X_ml, weight_ml = self.model_._prep_X_ml(X.cum_to_incr().log())
98104
y_ml = self.model_.estimator_ml.predict(X_ml)
99-
triangle_ml = self.model_._get_triangle_ml(X_ml, y_ml)
105+
triangle_ml, predicted_data = self.model_._get_triangle_ml(X_ml, y_ml)
100106
backend = "cupy" if X.array_backend == "cupy" else "numpy"
101107
triangle_ml.is_cumulative = False
102108
X_new.ldf_ = triangle_ml.exp().incr_to_cum().link_ratio.set_backend(backend)
103109
X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL)
104110
X_new._set_slicers()
111+
X_new.predicted_data_ = predicted_data
105112
return X_new

chainladder/development/learning.py

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,10 @@ class DevelopmentML(DevelopmentBase):
3333
Time Series aspects of the model. Predictions from one development period
3434
get used as features in the next development period. Lags should be negative
3535
integers.
36+
drop: tuple or list of tuples
37+
Drops specific origin/development combination(s)
38+
drop_valuation: str or list of str (default = None)
39+
Drops specific valuation periods. str must be date convertible.
3640
feat_eng: dict
3741
A dictionary with feature names as keys and a dictionary of function (with a key of 'func') and keyword arguments (with a key of 'kwargs')
3842
(e.g. {
@@ -66,11 +70,14 @@ def test_func(df)
6670
"""
6771

6872
def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False,
69-
weight_ml=None, fit_incrementals=True, feat_eng=None):
73+
weight_ml=None, weighted_step=None,drop=None,drop_valuation=None,fit_incrementals=True, feat_eng=None):
7074
self.estimator_ml=estimator_ml
7175
self.y_ml=y_ml
7276
self.weight_ml = weight_ml
73-
self.autoregressive=autoregressive
77+
self.weighted_step = weighted_step
78+
self.autoregressive = autoregressive
79+
self.drop = drop
80+
self.drop_valuation = drop_valuation
7481
self.fit_incrementals = fit_incrementals
7582
self.feat_eng = feat_eng
7683

@@ -146,7 +153,7 @@ def _get_triangle_ml(self, df, preds=None):
146153
return Triangle(
147154
out, origin='origin', development='valuation',
148155
index=self._key_labels, columns=self._get_y_names(),
149-
cumulative=not self.fit_incrementals).dropna()
156+
cumulative=not self.fit_incrementals).dropna(), out
150157

151158
def _prep_X_ml(self, X):
152159
""" Preps Triangle data ahead of the pipeline """
@@ -170,7 +177,13 @@ def _prep_X_ml(self, X):
170177
if self.feat_eng is not None:
171178
for key, item in self.feat_eng.items():
172179
df[key] = item['func'](df=df,**item['kwargs'])
173-
return df
180+
weight_base = (~np.isnan(X.values)).astype(float)
181+
weight = weight_base.copy()
182+
if self.drop is not None:
183+
weight = weight * self._drop_func(X)
184+
if self.drop_valuation is not None:
185+
weight = weight * self._drop_valuation_func(X)
186+
return df, weight.flatten()[weight_base.flatten()>0]
174187

175188
def fit(self, X, y=None, sample_weight=None):
176189
"""Fit the model with X.
@@ -201,12 +214,19 @@ def fit(self, X, y=None, sample_weight=None):
201214
self.valuation_encoder_ = dict(zip(
202215
val,
203216
(pd.Series(val).rank()-1)/{'Y':1, 'S': 2, 'Q':4, 'M': 12}[X.development_grain]))
204-
df = self._prep_X_ml(X)
217+
df, weight = self._prep_X_ml(X)
205218
self.df_ = df
219+
self.weight_ = weight
220+
if self.weighted_step == None:
221+
sample_weights = {}
222+
elif isinstance(self.weighted_step, list):
223+
sample_weights = {x + '__sample_weight':weight for x in self.weighted_step}
224+
else:
225+
sample_weights = {self.weighted_step + '__sample_weight':weight}
206226
# Fit model
207-
self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze())
227+
self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze(),**sample_weights)
208228
#return selffit_incrementals
209-
self.triangle_ml_ = self._get_triangle_ml(df)
229+
self.triangle_ml_, self.predicted_data_ = self._get_triangle_ml(df)
210230
return self
211231

212232
@property
@@ -229,11 +249,12 @@ def transform(self, X):
229249
X_new : New triangle with transformed attributes.
230250
"""
231251
X_new = X.copy()
232-
X_ml = self._prep_X_ml(X)
252+
X_ml, weight_ml = self._prep_X_ml(X)
233253
y_ml=self.estimator_ml.predict(X_ml)
234-
triangle_ml = self._get_triangle_ml(X_ml, y_ml)
254+
triangle_ml, predicted_data = self._get_triangle_ml(X_ml, y_ml)
235255
backend = "cupy" if X.array_backend == "cupy" else "numpy"
236256
X_new.ldf_ = triangle_ml.incr_to_cum().link_ratio.set_backend(backend)
237257
X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL)
238258
X_new._set_slicers()
239-
return X_new
259+
X_new.predicted_data_ = predicted_data
260+
return X_new

chainladder/development/tests/test_barnzehn.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,33 @@ def origin_onehot(df,ori):
4444
np.around(cl.BarnettZehnwirth(formula='+'.join([f'C({x})' for x in feat_dict.keys()]),feat_eng = feat_dict).fit(abc).ldf_.values,3)
4545
== np.around(cl.BarnettZehnwirth(formula='C(origin)').fit_transform(abc).ldf_.values,3)
4646
)
47+
48+
def test_bz_2008():
49+
'''
50+
this function tests the drop parameter by recreating the example in the 2008 BZ paper, section 4.1
51+
'''
52+
abc = cl.load_sample('abc')
53+
exposure=np.array([[2.2], [2.4], [2.2], [2.0], [1.9], [1.6], [1.6], [1.8], [2.2], [2.5], [2.6]])
54+
abc_adj = abc/exposure
55+
56+
def predictor_bins(df,pbin,axis):
57+
return [int(x >= min(pbin)) for x in df[axis]]
58+
59+
origin_groups = {f'origin_{ori}'.replace('[','').replace(']','').replace(', ',''):{'func':predictor_bins,'kwargs':{'pbin':ori,'axis':'origin'}} for ori in [[2],[3,4],[5,6,7,8,9,10]]}
60+
61+
def trend_piece(df,piece,axis):
62+
pmax = float(max(piece))
63+
increment=min(df[axis][df[axis]>0])
64+
pfirst = piece[0]-increment
65+
return [(x-pfirst)/increment if x in piece else (0 if x<pmax else (pmax-pfirst)/increment) for x in df[axis]]
66+
67+
development_groups = {f'development_{dev}'.replace('[','').replace(']','').replace(', ',''):{'func':trend_piece,'kwargs':{'piece':dev,'axis':'development'}} for dev in [[24],[36],[48,60,72],[84,96],[108,120,132]]}
68+
69+
valuation_groups = {f'valuation_{val}'.replace('[','').replace(']','').replace(', ',''):{'func':trend_piece,'kwargs':{'piece':val,'axis':'valuation'}} for val in [[1,2,3,4,5,6,7],[8],[9,10]]}
70+
71+
abc_dict = {**origin_groups,**development_groups,**valuation_groups}
72+
model=cl.BarnettZehnwirth(formula='+'.join([z for z in abc_dict.keys()]),feat_eng=abc_dict, drop=('1982',72)).fit(abc_adj)
4773
assert np.all(
48-
np.around(cl.BarnettZehnwirth(formula='+'.join([f'C({x})' for x in feat_dict.keys()]),feat_eng = feat_dict).fit(abc).ldf_.values,3)
49-
== np.around(cl.BarnettZehnwirth(formula='C(origin)').fit_transform(abc).ldf_.values,3)
74+
np.around(model.coef_.values,4).flatten()
75+
== np.array([11.1579,0.1989,0.0703,0.0919,0.1871,-0.3771,-0.4465,-0.3727,-0.3154,0.0432,0.0858,0.1464])
5076
)

0 commit comments

Comments
 (0)