Skip to content

Commit ae858c2

Browse files
authored
Merge pull request #635 from henrydingliu/master
Fully resolving #623
2 parents d4b2c42 + a219c24 commit ae858c2

File tree

3 files changed

+148
-15
lines changed

3 files changed

+148
-15
lines changed

chainladder/development/barnzehn.py

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,18 +22,43 @@ class BarnettZehnwirth(TweedieGLM):
2222
2323
Parameters
2424
----------
25+
drop: tuple or list of tuples
26+
Drops specific origin/development combination(s)
27+
drop_valuation: str or list of str (default = None)
28+
Drops specific valuation periods. str must be date convertible.
2529
formula: formula-like
2630
A patsy formula describing the independent variables, X of the GLM
31+
feat_eng: dict
32+
A dictionary mapping each feature name to a dictionary that holds the function (under the key 'func') and its keyword arguments (under the key 'kwargs')
33+
(e.g. {
34+
'feature_1':{
35+
'func': function_name for feature 1,
36+
'kwargs': keyword arguments for the function
37+
},
38+
'feature_2':{
39+
'func': function_name for feature 2,
40+
'kwargs': keyword arguments for the function
41+
}
42+
}
43+
);
44+
functions should be written with an input DataFrame named df; this is the DataFrame containing origin, development, and valuation that will be passed into the function at run time
45+
(e.g. this function adds 1 to every origin
46+
def test_func(df):
47+
return df['origin'] + 1
48+
)
2749
response: str
2850
Column name for the response variable of the GLM. If omitted, then the
2951
first column of the Triangle will be used.
3052
3153
3254
"""
3355

34-
def __init__(self, formula='C(origin) + development', response=None):
56+
def __init__(self, drop=None,drop_valuation=None,formula='C(origin) + development', feat_eng=None, response=None):
57+
self.drop = drop
58+
self.drop_valuation = drop_valuation
3559
self.formula = formula
3660
self.response = response
61+
self.feat_eng = feat_eng
3762

3863
def fit(self, X, y=None, sample_weight=None):
3964
if max(X.shape[:2]) > 1:
@@ -50,7 +75,7 @@ def fit(self, X, y=None, sample_weight=None):
5075
self.model_ = DevelopmentML(Pipeline(steps=[
5176
('design_matrix', PatsyFormula(self.formula)),
5277
('model', LinearRegression(fit_intercept=False))]),
53-
y_ml=response, fit_incrementals=False).fit(tri)
78+
y_ml=response, fit_incrementals=False, feat_eng = self.feat_eng, drop=self.drop, drop_valuation = self.drop_valuation, weighted_step = 'model').fit(tri)
5479
resid = tri - self.model_.triangle_ml_[
5580
self.model_.triangle_ml_.valuation <= tri.valuation_date]
5681
self.mse_resid_ = (resid**2).sum(0).sum(1).sum(2).sum() / (
@@ -75,12 +100,13 @@ def transform(self, X):
75100
X_new : New triangle with transformed attributes.
76101
"""
77102
X_new = X.copy()
78-
X_ml = self.model_._prep_X_ml(X.cum_to_incr().log())
103+
X_ml, weight_ml = self.model_._prep_X_ml(X.cum_to_incr().log())
79104
y_ml = self.model_.estimator_ml.predict(X_ml)
80-
triangle_ml = self.model_._get_triangle_ml(X_ml, y_ml)
105+
triangle_ml, predicted_data = self.model_._get_triangle_ml(X_ml, y_ml)
81106
backend = "cupy" if X.array_backend == "cupy" else "numpy"
82107
triangle_ml.is_cumulative = False
83108
X_new.ldf_ = triangle_ml.exp().incr_to_cum().link_ratio.set_backend(backend)
84109
X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL)
85110
X_new._set_slicers()
111+
X_new.predicted_data_ = predicted_data
86112
return X_new

chainladder/development/learning.py

Lines changed: 56 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,28 @@ class DevelopmentML(DevelopmentBase):
3333
Time Series aspects of the model. Predictions from one development period
3434
get used as features in the next development period. Lags should be negative
3535
integers.
36+
drop: tuple or list of tuples
37+
Drops specific origin/development combination(s)
38+
drop_valuation: str or list of str (default = None)
39+
Drops specific valuation periods. str must be date convertible.
40+
feat_eng: dict
41+
A dictionary mapping each feature name to a dictionary that holds the function (under the key 'func') and its keyword arguments (under the key 'kwargs')
42+
(e.g. {
43+
'feature_1':{
44+
'func': function_name for feature 1,
45+
'kwargs': keyword arguments for the function
46+
},
47+
'feature_2':{
48+
'func': function_name for feature 2,
49+
'kwargs': keyword arguments for the function
50+
}
51+
}
52+
);
53+
functions should be written with an input DataFrame named df; this is the DataFrame containing origin, development, and valuation that will be passed into the function at run time
54+
(e.g. this function adds 1 to every origin
55+
def test_func(df):
56+
return df['origin'] + 1
57+
)
3658
fit_incrementals:
3759
Whether the response variable should be converted to an incremental basis
3860
for fitting.
@@ -48,12 +70,16 @@ class DevelopmentML(DevelopmentBase):
4870
"""
4971

5072
def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False,
51-
weight_ml=None, fit_incrementals=True):
73+
weight_ml=None, weighted_step=None,drop=None,drop_valuation=None,fit_incrementals=True, feat_eng=None):
5274
self.estimator_ml=estimator_ml
5375
self.y_ml=y_ml
5476
self.weight_ml = weight_ml
55-
self.autoregressive=autoregressive
77+
self.weighted_step = weighted_step
78+
self.autoregressive = autoregressive
79+
self.drop = drop
80+
self.drop_valuation = drop_valuation
5681
self.fit_incrementals = fit_incrementals
82+
self.feat_eng = feat_eng
5783

5884
def _get_y_names(self):
5985
""" private function to get the response column name"""
@@ -112,6 +138,9 @@ def _get_triangle_ml(self, df, preds=None):
112138
if len(out) == 0:
113139
continue
114140
X_r.append(out.copy())
141+
if self.feat_eng is not None:
142+
for key, item in self.feat_eng.items():
143+
out[key] = item['func'](df=out,**item['kwargs'])
115144
preds = self.estimator_ml.predict(out)
116145
y_r.append(preds.copy())
117146
X_r = pd.concat(X_r, axis=0).reset_index(drop=True)
@@ -124,7 +153,7 @@ def _get_triangle_ml(self, df, preds=None):
124153
return Triangle(
125154
out, origin='origin', development='valuation',
126155
index=self._key_labels, columns=self._get_y_names(),
127-
cumulative=not self.fit_incrementals).dropna()
156+
cumulative=not self.fit_incrementals).dropna(), out
128157

129158
def _prep_X_ml(self, X):
130159
""" Preps Triangle data ahead of the pipeline """
@@ -145,7 +174,16 @@ def _prep_X_ml(self, X):
145174
on=list(df_base.columns)).fillna(0)
146175
df['origin'] = df['origin'].map(self.origin_encoder_)
147176
df['valuation'] = df['valuation'].map(self.valuation_encoder_)
148-
return df
177+
if self.feat_eng is not None:
178+
for key, item in self.feat_eng.items():
179+
df[key] = item['func'](df=df,**item['kwargs'])
180+
weight_base = (~np.isnan(X.values)).astype(float)
181+
weight = weight_base.copy()
182+
if self.drop is not None:
183+
weight = weight * self._drop_func(X)
184+
if self.drop_valuation is not None:
185+
weight = weight * self._drop_valuation_func(X)
186+
return df, weight.flatten()[weight_base.flatten()>0]
149187

150188
def fit(self, X, y=None, sample_weight=None):
151189
"""Fit the model with X.
@@ -176,12 +214,19 @@ def fit(self, X, y=None, sample_weight=None):
176214
self.valuation_encoder_ = dict(zip(
177215
val,
178216
(pd.Series(val).rank()-1)/{'Y':1, 'S': 2, 'Q':4, 'M': 12}[X.development_grain]))
179-
df = self._prep_X_ml(X)
217+
df, weight = self._prep_X_ml(X)
180218
self.df_ = df
219+
self.weight_ = weight
220+
if self.weighted_step == None:
221+
sample_weights = {}
222+
elif isinstance(self.weighted_step, list):
223+
sample_weights = {x + '__sample_weight':weight for x in self.weighted_step}
224+
else:
225+
sample_weights = {self.weighted_step + '__sample_weight':weight}
181226
# Fit model
182-
self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze())
227+
self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze(),**sample_weights)
183228
#return selffit_incrementals
184-
self.triangle_ml_ = self._get_triangle_ml(df)
229+
self.triangle_ml_, self.predicted_data_ = self._get_triangle_ml(df)
185230
return self
186231

187232
@property
@@ -204,11 +249,12 @@ def transform(self, X):
204249
X_new : New triangle with transformed attributes.
205250
"""
206251
X_new = X.copy()
207-
X_ml = self._prep_X_ml(X)
252+
X_ml, weight_ml = self._prep_X_ml(X)
208253
y_ml=self.estimator_ml.predict(X_ml)
209-
triangle_ml = self._get_triangle_ml(X_ml, y_ml)
254+
triangle_ml, predicted_data = self._get_triangle_ml(X_ml, y_ml)
210255
backend = "cupy" if X.array_backend == "cupy" else "numpy"
211256
X_new.ldf_ = triangle_ml.incr_to_cum().link_ratio.set_backend(backend)
212257
X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL)
213258
X_new._set_slicers()
214-
return X_new
259+
X_new.predicted_data_ = predicted_data
260+
return X_new

chainladder/development/tests/test_barnzehn.py

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,65 @@ def test_basic_bz():
1212
def test_multiple_triangle_exception():
1313
d = cl.load_sample("usauto")
1414
with pytest.raises(ValueError):
15-
cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(d)
15+
cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(d)
16+
17+
def test_feat_eng_1():
18+
'''
19+
this function tests passing in a basic engineered feature. Since test_func just returns development, the formulas using development and testfeat should yield identical results
20+
'''
21+
def test_func(df):
22+
return df["development"]
23+
24+
abc = cl.load_sample('abc')
25+
test_dict = {'testfeat':{'func':test_func,'kwargs':{}}}
26+
27+
assert np.all(
28+
np.around(cl.BarnettZehnwirth(formula='C(origin)+development+valuation').fit(abc).coef_.T.values,3)
29+
== np.around(cl.BarnettZehnwirth(formula='C(origin)+testfeat+valuation',feat_eng = test_dict).fit(abc).coef_.T.values,3)
30+
)
31+
32+
def test_feat_eng_2():
33+
'''
34+
this function tests more complex feature engineering. Since origin_onehot just replicates the one-hot encoding that's performed inside sklearn LinearRegression, the two BZ models should yield identical results
35+
36+
this function also tests the BZ transformer
37+
'''
38+
def origin_onehot(df,ori):
39+
return [1 if x == ori else 0 for x in df["origin"]]
40+
41+
abc = cl.load_sample('abc')
42+
feat_dict = {f'origin_{x}':{'func':origin_onehot,'kwargs':{'ori':float(x+1)}} for x in range(10)}
43+
assert np.all(
44+
np.around(cl.BarnettZehnwirth(formula='+'.join([f'C({x})' for x in feat_dict.keys()]),feat_eng = feat_dict).fit(abc).ldf_.values,3)
45+
== np.around(cl.BarnettZehnwirth(formula='C(origin)').fit_transform(abc).ldf_.values,3)
46+
)
47+
48+
def test_bz_2008():
49+
'''
50+
this function tests the drop parameter by recreating the example in the 2008 BZ paper, section 4.1
51+
'''
52+
abc = cl.load_sample('abc')
53+
exposure=np.array([[2.2], [2.4], [2.2], [2.0], [1.9], [1.6], [1.6], [1.8], [2.2], [2.5], [2.6]])
54+
abc_adj = abc/exposure
55+
56+
def predictor_bins(df,pbin,axis):
57+
return [int(x >= min(pbin)) for x in df[axis]]
58+
59+
origin_groups = {f'origin_{ori}'.replace('[','').replace(']','').replace(', ',''):{'func':predictor_bins,'kwargs':{'pbin':ori,'axis':'origin'}} for ori in [[2],[3,4],[5,6,7,8,9,10]]}
60+
61+
def trend_piece(df,piece,axis):
62+
pmax = float(max(piece))
63+
increment=min(df[axis][df[axis]>0])
64+
pfirst = piece[0]-increment
65+
return [(x-pfirst)/increment if x in piece else (0 if x<pmax else (pmax-pfirst)/increment) for x in df[axis]]
66+
67+
development_groups = {f'development_{dev}'.replace('[','').replace(']','').replace(', ',''):{'func':trend_piece,'kwargs':{'piece':dev,'axis':'development'}} for dev in [[24],[36],[48,60,72],[84,96],[108,120,132]]}
68+
69+
valuation_groups = {f'valuation_{val}'.replace('[','').replace(']','').replace(', ',''):{'func':trend_piece,'kwargs':{'piece':val,'axis':'valuation'}} for val in [[1,2,3,4,5,6,7],[8],[9,10]]}
70+
71+
abc_dict = {**origin_groups,**development_groups,**valuation_groups}
72+
model=cl.BarnettZehnwirth(formula='+'.join([z for z in abc_dict.keys()]),feat_eng=abc_dict, drop=('1982',72)).fit(abc_adj)
73+
assert np.all(
74+
np.around(model.coef_.values,4).flatten()
75+
== np.array([11.1579,0.1989,0.0703,0.0919,0.1871,-0.3771,-0.4465,-0.3727,-0.3154,0.0432,0.0858,0.1464])
76+
)

0 commit comments

Comments
 (0)