Skip to content

Commit 4da4ad3

Browse files
committed
rebasing BZ changes
1 parent d4b2c42 commit 4da4ad3

File tree

3 files changed

+285
-17
lines changed

3 files changed

+285
-17
lines changed

chainladder/development/barnzehn.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,16 +24,35 @@ class BarnettZehnwirth(TweedieGLM):
2424
----------
2525
formula: formula-like
2626
A patsy formula describing the independent variables, X of the GLM
27+
feat_eng: dict
28+
A dictionary mapping each feature name to a dictionary that holds the function computing the feature (under the key 'func') and that function's keyword arguments (under the key 'kwargs')
29+
(e.g. {
30+
'feature_1':{
31+
'func': function_name for feature 1,
32+
'kwargs': keyword arguments for the function
33+
},
34+
'feature_2':{
35+
'func': function_name for feature 2,
36+
'kwargs': keyword arguments for the function
37+
}
38+
}
39+
);
40+
Functions should be written with an input DataFrame named df; this is the DataFrame containing origin, development, and valuation that will be passed into the function at run time
41+
(e.g. this function adds 1 to every origin
42+
def test_func(df):
43+
return df['origin'] + 1
44+
)
2745
response: str
2846
Column name for the response variable of the GLM. If omitted, then the
2947
first column of the Triangle will be used.
3048
3149
3250
"""
3351

34-
def __init__(self, formula='C(origin) + development', response=None):
52+
def __init__(self, formula='C(origin) + development', feat_eng=None, response=None):
3553
self.formula = formula
3654
self.response = response
55+
self.feat_eng = feat_eng
3756

3857
def fit(self, X, y=None, sample_weight=None):
3958
if max(X.shape[:2]) > 1:
@@ -50,7 +69,7 @@ def fit(self, X, y=None, sample_weight=None):
5069
self.model_ = DevelopmentML(Pipeline(steps=[
5170
('design_matrix', PatsyFormula(self.formula)),
5271
('model', LinearRegression(fit_intercept=False))]),
53-
y_ml=response, fit_incrementals=False).fit(tri)
72+
y_ml=response, fit_incrementals=False, feat_eng = self.feat_eng).fit(tri)
5473
resid = tri - self.model_.triangle_ml_[
5574
self.model_.triangle_ml_.valuation <= tri.valuation_date]
5675
self.mse_resid_ = (resid**2).sum(0).sum(1).sum(2).sum() / (

chainladder/development/learning.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,24 @@ class DevelopmentML(DevelopmentBase):
3333
Time Series aspects of the model. Predictions from one development period
3434
get used as features in the next development period. Lags should be negative
3535
integers.
36+
feat_eng: dict
37+
A dictionary mapping each feature name to a dictionary that holds the function computing the feature (under the key 'func') and that function's keyword arguments (under the key 'kwargs')
38+
(e.g. {
39+
'feature_1':{
40+
'func': function_name for feature 1,
41+
'kwargs': keyword arguments for the function
42+
},
43+
'feature_2':{
44+
'func': function_name for feature 2,
45+
'kwargs': keyword arguments for the function
46+
}
47+
}
48+
);
49+
Functions should be written with an input DataFrame named df; this is the DataFrame containing origin, development, and valuation that will be passed into the function at run time
50+
(e.g. this function adds 1 to every origin
51+
def test_func(df):
52+
return df['origin'] + 1
53+
)
3654
fit_incrementals:
3755
Whether the response variable should be converted to an incremental basis
3856
for fitting.
@@ -48,12 +66,13 @@ class DevelopmentML(DevelopmentBase):
4866
"""
4967

5068
def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False,
51-
weight_ml=None, fit_incrementals=True):
69+
weight_ml=None, fit_incrementals=True, feat_eng=None):
5270
self.estimator_ml=estimator_ml
5371
self.y_ml=y_ml
5472
self.weight_ml = weight_ml
5573
self.autoregressive=autoregressive
5674
self.fit_incrementals = fit_incrementals
75+
self.feat_eng = feat_eng
5776

5877
def _get_y_names(self):
5978
""" private function to get the response column name"""
@@ -112,6 +131,9 @@ def _get_triangle_ml(self, df, preds=None):
112131
if len(out) == 0:
113132
continue
114133
X_r.append(out.copy())
134+
if self.feat_eng is not None:
135+
for key, item in self.feat_eng.items():
136+
out[key] = item['func'](df=out,**item['kwargs'])
115137
preds = self.estimator_ml.predict(out)
116138
y_r.append(preds.copy())
117139
X_r = pd.concat(X_r, axis=0).reset_index(drop=True)
@@ -145,6 +167,9 @@ def _prep_X_ml(self, X):
145167
on=list(df_base.columns)).fillna(0)
146168
df['origin'] = df['origin'].map(self.origin_encoder_)
147169
df['valuation'] = df['valuation'].map(self.valuation_encoder_)
170+
if self.feat_eng is not None:
171+
for key, item in self.feat_eng.items():
172+
df[key] = item['func'](df=df,**item['kwargs'])
148173
return df
149174

150175
def fit(self, X, y=None, sample_weight=None):
Lines changed: 238 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,239 @@
1+
# This Source Code Form is subject to the terms of the Mozilla Public
2+
# License, v. 2.0. If a copy of the MPL was not distributed with this
3+
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
15
import numpy as np
2-
import chainladder as cl
3-
import pytest
4-
5-
def test_basic_bz():
6-
abc = cl.load_sample('abc')
7-
assert np.all(
8-
np.around(cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(abc).coef_.T.values,3).flatten()
9-
== np.array([11.837,0.179,0.345,0.378,0.405,0.427,0.431,0.66,0.963,1.157,1.278,0.251,-0.056,-0.449,-0.829,-1.169,-1.508,-1.798,-2.023,-2.238,-2.428])
10-
)
11-
12-
def test_multiple_triangle_exception():
13-
d = cl.load_sample("usauto")
14-
with pytest.raises(ValueError):
15-
cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(d)
6+
import pandas as pd
7+
8+
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
9+
from sklearn.compose import ColumnTransformer
10+
from chainladder.development.base import DevelopmentBase
11+
from chainladder import options
12+
13+
14+
class DevelopmentML(DevelopmentBase):
    """ An Estimator that interfaces with machine learning (ML) tools that
    implement the scikit-learn API.

    The `DevelopmentML` estimator is used to generate ``ldf_`` patterns from
    the data.

    .. versionadded:: 0.8.1


    Parameters
    ----------
    estimator_ml: sklearn Estimator
        Any sklearn compatible regression estimator, including Pipelines.
    y_ml: list or str or sklearn_transformer
        The response column(s) for the machine learning algorithm. It must be
        present within the Triangle.
    autoregressive: iterable of tuples (autoregressive_col_name, lag, source_col_name)
        The subset of response column(s) to use as lagged features for the
        Time Series aspects of the model. Predictions from one development period
        get used as features in the next development period. Lags should be negative
        integers.
    weight_ml:
        Stored on the instance as given. It is not referenced by the methods
        defined in this class.
    fit_incrementals: bool
        Whether the response variable should be converted to an incremental basis
        for fitting.
    feat_eng: dict
        A dictionary mapping each feature name to a dictionary that holds the
        function computing the feature (under the key 'func') and, optionally,
        that function's keyword arguments (under the key 'kwargs'), e.g.::

            {
                'feature_1': {
                    'func': function for feature 1,
                    'kwargs': keyword arguments for the function
                },
                'feature_2': {
                    'func': function for feature 2,
                    'kwargs': keyword arguments for the function
                }
            }

        Each function is called as ``func(df=df, **kwargs)`` where ``df`` is
        the DataFrame containing origin, development, and valuation that is
        built at run time. For example, this function adds 1 to every origin::

            def test_func(df):
                return df['origin'] + 1

    Attributes
    ----------
    estimator_ml: Estimator
        An sklearn-style estimator to predict development patterns
    ldf_: Triangle
        The estimated loss development patterns.
    cdf_: Triangle
        The estimated cumulative development patterns.
    """

    def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False,
                 weight_ml=None, fit_incrementals=True, feat_eng=None):
        # Parameters are stored untouched; all derived state is created in fit().
        self.estimator_ml = estimator_ml
        self.y_ml = y_ml
        self.weight_ml = weight_ml
        self.autoregressive = autoregressive
        self.fit_incrementals = fit_incrementals
        self.feat_eng = feat_eng

    def _get_y_names(self):
        """ private function to get the response column name(s)

        Returns
        -------
        list
            ``[y_ml]`` for a str, ``y_ml`` itself for a list, the
            ``_columns`` attribute of ``y_ml`` when it has one, the column
            selection of the first transformer for a ColumnTransformer, and
            the columns recorded by ``fit`` when ``y_ml`` is empty.

        Raises
        ------
        TypeError
            If the response columns cannot be derived from ``y_ml``.
        """
        y_ml = self.y_ml
        # str and list are checked first: an explicit (even empty) str/list
        # takes precedence over the fitted-column fallback.
        if isinstance(y_ml, str):
            return [y_ml]
        if isinstance(y_ml, list):
            return y_ml
        if hasattr(y_ml, '_columns'):
            return y_ml._columns
        if isinstance(y_ml, ColumnTransformer):
            return y_ml.transformers[0][-1]
        if not y_ml:
            return self._columns
        # Previously this path fell through with no value assigned and
        # surfaced as an UnboundLocalError.
        raise TypeError(
            "Unable to determine the response column names from y_ml of "
            "type {}".format(type(y_ml).__name__))

    @property
    def y_ml_(self):
        """ Transformer that extracts the response column(s) from the
        prepared DataFrame.

        A str or list ``y_ml`` (or an empty one) is wrapped in a passthrough
        ColumnTransformer; any other value is returned unchanged.
        """
        defaults = self._get_y_names()
        transformer = self.y_ml
        if not transformer:
            columns = defaults
        elif isinstance(transformer, list):
            columns = transformer
        elif isinstance(transformer, str):
            columns = [transformer]
        else:
            return transformer
        return ColumnTransformer(
            transformers=[('passthrough', 'passthrough', columns)])

    def _apply_feat_eng(self, df):
        """ private function to add the ``feat_eng`` columns to ``df``

        Each entry of ``feat_eng`` is evaluated in dictionary order and the
        result is assigned to ``df[feature_name]``, so later features can use
        earlier ones. ``df`` is modified in place and also returned. A missing
        or ``None`` 'kwargs' entry is treated as no keyword arguments.
        """
        if self.feat_eng is None:
            return df
        for key, item in self.feat_eng.items():
            kwargs = item.get('kwargs') or {}
            df[key] = item['func'](df=df, **kwargs)
        return df

    def _get_triangle_ml(self, df, preds=None):
        """ Create fitted Triangle

        Parameters
        ----------
        df : DataFrame
            Feature frame as produced by ``_prep_X_ml``.
        preds : array-like, optional
            Predictions for ``df``. When omitted they are obtained from
            ``estimator_ml.predict(df)``.

        Returns
        -------
        Triangle
            Triangle built from the predictions for ``df`` together with the
            predictions for the rows rolled forward through the remaining
            development lags.
        """
        from chainladder.core import Triangle
        if preds is None:
            preds = self.estimator_ml.predict(df)
        y_names = self._get_y_names()
        X_r = [df]
        y_r = [preds]
        dgrain = {'Y':12, 'Q':3, 'M': 1, 'S': 6}[self.development_grain_]
        ograin = {'Y':1, 'Q':4, 'M': 12, 'S': 6}[self.origin_grain_]
        # Keep only the rows with the largest combined origin/development
        # position; these are the starting point for the roll-forward below.
        latest_filter = (df['origin']+1)*ograin+(df['development']-dgrain)/dgrain
        latest_filter = latest_filter == latest_filter.max()
        preds = pd.DataFrame(preds.copy())[latest_filter].values
        out = df.loc[latest_filter].copy()
        dev_lags = df['development'].drop_duplicates().sort_values()
        # One iteration per remaining development lag; the lag value itself
        # is not needed, only the number of steps.
        for _ in dev_lags[1:]:
            out['development'] = out['development'] + dgrain
            out['valuation'] = out['valuation'] + dgrain / 12
            if len(preds.shape) == 1:
                preds = preds[:, None]
            if self.autoregressive:
                # Feed the previous step's predictions in as lagged features.
                for num, col in enumerate(self.autoregressive):
                    out[col[0]] = preds[:, num]
            out = out[out['development'] <= dev_lags.max()]
            if len(out) == 0:
                continue
            X_r.append(out.copy())
            # Engineered features depend on development/valuation, so they
            # are recomputed before predicting the advanced rows.
            self._apply_feat_eng(out)
            preds = self.estimator_ml.predict(out)
            y_r.append(preds.copy())
        X_r = pd.concat(X_r, axis=0).reset_index(drop=True)
        # Swap the observed response columns for the model predictions.
        X_r = X_r.drop(y_names, axis=1)
        out = pd.concat(
            (X_r, pd.DataFrame(np.concatenate(y_r, 0), columns=y_names)),
            axis=1)
        # Decode origin/valuation back to the values recorded in fit().
        out['origin'] = out['origin'].map(
            {v: k for k, v in self.origin_encoder_.items()})
        out['valuation'] = out['valuation'].map(
            {v: k for k, v in self.valuation_encoder_.items()})
        return Triangle(
            out, origin='origin', development='valuation',
            index=self._key_labels, columns=y_names,
            cumulative=not self.fit_incrementals).dropna()

    def _prep_X_ml(self, X):
        """ Preps Triangle data ahead of the pipeline

        Builds a DataFrame from ``X`` with encoded ``origin`` and
        ``valuation`` columns and any ``feat_eng`` features appended.
        """
        if self.fit_incrementals:
            X_ = X.cum_to_incr()
        else:
            X_ = X.copy()
        if self.autoregressive:
            for i in self.autoregressive:
                lag = X[i[2]].shift(i[1])
                X_[i[0]] = lag[lag.valuation<=X.valuation_date]
        # NOTE(review): X_ (including the autoregressive columns assigned
        # above) is never read below; the frame is built from X directly.
        # Confirm whether X_ was meant to feed the merge.
        df_base = X.incr_to_cum().to_frame(
            keepdims=True, implicit_axis=True, origin_as_datetime=True
            ).reset_index().iloc[:, :-1]
        df = df_base.merge(X.cum_to_incr().to_frame(
            keepdims=True, implicit_axis=True, origin_as_datetime=True
            ).reset_index(), how='left',
            on=list(df_base.columns)).fillna(0)
        df['origin'] = df['origin'].map(self.origin_encoder_)
        df['valuation'] = df['valuation'].map(self.valuation_encoder_)
        return self._apply_feat_eng(df)

    def fit(self, X, y=None, sample_weight=None):
        """Fit the model with X.

        Parameters
        ----------
        X : Triangle-like
            Triangle the ML estimator is fitted on.
        y : None
            Ignored, use y_ml to set a response variable for the ML algorithm
        sample_weight : None
            Ignored

        Returns
        -------
        self : object
            Returns the instance itself.
        """

        self._columns = list(X.columns)
        self._key_labels = X.key_labels
        self.origin_grain_ = X.origin_grain
        self.development_grain_ = X.development_grain
        # Divisor applied to the zero-based rank for each grain code.
        per_year = {'Y':1, 'Q':4, 'M': 12, 'S': 6}
        self.origin_encoder_ = dict(zip(
            X.origin.to_timestamp(how='s'),
            (pd.Series(X.origin).rank()-1)/per_year[X.origin_grain]))
        val = X.valuation.sort_values().unique()
        self.valuation_encoder_ = dict(zip(
            val,
            (pd.Series(val).rank()-1)/per_year[X.development_grain]))
        df = self._prep_X_ml(X)
        self.df_ = df
        # Fit model
        self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze())
        self.triangle_ml_ = self._get_triangle_ml(df)
        return self

    @property
    def ldf_(self):
        """ Link ratios of the fitted triangle, with the valuation date set
        to ``options.ULT_VAL``."""
        ldf = self.triangle_ml_.incr_to_cum().link_ratio
        ldf.valuation_date = pd.to_datetime(options.ULT_VAL)
        return ldf

    def transform(self, X):
        """ Return a copy of X carrying ``ldf_`` patterns predicted by the
        fitted estimator.

        Parameters
        ----------
        X : Triangle
            The triangle to be transformed

        Returns
        -------
            X_new : New triangle with transformed attributes.
        """
        X_new = X.copy()
        X_ml = self._prep_X_ml(X)
        y_ml = self.estimator_ml.predict(X_ml)
        triangle_ml = self._get_triangle_ml(X_ml, y_ml)
        backend = "cupy" if X.array_backend == "cupy" else "numpy"
        X_new.ldf_ = triangle_ml.incr_to_cum().link_ratio.set_backend(backend)
        X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL)
        X_new._set_slicers()
        return X_new

0 commit comments

Comments
 (0)