# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.

import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from chainladder.development.base import DevelopmentBase
from chainladder import options


class DevelopmentML(DevelopmentBase):
    """An estimator that interfaces with machine learning (ML) tools that
    implement the scikit-learn API.

    The `DevelopmentML` estimator is used to generate ``ldf_`` patterns from
    the data.

    .. versionadded:: 0.8.1

    Parameters
    ----------
    estimator_ml: sklearn Estimator
        Any sklearn-compatible regression estimator, including Pipelines.
    y_ml: list or str or sklearn_transformer
        The response column(s) for the machine learning algorithm. It must be
        present within the Triangle.
    autoregressive: tuple, (autoregressive_col_name, lag, source_col_name)
        The subset of response column(s) to use as lagged features for the
        time-series aspects of the model. Predictions from one development
        period get used as features in the next development period. Lags
        should be negative integers.
    feat_eng: dict
        A dictionary with feature names as keys, where each value is a nested
        dictionary holding the feature function (key 'func') and its keyword
        arguments (key 'kwargs'), e.g.::

            {
                'feature_1': {
                    'func': <function for feature 1>,
                    'kwargs': <keyword arguments for the function>
                },
                'feature_2': {
                    'func': <function for feature 2>,
                    'kwargs': <keyword arguments for the function>
                }
            }

        Each function should accept an input DataFrame argument named ``df``;
        this is the DataFrame containing the origin, development, and valuation
        columns that will be passed into the function at run time. For example,
        this function adds 1 to every origin::

            def test_func(df):
                return df['origin'] + 1

    fit_incrementals: bool
        Whether the response variable should be converted to an incremental
        basis for fitting.

    Attributes
    ----------
    estimator_ml: Estimator
        An sklearn-style estimator to predict development patterns
    ldf_: Triangle
        The estimated loss development patterns.
    cdf_: Triangle
        The estimated cumulative development patterns.
    """

    def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False,
                 weight_ml=None, fit_incrementals=True, feat_eng=None):
        self.estimator_ml = estimator_ml
        self.y_ml = y_ml
        self.weight_ml = weight_ml
        self.autoregressive = autoregressive
        self.fit_incrementals = fit_incrementals
        self.feat_eng = feat_eng

    def _get_y_names(self):
        """Private function to get the response column name(s)."""
        if not self.y_ml:
            y_names = self._columns
        if hasattr(self.y_ml, '_columns'):
            y_names = self.y_ml._columns
        elif isinstance(self.y_ml, ColumnTransformer):
            y_names = self.y_ml.transformers[0][-1]
        if type(self.y_ml) is list:
            y_names = self.y_ml
        elif type(self.y_ml) is str:
            y_names = [self.y_ml]
        return y_names

    @property
    def y_ml_(self):
        defaults = self._get_y_names()
        transformer = self.y_ml
        if not transformer:
            return ColumnTransformer(
                transformers=[('passthrough', 'passthrough', defaults)])
        elif type(transformer) is list:
            return ColumnTransformer(
                transformers=[('passthrough', 'passthrough', transformer)])
        elif type(transformer) is str:
            return ColumnTransformer(
                transformers=[('passthrough', 'passthrough', [transformer])])
        else:
            return transformer

    def _get_triangle_ml(self, df, preds=None):
        """Create a full triangle of fitted/predicted values."""
        from chainladder.core import Triangle
        if preds is None:
            preds = self.estimator_ml.predict(df)
        X_r = [df]
        y_r = [preds]
        # Months per development period and origin periods per year
        dgrain = {'Y': 12, 'Q': 3, 'M': 1, 'S': 6}[self.development_grain_]
        ograin = {'Y': 1, 'Q': 4, 'M': 12, 'S': 2}[self.origin_grain_]
        # Identify the rows on the latest diagonal of the triangle
        latest_filter = (df['origin'] + 1) * ograin + (df['development'] - dgrain) / dgrain
        latest_filter = latest_filter == latest_filter.max()
        preds = pd.DataFrame(preds.copy())[latest_filter].values
        out = df.loc[latest_filter].copy()
        dev_lags = df['development'].drop_duplicates().sort_values()
        # Step the latest diagonal forward one development period at a time,
        # feeding each round of predictions into the next
        for d in dev_lags[1:]:
            out['development'] = out['development'] + dgrain
            # advance valuation by one development period (in years)
            out['valuation'] = out['valuation'] + dgrain / 12
            if len(preds.shape) == 1:
                preds = preds[:, None]
            if self.autoregressive:
                for num, col in enumerate(self.autoregressive):
                    out[col[0]] = preds[:, num]
            out = out[out['development'] <= dev_lags.max()]
            if len(out) == 0:
                continue
            X_r.append(out.copy())
            if self.feat_eng is not None:
                for key, item in self.feat_eng.items():
                    out[key] = item['func'](df=out, **item['kwargs'])
            preds = self.estimator_ml.predict(out)
            y_r.append(preds.copy())
        X_r = pd.concat(X_r, axis=0).reset_index(drop=True)
        X_r = X_r.drop(self._get_y_names(), axis=1)
        out = pd.concat((X_r,
                         pd.DataFrame(np.concatenate(y_r, axis=0),
                                      columns=self._get_y_names())), axis=1)
        # Map encoded origin/valuation values back to their original dates
        out['origin'] = out['origin'].map({v: k for k, v in self.origin_encoder_.items()})
        out['valuation'] = out['valuation'].map({v: k for k, v in self.valuation_encoder_.items()})
        return Triangle(
            out, origin='origin', development='valuation',
            index=self._key_labels, columns=self._get_y_names(),
            cumulative=not self.fit_incrementals).dropna()

    def _prep_X_ml(self, X):
        """Preps Triangle data ahead of the pipeline."""
        if self.fit_incrementals:
            X_ = X.cum_to_incr()
        else:
            X_ = X.copy()
        if self.autoregressive:
            # Lagged response columns are restricted to valuations on or
            # before X's valuation date
            for i in self.autoregressive:
                lag = X[i[2]].shift(i[1])
                X_[i[0]] = lag[lag.valuation <= X.valuation_date]
        # Flatten the triangle into a long-format DataFrame keyed by index,
        # origin, development, and valuation
        df_base = X.incr_to_cum().to_frame(
            keepdims=True, implicit_axis=True, origin_as_datetime=True
        ).reset_index().iloc[:, :-1]
        df = df_base.merge(X.cum_to_incr().to_frame(
            keepdims=True, implicit_axis=True, origin_as_datetime=True
        ).reset_index(), how='left',
            on=list(df_base.columns)).fillna(0)
        df['origin'] = df['origin'].map(self.origin_encoder_)
        df['valuation'] = df['valuation'].map(self.valuation_encoder_)
        if self.feat_eng is not None:
            # Apply user-supplied feature engineering functions
            for key, item in self.feat_eng.items():
                df[key] = item['func'](df=df, **item['kwargs'])
        return df

    def fit(self, X, y=None, sample_weight=None):
        """Fit the model with X.

        Parameters
        ----------
        X : Triangle-like
            Set of LDFs to which the estimator will be applied.
        y : None
            Ignored; use y_ml to set a response variable for the ML algorithm.
        sample_weight : None
            Ignored

        Returns
        -------
        self : object
            Returns the instance itself.
        """

        self._columns = list(X.columns)
        self._key_labels = X.key_labels
        self.origin_grain_ = X.origin_grain
        self.development_grain_ = X.development_grain
        # Encode origins and valuations as fractional years from the earliest period
        self.origin_encoder_ = dict(zip(
            X.origin.to_timestamp(how='s'),
            (pd.Series(X.origin).rank() - 1) / {'Y': 1, 'Q': 4, 'M': 12, 'S': 2}[X.origin_grain]))
        val = X.valuation.sort_values().unique()
        self.valuation_encoder_ = dict(zip(
            val,
            (pd.Series(val).rank() - 1) / {'Y': 1, 'Q': 4, 'M': 12, 'S': 2}[X.development_grain]))
        df = self._prep_X_ml(X)
        self.df_ = df
        # Fit model
        self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze())
        self.triangle_ml_ = self._get_triangle_ml(df)
        return self

    @property
    def ldf_(self):
        ldf = self.triangle_ml_.incr_to_cum().link_ratio
        ldf.valuation_date = pd.to_datetime(options.ULT_VAL)
        return ldf

    def transform(self, X):
        """Attach the fitted development patterns to X, recomputing
        predictions on X with the fitted estimator.

        Parameters
        ----------
        X : Triangle
            The triangle to be transformed

        Returns
        -------
        X_new : Triangle
            New triangle with transformed attributes.
        """
        X_new = X.copy()
        X_ml = self._prep_X_ml(X)
        y_ml = self.estimator_ml.predict(X_ml)
        triangle_ml = self._get_triangle_ml(X_ml, y_ml)
        backend = "cupy" if X.array_backend == "cupy" else "numpy"
        X_new.ldf_ = triangle_ml.incr_to_cum().link_ratio.set_backend(backend)
        X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL)
        X_new._set_slicers()
        return X_new
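

# The block below is an illustrative usage sketch, not part of the library API
# defined above: the 'genins' sample dataset, the PatsyFormula design matrix,
# and the LinearRegression model are assumptions made for demonstration only.
if __name__ == "__main__":
    import chainladder as cl
    from sklearn.linear_model import LinearRegression
    from sklearn.pipeline import Pipeline

    genins = cl.load_sample('genins')
    # A simple sklearn Pipeline: a patsy design matrix followed by OLS
    model = Pipeline(steps=[
        ('design_matrix', cl.PatsyFormula(formula='C(development) + C(origin)')),
        ('model', LinearRegression())])
    dev = cl.DevelopmentML(estimator_ml=model,
                           y_ml=genins.columns[0],
                           fit_incrementals=True).fit(genins)
    print(dev.ldf_)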

import chainladder as cl
import numpy as np
import pytest

def test_basic_bz():
    abc = cl.load_sample('abc')
    assert np.all(
        np.around(cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(abc).coef_.T.values, 3).flatten()
        == np.array([11.837, 0.179, 0.345, 0.378, 0.405, 0.427, 0.431, 0.66, 0.963, 1.157, 1.278,
                     0.251, -0.056, -0.449, -0.829, -1.169, -1.508, -1.798, -2.023, -2.238, -2.428])
    )

def test_multiple_triangle_exception():
    d = cl.load_sample("usauto")
    with pytest.raises(ValueError):
        cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(d)

def test_feat_eng_1():
    '''
    Tests passing in a basic engineered feature. Since test_func just returns
    development, substituting testfeat for development in the formula should
    yield identical results.
    '''
    def test_func(df):
        return df["development"]

    abc = cl.load_sample('abc')
    test_dict = {'testfeat': {'func': test_func, 'kwargs': {}}}

    assert np.all(
        np.around(cl.BarnettZehnwirth(formula='C(origin)+development+valuation').fit(abc).coef_.T.values, 3)
        == np.around(cl.BarnettZehnwirth(formula='C(origin)+testfeat+valuation', feat_eng=test_dict).fit(abc).coef_.T.values, 3)
    )

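# A hedged sketch, not part of the original suite: it exercises the 'kwargs'
# entry of a feat_eng item, which the tests above do not cover. The helper
# shift_dev and the offset value are illustrative assumptions.
def test_feat_eng_kwargs():
    '''
    Shifting development by a constant should be absorbed by the model
    intercept, so the resulting development factors should match those from
    using development directly.
    '''
    def shift_dev(df, offset):
        return df["development"] + offset

    abc = cl.load_sample('abc')
    feat_dict = {'shifted_dev': {'func': shift_dev, 'kwargs': {'offset': 12}}}
    assert np.all(
        np.around(cl.BarnettZehnwirth(formula='C(origin)+development').fit(abc).ldf_.values, 3)
        == np.around(cl.BarnettZehnwirth(formula='C(origin)+shifted_dev', feat_eng=feat_dict).fit(abc).ldf_.values, 3)
    )
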
def test_feat_eng_2():
    '''
    Tests more complex feature engineering. Since origin_onehot just replicates
    the one-hot encoding that C(origin) performs in the design matrix, the two
    BZ models should yield identical results.

    This function also tests the BZ transformer.
    '''
    def origin_onehot(df, ori):
        return [1 if x == ori else 0 for x in df["origin"]]

    abc = cl.load_sample('abc')
    feat_dict = {f'origin_{x}': {'func': origin_onehot, 'kwargs': {'ori': float(x + 1)}} for x in range(10)}
    assert np.all(
        np.around(cl.BarnettZehnwirth(formula='+'.join([f'C({x})' for x in feat_dict.keys()]), feat_eng=feat_dict).fit(abc).ldf_.values, 3)
        == np.around(cl.BarnettZehnwirth(formula='C(origin)').fit_transform(abc).ldf_.values, 3)
    )
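
# A hedged smoke test, not part of the original suite: it exercises
# DevelopmentML directly with a simple sklearn Pipeline. The 'genins' sample,
# the PatsyFormula design matrix, and the LinearRegression model are
# illustrative assumptions rather than reference results.
def test_development_ml_pipeline():
    from sklearn.linear_model import LinearRegression
    from sklearn.pipeline import Pipeline

    genins = cl.load_sample('genins')
    model = Pipeline(steps=[
        ('design_matrix', cl.PatsyFormula(formula='C(development) + C(origin)')),
        ('model', LinearRegression())])
    dev = cl.DevelopmentML(estimator_ml=model, y_ml=genins.columns[0])
    # link_ratio drops one development period relative to the source triangle
    assert dev.fit(genins).ldf_.shape[3] == genins.shape[3] - 1
    # fit followed by transform should attach the same patterns as fit alone
    assert np.all(
        np.around(dev.fit(genins).ldf_.values, 3)
        == np.around(dev.fit_transform(genins).ldf_.values, 3)
    )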