Commit f7fae39

paper for joss (#312)
* add papers first draft
* first draft paper
* add @ to inline citations
* final edits to paper
* update format inline citations
* reformats zenodo bibtex entry
* fixes spelling and punctuation
* adds missing DOIs
* updates zenodo citation
* removes doi.org from citations
* removes zenodo citation
1 parent f74cec8 commit f7fae39

2 files changed: 364 additions & 0 deletions


paper/paper.bib

Lines changed: 260 additions & 0 deletions
@@ -0,0 +1,260 @@
@article{sklearn,
  title   = {Scikit-learn: Machine Learning in {P}ython},
  author  = {Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
             and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
             and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
             Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
  journal = {Journal of Machine Learning Research},
  volume  = {12},
  pages   = {2825--2830},
  year    = {2011}
}


@inproceedings{pandas,
  author    = {{W}es {M}c{K}inney},
  title     = {{D}ata {S}tructures for {S}tatistical {C}omputing in {P}ython},
  booktitle = {{P}roceedings of the 9th {P}ython in {S}cience {C}onference},
  pages     = {56--61},
  year      = {2010},
  editor    = {{S}t\'efan van der {W}alt and {J}arrod {M}illman},
  doi       = {10.25080/Majora-92bf1922-00a}
}


@inproceedings{niculescu09_kdd,
  title     = {Winning the KDD Cup Orange Challenge with Ensemble Selection},
  author    = {Alexandru Niculescu-Mizil and Claudia Perlich and Grzegorz Swirszcz
               and Vikas Sindhwani and Yan Liu and Prem Melville and Dong Wang and Jing Xiao
               and Jianying Hu and Moninder Singh and Wei Xiong Shang and Yan Feng Zhu},
  booktitle = {Proceedings of KDD-Cup 2009 Competition},
  pages     = {23--34},
  year      = {2009},
  editor    = {Gideon Dror and Marc Boullé and Isabelle Guyon and Vincent Lemaire
               and David Vogel},
  volume    = {7},
  series    = {Proceedings of Machine Learning Research},
  address   = {New York, New York, USA},
  month     = {28 Jun},
  publisher = {PMLR},
  pdf       = {http://proceedings.mlr.press/v7/niculescu09/niculescu09.pdf},
  url       = {http://proceedings.mlr.press/v7/niculescu09.html},
  abstract  = {We describe our winning solution for the KDD Cup Orange Challenge.}
}


@article{micci_mean_encoder,
  author     = {Micci-Barreca, Daniele},
  title      = {A Preprocessing Scheme for High-Cardinality Categorical Attributes in
                Classification and Prediction Problems},
  journal    = {SIGKDD Explor. Newsl.},
  year       = {2001},
  issue_date = {July 2001},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  volume     = {3},
  number     = {1},
  issn       = {1931-0145},
  url        = {https://doi.org/10.1145/507533.507538},
  doi        = {10.1145/507533.507538},
  month      = jul,
  pages      = {27--32},
  numpages   = {6},
  keywords   = {categorical attributes, neural networks, predictive models, hierarchical
                attributes, empirical bayes},
  abstract   = {Categorical data fields characterized by a large number of distinct values
                represent a serious challenge for many classification and regression algorithms
                that require numerical inputs. On the other hand, these types of data fields are
                quite common in real-world data mining applications and often contain potentially
                relevant information that is difficult to represent for modeling purposes. This
                paper presents a simple preprocessing scheme for high-cardinality categorical data
                that allows this class of attributes to be used in predictive models such as
                neural networks, linear and logistic regression. The proposed method is based on a
                well-established statistical method (empirical Bayes) that is straightforward to
                implement as an in-database procedure. Furthermore, for categorical attributes
                with an inherent hierarchical structure, like ZIP codes, the preprocessing scheme
                can directly leverage the hierarchy by blending statistics at the various levels
                of aggregation. While the statistical methods discussed in this paper were first
                introduced in the mid 1950's, the use of these methods as a preprocessing step for
                complex models, like neural networks, has not been previously discussed in any
                literature.}
}


@article{boxcox,
  author   = {Box, G. E. P. and Cox, D. R.},
  title    = {An Analysis of Transformations},
  journal  = {Journal of the Royal Statistical Society: Series B (Methodological)},
  volume   = {26},
  number   = {2},
  pages    = {211--243},
  year     = {1964},
  doi      = {10.1111/j.2517-6161.1964.tb00553.x},
  url      = {https://rss.onlinelibrary.wiley.com/doi/abs/10.1111/j.2517-6161.1964.tb00553.x},
  eprint   = {https://rss.onlinelibrary.wiley.com/doi/pdf/10.1111/j.2517-6161.1964.tb00553.x},
  abstract = {In the analysis of data it is often assumed that observations y1, y2, …, yn are
              independently normally distributed with constant variance and with expectations
              specified by a model linear in a set of parameters θ. In this paper we make the
              less restrictive assumption that such a normal, homoscedastic, linear model is
              appropriate after some suitable transformation has been applied to the y's.
              Inferences about the transformation and about the parameters of the linear model
              are made by computing the likelihood function and the relevant posterior
              distribution. The contributions of normality, homoscedasticity and additivity to
              the transformation are separated. The relation of the present methods to earlier
              procedures for finding transformations is discussed. The methods are illustrated
              with examples.}
}


@article{yeojohnson,
  author   = {Yeo, In-Kwon and Johnson, Richard A.},
  title    = {A new family of power transformations to improve normality or symmetry},
  journal  = {Biometrika},
  volume   = {87},
  number   = {4},
  pages    = {954--959},
  year     = {2000},
  month    = {12},
  issn     = {0006-3444},
  doi      = {10.1093/biomet/87.4.954},
  url      = {https://doi.org/10.1093/biomet/87.4.954},
  eprint   = {https://academic.oup.com/biomet/article-pdf/87/4/954/633221/870954.pdf},
  abstract = {We introduce a new power transformation family which is well defined on the whole
              real line and which is appropriate for reducing skewness and to approximate
              normality. It has properties similar to those of the Box-Cox transformation for
              positive variables. The large-sample properties of the transformation are
              investigated in the context of a single random sample.}
}


@article{data_prep,
  author  = {Kotsiantis, Sotiris and Kanellopoulos, Dimitris and Pintelas, P.},
  title   = {Data Preprocessing for Supervised Learning},
  journal = {International Journal of Computer Science},
  volume  = {1},
  pages   = {111--117},
  month   = {01},
  year    = {2006}
}


@misc{beatingkaggle,
  author    = {Ying Dong},
  title     = {Beating {Kaggle} the easy way},
  year      = {2015},
  publisher = {Technische Universität Darmstadt},
  note      = {Studienarbeit},
  pdf       = {https://www.ke.tu-darmstadt.de/lehre/arbeiten/studien/2015/Dong_Ying.pdf}
}


@article{domingos,
  author     = {Domingos, Pedro},
  title      = {A Few Useful Things to Know about Machine Learning},
  journal    = {Commun. ACM},
  year       = {2012},
  issue_date = {October 2012},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  volume     = {55},
  number     = {10},
  issn       = {0001-0782},
  url        = {https://doi.org/10.1145/2347736.2347755},
  doi        = {10.1145/2347736.2347755},
  month      = oct,
  pages      = {78--87},
  numpages   = {10},
  abstract   = {Tapping into the "folk knowledge" needed to advance machine learning
                applications.}
}


@inproceedings{kdd_2009_competition,
  title     = {The 2009 Knowledge Discovery and Data Mining Competition (KDD Cup 2009):
               Challenges in Machine Learning},
  author    = {G. Dror and M. Boull{\'e} and I. Guyon},
  year      = {2011},
  publisher = {Microtome Publishing},
  pdf       = {http://www.mtome.com/Publications/CiML/CiML-v3-book.pdf}
}


@inproceedings{miller09_kdd,
  title     = {Predicting customer behaviour: The University of Melbourne's KDD Cup report},
  author    = {Hugh Miller and Sandy Clarke and Stephen Lane and Andrew Lonie and
               David Lazaridis and Slave Petrovski and Owen Jones},
  booktitle = {Proceedings of KDD-Cup 2009 Competition},
  pages     = {45--55},
  year      = {2009},
  editor    = {Gideon Dror and Marc Boullé and Isabelle Guyon and Vincent Lemaire
               and David Vogel},
  volume    = {7},
  series    = {Proceedings of Machine Learning Research},
  address   = {New York, New York, USA},
  month     = {28 Jun},
  publisher = {PMLR},
  pdf       = {http://proceedings.mlr.press/v7/miller09/miller09.pdf},
  url       = {http://proceedings.mlr.press/v7/miller09.html},
  abstract  = {We discuss the challenges of the 2009 KDD Cup along with our ideas and
               methodologies for modelling the problem. The main stages included aggressive
               nonparametric feature selection, careful treatment of categorical variables and
               tuning a gradient boosting machine under Bernoulli loss with trees.}
}


@inproceedings{kanter2015deep,
  author       = {James Max Kanter and Kalyan Veeramachaneni},
  title        = {Deep feature synthesis: Towards automating data science endeavors},
  booktitle    = {2015 {IEEE} International Conference on Data Science and Advanced Analytics,
                  DSAA 2015, Paris, France, October 19-21, 2015},
  pages        = {1--10},
  year         = {2015},
  organization = {IEEE},
  doi          = {10.1109/DSAA.2015.7344858}
}


@article{christ_tsfresh,
  title    = {Time Series FeatuRe Extraction on basis of Scalable Hypothesis tests
              (tsfresh – A Python package)},
  author   = {Maximilian Christ and Nils Braun and Julius Neuffer and
              Andreas W. Kempa-Liehr},
  journal  = {Neurocomputing},
  volume   = {307},
  pages    = {72--77},
  year     = {2018},
  issn     = {0925-2312},
  doi      = {10.1016/j.neucom.2018.03.067},
  url      = {https://www.sciencedirect.com/science/article/pii/S0925231218304843},
  keywords = {Feature engineering, Time series, Feature extraction, Feature selection,
              Machine learning},
  abstract = {Time series feature engineering is a time-consuming process because scientists
              and engineers have to consider the multifarious algorithms of signal processing
              and time series analysis for identifying and extracting meaningful features from
              time series. The Python package tsfresh (Time Series FeatuRe Extraction on basis
              of Scalable Hypothesis tests) accelerates this process by combining 63 time
              series characterization methods, which by default compute a total of 794 time
              series features, with feature selection on basis of automatically configured
              hypothesis tests. By identifying statistically significant time series
              characteristics in an early stage of the data science process, tsfresh closes
              feedback loops with domain experts and fosters the development of domain specific
              features early on. The package implements standard APIs of time series and
              machine learning libraries (e.g. pandas and scikit-learn) and is designed for
              both exploratory analyses as well as straightforward integration into operational
              data science applications.}
}


@article{category_encoders,
  author    = {William D. McGinnis and Chapman Siu and Andre S and Hanyu Huang},
  title     = {Category Encoders: a scikit-learn-contrib package of transformers for encoding
               categorical data},
  journal   = {Journal of Open Source Software},
  year      = {2018},
  publisher = {The Open Journal},
  volume    = {3},
  number    = {21},
  pages     = {501},
  doi       = {10.21105/joss.00501},
  url       = {https://doi.org/10.21105/joss.00501}
}

paper/paper.md

Lines changed: 104 additions & 0 deletions
@@ -0,0 +1,104 @@
---
title: 'Feature-engine: A Python package for feature engineering for machine learning'
tags:
  - python
  - feature engineering
  - feature selection
  - machine learning
  - data science
authors:
  - name: Soledad Galli
    affiliation: 1
affiliations:
  - name: Train in Data
    index: 1
date: 6 August 2021
bibliography: paper.bib
---

# Summary

Feature-engine is an open source Python library with the most exhaustive battery of
transformations to engineer and select features for use in machine learning. Feature-engine
supports several techniques to impute missing data, encode categorical variables, transform
variables mathematically, perform discretization, remove or censor outliers, and combine
variables into new features. Feature-engine also hosts an array of algorithms for feature
selection.

The primary goal of Feature-engine is to make commonly used data transformation procedures
accessible to researchers and data scientists, focusing on creating user-friendly and
intuitive classes that are compatible with existing machine learning libraries, like
Scikit-learn [@sklearn] and Pandas [@pandas].

Many feature transformation techniques learn parameters from data, like the values for
imputation or the mappings for encoding. Feature-engine classes learn these parameters
from the data and store them in their attributes to transform future data. Feature-engine's
transformers preserve Scikit-learn's functionality, exposing the methods fit() and
transform() to learn parameters from the data and then transform it. Feature-engine's
transformers can be incorporated into a Scikit-learn Pipeline to streamline data
transformation and facilitate model deployment, by allowing the serialization of the entire
pipeline in one pickle.

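As a minimal sketch of this fit/transform workflow (the `MeanMedianImputer` class, the
`variables` argument, and the `imputer_dict_` attribute are used here only as illustrative
assumptions and may differ between Feature-engine releases):

```python
# Illustrative sketch of the fit()/transform() pattern; class, argument and
# attribute names are assumed and may vary between Feature-engine versions.
import pandas as pd
from feature_engine.imputation import MeanMedianImputer

df = pd.DataFrame({"age": [20, 25, None, 40], "fare": [7.5, None, 12.0, 30.0]})

imputer = MeanMedianImputer(imputation_method="median", variables=["age", "fare"])
imputer.fit(df)               # learns the medians and stores them as attributes
print(imputer.imputer_dict_)  # learned parameters, e.g. {'age': 25.0, 'fare': 12.0}

new_data = imputer.transform(df)  # applies the stored medians to future data
```
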
When pre-processing a dataset, different feature transformations are applied to different
groups of variables. Feature-engine classes allow the user to select which variables to
transform within each class; therefore, while the entire dataframe is taken as input, only
the indicated variables are modified. Data pre-processing and feature engineering are
commonly done together with data exploration. Because Feature-engine transformers return
dataframes as output, users can continue to leverage the power of Pandas for data analysis
and visualization after transforming the dataset.

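The sketch below illustrates this behaviour, again with assumed class and argument names
(`OneHotEncoder`, `variables`): only the listed column is modified, the remaining columns
pass through untouched, and the output is a pandas dataframe that can feed straight back
into exploratory analysis.

```python
# Illustrative sketch: only the indicated variables are transformed and the
# output remains a pandas DataFrame (class and argument names are assumed).
import pandas as pd
from feature_engine.encoding import OneHotEncoder

df = pd.DataFrame({
    "city": ["London", "Paris", "London", "Rome"],
    "price": [10.0, 12.5, 11.0, 9.0],
})

encoder = OneHotEncoder(variables=["city"])  # encode only 'city'; 'price' is untouched
df_t = encoder.fit_transform(df)

print(type(df_t))   # <class 'pandas.core.frame.DataFrame'>
print(df_t.head())  # pandas analysis and visualization continue as usual
```
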
In summary, Feature-engine supports a large variety of commonly used data transformation
techniques [@data_prep; @boxcox; @yeojohnson; @kdd_2009_competition;
@beatingkaggle; @micci_mean_encoder], as well as techniques developed in data science
competitions [@niculescu09_kdd], including those for feature selection [@miller09_kdd].
Thus, Feature-engine builds upon and extends the capabilities of Python's current
scientific computing stack, making accessible to data scientists and practitioners
transformations that are otherwise not easy to find, understand, or code.

# Statement of need

Data scientists spend an enormous amount of time on data pre-processing and transformation
ahead of training machine learning models [@domingos]. While some feature engineering
processes can be domain-specific, a large variety of transformations are commonly applied
across datasets. For example, data scientists need to impute or remove missing values, or
transform categories into numbers, to train machine learning models with Scikit-learn, the
main Python library for machine learning. Yet, depending on the nature of the variable and
the characteristics of the machine learning model, they may need to use different
techniques.

Feature-engine gathers the most frequently used data pre-processing techniques, as well as
bespoke techniques developed in data science competitions, into one library, from which
users can pick and choose the transformation they need and use it just like they would use
any other Scikit-learn class. As a result, users are spared from manually writing a lot of
code, which is often repetitive because the same procedures are applied to different
datasets. In addition, Feature-engine classes are written to production standards, which
ensures that classes return the expected result and maximizes reproducibility between
research and production environments through version control.

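As a sketch of this usage pattern (transformer names and arguments are again illustrative
assumptions rather than a definitive listing of the library's API), Feature-engine steps
slot into a Scikit-learn Pipeline next to any estimator, and the fitted pipeline can be
stored as a single artifact:

```python
# Illustrative sketch: Feature-engine transformers used like any other
# Scikit-learn class inside a Pipeline, then serialized as one object.
import joblib
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from feature_engine.encoding import OrdinalEncoder
from feature_engine.imputation import MeanMedianImputer

X = pd.DataFrame({
    "age": [22, None, 35, 58, None, 41],
    "city": ["London", "Paris", "Rome", "London", "Paris", "Rome"],
})
y = pd.Series([0, 1, 0, 1, 1, 0])

pipe = Pipeline([
    ("impute", MeanMedianImputer(variables=["age"])),
    ("encode", OrdinalEncoder(encoding_method="arbitrary", variables=["city"])),
    ("model", LogisticRegression()),
])

pipe.fit(X, y)
joblib.dump(pipe, "feature_pipeline.pkl")  # the entire pipeline in one pickle
```
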
In the last few years, a number of open source Python libraries that support feature
engineering techniques have emerged, highlighting the importance of making feature
engineering and creation accessible and, as much as possible, automated. Among these,
Featuretools [@kanter2015deep] creates features from temporal and relational datasets,
tsfresh [@christ_tsfresh] extracts features from time series, Category encoders
[@category_encoders] supports a comprehensive list of methods to encode categorical
variables, and Scikit-learn [@sklearn] implements a number of data transformation
techniques, with the caveat that the transformations are applied to the entire dataset and
the output is a NumPy array. Feature-engine extends the capabilities of Python's current
scientific computing stack by allowing the application of transformations to subsets of
variables in the dataset, returning dataframes for data exploration, and supporting
transformations not currently available in other libraries, like those for outlier
censoring or removal, besides additional techniques for discretization and feature
selection that were developed by data scientists working in industry or in data science
competitions.

# Acknowledgements

I would like to acknowledge all of the contributors and users of Feature-engine, who helped
with valuable feedback, bug fixes, and additional functionality to further improve the
library. A special thanks to Christopher Samiullah for continuous support on code quality
and architecture. A list of Feature-engine contributors is available at
https://github.com/feature-engine/feature_engine/graphs/contributors.

# References
